Repository: kylin Updated Branches: refs/heads/yang-m1 aebef6239 -> 51a7b012f
KYLIN-1623 Make the hll precision for data sampling configurable Project: http://git-wip-us.apache.org/repos/asf/kylin/repo Commit: http://git-wip-us.apache.org/repos/asf/kylin/commit/51a7b012 Tree: http://git-wip-us.apache.org/repos/asf/kylin/tree/51a7b012 Diff: http://git-wip-us.apache.org/repos/asf/kylin/diff/51a7b012 Branch: refs/heads/yang-m1 Commit: 51a7b012f1b6d7a1a84f5b9df8918bebad9b7050 Parents: aebef62 Author: shaofengshi <shaofeng...@apache.org> Authored: Tue Apr 26 18:43:14 2016 +0800 Committer: shaofengshi <shaofeng...@apache.org> Committed: Tue Apr 26 18:45:53 2016 +0800 ---------------------------------------------------------------------- .../src/main/java/org/apache/kylin/common/KylinConfigBase.java | 4 ++++ .../src/main/java/org/apache/kylin/cube/util/CubingUtils.java | 2 +- .../java/org/apache/kylin/engine/mr/common/CubeStatsReader.java | 5 ++--- .../kylin/engine/mr/steps/FactDistinctColumnsReducer.java | 4 +++- .../kylin/engine/mr/steps/FactDistinctHiveColumnsMapper.java | 2 +- .../org/apache/kylin/engine/mr/steps/MergeStatisticsStep.java | 2 +- .../main/java/org/apache/kylin/engine/spark/SparkCubing.java | 2 +- 7 files changed, 13 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kylin/blob/51a7b012/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java ---------------------------------------------------------------------- diff --git a/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java b/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java index 011a76b..c9d1b98 100644 --- a/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java +++ b/core-common/src/main/java/org/apache/kylin/common/KylinConfigBase.java @@ -641,4 +641,8 @@ abstract public class KylinConfigBase implements Serializable { public int getDimCountDistinctMaxCardinality() { return 
Integer.parseInt(getOptional("kylin.query.dim.distinct.max", "5000000")); } + + public int getCubeStatsHLLPrecision() { + return Integer.parseInt(getOptional("kylin.job.cubing.inmem.sampling.hll.precision", "14")); + } } http://git-wip-us.apache.org/repos/asf/kylin/blob/51a7b012/core-cube/src/main/java/org/apache/kylin/cube/util/CubingUtils.java ---------------------------------------------------------------------- diff --git a/core-cube/src/main/java/org/apache/kylin/cube/util/CubingUtils.java b/core-cube/src/main/java/org/apache/kylin/cube/util/CubingUtils.java index c541326..ac81318 100644 --- a/core-cube/src/main/java/org/apache/kylin/cube/util/CubingUtils.java +++ b/core-cube/src/main/java/org/apache/kylin/cube/util/CubingUtils.java @@ -102,7 +102,7 @@ public class CubingUtils { }); final Map<Long, HyperLogLogPlusCounter> result = Maps.newHashMapWithExpectedSize(allCuboidIds.size()); for (Long cuboidId : allCuboidIds) { - result.put(cuboidId, new HyperLogLogPlusCounter(14)); + result.put(cuboidId, new HyperLogLogPlusCounter(cubeDesc.getConfig().getCubeStatsHLLPrecision())); Integer[] cuboidBitSet = new Integer[Long.bitCount(cuboidId)]; long mask = Long.highestOneBit(baseCuboidId); http://git-wip-us.apache.org/repos/asf/kylin/blob/51a7b012/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java ---------------------------------------------------------------------- diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java index bac074b..44d5ce1 100644 --- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java +++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/common/CubeStatsReader.java @@ -42,7 +42,6 @@ import org.apache.kylin.cube.cuboid.CuboidScheduler; import org.apache.kylin.cube.kv.CubeDimEncMap; import org.apache.kylin.cube.model.CubeDesc; import org.apache.kylin.engine.mr.HadoopUtil; 
-import org.apache.kylin.engine.mr.steps.InMemCuboidJob; import org.apache.kylin.measure.hllc.HyperLogLogPlusCounter; import org.apache.kylin.metadata.datatype.DataType; import org.apache.kylin.metadata.model.MeasureDesc; @@ -68,7 +67,7 @@ import java.util.Map; */ public class CubeStatsReader { - private static final Logger logger = LoggerFactory.getLogger(InMemCuboidJob.class); + private static final Logger logger = LoggerFactory.getLogger(CubeStatsReader.class); final CubeSegment seg; final int samplingPercentage; @@ -100,7 +99,7 @@ public class CubeStatsReader { } else if (key.get() == -1) { mapperOverlapRatio = Bytes.toDouble(value.getBytes()); } else { - HyperLogLogPlusCounter hll = new HyperLogLogPlusCounter(14); + HyperLogLogPlusCounter hll = new HyperLogLogPlusCounter(kylinConfig.getCubeStatsHLLPrecision()); ByteArray byteArray = new ByteArray(value.getBytes()); hll.readRegisters(byteArray.asBuffer()); counterMap.put(key.get(), hll); http://git-wip-us.apache.org/repos/asf/kylin/blob/51a7b012/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducer.java ---------------------------------------------------------------------- diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducer.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducer.java index 211e947..e550183 100644 --- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducer.java +++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctColumnsReducer.java @@ -61,6 +61,7 @@ public class FactDistinctColumnsReducer extends KylinReducer<Text, Text, NullWri private TblColRef col = null; private boolean isStatistics = false; private boolean outputTouched = false; + private KylinConfig cubeConfig; @Override protected void setup(Context context) throws IOException { @@ -70,6 +71,7 @@ public class FactDistinctColumnsReducer extends KylinReducer<Text, Text, NullWri 
KylinConfig config = AbstractHadoopJob.loadKylinPropsAndMetadata(); String cubeName = conf.get(BatchConstants.CFG_CUBE_NAME); CubeInstance cube = CubeManager.getInstance(config).getCube(cubeName); + cubeConfig = cube.getConfig(); cubeDesc = cube.getDescriptor(); columnList = CubeManager.getInstance(config).getAllDictColumnsOnFact(cubeDesc); @@ -106,7 +108,7 @@ public class FactDistinctColumnsReducer extends KylinReducer<Text, Text, NullWri // for hll long cuboidId = 0 - Bytes.toLong(key.getBytes(), 0, Bytes.SIZEOF_LONG); for (Text value : values) { - HyperLogLogPlusCounter hll = new HyperLogLogPlusCounter(14); + HyperLogLogPlusCounter hll = new HyperLogLogPlusCounter(cubeConfig.getCubeStatsHLLPrecision()); ByteBuffer bf = ByteBuffer.wrap(value.getBytes(), 0, value.getLength()); hll.readRegisters(bf); http://git-wip-us.apache.org/repos/asf/kylin/blob/51a7b012/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctHiveColumnsMapper.java ---------------------------------------------------------------------- diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctHiveColumnsMapper.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctHiveColumnsMapper.java index 44de69a..f4c9e16 100644 --- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctHiveColumnsMapper.java +++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/FactDistinctHiveColumnsMapper.java @@ -73,7 +73,7 @@ public class FactDistinctHiveColumnsMapper<KEYIN> extends FactDistinctColumnsMap allCuboidsHLL = new HyperLogLogPlusCounter[cuboidIds.length]; for (int i = 0; i < cuboidIds.length; i++) { - allCuboidsHLL[i] = new HyperLogLogPlusCounter(14); + allCuboidsHLL[i] = new HyperLogLogPlusCounter(cubeDesc.getConfig().getCubeStatsHLLPrecision()); } hf = Hashing.murmur3_32(); http://git-wip-us.apache.org/repos/asf/kylin/blob/51a7b012/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/MergeStatisticsStep.java 
---------------------------------------------------------------------- diff --git a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/MergeStatisticsStep.java b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/MergeStatisticsStep.java index 8c37fec..fa6f9e2 100644 --- a/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/MergeStatisticsStep.java +++ b/engine-mr/src/main/java/org/apache/kylin/engine/mr/steps/MergeStatisticsStep.java @@ -97,7 +97,7 @@ public class MergeStatisticsStep extends AbstractExecutable { // sampling percentage; averageSamplingPercentage += Bytes.toInt(value.getBytes()); } else if (key.get() > 0) { - HyperLogLogPlusCounter hll = new HyperLogLogPlusCounter(14); + HyperLogLogPlusCounter hll = new HyperLogLogPlusCounter(kylinConf.getCubeStatsHLLPrecision()); ByteArray byteArray = new ByteArray(value.getBytes()); hll.readRegisters(byteArray.asBuffer()); http://git-wip-us.apache.org/repos/asf/kylin/blob/51a7b012/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubing.java ---------------------------------------------------------------------- diff --git a/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubing.java b/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubing.java index ef35067..70c1032 100644 --- a/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubing.java +++ b/engine-spark/src/main/java/org/apache/kylin/engine/spark/SparkCubing.java @@ -246,7 +246,7 @@ public class SparkCubing extends AbstractApplication { List<Long> allCuboidIds = cuboidScheduler.getAllCuboidIds(); final HashMap<Long, HyperLogLogPlusCounter> zeroValue = Maps.newHashMap(); for (Long id : allCuboidIds) { - zeroValue.put(id, new HyperLogLogPlusCounter(14)); + zeroValue.put(id, new HyperLogLogPlusCounter(cubeDesc.getConfig().getCubeStatsHLLPrecision())); } CubeJoinedFlatTableDesc flatTableDesc = new CubeJoinedFlatTableDesc(cubeDesc, null);