KYLIN-2396 percentile pre-aggregation support
Project: http://git-wip-us.apache.org/repos/asf/kylin/repo Commit: http://git-wip-us.apache.org/repos/asf/kylin/commit/95d44121 Tree: http://git-wip-us.apache.org/repos/asf/kylin/tree/95d44121 Diff: http://git-wip-us.apache.org/repos/asf/kylin/diff/95d44121 Branch: refs/heads/master-cdh5.7 Commit: 95d4412194fb6fc08c99600d162826d9350a7a80 Parents: 4625b05 Author: lidongsjtu <lid...@apache.org> Authored: Mon Jan 16 11:12:36 2017 +0800 Committer: lidongsjtu <lid...@apache.org> Committed: Mon Jan 16 17:25:28 2017 +0800 ---------------------------------------------------------------------- .../org/apache/kylin/common/util/MathUtil.java | 32 ++++++ core-metadata/pom.xml | 4 + .../kylin/measure/MeasureTypeFactory.java | 4 +- .../measure/percentile/PercentileAggFunc.java | 44 ++++++++ .../percentile/PercentileAggregator.java | 64 ++++++++++++ .../measure/percentile/PercentileContUdf.java | 37 +++++++ .../measure/percentile/PercentileCounter.java | 97 ++++++++++++++++++ .../percentile/PercentileMeasureType.java | 102 +++++++++++++++++++ .../percentile/PercentileSerializer.java | 71 +++++++++++++ .../percentile/PercentileAggregatorTest.java | 55 ++++++++++ .../percentile/PercentileCounterTest.java | 79 ++++++++++++++ .../percentile/PercentileSerializerTest.java | 68 +++++++++++++ .../kylin/measure/percentile/TDigestTest.java | 57 +++++++++++ .../localmeta/cube_desc/ci_inner_join_cube.json | 12 ++- .../localmeta/cube_desc/ci_left_join_cube.json | 12 ++- .../apache/kylin/query/ITKylinQueryTest.java | 18 ++-- .../resources/query/sql_percentile/query01.sql | 2 + pom.xml | 6 ++ storage-hbase/pom.xml | 1 + 19 files changed, 755 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kylin/blob/95d44121/core-common/src/main/java/org/apache/kylin/common/util/MathUtil.java ---------------------------------------------------------------------- diff --git 
a/core-common/src/main/java/org/apache/kylin/common/util/MathUtil.java b/core-common/src/main/java/org/apache/kylin/common/util/MathUtil.java new file mode 100644 index 0000000..ae674c6 --- /dev/null +++ b/core-common/src/main/java/org/apache/kylin/common/util/MathUtil.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.kylin.common.util; + +import java.util.List; + +public class MathUtil { + public static double findMedianInSortedList(List<Double> m) { + int middle = m.size() / 2; + if (m.size() % 2 == 1) { + return m.get(middle); + } else { + return (m.get(middle - 1) + m.get(middle)) / 2.0; + } + } +} http://git-wip-us.apache.org/repos/asf/kylin/blob/95d44121/core-metadata/pom.xml ---------------------------------------------------------------------- diff --git a/core-metadata/pom.xml b/core-metadata/pom.xml index b3fe885..87c4438 100644 --- a/core-metadata/pom.xml +++ b/core-metadata/pom.xml @@ -47,6 +47,10 @@ <groupId>org.roaringbitmap</groupId> <artifactId>RoaringBitmap</artifactId> </dependency> + <dependency> + <groupId>com.tdunning</groupId> + <artifactId>t-digest</artifactId> + </dependency> <!-- Env & Test --> <dependency> http://git-wip-us.apache.org/repos/asf/kylin/blob/95d44121/core-metadata/src/main/java/org/apache/kylin/measure/MeasureTypeFactory.java ---------------------------------------------------------------------- diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/MeasureTypeFactory.java b/core-metadata/src/main/java/org/apache/kylin/measure/MeasureTypeFactory.java index 8e58858..5d0e007 100644 --- a/core-metadata/src/main/java/org/apache/kylin/measure/MeasureTypeFactory.java +++ b/core-metadata/src/main/java/org/apache/kylin/measure/MeasureTypeFactory.java @@ -28,6 +28,7 @@ import org.apache.kylin.measure.bitmap.BitmapMeasureType; import org.apache.kylin.measure.dim.DimCountDistinctMeasureType; import org.apache.kylin.measure.extendedcolumn.ExtendedColumnMeasureType; import org.apache.kylin.measure.hllc.HLLCMeasureType; +import org.apache.kylin.measure.percentile.PercentileMeasureType; import org.apache.kylin.measure.raw.RawMeasureType; import org.apache.kylin.measure.topn.TopNMeasureType; import org.apache.kylin.metadata.datatype.DataType; @@ -109,6 +110,7 @@ abstract public class MeasureTypeFactory<T> { 
factoryInsts.add(new TopNMeasureType.Factory()); factoryInsts.add(new RawMeasureType.Factory()); factoryInsts.add(new ExtendedColumnMeasureType.Factory()); + factoryInsts.add(new PercentileMeasureType.Factory()); logger.info("Checking custom measure types from kylin config"); @@ -143,7 +145,7 @@ abstract public class MeasureTypeFactory<T> { List<MeasureTypeFactory<?>> list = factories.get(funcName); if (list == null) list = Lists.newArrayListWithCapacity(2); - factories.put(funcName, list); + factories.put(funcName, list); list.add(factory); } http://git-wip-us.apache.org/repos/asf/kylin/blob/95d44121/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileAggFunc.java ---------------------------------------------------------------------- diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileAggFunc.java b/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileAggFunc.java new file mode 100644 index 0000000..ad02019 --- /dev/null +++ b/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileAggFunc.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+*/ + +package org.apache.kylin.measure.percentile; + +public class PercentileAggFunc { + public static PercentileCounter init() { + return null; + } + + public static PercentileCounter add(PercentileCounter counter, Object v, Object r) { + PercentileCounter c = (PercentileCounter) v; + Number n = (Number) r; + if (counter == null) { + counter = new PercentileCounter(c.compression, n.doubleValue()); + } + counter.merge(c); + return counter; + } + + public static PercentileCounter merge(PercentileCounter counter0, PercentileCounter counter1) { + counter0.merge(counter1); + return counter0; + } + + public static double result(PercentileCounter counter) { + return counter == null ? 0L : counter.getResultEstimate(); + } +} http://git-wip-us.apache.org/repos/asf/kylin/blob/95d44121/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileAggregator.java ---------------------------------------------------------------------- diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileAggregator.java b/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileAggregator.java new file mode 100644 index 0000000..d6b93eb --- /dev/null +++ b/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileAggregator.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +package org.apache.kylin.measure.percentile; + +import org.apache.kylin.measure.MeasureAggregator; + +public class PercentileAggregator extends MeasureAggregator<PercentileCounter> { + final double compression; + PercentileCounter sum = null; + + public PercentileAggregator(double compression) { + this.compression = compression; + } + + @Override + public void reset() { + sum = null; + } + + @Override + public void aggregate(PercentileCounter value) { + if (sum == null) + sum = new PercentileCounter(value); + else + sum.merge(value); + } + + @Override + public PercentileCounter aggregate(PercentileCounter value1, PercentileCounter value2) { + PercentileCounter merged = new PercentileCounter(value1); + merged.merge(value2); + return merged; + } + + @Override + public PercentileCounter getState() { + return sum; + } + + @Override + public int getMemBytesEstimate() { + // 10K as upbound + // Test on random double data, 20 tDigest, each has 5000000 doubles. Finally merged into one tDigest. 
+ // Before compress: 10309 bytes + // After compress: 8906 bytes + return 10 * 1024; + } +} http://git-wip-us.apache.org/repos/asf/kylin/blob/95d44121/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileContUdf.java ---------------------------------------------------------------------- diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileContUdf.java b/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileContUdf.java new file mode 100644 index 0000000..4ef6b75 --- /dev/null +++ b/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileContUdf.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+*/ + +package org.apache.kylin.measure.percentile; + +public class PercentileContUdf { + public static double init() { + return 0; + } + + public static double add(double accumulator, double v, double r) { + return 0; + } + + public static double merge(double accumulator0, double accumulator1) { + return 0; + } + + public static double result(long accumulator) { + return 0; + } +} http://git-wip-us.apache.org/repos/asf/kylin/blob/95d44121/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileCounter.java ---------------------------------------------------------------------- diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileCounter.java b/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileCounter.java new file mode 100644 index 0000000..bf505cf --- /dev/null +++ b/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileCounter.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+*/ + +package org.apache.kylin.measure.percentile; + +import java.io.Serializable; +import java.nio.ByteBuffer; + +import com.tdunning.math.stats.AVLTreeDigest; +import com.tdunning.math.stats.TDigest; + +public class PercentileCounter implements Serializable { + private static final double INVALID_QUANTILE_RATIO = -1; + + double compression; + double quantileRatio; + + TDigest registers; + + public PercentileCounter(double compression) { + this(compression, INVALID_QUANTILE_RATIO); + } + + public PercentileCounter(PercentileCounter another) { + this(another.compression, another.quantileRatio); + merge(another); + } + + public PercentileCounter(double compression, double quantileRatio) { + this.compression = compression; + this.quantileRatio = quantileRatio; + reInitRegisters(); + } + + private void reInitRegisters() { + this.registers = TDigest.createAvlTreeDigest(this.compression); + } + + public void add(double v) { + registers.add(v); + } + + public void merge(PercentileCounter counter) { + assert this.compression == counter.compression; + registers.add(counter.registers); + } + + public double getResultEstimate() { + return registers.quantile(quantileRatio); + } + + public void writeRegisters(ByteBuffer out) { + registers.compress(); + registers.asSmallBytes(out); + } + + public void readRegisters(ByteBuffer in) { + registers = AVLTreeDigest.fromBytes(in); + compression = registers.compression(); + } + + public int getBytesEstimate() { + return maxLength(); + } + + public int maxLength() { + // 10KB for max length + return 10 * 1024; + } + + public int peekLength(ByteBuffer in) { + int mark = in.position(); + AVLTreeDigest.fromBytes(in); + int total = in.position() - mark; + in.position(mark); + return total; + } + + public void clear() { + reInitRegisters(); + } +} http://git-wip-us.apache.org/repos/asf/kylin/blob/95d44121/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileMeasureType.java 
---------------------------------------------------------------------- diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileMeasureType.java b/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileMeasureType.java new file mode 100644 index 0000000..45ebe89 --- /dev/null +++ b/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileMeasureType.java @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+*/ + +package org.apache.kylin.measure.percentile; + +import java.util.Map; + +import org.apache.kylin.common.util.Dictionary; +import org.apache.kylin.measure.MeasureAggregator; +import org.apache.kylin.measure.MeasureIngester; +import org.apache.kylin.measure.MeasureType; +import org.apache.kylin.measure.MeasureTypeFactory; +import org.apache.kylin.metadata.datatype.DataType; +import org.apache.kylin.metadata.datatype.DataTypeSerializer; +import org.apache.kylin.metadata.model.MeasureDesc; +import org.apache.kylin.metadata.model.TblColRef; + +import com.google.common.collect.ImmutableMap; + +public class PercentileMeasureType extends MeasureType<PercentileCounter> { + // compression ratio saved in DataType.precision + private final DataType dataType; + public static final String FUNC_PERCENTILE = "PERCENTILE"; + public static final String DATATYPE_PERCENTILE = "percentile"; + + public PercentileMeasureType(String funcName, DataType dataType) { + this.dataType = dataType; + } + + public static class Factory extends MeasureTypeFactory<PercentileCounter> { + + @Override + public MeasureType<PercentileCounter> createMeasureType(String funcName, DataType dataType) { + return new PercentileMeasureType(funcName, dataType); + } + + @Override + public String getAggrFunctionName() { + return FUNC_PERCENTILE; + } + + @Override + public String getAggrDataTypeName() { + return DATATYPE_PERCENTILE; + } + + @Override + public Class<? 
extends DataTypeSerializer<PercentileCounter>> getAggrDataTypeSerializer() { + return PercentileSerializer.class; + } + } + + @Override + public MeasureIngester<PercentileCounter> newIngester() { + return new MeasureIngester<PercentileCounter>() { + PercentileCounter current = new PercentileCounter(dataType.getPrecision()); + + @Override + public PercentileCounter valueOf(String[] values, MeasureDesc measureDesc, Map<TblColRef, Dictionary<String>> dictionaryMap) { + PercentileCounter counter = current; + counter.clear(); + for (String v : values) { + if (v != null) + counter.add(Double.parseDouble(v)); + } + return counter; + } + }; + } + + @Override + public MeasureAggregator<PercentileCounter> newAggregator() { + return new PercentileAggregator(dataType.getPrecision()); + } + + @Override + public boolean needRewrite() { + return true; + } + + static final Map<String, Class<?>> UDAF_MAP = ImmutableMap.<String, Class<?>> of(PercentileMeasureType.FUNC_PERCENTILE, PercentileAggFunc.class); + + @Override + public Map<String, Class<?>> getRewriteCalciteAggrFunctions() { + return UDAF_MAP; + } +} http://git-wip-us.apache.org/repos/asf/kylin/blob/95d44121/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileSerializer.java ---------------------------------------------------------------------- diff --git a/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileSerializer.java b/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileSerializer.java new file mode 100644 index 0000000..a0a2a77 --- /dev/null +++ b/core-metadata/src/main/java/org/apache/kylin/measure/percentile/PercentileSerializer.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. +*/ + +package org.apache.kylin.measure.percentile; + +import java.nio.ByteBuffer; + +import org.apache.kylin.metadata.datatype.DataType; +import org.apache.kylin.metadata.datatype.DataTypeSerializer; + +public class PercentileSerializer extends DataTypeSerializer<PercentileCounter> { + // be thread-safe and avoid repeated obj creation + private ThreadLocal<PercentileCounter> current = new ThreadLocal<>(); + + private double compression; + + public PercentileSerializer(DataType type) { + this.compression = type.getPrecision(); + } + + @Override + public int peekLength(ByteBuffer in) { + return current().peekLength(in); + } + + @Override + public int maxLength() { + return current().maxLength(); + } + + @Override + public int getStorageBytesEstimate() { + return current().getBytesEstimate(); + } + + private PercentileCounter current() { + PercentileCounter counter = current.get(); + if (counter == null) { + counter = new PercentileCounter(compression); + current.set(counter); + } + return counter; + } + + @Override + public void serialize(PercentileCounter value, ByteBuffer out) { + value.writeRegisters(out); + } + + @Override + public PercentileCounter deserialize(ByteBuffer in) { + PercentileCounter counter = current(); + counter.readRegisters(in); + return counter; + } +} 
http://git-wip-us.apache.org/repos/asf/kylin/blob/95d44121/core-metadata/src/test/java/org/apache/kylin/measure/percentile/PercentileAggregatorTest.java ---------------------------------------------------------------------- diff --git a/core-metadata/src/test/java/org/apache/kylin/measure/percentile/PercentileAggregatorTest.java b/core-metadata/src/test/java/org/apache/kylin/measure/percentile/PercentileAggregatorTest.java new file mode 100644 index 0000000..9d59d46 --- /dev/null +++ b/core-metadata/src/test/java/org/apache/kylin/measure/percentile/PercentileAggregatorTest.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.kylin.measure.percentile; + +import static org.junit.Assert.assertEquals; + +import java.util.Collections; +import java.util.List; +import java.util.Random; + +import org.apache.kylin.common.util.MathUtil; +import org.junit.Test; + +import com.google.common.collect.Lists; + +public class PercentileAggregatorTest { + @Test + public void testAggregate() { + double compression = 100; + int datasize = 10000; + PercentileAggregator aggregator = new PercentileAggregator(compression); + Random random = new Random(); + List<Double> dataset = Lists.newArrayListWithCapacity(datasize); + for (int i = 0; i < datasize; i++) { + double d = random.nextDouble(); + dataset.add(d); + + PercentileCounter c = new PercentileCounter(compression, 0.5); + c.add(d); + aggregator.aggregate(c); + } + Collections.sort(dataset); + + double actualResult = aggregator.getState().getResultEstimate(); + double expectResult = MathUtil.findMedianInSortedList(dataset); + assertEquals(expectResult, actualResult, 0.001); + } + +} http://git-wip-us.apache.org/repos/asf/kylin/blob/95d44121/core-metadata/src/test/java/org/apache/kylin/measure/percentile/PercentileCounterTest.java ---------------------------------------------------------------------- diff --git a/core-metadata/src/test/java/org/apache/kylin/measure/percentile/PercentileCounterTest.java b/core-metadata/src/test/java/org/apache/kylin/measure/percentile/PercentileCounterTest.java new file mode 100644 index 0000000..abaa409 --- /dev/null +++ b/core-metadata/src/test/java/org/apache/kylin/measure/percentile/PercentileCounterTest.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.kylin.measure.percentile; + +import static org.junit.Assert.assertEquals; + +import java.util.Collections; +import java.util.List; +import java.util.Random; + +import org.apache.kylin.common.util.MathUtil; +import org.junit.Test; + +import com.google.common.collect.Lists; +import com.tdunning.math.stats.TDigest; + +public class PercentileCounterTest { + @Test + public void testBasic() { + int times = 1; + int compression = 100; + for (int t = 0; t < times; t++) { + PercentileCounter counter = new PercentileCounter(compression, 0.5); + Random random = new Random(); + int dataSize = 10000; + List<Double> dataset = Lists.newArrayListWithCapacity(dataSize); + for (int i = 0; i < dataSize; i++) { + double d = random.nextDouble(); + counter.add(d); + dataset.add(d); + } + Collections.sort(dataset); + + double actualResult = counter.getResultEstimate(); + double expectedResult = MathUtil.findMedianInSortedList(dataset); + assertEquals(expectedResult, actualResult, 0.001); + } + } + + @Test + public void testTDigest() { + double compression = 100; + double quantile = 0.5; + + PercentileCounter counter = new PercentileCounter(compression, quantile); + TDigest tDigest = TDigest.createAvlTreeDigest(compression); + + Random random = new Random(); + int dataSize = 10000; + List<Double> dataset = Lists.newArrayListWithCapacity(dataSize); + for (int i = 0; i < dataSize; i++) { 
+ double d = random.nextDouble(); + counter.add(d); + tDigest.add(d); + } + double actualResult = counter.getResultEstimate(); + + Collections.sort(dataset); + double expectedResult = tDigest.quantile(quantile); + + assertEquals(expectedResult, actualResult, 0); + } +} http://git-wip-us.apache.org/repos/asf/kylin/blob/95d44121/core-metadata/src/test/java/org/apache/kylin/measure/percentile/PercentileSerializerTest.java ---------------------------------------------------------------------- diff --git a/core-metadata/src/test/java/org/apache/kylin/measure/percentile/PercentileSerializerTest.java b/core-metadata/src/test/java/org/apache/kylin/measure/percentile/PercentileSerializerTest.java new file mode 100644 index 0000000..07b4495 --- /dev/null +++ b/core-metadata/src/test/java/org/apache/kylin/measure/percentile/PercentileSerializerTest.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.kylin.measure.percentile; + +import static org.junit.Assert.assertEquals; + +import java.nio.ByteBuffer; +import java.util.Random; + +import org.apache.kylin.common.util.LocalFileMetadataTestCase; +import org.apache.kylin.metadata.datatype.DataType; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +/** + * Created by dongli on 5/21/16. + */ +public class PercentileSerializerTest extends LocalFileMetadataTestCase { + + @Before + public void setup() throws Exception { + createTestMetadata(); + } + + @After + public void after() throws Exception { + cleanAfterClass(); + } + + @Test + public void testBasic() { + PercentileSerializer serializer = new PercentileSerializer(DataType.getType("percentile(100)")); + PercentileCounter counter = new PercentileCounter(100, 0.5); + Random random = new Random(); + for (int i = 0; i < 1000; i++) { + counter.add(random.nextDouble()); + } + double markResult = counter.getResultEstimate(); + + ByteBuffer buffer = ByteBuffer.allocateDirect(serializer.getStorageBytesEstimate()); + serializer.serialize(counter, buffer); + + buffer.flip(); + counter = serializer.deserialize(buffer); + PercentileCounter counter1 = new PercentileCounter(100, 0.5); + counter1.merge(counter); + + assertEquals(markResult, counter1.getResultEstimate(), 0.01); + } + +} http://git-wip-us.apache.org/repos/asf/kylin/blob/95d44121/core-metadata/src/test/java/org/apache/kylin/measure/percentile/TDigestTest.java ---------------------------------------------------------------------- diff --git a/core-metadata/src/test/java/org/apache/kylin/measure/percentile/TDigestTest.java b/core-metadata/src/test/java/org/apache/kylin/measure/percentile/TDigestTest.java new file mode 100644 index 0000000..1adb604 --- /dev/null +++ b/core-metadata/src/test/java/org/apache/kylin/measure/percentile/TDigestTest.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.kylin.measure.percentile; + +import static org.junit.Assert.assertEquals; + +import java.util.Collections; +import java.util.List; +import java.util.Random; + +import org.apache.kylin.common.util.MathUtil; +import org.junit.Ignore; +import org.junit.Test; + +import com.google.common.collect.Lists; +import com.tdunning.math.stats.TDigest; + +@Ignore +public class TDigestTest { + @Test + public void testBasic() { + int times = 1; + int compression = 100; + for (int t = 0; t < times; t++) { + TDigest tDigest = TDigest.createAvlTreeDigest(compression); + Random random = new Random(); + int dataSize = 10000; + List<Double> dataset = Lists.newArrayListWithCapacity(dataSize); + for (int i = 0; i < dataSize; i++) { + double d = random.nextDouble(); + tDigest.add(d); + dataset.add(d); + } + Collections.sort(dataset); + + double actualResult = tDigest.quantile(0.5); + double expectedResult = MathUtil.findMedianInSortedList(dataset); + assertEquals(expectedResult, actualResult, 0.01); + } + } +} http://git-wip-us.apache.org/repos/asf/kylin/blob/95d44121/examples/test_case_data/localmeta/cube_desc/ci_inner_join_cube.json ---------------------------------------------------------------------- diff --git 
a/examples/test_case_data/localmeta/cube_desc/ci_inner_join_cube.json b/examples/test_case_data/localmeta/cube_desc/ci_inner_join_cube.json index 1ebd7f3..0fda3b3 100644 --- a/examples/test_case_data/localmeta/cube_desc/ci_inner_join_cube.json +++ b/examples/test_case_data/localmeta/cube_desc/ci_inner_join_cube.json @@ -267,6 +267,16 @@ }, "returntype" : "raw" } + }, { + "name" : "GVM_PERCENTILE", + "function" : { + "expression" : "PERCENTILE", + "parameter" : { + "type" : "column", + "value" : "TEST_KYLIN_FACT.PRICE" + }, + "returntype" : "percentile(100)" + } } ], "dictionaries": [ { "column": "TEST_KYLIN_FACT.TEST_COUNT_DISTINCT_BITMAP", @@ -358,7 +368,7 @@ "name" : "f3", "columns" : [ { "qualifier" : "m", - "measure_refs" : [ "TEST_EXTENDED_COLUMN", "TRANS_ID_RAW", "PRICE_RAW", "CAL_DT_RAW", "BUYER_CONTACT", "SELLER_CONTACT" ] + "measure_refs" : [ "TEST_EXTENDED_COLUMN", "TRANS_ID_RAW", "PRICE_RAW", "CAL_DT_RAW", "BUYER_CONTACT", "SELLER_CONTACT", "GVM_PERCENTILE" ] } ] } ] }, http://git-wip-us.apache.org/repos/asf/kylin/blob/95d44121/examples/test_case_data/localmeta/cube_desc/ci_left_join_cube.json ---------------------------------------------------------------------- diff --git a/examples/test_case_data/localmeta/cube_desc/ci_left_join_cube.json b/examples/test_case_data/localmeta/cube_desc/ci_left_join_cube.json index 4048b6e..51139ae 100644 --- a/examples/test_case_data/localmeta/cube_desc/ci_left_join_cube.json +++ b/examples/test_case_data/localmeta/cube_desc/ci_left_join_cube.json @@ -267,6 +267,16 @@ }, "returntype" : "raw" } + }, { + "name" : "GVM_PERCENTILE", + "function" : { + "expression" : "PERCENTILE", + "parameter" : { + "type" : "column", + "value" : "TEST_KYLIN_FACT.PRICE" + }, + "returntype" : "percentile(100)" + } } ], "dictionaries": [ { "column": "TEST_KYLIN_FACT.TEST_COUNT_DISTINCT_BITMAP", @@ -358,7 +368,7 @@ "name" : "f3", "columns" : [ { "qualifier" : "m", - "measure_refs" : [ "TEST_EXTENDED_COLUMN", "TRANS_ID_RAW", "PRICE_RAW", 
"CAL_DT_RAW", "BUYER_CONTACT", "SELLER_CONTACT" ] + "measure_refs" : [ "TEST_EXTENDED_COLUMN", "TRANS_ID_RAW", "PRICE_RAW", "CAL_DT_RAW", "BUYER_CONTACT", "SELLER_CONTACT", "GVM_PERCENTILE" ] } ] } ] }, http://git-wip-us.apache.org/repos/asf/kylin/blob/95d44121/kylin-it/src/test/java/org/apache/kylin/query/ITKylinQueryTest.java ---------------------------------------------------------------------- diff --git a/kylin-it/src/test/java/org/apache/kylin/query/ITKylinQueryTest.java b/kylin-it/src/test/java/org/apache/kylin/query/ITKylinQueryTest.java index 82d0873..02134d4 100644 --- a/kylin-it/src/test/java/org/apache/kylin/query/ITKylinQueryTest.java +++ b/kylin-it/src/test/java/org/apache/kylin/query/ITKylinQueryTest.java @@ -192,17 +192,17 @@ public class ITKylinQueryTest extends KylinTestBase { public void testSnowflakeQuery() throws Exception { execAndCompQuery(getQueryFolderPrefix() + "src/test/resources/query/sql_snowflake", null, true); } - + @Test public void testDateTimeQuery() throws Exception { execAndCompQuery(getQueryFolderPrefix() + "src/test/resources/query/sql_datetime", null, true); } - + @Test public void testExtendedColumnQuery() throws Exception { execAndCompQuery(getQueryFolderPrefix() + "src/test/resources/query/sql_extended_column", null, true); } - + @Test public void testLikeQuery() throws Exception { execAndCompQuery(getQueryFolderPrefix() + "src/test/resources/query/sql_like", null, true); @@ -270,8 +270,7 @@ public class ITKylinQueryTest extends KylinTestBase { this.batchExecuteQuery(getQueryFolderPrefix() + "src/test/resources/query/sql_intersect_count"); } } - - + @Test public void testMultiModelQuery() throws Exception { if ("left".equalsIgnoreCase(joinType)) { @@ -280,7 +279,7 @@ public class ITKylinQueryTest extends KylinTestBase { joinType = "left"; } } - + @Test public void testDimDistinctCountQuery() throws Exception { execAndCompQuery(getQueryFolderPrefix() + "src/test/resources/query/sql_distinct_dim", null, true); @@ -392,10 
+391,16 @@ public class ITKylinQueryTest extends KylinTestBase {
         // compare the result
         Assert.assertEquals(expectVersion, queriedVersion);
     }
-    
+
     @Test
     public void testSelectStarColumnCount() throws Exception {
         execAndCompColumnCount("select * from test_kylin_fact limit 10", 11);
         execAndCompColumnCount("select * from test_kylin_fact", 11);
     }
+
+    /** Passes the sql_percentile query folder to batchExecuteQuery, resolved via getQueryFolderPrefix() like the sibling tests. */
+    @Test
+    public void testPercentileQuery() throws Exception {
+        batchExecuteQuery(getQueryFolderPrefix() + "src/test/resources/query/sql_percentile");
+    }
 }
http://git-wip-us.apache.org/repos/asf/kylin/blob/95d44121/kylin-it/src/test/resources/query/sql_percentile/query01.sql
----------------------------------------------------------------------
diff --git a/kylin-it/src/test/resources/query/sql_percentile/query01.sql b/kylin-it/src/test/resources/query/sql_percentile/query01.sql
new file mode 100644
index 0000000..4f6d573
--- /dev/null
+++ b/kylin-it/src/test/resources/query/sql_percentile/query01.sql
@@ -0,0 +1,2 @@
+select seller_id, percentile(price, 0.5) from test_kylin_fact
+group by seller_id
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/kylin/blob/95d44121/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index e391103..e8ccb6c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -102,6 +102,7 @@
         <supercsv.version>2.4.0</supercsv.version>
         <cors.version>2.5</cors.version>
         <tomcat.version>8.5.9</tomcat.version>
+        <t-digest.version>3.1</t-digest.version>
 
         <!-- REST Service -->
         <spring.framework.version>3.2.17.RELEASE</spring.framework.version>
@@ -658,6 +659,11 @@
                 <version>${roaring.version}</version>
             </dependency>
             <dependency>
+                <groupId>com.tdunning</groupId>
+                <artifactId>t-digest</artifactId>
+                <version>${t-digest.version}</version>
+            </dependency>
+            <dependency>
                 <groupId>cglib</groupId>
                 <artifactId>cglib</artifactId>
                 <version>${cglib.version}</version>
http://git-wip-us.apache.org/repos/asf/kylin/blob/95d44121/storage-hbase/pom.xml
---------------------------------------------------------------------- diff --git a/storage-hbase/pom.xml b/storage-hbase/pom.xml index eb2c104..3aea531 100644 --- a/storage-hbase/pom.xml +++ b/storage-hbase/pom.xml @@ -121,6 +121,7 @@ <include>org.apache.kylin:kylin-core-cube</include> <include>com.ning:compress-lzf</include> <include>org.roaringbitmap:RoaringBitmap</include> + <include>com.tdunning:t-digest</include> <!-- below for inverted index only --> <include>com.n3twork.druid:extendedset</include> <include>org.apache.commons:commons-lang3</include>