This is an automated email from the ASF dual-hosted git repository. rongr pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push: new c68b3be360 [multistage] support more agg functions currently implementable (#11208) c68b3be360 is described below commit c68b3be360fafac1c38f3322ed5fa64201d08daf Author: Rong Rong <ro...@apache.org> AuthorDate: Tue Aug 1 09:09:51 2023 -0700 [multistage] support more agg functions currently implementable (#11208) * support the rest of the agg functions * support others trivial ones such as DISTINCT_SUM/AVG, MIN_MAX_RANGE, etc * add DISTINCT_COUNT variances * add PERCENTILE_* variances * added TODO comments for the rest of the functions --------- Co-authored-by: Rong Rong <ro...@startree.ai> --- .../query/planner/logical/LiteralHintUtils.java | 4 +- .../src/test/resources/queries/CountDistinct.json | 103 ++++++++++++-- .../src/test/resources/queries/UDFAggregates.json | 152 ++++++++++++++++++++- .../pinot/segment/spi/AggregationFunctionType.java | 68 ++++++--- 4 files changed, 294 insertions(+), 33 deletions(-) diff --git a/pinot-query-planner/src/main/java/org/apache/pinot/query/planner/logical/LiteralHintUtils.java b/pinot-query-planner/src/main/java/org/apache/pinot/query/planner/logical/LiteralHintUtils.java index 21441b6d43..8f3fa8e2e6 100644 --- a/pinot-query-planner/src/main/java/org/apache/pinot/query/planner/logical/LiteralHintUtils.java +++ b/pinot-query-planner/src/main/java/org/apache/pinot/query/planner/logical/LiteralHintUtils.java @@ -46,13 +46,13 @@ public class LiteralHintUtils { e.getValue().getDataType().name(), e.getValue().getValue())); } // semi-colon is used to separate between encoded literals - return "{" + StringUtils.join(literalStrings, ";") + "}"; + return "{" + StringUtils.join(literalStrings, ";:;") + "}"; } public static Map<Integer, Map<Integer, Literal>> hintStringToLiteralMap(String literalString) { Map<Integer, Map<Integer, Literal>> aggCallToLiteralArgsMap = new HashMap<>(); if (StringUtils.isNotEmpty(literalString) && !"{}".equals(literalString)) { - String[] literalStringArr = literalString.substring(1, literalString.length() - 1).split(";"); + String[] literalStringArr = literalString.substring(1, literalString.length() - 1).split(";:;"); for (String literalStr : literalStringArr) { String[] literalStrParts = literalStr.split("\\|", 4); int aggIdx = Integer.parseInt(literalStrParts[0]); diff --git a/pinot-query-runtime/src/test/resources/queries/CountDistinct.json b/pinot-query-runtime/src/test/resources/queries/CountDistinct.json index f7f8ecd5d4..9c5cea93f3 100644 --- a/pinot-query-runtime/src/test/resources/queries/CountDistinct.json +++ b/pinot-query-runtime/src/test/resources/queries/CountDistinct.json @@ -104,12 +104,55 @@ "sql": "SELECT l.groupingCol, DISTINCTCOUNT(CONCAT(l.val, r.val)) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", "outputs": [["b", 6], ["a", 6]] }, + { + "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ groupingCol, DISTINCTCOUNT(val) FROM {tbl1} GROUP BY groupingCol", + "outputs": [["b", 2], ["a", 2]] + }, + { + "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ l.groupingCol, DISTINCTCOUNT(l.val), DISTINCTCOUNT(r.val) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", + "outputs": [["b", 2, 3], ["a", 2, 3]] + }, + { + "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ l.groupingCol, DISTINCTCOUNT(CONCAT(l.val, r.val)) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", + "outputs": [["b", 6], ["a", 6]] + }, + { + "comments": "table aren't actually partitioned by val thus all segments can produce duplicate results, thus [[6]]", + "sql": "SELECT SEGMENT_PARTITIONED_DISTINCT_COUNT(val) FROM {tbl1}", + "outputs": [[6]] + }, + { + "comments": "table aren't actually partitioned by val thus all segments can produce duplicate results, thus [[b, 4], [a, 4]]", + "sql": "SELECT groupingCol, SEGMENT_PARTITIONED_DISTINCT_COUNT(val) FROM {tbl1} GROUP BY groupingCol", + "outputs": [["b", 4], ["a", 4]] + }, + { + "sql": "SELECT l.groupingCol, SEGMENT_PARTITIONED_DISTINCT_COUNT(l.val), SEGMENT_PARTITIONED_DISTINCT_COUNT(r.val) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", + "outputs": [["b", 2, 3], ["a", 2, 3]] + }, + { + "sql": "SELECT l.groupingCol, SEGMENT_PARTITIONED_DISTINCT_COUNT(CONCAT(l.val, r.val)) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", + "outputs": [["b", 6], ["a", 6]] + }, + { + "comments": "table aren't actually partitioned by val thus all segments can produce duplicate results, thus [[b, 4], [a, 4]]", + "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ groupingCol, SEGMENT_PARTITIONED_DISTINCT_COUNT(val) FROM {tbl1} GROUP BY groupingCol", + "outputs": [["b", 4], ["a", 4]] + }, + { + "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ l.groupingCol, SEGMENT_PARTITIONED_DISTINCT_COUNT(l.val), SEGMENT_PARTITIONED_DISTINCT_COUNT(r.val) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", + "outputs": [["b", 2, 3], ["a", 2, 3]] + }, + { + "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ l.groupingCol, SEGMENT_PARTITIONED_DISTINCT_COUNT(CONCAT(l.val, r.val)) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", + "outputs": [["b", 6], ["a", 6]] + }, { "sql": "SELECT DISTINCTCOUNTHLL(val) FROM {tbl1}", "outputs": [[3]] }, { - "sql": "SELECT groupingCol, DISTINCTCOUNTHLL(val) FROM {tbl1} GROUP BY groupingCol", + "sql": "SELECT groupingCol, DISTINCTCOUNTHLL(val, 8) FROM {tbl1} GROUP BY groupingCol", "outputs": [["b", 2], ["a", 2]] }, { @@ -117,32 +160,76 @@ "outputs": [["b", 2, 3], ["a", 2, 3]] }, { - "sql": "SELECT l.groupingCol, DISTINCTCOUNTHLL(CONCAT(l.val, r.val)) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", + "sql": "SELECT l.groupingCol, DISTINCTCOUNTHLL(CONCAT(l.val, r.val), 8) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", "outputs": [["b", 6], ["a", 6]] }, { - "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ groupingCol, DISTINCTCOUNT(val) FROM {tbl1} GROUP BY groupingCol", + "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ groupingCol, DISTINCTCOUNTHLL(val, 8) FROM {tbl1} GROUP BY groupingCol", "outputs": [["b", 2], ["a", 2]] }, { - "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ l.groupingCol, DISTINCTCOUNT(l.val), DISTINCTCOUNT(r.val) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", + "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ l.groupingCol, DISTINCTCOUNTHLL(l.val), DISTINCTCOUNTHLL(r.val) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", "outputs": [["b", 2, 3], ["a", 2, 3]] }, { - "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ l.groupingCol, DISTINCTCOUNT(CONCAT(l.val, r.val)) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", + "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ l.groupingCol, DISTINCTCOUNTHLL(CONCAT(l.val, r.val), 8) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", "outputs": [["b", 6], ["a", 6]] }, { - "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ groupingCol, DISTINCTCOUNTHLL(val) FROM {tbl1} GROUP BY groupingCol", + "sql": "SELECT DISTINCTCOUNTSMARTHLL(val, 'hllLog2m=2') FROM {tbl1}", + "outputs": [[3]] + }, + { + "sql": "SELECT groupingCol, DISTINCTCOUNTSMARTHLL(val, 'hllConversionThreshold=10;hllLog2m=8') FROM {tbl1} GROUP BY groupingCol", "outputs": [["b", 2], ["a", 2]] }, { - "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ l.groupingCol, DISTINCTCOUNTHLL(l.val), DISTINCTCOUNTHLL(r.val) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", + "sql": "SELECT l.groupingCol, DISTINCTCOUNTSMARTHLL(l.val), DISTINCTCOUNTSMARTHLL(r.val) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", + "outputs": [["b", 2, 3], ["a", 2, 3]] + }, + { + "sql": "SELECT l.groupingCol, DISTINCTCOUNTSMARTHLL(CONCAT(l.val, r.val), 'threshold=10;hllLog2m=2') FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", + "outputs": [["b", 6], ["a", 6]] + }, + { + "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ groupingCol, DISTINCTCOUNTSMARTHLL(val, 'hllConversionThreshold=10;hllLog2m=8') FROM {tbl1} GROUP BY groupingCol", + "outputs": [["b", 2], ["a", 2]] + }, + { + "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ l.groupingCol, DISTINCTCOUNTSMARTHLL(l.val), DISTINCTCOUNTSMARTHLL(r.val) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", "outputs": [["b", 2, 3], ["a", 2, 3]] }, { - "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ l.groupingCol, DISTINCTCOUNTHLL(CONCAT(l.val, r.val)) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", + "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ l.groupingCol, DISTINCTCOUNTSMARTHLL(CONCAT(l.val, r.val), 'threshold=10;hllLog2m=2') FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", "outputs": [["b", 6], ["a", 6]] + }, + { + "sql": "SELECT DISTINCTCOUNTRAWHLL(val, 2) FROM {tbl1}", + "outputs": [["000000020000000400000460"]] + }, + { + "sql": "SELECT groupingCol, DISTINCTCOUNTRAWHLL(val) FROM {tbl1} GROUP BY groupingCol", + "outputs": [["a", "00000008000000ac00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000004000000000000000000000000000000000000000000002000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"], ["b", "00000008000000ac000000000000000000000000000000000000000000000000000000000000000000000000000000 [...] + }, + { + "sql": "SELECT l.groupingCol, DISTINCTCOUNTRAWHLL(l.val, 2), DISTINCTCOUNTRAWHLL(r.val, 2) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", + "outputs": [["a", "000000020000000400000060", "000000020000000400010060"], ["b", "000000020000000400000420", "000000020000000400000480"]] + }, + { + "sql": "SELECT l.groupingCol, DISTINCTCOUNTRAWHLL(CONCAT(l.val, r.val), 2) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", + "outputs": [["a", "000000020000000400028842"], ["b", "000000020000000400008405"]] + }, + { + "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ groupingCol, DISTINCTCOUNTRAWHLL(val) FROM {tbl1} GROUP BY groupingCol", + "outputs": [["a", "00000008000000ac00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000004000000000000000000000000000000000000000000002000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000"], ["b", "00000008000000ac000000000000000000000000000000000000000000000000000000000000000000000000000000 [...] + }, + { + "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ l.groupingCol, DISTINCTCOUNTRAWHLL(l.val, 2), DISTINCTCOUNTRAWHLL(r.val, 2) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", + "outputs": [["a", "000000020000000400000060", "000000020000000400010060"], ["b", "000000020000000400000420", "000000020000000400000480"]] + }, + { + "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ l.groupingCol, DISTINCTCOUNTRAWHLL(CONCAT(l.val, r.val), 2) FROM {tbl1} l JOIN {tbl2} r ON l.groupingCol = r.groupingCol GROUP BY l.groupingCol", + "outputs": [["a", "000000020000000400028842"], ["b", "000000020000000400008405"]] } ] } diff --git a/pinot-query-runtime/src/test/resources/queries/UDFAggregates.json b/pinot-query-runtime/src/test/resources/queries/UDFAggregates.json index 1bc39d0953..57bc0d62f1 100644 --- a/pinot-query-runtime/src/test/resources/queries/UDFAggregates.json +++ b/pinot-query-runtime/src/test/resources/queries/UDFAggregates.json @@ -48,6 +48,82 @@ "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ string_col, LASTWITHTIME(float_col, timestamp_col, 'FLOAT'), LASTWITHTIME(double_col, timestamp_col, 'DOUBLE'), last_with_time(long_col, CAST(long_col AS TIMESTAMP), 'LONG') FROM {tbl} GROUP BY string_col", "outputs": [["a", 400, 400, 2], ["b", 1, 1, 100], ["c", 1.01, 1.01, 175]] }, + { + "sql": "SELECT SUMPRECISION(decimal_col) FROM {tbl}", + "outputs": [["10000000000100000000110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 [...] + }, + { + "sql": "SELECT string_col, SUMPRECISION(decimal_col) FROM {tbl} GROUP BY string_col", + "outputs": [["a" ,"10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 [...] + }, + { + "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ string_col, SUMPRECISION(decimal_col) FROM {tbl} GROUP BY string_col", + "outputs": [["a" ,"10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 [...] + }, + { + "sql": "select min_max_range(int_col), minMaxRange(long_col), MIN_MAX_RANGE(float_col), MINMAXRANGE(double_col) FROM {tbl}", + "outputs": [[173, 173, 399.0, 399.0]] + }, + { + "sql": "select bool_col, min_max_range(int_col), minMaxRange(long_col), MIN_MAX_RANGE(float_col), MINMAXRANGE(double_col) FROM {tbl} GROUP BY bool_col", + "outputs": [[true, 173, 173, 398.25, 398.25], [false, 147, 147, 99.0, 99.0]] + }, + { + "sql": "select /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ bool_col, min_max_range(int_col), minMaxRange(long_col), MIN_MAX_RANGE(float_col), MINMAXRANGE(double_col) FROM {tbl} GROUP BY bool_col", + "outputs": [[true, 173, 173, 398.25, 398.25], [false, 147, 147, 99.0, 99.0]] + }, + { + "sql": "select distinct_sum(int_col), distinctSum(long_col), DISTINCT_SUM(float_col), DISTINCTSUM(double_col) FROM {tbl}", + "outputs": [[531, 531, 805.26, 805.26]] + }, + { + "sql": "select string_col, distinct_sum(int_col), distinctSum(long_col), DISTINCT_SUM(float_col), DISTINCTSUM(double_col) FROM {tbl} GROUP BY string_col", + "outputs": [["a", 2, 2, 700.0, 700.0], ["b", 103, 103, 101.0, 101.0], ["c", 426, 426, 4.26, 4.26]] + }, + { + "sql": "select /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ string_col, distinct_sum(int_col), distinctSum(long_col), DISTINCT_SUM(float_col), DISTINCTSUM(double_col) FROM {tbl} GROUP BY string_col", + "outputs": [["a", 2, 2, 700.0, 700.0], ["b", 103, 103, 101.0, 101.0], ["c", 426, 426, 4.26, 4.26]] + }, + { + "sql": "select distinct_avg(int_col), distinctAvg(long_col), DISTINCT_AVG(float_col), DISTINCTAVG(double_col) FROM {tbl}", + "outputs": [[88.5, 88.5, 115.03714285, 115.03714285]] + }, + { + "sql": "select string_col, distinct_avg(int_col), distinctAvg(long_col), DISTINCT_AVG(float_col), DISTINCTAVG(double_col) FROM {tbl} GROUP BY string_col", + "outputs": [["a", 2.0, 2.0, 350.0, 350.0], ["b", 51.5, 51.5, 50.5, 50.5], ["c", 142.0, 142.0, 1.42, 1.42]] + }, + { + "sql": "select /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ string_col, distinct_avg(int_col), distinctAvg(long_col), DISTINCT_AVG(float_col), DISTINCTAVG(double_col) FROM {tbl} GROUP BY string_col", + "outputs": [["a", 2.0, 2.0, 350.0, 350.0], ["b", 51.5, 51.5, 50.5, 50.5], ["c", 142.0, 142.0, 1.42, 1.42]] + } + ] + }, + "pinot_percentile_agg_udf_tests": { + "tables": { + "tbl": { + "schema": [ + {"name": "int_col", "type": "INT"}, + {"name": "long_col", "type": "LONG"}, + {"name": "float_col", "type": "FLOAT"}, + {"name": "double_col", "type": "DOUBLE"}, + {"name": "string_col", "type": "STRING"}, + {"name": "bool_col", "type": "BOOLEAN"}, + {"name": "decimal_col", "type": "BIG_DECIMAL"}, + {"name": "timestamp_col", "type": "TIMESTAMP"}, + {"name": "bytes_col", "type": "BYTES"} + ], + "inputs": [ + [2, 2, 300, 300, "a", true, "1E-307", "2020-01-01 03:10:12", "DEADBEEF"], + [2, 2, 400, 400, "a", true, "1E+307", "2020-01-01 03:39:12", "FDEADBEE"], + [3, 3, 100, 100, "b", false, "1E-308", "2020-01-01 08:32:12", "EFDEADBE"], + [100, 100, 1, 1, "b", false, "1E+308", "2020-02-06 03:32:12", "FDEADBEE"], + [101, 101, 1.01, 1.01, "c", false, "1E-317", "2020-05-01 03:32:12", "DEADBEEF"], + [150, 150, 1.5, 1.5, "c", false, "1E+317", "2020-02-03 03:32:12", "EFDEADBE"], + [175, 175, 1.75, 1.75, "c", true, "1E+328", "2020-01-02 03:32:12", "EFDEADBE"] + ] + } + }, + "queries": [ { "sql": "SELECT PERCENTILE(float_col, 50), PERCENTILE(double_col, 5), PERCENTILE(int_col, 75), PERCENTILE(long_col, 75) FROM {tbl}", "outputs": [[1.75, 1, 150, 150]] @@ -61,17 +137,69 @@ "outputs": [["a", 400, 300, 2, 2], ["b", 100, 1, 100, 100], ["c", 1.5, 1.01, 175, 175]] }, { - "sql": "SELECT SUMPRECISION(decimal_col) FROM {tbl}", - "outputs": [["10000000000100000000110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 [...] + "sql": "SELECT PERCENTILE_EST(float_col, 50), PERCENTILE_EST(double_col, 5), PERCENTILE_EST(int_col, 75), PERCENTILE_EST(long_col, 75) FROM {tbl}", + "outputs": [[1.0, 1.0, 150, 150]] }, { - "sql": "SELECT string_col, SUMPRECISION(decimal_col) FROM {tbl} GROUP BY string_col", - "outputs": [["a" ,"10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 [...] + "sql": "SELECT bool_col, PERCENTILE_EST(float_col, 50), PERCENTILE_EST(double_col, 5), PERCENTILE_EST(int_col, 75), PERCENTILE_EST(long_col, 75) FROM {tbl} GROUP BY bool_col", + "outputs": [[false, 1.0, 1.0, 150, 150], [true, 300, 1.0, 175, 175]] }, { - "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ string_col, SUMPRECISION(decimal_col) FROM {tbl} GROUP BY string_col", - "outputs": [["a" ,"10000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000.000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 [...] + "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ string_col, PERCENTILE_EST(float_col, 50), PERCENTILE_EST(double_col, 5), PERCENTILE_EST(int_col, 75), PERCENTILE_EST(long_col, 75) FROM {tbl} GROUP BY string_col", + "outputs": [["a", 400, 300, 2, 2], ["b", 100, 1, 100, 100], ["c", 1.0, 1.0, 175, 175]] + }, + { + "sql": "SELECT PERCENTILE_TDIGEST(float_col, 50), PERCENTILE_TDIGEST(double_col, 5), PERCENTILE_TDIGEST(int_col, 75), PERCENTILE_TDIGEST(long_col, 75) FROM {tbl}", + "outputs": [[1.75, 1.0, 137, 137]] + }, + { + "sql": "SELECT bool_col, PERCENTILE_TDIGEST(float_col, 50), PERCENTILE_TDIGEST(double_col, 5), PERCENTILE_TDIGEST(int_col, 75), PERCENTILE_TDIGEST(long_col, 75) FROM {tbl} GROUP BY bool_col", + "outputs": [[false, 1.255, 1.0, 125, 125], [true, 300, 1.75, 131, 131]] + }, + { + "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ string_col, PERCENTILE_TDIGEST(float_col, 50), PERCENTILE_TDIGEST(double_col, 5), PERCENTILE_TDIGEST(int_col, 75), PERCENTILE_TDIGEST(long_col, 75) FROM {tbl} GROUP BY string_col", + "outputs": [["a", 350, 300, 2, 2], ["b", 50.5, 1, 100, 100], ["c", 1.5, 1.01, 168, 168]] + }, + { + "sql": "SELECT PERCENTILE_KLL(float_col, 50), PERCENTILE_KLL(double_col, 5), PERCENTILE_KLL(int_col, 75), PERCENTILE_KLL(long_col, 75) FROM {tbl}", + "outputs": [[1.75, 1, 150, 150]] }, + { + "sql": "SELECT bool_col, PERCENTILE_KLL(float_col, 50), PERCENTILE_KLL(double_col, 5), PERCENTILE_KLL(int_col, 75), PERCENTILE_KLL(long_col, 75) FROM {tbl} GROUP BY bool_col", + "outputs": [[false, 1.01, 1, 101, 101], [true, 300, 1.75, 175, 175]] + }, + { + "sql": "SELECT /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ string_col, PERCENTILE_KLL(float_col, 50), PERCENTILE_KLL(double_col, 5), PERCENTILE_KLL(int_col, 75), PERCENTILE_KLL(long_col, 75) FROM {tbl} GROUP BY string_col", + "outputs": [["a", 300, 300, 2, 2], ["b", 1, 1, 100, 100], ["c", 1.5, 1.01, 175, 175]] + } + ] + }, + "pinot_sketch_agg_udf_tests": { + "tables": { + "tbl": { + "schema": [ + {"name": "int_col", "type": "INT"}, + {"name": "long_col", "type": "LONG"}, + {"name": "float_col", "type": "FLOAT"}, + {"name": "double_col", "type": "DOUBLE"}, + {"name": "string_col", "type": "STRING"}, + {"name": "bool_col", "type": "BOOLEAN"}, + {"name": "decimal_col", "type": "BIG_DECIMAL"}, + {"name": "timestamp_col", "type": "TIMESTAMP"}, + {"name": "bytes_col", "type": "BYTES"} + ], + "inputs": [ + [2, 2, 300, 300, "a", true, "1E-307", "2020-01-01 03:10:12", "DEADBEEF"], + [2, 2, 400, 400, "a", true, "1E+307", "2020-01-01 03:39:12", "FDEADBEE"], + [3, 3, 100, 100, "b", false, "1E-308", "2020-01-01 08:32:12", "EFDEADBE"], + [100, 100, 1, 1, "b", false, "1E+308", "2020-02-06 03:32:12", "FDEADBEE"], + [101, 101, 1.01, 1.01, "c", false, "1E-317", "2020-05-01 03:32:12", "DEADBEEF"], + [150, 150, 1.5, 1.5, "c", false, "1E+317", "2020-02-03 03:32:12", "EFDEADBE"], + [175, 175, 1.75, 1.75, "c", true, "1E+328", "2020-01-02 03:32:12","EFDEADBE"] + ] + } + }, + "queries": [ { "sql": "select GET_THETA_SKETCH_ESTIMATE(DISTINCT_COUNT_RAW_THETA_SKETCH(string_col, 'nominalEntries=16')), GET_THETA_SKETCH_ESTIMATE(THETA_SKETCH_DIFF(DISTINCT_COUNT_RAW_THETA_SKETCH(int_col, ''), DISTINCT_COUNT_RAW_THETA_SKETCH(int_col, ''))), GET_THETA_SKETCH_ESTIMATE(THETA_SKETCH_UNION(DISTINCT_COUNT_RAW_THETA_SKETCH(int_col, ''), DISTINCT_COUNT_RAW_THETA_SKETCH(long_col, ''))), GET_THETA_SKETCH_ESTIMATE(THETA_SKETCH_INTERSECT(DISTINCT_COUNT_RAW_THETA_SKETCH(double_col, ''), [...] "outputs": [[3, 0, 6, 6]] @@ -83,6 +211,18 @@ { "sql": "select /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ bool_col, GET_THETA_SKETCH_ESTIMATE(DISTINCT_COUNT_RAW_THETA_SKETCH(string_col, 'nominalEntries=16')), GET_THETA_SKETCH_ESTIMATE(THETA_SKETCH_DIFF(DISTINCT_COUNT_RAW_THETA_SKETCH(int_col, ''), DISTINCT_COUNT_RAW_THETA_SKETCH(int_col, ''))), GET_THETA_SKETCH_ESTIMATE(THETA_SKETCH_UNION(DISTINCT_COUNT_RAW_THETA_SKETCH(int_col, ''), DISTINCT_COUNT_RAW_THETA_SKETCH(long_col, ''))), GET_THETA_SKETCH_ESTIMATE(THETA_S [...] "outputs": [[true, 2, 0, 2, 3], [false, 2, 0, 4, 3]] + }, + { + "sql": "select distinct_count_theta_sketch(int_col), distinctCountThetaSketch(long_col), DISTINCT_COUNT_THETA_SKETCH(float_col), DISTINCTCOUNTTHETASKETCH(double_col) FROM {tbl}", + "outputs": [[6, 6, 7, 7]] + }, + { + "sql": "select string_col, distinct_count_theta_sketch(int_col), distinctCountThetaSketch(long_col), DISTINCT_COUNT_THETA_SKETCH(float_col), DISTINCTCOUNTTHETASKETCH(double_col) FROM {tbl} GROUP BY string_col", + "outputs": [["a", 1, 1, 2, 2], ["b", 2, 2, 2, 2], ["c", 3, 3, 3, 3]] + }, + { + "sql": "select /*+ aggOptions(is_skip_leaf_stage_aggregate='true') */ string_col, distinct_count_theta_sketch(int_col), distinctCountThetaSketch(long_col), DISTINCT_COUNT_THETA_SKETCH(float_col), DISTINCTCOUNTTHETASKETCH(double_col) FROM {tbl} GROUP BY string_col", + "outputs": [["a", 1, 1, 2, 2], ["b", 2, 2, 2, 2], ["c", 3, 3, 3, 3]] } ] } diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/AggregationFunctionType.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/AggregationFunctionType.java index 5f7ed4a748..77a9199727 100644 --- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/AggregationFunctionType.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/AggregationFunctionType.java @@ -63,6 +63,7 @@ public enum AggregationFunctionType { ReturnTypes.AGG_SUM_EMPTY_IS_ZERO, ReturnTypes.explicit(SqlTypeName.DOUBLE)), SUMPRECISION("sumPrecision", null, SqlKind.OTHER_FUNCTION, SqlFunctionCategory.USER_DEFINED_FUNCTION, OperandTypes.ANY, ReturnTypes.explicit(SqlTypeName.DECIMAL), ReturnTypes.explicit(SqlTypeName.OTHER)), + // NO NEEDED in v2, AVG is compiled as SUM/COUNT AVG("avg"), MODE("mode"), @@ -72,7 +73,8 @@ public enum AggregationFunctionType { LASTWITHTIME("lastWithTime", null, SqlKind.OTHER_FUNCTION, SqlFunctionCategory.USER_DEFINED_FUNCTION, OperandTypes.family(ImmutableList.of(SqlTypeFamily.ANY, SqlTypeFamily.TIMESTAMP, SqlTypeFamily.CHARACTER)), ReturnTypes.ARG0, ReturnTypes.explicit(SqlTypeName.OTHER)), - MINMAXRANGE("minMaxRange"), + MINMAXRANGE("minMaxRange", null, SqlKind.OTHER_FUNCTION, SqlFunctionCategory.NUMERIC, + OperandTypes.NUMERIC, ReturnTypes.ARG0, ReturnTypes.explicit(SqlTypeName.OTHER)), /** * for all distinct count family functions: * (1) distinct_count only supports single argument; @@ -81,41 +83,69 @@ public enum AggregationFunctionType { DISTINCTCOUNT("distinctCount", null, SqlKind.OTHER_FUNCTION, SqlFunctionCategory.USER_DEFINED_FUNCTION, OperandTypes.ANY, ReturnTypes.BIGINT, ReturnTypes.explicit(SqlTypeName.OTHER)), + // TODO: support bitmap and segment partition in V2 DISTINCTCOUNTBITMAP("distinctCountBitmap"), - SEGMENTPARTITIONEDDISTINCTCOUNT("segmentPartitionedDistinctCount"), - DISTINCTCOUNTHLL("distinctCountHLL", ImmutableList.of("DISTINCT_COUNT_HLL"), SqlKind.OTHER_FUNCTION, + SEGMENTPARTITIONEDDISTINCTCOUNT("segmentPartitionedDistinctCount", null, SqlKind.OTHER_FUNCTION, SqlFunctionCategory.USER_DEFINED_FUNCTION, OperandTypes.ANY, ReturnTypes.BIGINT, - ReturnTypes.explicit(SqlTypeName.OTHER)), - DISTINCTCOUNTRAWHLL("distinctCountRawHLL"), - DISTINCTCOUNTSMARTHLL("distinctCountSmartHLL"), + ReturnTypes.BIGINT), + DISTINCTCOUNTHLL("distinctCountHLL", ImmutableList.of("DISTINCT_COUNT_HLL"), SqlKind.OTHER_FUNCTION, + SqlFunctionCategory.USER_DEFINED_FUNCTION, + OperandTypes.family(ImmutableList.of(SqlTypeFamily.ANY, SqlTypeFamily.NUMERIC), ordinal -> ordinal > 0), + ReturnTypes.BIGINT, ReturnTypes.explicit(SqlTypeName.OTHER)), + DISTINCTCOUNTRAWHLL("distinctCountRawHLL", ImmutableList.of("DISTINCT_COUNT_RAW_HLL"), SqlKind.OTHER_FUNCTION, + SqlFunctionCategory.USER_DEFINED_FUNCTION, + OperandTypes.family(ImmutableList.of(SqlTypeFamily.ANY, SqlTypeFamily.INTEGER), ordinal -> ordinal > 0), + ReturnTypes.VARCHAR_2000, ReturnTypes.explicit(SqlTypeName.OTHER)), + DISTINCTCOUNTSMARTHLL("distinctCountSmartHLL", ImmutableList.of("DISTINCT_COUNT_SMART_HLL"), SqlKind.OTHER_FUNCTION, + SqlFunctionCategory.USER_DEFINED_FUNCTION, + OperandTypes.family(ImmutableList.of(SqlTypeFamily.ANY, SqlTypeFamily.CHARACTER), ordinal -> ordinal > 0), + ReturnTypes.BIGINT, ReturnTypes.explicit(SqlTypeName.OTHER)), + // DEPRECATED in v2 FASTHLL("fastHLL"), - DISTINCTCOUNTTHETASKETCH("distinctCountThetaSketch", ImmutableList.of("DISTINCT_COUNT_THETA_SKETCH"), + DISTINCTCOUNTTHETASKETCH("distinctCountThetaSketch", null, SqlKind.OTHER_FUNCTION, SqlFunctionCategory.USER_DEFINED_FUNCTION, OperandTypes.family(ImmutableList.of(SqlTypeFamily.ANY, SqlTypeFamily.CHARACTER), ordinal -> ordinal > 0), ReturnTypes.BIGINT, ReturnTypes.explicit(SqlTypeName.OTHER)), - DISTINCTCOUNTRAWTHETASKETCH("distinctCountRawThetaSketch", ImmutableList.of("DISTINCT_COUNT_RAW_THETA_SKETCH"), + DISTINCTCOUNTRAWTHETASKETCH("distinctCountRawThetaSketch", null, SqlKind.OTHER_FUNCTION, SqlFunctionCategory.USER_DEFINED_FUNCTION, OperandTypes.family(ImmutableList.of(SqlTypeFamily.ANY, SqlTypeFamily.CHARACTER), ordinal -> ordinal > 0), ReturnTypes.VARCHAR_2000, ReturnTypes.explicit(SqlTypeName.OTHER)), - DISTINCTSUM("distinctSum"), - DISTINCTAVG("distinctAvg"), + DISTINCTSUM("distinctSum", null, SqlKind.OTHER_FUNCTION, SqlFunctionCategory.NUMERIC, + OperandTypes.NUMERIC, ReturnTypes.AGG_SUM, ReturnTypes.explicit(SqlTypeName.OTHER)), + DISTINCTAVG("distinctAvg", null, SqlKind.OTHER_FUNCTION, SqlFunctionCategory.NUMERIC, + OperandTypes.NUMERIC, ReturnTypes.explicit(SqlTypeName.DOUBLE), ReturnTypes.explicit(SqlTypeName.OTHER)), PERCENTILE("percentile", null, SqlKind.OTHER_FUNCTION, SqlFunctionCategory.USER_DEFINED_FUNCTION, OperandTypes.family(ImmutableList.of(SqlTypeFamily.NUMERIC, SqlTypeFamily.NUMERIC)), ReturnTypes.ARG0, ReturnTypes.explicit(SqlTypeName.OTHER)), - PERCENTILEEST("percentileEst"), - PERCENTILERAWEST("percentileRawEst"), - PERCENTILETDIGEST("percentileTDigest"), - PERCENTILERAWTDIGEST("percentileRawTDigest"), + PERCENTILEEST("percentileEst", null, SqlKind.OTHER_FUNCTION, SqlFunctionCategory.USER_DEFINED_FUNCTION, + OperandTypes.family(ImmutableList.of(SqlTypeFamily.NUMERIC, SqlTypeFamily.NUMERIC)), ReturnTypes.ARG0, + ReturnTypes.explicit(SqlTypeName.OTHER)), + PERCENTILERAWEST("percentileRawEst", null, SqlKind.OTHER_FUNCTION, SqlFunctionCategory.USER_DEFINED_FUNCTION, + OperandTypes.family(ImmutableList.of(SqlTypeFamily.NUMERIC, SqlTypeFamily.NUMERIC)), ReturnTypes.VARCHAR_2000, + ReturnTypes.explicit(SqlTypeName.OTHER)), + PERCENTILETDIGEST("percentileTDigest", null, SqlKind.OTHER_FUNCTION, SqlFunctionCategory.USER_DEFINED_FUNCTION, + OperandTypes.family(ImmutableList.of(SqlTypeFamily.NUMERIC, SqlTypeFamily.NUMERIC)), ReturnTypes.ARG0, + ReturnTypes.explicit(SqlTypeName.OTHER)), + PERCENTILERAWTDIGEST("percentileRawTDigest", null, SqlKind.OTHER_FUNCTION, SqlFunctionCategory.USER_DEFINED_FUNCTION, + OperandTypes.family(ImmutableList.of(SqlTypeFamily.NUMERIC, SqlTypeFamily.NUMERIC)), ReturnTypes.VARCHAR_2000, + ReturnTypes.explicit(SqlTypeName.OTHER)), + // DEPRECATED in v2 PERCENTILESMARTTDIGEST("percentileSmartTDigest"), - PERCENTILEKLL("percentileKLL"), - PERCENTILERAWKLL("percentileRawKLL"), + PERCENTILEKLL("percentileKLL", null, SqlKind.OTHER_FUNCTION, SqlFunctionCategory.USER_DEFINED_FUNCTION, + OperandTypes.family(ImmutableList.of(SqlTypeFamily.NUMERIC, SqlTypeFamily.NUMERIC)), ReturnTypes.ARG0, + ReturnTypes.explicit(SqlTypeName.OTHER)), + PERCENTILERAWKLL("percentileRawKLL", null, SqlKind.OTHER_FUNCTION, SqlFunctionCategory.USER_DEFINED_FUNCTION, + OperandTypes.family(ImmutableList.of(SqlTypeFamily.NUMERIC, SqlTypeFamily.NUMERIC)), ReturnTypes.VARCHAR_2000, + ReturnTypes.explicit(SqlTypeName.OTHER)), + // DEPRECATED in v2 IDSET("idSet"), + // TODO: support histogram requires solving ARRAY constructor and multi-function signature without optional ordinal HISTOGRAM("histogram"), - // TODO: support underscore separated version of the stats functions + // TODO: support underscore separated version of the stats functions, resolving conflict in SqlStdOptTable // currently Pinot is missing generated agg functions impl from Calcite's AggregateReduceFunctionsRule COVARPOP("covarPop", Collections.emptyList(), SqlKind.OTHER_FUNCTION, SqlFunctionCategory.USER_DEFINED_FUNCTION, OperandTypes.family(ImmutableList.of(SqlTypeFamily.NUMERIC, SqlTypeFamily.NUMERIC)), ReturnTypes.DOUBLE, @@ -137,6 +167,7 @@ public enum AggregationFunctionType { OperandTypes.NUMERIC, ReturnTypes.DOUBLE, ReturnTypes.explicit(SqlTypeName.OTHER)), FOURTHMOMENT("fourthMoment"), + // TODO: revisit support for Tuple sketches in V2 // DataSketches Tuple Sketch support DISTINCTCOUNTTUPLESKETCH("distinctCountTupleSketch"), @@ -146,6 +177,7 @@ public enum AggregationFunctionType { SUMVALUESINTEGERSUMTUPLESKETCH("sumValuesIntegerSumTupleSketch"), AVGVALUEINTEGERSUMTUPLESKETCH("avgValueIntegerSumTupleSketch"), + // TODO: revisit support for Geo-spatial agg in V2 // Geo aggregation functions STUNION("STUnion", ImmutableList.of("ST_UNION"), SqlKind.OTHER_FUNCTION, SqlFunctionCategory.USER_DEFINED_FUNCTION, OperandTypes.BINARY, ReturnTypes.explicit(SqlTypeName.VARBINARY), ReturnTypes.explicit(SqlTypeName.OTHER)), @@ -209,6 +241,7 @@ public enum AggregationFunctionType { OperandTypes.BOOLEAN, ReturnTypes.BOOLEAN, ReturnTypes.explicit(SqlTypeName.INTEGER)), // argMin and argMax + // TODO: argmin/argmax syntax not conformed with Calcite. we need to migrate. ARGMIN("argMin"), ARGMAX("argMax"), PARENTARGMIN(CommonConstants.RewriterConstants.PARENT_AGGREGATION_NAME_PREFIX + ARGMIN.getName()), @@ -217,6 +250,7 @@ public enum AggregationFunctionType { CHILDARGMAX(CommonConstants.RewriterConstants.CHILD_AGGREGATION_NAME_PREFIX + ARGMAX.getName()), // funnel aggregate functions + // TODO: revisit support for funnel count in V2 FUNNELCOUNT("funnelCount"); private static final Set<String> NAMES = Arrays.stream(values()).flatMap(func -> Stream.of(func.name(), --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org