This is an automated email from the ASF dual-hosted git repository. linzhongcheng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new ff9e03e2bf [Feature](add bitmap udaf) add the bitmap intersection and difference set for mixed calculation of udaf (#15588) ff9e03e2bf is described below commit ff9e03e2bf8ca5b4cdd6fe44b129db868e84ddb4 Author: zhbinbin <16679214+zhbin...@users.noreply.github.com> AuthorDate: Tue Mar 14 20:40:37 2023 +0800 [Feature](add bitmap udaf) add the bitmap intersection and difference set for mixed calculation of udaf (#15588) * Add the bitmap intersection and difference set for mixed calculation of udaf Co-authored-by: zhangbinbin05 <zhangbinbi...@baidu.com> --- be/src/util/bitmap_expr_calculation.h | 216 +++++++++++++++++++++ be/src/util/bitmap_intersect.h | 5 +- .../aggregate_function_orthogonal_bitmap.cpp | 20 +- .../aggregate_function_orthogonal_bitmap.h | 99 +++++++++- docs/en/docs/advanced/orthogonal-bitmap-manual.md | 41 ++++ .../orthogonal_bitmap_expr_calculate.md | 47 +++++ .../orthogonal_bitmap_expr_calculate_count.md | 47 +++++ docs/sidebars.json | 2 + .../docs/advanced/orthogonal-bitmap-manual.md | 42 ++++ .../orthogonal_bitmap_expr_calculate.md | 47 +++++ .../orthogonal_bitmap_expr_calculate_count.md | 47 +++++ .../apache/doris/analysis/FunctionCallExpr.java | 4 +- .../apache/doris/catalog/AggregateFunction.java | 1 + .../java/org/apache/doris/catalog/FunctionSet.java | 43 ++++ 14 files changed, 655 insertions(+), 6 deletions(-) diff --git a/be/src/util/bitmap_expr_calculation.h b/be/src/util/bitmap_expr_calculation.h new file mode 100644 index 0000000000..24784cc957 --- /dev/null +++ b/be/src/util/bitmap_expr_calculation.h @@ -0,0 +1,216 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +#pragma once +#include <stack> +#include <string> + +#include "util/bitmap_intersect.h" + +namespace doris { + +// Compute the intersection union difference set of two or more bitmaps +// Usage: orthogonal_bitmap_parse_calculate(bitmap_column, filter_column, input_string) +// Example: orthogonal_bitmap_expr_calculate(user_id, event, '(A|B)&(C-D)'), meaning find the intersection union difference set of user_id in all A/B/C/D 4 bitmaps +// Operation symbol: +// the operator '|' stands for union, the operator '&' stands for intersection, the operator '-' indicates the difference set, the operator '^' stands for xor +class BitmapExprCalculation : public BitmapIntersect<std::string> { +public: + BitmapExprCalculation() = default; + + explicit BitmapExprCalculation(const char* src) { deserialize(src); } + + void bitmap_calculation_init(std::string& input_str) { + _polish = reverse_polish(input_str); + std::string bitmap_key; + for (int i = 0; i < _polish.length(); i++) { + char c = _polish.at(i); + if (c != '&' && c != '|' && c != '^' && c != '-' && c != ' ' && c != '\\') { + bitmap_key += c; + } else if (i != 0 && _polish.at(i - 1) == '\\') { + bitmap_key += c; + } else if (c == '\\') { + continue; + } else { + if (bitmap_key.length() > 0) { + add_key(bitmap_key); + bitmap_key.clear(); + } + } + } + if (bitmap_key.length() > 0) { + add_key(bitmap_key); + bitmap_key.clear(); + } + } + + BitmapValue bitmap_calculate() { + std::stack<BitmapValue> values; + std::string bitmap_key; + for (int i = 0; i < _polish.length(); i++) { + char c = _polish.at(i); + if (c == ' ') { + if (bitmap_key.length() > 0) { + values.push(_bitmaps[bitmap_key]); + bitmap_key.clear(); + } + } else if (c != '&' && c != '|' && c != '^' && c != '-' && c != '\\') { + bitmap_key += c; + } else if (i != 0 && _polish.at(i - 1) == '\\') { + bitmap_key += c; + } else if (c == '\\') { + continue; + } else { + if (bitmap_key.length() > 0) { + values.push(_bitmaps[bitmap_key]); + bitmap_key.clear(); + } + if (values.size() >= 2) { + BitmapValue op_a = values.top(); + values.pop(); + BitmapValue op_b = values.top(); + values.pop(); + BitmapValue cal_result; + bitmap_calculate(op_a, op_b, c, cal_result); + values.push(cal_result); + } + } + } + BitmapValue result; + if (bitmap_key.length() > 0) { + result |= _bitmaps[bitmap_key]; + } else if (!values.empty()) { + result |= values.top(); + } + return result; + } + + // calculate the bitmap value by expr bitmap calculate + int64_t bitmap_calculate_count() { + if (_bitmaps.empty()) { + return 0; + } + return bitmap_calculate().cardinality(); + } + +private: + constexpr int priority(char c) { + switch (c) { + case '&': + return 1; + case '|': + return 1; + case '^': + return 1; + case '-': + return 1; + default: + return 0; + } + } + + template <class T> + std::string print_stack(std::stack<T>& stack) { + std::string result; + while (!stack.empty()) { + result = stack.top() + result; + stack.pop(); + } + return result; + } + + std::string reverse_polish(const std::string& input_str) { + std::stack<char> polish; + std::stack<char> op_stack; + bool last_is_char = false; + for (int i = 0; i < input_str.length(); i++) { + char cur_char = input_str.at(i); + if (cur_char != '&' && cur_char != '|' && cur_char != '^' && cur_char != '-' && + cur_char != '(' && cur_char != ')' && cur_char != ' ' && cur_char != '\t') { + if (!last_is_char) { + polish.push(' '); + } + polish.push(cur_char); + last_is_char = true; + continue; + } else if (i != 0 && input_str.at(i - 1) == '\\') { + polish.push(cur_char); + last_is_char = true; + continue; + } else if (cur_char == ' ' || cur_char == '\t') { + last_is_char = false; + continue; + } else if (cur_char == '(') { + op_stack.push(cur_char); + } else if (!op_stack.empty() && cur_char == ')') { + while (!op_stack.empty() && op_stack.top() != '(') { + polish.push(op_stack.top()); + op_stack.pop(); + } + op_stack.pop(); + } else { + if (!op_stack.empty() && op_stack.top() == '(') { + op_stack.push(cur_char); + } else { + if (!op_stack.empty() && priority(cur_char) > priority(op_stack.top())) { + op_stack.push(cur_char); + } else { + while (!op_stack.empty()) { + if (op_stack.top() == '(') { + break; + } + if (priority(cur_char) <= priority(op_stack.top())) { + polish.push(op_stack.top()); + op_stack.pop(); + } else { + break; + } + } + op_stack.push(cur_char); + } + } + } + last_is_char = false; + } + + while (!op_stack.empty()) { + polish.push(op_stack.top()); + op_stack.pop(); + } + return print_stack(polish); + } + + void bitmap_calculate(BitmapValue& op_a, BitmapValue& op_b, char op, BitmapValue& result) { + result |= op_b; + switch (op) { + case '&': + result &= op_a; + break; + case '|': + result |= op_a; + break; + case '-': + result -= op_a; + break; + case '^': + result ^= op_a; + break; + } + } + + std::string _polish; +}; +} // namespace doris diff --git a/be/src/util/bitmap_intersect.h b/be/src/util/bitmap_intersect.h index 8258ee21eb..e64ad6ec54 100644 --- a/be/src/util/bitmap_intersect.h +++ b/be/src/util/bitmap_intersect.h @@ -148,7 +148,6 @@ void Helper::read_from<std::string>(const char** src, std::string* result) { *src += length; } // read_from end - } // namespace detail // Calculate the intersection of two or more bitmaps @@ -238,7 +237,7 @@ public: } } -private: +protected: std::map<T, BitmapValue> _bitmaps; }; @@ -325,7 +324,7 @@ public: } } -private: +protected: phmap::flat_hash_map<std::string, BitmapValue> _bitmaps; }; diff --git a/be/src/vec/aggregate_functions/aggregate_function_orthogonal_bitmap.cpp b/be/src/vec/aggregate_functions/aggregate_function_orthogonal_bitmap.cpp index d894a5be06..b1ef0eb9d1 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_orthogonal_bitmap.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_orthogonal_bitmap.cpp @@ -70,6 +70,18 @@ AggregateFunctionPtr create_aggregate_function_orthogonal_bitmap_intersect_count result_is_nullable); } +AggregateFunctionPtr create_aggregate_function_orthogonal_bitmap_expr_calculate( + const std::string& name, const DataTypes& argument_types, bool result_is_nullable) { + return create_aggregate_function_orthogonal<AggOrthBitMapExprCal>(name, argument_types, + result_is_nullable); +} + +AggregateFunctionPtr create_aggregate_function_orthogonal_bitmap_expr_calculate_count( + const std::string& name, const DataTypes& argument_types, bool result_is_nullable) { + return create_aggregate_function_orthogonal<AggOrthBitMapExprCalCount>(name, argument_types, + result_is_nullable); +} + AggregateFunctionPtr create_aggregate_function_intersect_count(const std::string& name, const DataTypes& argument_types, @@ -92,5 +104,11 @@ void register_aggregate_function_orthogonal_bitmap(AggregateFunctionSimpleFactor factory.register_function_both("orthogonal_bitmap_union_count", create_aggregate_function_orthogonal_bitmap_union_count); factory.register_function_both("intersect_count", create_aggregate_function_intersect_count); + factory.register_function_both("orthogonal_bitmap_expr_calculate", + create_aggregate_function_orthogonal_bitmap_expr_calculate); + factory.register_function_both( + "orthogonal_bitmap_expr_calculate_count", + create_aggregate_function_orthogonal_bitmap_expr_calculate_count); } -} // namespace doris::vectorized \ No newline at end of file + +} // namespace doris::vectorized diff --git a/be/src/vec/aggregate_functions/aggregate_function_orthogonal_bitmap.h b/be/src/vec/aggregate_functions/aggregate_function_orthogonal_bitmap.h index 998ccfd097..20b90cd30e 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_orthogonal_bitmap.h +++ b/be/src/vec/aggregate_functions/aggregate_function_orthogonal_bitmap.h @@ -17,6 +17,7 @@ #pragma once +#include "util/bitmap_expr_calculation.h" #include "util/bitmap_intersect.h" #include "util/bitmap_value.h" #include "vec/aggregate_functions/aggregate_function.h" @@ -177,6 +178,102 @@ private: Int64 result = 0; }; +template <typename T> +struct AggOrthBitmapExprCalBaseData { +public: + using ColVecData = std::conditional_t<IsNumber<T>, ColumnVector<T>, ColumnString>; + + void add(const IColumn** columns, size_t row_num) { + const auto& bitmap_col = static_cast<const ColumnBitmap&>(*columns[0]); + const auto& data_col = static_cast<const ColVecData&>(*columns[1]); + const auto& bitmap_value = bitmap_col.get_element(row_num); + std::string update_key = data_col.get_data_at(row_num).to_string(); + bitmap_expr_cal.update(update_key, bitmap_value); + } + + void init_add_key(const IColumn** columns, size_t row_num, int argument_size) { + if (first_init) { + DCHECK(argument_size > 1); + const auto& col = static_cast<const ColVecData&>(*columns[2]); + std::string expr = col.get_data_at(row_num).to_string(); + bitmap_expr_cal.bitmap_calculation_init(expr); + first_init = false; + } + } + +protected: + doris::BitmapExprCalculation bitmap_expr_cal; + bool first_init = true; +}; + +template <typename T> +struct AggOrthBitMapExprCal : public AggOrthBitmapExprCalBaseData<T> { +public: + static constexpr auto name = "orthogonal_bitmap_expr_calculate"; + + static DataTypePtr get_return_type() { return std::make_shared<DataTypeBitMap>(); } + + void merge(const AggOrthBitMapExprCal& rhs) { + if (rhs.first_init) { + return; + } + result |= rhs.result; + } + + void write(BufferWritable& buf) { + write_binary(AggOrthBitmapExprCalBaseData<T>::first_init, buf); + result = AggOrthBitmapExprCalBaseData<T>::bitmap_expr_cal.bitmap_calculate(); + DataTypeBitMap::serialize_as_stream(result, buf); + } + + void read(BufferReadable& buf) { + read_binary(AggOrthBitmapExprCalBaseData<T>::first_init, buf); + DataTypeBitMap::deserialize_as_stream(result, buf); + } + + void get(IColumn& to) const { + auto& column = static_cast<ColumnBitmap&>(to); + column.get_data().emplace_back(result); + } + +private: + BitmapValue result; +}; + +template <typename T> +struct AggOrthBitMapExprCalCount : public AggOrthBitmapExprCalBaseData<T> { +public: + static constexpr auto name = "orthogonal_bitmap_expr_calculate_count"; + + static DataTypePtr get_return_type() { return std::make_shared<DataTypeInt64>(); } + + void merge(const AggOrthBitMapExprCalCount& rhs) { + if (rhs.first_init) { + return; + } + result += rhs.result; + } + + void write(BufferWritable& buf) { + write_binary(AggOrthBitmapExprCalBaseData<T>::first_init, buf); + result = AggOrthBitmapExprCalBaseData<T>::bitmap_expr_cal.bitmap_calculate_count(); + write_binary(result, buf); + } + + void read(BufferReadable& buf) { + read_binary(AggOrthBitmapExprCalBaseData<T>::first_init, buf); + read_binary(result, buf); + } + + void get(IColumn& to) const { + auto& column = static_cast<ColumnVector<Int64>&>(to); + column.get_data().emplace_back(result); + } + +private: + int64_t result = 0; +}; + template <typename T> struct OrthBitmapUnionCountData { static constexpr auto name = "orthogonal_bitmap_union_count"; @@ -247,4 +344,4 @@ public: private: int _argument_size; }; -} // namespace doris::vectorized \ No newline at end of file +} // namespace doris::vectorized diff --git a/docs/en/docs/advanced/orthogonal-bitmap-manual.md b/docs/en/docs/advanced/orthogonal-bitmap-manual.md index 42750ffa00..2fb9c65eb9 100644 --- a/docs/en/docs/advanced/orthogonal-bitmap-manual.md +++ b/docs/en/docs/advanced/orthogonal-bitmap-manual.md @@ -141,6 +141,35 @@ Explain: on the basis of this table schema, this function is divided into two layers. In the first layer, be nodes (update and serialize) merge all the bitmaps, and then count the resulting bitmaps. The count values are serialized and sent to the second level be nodes (merge and finalize). In the second layer, the be nodes are used to calculate the sum of all the count values from the first level nodes +#### orthogonal_bitmap_expr_calculate + +Compute the function by computing the intersection, union and difference set of the expression bitmap. + +Syntax: + +orthogonal_bitmap_expr_calculate(bitmap_column, filter_column, input_string) + +Parameters: + +the first parameter is the Bitmap column, the second parameter is the dimension column used for filtering, that is, the calculated key column, and the third parameter is the calculation expression string, which means that the bitmap intersection and union difference expression is calculated according to the key column + +the calculators supported by the expression: & represents intersection calculation, | represents union calculation, - represents difference calculation, ^ represents XOR calculation, and \ represents escape characters + +Explain: + +the aggregation of query planning is divided into two layers. The first layer of be aggregation node calculation includes init, update, and serialize steps. The second layer of be aggregation node calculation includes merge and finalize steps. In the first layer of be node, the input string is parsed in the init phase, which is converted into a suffix expression (inverse Polish), parses the calculated key value, and initializes it in the map<key, bitmap>structure; In the update phase, th [...] + +#### orthogonal_bitmap_expr_calculate_count + +Compute the count function by computing the intersection, union and difference set of the expression bitmap. The syntax and parameters is the same as orthogonal_bitmap_expr_calculate + +Syntax: + +orthogonal_bitmap_expr_calculate_count(bitmap_column, filter_column, input_string) + +Explain: + +the aggregation of query planning is divided into two layers. The first layer of be aggregation node calculation includes init, update, and serialize steps. The second layer of be aggregation node calculation includes merge and finalize steps. In the first layer of be node, the input string is parsed in the init phase, converted to suffix expression Formula (inverse Polish formula), parse the calculated key value and initialize it in the map<key, bitmap>structure; In the update phase, th [...] ### Suitable for the scene @@ -159,3 +188,15 @@ Calculate the deduplication value for user_id: ``` select orthogonal_bitmap_union_count(user_id) from user_tag_bitmap where tag in (13080800, 11110200); ``` + +Bitmap cross merge difference set hybrid computing: + +``` +select orthogonal_bitmap_expr_calculate_count(user_id, tag, '(833736|999777)&(1308083|231207)&(1000|20000-30000)') from user_tag_bitmap where tag in (833736,999777,130808,231207,1000,20000,30000); +Note: 1000, 20000, 30000 plastic tags represent different labels of users +``` + +``` +select orthogonal_bitmap_expr_calculate_count(user_id, tag, '(A:a/b|B:2\\-4)&(C:1-D:12)&E:23') from user_str_tag_bitmap where tag in ('A:a/b', 'B:2-4', 'C:1', 'D:12', 'E:23'); +Note: 'A:a/b', 'B:2-4', etc. are string types tag, representing different labels of users, where 'B:2-4' needs to be escaped as'B:2\\-4' +``` diff --git a/docs/en/docs/sql-manual/sql-functions/bitmap-functions/orthogonal_bitmap_expr_calculate.md b/docs/en/docs/sql-manual/sql-functions/bitmap-functions/orthogonal_bitmap_expr_calculate.md new file mode 100644 index 0000000000..e530de904d --- /dev/null +++ b/docs/en/docs/sql-manual/sql-functions/bitmap-functions/orthogonal_bitmap_expr_calculate.md @@ -0,0 +1,47 @@ +--- +{ +"title": "orthogonal_bitmap_expr_calculate", +"language": "en" +} +--- + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> + +## orthogonal_bitmap_expr_calculate +### description +#### Syntax + +`BITMAP ORTHOGONAL_BITMAP_EXPR_CALCULATE(bitmap_column, column_to_filter, input_string)` +The first parameter is the Bitmap column, the second parameter is the dimension column used for filtering, that is, the calculated key column, and the third parameter is the calculation expression string, meaning that the bitmap intersection, union, and difference set expression is calculated according to the key column +The calculators supported by the expression:&represents intersection calculation, | represents union calculation, - represents difference calculation, ^ represents XOR calculation, and \ represents escape characters + +### example + +``` +select orthogonal_bitmap_expr_calculate(user_id, tag, '(833736|999777)&(1308083|231207)&(1000|20000-30000)') from user_tag_bitmap where tag in (833736,999777,130808,231207,1000,20000,30000); +Note: 1000, 20000, 30000 plastic tags represent different labels of users +``` + +``` +select orthogonal_bitmap_expr_calculate(user_id, tag, '(A:a/b|B:2\\-4)&(C:1-D:12)&E:23') from user_str_tag_bitmap where tag in ('A:a/b', 'B:2-4', 'C:1', 'D:12', 'E:23'); +Note: 'A:a/b', 'B:2-4', etc. are string types tag, representing different labels of users, where 'B:2-4' needs to be escaped as'B:2\\-4' +``` + +### keywords + + ORTHOGONAL_BITMAP_EXPR_CALCULATE,BITMAP diff --git a/docs/en/docs/sql-manual/sql-functions/bitmap-functions/orthogonal_bitmap_expr_calculate_count.md b/docs/en/docs/sql-manual/sql-functions/bitmap-functions/orthogonal_bitmap_expr_calculate_count.md new file mode 100644 index 0000000000..b7a34ff296 --- /dev/null +++ b/docs/en/docs/sql-manual/sql-functions/bitmap-functions/orthogonal_bitmap_expr_calculate_count.md @@ -0,0 +1,47 @@ +--- +{ +"title": "orthogonal_bitmap_expr_calculate_count", +"language": "en" +} +--- + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> + +## orthogonal_bitmap_expr_calculate_count +### description +#### Syntax + +`BITMAP ORTHOGONAL_BITMAP_EXPR_CALCULATE_COUNT(bitmap_column, column_to_filter, input_string)` +Calculate the bitmap intersection, union and difference set of expressions to calculate the count function. The first parameter is the Bitmap column, the second parameter is the dimension column used for filtering, that is, the calculated key column, and the third parameter is the calculation expression string, meaning that the bitmap intersection, union and difference set expression is calculated according to the key column +The calculators supported by the expression:&represents intersection calculation, | represents union calculation, - represents difference calculation, ^ represents XOR calculation, and \ represents escape characters + +### example + +``` +select orthogonal_bitmap_expr_calculate_count(user_id, tag, '(833736|999777)&(1308083|231207)&(1000|20000-30000)') from user_tag_bitmap where tag in (833736,999777,130808,231207,1000,20000,30000); +Note: 1000, 20000, 30000 plastic tags represent different labels of users +``` + +``` +select orthogonal_bitmap_expr_calculate_count(user_id, tag, '(A:a/b|B:2\\-4)&(C:1-D:12)&E:23') from user_str_tag_bitmap where tag in ('A:a/b', 'B:2-4', 'C:1', 'D:12', 'E:23'); +Note: 'A:a/b', 'B:2-4', etc. are string types tag, representing different labels of users, where 'B:2-4' needs to be escaped as'B:2\\-4' +``` + +### keywords + + ORTHOGONAL_BITMAP_EXPR_CALCULATE_COUNT,BITMAP diff --git a/docs/sidebars.json b/docs/sidebars.json index 1fdad170ac..7ad255cbf5 100644 --- a/docs/sidebars.json +++ b/docs/sidebars.json @@ -545,6 +545,8 @@ "sql-manual/sql-functions/bitmap-functions/bitmap_intersect", "sql-manual/sql-functions/bitmap-functions/orthogonal_bitmap_intersect", "sql-manual/sql-functions/bitmap-functions/orthogonal_bitmap_intersect_count", + "sql-manual/sql-functions/bitmap-functions/orthogonal_bitmap_expr_calculate", + "sql-manual/sql-functions/bitmap-functions/orthogonal_bitmap_expr_calculate_count", "sql-manual/sql-functions/bitmap-functions/bitmap_hash64" ] }, diff --git a/docs/zh-CN/docs/advanced/orthogonal-bitmap-manual.md b/docs/zh-CN/docs/advanced/orthogonal-bitmap-manual.md index a89e5e9253..1cdf4ba15f 100644 --- a/docs/zh-CN/docs/advanced/orthogonal-bitmap-manual.md +++ b/docs/zh-CN/docs/advanced/orthogonal-bitmap-manual.md @@ -145,6 +145,36 @@ orthogonal_bitmap_union_count(bitmap_column) 查询规划上分2层,在第一层be节点(update、serialize)对所有bitmap求并集,再对并集的结果bitmap求count,count值序列化后发送至第二层be节点(merge、finalize),在第二层be节点对所有来源于第一层节点的count值循环求sum +#### orthogonal_bitmap_expr_calculate + +求表达式bitmap交并差集合计算函数。 + +语法: + +orthogonal_bitmap_expr_calculate(bitmap_column, filter_column, input_string) + +参数: + +第一个参数是Bitmap列,第二个参数是用来过滤的维度列,即计算的key列,第三个参数是计算表达式字符串,含义是依据key列进行bitmap交并差集表达式计算 + +表达式支持的计算符:& 代表交集计算,| 代表并集计算,- 代表差集计算, ^ 代表异或计算,\ 代表转义字符 + +说明: + +查询规划上聚合分2层,第一层be聚合节点计算包括init、update、serialize步骤,第二层be聚合节点计算包括merge、finalize步骤。在第一层be节点,init阶段解析input_string字符串,转换为后缀表达式(逆波兰式),解析出计算key值,并在map<key, bitmap>结构中初始化;update阶段,底层内核scan维度列(filter_column)数据后回调update函数,然后以计算key为单位对上一步的map结构中的bitmap进行聚合;serialize阶段,根据后缀表达式,解析出计算key列的bitmap,利用栈结构先进后出原则,进行bitmap交并差集合计算,然后对最终的结果bitmap序列化后发送至第二层聚合be节点。在第二层聚合be节点,对所有来源于第一层节点的bitmap值求并集,并返回最终bitmap结果 + +#### orthogonal_bitmap_expr_calculate_count + +求表达式bitmap交并差集合计算count函数, 语法和参数同orthogonal_bitmap_expr_calculate。 + +语法: + +orthogonal_bitmap_expr_calculate_count(bitmap_column, filter_column, input_string) + +说明: + +查询规划上聚合分2层,第一层be聚合节点计算包括init、update、serialize步骤,第二层be聚合节点计算包括merge、finalize步骤。在第一层be节点,init阶段解析input_string字符串,转换为后缀表达式(逆波兰式),解析出计算key值,并在map<key, bitmap>结构中初始化;update阶段,底层内核scan维度列(filter_column)数据后回调update函数,然后以计算key为单位对上一步的map结构中的bitmap进行聚合;serialize阶段,根据后缀表达式,解析出计算key列的bitmap,利用栈结构先进后出原则,进行bitmap交并差集合计算,然后对最终的结果bitmap的count值序列化后发送至第二层聚合be节点。在第二层聚合be节点,对所有来源于第一层节点的count值求加和,并返回最终count结果。 + ### 使用场景 符合对bitmap进行正交计算的场景,如在用户行为分析中,计算留存,漏斗,用户画像等。 @@ -161,3 +191,15 @@ orthogonal_bitmap_union_count(bitmap_column) ```sql select orthogonal_bitmap_union_count(user_id) from user_tag_bitmap where tag in (13080800, 11110200); ``` + +bitmap交并差集合混合计算: + +```sql +select orthogonal_bitmap_expr_calculate_count(user_id, tag, '(833736|999777)&(1308083|231207)&(1000|20000-30000)') from user_tag_bitmap where tag in (833736,999777,130808,231207,1000,20000,30000); +注:1000、20000、30000等整形tag,代表用户不同标签 +``` + +```sql +select orthogonal_bitmap_expr_calculate_count(user_id, tag, '(A:a/b|B:2\\-4)&(C:1-D:12)&E:23') from user_str_tag_bitmap where tag in ('A:a/b', 'B:2-4', 'C:1', 'D:12', 'E:23'); + 注:'A:a/b', 'B:2-4'等是字符串类型tag,代表用户不同标签, 其中'B:2-4'需要转义成'B:2\\-4' +``` diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/bitmap-functions/orthogonal_bitmap_expr_calculate.md b/docs/zh-CN/docs/sql-manual/sql-functions/bitmap-functions/orthogonal_bitmap_expr_calculate.md new file mode 100644 index 0000000000..a8087b850b --- /dev/null +++ b/docs/zh-CN/docs/sql-manual/sql-functions/bitmap-functions/orthogonal_bitmap_expr_calculate.md @@ -0,0 +1,47 @@ +--- +{ +"title": "orthogonal_bitmap_expr_calculate", +"language": "zh-CN" +} +--- + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> + +## orthogonal_bitmap_expr_calculate +### description +#### Syntax + +`BITMAP ORTHOGONAL_BITMAP_EXPR_CALCULATE(bitmap_column, column_to_filter, input_string)` +求表达式bitmap交并差集合计算函数, 第一个参数是Bitmap列,第二个参数是用来过滤的维度列,即计算的key列,第三个参数是计算表达式字符串,含义是依据key列进行bitmap交并差集表达式计算 +表达式支持的计算符:& 代表交集计算,| 代表并集计算,- 代表差集计算, ^ 代表异或计算,\ 代表转义字符 + +### example + +```sql +select orthogonal_bitmap_expr_calculate(user_id, tag, '(833736|999777)&(1308083|231207)&(1000|20000-30000)') from user_tag_bitmap where tag in (833736,999777,130808,231207,1000,20000,30000); +注:1000、20000、30000等整形tag,代表用户不同标签 +``` + +```sql +select orthogonal_bitmap_expr_calculate(user_id, tag, '(A:a/b|B:2\\-4)&(C:1-D:12)&E:23') from user_str_tag_bitmap where tag in ('A:a/b', 'B:2-4', 'C:1', 'D:12', 'E:23'); + 注:'A:a/b', 'B:2-4'等是字符串类型tag,代表用户不同标签, 其中'B:2-4'需要转义成'B:2\\-4' +``` + +### keywords + + ORTHOGONAL_BITMAP_EXPR_CALCULATE,BITMAP diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/bitmap-functions/orthogonal_bitmap_expr_calculate_count.md b/docs/zh-CN/docs/sql-manual/sql-functions/bitmap-functions/orthogonal_bitmap_expr_calculate_count.md new file mode 100644 index 0000000000..bca01c6b16 --- /dev/null +++ b/docs/zh-CN/docs/sql-manual/sql-functions/bitmap-functions/orthogonal_bitmap_expr_calculate_count.md @@ -0,0 +1,47 @@ +--- +{ +"title": "orthogonal_bitmap_expr_calculate_count", +"language": "zh-CN" +} +--- + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> + +## orthogonal_bitmap_expr_calculate_count +### description +#### Syntax + +`BITMAP ORTHOGONAL_BITMAP_EXPR_CALCULATE_COUNT(bitmap_column, column_to_filter, input_string)` +求表达式bitmap交并差集合计算count函数, 第一个参数是Bitmap列,第二个参数是用来过滤的维度列,即计算的key列,第三个参数是计算表达式字符串,含义是依据key列进行bitmap交并差集表达式计算 +表达式支持的计算符:& 代表交集计算,| 代表并集计算,- 代表差集计算, ^ 代表异或计算,\ 代表转义字符 + +### example + +```sql +select orthogonal_bitmap_expr_calculate_count(user_id, tag, '(833736|999777)&(1308083|231207)&(1000|20000-30000)') from user_tag_bitmap where tag in (833736,999777,130808,231207,1000,20000,30000); +注:1000、20000、30000等整形tag,代表用户不同标签 +``` + +```sql +select orthogonal_bitmap_expr_calculate_count(user_id, tag, '(A:a/b|B:2\\-4)&(C:1-D:12)&E:23') from user_str_tag_bitmap where tag in ('A:a/b', 'B:2-4', 'C:1', 'D:12', 'E:23'); + 注:'A:a/b', 'B:2-4'等是字符串类型tag,代表用户不同标签, 其中'B:2-4'需要转义成'B:2\\-4' +``` + +### keywords + + ORTHOGONAL_BITMAP_EXPR_CALCULATE_COUNT,BITMAP diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/FunctionCallExpr.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/FunctionCallExpr.java index aa14e31109..30b40088db 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/FunctionCallExpr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/FunctionCallExpr.java @@ -819,7 +819,9 @@ public class FunctionCallExpr extends Expr { if (fnName.getFunction().equalsIgnoreCase(FunctionSet.INTERSECT_COUNT) || fnName.getFunction() .equalsIgnoreCase(FunctionSet.ORTHOGONAL_BITMAP_INTERSECT) || fnName.getFunction() - .equalsIgnoreCase(FunctionSet.ORTHOGONAL_BITMAP_INTERSECT_COUNT)) { + .equalsIgnoreCase(FunctionSet.ORTHOGONAL_BITMAP_INTERSECT_COUNT) || fnName.getFunction() + .equalsIgnoreCase(FunctionSet.ORTHOGONAL_BITMAP_EXPR_CALCULATE_COUNT) || fnName.getFunction() + .equalsIgnoreCase(FunctionSet.ORTHOGONAL_BITMAP_EXPR_CALCULATE)) { if (children.size() <= 2) { throw new AnalysisException(fnName + "(bitmap_column, column_to_filter, filter_values) " + "function requires at least three parameters"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/AggregateFunction.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/AggregateFunction.java index 84667a5897..885a85866a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/AggregateFunction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/AggregateFunction.java @@ -50,6 +50,7 @@ public class AggregateFunction extends Function { "dense_rank", "multi_distinct_count", "multi_distinct_sum", FunctionSet.HLL_UNION_AGG, FunctionSet.HLL_UNION, FunctionSet.BITMAP_UNION, FunctionSet.BITMAP_INTERSECT, FunctionSet.ORTHOGONAL_BITMAP_INTERSECT, FunctionSet.ORTHOGONAL_BITMAP_INTERSECT_COUNT, + FunctionSet.ORTHOGONAL_BITMAP_EXPR_CALCULATE_COUNT, FunctionSet.ORTHOGONAL_BITMAP_EXPR_CALCULATE, FunctionSet.INTERSECT_COUNT, FunctionSet.ORTHOGONAL_BITMAP_UNION_COUNT, FunctionSet.COUNT, "approx_count_distinct", "ndv", FunctionSet.BITMAP_UNION_INT, FunctionSet.BITMAP_UNION_COUNT, "ndv_no_finalize", FunctionSet.WINDOW_FUNNEL, FunctionSet.RETENTION, diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionSet.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionSet.java index 9a820fb517..c5e8084b78 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionSet.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionSet.java @@ -930,6 +930,8 @@ public class FunctionSet<T> { public static final String ORTHOGONAL_BITMAP_UNION_COUNT = "orthogonal_bitmap_union_count"; public static final String APPROX_COUNT_DISTINCT = "approx_count_distinct"; public static final String NDV = "ndv"; + public static final String ORTHOGONAL_BITMAP_EXPR_CALCULATE_COUNT = "orthogonal_bitmap_expr_calculate_count"; + public static final String ORTHOGONAL_BITMAP_EXPR_CALCULATE = "orthogonal_bitmap_expr_calculate"; public static final String QUANTILE_UNION = "quantile_union"; //TODO(weixiang): is quantile_percent can be replaced by approx_percentile? @@ -2461,6 +2463,47 @@ public class FunctionSet<T> { Lists.newArrayList(Type.BITMAP, t, t), Type.BIGINT, Type.BITMAP, true, "", "", "", "", "", "", "", true, false, true, true)); } + + Type[] ntypes = {Type.CHAR, Type.VARCHAR, Type.STRING}; + for (Type t : ntypes) { + addBuiltin(AggregateFunction.createBuiltin(ORTHOGONAL_BITMAP_EXPR_CALCULATE, + Lists.newArrayList(Type.BITMAP, t, Type.STRING), + Type.BITMAP, + Type.VARCHAR, + true, + "_ZN5doris15BitmapFunctions37orthogonal_bitmap_expr_calculate_initEPN9doris_udf15FunctionContextEPNS1_9StringValE", + "_ZN5doris15BitmapFunctions39orthogonal_bitmap_expr_calculate_updateEPN9doris_udf15FunctionContextERKNS1_9StringValES6_iPS5_S7_", + "_ZN5doris15BitmapFunctions12bitmap_unionEPN9doris_udf15FunctionContextERKNS1_9StringValEPS4_", + "_ZN5doris15BitmapFunctions42orthogonal_bitmap_expr_calculate_serializeEPN9doris_udf15FunctionContextERKNS1_9StringValE", + "", + "", + "_ZN5doris15BitmapFunctions16bitmap_serializeEPN9doris_udf15FunctionContextERKNS1_9StringValE", + true, false, true)); + + addBuiltin(AggregateFunction.createBuiltin(ORTHOGONAL_BITMAP_EXPR_CALCULATE_COUNT, + Lists.newArrayList(Type.BITMAP, t, Type.STRING), + Type.BIGINT, + Type.VARCHAR, + true, + "_ZN5doris15BitmapFunctions43orthogonal_bitmap_expr_calculate_count_initEPN9doris_udf15FunctionContextEPNS1_9StringValE", + "_ZN5doris15BitmapFunctions39orthogonal_bitmap_expr_calculate_updateEPN9doris_udf15FunctionContextERKNS1_9StringValES6_iPS5_S7_", + "_ZN5doris15BitmapFunctions29orthogonal_bitmap_count_mergeEPN9doris_udf15FunctionContextERKNS1_9StringValEPS4_", + "_ZN5doris15BitmapFunctions48orthogonal_bitmap_expr_calculate_count_serializeEPN9doris_udf15FunctionContextERKNS1_9StringValE", + "", + "", + "_ZN5doris15BitmapFunctions32orthogonal_bitmap_count_finalizeEPN9doris_udf15FunctionContextERKNS1_9StringValE", + true, false, true)); + + //vec ORTHOGONAL_BITMAP_EXPR_CALCULATE and ORTHOGONAL_BITMAP_EXPR_CALCULATE_COUNT + addBuiltin( + AggregateFunction.createBuiltin(ORTHOGONAL_BITMAP_EXPR_CALCULATE, Lists.newArrayList(Type.BITMAP, t, Type.STRING), + Type.BITMAP, Type.BITMAP, true, "", "", "", "", "", "", "", true, false, true, true)); + + addBuiltin(AggregateFunction.createBuiltin(ORTHOGONAL_BITMAP_EXPR_CALCULATE_COUNT, + Lists.newArrayList(Type.BITMAP, t, Type.STRING), Type.BIGINT, Type.BITMAP, true, "", "", "", "", "", "", "", + true, false, true, true)); + } + // bitmap addBuiltin(AggregateFunction.createBuiltin(BITMAP_UNION, Lists.newArrayList(Type.BITMAP), Type.BITMAP, --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org