This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push: new 586bec7 [fix](storage) Fix query result error due to find code by bound (#8787) 586bec7 is described below commit 586bec79f5a73aac587ca11c514df46d1501cf75 Author: ZenoYang <cookie...@qq.com> AuthorDate: Sun Apr 3 10:38:14 2022 +0800 [fix](storage) Fix query result error due to find code by bound (#8787) Problem recurrence SSB single table `lineorder_flat`, the query SQL is as follows: ```sql SELECT sum(LO_REVENUE), (LO_ORDERDATE DIV 10000) AS year, P_BRAND FROM lineorder_flat WHERE P_BRAND >= 'MFGR#22211111' AND P_BRAND <= 'MFGR#22281111' AND S_REGION = 'ASIA' and (LO_ORDERDATE DIV 10000) = 1992 GROUP BY year, P_BRAND ORDER BY year, P_BRAND; ``` when `enable_low_cardinality_optimize=false`, query result: ```sql +-------------------+------+-----------+ | sum(`LO_REVENUE`) | year | P_BRAND | +-------------------+------+-----------+ | 65423264312 | 1992 | MFGR#2222 | | 66936772687 | 1992 | MFGR#2223 | | 64047191934 | 1992 | MFGR#2224 | | 65744559138 | 1992 | MFGR#2225 | | 66993045668 | 1992 | MFGR#2226 | | 67411226147 | 1992 | MFGR#2227 | | 69390885970 | 1992 | MFGR#2228 | +-------------------+------+-----------+ ``` when `enable_low_cardinality_optimize=true`, query result: ```sql +-------------------+------+-----------+ | sum(`LO_REVENUE`) | year | P_BRAND | +-------------------+------+-----------+ | 66936772687 | 1992 | MFGR#2223 | | 64047191934 | 1992 | MFGR#2224 | | 65744559138 | 1992 | MFGR#2225 | | 66993045668 | 1992 | MFGR#2226 | | 67411226147 | 1992 | MFGR#2227 | | 69390885970 | 1992 | MFGR#2228 | +-------------------+------+-----------+ ``` One line less than the correct result. The reason is that 'MFGR#22211111' is not in the dictionary, so get the boundary code (`find_code_by_bound` method), but there is a bug here. --- be/src/olap/comparison_predicate.cpp | 9 +++++---- be/src/olap/in_list_predicate.cpp | 3 +-- be/src/vec/columns/column_dictionary.h | 34 +++++++++++++++++++++++----------- 3 files changed, 29 insertions(+), 17 deletions(-) diff --git a/be/src/olap/comparison_predicate.cpp b/be/src/olap/comparison_predicate.cpp index ef6ee3a..57549c3 100644 --- a/be/src/olap/comparison_predicate.cpp +++ b/be/src/olap/comparison_predicate.cpp @@ -520,8 +520,7 @@ COMPARISON_PRED_BITMAP_EVALUATE(GreaterEqualPredicate, >=) auto& dict_col = \ reinterpret_cast<vectorized::ColumnDictionary<vectorized::Int32>&>( \ *col_ptr); \ - auto code = dict_col.find_code(_value); \ - _dict_code = code; \ + _dict_code = dict_col.find_code(_value); \ _dict_code_inited = true; \ } \ } \ @@ -530,6 +529,9 @@ COMPARISON_PRED_BITMAP_EVALUATE(GreaterEqualPredicate, >=) COMPARISON_PRED_SET_DICT_CODE(EqualPredicate) COMPARISON_PRED_SET_DICT_CODE(NotEqualPredicate) +// If 1 OP 0 returns true, it means the predicate is > or >= +// If 1 OP 1 returns true, it means the predicate is >= or <= +// by this way, avoid redundant code #define RAMGE_COMPARISON_PRED_SET_DICT_CODE(CLASS, OP) \ template <class T> \ void CLASS<T>::set_dict_code_if_necessary(vectorized::IColumn& column) { \ @@ -548,8 +550,7 @@ COMPARISON_PRED_SET_DICT_CODE(NotEqualPredicate) auto& dict_col = \ reinterpret_cast<vectorized::ColumnDictionary<vectorized::Int32>&>( \ *col_ptr); \ - auto code = dict_col.find_code_by_bound(_value, 0 OP 1, 1 OP 1); \ - _dict_code = code; \ + _dict_code = dict_col.find_code_by_bound(_value, 1 OP 0, 1 OP 1); \ _dict_code_inited = true; \ } \ } \ diff --git a/be/src/olap/in_list_predicate.cpp b/be/src/olap/in_list_predicate.cpp index 21214f2..0401673 100644 --- a/be/src/olap/in_list_predicate.cpp +++ b/be/src/olap/in_list_predicate.cpp @@ -299,8 +299,7 @@ IN_LIST_PRED_BITMAP_EVALUATE(NotInListPredicate, -=) auto& dict_col = \ reinterpret_cast<vectorized::ColumnDictionary<vectorized::Int32>&>( \ *col_ptr); \ - auto code_set = dict_col.find_codes(_values); \ - _dict_codes = std::move(code_set); \ + _dict_codes = dict_col.find_codes(_values); \ _dict_code_inited = true; \ } \ } \ diff --git a/be/src/vec/columns/column_dictionary.h b/be/src/vec/columns/column_dictionary.h index cca6c85..499a04a 100644 --- a/be/src/vec/columns/column_dictionary.h +++ b/be/src/vec/columns/column_dictionary.h @@ -244,8 +244,8 @@ public: int32_t find_code(const StringValue& value) const { return _dict.find_code(value); } - int32_t find_code_by_bound(const StringValue& value, bool lower, bool eq) const { - return _dict.find_code_by_bound(value, lower, eq); + int32_t find_code_by_bound(const StringValue& value, bool greater, bool eq) const { + return _dict.find_code_by_bound(value, greater, eq); } phmap::flat_hash_set<int32_t> find_codes( @@ -294,19 +294,31 @@ public: return -1; } - inline int32_t find_code_by_bound(const StringValue& value, bool lower, bool eq) const { + // For > , code takes upper_bound - 1; For >= , code takes upper_bound + // For < , code takes upper_bound; For <=, code takes upper_bound - 1 + // For example a sorted dict: <'b',0> <'c',1> <'d',2> + // Now the predicate value is 'ccc', 'ccc' is not in the dict, 'ccc' is between 'c' and 'd'. + // std::upper_bound(..., 'ccc') - begin, will return the encoding of 'd', which is 2 + // If the predicate is col > 'ccc' and the value of upper_bound-1 is 1, + // then evaluate code > 1 and the result is 'd'. + // If the predicate is col < 'ccc' and the value of upper_bound is 2, + // evaluate code < 2, and the return result is 'b'. + // If the predicate is col >= 'ccc' and the value of upper_bound is 2, + // evaluate code >= 2, and the return result is 'd'. + // If the predicate is col <= 'ccc' and the value of upper_bound-1 is 1, + // evaluate code <= 1, and the returned result is 'b'. + // If the predicate is col < 'a', 'a' is also not in the dict, and 'a' is less than 'b', + // so upper_bound is the code 0 of b, then evaluate code < 0 and returns empty + // If the predicate is col <= 'a' and upper_bound-1 is -1, + // then evaluate code <= -1 and returns empty + inline int32_t find_code_by_bound(const StringValue& value, bool greater, bool eq) const { auto code = find_code(value); if (code >= 0) { return code; } - - if (lower) { - return std::lower_bound(_dict_data.begin(), _dict_data.end(), value) - - _dict_data.begin() - eq; - } else { - return std::upper_bound(_dict_data.begin(), _dict_data.end(), value) - - _dict_data.begin() + eq; - } + auto bound = std::upper_bound(_dict_data.begin(), _dict_data.end(), value) - + _dict_data.begin(); + return greater ? bound - greater + eq : bound - eq; } inline phmap::flat_hash_set<int32_t> find_codes( --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org