This is an automated email from the ASF dual-hosted git repository. eldenmoon pushed a commit to branch cs_opt_version-3.1 in repository https://gitbox.apache.org/repos/asf/doris.git
commit c187e3b9b6cce21fd15103212c8e7f5f02593a0c Author: HappenLee <[email protected]> AuthorDate: Wed Jul 9 21:50:03 2025 +0800 cherry pick like opt/lru-k --- be/src/olap/like_column_predicate.cpp | 21 +++-------- be/src/olap/like_column_predicate.h | 65 +++++++++++++++++++++++++---------- be/src/vec/functions/like.cpp | 13 ++++--- 3 files changed, 58 insertions(+), 41 deletions(-) diff --git a/be/src/olap/like_column_predicate.cpp b/be/src/olap/like_column_predicate.cpp index b441e982606..6da2aa3062f 100644 --- a/be/src/olap/like_column_predicate.cpp +++ b/be/src/olap/like_column_predicate.cpp @@ -62,15 +62,12 @@ uint16_t LikeColumnPredicate<T>::_evaluate_inner(const vectorized::IColumn& colu auto* nested_col_ptr = vectorized::check_and_get_column< vectorized::ColumnDictionary<vectorized::Int32>>(nested_col); auto& data_array = nested_col_ptr->get_data(); + const auto& dict_res = *_find_code_from_dictionary_column(*nested_col_ptr); if (!nullable_col->has_null()) { for (uint16_t i = 0; i != size; i++) { uint16_t idx = sel[i]; sel[new_size] = idx; - StringRef cell_value = nested_col_ptr->get_shrink_value(data_array[idx]); - unsigned char flag = 0; - static_cast<void>((_state->scalar_function)( - const_cast<vectorized::LikeSearchState*>(&_like_state), - StringRef(cell_value.data, cell_value.size), pattern, &flag)); + unsigned char flag = dict_res[data_array[idx]]; new_size += _opposite ^ flag; } } else { @@ -81,12 +78,7 @@ uint16_t LikeColumnPredicate<T>::_evaluate_inner(const vectorized::IColumn& colu new_size += _opposite; continue; } - - StringRef cell_value = nested_col_ptr->get_shrink_value(data_array[idx]); - unsigned char flag = 0; - static_cast<void>((_state->scalar_function)( - const_cast<vectorized::LikeSearchState*>(&_like_state), - StringRef(cell_value.data, cell_value.size), pattern, &flag)); + unsigned char flag = dict_res[data_array[idx]]; new_size += _opposite ^ flag; } } @@ -126,15 +118,12 @@ uint16_t LikeColumnPredicate<T>::_evaluate_inner(const vectorized::IColumn& colu if (column.is_column_dictionary()) { auto* nested_col_ptr = vectorized::check_and_get_column< vectorized::ColumnDictionary<vectorized::Int32>>(column); + const auto& dict_res = *_find_code_from_dictionary_column(*nested_col_ptr); auto& data_array = nested_col_ptr->get_data(); for (uint16_t i = 0; i != size; i++) { uint16_t idx = sel[i]; sel[new_size] = idx; - StringRef cell_value = nested_col_ptr->get_shrink_value(data_array[idx]); - unsigned char flag = 0; - static_cast<void>((_state->scalar_function)( - const_cast<vectorized::LikeSearchState*>(&_like_state), - StringRef(cell_value.data, cell_value.size), pattern, &flag)); + unsigned char flag = dict_res[data_array[idx]]; new_size += _opposite ^ flag; } } else { diff --git a/be/src/olap/like_column_predicate.h b/be/src/olap/like_column_predicate.h index 31763d45f7e..7402d8c9f5a 100644 --- a/be/src/olap/like_column_predicate.h +++ b/be/src/olap/like_column_predicate.h @@ -101,6 +101,7 @@ private: if (nested_col.is_column_dictionary()) { auto* nested_col_ptr = vectorized::check_and_get_column< vectorized::ColumnDictionary<vectorized::Int32>>(nested_col); + const auto& dict_res = *_find_code_from_dictionary_column(*nested_col_ptr); auto& data_array = nested_col_ptr->get_data(); for (uint16_t i = 0; i < size; i++) { if (null_map_data[i]) { @@ -112,18 +113,10 @@ private: continue; } - StringRef cell_value = nested_col_ptr->get_shrink_value(data_array[i]); + unsigned char flag = dict_res[data_array[i]]; if constexpr (is_and) { - unsigned char flag = 0; - static_cast<void>((_state->scalar_function)( - const_cast<vectorized::LikeSearchState*>(&_like_state), - StringRef(cell_value.data, cell_value.size), pattern, &flag)); flags[i] &= _opposite ^ flag; } else { - unsigned char flag = 0; - static_cast<void>((_state->scalar_function)( - const_cast<vectorized::LikeSearchState*>(&_like_state), - StringRef(cell_value.data, cell_value.size), pattern, &flag)); flags[i] = _opposite ^ flag; } } @@ -136,19 +129,12 @@ private: auto* nested_col_ptr = vectorized::check_and_get_column< vectorized::ColumnDictionary<vectorized::Int32>>(column); auto& data_array = nested_col_ptr->get_data(); + const auto& dict_res = *_find_code_from_dictionary_column(*nested_col_ptr); for (uint16_t i = 0; i < size; i++) { - StringRef cell_value = nested_col_ptr->get_shrink_value(data_array[i]); + unsigned char flag = dict_res[data_array[i]]; if constexpr (is_and) { - unsigned char flag = 0; - static_cast<void>((_state->scalar_function)( - const_cast<vectorized::LikeSearchState*>(&_like_state), - StringRef(cell_value.data, cell_value.size), pattern, &flag)); flags[i] &= _opposite ^ flag; } else { - unsigned char flag = 0; - static_cast<void>((_state->scalar_function)( - const_cast<vectorized::LikeSearchState*>(&_like_state), - StringRef(cell_value.data, cell_value.size), pattern, &flag)); flags[i] = _opposite ^ flag; } } @@ -159,6 +145,49 @@ private: } } + __attribute__((flatten)) std::vector<bool>* _find_code_from_dictionary_column( + const vectorized::ColumnDictI32& column) const { + std::vector<bool>* res = nullptr; + if (_segment_id_to_cached_res_flags.if_contains( + column.get_rowset_segment_id(), + [&res](const auto& pair) { res = &pair.second; })) { + return res; + } + + std::vector<bool> tmp_res(column.dict_size(), false); + for (int i = 0; i < column.dict_size(); i++) { + StringRef cell_value = column.get_shrink_value(i); + unsigned char flag = 0; + static_cast<void>((_state->scalar_function)( + const_cast<vectorized::LikeSearchState*>(&_like_state), + StringRef(cell_value.data, cell_value.size), pattern, &flag)); + tmp_res[i] = flag; + } + // Sometimes the dict is not initialized when run comparison predicate here, for example, + // the full page is null, then the reader will skip read, so that the dictionary is not + // inited. The cached code is wrong during this case, because the following page maybe not + // null, and the dict should have items in the future. + // + // Cached code may have problems, so that add a config here, if not opened, then + // we will return the code and not cache it. + if (!column.is_dict_empty() && config::enable_low_cardinality_cache_code) { + _segment_id_to_cached_res_flags.emplace( + std::pair {column.get_rowset_segment_id(), tmp_res}); + } + + _segment_id_to_cached_res_flags.if_contains( + column.get_rowset_segment_id(), [&res](const auto& pair) { res = &pair.second; }); + return res; + } + + mutable phmap::parallel_flat_hash_map< + std::pair<RowsetId, uint32_t>, std::vector<bool>, + phmap::priv::hash_default_hash<std::pair<RowsetId, uint32_t>>, + phmap::priv::hash_default_eq<std::pair<RowsetId, uint32_t>>, + std::allocator<std::pair<const std::pair<RowsetId, uint32_t>, int32_t>>, 4, + std::shared_mutex> + _segment_id_to_cached_res_flags; + std::string _debug_string() const override { std::string info = "LikeColumnPredicate"; return info; diff --git a/be/src/vec/functions/like.cpp b/be/src/vec/functions/like.cpp index 4ed14280e4c..f3aa71d03a0 100644 --- a/be/src/vec/functions/like.cpp +++ b/be/src/vec/functions/like.cpp @@ -509,11 +509,12 @@ Status FunctionLikeBase::execute_impl(FunctionContext* context, Block& block, size_t input_rows_count) const { const auto values_col = block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); - const auto* values = check_and_get_column<ColumnString>(values_col.get()); + const auto* values = + assert_cast<const ColumnString*, TypeCheckOnRelease::DISABLE>(values_col.get()); - if (!values) { - return Status::InternalError("Not supported input arguments types"); - } + // if (!values) { + // return Status::InternalError("Not supported input arguments types"); + // } // result column auto res = ColumnUInt8::create(); ColumnUInt8::Container& vec_res = res->get_data(); @@ -578,9 +579,7 @@ Status FunctionLikeBase::execute_substring(const ColumnString::Chars& values, } /// We check that the entry does not pass through the boundaries of strings. - if (pos + needle_size <= begin + value_offsets[i]) { - result[i] = 1; - } + result[i] = pos + needle_size <= begin + value_offsets[i]; // move to next string offset pos = begin + value_offsets[i]; --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
