This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 0291f84a9e [fix](like-predicate) Add missing functions in LikeColumnPredicate (#11631) 0291f84a9e is described below commit 0291f84a9eb0b9d29a99b855a63a90a155c6be05 Author: Jerry Hu <mrh...@gmail.com> AuthorDate: Wed Aug 10 15:03:14 2022 +0800 [fix](like-predicate) Add missing functions in LikeColumnPredicate (#11631) --- be/src/olap/like_column_predicate.cpp | 153 +++++++++++++++++++++++----------- be/src/olap/like_column_predicate.h | 112 +++++++++++++++++++++---- be/src/olap/reader.cpp | 4 +- be/src/olap/reader.h | 2 +- be/src/vec/exec/volap_scan_node.cpp | 9 +- be/src/vec/exec/volap_scan_node.h | 2 +- be/src/vec/functions/like.cpp | 31 ++++++- be/src/vec/functions/like.h | 13 ++- be/src/vec/olap/block_reader.cpp | 12 +++ be/src/vec/olap/block_reader.h | 2 + 10 files changed, 260 insertions(+), 80 deletions(-) diff --git a/be/src/olap/like_column_predicate.cpp b/be/src/olap/like_column_predicate.cpp index f7eea7dabb..8be04cfe1e 100644 --- a/be/src/olap/like_column_predicate.cpp +++ b/be/src/olap/like_column_predicate.cpp @@ -23,15 +23,30 @@ namespace doris { -LikeColumnPredicate::LikeColumnPredicate(bool opposite, uint32_t column_id, - doris_udf::FunctionContext* fn_ctx, - doris_udf::StringVal val) +template <> +LikeColumnPredicate<true>::LikeColumnPredicate(bool opposite, uint32_t column_id, + doris_udf::FunctionContext* fn_ctx, + doris_udf::StringVal val) + : ColumnPredicate(column_id, opposite), + _fn_ctx(fn_ctx), + pattern(reinterpret_cast<char*>(val.ptr), val.len) { + _state = reinterpret_cast<StateType*>( + _fn_ctx->get_function_state(doris_udf::FunctionContext::THREAD_LOCAL)); + _state->search_state.clone(_like_state); +} + +template <> +LikeColumnPredicate<false>::LikeColumnPredicate(bool opposite, uint32_t column_id, + doris_udf::FunctionContext* fn_ctx, + doris_udf::StringVal val) : ColumnPredicate(column_id, opposite), _fn_ctx(fn_ctx), pattern(val) { - _state = reinterpret_cast<LikePredicateState*>( + _state = reinterpret_cast<StateType*>( _fn_ctx->get_function_state(doris_udf::FunctionContext::THREAD_LOCAL)); } -void LikeColumnPredicate::evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* size) const { +template <bool is_vectorized> +void LikeColumnPredicate<is_vectorized>::evaluate(ColumnBlock* block, uint16_t* sel, + uint16_t* size) const { if (block->is_nullable()) { _base_evaluate<true>(block, sel, size); } else { @@ -39,58 +54,100 @@ void LikeColumnPredicate::evaluate(ColumnBlock* block, uint16_t* sel, uint16_t* } } -void LikeColumnPredicate::evaluate_vec(const vectorized::IColumn& column, uint16_t size, - bool* flags) const { - if (column.is_nullable()) { - auto* nullable_col = vectorized::check_and_get_column<vectorized::ColumnNullable>(column); - auto& null_map_data = nullable_col->get_null_map_column().get_data(); - auto& nested_col = nullable_col->get_nested_column(); - if (nested_col.is_column_dictionary()) { - auto* nested_col_ptr = vectorized::check_and_get_column< - vectorized::ColumnDictionary<vectorized::Int32>>(nested_col); - auto& data_array = nested_col_ptr->get_data(); - for (uint16_t i = 0; i < size; i++) { - if (null_map_data[i]) { - flags[i] = _opposite; - continue; +template <bool is_vectorized> +void LikeColumnPredicate<is_vectorized>::evaluate_vec(const vectorized::IColumn& column, + uint16_t size, bool* flags) const { + _evaluate_vec<false>(column, size, flags); +} + +template <bool is_vectorized> +void LikeColumnPredicate<is_vectorized>::evaluate_and_vec(const vectorized::IColumn& column, + uint16_t size, bool* flags) const { + _evaluate_vec<true>(column, size, flags); +} + +template <bool is_vectorized> +uint16_t LikeColumnPredicate<is_vectorized>::evaluate(const vectorized::IColumn& column, + uint16_t* sel, uint16_t size) const { + uint16_t new_size = 0; + if constexpr (is_vectorized) { + if (column.is_nullable()) { + auto* nullable_col = + vectorized::check_and_get_column<vectorized::ColumnNullable>(column); + auto& null_map_data = nullable_col->get_null_map_column().get_data(); + auto& nested_col = nullable_col->get_nested_column(); + if (nested_col.is_column_dictionary()) { + auto* nested_col_ptr = vectorized::check_and_get_column< + vectorized::ColumnDictionary<vectorized::Int32>>(nested_col); + auto& data_array = nested_col_ptr->get_data(); + for (uint16_t i = 0; i != size; i++) { + uint16_t idx = sel[i]; + sel[new_size] = idx; + if (null_map_data[idx]) { + new_size += !_opposite; + continue; + } + + StringValue cell_value = nested_col_ptr->get_value(data_array[idx]); + unsigned char flag = 0; + (_state->function)(const_cast<vectorized::LikeSearchState*>(&_like_state), + cell_value, pattern, &flag); + new_size += _opposite ^ flag; } + } else { + auto* data_array = vectorized::check_and_get_column< + vectorized::PredicateColumnType<StringValue>>(column) + ->get_data() + .data(); + for (uint16_t i = 0; i != size; i++) { + uint16_t idx = sel[i]; + sel[new_size] = idx; + if (null_map_data[idx]) { + new_size += !_opposite; + continue; + } - StringValue cell_value = nested_col_ptr->get_value(data_array[i]); - doris_udf::StringVal target; - cell_value.to_string_val(&target); - flags[i] = _opposite ^ ((_state->function)(_fn_ctx, target, pattern).val); + unsigned char flag = 0; + (_state->function)(const_cast<vectorized::LikeSearchState*>(&_like_state), + data_array[idx], pattern, &flag); + new_size += _opposite ^ flag; + } } } else { - for (uint16_t i = 0; i < size; i++) { - if (null_map_data[i]) { - flags[i] = _opposite; - continue; + if (column.is_column_dictionary()) { + auto* nested_col_ptr = vectorized::check_and_get_column< + vectorized::ColumnDictionary<vectorized::Int32>>(column); + auto& data_array = nested_col_ptr->get_data(); + for (uint16_t i = 0; i != size; i++) { + uint16_t idx = sel[i]; + sel[new_size] = idx; + StringValue cell_value = nested_col_ptr->get_value(data_array[idx]); + unsigned char flag = 0; + (_state->function)(const_cast<vectorized::LikeSearchState*>(&_like_state), + cell_value, pattern, &flag); + new_size += _opposite ^ flag; } + } else { + auto* data_array = vectorized::check_and_get_column< + vectorized::PredicateColumnType<StringValue>>(column) + ->get_data() + .data(); - StringRef cell_value = nested_col.get_data_at(i); - doris_udf::StringVal target = cell_value.to_string_val(); - flags[i] = _opposite ^ ((_state->function)(_fn_ctx, target, pattern).val); - } - } - } else { - if (column.is_column_dictionary()) { - auto* nested_col_ptr = vectorized::check_and_get_column< - vectorized::ColumnDictionary<vectorized::Int32>>(column); - auto& data_array = nested_col_ptr->get_data(); - for (uint16_t i = 0; i < size; i++) { - StringValue cell_value = nested_col_ptr->get_value(data_array[i]); - doris_udf::StringVal target; - cell_value.to_string_val(&target); - flags[i] = _opposite ^ ((_state->function)(_fn_ctx, target, pattern).val); - } - } else { - for (uint16_t i = 0; i < size; i++) { - StringRef cell_value = column.get_data_at(i); - doris_udf::StringVal target = cell_value.to_string_val(); - flags[i] = _opposite ^ ((_state->function)(_fn_ctx, target, pattern).val); + for (uint16_t i = 0; i != size; i++) { + uint16_t idx = sel[i]; + sel[new_size] = idx; + unsigned char flag = 0; + (_state->function)(const_cast<vectorized::LikeSearchState*>(&_like_state), + data_array[idx], pattern, &flag); + new_size += _opposite ^ flag; + } } } } + return new_size; } +template class LikeColumnPredicate<true>; +template class LikeColumnPredicate<false>; + } //namespace doris diff --git a/be/src/olap/like_column_predicate.h b/be/src/olap/like_column_predicate.h index fb00f7b146..4ece4b30c8 100644 --- a/be/src/olap/like_column_predicate.h +++ b/be/src/olap/like_column_predicate.h @@ -22,9 +22,11 @@ #include "udf/udf.h" #include "vec/columns/column_dictionary.h" #include "vec/core/types.h" +#include "vec/functions/like.h" namespace doris { +template <bool is_vectorized> class LikeColumnPredicate : public ColumnPredicate { public: LikeColumnPredicate(bool opposite, uint32_t column_id, doris_udf::FunctionContext* fn_ctx, @@ -45,33 +47,115 @@ public: return Status::OK(); } + uint16_t evaluate(const vectorized::IColumn& column, uint16_t* sel, + uint16_t size) const override; + + void evaluate_and_vec(const vectorized::IColumn& column, uint16_t size, + bool* flags) const override; + private: template <bool is_nullable> void _base_evaluate(const ColumnBlock* block, uint16_t* sel, uint16_t* size) const { uint16_t new_size = 0; - for (uint16_t i = 0; i < *size; ++i) { - uint16_t idx = sel[i]; - sel[new_size] = idx; - const StringValue* cell_value = - reinterpret_cast<const StringValue*>(block->cell(idx).cell_ptr()); - doris_udf::StringVal target; - cell_value->to_string_val(&target); - if constexpr (is_nullable) { - new_size += _opposite ^ (!block->cell(idx).is_null() && - (_state->function)(_fn_ctx, target, pattern).val); - } else { - new_size += _opposite ^ (_state->function)(_fn_ctx, target, pattern).val; + if constexpr (!is_vectorized) { + for (uint16_t i = 0; i < *size; ++i) { + uint16_t idx = sel[i]; + sel[new_size] = idx; + const StringValue* cell_value = + reinterpret_cast<const StringValue*>(block->cell(idx).cell_ptr()); + doris_udf::StringVal target; + cell_value->to_string_val(&target); + if constexpr (is_nullable) { + new_size += _opposite ^ (!block->cell(idx).is_null() && + (_state->function)(_fn_ctx, target, pattern).val); + } else { + new_size += _opposite ^ (_state->function)(_fn_ctx, target, pattern).val; + } } } *size = new_size; } + template <bool is_and> + void _evaluate_vec(const vectorized::IColumn& column, uint16_t size, bool* flags) const { + if constexpr (is_vectorized) { + if (column.is_nullable()) { + auto* nullable_col = + vectorized::check_and_get_column<vectorized::ColumnNullable>(column); + auto& null_map_data = nullable_col->get_null_map_column().get_data(); + auto& nested_col = nullable_col->get_nested_column(); + if (nested_col.is_column_dictionary()) { + auto* nested_col_ptr = vectorized::check_and_get_column< + vectorized::ColumnDictionary<vectorized::Int32>>(nested_col); + auto& data_array = nested_col_ptr->get_data(); + for (uint16_t i = 0; i < size; i++) { + if (null_map_data[i]) { + if constexpr (is_and) { + flags[i] &= _opposite; + } else { + flags[i] = _opposite; + } + continue; + } + + StringValue cell_value = nested_col_ptr->get_value(data_array[i]); + if constexpr (is_and) { + unsigned char flag = 0; + (_state->function)( + const_cast<vectorized::LikeSearchState*>(&_like_state), + cell_value, pattern, &flag); + flags[i] &= _opposite ^ flag; + } else { + unsigned char flag = 0; + (_state->function)( + const_cast<vectorized::LikeSearchState*>(&_like_state), + cell_value, pattern, &flag); + flags[i] = _opposite ^ flag; + } + } + } else { + LOG(FATAL) << "vectorized (not) like predicates should be dict column"; + } + } else { + if (column.is_column_dictionary()) { + auto* nested_col_ptr = vectorized::check_and_get_column< + vectorized::ColumnDictionary<vectorized::Int32>>(column); + auto& data_array = nested_col_ptr->get_data(); + for (uint16_t i = 0; i < size; i++) { + StringValue cell_value = nested_col_ptr->get_value(data_array[i]); + if constexpr (is_and) { + unsigned char flag = 0; + (_state->function)( + const_cast<vectorized::LikeSearchState*>(&_like_state), + cell_value, pattern, &flag); + flags[i] &= _opposite ^ flag; + } else { + unsigned char flag = 0; + (_state->function)( + const_cast<vectorized::LikeSearchState*>(&_like_state), + cell_value, pattern, &flag); + flags[i] = _opposite ^ flag; + } + } + } else { + LOG(FATAL) << "vectorized (not) like predicates should be dict column"; + } + } + } + } + std::string _origin; // life time controlled by scan node doris_udf::FunctionContext* _fn_ctx; - doris_udf::StringVal pattern; + using PatternType = std::conditional_t<is_vectorized, StringValue, StringVal>; + using StateType = std::conditional_t<is_vectorized, vectorized::LikeState, LikePredicateState>; + PatternType pattern; + + StateType* _state; - LikePredicateState* _state; + // A separate scratch region is required for every concurrent caller of the Hyperscan API. + // So here _like_state is separate for each instance of LikeColumnPredicate. + vectorized::LikeSearchState _like_state; }; } //namespace doris diff --git a/be/src/olap/reader.cpp b/be/src/olap/reader.cpp index 99fd896d2a..4041124d3b 100644 --- a/be/src/olap/reader.cpp +++ b/be/src/olap/reader.cpp @@ -532,8 +532,8 @@ ColumnPredicate* TabletReader::_parse_to_predicate(const FunctionFilter& functio } // currently only support like predicate - return new LikeColumnPredicate(function_filter._opposite, index, function_filter._fn_ctx, - function_filter._string_param); + return new LikeColumnPredicate<false>(function_filter._opposite, index, function_filter._fn_ctx, + function_filter._string_param); } ColumnPredicate* TabletReader::_parse_to_predicate(const TCondition& condition, diff --git a/be/src/olap/reader.h b/be/src/olap/reader.h index b02f70ddef..4b10dfdbe5 100644 --- a/be/src/olap/reader.h +++ b/be/src/olap/reader.h @@ -169,7 +169,7 @@ protected: ColumnPredicate* _parse_to_predicate( const std::pair<std::string, std::shared_ptr<IBloomFilterFuncBase>>& bloom_filter); - ColumnPredicate* _parse_to_predicate(const FunctionFilter& function_filter); + virtual ColumnPredicate* _parse_to_predicate(const FunctionFilter& function_filter); Status _init_delete_condition(const ReaderParams& read_params); diff --git a/be/src/vec/exec/volap_scan_node.cpp b/be/src/vec/exec/volap_scan_node.cpp index c3572adaac..73cba8915e 100644 --- a/be/src/vec/exec/volap_scan_node.cpp +++ b/be/src/vec/exec/volap_scan_node.cpp @@ -1372,7 +1372,7 @@ bool VOlapScanNode::_should_push_down_in_predicate(VInPredicate* pred, VExprCont bool VOlapScanNode::_should_push_down_function_filter(VectorizedFnCall* fn_call, VExprContext* expr_ctx, - std::string* constant_str, + StringVal* constant_str, doris_udf::FunctionContext** fn_ctx) { // Now only `like` function filters is supported to push down if (fn_call->fn().name.function_name != "like") { @@ -1395,7 +1395,7 @@ bool VOlapScanNode::_should_push_down_function_filter(VectorizedFnCall* fn_call, DCHECK(children[1 - i]->type().is_string_type()); if (const ColumnConst* const_column = check_and_get_column<ColumnConst>( children[1 - i]->get_const_col(expr_ctx)->column_ptr)) { - *constant_str = const_column->get_data_at(0).to_string(); + *constant_str = const_column->get_data_at(0).to_string_val(); } else { return false; } @@ -1711,11 +1711,10 @@ Status VOlapScanNode::_normalize_function_filters(VExpr* expr, VExprContext* exp if (TExprNodeType::FUNCTION_CALL == fn_expr->node_type()) { doris_udf::FunctionContext* fn_ctx = nullptr; - std::string str; + StringVal val; if (_should_push_down_function_filter(reinterpret_cast<VectorizedFnCall*>(fn_expr), - expr_ctx, &str, &fn_ctx)) { + expr_ctx, &val, &fn_ctx)) { std::string col = slot->col_name(); - StringVal val(reinterpret_cast<uint8_t*>(str.data()), str.size()); _push_down_functions.emplace_back(opposite, col, fn_ctx, val); *push_down = true; } diff --git a/be/src/vec/exec/volap_scan_node.h b/be/src/vec/exec/volap_scan_node.h index 718ec8755b..fba7fdd24a 100644 --- a/be/src/vec/exec/volap_scan_node.h +++ b/be/src/vec/exec/volap_scan_node.h @@ -135,7 +135,7 @@ private: int* slot_ref_child, const std::function<bool(const std::string&)>& fn_checker); bool _should_push_down_function_filter(VectorizedFnCall* fn_call, VExprContext* expr_ctx, - std::string* constant_str, + StringVal* constant_str, doris_udf::FunctionContext** fn_ctx); Status _append_rf_into_conjuncts(RuntimeState* state, std::vector<VExpr*>& vexprs); diff --git a/be/src/vec/functions/like.cpp b/be/src/vec/functions/like.cpp index 6e9b810ae1..306aa19a7e 100644 --- a/be/src/vec/functions/like.cpp +++ b/be/src/vec/functions/like.cpp @@ -49,6 +49,25 @@ static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\%)|(\\\\_)|([^%_]))+)"); static const re2::RE2 LIKE_STARTS_WITH_RE("(((\\\\%)|(\\\\_)|([^%_]))+)(?:%+)"); static const re2::RE2 LIKE_EQUALS_RE("(((\\\\%)|(\\\\_)|([^%_]))+)"); +Status LikeSearchState::clone(LikeSearchState& cloned) { + cloned.escape_char = escape_char; + cloned.set_search_string(search_string); + + if (hs_database) { + std::string re_pattern; + FunctionLike::convert_like_pattern(this, pattern_str, &re_pattern); + + hs_database_t* database = nullptr; + hs_scratch_t* scratch = nullptr; + RETURN_IF_ERROR(FunctionLike::hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch)); + + cloned.hs_database.reset(database); + cloned.hs_scratch.reset(scratch); + } + + return Status::OK(); +} + Status FunctionLikeBase::constant_starts_with_fn(LikeSearchState* state, const StringValue& val, const StringValue& pattern, unsigned char* result) { @@ -77,8 +96,7 @@ Status FunctionLikeBase::constant_substring_fn(LikeSearchState* state, const Str *result = true; return Status::OK(); } - StringValue pattern_value = StringValue::from_string_val(val.ptr); - *result = state->substring_pattern.search(&pattern_value) != -1; + *result = state->substring_pattern.search(&val) != -1; return Status::OK(); } @@ -314,6 +332,7 @@ Status FunctionLike::prepare(FunctionContext* context, FunctionContext::Function const auto& pattern = pattern_col->get_data_at(0); std::string pattern_str = pattern.to_string(); + state->search_state.pattern_str = pattern_str; std::string search_string; if (RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &search_string)) { remove_escape_character(&search_string); @@ -388,4 +407,12 @@ Status FunctionRegexp::prepare(FunctionContext* context, return Status::OK(); } +void register_function_like(SimpleFunctionFactory& factory) { + factory.register_function<FunctionLike>(); +} + +void register_function_regexp(SimpleFunctionFactory& factory) { + factory.register_function<FunctionRegexp>(); +} + } // namespace doris::vectorized diff --git a/be/src/vec/functions/like.h b/be/src/vec/functions/like.h index df164b20d0..ffd71b92e1 100644 --- a/be/src/vec/functions/like.h +++ b/be/src/vec/functions/like.h @@ -44,6 +44,8 @@ struct LikeSearchState { /// used. std::string search_string; + std::string pattern_str; + /// Used for LIKE predicates if the pattern is a constant argument, and is either a /// constant string or has a constant string at the beginning or end of the pattern. /// This will be set in order to check for that pattern in the corresponding part of @@ -85,6 +87,8 @@ struct LikeSearchState { LikeSearchState() : escape_char('\\') {} + Status clone(LikeSearchState& cloned); + void set_search_string(const std::string& search_string_arg) { search_string = search_string_arg; search_string_sv = StringValue(search_string); @@ -154,6 +158,8 @@ public: Status prepare(FunctionContext* context, FunctionContext::FunctionStateScope scope) override; + friend struct LikeSearchState; + private: static Status like_fn(LikeSearchState* state, const StringValue& val, const StringValue& pattern, unsigned char* result); @@ -175,11 +181,4 @@ public: Status prepare(FunctionContext* context, FunctionContext::FunctionStateScope scope) override; }; -void register_function_like(SimpleFunctionFactory& factory) { - factory.register_function<FunctionLike>(); -} - -void register_function_regexp(SimpleFunctionFactory& factory) { - factory.register_function<FunctionRegexp>(); -} } // namespace doris::vectorized diff --git a/be/src/vec/olap/block_reader.cpp b/be/src/vec/olap/block_reader.cpp index d003df213d..c811a63620 100644 --- a/be/src/vec/olap/block_reader.cpp +++ b/be/src/vec/olap/block_reader.cpp @@ -18,6 +18,7 @@ #include "vec/olap/block_reader.h" #include "common/status.h" +#include "olap/like_column_predicate.h" #include "olap/olap_common.h" #include "runtime/mem_pool.h" #include "vec/aggregate_functions/aggregate_function_reader.h" @@ -380,4 +381,15 @@ void BlockReader::_update_agg_value(MutableColumns& columns, int begin, int end, } } +ColumnPredicate* BlockReader::_parse_to_predicate(const FunctionFilter& function_filter) { + int32_t index = _tablet->field_index(function_filter._col_name); + if (index < 0) { + return nullptr; + } + + // currently only support like predicate + return new LikeColumnPredicate<true>(function_filter._opposite, index, function_filter._fn_ctx, + function_filter._string_param); +} + } // namespace doris::vectorized diff --git a/be/src/vec/olap/block_reader.h b/be/src/vec/olap/block_reader.h index 183c003132..91f0d506b6 100644 --- a/be/src/vec/olap/block_reader.h +++ b/be/src/vec/olap/block_reader.h @@ -47,6 +47,8 @@ public: std::vector<RowLocation> current_block_row_locations() { return _block_row_locations; } + ColumnPredicate* _parse_to_predicate(const FunctionFilter& function_filter) override; + private: // Directly read row from rowset and pass to upper caller. No need to do aggregation. // This is usually used for DUPLICATE KEY tables --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org