xiaokang commented on code in PR #19936: URL: https://github.com/apache/doris/pull/19936#discussion_r1207965434
########## be/src/vec/exec/scan/vscan_node.cpp: ########## @@ -642,6 +642,11 @@ Status VScanNode::_normalize_predicate(VExpr* conjunct_expr_root, VExpr** output return Status::OK(); } + if (pdt == PushDownType::ACCEPTABLE && TExprNodeType::MATCH_PRED == cur_expr->node_type()) { Review Comment: add comment to explain why ########## be/src/olap/rowset/segment_v2/inverted_index_reader.h: ########## @@ -100,6 +100,11 @@ class InvertedIndexReader { uint32_t get_index_id() const { return _index_id; } + static std::vector<std::string> get_analyse_result(const std::string& field_name, Review Comment: Is there any risk to change wstring to string? ########## be/src/olap/rowset/segment_v2/segment_iterator.cpp: ########## @@ -785,8 +785,7 @@ Status SegmentIterator::_apply_inverted_index_on_column_predicate( Status res = pred->evaluate(_schema, _inverted_index_iterators[unique_id], num_rows(), &bitmap); if (!res.ok()) { - if ((res.code() == ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND && - pred->type() != PredicateType::MATCH) || Review Comment: change the comment below accordingly ########## be/src/vec/exprs/vmatch_predicate.h: ########## @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include <string> + +#include "common/object_pool.h" +#include "common/status.h" +#include "udf/udf.h" +#include "vec/exprs/vexpr.h" +#include "vec/functions/function.h" + +namespace doris { +class RowDescriptor; +class RuntimeState; +class TExprNode; +namespace vectorized { +class Block; +class VExprContext; +} // namespace vectorized +} // namespace doris + +namespace doris::vectorized { +struct InvertedIndexCtx { + std::string _parser_type; Review Comment: use normal field name parser_type in struct ########## be/src/olap/rowset/segment_v2/inverted_index_reader.cpp: ########## @@ -210,24 +212,24 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, const std::string auto index_file_name = InvertedIndexDescriptor::get_index_file_name(path.filename(), _index_id); auto index_file_path = index_dir / index_file_name; - std::unique_ptr<lucene::search::Query> query; - std::wstring field_ws = std::wstring(column_name.begin(), column_name.end()); try { - std::vector<std::wstring> analyse_result = - get_analyse_result(field_ws, search_str, query_type, analyser_type); + std::vector<std::string> analyse_result = + get_analyse_result(column_name, search_str, query_type, analyser_type); if (analyse_result.empty()) { LOG(WARNING) << "invalid input query_str: " << search_str << ", please check your query sql"; return Status::Error<ErrorCode::INVERTED_INDEX_NO_TERMS>(); } + std::unique_ptr<lucene::search::Query> query; + std::wstring field_ws = std::wstring(column_name.begin(), column_name.end()); roaring::Roaring query_match_bitmap; bool first = true; bool null_bitmap_already_read = false; - for (auto token_ws : analyse_result) { + for (auto token : analyse_result) { Review Comment: What's the purpose of token of string type, since it's not used after construct token_ws of wstring type? ########## be/src/vec/functions/match.cpp: ########## @@ -15,134 +15,167 @@ // specific language governing permissions and limitations // under the License. 
-#include <stddef.h> - -#include <algorithm> -#include <boost/iterator/iterator_facade.hpp> -#include <memory> -#include <ostream> -#include <string> -#include <utility> - -#include "common/config.h" -#include "common/consts.h" -#include "common/logging.h" -#include "common/status.h" -#include "vec/aggregate_functions/aggregate_function.h" -#include "vec/columns/column.h" -#include "vec/core/block.h" -#include "vec/core/column_numbers.h" -#include "vec/core/column_with_type_and_name.h" -#include "vec/core/types.h" -#include "vec/data_types/data_type_number.h" -#include "vec/functions/function.h" -#include "vec/functions/simple_function_factory.h" - -namespace doris { -class FunctionContext; -} // namespace doris +#include "vec/functions/match.h" -namespace doris::vectorized { - -class FunctionMatchBase : public IFunction { -public: - size_t get_number_of_arguments() const override { return 2; } - - String get_name() const override { return "match"; } +#include "olap/rowset/segment_v2/inverted_index_reader.h" +#include "runtime/query_context.h" +#include "runtime/runtime_state.h" - /// Get result types by argument types. If the function does not apply to these arguments, throw an exception. 
- DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { - return std::make_shared<DataTypeUInt8>(); - } +namespace doris::vectorized { - Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, - size_t result, size_t input_rows_count) override { - auto match_query_str = block.get_by_position(arguments[1]).to_string(0); - std::string column_name = block.get_by_position(arguments[0]).name; - auto match_pred_column_name = - BeConsts::BLOCK_TEMP_COLUMN_PREFIX + column_name + "_match_" + match_query_str; - if (!block.has(match_pred_column_name)) { - if (!config::enable_index_apply_preds_except_leafnode_of_andnode) { - return Status::Cancelled( - "please check whether turn on the configuration " - "'enable_index_apply_preds_except_leafnode_of_andnode'"); - } - LOG(WARNING) << "execute match query meet error, block no column: " - << match_pred_column_name; - return Status::InternalError( - "match query meet error, no match predicate evaluate result column in block."); +Status FunctionMatchBase::execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result, size_t input_rows_count) { + auto match_query_str = block.get_by_position(arguments[1]).to_string(0); + std::string column_name = block.get_by_position(arguments[0]).name; + auto match_pred_column_name = + BeConsts::BLOCK_TEMP_COLUMN_PREFIX + column_name + "_match_" + match_query_str; + if (!block.has(match_pred_column_name)) { + LOG(INFO) << "begin to execute match directly, column_name=" << column_name + << ", match_query_str=" << match_query_str; + InvertedIndexCtx* inverted_index_ctx = reinterpret_cast<InvertedIndexCtx*>( + context->get_function_state(FunctionContext::THREAD_LOCAL)); + + const auto values_col = + block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); + const auto* values = check_and_get_column<ColumnString>(values_col.get()); + if (!values) { + return Status::InternalError("Not 
supported input arguments types"); } + // result column + auto res = ColumnUInt8::create(); + ColumnUInt8::Container& vec_res = res->get_data(); + // set default value to 0, and match functions only need to set 1/true + vec_res.resize_fill(input_rows_count); + RETURN_IF_ERROR(execute_match(column_name, match_query_str, + input_rows_count, values, inverted_index_ctx, vec_res)); + block.replace_by_position(result, std::move(res)); + } else { auto match_pred_column = block.get_by_name(match_pred_column_name).column->convert_to_full_column_if_const(); - block.replace_by_position(result, std::move(match_pred_column)); - return Status::OK(); } -}; - -class FunctionMatchAny : public FunctionMatchBase { -public: - static constexpr auto name = "match_any"; - static FunctionPtr create() { return std::make_shared<FunctionMatchAny>(); } - - String get_name() const override { return name; } -}; -class FunctionMatchAll : public FunctionMatchBase { -public: - static constexpr auto name = "match_all"; - static FunctionPtr create() { return std::make_shared<FunctionMatchAll>(); } - - String get_name() const override { return name; } -}; - -class FunctionMatchPhrase : public FunctionMatchBase { -public: - static constexpr auto name = "match_phrase"; - static FunctionPtr create() { return std::make_shared<FunctionMatchPhrase>(); } - - String get_name() const override { return name; } -}; - -class FunctionMatchElementEQ : public FunctionMatchBase { -public: - static constexpr auto name = "match_element_eq"; - static FunctionPtr create() { return std::make_shared<FunctionMatchPhrase>(); } + return Status::OK(); +} - String get_name() const override { return name; } -}; +Status FunctionMatchAny::execute_match(const std::string& column_name, + const std::string& match_query_str, + size_t input_rows_count, + const ColumnString* query_values, + InvertedIndexCtx* inverted_index_ctx, + ColumnUInt8::Container& result) { + doris::InvertedIndexParserType parser_type = 
doris::InvertedIndexParserType::PARSER_UNKNOWN; + if (inverted_index_ctx) { + parser_type = get_inverted_index_parser_type_from_string(inverted_index_ctx->_parser_type); + } + LOG(INFO) << "begin to run FunctionMatchAny::execute_match, parser_type: " + << inverted_index_parser_type_to_string(parser_type); + std::vector<std::string> tokens = + doris::segment_v2::InvertedIndexReader::get_analyse_result( + column_name, match_query_str, doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY, parser_type); + for (int i = 0; i < input_rows_count; i++) { + const auto& str_ref = query_values->get_data_at(i); + std::vector<std::string> values = + doris::segment_v2::InvertedIndexReader::get_analyse_result( + column_name, str_ref.to_string(), doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY, parser_type); + // TODO: more efficient impl + for (auto& token : tokens) { + auto it = std::find(values.begin(), values.end(), token); + if (it != values.end()) { + result[i] = true; + break; + } + } + } -class FunctionMatchElementLT : public FunctionMatchBase { -public: - static constexpr auto name = "match_element_lt"; - static FunctionPtr create() { return std::make_shared<FunctionMatchPhrase>(); } + return Status::OK(); +} - String get_name() const override { return name; } -}; +Status FunctionMatchAll::execute_match(const std::string& column_name, + const std::string& match_query_str, + size_t input_rows_count, + const ColumnString* query_values, + InvertedIndexCtx* inverted_index_ctx, + ColumnUInt8::Container& result) { + doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN; + if (inverted_index_ctx) { + parser_type = get_inverted_index_parser_type_from_string(inverted_index_ctx->_parser_type); + } + LOG(INFO) << "begin to run FunctionMatchAll::execute_match, parser_type: " + << inverted_index_parser_type_to_string(parser_type); + std::vector<std::string> tokens = + doris::segment_v2::InvertedIndexReader::get_analyse_result( + 
column_name, match_query_str, doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY, parser_type); + + for (int i = 0; i < input_rows_count; i++) { + const auto& str_ref = query_values->get_data_at(i); + std::vector<std::string> values = + doris::segment_v2::InvertedIndexReader::get_analyse_result( + column_name, str_ref.to_string(), doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY, parser_type); + // TODO: more efficient impl + auto find_count = 0; + for (auto& token : tokens) { + auto it = std::find(values.begin(), values.end(), token); + if (it != values.end()) { + ++find_count; + } Review Comment: else break to skip useless comparison. ########## be/src/vec/functions/match.cpp: ########## @@ -15,134 +15,167 @@ // specific language governing permissions and limitations // under the License. -#include <stddef.h> - -#include <algorithm> -#include <boost/iterator/iterator_facade.hpp> -#include <memory> -#include <ostream> -#include <string> -#include <utility> - -#include "common/config.h" -#include "common/consts.h" -#include "common/logging.h" -#include "common/status.h" -#include "vec/aggregate_functions/aggregate_function.h" -#include "vec/columns/column.h" -#include "vec/core/block.h" -#include "vec/core/column_numbers.h" -#include "vec/core/column_with_type_and_name.h" -#include "vec/core/types.h" -#include "vec/data_types/data_type_number.h" -#include "vec/functions/function.h" -#include "vec/functions/simple_function_factory.h" - -namespace doris { -class FunctionContext; -} // namespace doris +#include "vec/functions/match.h" -namespace doris::vectorized { - -class FunctionMatchBase : public IFunction { -public: - size_t get_number_of_arguments() const override { return 2; } - - String get_name() const override { return "match"; } +#include "olap/rowset/segment_v2/inverted_index_reader.h" +#include "runtime/query_context.h" +#include "runtime/runtime_state.h" - /// Get result types by argument types. 
If the function does not apply to these arguments, throw an exception. - DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { - return std::make_shared<DataTypeUInt8>(); - } +namespace doris::vectorized { - Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, - size_t result, size_t input_rows_count) override { - auto match_query_str = block.get_by_position(arguments[1]).to_string(0); - std::string column_name = block.get_by_position(arguments[0]).name; - auto match_pred_column_name = - BeConsts::BLOCK_TEMP_COLUMN_PREFIX + column_name + "_match_" + match_query_str; - if (!block.has(match_pred_column_name)) { - if (!config::enable_index_apply_preds_except_leafnode_of_andnode) { - return Status::Cancelled( - "please check whether turn on the configuration " - "'enable_index_apply_preds_except_leafnode_of_andnode'"); - } - LOG(WARNING) << "execute match query meet error, block no column: " - << match_pred_column_name; - return Status::InternalError( - "match query meet error, no match predicate evaluate result column in block."); +Status FunctionMatchBase::execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result, size_t input_rows_count) { + auto match_query_str = block.get_by_position(arguments[1]).to_string(0); + std::string column_name = block.get_by_position(arguments[0]).name; + auto match_pred_column_name = + BeConsts::BLOCK_TEMP_COLUMN_PREFIX + column_name + "_match_" + match_query_str; + if (!block.has(match_pred_column_name)) { + LOG(INFO) << "begin to execute match directly, column_name=" << column_name Review Comment: DEBUG log ########## be/src/vec/functions/match.cpp: ########## @@ -15,134 +15,167 @@ // specific language governing permissions and limitations // under the License. 
-#include <stddef.h> - -#include <algorithm> -#include <boost/iterator/iterator_facade.hpp> -#include <memory> -#include <ostream> -#include <string> -#include <utility> - -#include "common/config.h" -#include "common/consts.h" -#include "common/logging.h" -#include "common/status.h" -#include "vec/aggregate_functions/aggregate_function.h" -#include "vec/columns/column.h" -#include "vec/core/block.h" -#include "vec/core/column_numbers.h" -#include "vec/core/column_with_type_and_name.h" -#include "vec/core/types.h" -#include "vec/data_types/data_type_number.h" -#include "vec/functions/function.h" -#include "vec/functions/simple_function_factory.h" - -namespace doris { -class FunctionContext; -} // namespace doris +#include "vec/functions/match.h" -namespace doris::vectorized { - -class FunctionMatchBase : public IFunction { -public: - size_t get_number_of_arguments() const override { return 2; } - - String get_name() const override { return "match"; } +#include "olap/rowset/segment_v2/inverted_index_reader.h" +#include "runtime/query_context.h" +#include "runtime/runtime_state.h" - /// Get result types by argument types. If the function does not apply to these arguments, throw an exception. 
- DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { - return std::make_shared<DataTypeUInt8>(); - } +namespace doris::vectorized { - Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, - size_t result, size_t input_rows_count) override { - auto match_query_str = block.get_by_position(arguments[1]).to_string(0); - std::string column_name = block.get_by_position(arguments[0]).name; - auto match_pred_column_name = - BeConsts::BLOCK_TEMP_COLUMN_PREFIX + column_name + "_match_" + match_query_str; - if (!block.has(match_pred_column_name)) { - if (!config::enable_index_apply_preds_except_leafnode_of_andnode) { - return Status::Cancelled( - "please check whether turn on the configuration " - "'enable_index_apply_preds_except_leafnode_of_andnode'"); - } - LOG(WARNING) << "execute match query meet error, block no column: " - << match_pred_column_name; - return Status::InternalError( - "match query meet error, no match predicate evaluate result column in block."); +Status FunctionMatchBase::execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result, size_t input_rows_count) { + auto match_query_str = block.get_by_position(arguments[1]).to_string(0); + std::string column_name = block.get_by_position(arguments[0]).name; + auto match_pred_column_name = + BeConsts::BLOCK_TEMP_COLUMN_PREFIX + column_name + "_match_" + match_query_str; + if (!block.has(match_pred_column_name)) { + LOG(INFO) << "begin to execute match directly, column_name=" << column_name Review Comment: and all logs below ########## be/src/vec/exprs/vmatch_predicate.cpp: ########## @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/exprs/vmatch_predicate.h" + +#include <fmt/format.h> +#include <fmt/ranges.h> // IWYU pragma: keep +#include <gen_cpp/Exprs_types.h> +#include <glog/logging.h> +#include <stddef.h> + +#include <algorithm> +#include <memory> +#include <ostream> +#include <string_view> +#include <vector> + +#include "common/status.h" +#include "vec/core/block.h" +#include "vec/core/column_numbers.h" +#include "vec/core/column_with_type_and_name.h" +#include "vec/core/columns_with_type_and_name.h" +#include "vec/exprs/vexpr_context.h" +#include "vec/functions/simple_function_factory.h" + +namespace doris { +class RowDescriptor; +class RuntimeState; +} // namespace doris + +namespace doris::vectorized { + +VMatchPredicate::VMatchPredicate(const TExprNode& node) + : VExpr(node) { + _inverted_index_ctx = std::make_shared<InvertedIndexCtx>(); + _inverted_index_ctx->_parser_type = node.match_predicate.parser_type; +} + +Status VMatchPredicate::prepare(RuntimeState* state, const RowDescriptor& desc, + VExprContext* context) { + RETURN_IF_ERROR_OR_PREPARED(VExpr::prepare(state, desc, context)); + + ColumnsWithTypeAndName argument_template; + argument_template.reserve(_children.size()); + std::vector<std::string_view> child_expr_name; + for (auto child : _children) { + argument_template.emplace_back(nullptr, child->data_type(), child->expr_name()); + 
child_expr_name.emplace_back(child->expr_name()); + } + + _function = SimpleFunctionFactory::instance().get_function( + _fn.name.function_name, argument_template, _data_type); Review Comment: How is _fn inited? Where is function_name passed? ########## be/src/vec/functions/match.cpp: ########## @@ -15,134 +15,167 @@ // specific language governing permissions and limitations // under the License. -#include <stddef.h> - -#include <algorithm> -#include <boost/iterator/iterator_facade.hpp> -#include <memory> -#include <ostream> -#include <string> -#include <utility> - -#include "common/config.h" -#include "common/consts.h" -#include "common/logging.h" -#include "common/status.h" -#include "vec/aggregate_functions/aggregate_function.h" -#include "vec/columns/column.h" -#include "vec/core/block.h" -#include "vec/core/column_numbers.h" -#include "vec/core/column_with_type_and_name.h" -#include "vec/core/types.h" -#include "vec/data_types/data_type_number.h" -#include "vec/functions/function.h" -#include "vec/functions/simple_function_factory.h" - -namespace doris { -class FunctionContext; -} // namespace doris +#include "vec/functions/match.h" -namespace doris::vectorized { - -class FunctionMatchBase : public IFunction { -public: - size_t get_number_of_arguments() const override { return 2; } - - String get_name() const override { return "match"; } +#include "olap/rowset/segment_v2/inverted_index_reader.h" +#include "runtime/query_context.h" +#include "runtime/runtime_state.h" - /// Get result types by argument types. If the function does not apply to these arguments, throw an exception. 
- DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { - return std::make_shared<DataTypeUInt8>(); - } +namespace doris::vectorized { - Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, - size_t result, size_t input_rows_count) override { - auto match_query_str = block.get_by_position(arguments[1]).to_string(0); - std::string column_name = block.get_by_position(arguments[0]).name; - auto match_pred_column_name = - BeConsts::BLOCK_TEMP_COLUMN_PREFIX + column_name + "_match_" + match_query_str; - if (!block.has(match_pred_column_name)) { - if (!config::enable_index_apply_preds_except_leafnode_of_andnode) { - return Status::Cancelled( - "please check whether turn on the configuration " - "'enable_index_apply_preds_except_leafnode_of_andnode'"); - } - LOG(WARNING) << "execute match query meet error, block no column: " - << match_pred_column_name; - return Status::InternalError( - "match query meet error, no match predicate evaluate result column in block."); +Status FunctionMatchBase::execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result, size_t input_rows_count) { + auto match_query_str = block.get_by_position(arguments[1]).to_string(0); + std::string column_name = block.get_by_position(arguments[0]).name; + auto match_pred_column_name = + BeConsts::BLOCK_TEMP_COLUMN_PREFIX + column_name + "_match_" + match_query_str; + if (!block.has(match_pred_column_name)) { + LOG(INFO) << "begin to execute match directly, column_name=" << column_name + << ", match_query_str=" << match_query_str; + InvertedIndexCtx* inverted_index_ctx = reinterpret_cast<InvertedIndexCtx*>( + context->get_function_state(FunctionContext::THREAD_LOCAL)); + + const auto values_col = + block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); + const auto* values = check_and_get_column<ColumnString>(values_col.get()); + if (!values) { + return Status::InternalError("Not 
supported input arguments types"); } + // result column + auto res = ColumnUInt8::create(); + ColumnUInt8::Container& vec_res = res->get_data(); + // set default value to 0, and match functions only need to set 1/true + vec_res.resize_fill(input_rows_count); + RETURN_IF_ERROR(execute_match(column_name, match_query_str, + input_rows_count, values, inverted_index_ctx, vec_res)); + block.replace_by_position(result, std::move(res)); + } else { auto match_pred_column = block.get_by_name(match_pred_column_name).column->convert_to_full_column_if_const(); - block.replace_by_position(result, std::move(match_pred_column)); - return Status::OK(); } -}; - -class FunctionMatchAny : public FunctionMatchBase { -public: - static constexpr auto name = "match_any"; - static FunctionPtr create() { return std::make_shared<FunctionMatchAny>(); } - - String get_name() const override { return name; } -}; -class FunctionMatchAll : public FunctionMatchBase { -public: - static constexpr auto name = "match_all"; - static FunctionPtr create() { return std::make_shared<FunctionMatchAll>(); } - - String get_name() const override { return name; } -}; - -class FunctionMatchPhrase : public FunctionMatchBase { -public: - static constexpr auto name = "match_phrase"; - static FunctionPtr create() { return std::make_shared<FunctionMatchPhrase>(); } - - String get_name() const override { return name; } -}; - -class FunctionMatchElementEQ : public FunctionMatchBase { -public: - static constexpr auto name = "match_element_eq"; - static FunctionPtr create() { return std::make_shared<FunctionMatchPhrase>(); } + return Status::OK(); +} - String get_name() const override { return name; } -}; +Status FunctionMatchAny::execute_match(const std::string& column_name, + const std::string& match_query_str, + size_t input_rows_count, + const ColumnString* query_values, + InvertedIndexCtx* inverted_index_ctx, + ColumnUInt8::Container& result) { + doris::InvertedIndexParserType parser_type = 
doris::InvertedIndexParserType::PARSER_UNKNOWN; + if (inverted_index_ctx) { + parser_type = get_inverted_index_parser_type_from_string(inverted_index_ctx->_parser_type); + } + LOG(INFO) << "begin to run FunctionMatchAny::execute_match, parser_type: " + << inverted_index_parser_type_to_string(parser_type); + std::vector<std::string> tokens = + doris::segment_v2::InvertedIndexReader::get_analyse_result( + column_name, match_query_str, doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY, parser_type); + for (int i = 0; i < input_rows_count; i++) { + const auto& str_ref = query_values->get_data_at(i); + std::vector<std::string> values = Review Comment: data_tokens ########## be/src/vec/functions/match.cpp: ########## @@ -15,134 +15,167 @@ // specific language governing permissions and limitations // under the License. -#include <stddef.h> - -#include <algorithm> -#include <boost/iterator/iterator_facade.hpp> -#include <memory> -#include <ostream> -#include <string> -#include <utility> - -#include "common/config.h" -#include "common/consts.h" -#include "common/logging.h" -#include "common/status.h" -#include "vec/aggregate_functions/aggregate_function.h" -#include "vec/columns/column.h" -#include "vec/core/block.h" -#include "vec/core/column_numbers.h" -#include "vec/core/column_with_type_and_name.h" -#include "vec/core/types.h" -#include "vec/data_types/data_type_number.h" -#include "vec/functions/function.h" -#include "vec/functions/simple_function_factory.h" - -namespace doris { -class FunctionContext; -} // namespace doris +#include "vec/functions/match.h" -namespace doris::vectorized { - -class FunctionMatchBase : public IFunction { -public: - size_t get_number_of_arguments() const override { return 2; } - - String get_name() const override { return "match"; } +#include "olap/rowset/segment_v2/inverted_index_reader.h" +#include "runtime/query_context.h" +#include "runtime/runtime_state.h" - /// Get result types by argument types. 
If the function does not apply to these arguments, throw an exception. - DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { - return std::make_shared<DataTypeUInt8>(); - } +namespace doris::vectorized { - Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, - size_t result, size_t input_rows_count) override { - auto match_query_str = block.get_by_position(arguments[1]).to_string(0); - std::string column_name = block.get_by_position(arguments[0]).name; - auto match_pred_column_name = - BeConsts::BLOCK_TEMP_COLUMN_PREFIX + column_name + "_match_" + match_query_str; - if (!block.has(match_pred_column_name)) { - if (!config::enable_index_apply_preds_except_leafnode_of_andnode) { - return Status::Cancelled( - "please check whether turn on the configuration " - "'enable_index_apply_preds_except_leafnode_of_andnode'"); - } - LOG(WARNING) << "execute match query meet error, block no column: " - << match_pred_column_name; - return Status::InternalError( - "match query meet error, no match predicate evaluate result column in block."); +Status FunctionMatchBase::execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result, size_t input_rows_count) { + auto match_query_str = block.get_by_position(arguments[1]).to_string(0); + std::string column_name = block.get_by_position(arguments[0]).name; + auto match_pred_column_name = + BeConsts::BLOCK_TEMP_COLUMN_PREFIX + column_name + "_match_" + match_query_str; + if (!block.has(match_pred_column_name)) { + LOG(INFO) << "begin to execute match directly, column_name=" << column_name + << ", match_query_str=" << match_query_str; + InvertedIndexCtx* inverted_index_ctx = reinterpret_cast<InvertedIndexCtx*>( + context->get_function_state(FunctionContext::THREAD_LOCAL)); + + const auto values_col = + block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); + const auto* values = 
check_and_get_column<ColumnString>(values_col.get()); + if (!values) { + return Status::InternalError("Not supported input arguments types"); } + // result column + auto res = ColumnUInt8::create(); + ColumnUInt8::Container& vec_res = res->get_data(); + // set default value to 0, and match functions only need to set 1/true + vec_res.resize_fill(input_rows_count); + RETURN_IF_ERROR(execute_match(column_name, match_query_str, + input_rows_count, values, inverted_index_ctx, vec_res)); + block.replace_by_position(result, std::move(res)); + } else { auto match_pred_column = block.get_by_name(match_pred_column_name).column->convert_to_full_column_if_const(); - block.replace_by_position(result, std::move(match_pred_column)); - return Status::OK(); } -}; - -class FunctionMatchAny : public FunctionMatchBase { -public: - static constexpr auto name = "match_any"; - static FunctionPtr create() { return std::make_shared<FunctionMatchAny>(); } - - String get_name() const override { return name; } -}; -class FunctionMatchAll : public FunctionMatchBase { -public: - static constexpr auto name = "match_all"; - static FunctionPtr create() { return std::make_shared<FunctionMatchAll>(); } - - String get_name() const override { return name; } -}; - -class FunctionMatchPhrase : public FunctionMatchBase { -public: - static constexpr auto name = "match_phrase"; - static FunctionPtr create() { return std::make_shared<FunctionMatchPhrase>(); } - - String get_name() const override { return name; } -}; - -class FunctionMatchElementEQ : public FunctionMatchBase { -public: - static constexpr auto name = "match_element_eq"; - static FunctionPtr create() { return std::make_shared<FunctionMatchPhrase>(); } + return Status::OK(); +} - String get_name() const override { return name; } -}; +Status FunctionMatchAny::execute_match(const std::string& column_name, + const std::string& match_query_str, + size_t input_rows_count, + const ColumnString* query_values, + InvertedIndexCtx* inverted_index_ctx, + 
ColumnUInt8::Container& result) { + doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN; + if (inverted_index_ctx) { + parser_type = get_inverted_index_parser_type_from_string(inverted_index_ctx->_parser_type); + } + LOG(INFO) << "begin to run FunctionMatchAny::execute_match, parser_type: " + << inverted_index_parser_type_to_string(parser_type); + std::vector<std::string> tokens = Review Comment: query_tokens ########## be/src/vec/functions/match.cpp: ########## @@ -15,134 +15,167 @@ // specific language governing permissions and limitations // under the License. -#include <stddef.h> - -#include <algorithm> -#include <boost/iterator/iterator_facade.hpp> -#include <memory> -#include <ostream> -#include <string> -#include <utility> - -#include "common/config.h" -#include "common/consts.h" -#include "common/logging.h" -#include "common/status.h" -#include "vec/aggregate_functions/aggregate_function.h" -#include "vec/columns/column.h" -#include "vec/core/block.h" -#include "vec/core/column_numbers.h" -#include "vec/core/column_with_type_and_name.h" -#include "vec/core/types.h" -#include "vec/data_types/data_type_number.h" -#include "vec/functions/function.h" -#include "vec/functions/simple_function_factory.h" - -namespace doris { -class FunctionContext; -} // namespace doris +#include "vec/functions/match.h" -namespace doris::vectorized { - -class FunctionMatchBase : public IFunction { -public: - size_t get_number_of_arguments() const override { return 2; } - - String get_name() const override { return "match"; } +#include "olap/rowset/segment_v2/inverted_index_reader.h" +#include "runtime/query_context.h" +#include "runtime/runtime_state.h" - /// Get result types by argument types. If the function does not apply to these arguments, throw an exception. 
- DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { - return std::make_shared<DataTypeUInt8>(); - } +namespace doris::vectorized { - Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, - size_t result, size_t input_rows_count) override { - auto match_query_str = block.get_by_position(arguments[1]).to_string(0); - std::string column_name = block.get_by_position(arguments[0]).name; - auto match_pred_column_name = - BeConsts::BLOCK_TEMP_COLUMN_PREFIX + column_name + "_match_" + match_query_str; - if (!block.has(match_pred_column_name)) { - if (!config::enable_index_apply_preds_except_leafnode_of_andnode) { - return Status::Cancelled( - "please check whether turn on the configuration " - "'enable_index_apply_preds_except_leafnode_of_andnode'"); - } - LOG(WARNING) << "execute match query meet error, block no column: " - << match_pred_column_name; - return Status::InternalError( - "match query meet error, no match predicate evaluate result column in block."); +Status FunctionMatchBase::execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result, size_t input_rows_count) { + auto match_query_str = block.get_by_position(arguments[1]).to_string(0); + std::string column_name = block.get_by_position(arguments[0]).name; + auto match_pred_column_name = + BeConsts::BLOCK_TEMP_COLUMN_PREFIX + column_name + "_match_" + match_query_str; + if (!block.has(match_pred_column_name)) { + LOG(INFO) << "begin to execute match directly, column_name=" << column_name + << ", match_query_str=" << match_query_str; + InvertedIndexCtx* inverted_index_ctx = reinterpret_cast<InvertedIndexCtx*>( + context->get_function_state(FunctionContext::THREAD_LOCAL)); + + const auto values_col = + block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); + const auto* values = check_and_get_column<ColumnString>(values_col.get()); + if (!values) { + return Status::InternalError("Not 
supported input arguments types"); } + // result column + auto res = ColumnUInt8::create(); + ColumnUInt8::Container& vec_res = res->get_data(); + // set default value to 0, and match functions only need to set 1/true + vec_res.resize_fill(input_rows_count); + RETURN_IF_ERROR(execute_match(column_name, match_query_str, + input_rows_count, values, inverted_index_ctx, vec_res)); + block.replace_by_position(result, std::move(res)); + } else { auto match_pred_column = block.get_by_name(match_pred_column_name).column->convert_to_full_column_if_const(); - block.replace_by_position(result, std::move(match_pred_column)); - return Status::OK(); } -}; - -class FunctionMatchAny : public FunctionMatchBase { -public: - static constexpr auto name = "match_any"; - static FunctionPtr create() { return std::make_shared<FunctionMatchAny>(); } - - String get_name() const override { return name; } -}; -class FunctionMatchAll : public FunctionMatchBase { -public: - static constexpr auto name = "match_all"; - static FunctionPtr create() { return std::make_shared<FunctionMatchAll>(); } - - String get_name() const override { return name; } -}; - -class FunctionMatchPhrase : public FunctionMatchBase { -public: - static constexpr auto name = "match_phrase"; - static FunctionPtr create() { return std::make_shared<FunctionMatchPhrase>(); } - - String get_name() const override { return name; } -}; - -class FunctionMatchElementEQ : public FunctionMatchBase { -public: - static constexpr auto name = "match_element_eq"; - static FunctionPtr create() { return std::make_shared<FunctionMatchPhrase>(); } + return Status::OK(); +} - String get_name() const override { return name; } -}; +Status FunctionMatchAny::execute_match(const std::string& column_name, + const std::string& match_query_str, + size_t input_rows_count, + const ColumnString* query_values, + InvertedIndexCtx* inverted_index_ctx, + ColumnUInt8::Container& result) { + doris::InvertedIndexParserType parser_type = 
doris::InvertedIndexParserType::PARSER_UNKNOWN; + if (inverted_index_ctx) { + parser_type = get_inverted_index_parser_type_from_string(inverted_index_ctx->_parser_type); + } + LOG(INFO) << "begin to run FunctionMatchAny::execute_match, parser_type: " + << inverted_index_parser_type_to_string(parser_type); + std::vector<std::string> tokens = + doris::segment_v2::InvertedIndexReader::get_analyse_result( + column_name, match_query_str, doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY, parser_type); + for (int i = 0; i < input_rows_count; i++) { + const auto& str_ref = query_values->get_data_at(i); + std::vector<std::string> values = + doris::segment_v2::InvertedIndexReader::get_analyse_result( + column_name, str_ref.to_string(), doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY, parser_type); + // TODO: more efficient impl + for (auto& token : tokens) { + auto it = std::find(values.begin(), values.end(), token); + if (it != values.end()) { + result[i] = true; + break; + } + } + } -class FunctionMatchElementLT : public FunctionMatchBase { -public: - static constexpr auto name = "match_element_lt"; - static FunctionPtr create() { return std::make_shared<FunctionMatchPhrase>(); } + return Status::OK(); +} - String get_name() const override { return name; } -}; +Status FunctionMatchAll::execute_match(const std::string& column_name, + const std::string& match_query_str, + size_t input_rows_count, + const ColumnString* query_values, + InvertedIndexCtx* inverted_index_ctx, + ColumnUInt8::Container& result) { + doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN; + if (inverted_index_ctx) { + parser_type = get_inverted_index_parser_type_from_string(inverted_index_ctx->_parser_type); + } + LOG(INFO) << "begin to run FunctionMatchAll::execute_match, parser_type: " + << inverted_index_parser_type_to_string(parser_type); + std::vector<std::string> tokens = + doris::segment_v2::InvertedIndexReader::get_analyse_result( + 
column_name, match_query_str, doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY, parser_type); + + for (int i = 0; i < input_rows_count; i++) { + const auto& str_ref = query_values->get_data_at(i); + std::vector<std::string> values = + doris::segment_v2::InvertedIndexReader::get_analyse_result( + column_name, str_ref.to_string(), doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY, parser_type); + // TODO: more efficient impl + auto find_count = 0; + for (auto& token : tokens) { + auto it = std::find(values.begin(), values.end(), token); + if (it != values.end()) { + ++find_count; + } + } -class FunctionMatchElementGT : public FunctionMatchBase { -public: - static constexpr auto name = "match_element_gt"; - static FunctionPtr create() { return std::make_shared<FunctionMatchPhrase>(); } + if (find_count == tokens.size()) { + result[i] = true; + } + } - String get_name() const override { return name; } -}; + return Status::OK(); +} -class FunctionMatchElementLE : public FunctionMatchBase { -public: - static constexpr auto name = "match_element_le"; - static FunctionPtr create() { return std::make_shared<FunctionMatchPhrase>(); } +Status FunctionMatchPhrase::execute_match(const std::string& column_name, + const std::string& match_query_str, + size_t input_rows_count, + const ColumnString* query_values, + InvertedIndexCtx* inverted_index_ctx, + ColumnUInt8::Container& result) { + doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN; + if (inverted_index_ctx) { + parser_type = get_inverted_index_parser_type_from_string(inverted_index_ctx->_parser_type); + } + LOG(INFO) << "begin to run FunctionMatchPhrase::execute_match, parser_type: " + << inverted_index_parser_type_to_string(parser_type); + std::vector<std::string> tokens = + doris::segment_v2::InvertedIndexReader::get_analyse_result( + column_name, match_query_str, doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY, parser_type); + + for (int i = 0; i 
< input_rows_count; i++) { + const auto& str_ref = query_values->get_data_at(i); + std::vector<std::string> values = + doris::segment_v2::InvertedIndexReader::get_analyse_result( + column_name, str_ref.to_string(), doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY, parser_type); + // TODO: more efficient impl + std::vector<int> token_positions; + for (auto& token : tokens) { Review Comment: one more efficient and accurate impl: ``` bool matched = false; auto it = value_tokens.begin(); while (it != value_tokens.end()) { // find position of first token it = std::find(it, value_tokens.end(), query_tokens[0]); if (it != value_tokens.end()) { matched = true; it++; auto it_more = it; // compare query_tokens after the first to value_tokens one by one for (size_t i = 1; i < query_tokens.size(); i++) { if (it_more == value_tokens.end() || *it_more != query_tokens[i]) { matched = false; } it_more++; } if (matched) { break; } } } // check matched ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org