zzzxl1993 commented on code in PR #38908: URL: https://github.com/apache/doris/pull/38908#discussion_r1730327870
########## be/src/vec/exprs/vmulti_match_predicate.cpp: ########## @@ -0,0 +1,230 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/exprs/vmulti_match_predicate.h" + +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wshadow-field" +#endif + +#include <CLucene/analysis/LanguageBasedAnalyzer.h> +#include <fmt/format.h> +#include <fmt/ranges.h> // IWYU pragma: keep +#include <gen_cpp/Exprs_types.h> +#include <glog/logging.h> +#include <stddef.h> + +#include <algorithm> +#include <memory> +#include <ostream> +#include <string_view> +#include <vector> + +#include "CLucene/analysis/standard95/StandardAnalyzer.h" +#include "common/status.h" +#include "olap/rowset/segment_v2/inverted_index_reader.h" +#include "vec/core/block.h" +#include "vec/core/column_numbers.h" +#include "vec/core/column_with_type_and_name.h" +#include "vec/exprs/vexpr_context.h" +#include "vec/exprs/vliteral.h" +#include "vec/exprs/vslot_ref.h" +#include "vec/functions/simple_function_factory.h" + +namespace doris { +class RowDescriptor; +class RuntimeState; +} // namespace doris + +namespace doris::vectorized { +using namespace doris::segment_v2; + +VMultiMatchPredicate::VMultiMatchPredicate(const 
TExprNode& node) : VExpr(node) { + _inverted_index_ctx = std::make_shared<InvertedIndexCtx>(); + _inverted_index_ctx->parser_type = + get_inverted_index_parser_type_from_string(node.match_predicate.parser_type); + _inverted_index_ctx->parser_mode = node.match_predicate.parser_mode; + _inverted_index_ctx->char_filter_map = node.match_predicate.char_filter_map; + _analyzer = InvertedIndexReader::create_analyzer(_inverted_index_ctx.get()); + _analyzer->set_lowercase(node.match_predicate.parser_lowercase); + if (node.match_predicate.parser_stopwords == "none") { + _analyzer->set_stopwords(nullptr); + } else { + _analyzer->set_stopwords(&lucene::analysis::standard95::stop_words); + } + _inverted_index_ctx->analyzer = _analyzer.get(); +} + +VMultiMatchPredicate::~VMultiMatchPredicate() = default; + +Status VMultiMatchPredicate::prepare(RuntimeState* state, const RowDescriptor& desc, + VExprContext* context) { + RETURN_IF_ERROR_OR_PREPARED(VExpr::prepare(state, desc, context)); + + ColumnsWithTypeAndName argument_template; + argument_template.reserve(_children.size()); + std::vector<std::string_view> child_expr_name; + DCHECK(_children.size() >= 3); + argument_template.emplace_back(nullptr, _children[0]->data_type(), _children[0]->expr_name()); + child_expr_name.emplace_back(_children[0]->expr_name()); + argument_template.emplace_back(nullptr, _children[_children.size() - 1]->data_type(), + _children[_children.size() - 1]->expr_name()); + child_expr_name.emplace_back(_children[_children.size() - 1]->expr_name()); + //NOTE: set function name to match_phrase_prefix currently + _function_name = "match_phrase_prefix"; + // result column always not null + if (_data_type->is_nullable()) { + _function = SimpleFunctionFactory::instance().get_function( + _function_name, argument_template, remove_nullable(_data_type)); + } else { + _function = SimpleFunctionFactory::instance().get_function(_function_name, + argument_template, _data_type); + } + if (_function == nullptr) { + 
std::string type_str; + for (auto arg : argument_template) { + type_str = type_str + " " + arg.type->get_name(); + } + return Status::NotSupported( + "Function {} is not implemented, input param type is {}, " + "and return type is {}.", + _function_name, type_str, _data_type->get_name()); + } + + VExpr::register_function_context(state, context); + _expr_name = fmt::format("{}({})", _function_name, child_expr_name); + _prepare_finished = true; + return Status::OK(); +} + +Status VMultiMatchPredicate::open(RuntimeState* state, VExprContext* context, + FunctionContext::FunctionStateScope scope) { + DCHECK(_prepare_finished); + for (int i = 0; i < _children.size(); ++i) { + RETURN_IF_ERROR(_children[i]->open(state, context, scope)); + } + RETURN_IF_ERROR(VExpr::init_function_context(context, scope, _function)); + if (scope == FunctionContext::THREAD_LOCAL || scope == FunctionContext::FRAGMENT_LOCAL) { + context->fn_context(_fn_context_index)->set_function_state(scope, _inverted_index_ctx); + } + if (scope == FunctionContext::FRAGMENT_LOCAL) { + RETURN_IF_ERROR(VExpr::get_const_col(context, nullptr)); + } + _open_finished = true; + return Status::OK(); +} + +void VMultiMatchPredicate::close(VExprContext* context, FunctionContext::FunctionStateScope scope) { + VExpr::close_function_context(context, scope, _function); + VExpr::close(context, scope); +} + +Status VMultiMatchPredicate::evaluate_inverted_index(VExprContext* context, + uint32_t segment_num_rows) const { + auto children_num = get_num_children(); + DCHECK_GE(children_num, 3); + vectorized::ColumnsWithTypeAndName arguments; + segment_v2::InvertedIndexResultBitmap ret; + // last argument is query value + if (get_child(children_num - 1)->is_literal()) { + auto* column_literal = assert_cast<VLiteral*>(get_child(children_num - 1).get()); + arguments.emplace_back(column_literal->get_column_ptr(), column_literal->get_data_type(), + column_literal->expr_name()); + } else { + return Status::NotSupported( + "child {} in 
evaluate_inverted_index for VMultiMatchPredicate must be " + "literal, but we got {}", + children_num - 1, get_child(children_num - 1)->expr_name()); + } + // second last argument is function name + if (get_child(children_num - 2)->is_literal()) { + auto* column_literal = assert_cast<VLiteral*>(get_child(children_num - 2).get()); + auto match_type = column_literal->value(); + if (match_type != "phrase_prefix") { + return Status::NotSupported("query type is incorrect, only support phrase_prefix"); + } + } else { + return Status::NotSupported( + "child {} in evaluate_inverted_index for VMultiMatchPredicate must be " + "literal, but we got {}", + children_num - 1, get_child(children_num - 1)->expr_name()); + } + std::set<int> column_ids; + for (int child_num = 0; child_num < children_num - 2; child_num++) { + if (get_child(child_num)->is_slot_ref()) { + auto* column_slot_ref = assert_cast<VSlotRef*>(get_child(child_num).get()); + column_ids.insert(column_slot_ref->column_id()); + } else { + return Status::NotSupported( + "child {} in evaluate_inverted_index for VMultiMatchPredicate must be slot " + "ref, but " + "we " + "got {}", + child_num, get_child(child_num)->expr_name()); + } + } + + for (const auto& col_id : column_ids) { + auto result_bitmap = DORIS_TRY( Review Comment: If one of the multi_match queries fails, is it possible to fall back to a non-indexed match query? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org