This is an automated email from the ASF dual-hosted git repository. airborne pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new b013c669da3 [fix](inverted index) implementation of match function without index (#36916)
b013c669da3 is described below

commit b013c669da3ae48d1bbd5c246a006628a92b482b
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Fri Jul 12 10:17:48 2024 +0800

    [fix](inverted index) implementation of match function without index (#36916)

    ## Proposed changes

    pick from #36471
---
 be/src/vec/functions/match.cpp | 179 +++++++++++++++++++++++++++++++++++++++++
 be/src/vec/functions/match.h   |  10 +--
 2 files changed, 181 insertions(+), 8 deletions(-)

diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp
index 15ec3432940..d5ced67a75f 100644
--- a/be/src/vec/functions/match.cpp
+++ b/be/src/vec/functions/match.cpp
@@ -17,6 +17,8 @@
 
 #include "vec/functions/match.h"
 
+#include <hs/hs.h>
+
 #include "runtime/query_context.h"
 #include "runtime/runtime_state.h"
 #include "util/debug_points.h"
@@ -326,6 +328,183 @@ Status FunctionMatchPhrase::execute_match(const std::string& column_name,
     return Status::OK();
 }
 
+// Row-by-row evaluation of MATCH_PHRASE_PREFIX for data without a usable inverted index.
+// The query string is tokenized once. Row i matches (result[i] is set to true) when its
+// tokens contain the query tokens consecutively: every query token except the last must
+// equal the data token, the last one only has to be a prefix of it. Rows that do not
+// match leave result[i] untouched. Always returns Status::OK() outside of the debug point.
+Status FunctionMatchPhrasePrefix::execute_match(
+        const std::string& column_name, const std::string& match_query_str, size_t input_rows_count,
+        const ColumnString* string_col, InvertedIndexCtx* inverted_index_ctx,
+        const ColumnArray::Offsets64* array_offsets, ColumnUInt8::Container& result) {
+    DBUG_EXECUTE_IF("match.invert_index_not_support_execute_match", {
+        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
+                "FunctionMatchPhrasePrefix not support execute_match");
+    })
+
+    doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN;
+    if (inverted_index_ctx) {
+        parser_type = inverted_index_ctx->parser_type;
+    }
+    VLOG_DEBUG << "begin to run FunctionMatchPhrasePrefix::execute_match, parser_type: "
+               << inverted_index_parser_type_to_string(parser_type);
+
+    // NOTE(review): inverted_index_ctx is dereferenced unconditionally below although it is
+    // null-checked above -- confirm that callers never pass nullptr.
+    auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
+                                                                        match_query_str);
+    std::vector<std::string> query_tokens;
+    doris::segment_v2::InvertedIndexReader::get_analyse_result(
+            query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
+            doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY);
+
+    if (query_tokens.empty()) {
+        VLOG_DEBUG << fmt::format(
+                "token parser result is empty for query, "
+                "please check your query: '{}' and index parser: '{}'",
+                match_query_str, inverted_index_parser_type_to_string(parser_type));
+        return Status::OK();
+    }
+
+    int32_t current_src_array_offset = 0;
+    for (size_t i = 0; i < input_rows_count; i++) {
+        auto data_tokens = analyse_data_token(column_name, inverted_index_ctx, string_col, i,
+                                              array_offsets, current_src_array_offset);
+
+        // A row with fewer tokens than the query cannot match. Skipping it also keeps the
+        // unsigned subtraction below from wrapping around, which would send the window loop
+        // far past the end of data_tokens.
+        if (data_tokens.size() < query_tokens.size()) {
+            continue;
+        }
+        const size_t window_count = data_tokens.size() - query_tokens.size() + 1;
+
+        for (size_t j = 0; j < window_count; j++) {
+            if (data_tokens[j] == query_tokens[0] || query_tokens.size() == 1) {
+                bool match = true;
+                for (size_t k = 0; k < query_tokens.size(); k++) {
+                    const std::string& data_token = data_tokens[j + k];
+                    const std::string& query_token = query_tokens[k];
+                    if (k == query_tokens.size() - 1) {
+                        // Last query token: prefix comparison only.
+                        if (data_token.compare(0, query_token.size(), query_token) != 0) {
+                            match = false;
+                            break;
+                        }
+                    } else {
+                        if (data_token != query_token) {
+                            match = false;
+                            break;
+                        }
+                    }
+                }
+                if (match) {
+                    result[i] = true;
+                    break;
+                }
+            }
+        }
+    }
+
+    return Status::OK();
+}
+
+// Row-by-row evaluation of MATCH_REGEXP for data without a usable inverted index.
+// match_query_str is compiled once as a hyperscan block-mode pattern and the tokens of
+// each row are scanned with it; result[i] is set to true as soon as one token of row i
+// matches. Rows that do not match leave result[i] untouched. An empty pattern returns
+// Status::OK() without touching result. Returns INVERTED_INDEX_INVALID_PARAMETERS when
+// the pattern fails to compile or hyperscan scratch space cannot be allocated.
+Status FunctionMatchRegexp::execute_match(const std::string& column_name,
+                                          const std::string& match_query_str,
+                                          size_t input_rows_count, const ColumnString* string_col,
+                                          InvertedIndexCtx* inverted_index_ctx,
+                                          const ColumnArray::Offsets64* array_offsets,
+                                          ColumnUInt8::Container& result) {
+    DBUG_EXECUTE_IF("match.invert_index_not_support_execute_match", {
+        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
+                "FunctionMatchRegexp not support execute_match");
+    })
+
+    doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN;
+    if (inverted_index_ctx) {
+        parser_type = inverted_index_ctx->parser_type;
+    }
+    VLOG_DEBUG << "begin to run FunctionMatchRegexp::execute_match, parser_type: "
+               << inverted_index_parser_type_to_string(parser_type);
+
+    if (match_query_str.empty()) {
+        VLOG_DEBUG << fmt::format(
+                "token parser result is empty for query, "
+                "please check your query: '{}' and index parser: '{}'",
+                match_query_str, inverted_index_parser_type_to_string(parser_type));
+        return Status::OK();
+    }
+
+    const std::string& pattern = match_query_str;
+
+    hs_database_t* database = nullptr;
+    hs_compile_error_t* compile_err = nullptr;
+    hs_scratch_t* scratch = nullptr;
+
+    if (hs_compile(pattern.data(), HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8,
+                   HS_MODE_BLOCK, nullptr, &database, &compile_err) != HS_SUCCESS) {
+        // Build the status message before hs_free_compile_error(): reading
+        // compile_err->message after the error object is freed is a use-after-free.
+        const std::string err_message =
+                std::string("hyperscan compilation failed:") + compile_err->message;
+        LOG(ERROR) << "hyperscan compilation failed: " << compile_err->message;
+        hs_free_compile_error(compile_err);
+        return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>(err_message);
+    }
+
+    if (hs_alloc_scratch(database, &scratch) != HS_SUCCESS) {
+        LOG(ERROR) << "hyperscan could not allocate scratch space.";
+        hs_free_database(database);
+        return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>(
+                "hyperscan could not allocate scratch space.");
+    }
+
+    // Match callback: records that a match occurred through the bool passed as context.
+    // Returning 0 tells hyperscan to keep scanning.
+    auto on_match = [](unsigned int id, unsigned long long from, unsigned long long to,
+                       unsigned int flags, void* context) -> int {
+        *static_cast<bool*>(context) = true;
+        return 0;
+    };
+
+    try {
+        int32_t current_src_array_offset = 0;
+        for (size_t i = 0; i < input_rows_count; i++) {
+            std::vector<std::string> data_tokens =
+                    analyse_data_token(column_name, inverted_index_ctx, string_col, i,
+                                       array_offsets, current_src_array_offset);
+
+            for (const auto& input : data_tokens) {
+                bool is_match = false;
+                if (hs_scan(database, input.data(), input.size(), 0, scratch, on_match,
+                            static_cast<void*>(&is_match)) != HS_SUCCESS) {
+                    LOG(ERROR) << "hyperscan match failed: " << input;
+                    break;
+                }
+
+                if (is_match) {
+                    result[i] = true;
+                    break;
+                }
+            }
+        }
+    }
+    // Frees the hyperscan scratch space and database. NOTE(review): presumably _CLFINALLY
+    // runs its body on both the normal and the exceptional path -- confirm.
+    _CLFINALLY({
+        hs_free_scratch(scratch);
+        hs_free_database(database);
+    })
+
+    return Status::OK();
+}
+
 void register_function_match(SimpleFunctionFactory& factory) {
     factory.register_function<FunctionMatchAny>();
     factory.register_function<FunctionMatchAll>();
diff --git a/be/src/vec/functions/match.h b/be/src/vec/functions/match.h
index db8cca17ec0..d2db27813cf 100644
--- a/be/src/vec/functions/match.h
+++ b/be/src/vec/functions/match.h
@@ -139,10 +139,7 @@ public:
                          size_t input_rows_count, const ColumnString* string_col,
                          InvertedIndexCtx* inverted_index_ctx,
                          const ColumnArray::Offsets64* array_offsets,
-                         ColumnUInt8::Container& result) override {
-        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
-                "FunctionMatchPhrasePrefix not support execute_match");
-    }
+                         ColumnUInt8::Container& result) override;
 };
 
 class FunctionMatchRegexp : public FunctionMatchBase {
@@ -156,10 +153,7 @@ public:
                          size_t input_rows_count, const ColumnString* string_col,
                          InvertedIndexCtx* inverted_index_ctx,
                          const ColumnArray::Offsets64* array_offsets,
-                         ColumnUInt8::Container& result) override {
-        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
-                "FunctionMatchRegexp not support execute_match");
-    }
+                         ColumnUInt8::Container& result) override;
 };
 
 } // namespace doris::vectorized

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org