This is an automated email from the ASF dual-hosted git repository.

airborne pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new b013c669da3 [fix](inverted index) implementation of match function without index (#36916)
b013c669da3 is described below

commit b013c669da3ae48d1bbd5c246a006628a92b482b
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Fri Jul 12 10:17:48 2024 +0800

    [fix](inverted index) implementation of match function without index (#36916)
    
    ## Proposed changes
    
    pick from #36471
---
 be/src/vec/functions/match.cpp | 150 +++++++++++++++++++++++++++++++++++++++++
 be/src/vec/functions/match.h   |  10 +--
 2 files changed, 152 insertions(+), 8 deletions(-)

diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp
index 15ec3432940..d5ced67a75f 100644
--- a/be/src/vec/functions/match.cpp
+++ b/be/src/vec/functions/match.cpp
@@ -17,6 +17,8 @@
 
 #include "vec/functions/match.h"
 
+#include <hs/hs.h>
+
 #include "runtime/query_context.h"
 #include "runtime/runtime_state.h"
 #include "util/debug_points.h"
@@ -326,6 +328,154 @@ Status FunctionMatchPhrase::execute_match(const 
std::string& column_name,
     return Status::OK();
 }
 
+/// Evaluates a phrase-prefix match row by row, without using an inverted index.
+///
+/// The query string is tokenized once. For every input row the row's text is tokenized via
+/// analyse_data_token() and result[i] is set to true when some run of consecutive data tokens
+/// lines up with the query tokens: every query token except the last must equal the data
+/// token at the same position, and the last query token only has to be a prefix of its data
+/// token. Rows that do not match are left untouched in `result`.
+///
+/// @param column_name        column name forwarded to the tokenizers
+/// @param match_query_str    raw query text
+/// @param input_rows_count   number of rows to evaluate
+/// @param string_col         column holding the row text
+/// @param inverted_index_ctx analyzer context; must not be null
+/// @param array_offsets      array offsets forwarded to analyse_data_token()
+/// @param result             per-row output flags, indexed by row number
+/// @return Status::OK() on success (including an empty tokenized query, which matches
+///         nothing); INVERTED_INDEX_INVALID_PARAMETERS when inverted_index_ctx is null.
+Status FunctionMatchPhrasePrefix::execute_match(
+        const std::string& column_name, const std::string& match_query_str, size_t input_rows_count,
+        const ColumnString* string_col, InvertedIndexCtx* inverted_index_ctx,
+        const ColumnArray::Offsets64* array_offsets, ColumnUInt8::Container& result) {
+    DBUG_EXECUTE_IF("match.invert_index_not_support_execute_match", {
+        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
+                "FunctionMatchPhrasePrefix not support execute_match");
+    })
+
+    doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN;
+    if (inverted_index_ctx) {
+        parser_type = inverted_index_ctx->parser_type;
+    }
+    VLOG_DEBUG << "begin to run FunctionMatchPhrasePrefix::execute_match, parser_type: "
+               << inverted_index_parser_type_to_string(parser_type);
+
+    // The query tokenizer below reads inverted_index_ctx->analyzer, so a null context cannot
+    // be served; report it instead of dereferencing a null pointer.
+    if (inverted_index_ctx == nullptr) {
+        return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>(
+                "FunctionMatchPhrasePrefix execute_match requires an inverted index context");
+    }
+
+    auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
+                                                                        match_query_str);
+    std::vector<std::string> query_tokens;
+    doris::segment_v2::InvertedIndexReader::get_analyse_result(
+            query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
+            doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY);
+
+    if (query_tokens.empty()) {
+        VLOG_DEBUG << fmt::format(
+                "token parser result is empty for query, "
+                "please check your query: '{}' and index parser: '{}'",
+                match_query_str, inverted_index_parser_type_to_string(parser_type));
+        return Status::OK();
+    }
+
+    const size_t query_len = query_tokens.size();
+    const std::string& first_query_token = query_tokens.front();
+
+    int32_t current_src_array_offset = 0;
+    for (size_t i = 0; i < input_rows_count; i++) {
+        // Called for every row, even ones that cannot match, because it advances
+        // current_src_array_offset.
+        auto data_tokens = analyse_data_token(column_name, inverted_index_ctx, string_col, i,
+                                              array_offsets, current_src_array_offset);
+
+        // A row with fewer tokens than the query can never contain the phrase. Checking this
+        // first also keeps the unsigned subtraction below from wrapping around.
+        if (data_tokens.size() < query_len) {
+            continue;
+        }
+        const size_t last_start = data_tokens.size() - query_len;
+
+        for (size_t j = 0; j <= last_start; j++) {
+            // Cheap pre-filter: with more than one query token the first one must match
+            // exactly. A single query token is itself the prefix token, so it always goes
+            // through the full comparison.
+            if (query_len != 1 && data_tokens[j] != first_query_token) {
+                continue;
+            }
+
+            bool match = true;
+            for (size_t k = 0; k < query_len; k++) {
+                const std::string& data_token = data_tokens[j + k];
+                const std::string& query_token = query_tokens[k];
+                const bool is_last = (k == query_len - 1);
+                // Last token: prefix comparison. Other tokens: exact comparison.
+                const bool token_ok =
+                        is_last ? data_token.compare(0, query_token.size(), query_token) == 0
+                                : data_token == query_token;
+                if (!token_ok) {
+                    match = false;
+                    break;
+                }
+            }
+            if (match) {
+                result[i] = true;
+                break;
+            }
+        }
+    }
+
+    return Status::OK();
+}
+
+/// Evaluates a regular-expression match row by row, without using an inverted index.
+///
+/// The query string is compiled once as a Hyperscan block-mode pattern. For every input row
+/// the row's text is tokenized via analyse_data_token() and result[i] is set to true as soon
+/// as the pattern reports a match inside any one of the row's tokens. Rows that do not match
+/// are left untouched in `result`.
+///
+/// @param column_name        column name forwarded to analyse_data_token()
+/// @param match_query_str    the regular expression; an empty string matches nothing
+/// @param input_rows_count   number of rows to evaluate
+/// @param string_col         column holding the row text
+/// @param inverted_index_ctx analyzer context; only its parser_type is read here (for
+///                           logging), the pointer is otherwise forwarded as-is
+/// @param array_offsets      array offsets forwarded to analyse_data_token()
+/// @param result             per-row output flags, indexed by row number
+/// @return Status::OK() on success; INVERTED_INDEX_INVALID_PARAMETERS when the pattern fails
+///         to compile or Hyperscan scratch space cannot be allocated.
+Status FunctionMatchRegexp::execute_match(const std::string& column_name,
+                                          const std::string& match_query_str,
+                                          size_t input_rows_count, const ColumnString* string_col,
+                                          InvertedIndexCtx* inverted_index_ctx,
+                                          const ColumnArray::Offsets64* array_offsets,
+                                          ColumnUInt8::Container& result) {
+    DBUG_EXECUTE_IF("match.invert_index_not_support_execute_match", {
+        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
+                "FunctionMatchRegexp not support execute_match");
+    })
+
+    doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN;
+    if (inverted_index_ctx) {
+        parser_type = inverted_index_ctx->parser_type;
+    }
+    VLOG_DEBUG << "begin to run FunctionMatchRegexp::execute_match, parser_type: "
+               << inverted_index_parser_type_to_string(parser_type);
+
+    if (match_query_str.empty()) {
+        VLOG_DEBUG << fmt::format(
+                "token parser result is empty for query, "
+                "please check your query: '{}' and index parser: '{}'",
+                match_query_str, inverted_index_parser_type_to_string(parser_type));
+        return Status::OK();
+    }
+
+    const std::string& pattern = match_query_str;
+
+    hs_database_t* database = nullptr;
+    hs_compile_error_t* compile_err = nullptr;
+    hs_scratch_t* scratch = nullptr;
+
+    if (hs_compile(pattern.data(), HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8,
+                   HS_MODE_BLOCK, nullptr, &database, &compile_err) != HS_SUCCESS) {
+        // Copy the message out first: it is owned by compile_err and must not be read once
+        // the error object has been released.
+        std::string reason = "unknown error";
+        if (compile_err != nullptr) {
+            if (compile_err->message != nullptr) {
+                reason = compile_err->message;
+            }
+            hs_free_compile_error(compile_err);
+        }
+        LOG(ERROR) << "hyperscan compilation failed: " << reason;
+        return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>(
+                std::string("hyperscan compilation failed:") + reason);
+    }
+
+    if (hs_alloc_scratch(database, &scratch) != HS_SUCCESS) {
+        LOG(ERROR) << "hyperscan could not allocate scratch space.";
+        hs_free_database(database);
+        return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>(
+                "hyperscan could not allocate scratch space.");
+    }
+
+    // Records that a match was seen in the bool passed as scan context. Returning non-zero
+    // asks Hyperscan to stop scanning: one hit is enough, later hits would not change the
+    // outcome.
+    auto on_match = [](unsigned int id, unsigned long long from, unsigned long long to,
+                       unsigned int flags, void* context) -> int {
+        *static_cast<bool*>(context) = true;
+        return 1;
+    };
+
+    try {
+        int32_t current_src_array_offset = 0;
+        for (size_t i = 0; i < input_rows_count; i++) {
+            std::vector<std::string> data_tokens =
+                    analyse_data_token(column_name, inverted_index_ctx, string_col, i,
+                                       array_offsets, current_src_array_offset);
+
+            for (const auto& input : data_tokens) {
+                bool is_match = false;
+                const hs_error_t scan_status = hs_scan(database, input.data(), input.size(), 0,
+                                                       scratch, on_match, &is_match);
+                // HS_SCAN_TERMINATED only means the callback stopped the scan after a hit;
+                // it is not a failure.
+                if (scan_status != HS_SUCCESS && scan_status != HS_SCAN_TERMINATED) {
+                    // Best effort: give up on this row's remaining tokens, keep going with
+                    // the next row.
+                    LOG(ERROR) << "hyperscan match failed: " << input;
+                    break;
+                }
+
+                if (is_match) {
+                    result[i] = true;
+                    break;
+                }
+            }
+        }
+    }
+    _CLFINALLY({
+        hs_free_scratch(scratch);
+        hs_free_database(database);
+    })
+
+    return Status::OK();
+}
+
 void register_function_match(SimpleFunctionFactory& factory) {
     factory.register_function<FunctionMatchAny>();
     factory.register_function<FunctionMatchAll>();
diff --git a/be/src/vec/functions/match.h b/be/src/vec/functions/match.h
index db8cca17ec0..d2db27813cf 100644
--- a/be/src/vec/functions/match.h
+++ b/be/src/vec/functions/match.h
@@ -139,10 +139,7 @@ public:
                          size_t input_rows_count, const ColumnString* 
string_col,
                          InvertedIndexCtx* inverted_index_ctx,
                          const ColumnArray::Offsets64* array_offsets,
-                         ColumnUInt8::Container& result) override {
-        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
-                "FunctionMatchPhrasePrefix not support execute_match");
-    }
+                         ColumnUInt8::Container& result) override;
 };
 
 class FunctionMatchRegexp : public FunctionMatchBase {
@@ -156,10 +153,7 @@ public:
                          size_t input_rows_count, const ColumnString* 
string_col,
                          InvertedIndexCtx* inverted_index_ctx,
                          const ColumnArray::Offsets64* array_offsets,
-                         ColumnUInt8::Container& result) override {
-        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
-                "FunctionMatchRegexp not support execute_match");
-    }
+                         ColumnUInt8::Container& result) override;
 };
 
 } // namespace doris::vectorized


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to