This is an automated email from the ASF dual-hosted git repository.

airborne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 8056274dd9e [fix] Implementing match_phrase_edge without index query 
method (#41658)
8056274dd9e is described below

commit 8056274dd9e531bf476a2a4a260330bb100b83d1
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Wed Oct 16 14:19:15 2024 +0800

    [fix] Implementing match_phrase_edge without index query method (#41658)
    
    1. Support match_phrase_edge queries on columns that have no inverted index.
---
 be/src/vec/functions/match.cpp                     | 66 +++++++++++++++
 be/src/vec/functions/match.h                       |  5 +-
 .../test_index_match_phrase_edge.out               | 24 ++++++
 .../test_index_match_phrase_edge.groovy            | 98 ++++++++++++++++++++++
 4 files changed, 189 insertions(+), 4 deletions(-)

diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp
index de46cf008d5..bbdabe3c506 100644
--- a/be/src/vec/functions/match.cpp
+++ b/be/src/vec/functions/match.cpp
@@ -506,6 +506,72 @@ Status FunctionMatchRegexp::execute_match(FunctionContext* 
context, const std::s
     return Status::OK();
 }
 
+Status FunctionMatchPhraseEdge::execute_match(
+        FunctionContext* context, const std::string& column_name,
+        const std::string& match_query_str, size_t input_rows_count, const 
ColumnString* string_col,
+        InvertedIndexCtx* inverted_index_ctx, const ColumnArray::Offsets64* 
array_offsets,
+        ColumnUInt8::Container& result) const {
+    RETURN_IF_ERROR(check(context, name));
+
+    std::vector<std::string> query_tokens =
+            analyse_query_str_token(inverted_index_ctx, match_query_str, 
column_name);
+    if (query_tokens.empty()) {
+        VLOG_DEBUG << fmt::format(
+                "token parser result is empty for query, "
+                "please check your query: '{}' and index parser: '{}'",
+                match_query_str,
+                
inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
+        return Status::OK();
+    }
+
+    int32_t current_src_array_offset = 0;
+    for (size_t i = 0; i < input_rows_count; i++) {
+        auto data_tokens = analyse_data_token(column_name, inverted_index_ctx, 
string_col, i,
+                                              array_offsets, 
current_src_array_offset);
+
+        int32_t dis_count = data_tokens.size() - query_tokens.size();
+        if (dis_count < 0) {
+            continue;
+        }
+
+        for (size_t j = 0; j < dis_count + 1; j++) {
+            bool match = true;
+            if (query_tokens.size() == 1) {
+                if (data_tokens[j].find(query_tokens[0]) == std::string::npos) 
{
+                    match = false;
+                }
+            } else {
+                for (size_t k = 0; k < query_tokens.size(); k++) {
+                    const std::string& data_token = data_tokens[j + k];
+                    const std::string& query_token = query_tokens[k];
+                    if (k == 0) {
+                        if (!data_token.ends_with(query_token)) {
+                            match = false;
+                            break;
+                        }
+                    } else if (k == query_tokens.size() - 1) {
+                        if (!data_token.starts_with(query_token)) {
+                            match = false;
+                            break;
+                        }
+                    } else {
+                        if (data_token != query_token) {
+                            match = false;
+                            break;
+                        }
+                    }
+                }
+            }
+            if (match) {
+                result[i] = true;
+                break;
+            }
+        }
+    }
+
+    return Status::OK();
+}
+
 void register_function_match(SimpleFunctionFactory& factory) {
     factory.register_function<FunctionMatchAny>();
     factory.register_function<FunctionMatchAll>();
diff --git a/be/src/vec/functions/match.h b/be/src/vec/functions/match.h
index a4cea93852a..477ab0a3409 100644
--- a/be/src/vec/functions/match.h
+++ b/be/src/vec/functions/match.h
@@ -180,10 +180,7 @@ public:
                          const std::string& match_query_str, size_t 
input_rows_count,
                          const ColumnString* string_col, InvertedIndexCtx* 
inverted_index_ctx,
                          const ColumnArray::Offsets64* array_offsets,
-                         ColumnUInt8::Container& result) const override {
-        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
-                "FunctionMatchPhraseEdge not support execute_match");
-    }
+                         ColumnUInt8::Container& result) const override;
 };
 
 } // namespace doris::vectorized
diff --git 
a/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out 
b/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out
index 8accc202576..71714c41b3b 100644
--- a/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out
+++ b/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out
@@ -41,3 +41,27 @@
 -- !sql --
 6
 
+-- !sql --
+0
+
+-- !sql --
+874
+
+-- !sql --
+150
+
+-- !sql --
+20
+
+-- !sql --
+0
+
+-- !sql --
+874
+
+-- !sql --
+150
+
+-- !sql --
+20
+
diff --git 
a/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy 
b/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy
index b7fe5664556..147291eb77b 100644
--- 
a/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy
+++ 
b/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy
@@ -79,4 +79,102 @@ suite("test_index_match_phrase_edge", "nonConcurrent"){
     } finally {
         GetDebugPoint().disableDebugPointForAllBEs("VMatchPredicate.execute")
     }
+
+    def indexTbName2 = "test_index_match_phrase_edge2"
+    def indexTbName3 = "test_index_match_phrase_edge3"
+
+    sql "DROP TABLE IF EXISTS ${indexTbName2}"
+    sql "DROP TABLE IF EXISTS ${indexTbName3}"
+
+    sql """
+      CREATE TABLE ${indexTbName2} (
+      `@timestamp` int(11) NULL COMMENT "",
+      `clientip` varchar(20) NULL COMMENT "",
+      `request` text NULL COMMENT "",
+      `status` int(11) NULL COMMENT "",
+      `size` int(11) NULL COMMENT "",
+      INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = 
"english", "support_phrase" = "true") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`@timestamp`)
+      COMMENT "OLAP"
+      DISTRIBUTED BY RANDOM BUCKETS 1
+      PROPERTIES (
+      "replication_allocation" = "tag.location.default: 1"
+      );
+    """
+
+    sql """
+      CREATE TABLE ${indexTbName3} (
+      `@timestamp` int(11) NULL COMMENT "",
+      `clientip` varchar(20) NULL COMMENT "",
+      `request` text NULL COMMENT "",
+      `status` int(11) NULL COMMENT "",
+      `size` int(11) NULL COMMENT ""
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`@timestamp`)
+      COMMENT "OLAP"
+      DISTRIBUTED BY RANDOM BUCKETS 1
+      PROPERTIES (
+      "replication_allocation" = "tag.location.default: 1"
+      );
+    """
+
+    def load_httplogs_data = {table_name, label, read_flag, format_flag, 
file_name, ignore_failure=false,
+                        expected_succ_rows = -1, load_to_single_tablet = 
'true' ->
+        
+        // load the json data
+        streamLoad {
+            table "${table_name}"
+            
+            // set http request header params
+            set 'label', label + "_" + UUID.randomUUID().toString()
+            set 'read_json_by_line', read_flag
+            set 'format', format_flag
+            file file_name // import json file
+            time 10000 // limit inflight 10s
+            if (expected_succ_rows >= 0) {
+                set 'max_filter_ratio', '1'
+            }
+
+            // A declared check callback replaces the default checks,
+            // so every condition must be verified explicitly here.
+            check { result, exception, startTime, endTime ->
+                       if (ignore_failure && expected_succ_rows < 0) { return }
+                    if (exception != null) {
+                        throw exception
+                    }
+                    log.info("Stream load result: ${result}".toString())
+                    def json = parseJson(result)
+                    assertEquals("success", json.Status.toLowerCase())
+                    if (expected_succ_rows >= 0) {
+                        assertEquals(json.NumberLoadedRows, expected_succ_rows)
+                    } else {
+                        assertEquals(json.NumberTotalRows, 
json.NumberLoadedRows + json.NumberUnselectedRows)
+                        assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes 
> 0)
+                }
+            }
+        }
+    }
+
+    try {
+        load_httplogs_data.call(indexTbName2, indexTbName2, 'true', 'json', 
'documents-1000.json')
+        load_httplogs_data.call(indexTbName3, indexTbName3, 'true', 'json', 
'documents-1000.json')
+
+        sql "sync"
+        sql """ set enable_common_expr_pushdown = true; """
+
+        GetDebugPoint().enableDebugPointForAllBEs("VMatchPredicate.execute")
+        qt_sql """ select count() from ${indexTbName2} where request 
match_phrase_edge ''; """
+        qt_sql """ select count() from ${indexTbName2} where request 
match_phrase_edge 'age'; """
+        qt_sql """ select count() from ${indexTbName2} where request 
match_phrase_edge 'es/na'; """
+        qt_sql """ select count() from ${indexTbName2} where request 
match_phrase_edge 'ets/images/ti'; """
+        GetDebugPoint().disableDebugPointForAllBEs("VMatchPredicate.execute")
+
+        qt_sql """ select count() from ${indexTbName3} where request 
match_phrase_edge ''; """
+        qt_sql """ select count() from ${indexTbName3} where request 
match_phrase_edge 'age'; """
+        qt_sql """ select count() from ${indexTbName3} where request 
match_phrase_edge 'es/na'; """
+        qt_sql """ select count() from ${indexTbName3} where request 
match_phrase_edge 'ets/images/ti'; """
+    } finally {
+        GetDebugPoint().disableDebugPointForAllBEs("VMatchPredicate.execute")
+    }
 }
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to