This is an automated email from the ASF dual-hosted git repository. airborne pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 8056274dd9e [fix] Implementing match_phrase_edge without index query method (#41658) 8056274dd9e is described below commit 8056274dd9e531bf476a2a4a260330bb100b83d1 Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com> AuthorDate: Wed Oct 16 14:19:15 2024 +0800 [fix] Implementing match_phrase_edge without index query method (#41658) 1. Supports match_phrase_edge query without creating an inverted index. --- be/src/vec/functions/match.cpp | 66 +++++++++++++++ be/src/vec/functions/match.h | 5 +- .../test_index_match_phrase_edge.out | 24 ++++++ .../test_index_match_phrase_edge.groovy | 98 ++++++++++++++++++++++ 4 files changed, 189 insertions(+), 4 deletions(-) diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp index de46cf008d5..bbdabe3c506 100644 --- a/be/src/vec/functions/match.cpp +++ b/be/src/vec/functions/match.cpp @@ -506,6 +506,72 @@ Status FunctionMatchRegexp::execute_match(FunctionContext* context, const std::s return Status::OK(); } +Status FunctionMatchPhraseEdge::execute_match( + FunctionContext* context, const std::string& column_name, + const std::string& match_query_str, size_t input_rows_count, const ColumnString* string_col, + InvertedIndexCtx* inverted_index_ctx, const ColumnArray::Offsets64* array_offsets, + ColumnUInt8::Container& result) const { + RETURN_IF_ERROR(check(context, name)); + + std::vector<std::string> query_tokens = + analyse_query_str_token(inverted_index_ctx, match_query_str, column_name); + if (query_tokens.empty()) { + VLOG_DEBUG << fmt::format( + "token parser result is empty for query, " + "please check your query: '{}' and index parser: '{}'", + match_query_str, + inverted_index_parser_type_to_string(inverted_index_ctx->parser_type)); + return Status::OK(); + } + + int32_t current_src_array_offset = 0; + for (size_t i = 0; i < input_rows_count; i++) { + auto data_tokens = analyse_data_token(column_name, inverted_index_ctx, string_col, i, + array_offsets, current_src_array_offset); + + int32_t dis_count = data_tokens.size() - query_tokens.size(); + if (dis_count < 0) { + continue; + } + + for (size_t j = 0; j < dis_count + 1; j++) { + bool match = true; + if (query_tokens.size() == 1) { + if (data_tokens[j].find(query_tokens[0]) == std::string::npos) { + match = false; + } + } else { + for (size_t k = 0; k < query_tokens.size(); k++) { + const std::string& data_token = data_tokens[j + k]; + const std::string& query_token = query_tokens[k]; + if (k == 0) { + if (!data_token.ends_with(query_token)) { + match = false; + break; + } + } else if (k == query_tokens.size() - 1) { + if (!data_token.starts_with(query_token)) { + match = false; + break; + } + } else { + if (data_token != query_token) { + match = false; + break; + } + } + } + } + if (match) { + result[i] = true; + break; + } + } + } + + return Status::OK(); +} + void register_function_match(SimpleFunctionFactory& factory) { factory.register_function<FunctionMatchAny>(); factory.register_function<FunctionMatchAll>(); diff --git a/be/src/vec/functions/match.h b/be/src/vec/functions/match.h index a4cea93852a..477ab0a3409 100644 --- a/be/src/vec/functions/match.h +++ b/be/src/vec/functions/match.h @@ -180,10 +180,7 @@ public: const std::string& match_query_str, size_t input_rows_count, const ColumnString* string_col, InvertedIndexCtx* inverted_index_ctx, const ColumnArray::Offsets64* array_offsets, - ColumnUInt8::Container& result) const override { - return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>( - "FunctionMatchPhraseEdge not support execute_match"); - } + ColumnUInt8::Container& result) const override; }; } // namespace doris::vectorized diff --git a/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out b/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out index 8accc202576..71714c41b3b 100644 --- a/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out +++ b/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out @@ -41,3 +41,27 @@ -- !sql -- 6 +-- !sql -- +0 + +-- !sql -- +874 + +-- !sql -- +150 + +-- !sql -- +20 + +-- !sql -- +0 + +-- !sql -- +874 + +-- !sql -- +150 + +-- !sql -- +20 + diff --git a/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy b/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy index b7fe5664556..147291eb77b 100644 --- a/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy +++ b/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy @@ -79,4 +79,102 @@ suite("test_index_match_phrase_edge", "nonConcurrent"){ } finally { GetDebugPoint().disableDebugPointForAllBEs("VMatchPredicate.execute") } + + def indexTbName2 = "test_index_match_phrase_edge2" + def indexTbName3 = "test_index_match_phrase_edge3" + + sql "DROP TABLE IF EXISTS ${indexTbName2}" + sql "DROP TABLE IF EXISTS ${indexTbName3}" + + sql """ + CREATE TABLE ${indexTbName2} ( + `@timestamp` int(11) NULL COMMENT "", + `clientip` varchar(20) NULL COMMENT "", + `request` text NULL COMMENT "", + `status` int(11) NULL COMMENT "", + `size` int(11) NULL COMMENT "", + INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`@timestamp`) + COMMENT "OLAP" + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql """ + CREATE TABLE ${indexTbName3} ( + `@timestamp` int(11) NULL COMMENT "", + `clientip` varchar(20) NULL COMMENT "", + `request` text NULL COMMENT "", + `status` int(11) NULL COMMENT "", + `size` int(11) NULL COMMENT "" + ) ENGINE=OLAP + DUPLICATE KEY(`@timestamp`) + COMMENT "OLAP" + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + def load_httplogs_data = {table_name, label, read_flag, format_flag, file_name, ignore_failure=false, + expected_succ_rows = -1, load_to_single_tablet = 'true' -> + + // load the json data + streamLoad { + table "${table_name}" + + // set http request header params + set 'label', label + "_" + UUID.randomUUID().toString() + set 'read_json_by_line', read_flag + set 'format', format_flag + file file_name // import json file + time 10000 // limit inflight 10s + if (expected_succ_rows >= 0) { + set 'max_filter_ratio', '1' + } + + // if declared a check callback, the default check condition will ignore. + // So you must check all condition + check { result, exception, startTime, endTime -> + if (ignore_failure && expected_succ_rows < 0) { return } + if (exception != null) { + throw exception + } + log.info("Stream load result: ${result}".toString()) + def json = parseJson(result) + assertEquals("success", json.Status.toLowerCase()) + if (expected_succ_rows >= 0) { + assertEquals(json.NumberLoadedRows, expected_succ_rows) + } else { + assertEquals(json.NumberTotalRows, json.NumberLoadedRows + json.NumberUnselectedRows) + assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0) + } + } + } + } + + try { + load_httplogs_data.call(indexTbName2, indexTbName2, 'true', 'json', 'documents-1000.json') + load_httplogs_data.call(indexTbName3, indexTbName3, 'true', 'json', 'documents-1000.json') + + sql "sync" + sql """ set enable_common_expr_pushdown = true; """ + + GetDebugPoint().enableDebugPointForAllBEs("VMatchPredicate.execute") + qt_sql """ select count() from ${indexTbName2} where request match_phrase_edge ''; """ + qt_sql """ select count() from ${indexTbName2} where request match_phrase_edge 'age'; """ + qt_sql """ select count() from ${indexTbName2} where request match_phrase_edge 'es/na'; """ + qt_sql """ select count() from ${indexTbName2} where request match_phrase_edge 'ets/images/ti'; """ + GetDebugPoint().disableDebugPointForAllBEs("VMatchPredicate.execute") + + qt_sql """ select count() from ${indexTbName3} where request match_phrase_edge ''; """ + qt_sql """ select count() from ${indexTbName3} where request match_phrase_edge 'age'; """ + qt_sql """ select count() from ${indexTbName3} where request match_phrase_edge 'es/na'; """ + qt_sql """ select count() from ${indexTbName3} where request match_phrase_edge 'ets/images/ti'; """ + } finally { + GetDebugPoint().disableDebugPointForAllBEs("VMatchPredicate.execute") + } } \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org