This is an automated email from the ASF dual-hosted git repository.

airborne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new af63014eda4 [fix](inverted index) Fix for Inaccurate match_phrase_prefix Cache in Query Processing (#46310)
af63014eda4 is described below

commit af63014eda4e06ea59479f4cf8650657f2f416cd
Author: zzzxl <yangs...@selectdb.com>
AuthorDate: Mon Jan 6 16:08:46 2025 +0800

    [fix](inverted index) Fix for Inaccurate match_phrase_prefix Cache in Query Processing (#46310)
    
    Problem Summary:
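    1. Different values of inverted_index_max_expansions require separate
    query cache entries, so the value is now appended to the cache key for
    match_phrase_prefix and match_regexp queries.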
---
 .../rowset/segment_v2/inverted_index_reader.cpp    | 46 ++++++++++------------
 .../test_index_match_phrase_prefix_1.out           |  6 +++
 .../inverted_index_p0/test_index_match_regexp.out  |  6 +++
 .../test_index_match_phrase_prefix_1.groovy        |  9 +++++
 .../test_index_match_regexp.groovy                 |  7 ++++
 5 files changed, 49 insertions(+), 25 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 4fe45283cd2..fced65724e5 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -259,7 +259,7 @@ Status InvertedIndexReader::match_index_search(
         InvertedIndexQueryType query_type, const InvertedIndexQueryInfo& 
query_info,
         const FulltextIndexSearcherPtr& index_searcher,
         const std::shared_ptr<roaring::Roaring>& term_match_bitmap) {
-    TQueryOptions queryOptions = runtime_state->query_options();
+    const auto& queryOptions = runtime_state->query_options();
     try {
         SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
         auto query = QueryFactory::create(query_type, index_searcher, 
queryOptions, io_ctx);
@@ -294,24 +294,23 @@ Status FullTextIndexReader::query(const io::IOContext* 
io_ctx, OlapReaderStatist
     VLOG_DEBUG << column_name << " begin to search the fulltext index from 
clucene, query_str ["
                << search_str << "]";
 
+    const auto& queryOptions = runtime_state->query_options();
     try {
         InvertedIndexQueryInfo query_info;
         InvertedIndexQueryCache::CacheKey cache_key;
         auto index_file_key = 
_inverted_index_file_reader->get_index_file_cache_key(&_index_meta);
 
+        // terms
         if (query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) {
-            cache_key = {index_file_key, column_name, query_type, search_str};
             query_info.terms.emplace_back(search_str);
+        } else if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
+            PhraseQuery::parser_info(search_str, column_name, query_type, 
_index_meta.properties(),
+                                     query_info, 
queryOptions.enable_phrase_query_sequential_opt);
         } else {
-            if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
-                PhraseQuery::parser_info(
-                        search_str, column_name, query_type, 
_index_meta.properties(), query_info,
-                        
runtime_state->query_options().enable_phrase_query_sequential_opt);
-            } else {
-                query_info.terms = 
inverted_index::InvertedIndexAnalyzer::get_analyse_result(
-                        search_str, column_name, query_type, 
_index_meta.properties());
-            }
+            query_info.terms = 
inverted_index::InvertedIndexAnalyzer::get_analyse_result(
+                    search_str, column_name, query_type, 
_index_meta.properties());
         }
+
         if (query_info.terms.empty()) {
             auto msg = fmt::format(
                     "token parser result is empty for query, "
@@ -325,22 +324,20 @@ Status FullTextIndexReader::query(const io::IOContext* 
io_ctx, OlapReaderStatist
             }
         }
 
-        std::unique_ptr<lucene::search::Query> query;
+        // field_name
         query_info.field_name = StringUtil::string_to_wstring(column_name);
 
-        if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY ||
-            query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY ||
-            query_type == InvertedIndexQueryType::MATCH_PHRASE_EDGE_QUERY ||
-            query_type == InvertedIndexQueryType::MATCH_ALL_QUERY ||
-            query_type == InvertedIndexQueryType::EQUAL_QUERY ||
-            query_type == InvertedIndexQueryType::MATCH_ANY_QUERY) {
-            std::string str_tokens = join(query_info.terms, " ");
-            if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
-                str_tokens += " " + std::to_string(query_info.slop);
-                str_tokens += " " + std::to_string(query_info.ordered);
-            }
-            cache_key = {index_file_key, column_name, query_type, str_tokens};
+        // cache_key
+        std::string str_tokens = join(query_info.terms, " ");
+        if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
+            str_tokens += " " + std::to_string(query_info.slop);
+            str_tokens += " " + std::to_string(query_info.ordered);
+        } else if (query_type == 
InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY ||
+                   query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) {
+            str_tokens += " " + 
std::to_string(queryOptions.inverted_index_max_expansions);
         }
+        cache_key = {index_file_key, column_name, query_type, 
std::move(str_tokens)};
+
         auto* cache = InvertedIndexQueryCache::instance();
         InvertedIndexQueryCacheHandle cache_handler;
 
@@ -350,13 +347,12 @@ Status FullTextIndexReader::query(const io::IOContext* 
io_ctx, OlapReaderStatist
         if (cache_status.ok()) {
             return Status::OK();
         }
-        FulltextIndexSearcherPtr* searcher_ptr = nullptr;
 
         InvertedIndexCacheHandle inverted_index_cache_handle;
         RETURN_IF_ERROR(
                 handle_searcher_cache(runtime_state, 
&inverted_index_cache_handle, io_ctx, stats));
         auto searcher_variant = 
inverted_index_cache_handle.get_index_searcher();
-        searcher_ptr = 
std::get_if<FulltextIndexSearcherPtr>(&searcher_variant);
+        auto* searcher_ptr = 
std::get_if<FulltextIndexSearcherPtr>(&searcher_variant);
         if (searcher_ptr != nullptr) {
             term_match_bitmap = std::make_shared<roaring::Roaring>();
             RETURN_IF_ERROR(match_index_search(io_ctx, stats, runtime_state, 
query_type, query_info,
diff --git 
a/regression-test/data/inverted_index_p0/test_index_match_phrase_prefix_1.out 
b/regression-test/data/inverted_index_p0/test_index_match_phrase_prefix_1.out
index e7e934f394e..7ac0d5f7ec2 100644
--- 
a/regression-test/data/inverted_index_p0/test_index_match_phrase_prefix_1.out
+++ 
b/regression-test/data/inverted_index_p0/test_index_match_phrase_prefix_1.out
@@ -11,3 +11,9 @@
 -- !sql --
 6
 
+-- !sql --
+0
+
+-- !sql --
+1
+
diff --git a/regression-test/data/inverted_index_p0/test_index_match_regexp.out 
b/regression-test/data/inverted_index_p0/test_index_match_regexp.out
index fb5d23ad266..2c06da4147c 100644
--- a/regression-test/data/inverted_index_p0/test_index_match_regexp.out
+++ b/regression-test/data/inverted_index_p0/test_index_match_regexp.out
@@ -20,3 +20,9 @@
 -- !sql --
 0
 
+-- !sql --
+4
+
+-- !sql --
+377
+
diff --git 
a/regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix_1.groovy
 
b/regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix_1.groovy
index f42462f12a6..5ee38c9e403 100644
--- 
a/regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix_1.groovy
+++ 
b/regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix_1.groovy
@@ -47,6 +47,8 @@ suite("test_index_match_phrase_prefix_1", "nonConcurrent"){
     sql """ INSERT INTO ${indexTbName1} VALUES (6, "O1704361998540E2Cemx9S 
123456789", "O1704361998540E2Cemx9S 123456789", "O1704361998540E2Cemx9S 
123456789"); """
     sql """ INSERT INTO ${indexTbName1} VALUES (7, 
"O1704361998540E2Cemx9S*123456789", "O1704361998540E2Cemx9S*123456789", 
"O1704361998540E2Cemx9S*123456789"); """
 
+    sql """ INSERT INTO ${indexTbName1} VALUES (1, "", "s1", ""), (2, "", 
"s2", ""), (3, "", "s3", ""), (4, "", "s4", ""), (5, "", "tv s5", ""); """
+
     try {
         sql "sync"
         sql """ set enable_common_expr_pushdown = true; """
@@ -58,7 +60,14 @@ suite("test_index_match_phrase_prefix_1", "nonConcurrent"){
         qt_sql """ select count() from ${indexTbName1} where c 
match_phrase_prefix 'O1704361998540E2Cemx9S=123456789'; """
         qt_sql """ select count() from ${indexTbName1} where d 
match_phrase_prefix 'O1704361998540E2Cemx9S=123456789'; """
 
+        sql """ set inverted_index_max_expansions = 3; """
+        qt_sql """ select count() from ${indexTbName1} where c 
match_phrase_prefix 'tv s'; """
+
+        sql """ set inverted_index_max_expansions = 5; """
+        qt_sql """ select count() from ${indexTbName1} where c 
match_phrase_prefix 'tv s'; """
+
     } finally {
+        sql """ set inverted_index_max_expansions = 50; """
         GetDebugPoint().disableDebugPointForAllBEs("VMatchPredicate.execute")
     }
 }
\ No newline at end of file
diff --git 
a/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy 
b/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy
index 49f0f563989..1f508306dbb 100644
--- a/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy
+++ b/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy
@@ -90,7 +90,14 @@ suite("test_index_match_regexp", "nonConcurrent"){
         qt_sql """ select count() from test_index_match_regexp where request 
match_regexp '.*tickets.*'; """
         qt_sql """ select count() from test_index_match_regexp where request 
match_regexp 'nonexistence'; """
 
+        sql """ set inverted_index_max_expansions = 1; """
+        qt_sql """ select count() from test_index_match_regexp where request 
match_regexp 'b'; """
+        
+        sql """ set inverted_index_max_expansions = 50; """
+        qt_sql """ select count() from test_index_match_regexp where request 
match_regexp 'b'; """
+
     } finally {
+        sql """ set inverted_index_max_expansions = 50; """
         GetDebugPoint().disableDebugPointForAllBEs("VMatchPredicate.execute")
     }
 }
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to