This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git

commit ecbfa4ec7d87643c180d101534a5a35394f71eb3
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Mon Aug 5 11:16:02 2024 +0800

    [fix](inverted index) fix match_phrase_edge query result error (#38327)
    
    1. The result of a match_phrase_edge query for a single word is incorrect
---
 .../inverted_index/query/phrase_edge_query.cpp     | 26 +++++++++++++---------
 .../inverted_index/query/phrase_edge_query.h       |  1 +
 .../test_index_match_phrase_edge.out               | 12 ++++++++++
 .../test_index_match_phrase_edge.groovy            | 11 +++++++++
 4 files changed, 40 insertions(+), 10 deletions(-)

diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp
index 428dc05e6f6..ec1b5bdd9e4 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp
@@ -31,7 +31,9 @@ namespace doris::segment_v2 {
 
 PhraseEdgeQuery::PhraseEdgeQuery(const 
std::shared_ptr<lucene::search::IndexSearcher>& searcher,
                                  const TQueryOptions& query_options)
-        : _searcher(searcher), 
_query(std::make_unique<CL_NS(search)::MultiPhraseQuery>()) {}
+        : _searcher(searcher),
+          _query(std::make_unique<CL_NS(search)::MultiPhraseQuery>()),
+          _max_expansions(query_options.inverted_index_max_expansions) {}
 
 void PhraseEdgeQuery::add(const std::wstring& field_name, const 
std::vector<std::string>& terms) {
     if (terms.empty()) {
@@ -50,9 +52,9 @@ void PhraseEdgeQuery::search(roaring::Roaring& roaring) {
 }
 
 void PhraseEdgeQuery::search_one_term(roaring::Roaring& roaring) {
-    size_t count = 0;
+    bool first = true;
     std::wstring sub_term = StringUtil::string_to_wstring(_terms[0]);
-    find_words([this, &count, &sub_term, &roaring](Term* term) {
+    find_words([this, &first, &sub_term, &roaring](Term* term) {
         std::wstring_view ws_term(term->text(), term->textLength());
         if (ws_term.find(sub_term) == std::wstring::npos) {
             return;
@@ -70,12 +72,12 @@ void PhraseEdgeQuery::search_one_term(roaring::Roaring& 
roaring) {
         }
         _CLDELETE(term_doc);
 
-        if (count) {
+        if (!first) {
             roaring.swap(result);
+            first = false;
         } else {
             roaring |= result;
         }
-        count++;
     });
 }
 
@@ -86,15 +88,19 @@ void PhraseEdgeQuery::search_multi_term(roaring::Roaring& 
roaring) {
     std::vector<CL_NS(index)::Term*> suffix_terms;
     std::vector<CL_NS(index)::Term*> prefix_terms;
 
-    find_words([&suffix_term, &suffix_terms, &prefix_term, 
&prefix_terms](Term* term) {
+    find_words([this, &suffix_term, &suffix_terms, &prefix_term, 
&prefix_terms](Term* term) {
         std::wstring_view ws_term(term->text(), term->textLength());
 
-        if (ws_term.ends_with(suffix_term)) {
-            suffix_terms.push_back(_CL_POINTER(term));
+        if (_max_expansions == 0 || suffix_terms.size() < _max_expansions) {
+            if (ws_term.ends_with(suffix_term)) {
+                suffix_terms.push_back(_CL_POINTER(term));
+            }
         }
 
-        if (ws_term.starts_with(prefix_term)) {
-            prefix_terms.push_back(_CL_POINTER(term));
+        if (_max_expansions == 0 || prefix_terms.size() < _max_expansions) {
+            if (ws_term.starts_with(prefix_term)) {
+                prefix_terms.push_back(_CL_POINTER(term));
+            }
         }
     });
 
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h 
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h
index 823f46285b1..5daf382e0d0 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h
@@ -52,6 +52,7 @@ private:
     std::wstring _field_name;
     std::vector<std::string> _terms;
     std::unique_ptr<CL_NS(search)::MultiPhraseQuery> _query;
+    int32_t _max_expansions = 50;
 };
 
 } // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out 
b/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out
index e650f9b39b2..8accc202576 100644
--- a/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out
+++ b/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out
@@ -29,3 +29,15 @@
 -- !sql --
 10     nav_tickets_off.gif     习惯于生活中很多 nav tickets off gif 虚假 nav tickets off 
gif 美化的人来说
 
+-- !sql --
+2
+
+-- !sql --
+4
+
+-- !sql --
+11
+
+-- !sql --
+6
+
diff --git 
a/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy 
b/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy
index e05f6bb1ec9..8d4ab3d2320 100644
--- 
a/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy
+++ 
b/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy
@@ -48,6 +48,12 @@ suite("test_index_match_phrase_edge", "p0"){
     sql """ INSERT INTO ${indexTbName1} VALUES (9, "hm_bg.jpg", "前几日 hm bg jpg 
在别处 hm bg jpg 购得"); """
     sql """ INSERT INTO ${indexTbName1} VALUES (10, "nav_tickets_off.gif", 
"习惯于生活中很多 nav tickets off gif 虚假 nav tickets off gif 美化的人来说"); """
 
+    sql """ INSERT INTO ${indexTbName1} VALUES (11, "40.135.0.0", "GET 
/images/hm_bg.jpg HTTP/1.0"); """
+    sql """ INSERT INTO ${indexTbName1} VALUES (12, "232.0.0.0", "GET 
/images/hm_bg.jpg HTTP/1.0"); """
+    sql """ INSERT INTO ${indexTbName1} VALUES (13, "26.1.0.0", "GET 
/images/hm_bg.jpg HTTP/1.0"); """
+    sql """ INSERT INTO ${indexTbName1} VALUES (14, "247.37.0.0", "GET 
/french/splash_inet.html HTTP/1.0"); """
+    sql """ INSERT INTO ${indexTbName1} VALUES (15, "247.37.0.0", "GET 
/images/hm_nbg.jpg HTTP/1.0"); """
+
     try {
         sql "sync"
 
@@ -63,6 +69,11 @@ suite("test_index_match_phrase_edge", "p0"){
         qt_sql """ select * from ${indexTbName1} where c match_phrase_edge 'ue 
off gif 家长 na'; """
         qt_sql """ select * from ${indexTbName1} where c match_phrase_edge 'if 
虚假 na'; """
 
+        qt_sql """ select count() from ${indexTbName1} where b 
match_phrase_edge '1'; """
+        qt_sql """ select count() from ${indexTbName1} where b 
match_phrase_edge '3'; """
+        qt_sql """ select count() from ${indexTbName1} where c 
match_phrase_edge 'n'; """
+        qt_sql """ select count() from ${indexTbName1} where c 
match_phrase_edge 'b'; """
+
     } finally {
         //try_sql("DROP TABLE IF EXISTS ${testTable}")
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to