This is an automated email from the ASF dual-hosted git repository. dataroaring pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/doris.git
commit ecbfa4ec7d87643c180d101534a5a35394f71eb3 Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com> AuthorDate: Mon Aug 5 11:16:02 2024 +0800 [fix](inverted index) fix match_phrase_edge query result error (#38327) 1. The result of match_phrase_edge query for a single word is incorrect --- .../inverted_index/query/phrase_edge_query.cpp | 26 +++++++++++++--------- .../inverted_index/query/phrase_edge_query.h | 1 + .../test_index_match_phrase_edge.out | 12 ++++++++++ .../test_index_match_phrase_edge.groovy | 11 +++++++++ 4 files changed, 40 insertions(+), 10 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp index 428dc05e6f6..ec1b5bdd9e4 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp @@ -31,7 +31,9 @@ namespace doris::segment_v2 { PhraseEdgeQuery::PhraseEdgeQuery(const std::shared_ptr<lucene::search::IndexSearcher>& searcher, const TQueryOptions& query_options) - : _searcher(searcher), _query(std::make_unique<CL_NS(search)::MultiPhraseQuery>()) {} + : _searcher(searcher), + _query(std::make_unique<CL_NS(search)::MultiPhraseQuery>()), + _max_expansions(query_options.inverted_index_max_expansions) {} void PhraseEdgeQuery::add(const std::wstring& field_name, const std::vector<std::string>& terms) { if (terms.empty()) { @@ -50,9 +52,9 @@ void PhraseEdgeQuery::search(roaring::Roaring& roaring) { } void PhraseEdgeQuery::search_one_term(roaring::Roaring& roaring) { - size_t count = 0; + bool first = true; std::wstring sub_term = StringUtil::string_to_wstring(_terms[0]); - find_words([this, &count, &sub_term, &roaring](Term* term) { + find_words([this, &first, &sub_term, &roaring](Term* term) { std::wstring_view ws_term(term->text(), term->textLength()); if (ws_term.find(sub_term) == std::wstring::npos) { return; @@ -70,12 +72,12 
@@ void PhraseEdgeQuery::search_one_term(roaring::Roaring& roaring) { } _CLDELETE(term_doc); - if (count) { + if (!first) { roaring.swap(result); + first = false; } else { roaring |= result; } - count++; }); } @@ -86,15 +88,19 @@ void PhraseEdgeQuery::search_multi_term(roaring::Roaring& roaring) { std::vector<CL_NS(index)::Term*> suffix_terms; std::vector<CL_NS(index)::Term*> prefix_terms; - find_words([&suffix_term, &suffix_terms, &prefix_term, &prefix_terms](Term* term) { + find_words([this, &suffix_term, &suffix_terms, &prefix_term, &prefix_terms](Term* term) { std::wstring_view ws_term(term->text(), term->textLength()); - if (ws_term.ends_with(suffix_term)) { - suffix_terms.push_back(_CL_POINTER(term)); + if (_max_expansions == 0 || suffix_terms.size() < _max_expansions) { + if (ws_term.ends_with(suffix_term)) { + suffix_terms.push_back(_CL_POINTER(term)); + } } - if (ws_term.starts_with(prefix_term)) { - prefix_terms.push_back(_CL_POINTER(term)); + if (_max_expansions == 0 || prefix_terms.size() < _max_expansions) { + if (ws_term.starts_with(prefix_term)) { + prefix_terms.push_back(_CL_POINTER(term)); + } } }); diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h index 823f46285b1..5daf382e0d0 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h @@ -52,6 +52,7 @@ private: std::wstring _field_name; std::vector<std::string> _terms; std::unique_ptr<CL_NS(search)::MultiPhraseQuery> _query; + int32_t _max_expansions = 50; }; } // namespace doris::segment_v2 \ No newline at end of file diff --git a/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out b/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out index e650f9b39b2..8accc202576 100644 --- a/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out 
+++ b/regression-test/data/inverted_index_p0/test_index_match_phrase_edge.out @@ -29,3 +29,15 @@ -- !sql -- 10 nav_tickets_off.gif 习惯于生活中很多 nav tickets off gif 虚假 nav tickets off gif 美化的人来说 +-- !sql -- +2 + +-- !sql -- +4 + +-- !sql -- +11 + +-- !sql -- +6 + diff --git a/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy b/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy index e05f6bb1ec9..8d4ab3d2320 100644 --- a/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy +++ b/regression-test/suites/inverted_index_p0/test_index_match_phrase_edge.groovy @@ -48,6 +48,12 @@ suite("test_index_match_phrase_edge", "p0"){ sql """ INSERT INTO ${indexTbName1} VALUES (9, "hm_bg.jpg", "前几日 hm bg jpg 在别处 hm bg jpg 购得"); """ sql """ INSERT INTO ${indexTbName1} VALUES (10, "nav_tickets_off.gif", "习惯于生活中很多 nav tickets off gif 虚假 nav tickets off gif 美化的人来说"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (11, "40.135.0.0", "GET /images/hm_bg.jpg HTTP/1.0"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (12, "232.0.0.0", "GET /images/hm_bg.jpg HTTP/1.0"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (13, "26.1.0.0", "GET /images/hm_bg.jpg HTTP/1.0"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (14, "247.37.0.0", "GET /french/splash_inet.html HTTP/1.0"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (15, "247.37.0.0", "GET /images/hm_nbg.jpg HTTP/1.0"); """ + try { sql "sync" @@ -63,6 +69,11 @@ suite("test_index_match_phrase_edge", "p0"){ qt_sql """ select * from ${indexTbName1} where c match_phrase_edge 'ue off gif 家长 na'; """ qt_sql """ select * from ${indexTbName1} where c match_phrase_edge 'if 虚假 na'; """ + qt_sql """ select count() from ${indexTbName1} where b match_phrase_edge '1'; """ + qt_sql """ select count() from ${indexTbName1} where b match_phrase_edge '3'; """ + qt_sql """ select count() from ${indexTbName1} where c match_phrase_edge 'n'; """ + qt_sql """ select count() from 
${indexTbName1} where c match_phrase_edge 'b'; """ + } finally { //try_sql("DROP TABLE IF EXISTS ${testTable}") } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org