This is an automated email from the ASF dual-hosted git repository. airborne pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new af63014eda4 [fix](inverted index) Fix for Inaccurate match_phrase_prefix Cache in Query Processing (#46310) af63014eda4 is described below commit af63014eda4e06ea59479f4cf8650657f2f416cd Author: zzzxl <yangs...@selectdb.com> AuthorDate: Mon Jan 6 16:08:46 2025 +0800 [fix](inverted index) Fix for Inaccurate match_phrase_prefix Cache in Query Processing (#46310) Problem Summary: 1. different values of inverted_index_max_expansions require separate caches. --- .../rowset/segment_v2/inverted_index_reader.cpp | 46 ++++++++++------------ .../test_index_match_phrase_prefix_1.out | 6 +++ .../inverted_index_p0/test_index_match_regexp.out | 6 +++ .../test_index_match_phrase_prefix_1.groovy | 9 +++++ .../test_index_match_regexp.groovy | 7 ++++ 5 files changed, 49 insertions(+), 25 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index 4fe45283cd2..fced65724e5 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -259,7 +259,7 @@ Status InvertedIndexReader::match_index_search( InvertedIndexQueryType query_type, const InvertedIndexQueryInfo& query_info, const FulltextIndexSearcherPtr& index_searcher, const std::shared_ptr<roaring::Roaring>& term_match_bitmap) { - TQueryOptions queryOptions = runtime_state->query_options(); + const auto& queryOptions = runtime_state->query_options(); try { SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); auto query = QueryFactory::create(query_type, index_searcher, queryOptions, io_ctx); @@ -294,24 +294,23 @@ Status FullTextIndexReader::query(const io::IOContext* io_ctx, OlapReaderStatist VLOG_DEBUG << column_name << " begin to search the fulltext index from clucene, query_str [" << search_str << "]"; + const auto& queryOptions = runtime_state->query_options(); try { InvertedIndexQueryInfo query_info; InvertedIndexQueryCache::CacheKey cache_key; auto index_file_key = _inverted_index_file_reader->get_index_file_cache_key(&_index_meta); + // terms if (query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) { - cache_key = {index_file_key, column_name, query_type, search_str}; query_info.terms.emplace_back(search_str); + } else if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) { + PhraseQuery::parser_info(search_str, column_name, query_type, _index_meta.properties(), + query_info, queryOptions.enable_phrase_query_sequential_opt); } else { - if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) { - PhraseQuery::parser_info( - search_str, column_name, query_type, _index_meta.properties(), query_info, - runtime_state->query_options().enable_phrase_query_sequential_opt); - } else { - query_info.terms = inverted_index::InvertedIndexAnalyzer::get_analyse_result( - search_str, column_name, query_type, _index_meta.properties()); - } + query_info.terms = inverted_index::InvertedIndexAnalyzer::get_analyse_result( + search_str, column_name, query_type, _index_meta.properties()); } + if (query_info.terms.empty()) { auto msg = fmt::format( "token parser result is empty for query, " @@ -325,22 +324,20 @@ Status FullTextIndexReader::query(const io::IOContext* io_ctx, OlapReaderStatist } } - std::unique_ptr<lucene::search::Query> query; + // field_name query_info.field_name = StringUtil::string_to_wstring(column_name); - if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY || - query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY || - query_type == InvertedIndexQueryType::MATCH_PHRASE_EDGE_QUERY || - query_type == InvertedIndexQueryType::MATCH_ALL_QUERY || - query_type == InvertedIndexQueryType::EQUAL_QUERY || - query_type == InvertedIndexQueryType::MATCH_ANY_QUERY) { - std::string str_tokens = join(query_info.terms, " "); - if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) { - str_tokens += " " + std::to_string(query_info.slop); - str_tokens += " " + std::to_string(query_info.ordered); - } - cache_key = {index_file_key, column_name, query_type, str_tokens}; + // cache_key + std::string str_tokens = join(query_info.terms, " "); + if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) { + str_tokens += " " + std::to_string(query_info.slop); + str_tokens += " " + std::to_string(query_info.ordered); + } else if (query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY || + query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) { + str_tokens += " " + std::to_string(queryOptions.inverted_index_max_expansions); } + cache_key = {index_file_key, column_name, query_type, std::move(str_tokens)}; + auto* cache = InvertedIndexQueryCache::instance(); InvertedIndexQueryCacheHandle cache_handler; @@ -350,13 +347,12 @@ Status FullTextIndexReader::query(const io::IOContext* io_ctx, OlapReaderStatist if (cache_status.ok()) { return Status::OK(); } - FulltextIndexSearcherPtr* searcher_ptr = nullptr; InvertedIndexCacheHandle inverted_index_cache_handle; RETURN_IF_ERROR( handle_searcher_cache(runtime_state, &inverted_index_cache_handle, io_ctx, stats)); auto searcher_variant = inverted_index_cache_handle.get_index_searcher(); - searcher_ptr = std::get_if<FulltextIndexSearcherPtr>(&searcher_variant); + auto* searcher_ptr = std::get_if<FulltextIndexSearcherPtr>(&searcher_variant); if (searcher_ptr != nullptr) { term_match_bitmap = std::make_shared<roaring::Roaring>(); RETURN_IF_ERROR(match_index_search(io_ctx, stats, runtime_state, query_type, query_info, diff --git a/regression-test/data/inverted_index_p0/test_index_match_phrase_prefix_1.out b/regression-test/data/inverted_index_p0/test_index_match_phrase_prefix_1.out index e7e934f394e..7ac0d5f7ec2 100644 --- a/regression-test/data/inverted_index_p0/test_index_match_phrase_prefix_1.out +++ b/regression-test/data/inverted_index_p0/test_index_match_phrase_prefix_1.out @@ -11,3 +11,9 @@ -- !sql -- 6 +-- !sql -- +0 + +-- !sql -- +1 + diff --git a/regression-test/data/inverted_index_p0/test_index_match_regexp.out b/regression-test/data/inverted_index_p0/test_index_match_regexp.out index fb5d23ad266..2c06da4147c 100644 --- a/regression-test/data/inverted_index_p0/test_index_match_regexp.out +++ b/regression-test/data/inverted_index_p0/test_index_match_regexp.out @@ -20,3 +20,9 @@ -- !sql -- 0 +-- !sql -- +4 + +-- !sql -- +377 + diff --git a/regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix_1.groovy b/regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix_1.groovy index f42462f12a6..5ee38c9e403 100644 --- a/regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix_1.groovy +++ b/regression-test/suites/inverted_index_p0/test_index_match_phrase_prefix_1.groovy @@ -47,6 +47,8 @@ suite("test_index_match_phrase_prefix_1", "nonConcurrent"){ sql """ INSERT INTO ${indexTbName1} VALUES (6, "O1704361998540E2Cemx9S 123456789", "O1704361998540E2Cemx9S 123456789", "O1704361998540E2Cemx9S 123456789"); """ sql """ INSERT INTO ${indexTbName1} VALUES (7, "O1704361998540E2Cemx9S*123456789", "O1704361998540E2Cemx9S*123456789", "O1704361998540E2Cemx9S*123456789"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (1, "", "s1", ""), (2, "", "s2", ""), (3, "", "s3", ""), (4, "", "s4", ""), (5, "", "tv s5", ""); """ + try { sql "sync" sql """ set enable_common_expr_pushdown = true; """ @@ -58,7 +60,14 @@ suite("test_index_match_phrase_prefix_1", "nonConcurrent"){ qt_sql """ select count() from ${indexTbName1} where c match_phrase_prefix 'O1704361998540E2Cemx9S=123456789'; """ qt_sql """ select count() from ${indexTbName1} where d match_phrase_prefix 'O1704361998540E2Cemx9S=123456789'; """ + sql """ set inverted_index_max_expansions = 3; """ + qt_sql """ select count() from ${indexTbName1} where c match_phrase_prefix 'tv s'; """ + + sql """ set inverted_index_max_expansions = 5; """ + qt_sql """ select count() from ${indexTbName1} where c match_phrase_prefix 'tv s'; """ + } finally { + sql """ set inverted_index_max_expansions = 50; """ GetDebugPoint().disableDebugPointForAllBEs("VMatchPredicate.execute") } } \ No newline at end of file diff --git a/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy b/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy index 49f0f563989..1f508306dbb 100644 --- a/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy +++ b/regression-test/suites/inverted_index_p0/test_index_match_regexp.groovy @@ -90,7 +90,14 @@ suite("test_index_match_regexp", "nonConcurrent"){ qt_sql """ select count() from test_index_match_regexp where request match_regexp '.*tickets.*'; """ qt_sql """ select count() from test_index_match_regexp where request match_regexp 'nonexistence'; """ + sql """ set inverted_index_max_expansions = 1; """ + qt_sql """ select count() from test_index_match_regexp where request match_regexp 'b'; """ + + sql """ set inverted_index_max_expansions = 50; """ + qt_sql """ select count() from test_index_match_regexp where request match_regexp 'b'; """ + } finally { + sql """ set inverted_index_max_expansions = 50; """ GetDebugPoint().disableDebugPointForAllBEs("VMatchPredicate.execute") } } \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org