This is an automated email from the ASF dual-hosted git repository. jianliangqi pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 6fdce0ddd8f [feature](inverted index) String type inverted index match function completion (#38170) 6fdce0ddd8f is described below commit 6fdce0ddd8f362f939a177815ed3702f334253d3 Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com> AuthorDate: Wed Jul 31 12:18:47 2024 +0800 [feature](inverted index) String type inverted index match function completion (#38170) 1. Inverted index of string type supports match_phrase_prefix and match_regexp. --- be/src/olap/match_predicate.cpp | 21 ++-- be/src/olap/match_predicate.h | 2 +- .../rowset/segment_v2/inverted_index_reader.cpp | 103 +++++++-------- .../olap/rowset/segment_v2/inverted_index_reader.h | 13 +- .../inverted_index_p0/test_index_complex_match.out | 19 +++ .../test_index_complex_match.groovy | 138 +++++++++++++++++++++ .../inverted_index_p0/test_no_index_match.groovy | 4 +- 7 files changed, 224 insertions(+), 76 deletions(-) diff --git a/be/src/olap/match_predicate.cpp b/be/src/olap/match_predicate.cpp index 0332e3f2e31..683e38775f3 100644 --- a/be/src/olap/match_predicate.cpp +++ b/be/src/olap/match_predicate.cpp @@ -51,9 +51,9 @@ Status MatchPredicate::evaluate(const vectorized::IndexFieldNameAndTypePair& nam if (iterator == nullptr) { return Status::OK(); } - if (_skip_evaluate(iterator)) { - return Status::Error<ErrorCode::INVERTED_INDEX_EVALUATE_SKIPPED>( - "match predicate evaluate skipped."); + if (_check_evaluate(iterator)) { + return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>( + "phrase queries require setting support_phrase = true"); } auto type = name_with_type.second; const std::string& name = name_with_type.first; @@ -122,13 +122,14 @@ InvertedIndexQueryType MatchPredicate::_to_inverted_index_query_type(MatchType m return ret; } -bool MatchPredicate::_skip_evaluate(InvertedIndexIterator* iterator) const { - if ((_match_type == MatchType::MATCH_PHRASE || _match_type == MatchType::MATCH_PHRASE_PREFIX || - _match_type == MatchType::MATCH_PHRASE_EDGE) && - iterator->get_inverted_index_reader_type() == InvertedIndexReaderType::FULLTEXT && - get_parser_phrase_support_string_from_properties(iterator->get_index_properties()) == - INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO) { - return true; +bool MatchPredicate::_check_evaluate(InvertedIndexIterator* iterator) const { + if (_match_type == MatchType::MATCH_PHRASE || _match_type == MatchType::MATCH_PHRASE_PREFIX || + _match_type == MatchType::MATCH_PHRASE_EDGE) { + if (iterator->get_inverted_index_reader_type() == InvertedIndexReaderType::FULLTEXT && + get_parser_phrase_support_string_from_properties(iterator->get_index_properties()) == + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO) { + return true; + } } return false; } diff --git a/be/src/olap/match_predicate.h b/be/src/olap/match_predicate.h index 17d8e76ac88..ad202b7b242 100644 --- a/be/src/olap/match_predicate.h +++ b/be/src/olap/match_predicate.h @@ -79,7 +79,7 @@ private: std::string info = "MatchPredicate"; return info; } - bool _skip_evaluate(InvertedIndexIterator* iterator) const; + bool _check_evaluate(InvertedIndexIterator* iterator) const; private: std::string _value; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index d89d089de3b..2ac283e6e34 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -284,6 +284,27 @@ Status InvertedIndexReader::create_index_searcher(lucene::store::Directory* dir, return Status::OK(); }; +Status InvertedIndexReader::match_index_search( + OlapReaderStatistics* stats, RuntimeState* runtime_state, InvertedIndexQueryType query_type, + const InvertedIndexQueryInfo& query_info, const FulltextIndexSearcherPtr& index_searcher, + const std::shared_ptr<roaring::Roaring>& term_match_bitmap) { + TQueryOptions queryOptions = runtime_state->query_options(); + try { + SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); + auto query = QueryFactory::create(query_type, index_searcher, queryOptions); + if (!query) { + return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>( + "query type " + query_type_to_string(query_type) + ", query is nullptr"); + } + query->add(query_info); + query->search(*term_match_bitmap); + } catch (const CLuceneError& e) { + return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>("CLuceneError occured: {}", + e.what()); + } + return Status::OK(); +} + Status FullTextIndexReader::new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, std::unique_ptr<InvertedIndexIterator>* iterator) { *iterator = InvertedIndexIterator::create_unique(stats, runtime_state, shared_from_this()); @@ -384,27 +405,6 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run } } -Status FullTextIndexReader::match_index_search( - OlapReaderStatistics* stats, RuntimeState* runtime_state, InvertedIndexQueryType query_type, - const InvertedIndexQueryInfo& query_info, const FulltextIndexSearcherPtr& index_searcher, - const std::shared_ptr<roaring::Roaring>& term_match_bitmap) { - TQueryOptions queryOptions = runtime_state->query_options(); - try { - SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); - auto query = QueryFactory::create(query_type, index_searcher, queryOptions); - if (!query) { - return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>( - "query type " + query_type_to_string(query_type) + ", query is nullptr"); - } - query->add(query_info); - query->search(*term_match_bitmap); - } catch (const CLuceneError& e) { - return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>("CLuceneError occured: {}", - e.what()); - } - return Status::OK(); -} - InvertedIndexReaderType FullTextIndexReader::type() { return InvertedIndexReaderType::FULLTEXT; } @@ -461,28 +461,25 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, std::string search_str(search_query->data, act_len); VLOG_DEBUG << "begin to query the inverted index from clucene" << ", column_name: " << column_name << ", search_str: " << search_str; - std::wstring column_name_ws = StringUtil::string_to_wstring(column_name); - std::wstring search_str_ws = StringUtil::string_to_wstring(search_str); - // unique_ptr with custom deleter - std::unique_ptr<lucene::index::Term, void (*)(lucene::index::Term*)> term { - _CLNEW lucene::index::Term(column_name_ws.c_str(), search_str_ws.c_str()), - [](lucene::index::Term* term) { _CLDECDELETE(term); }}; - std::unique_ptr<lucene::search::Query> query; auto index_file_key = _inverted_index_file_reader->get_index_file_cache_key(&_index_meta); - // try to get query bitmap result from cache and return immediately on cache hit InvertedIndexQueryCache::CacheKey cache_key {index_file_key, column_name, query_type, search_str}; auto* cache = InvertedIndexQueryCache::instance(); InvertedIndexQueryCacheHandle cache_handler; - auto cache_status = handle_query_cache(cache, cache_key, &cache_handler, stats, bit_map); if (cache_status.ok()) { return Status::OK(); } - roaring::Roaring result; + std::wstring column_name_ws = StringUtil::string_to_wstring(column_name); + + InvertedIndexQueryInfo query_info; + query_info.field_name = column_name_ws; + query_info.terms.emplace_back(search_str); + + auto result = std::make_shared<roaring::Roaring>(); FulltextIndexSearcherPtr* searcher_ptr = nullptr; InvertedIndexCacheHandle inverted_index_cache_handle; RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, stats)); @@ -494,33 +491,29 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, case InvertedIndexQueryType::MATCH_ANY_QUERY: case InvertedIndexQueryType::MATCH_ALL_QUERY: case InvertedIndexQueryType::EQUAL_QUERY: { - query = std::make_unique<lucene::search::TermQuery>(term.get()); - SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); - (*searcher_ptr)->_search(query.get(), [&result](DocRange* doc_range) { - if (doc_range->type_ == DocRangeType::kMany) { - result.addMany(doc_range->doc_many_size_, doc_range->doc_many->data()); - } else { - result.addRange(doc_range->doc_range.first, doc_range->doc_range.second); - } - }); + RETURN_IF_ERROR(match_index_search(stats, runtime_state, + InvertedIndexQueryType::MATCH_ANY_QUERY, + query_info, *searcher_ptr, result)); break; } - case InvertedIndexQueryType::MATCH_PHRASE_QUERY: { - query = std::make_unique<lucene::search::TermQuery>(term.get()); - SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); - (*searcher_ptr) - ->_search(query.get(), - [&result](const int32_t docid, const float_t /*score*/) { - // docid equal to rowid in segment - result.add(docid); - }); + case InvertedIndexQueryType::MATCH_PHRASE_QUERY: + case InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY: + case InvertedIndexQueryType::MATCH_REGEXP_QUERY: { + RETURN_IF_ERROR(match_index_search(stats, runtime_state, query_type, query_info, + *searcher_ptr, result)); break; } - case InvertedIndexQueryType::LESS_THAN_QUERY: case InvertedIndexQueryType::LESS_EQUAL_QUERY: case InvertedIndexQueryType::GREATER_THAN_QUERY: case InvertedIndexQueryType::GREATER_EQUAL_QUERY: { + std::wstring search_str_ws = StringUtil::string_to_wstring(search_str); + // unique_ptr with custom deleter + std::unique_ptr<lucene::index::Term, void (*)(lucene::index::Term*)> term { + _CLNEW lucene::index::Term(column_name_ws.c_str(), search_str_ws.c_str()), + [](lucene::index::Term* term) { _CLDECDELETE(term); }}; + std::unique_ptr<lucene::search::Query> query; + bool include_upper = query_type == InvertedIndexQueryType::LESS_EQUAL_QUERY; bool include_lower = query_type == InvertedIndexQueryType::GREATER_EQUAL_QUERY; @@ -537,7 +530,7 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, (*searcher_ptr) ->_search(query.get(), [&result](const int32_t docid, const float_t /*score*/) { - result.add(docid); + result->add(docid); }); break; } @@ -560,12 +553,10 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, } // add to cache - std::shared_ptr<roaring::Roaring> term_match_bitmap = - std::make_shared<roaring::Roaring>(result); - term_match_bitmap->runOptimize(); - cache->insert(cache_key, term_match_bitmap, &cache_handler); + result->runOptimize(); + cache->insert(cache_key, result, &cache_handler); - bit_map = term_match_bitmap; + bit_map = result; } return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h index 92df87361c8..a598ccc9ee7 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h @@ -141,6 +141,12 @@ public: InvertedIndexReaderType reader_type); protected: + Status match_index_search(OlapReaderStatistics* stats, RuntimeState* runtime_state, + InvertedIndexQueryType query_type, + const InvertedIndexQueryInfo& query_info, + const FulltextIndexSearcherPtr& index_searcher, + const std::shared_ptr<roaring::Roaring>& term_match_bitmap); + friend class InvertedIndexIterator; std::shared_ptr<InvertedIndexFileReader> _inverted_index_file_reader; TabletIndex _index_meta; @@ -177,13 +183,6 @@ public: const std::map<string, string>& properties); static void setup_analyzer_use_stopwords(std::unique_ptr<lucene::analysis::Analyzer>& analyzer, const std::map<string, string>& properties); - -private: - Status match_index_search(OlapReaderStatistics* stats, RuntimeState* runtime_state, - InvertedIndexQueryType query_type, - const InvertedIndexQueryInfo& query_info, - const FulltextIndexSearcherPtr& index_searcher, - const std::shared_ptr<roaring::Roaring>& term_match_bitmap); }; class StringTypeInvertedIndexReader : public InvertedIndexReader { diff --git a/regression-test/data/inverted_index_p0/test_index_complex_match.out b/regression-test/data/inverted_index_p0/test_index_complex_match.out new file mode 100644 index 00000000000..5c3636700dd --- /dev/null +++ b/regression-test/data/inverted_index_p0/test_index_complex_match.out @@ -0,0 +1,19 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +22 + +-- !sql -- +22 + +-- !sql -- +270 + +-- !sql -- +210 + +-- !sql -- +180 + +-- !sql -- +875 + diff --git a/regression-test/suites/inverted_index_p0/test_index_complex_match.groovy b/regression-test/suites/inverted_index_p0/test_index_complex_match.groovy new file mode 100644 index 00000000000..191e147685d --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_index_complex_match.groovy @@ -0,0 +1,138 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +suite("test_index_complex_match", "p0"){ + def indexTbName1 = "test_index_complex_match_1" + def indexTbName2 = "test_index_complex_match_2" + + sql "DROP TABLE IF EXISTS ${indexTbName1}" + sql "DROP TABLE IF EXISTS ${indexTbName2}" + + sql """ + CREATE TABLE ${indexTbName1} ( + `@timestamp` int(11) NULL COMMENT "", + `clientip` varchar(20) NULL COMMENT "", + `request` text NULL COMMENT "", + `status` int(11) NULL COMMENT "", + `size` int(11) NULL COMMENT "", + INDEX idx_1 (`clientip`) USING INVERTED COMMENT '', + INDEX idx_2 (`request`) USING INVERTED PROPERTIES("parser" = "english") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`@timestamp`) + COMMENT "OLAP" + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql """ + CREATE TABLE ${indexTbName2} ( + `@timestamp` int(11) NULL COMMENT "", + `clientip` varchar(20) NULL COMMENT "", + `request` text NULL COMMENT "", + `status` int(11) NULL COMMENT "", + `size` int(11) NULL COMMENT "", + INDEX idx_1 (`clientip`) USING INVERTED COMMENT '', + INDEX idx_2 (`request`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "false") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`@timestamp`) + COMMENT "OLAP" + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + def load_httplogs_data = {table_name, label, read_flag, format_flag, file_name, ignore_failure=false, + expected_succ_rows = -1, load_to_single_tablet = 'true' -> + + // load the json data + streamLoad { + table "${table_name}" + + // set http request header params + set 'label', label + "_" + UUID.randomUUID().toString() + set 'read_json_by_line', read_flag + set 'format', format_flag + file file_name // import json file + time 10000 // limit inflight 10s + if (expected_succ_rows >= 0) { + set 'max_filter_ratio', '1' + } + + // if declared a check callback, the default check condition will ignore. + // So you must check all condition + check { result, exception, startTime, endTime -> + if (ignore_failure && expected_succ_rows < 0) { return } + if (exception != null) { + throw exception + } + log.info("Stream load result: ${result}".toString()) + def json = parseJson(result) + assertEquals("success", json.Status.toLowerCase()) + if (expected_succ_rows >= 0) { + assertEquals(json.NumberLoadedRows, expected_succ_rows) + } else { + assertEquals(json.NumberTotalRows, json.NumberLoadedRows + json.NumberUnselectedRows) + assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0) + } + } + } + } + + try { + load_httplogs_data.call(indexTbName1, indexTbName1, 'true', 'json', 'documents-1000.json') + load_httplogs_data.call(indexTbName2, indexTbName2, 'true', 'json', 'documents-1000.json') + + sql "sync" + + qt_sql """ select count() from ${indexTbName1} where clientip match_phrase '247.37.0.0'; """ + qt_sql """ select count() from ${indexTbName1} where clientip match_phrase_prefix '247'; """ + qt_sql """ select count() from ${indexTbName1} where clientip match_regexp '2'; """ + + qt_sql """ select count() from ${indexTbName1} where request match_phrase 'hm'; """ + qt_sql """ select count() from ${indexTbName1} where request match_phrase_prefix 'na'; """ + qt_sql """ select count() from ${indexTbName1} where request match_regexp 'ag'; """ + + + try { + sql """ select count() from ${indexTbName2} where request match_phrase 'hm'; """ + } catch (Exception e) { + log.info(e.getMessage()); + assertTrue(e.getMessage().contains("phrase queries require setting support_phrase = true")) + } + + try { + sql """ select count() from ${indexTbName2} where request match_phrase_prefix 'na'; """ + } catch (Exception e) { + log.info(e.getMessage()); + assertTrue(e.getMessage().contains("phrase queries require setting support_phrase = true")) + } + + try { + sql """ select count() from ${indexTbName2} where request match_regexp 'ag'; """ + } catch (Exception e) { + log.info(e.getMessage()); + assertTrue(e.getMessage().contains("phrase queries require setting support_phrase = true")) + } + + } finally { + //try_sql("DROP TABLE IF EXISTS ${testTable}") + } +} \ No newline at end of file diff --git a/regression-test/suites/inverted_index_p0/test_no_index_match.groovy b/regression-test/suites/inverted_index_p0/test_no_index_match.groovy index cfa94b514a7..60227b01168 100644 --- a/regression-test/suites/inverted_index_p0/test_no_index_match.groovy +++ b/regression-test/suites/inverted_index_p0/test_no_index_match.groovy @@ -98,14 +98,14 @@ suite("test_no_index_match", "p0") { } try { - """ select /*+ SET_VAR(enable_match_without_inverted_index = 0) */ count() from ${testTable_unique} where (request match_phrase 'hm bg'); """ + sql """ select /*+ SET_VAR(enable_match_without_inverted_index = 0) */ count() from ${testTable_unique} where (request match_phrase 'hm bg'); """ } catch (Exception e) { log.info(e.getMessage()); assertTrue(e.getMessage().contains("match_phrase not support execute_match")) } try { - """ select /*+ SET_VAR(enable_match_without_inverted_index = 0) */ count() from ${testTable_unique} where (request match_phrase_prefix 'hm b'); """ + sql """ select /*+ SET_VAR(enable_match_without_inverted_index = 0) */ count() from ${testTable_unique} where (request match_phrase_prefix 'hm b'); """ } catch (Exception e) { log.info(e.getMessage()); assertTrue(e.getMessage().contains("match_phrase_prefix not support execute_match")) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org