This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 153c7982f3 [Optimize](invert index) Optimize multiple terms conjunction query (#23871) 153c7982f3 is described below commit 153c7982f3b8dd1ffd715e52ced775a917544594 Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com> AuthorDate: Sat Sep 9 01:52:58 2023 +0800 [Optimize](invert index) Optimize multiple terms conjunction query (#23871) --- be/src/clucene | 2 +- be/src/olap/rowset/segment_v2/column_reader.cpp | 6 +- be/src/olap/rowset/segment_v2/column_reader.h | 4 +- .../inverted_index/query/conjunction_query.cpp | 169 ++++++++++++++ .../inverted_index/query/conjunction_query.h | 63 +++++ .../olap/rowset/segment_v2/inverted_index_cache.h | 6 + .../segment_v2/inverted_index_compaction.cpp | 3 +- .../rowset/segment_v2/inverted_index_reader.cpp | 256 +++++++++++---------- .../olap/rowset/segment_v2/inverted_index_reader.h | 58 +++-- .../rowset/segment_v2/inverted_index_writer.cpp | 13 +- be/src/olap/rowset/segment_v2/segment.cpp | 4 +- be/src/olap/rowset/segment_v2/segment.h | 3 +- be/src/olap/rowset/segment_v2/segment_iterator.cpp | 2 +- .../java/org/apache/doris/qe/SessionVariable.java | 13 ++ gensrc/thrift/PaloInternalService.thrift | 2 + 15 files changed, 450 insertions(+), 154 deletions(-) diff --git a/be/src/clucene b/be/src/clucene index 9e60ec666b..fd45366505 160000 --- a/be/src/clucene +++ b/be/src/clucene @@ -1 +1 @@ -Subproject commit 9e60ec666b3ccf7dd8b7c3e331ac03ccf87d5845 +Subproject commit fd453665055c65b94892d13a93ac47180afd72bb diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index 48ac67a51f..17a3136231 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -32,6 +32,7 @@ #include "olap/column_predicate.h" #include "olap/decimal12.h" #include "olap/inverted_index_parser.h" +#include "olap/iterators.h" #include "olap/olap_common.h" #include "olap/rowset/segment_v2/binary_dict_page.h" // for BinaryDictPageDecoder #include "olap/rowset/segment_v2/binary_plain_page.h" @@ -242,11 +243,12 @@ Status ColumnReader::new_bitmap_index_iterator(BitmapIndexIterator** iterator) { } Status ColumnReader::new_inverted_index_iterator(const TabletIndex* index_meta, - OlapReaderStatistics* stats, + const StorageReadOptions& read_options, std::unique_ptr<InvertedIndexIterator>* iterator) { RETURN_IF_ERROR(_ensure_inverted_index_loaded(index_meta)); if (_inverted_index) { - RETURN_IF_ERROR(_inverted_index->new_iterator(stats, iterator)); + RETURN_IF_ERROR(_inverted_index->new_iterator(read_options.stats, + read_options.runtime_state, iterator)); } return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/column_reader.h b/be/src/olap/rowset/segment_v2/column_reader.h index 52b6092f92..41a1caf2b8 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.h +++ b/be/src/olap/rowset/segment_v2/column_reader.h @@ -53,6 +53,7 @@ class WrapperField; class AndBlockColumnPredicate; class ColumnPredicate; class TabletIndex; +class StorageReadOptions; namespace io { class FileReader; @@ -119,7 +120,8 @@ public: // Client should delete returned iterator Status new_bitmap_index_iterator(BitmapIndexIterator** iterator); - Status new_inverted_index_iterator(const TabletIndex* index_meta, OlapReaderStatistics* stats, + Status new_inverted_index_iterator(const TabletIndex* index_meta, + const StorageReadOptions& read_options, std::unique_ptr<InvertedIndexIterator>* iterator); // Seek to the first entry in the column. diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp new file mode 100644 index 0000000000..90909045cc --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp @@ -0,0 +1,169 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "conjunction_query.h" + +#include <cstdint> + +namespace doris { + +ConjunctionQuery::ConjunctionQuery(IndexReader* reader) + : _reader(reader), _index_version(reader->getIndexVersion()) {} + +ConjunctionQuery::~ConjunctionQuery() { + for (auto& term : _terms) { + if (term) { + _CLDELETE(term); + } + } + for (auto& term_doc : _term_docs) { + if (term_doc) { + _CLDELETE(term_doc); + } + } +} + +void ConjunctionQuery::add(const std::wstring& field_name, + const std::vector<std::wstring>& wterms) { + if (wterms.size() < 1) { + _CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms.size() < 1"); + } + + std::vector<TermIterator> iterators; + for (auto& wterm : wterms) { + Term* t = _CLNEW Term(field_name.c_str(), wterm.c_str()); + _terms.push_back(t); + TermDocs* term_doc = _reader->termDocs(t); + _term_docs.push_back(term_doc); + iterators.emplace_back(term_doc); + } + + std::sort(iterators.begin(), iterators.end(), [](const TermIterator& a, const TermIterator& b) { + return a.docFreq() < b.docFreq(); + }); + + if (iterators.size() == 1) { + _lead1 = iterators[0]; + } else { + _lead1 = iterators[0]; + _lead2 = iterators[1]; + for (int32_t i = 2; i < _terms.size(); i++) { + _others.push_back(iterators[i]); + } + } + + if (_index_version == IndexVersion::kV1 && iterators.size() >= 2) { + int32_t little = iterators[0].docFreq(); + int32_t big = iterators[iterators.size() - 1].docFreq(); + if (little == 0 || (big / little) > _conjunction_ratio) { + _use_skip = true; + } + } +} + +void ConjunctionQuery::search(roaring::Roaring& roaring) { + if (_lead1.isEmpty()) { + return; + } + + if (!_use_skip) { + search_by_bitmap(roaring); + return; + } + + search_by_skiplist(roaring); +} + +void ConjunctionQuery::search_by_bitmap(roaring::Roaring& roaring) { + // can get a term of all docid + auto func = [&roaring](const TermIterator& term_docs, bool first) { + roaring::Roaring result; + DocRange doc_range; + while (term_docs.readRange(&doc_range)) { + if (doc_range.type_ == DocRangeType::kMany) { + result.addMany(doc_range.doc_many_size_, doc_range.doc_many->data()); + } else { + result.addRange(doc_range.doc_range.first, doc_range.doc_range.second); + } + } + if (first) { + roaring.swap(result); + } else { + roaring &= result; + } + }; + + // fill the bitmap for the first time + func(_lead1, true); + + // the second inverted list may be empty + if (!_lead2.isEmpty()) { + func(_lead2, false); + } + + // The inverted index iterators contained in the _others array must not be empty + for (auto& other : _others) { + func(other, false); + } +} + +void ConjunctionQuery::search_by_skiplist(roaring::Roaring& roaring) { + int32_t doc = 0; + int32_t first_doc = _lead1.nextDoc(); + while ((doc = do_next(first_doc)) != INT32_MAX) { + roaring.add(doc); + } +} + +int32_t ConjunctionQuery::do_next(int32_t doc) { + while (true) { + assert(doc == _lead1.docID()); + + // the skip list is used to find the two smallest inverted lists + int32_t next2 = _lead2.advance(doc); + if (next2 != doc) { + doc = _lead1.advance(next2); + if (next2 != doc) { + continue; + } + } + + // if both lead1 and lead2 exist, use skip list to lookup other inverted indexes + bool advance_head = false; + for (auto& other : _others) { + if (other.isEmpty()) { + continue; + } + + if (other.docID() < doc) { + int32_t next = other.advance(doc); + if (next > doc) { + doc = _lead1.advance(next); + advance_head = true; + break; + } + } + } + if (advance_head) { + continue; + } + + return doc; + } +} + +} // namespace doris \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h new file mode 100644 index 0000000000..bffb12ffb2 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h @@ -0,0 +1,63 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <CLucene.h> +#include <CLucene/index/IndexReader.h> +#include <CLucene/index/IndexVersion.h> +#include <CLucene/index/Term.h> +#include <CLucene/search/query/TermIterator.h> + +#include "roaring/roaring.hh" + +CL_NS_USE(index) + +namespace doris { + +class ConjunctionQuery { +public: + ConjunctionQuery(IndexReader* reader); + ~ConjunctionQuery(); + + void set_conjunction_ratio(int32_t conjunction_ratio) { + _conjunction_ratio = conjunction_ratio; + } + + void add(const std::wstring& field_name, const std::vector<std::wstring>& wterms); + void search(roaring::Roaring& roaring); + +private: + void search_by_bitmap(roaring::Roaring& roaring); + void search_by_skiplist(roaring::Roaring& roaring); + + int32_t do_next(int32_t doc); + + IndexReader* _reader = nullptr; + IndexVersion _index_version = IndexVersion::kV0; + int32_t _conjunction_ratio = 1000; + bool _use_skip = false; + + TermIterator _lead1; + TermIterator _lead2; + std::vector<TermIterator> _others; + + std::vector<Term*> _terms; + std::vector<TermDocs*> _term_docs; +}; + +} // namespace doris \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index_cache.h b/be/src/olap/rowset/segment_v2/inverted_index_cache.h index c67e17ddda..edbfdd6b72 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_cache.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_cache.h @@ -17,7 +17,13 @@ #pragma once +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Woverloaded-virtual" + #include <CLucene.h> // IWYU pragma: keep + +#pragma GCC diagnostic pop + #include <CLucene/config/repl_wchar.h> #include <CLucene/util/Misc.h> #include <butil/macros.h> diff --git a/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp b/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp index cbc3b4399d..7f653a9359 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp @@ -32,7 +32,8 @@ Status compact_column(int32_t index_id, int src_segment_num, int dest_segment_nu std::vector<uint32_t> dest_segment_num_rows) { lucene::store::Directory* dir = DorisCompoundDirectory::getDirectory(fs, index_writer_path.c_str(), false); - auto index_writer = _CLNEW lucene::index::IndexWriter(dir, nullptr, true /* create */, + lucene::analysis::SimpleAnalyzer<char> analyzer; + auto index_writer = _CLNEW lucene::index::IndexWriter(dir, &analyzer, true /* create */, true /* closeDirOnShutdown */); // get compound directory src_index_dirs diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index a521da394c..3d5801ecb7 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -53,9 +53,12 @@ #include "io/fs/file_system.h" #include "olap/key_coder.h" #include "olap/olap_common.h" +#include "olap/rowset/segment_v2/inverted_index/query/conjunction_query.h" +#include "olap/rowset/segment_v2/inverted_index_cache.h" #include "olap/rowset/segment_v2/inverted_index_compound_directory.h" #include "olap/rowset/segment_v2/inverted_index_desc.h" #include "olap/types.h" +#include "runtime/runtime_state.h" #include "util/faststring.h" #include "util/runtime_profile.h" #include "util/time.h" @@ -101,13 +104,11 @@ std::vector<std::wstring> InvertedIndexReader::get_analyse_result( std::shared_ptr<lucene::analysis::Analyzer> analyzer; std::unique_ptr<lucene::util::Reader> reader; auto analyser_type = inverted_index_ctx->parser_type; - if (analyser_type == InvertedIndexParserType::PARSER_STANDARD) { - analyzer = std::make_shared<lucene::analysis::standard::StandardAnalyzer>(); - reader.reset( - (new lucene::util::StringReader(std::wstring(value.begin(), value.end()).c_str()))); - } else if (analyser_type == InvertedIndexParserType::PARSER_UNICODE) { + if (analyser_type == InvertedIndexParserType::PARSER_STANDARD || + analyser_type == InvertedIndexParserType::PARSER_UNICODE) { analyzer = std::make_shared<lucene::analysis::standard95::StandardAnalyzer>(); - reader.reset(new lucene::util::SStringReader<char>(value.data(), value.size(), false)); + } else if (analyser_type == InvertedIndexParserType::PARSER_ENGLISH) { + analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>(); } else if (analyser_type == InvertedIndexParserType::PARSER_CHINESE) { auto chinese_analyzer = std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false); @@ -119,17 +120,11 @@ std::vector<std::wstring> InvertedIndexReader::get_analyse_result( chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All); } analyzer = chinese_analyzer; - reader.reset(_CLNEW lucene::util::SStringReader<char>(value.c_str(), strlen(value.c_str()), - false)); - //reader.reset(new lucene::util::SimpleInputStreamReader( - // new lucene::util::AStringReader(value.c_str()), - // lucene::util::SimpleInputStreamReader::UTF8)); } else { // default - analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<TCHAR>>(); - reader.reset( - (new lucene::util::StringReader(std::wstring(value.begin(), value.end()).c_str()))); + analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<char>>(); } + reader.reset(new lucene::util::SStringReader<char>(value.data(), value.size(), false)); std::wstring field_ws = std::wstring(field_name.begin(), field_name.end()); std::unique_ptr<lucene::analysis::TokenStream> token_stream( @@ -138,17 +133,10 @@ std::vector<std::wstring> InvertedIndexReader::get_analyse_result( lucene::analysis::Token token; while (token_stream->next(&token)) { - if (analyser_type == InvertedIndexParserType::PARSER_UNICODE) { - if (token.termLength<char>() != 0) { - std::string_view term(token.termBuffer<char>(), token.termLength<char>()); - std::wstring ws_term = StringUtil::string_to_wstring(term); - analyse_result.emplace_back(ws_term); - } - } else { - if (token.termLength<TCHAR>() != 0) { - analyse_result.emplace_back( - std::wstring(token.termBuffer<TCHAR>(), token.termLength<TCHAR>())); - } + if (token.termLength<char>() != 0) { + std::string_view term(token.termBuffer<char>(), token.termLength<char>()); + std::wstring ws_term = StringUtil::string_to_wstring(term); + analyse_result.emplace_back(ws_term); } } @@ -219,15 +207,15 @@ Status InvertedIndexReader::read_null_bitmap(InvertedIndexQueryCacheHandle* cach return Status::OK(); } -Status FullTextIndexReader::new_iterator(OlapReaderStatistics* stats, +Status FullTextIndexReader::new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, std::unique_ptr<InvertedIndexIterator>* iterator) { - *iterator = InvertedIndexIterator::create_unique(stats, shared_from_this()); + *iterator = InvertedIndexIterator::create_unique(stats, runtime_state, shared_from_this()); return Status::OK(); } -Status FullTextIndexReader::query(OlapReaderStatistics* stats, const std::string& column_name, - const void* query_value, InvertedIndexQueryType query_type, - roaring::Roaring* bit_map) { +Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* runtime_state, + const std::string& column_name, const void* query_value, + InvertedIndexQueryType query_type, roaring::Roaring* bit_map) { SCOPED_RAW_TIMER(&stats->inverted_index_query_timer); std::string search_str = reinterpret_cast<const StringRef*>(query_value)->to_string(); @@ -263,83 +251,35 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, const std::string } } - std::unique_ptr<lucene::search::Query> query; - std::wstring field_ws = std::wstring(column_name.begin(), column_name.end()); - - auto index_search = [&](bool& null_bitmap_already_read, - std::shared_ptr<roaring::Roaring>& term_match_bitmap, - InvertedIndexQueryCache* cache, - InvertedIndexQueryCache::CacheKey& cache_key, - InvertedIndexQueryCacheHandle& cache_handle) { - // check index file existence - if (!indexExists(index_file_path)) { - return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>( - "inverted index path: {} not exist.", index_file_path.string()); - } - - InvertedIndexCacheHandle inverted_index_cache_handle; - InvertedIndexSearcherCache::instance()->get_index_searcher( - _fs, index_dir.c_str(), index_file_name, &inverted_index_cache_handle, stats); - auto index_searcher = inverted_index_cache_handle.get_index_searcher(); - - // try to reuse index_searcher's directory to read null_bitmap to cache - // to avoid open directory additionally for null_bitmap - if (!null_bitmap_already_read) { - InvertedIndexQueryCacheHandle null_bitmap_cache_handle; - read_null_bitmap(&null_bitmap_cache_handle, - index_searcher->getReader()->directory()); - null_bitmap_already_read = true; - } + // check index file existence + if (!indexExists(index_file_path)) { + return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>( + "inverted index path: {} not exist.", index_file_path.string()); + } - try { - if (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY || - query_type == InvertedIndexQueryType::MATCH_ALL_QUERY || - query_type == InvertedIndexQueryType::EQUAL_QUERY) { - SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); - index_searcher->_search(query.get(), [&term_match_bitmap](DocRange* docRange) { - if (docRange->type_ == DocRangeType::kMany) { - term_match_bitmap->addMany(docRange->doc_many_size_, - docRange->doc_many.data()); - } else { - term_match_bitmap->addRange(docRange->doc_range.first, - docRange->doc_range.second); - } - }); - } else { - SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); - index_searcher->_search( - query.get(), - [&term_match_bitmap](const int32_t docid, const float_t /*score*/) { - // docid equal to rowid in segment - term_match_bitmap->add(docid); - }); - } - } catch (const CLuceneError& e) { - return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>( - "CLuceneError occured: {}", e.what()); - } + InvertedIndexCacheHandle inverted_index_cache_handle; + InvertedIndexSearcherCache::instance()->get_index_searcher( + _fs, index_dir.c_str(), index_file_name, &inverted_index_cache_handle, stats); + auto index_searcher = inverted_index_cache_handle.get_index_searcher(); - { - // add to cache - term_match_bitmap->runOptimize(); - cache->insert(cache_key, term_match_bitmap, &cache_handle); - } - return Status::OK(); - }; + std::unique_ptr<lucene::search::Query> query; + std::wstring field_ws = std::wstring(column_name.begin(), column_name.end()); roaring::Roaring query_match_bitmap; bool null_bitmap_already_read = false; - if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) { + if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY || + query_type == InvertedIndexQueryType::MATCH_ALL_QUERY) { std::wstring wstr_tokens; for (auto& token : analyse_result) { wstr_tokens += token; + wstr_tokens += L" "; } auto cache = InvertedIndexQueryCache::instance(); InvertedIndexQueryCache::CacheKey cache_key; cache_key.index_path = index_file_path; cache_key.column_name = column_name; - cache_key.query_type = InvertedIndexQueryType::MATCH_PHRASE_QUERY; + cache_key.query_type = query_type; auto str_tokens = lucene_wcstoutf8string(wstr_tokens.c_str(), wstr_tokens.length()); cache_key.value.swap(str_tokens); InvertedIndexQueryCacheHandle cache_handle; @@ -352,19 +292,28 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, const std::string term_match_bitmap = std::make_shared<roaring::Roaring>(); - auto* phrase_query = new lucene::search::PhraseQuery(); - for (auto& token : analyse_result) { - auto* term = _CLNEW lucene::index::Term(field_ws.c_str(), token.c_str()); - phrase_query->add(term); - _CLDECDELETE(term); + Status res = Status::OK(); + if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) { + auto* phrase_query = new lucene::search::PhraseQuery(); + for (auto& token : analyse_result) { + auto* term = _CLNEW lucene::index::Term(field_ws.c_str(), token.c_str()); + phrase_query->add(term); + _CLDECDELETE(term); + } + query.reset(phrase_query); + res = normal_index_search(stats, query_type, index_searcher, + null_bitmap_already_read, query, term_match_bitmap); + } else { + res = match_all_index_search(stats, runtime_state, field_ws, analyse_result, + index_searcher, term_match_bitmap); } - query.reset(phrase_query); - - Status res = index_search(null_bitmap_already_read, term_match_bitmap, cache, - cache_key, cache_handle); if (!res.ok()) { return res; } + + // add to cache + term_match_bitmap->runOptimize(); + cache->insert(cache_key, term_match_bitmap, &cache_handle); } query_match_bitmap = *term_match_bitmap; } else { @@ -394,11 +343,16 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, const std::string [](lucene::index::Term* term) { _CLDECDELETE(term); }}; query.reset(new lucene::search::TermQuery(term.get())); - Status res = index_search(null_bitmap_already_read, term_match_bitmap, cache, - cache_key, cache_handle); + Status res = + normal_index_search(stats, query_type, index_searcher, + null_bitmap_already_read, query, term_match_bitmap); if (!res.ok()) { return res; } + + // add to cache + term_match_bitmap->runOptimize(); + cache->insert(cache_key, term_match_bitmap, &cache_handle); } // add to query_match_bitmap @@ -415,8 +369,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, const std::string query_match_bitmap |= *term_match_bitmap; break; } - case InvertedIndexQueryType::EQUAL_QUERY: - case InvertedIndexQueryType::MATCH_ALL_QUERY: { + case InvertedIndexQueryType::EQUAL_QUERY: { SCOPED_RAW_TIMER(&stats->inverted_index_query_bitmap_op_timer); query_match_bitmap &= *term_match_bitmap; break; @@ -437,17 +390,83 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, const std::string } } +Status FullTextIndexReader::normal_index_search( + OlapReaderStatistics* stats, InvertedIndexQueryType query_type, + const IndexSearcherPtr& index_searcher, bool& null_bitmap_already_read, + const std::unique_ptr<lucene::search::Query>& query, + const std::shared_ptr<roaring::Roaring>& term_match_bitmap) { + check_null_bitmap(index_searcher, null_bitmap_already_read); + + try { + SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); + if (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY || + query_type == InvertedIndexQueryType::EQUAL_QUERY) { + index_searcher->_search(query.get(), [&term_match_bitmap](DocRange* doc_range) { + if (doc_range->type_ == DocRangeType::kMany) { + term_match_bitmap->addMany(doc_range->doc_many_size_, + doc_range->doc_many->data()); + } else { + term_match_bitmap->addRange(doc_range->doc_range.first, + doc_range->doc_range.second); + } + }); + } else { + index_searcher->_search(query.get(), [&term_match_bitmap](const int32_t docid, + const float_t /*score*/) { + // docid equal to rowid in segment + term_match_bitmap->add(docid); + }); + } + } catch (const CLuceneError& e) { + return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>("CLuceneError occured: {}", + e.what()); + } + + return Status::OK(); +} + +Status FullTextIndexReader::match_all_index_search( + OlapReaderStatistics* stats, RuntimeState* runtime_state, const std::wstring& field_ws, + const std::vector<std::wstring>& analyse_result, const IndexSearcherPtr& index_searcher, + const std::shared_ptr<roaring::Roaring>& term_match_bitmap) { + TQueryOptions queryOptions = runtime_state->query_options(); + try { + SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); + ConjunctionQuery query(index_searcher->getReader()); + query.set_conjunction_ratio(queryOptions.inverted_index_conjunction_opt_threshold); + query.add(field_ws, analyse_result); + query.search(*term_match_bitmap); + } catch (const CLuceneError& e) { + return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>("CLuceneError occured: {}", + e.what()); + } + return Status::OK(); +} + +void FullTextIndexReader::check_null_bitmap(const IndexSearcherPtr& index_searcher, + bool& null_bitmap_already_read) { + // try to reuse index_searcher's directory to read null_bitmap to cache + // to avoid open directory additionally for null_bitmap + if (!null_bitmap_already_read) { + InvertedIndexQueryCacheHandle null_bitmap_cache_handle; + read_null_bitmap(&null_bitmap_cache_handle, index_searcher->getReader()->directory()); + null_bitmap_already_read = true; + } +} + InvertedIndexReaderType FullTextIndexReader::type() { return InvertedIndexReaderType::FULLTEXT; } Status StringTypeInvertedIndexReader::new_iterator( - OlapReaderStatistics* stats, std::unique_ptr<InvertedIndexIterator>* iterator) { - *iterator = InvertedIndexIterator::create_unique(stats, shared_from_this()); + OlapReaderStatistics* stats, RuntimeState* runtime_state, + std::unique_ptr<InvertedIndexIterator>* iterator) { + *iterator = InvertedIndexIterator::create_unique(stats, runtime_state, shared_from_this()); return Status::OK(); } Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, + RuntimeState* runtime_state, const std::string& column_name, const void* query_value, InvertedIndexQueryType query_type, roaring::Roaring* bit_map) { @@ -538,11 +557,11 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, query_type == InvertedIndexQueryType::MATCH_ALL_QUERY || query_type == InvertedIndexQueryType::EQUAL_QUERY) { SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); - index_searcher->_search(query.get(), [&result](DocRange* docRange) { - if (docRange->type_ == DocRangeType::kMany) { - result.addMany(docRange->doc_many_size_, docRange->doc_many.data()); + index_searcher->_search(query.get(), [&result](DocRange* doc_range) { + if (doc_range->type_ == DocRangeType::kMany) { + result.addMany(doc_range->doc_many_size_, doc_range->doc_many->data()); } else { - result.addRange(docRange->doc_range.first, docRange->doc_range.second); + result.addRange(doc_range->doc_range.first, doc_range->doc_range.second); } }); } else { @@ -600,9 +619,9 @@ BkdIndexReader::BkdIndexReader(io::FileSystemSPtr fs, const std::string& path, config::inverted_index_read_buffer_size); } -Status BkdIndexReader::new_iterator(OlapReaderStatistics* stats, +Status BkdIndexReader::new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, std::unique_ptr<InvertedIndexIterator>* iterator) { - *iterator = InvertedIndexIterator::create_unique(stats, shared_from_this()); + *iterator = InvertedIndexIterator::create_unique(stats, runtime_state, shared_from_this()); return Status::OK(); } @@ -692,9 +711,9 @@ Status BkdIndexReader::handle_cache(InvertedIndexQueryCache* cache, } } -Status BkdIndexReader::query(OlapReaderStatistics* stats, const std::string& column_name, - const void* query_value, InvertedIndexQueryType query_type, - roaring::Roaring* bit_map) { +Status BkdIndexReader::query(OlapReaderStatistics* stats, RuntimeState* runtime_state, + const std::string& column_name, const void* query_value, + InvertedIndexQueryType query_type, roaring::Roaring* bit_map) { SCOPED_RAW_TIMER(&stats->inverted_index_query_timer); auto visitor = std::make_unique<InvertedIndexVisitor>(bit_map, query_type); @@ -968,7 +987,8 @@ Status InvertedIndexIterator::read_from_inverted_index(const std::string& column } } - RETURN_IF_ERROR(_reader->query(_stats, column_name, query_value, query_type, bit_map)); + RETURN_IF_ERROR( + _reader->query(_stats, _runtime_state, column_name, query_value, query_type, bit_map)); return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h index 5f7b318825..30269f3b19 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h @@ -52,6 +52,7 @@ namespace doris { class KeyCoder; class TypeInfo; struct OlapReaderStatistics; +class RuntimeState; namespace segment_v2 { @@ -65,6 +66,8 @@ enum class InvertedIndexReaderType { BKD = 2, }; +using IndexSearcherPtr = std::shared_ptr<lucene::search::IndexSearcher>; + class InvertedIndexReader : public std::enable_shared_from_this<InvertedIndexReader> { public: explicit InvertedIndexReader(io::FileSystemSPtr fs, const std::string& path, @@ -73,11 +76,11 @@ public: virtual ~InvertedIndexReader() = default; // create a new column iterator. Client should delete returned iterator - virtual Status new_iterator(OlapReaderStatistics* stats, + virtual Status new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, std::unique_ptr<InvertedIndexIterator>* iterator) = 0; - virtual Status query(OlapReaderStatistics* stats, const std::string& column_name, - const void* query_value, InvertedIndexQueryType query_type, - roaring::Roaring* bit_map) = 0; + virtual Status query(OlapReaderStatistics* stats, RuntimeState* runtime_state, + const std::string& column_name, const void* query_value, + InvertedIndexQueryType query_type, roaring::Roaring* bit_map) = 0; virtual Status try_query(OlapReaderStatistics* stats, const std::string& column_name, const void* query_value, InvertedIndexQueryType query_type, uint32_t* count) = 0; @@ -118,11 +121,11 @@ public: : InvertedIndexReader(fs, path, index_meta) {} ~FullTextIndexReader() override = default; - Status new_iterator(OlapReaderStatistics* stats, + Status new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, std::unique_ptr<InvertedIndexIterator>* iterator) override; - Status query(OlapReaderStatistics* stats, const std::string& column_name, - const void* query_value, InvertedIndexQueryType query_type, - roaring::Roaring* bit_map) override; + Status query(OlapReaderStatistics* stats, RuntimeState* runtime_state, + const std::string& column_name, const void* query_value, + InvertedIndexQueryType query_type, roaring::Roaring* bit_map) override; Status try_query(OlapReaderStatistics* stats, const std::string& column_name, const void* query_value, InvertedIndexQueryType query_type, uint32_t* count) override { @@ -131,6 +134,21 @@ public: } InvertedIndexReaderType type() override; + +private: + Status normal_index_search(OlapReaderStatistics* stats, InvertedIndexQueryType query_type, + const IndexSearcherPtr& index_searcher, + bool& null_bitmap_already_read, + const std::unique_ptr<lucene::search::Query>& query, + const std::shared_ptr<roaring::Roaring>& term_match_bitmap); + + Status match_all_index_search(OlapReaderStatistics* stats, RuntimeState* runtime_state, + const std::wstring& field_ws, + const std::vector<std::wstring>& analyse_result, + const IndexSearcherPtr& index_searcher, + const std::shared_ptr<roaring::Roaring>& term_match_bitmap); + + void check_null_bitmap(const IndexSearcherPtr& index_searcher, bool& null_bitmap_already_read); }; class StringTypeInvertedIndexReader : public InvertedIndexReader { @@ -142,11 +160,11 @@ public: : InvertedIndexReader(fs, path, index_meta) {} ~StringTypeInvertedIndexReader() override = default; - Status new_iterator(OlapReaderStatistics* stats, + Status new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, std::unique_ptr<InvertedIndexIterator>* iterator) override; - Status query(OlapReaderStatistics* stats, const std::string& column_name, - const void* query_value, InvertedIndexQueryType query_type, - roaring::Roaring* bit_map) override; + Status query(OlapReaderStatistics* stats, RuntimeState* runtime_state, + const std::string& column_name, const void* query_value, + InvertedIndexQueryType query_type, roaring::Roaring* bit_map) override; Status try_query(OlapReaderStatistics* stats, const std::string& column_name, const void* query_value, InvertedIndexQueryType query_type, uint32_t* count) override { @@ -214,12 +232,12 @@ public: } } - Status new_iterator(OlapReaderStatistics* stats, + Status new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, std::unique_ptr<InvertedIndexIterator>* iterator) override; - Status query(OlapReaderStatistics* stats, const std::string& column_name, - const void* query_value, InvertedIndexQueryType query_type, - roaring::Roaring* bit_map) override; + Status query(OlapReaderStatistics* stats, RuntimeState* runtime_state, + const std::string& column_name, const void* query_value, + InvertedIndexQueryType query_type, roaring::Roaring* bit_map) override; Status try_query(OlapReaderStatistics* stats, const std::string& column_name, const void* query_value, InvertedIndexQueryType query_type, uint32_t* count) override; @@ -246,8 +264,9 @@ class InvertedIndexIterator { ENABLE_FACTORY_CREATOR(InvertedIndexIterator); public: - InvertedIndexIterator(OlapReaderStatistics* stats, std::shared_ptr<InvertedIndexReader> reader) - : _stats(stats), _reader(reader) {} + InvertedIndexIterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, + std::shared_ptr<InvertedIndexReader> reader) + : _stats(stats), _runtime_state(runtime_state), _reader(reader) {} Status read_from_inverted_index(const std::string& column_name, const void* query_value, InvertedIndexQueryType query_type, uint32_t segment_num_rows, @@ -264,7 +283,8 @@ public: [[nodiscard]] const std::map<string, string>& get_index_properties() const; private: - OlapReaderStatistics* _stats; + OlapReaderStatistics* _stats = nullptr; + RuntimeState* _runtime_state = nullptr; std::shared_ptr<InvertedIndexReader> _reader; }; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 2194d349a9..8cd65bbab6 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -155,9 +155,8 @@ public: _doc = std::make_unique<lucene::document::Document>(); _dir.reset(DorisCompoundDirectory::getDirectory(_fs, index_path.c_str(), true)); - if (_parser_type == InvertedIndexParserType::PARSER_STANDARD) { - _analyzer = std::make_unique<lucene::analysis::standard::StandardAnalyzer>(); - } else if (_parser_type == InvertedIndexParserType::PARSER_UNICODE) { + if (_parser_type == InvertedIndexParserType::PARSER_STANDARD || + _parser_type == InvertedIndexParserType::PARSER_UNICODE) { _analyzer = std::make_unique<lucene::analysis::standard95::StandardAnalyzer>(); } else if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH) { _analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>(); @@ -234,12 +233,10 @@ public: void new_fulltext_field(const char* field_value_data, size_t field_value_size) { if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH || - _parser_type == InvertedIndexParserType::PARSER_CHINESE) { + _parser_type == InvertedIndexParserType::PARSER_CHINESE || + _parser_type == InvertedIndexParserType::PARSER_UNICODE || + _parser_type == InvertedIndexParserType::PARSER_STANDARD) { new_char_token_stream(field_value_data, field_value_size, _field); - } else if (_parser_type == InvertedIndexParserType::PARSER_UNICODE) { - new_char_token_stream(field_value_data, field_value_size, _field); - } else if (_parser_type == InvertedIndexParserType::PARSER_STANDARD) { - new_field_value(field_value_data, field_value_size, _field); } else { new_field_char_value(field_value_data, field_value_size, _field); } diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index 21ceea7a93..7eb660b3c2 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -328,12 +328,12 @@ Status Segment::new_bitmap_index_iterator(const TabletColumn& tablet_column, Status Segment::new_inverted_index_iterator(const TabletColumn& tablet_column, const TabletIndex* index_meta, - OlapReaderStatistics* stats, + const StorageReadOptions& read_options, std::unique_ptr<InvertedIndexIterator>* iter) { auto col_unique_id = tablet_column.unique_id(); if (_column_readers.count(col_unique_id) > 0 && index_meta) { RETURN_IF_ERROR(_column_readers.at(col_unique_id) - ->new_inverted_index_iterator(index_meta, stats, iter)); + ->new_inverted_index_iterator(index_meta, read_options, iter)); return Status::OK(); } return Status::OK(); diff --git a/be/src/olap/rowset/segment_v2/segment.h b/be/src/olap/rowset/segment_v2/segment.h index 65f3245e6b..382ae69a7b 100644 --- a/be/src/olap/rowset/segment_v2/segment.h +++ b/be/src/olap/rowset/segment_v2/segment.h @@ -90,7 +90,8 @@ public: std::unique_ptr<BitmapIndexIterator>* iter); Status new_inverted_index_iterator(const TabletColumn& tablet_column, - const TabletIndex* index_meta, OlapReaderStatistics* stats, + const TabletIndex* index_meta, + const StorageReadOptions& read_options, std::unique_ptr<InvertedIndexIterator>* iter); const ShortKeyIndexDecoder* get_short_key_index() const { diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index bde5a533ae..203117a22a 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -1076,7 +1076,7 @@ Status SegmentIterator::_init_inverted_index_iterators() { if (_inverted_index_iterators[cid] == nullptr) { RETURN_IF_ERROR(_segment->new_inverted_index_iterator( _opts.tablet_schema->column(cid), _opts.tablet_schema->get_inverted_index(cid), - _opts.stats, &_inverted_index_iterators[cid])); + _opts, &_inverted_index_iterators[cid])); } } return Status::OK(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 2d9303beb9..6dd2ac6811 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -395,6 +395,8 @@ public class SessionVariable implements Serializable, Writable { public static final String ENABLE_MEMTABLE_ON_SINK_NODE = "enable_memtable_on_sink_node"; + public static final String INVERTED_INDEX_CONJUNCTION_OPT_THRESHOLD = "inverted_index_conjunction_opt_threshold"; + public static final List<String> DEBUG_VARIABLES = ImmutableList.of( SKIP_DELETE_PREDICATE, SKIP_DELETE_BITMAP, @@ -1148,6 +1150,15 @@ public class SessionVariable implements Serializable, Writable { @VariableMgr.VarAttr(name = ENABLE_INSERT_GROUP_COMMIT) public boolean enableInsertGroupCommit = false; + @VariableMgr.VarAttr(name = INVERTED_INDEX_CONJUNCTION_OPT_THRESHOLD, + description = {"在match_all中求取多个倒排索引的交集时,如果最大的倒排索引中的总数是最小倒排索引中的总数的整数倍," + + "则使用跳表来优化交集操作。", + "When intersecting multiple inverted indexes in match_all," + + " if the maximum total count of the largest inverted index" + + " is a multiple of the minimum total count of the smallest inverted index," + + " use a skiplist to optimize the intersection."}) + public int invertedIndexConjunctionOptThreshold = 1000; + // If this fe is in fuzzy mode, then will use initFuzzyModeVariables to generate some variables, // not the default value set in the code. public void initFuzzyModeVariables() { @@ -2243,6 +2254,8 @@ public class SessionVariable implements Serializable, Writable { tResult.setTruncateCharOrVarcharColumns(truncateCharOrVarcharColumns); tResult.setEnableMemtableOnSinkNode(enableMemtableOnSinkNode); + tResult.setInvertedIndexConjunctionOptThreshold(invertedIndexConjunctionOptThreshold); + return tResult; } diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift index 9c75e46f44..02a314f8fa 100644 --- a/gensrc/thrift/PaloInternalService.thrift +++ b/gensrc/thrift/PaloInternalService.thrift @@ -240,6 +240,8 @@ struct TQueryOptions { // A tag used to distinguish fe start epoch. 82: optional i64 fe_process_uuid = 0; + + 83: optional i32 inverted_index_conjunction_opt_threshold = 1000; } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org