This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new e6a5d3375e [Feature-WIP](inverted index) add chinese analyzer for inverted index reader (#15998) e6a5d3375e is described below commit e6a5d3375e3587c69fafd53ef3f62416959e3f36 Author: YueW <45946325+tany...@users.noreply.github.com> AuthorDate: Tue Jan 17 20:20:40 2023 +0800 [Feature-WIP](inverted index) add chinese analyzer for inverted index reader (#15998) add chinese analyzer for inverted index reader dependency pr: #14211 #15807 #15823 --- .../rowset/segment_v2/inverted_index_reader.cpp | 44 +++++++++++++--------- .../olap/rowset/segment_v2/inverted_index_reader.h | 10 ++--- 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index a11c076df2..5671f268c9 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -17,6 +17,7 @@ #include "olap/rowset/segment_v2/inverted_index_reader.h" +#include <CLucene/analysis/LanguageBasedAnalyzer.h> #include <CLucene/search/BooleanQuery.h> #include <CLucene/search/PhraseQuery.h> @@ -47,29 +48,41 @@ bool InvertedIndexReader::indexExists(io::Path& index_file_path) { return exists; } -std::vector<std::string> FullTextIndexReader::get_analyse_result( - const std::wstring& field_name, const std::wstring& value, - InvertedIndexQueryType query_type, InvertedIndexParserType analyser_type) { - std::vector<std::string> analyse_result; +std::vector<std::wstring> FullTextIndexReader::get_analyse_result( + const std::wstring& field_name, const std::string& value, InvertedIndexQueryType query_type, + InvertedIndexParserType analyser_type) { + std::vector<std::wstring> analyse_result; std::shared_ptr<lucene::analysis::Analyzer> analyzer; + std::unique_ptr<lucene::util::Reader> reader; if (analyser_type == InvertedIndexParserType::PARSER_STANDARD) { analyzer = std::make_shared<lucene::analysis::standard::StandardAnalyzer>(); + reader.reset( + (new lucene::util::StringReader(std::wstring(value.begin(), value.end()).c_str()))); + } else if (analyser_type == InvertedIndexParserType::PARSER_CHINESE) { + auto chinese_analyzer = + std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false); + chinese_analyzer->initDict(config::inverted_index_dict_path); + analyzer = chinese_analyzer; + reader.reset(new lucene::util::SimpleInputStreamReader( + new lucene::util::AStringReader(value.c_str()), + lucene::util::SimpleInputStreamReader::UTF8)); } else { // default analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<TCHAR>>(); + reader.reset( + (new lucene::util::StringReader(std::wstring(value.begin(), value.end()).c_str()))); } - std::unique_ptr<lucene::util::StringReader> reader( - new lucene::util::StringReader(value.c_str())); std::unique_ptr<lucene::analysis::TokenStream> token_stream( analyzer->tokenStream(field_name.c_str(), reader.get())); lucene::analysis::Token token; while (token_stream->next(&token)) { - std::string tk = - lucene::util::Misc::toString(token.termBuffer<TCHAR>(), token.termLength<TCHAR>()); - analyse_result.emplace_back(tk); + if (token.termLength<TCHAR>() != 0) { + analyse_result.emplace_back( + std::wstring(token.termBuffer<TCHAR>(), token.termLength<TCHAR>())); + } } if (token_stream != nullptr) { @@ -78,7 +91,7 @@ std::vector<std::string> FullTextIndexReader::get_analyse_result( if (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY || query_type == InvertedIndexQueryType::MATCH_ALL_QUERY) { - std::set<std::string> unrepeated_result(analyse_result.begin(), analyse_result.end()); + std::set<std::wstring> unrepeated_result(analyse_result.begin(), analyse_result.end()); analyse_result.assign(unrepeated_result.begin(), unrepeated_result.end()); } @@ -100,10 +113,9 @@ Status FullTextIndexReader::query(const std::string& column_name, const void* qu << " begin to load the fulltext index from clucene, query_str=" << search_str; std::unique_ptr<lucene::search::Query> query; std::wstring field_ws = std::wstring(column_name.begin(), column_name.end()); - std::wstring search_str_ws = std::wstring(search_str.begin(), search_str.end()); try { - std::vector<std::string> analyse_result = - get_analyse_result(field_ws, search_str_ws, query_type, analyser_type); + std::vector<std::wstring> analyse_result = + get_analyse_result(field_ws, search_str, query_type, analyser_type); if (analyse_result.empty()) { LOG(WARNING) << "invalid input query_str: " << search_str @@ -114,8 +126,7 @@ Status FullTextIndexReader::query(const std::string& column_name, const void* qu switch (query_type) { case InvertedIndexQueryType::MATCH_ANY_QUERY: { query.reset(_CLNEW lucene::search::BooleanQuery()); - for (auto token : analyse_result) { - std::wstring token_ws = std::wstring(token.begin(), token.end()); + for (auto token_ws : analyse_result) { lucene::index::Term* term = _CLNEW lucene::index::Term(field_ws.c_str(), token_ws.c_str()); static_cast<lucene::search::BooleanQuery*>(query.get()) @@ -127,8 +138,7 @@ Status FullTextIndexReader::query(const std::string& column_name, const void* qu } case InvertedIndexQueryType::MATCH_ALL_QUERY: { query.reset(_CLNEW lucene::search::BooleanQuery()); - for (auto token : analyse_result) { - std::wstring token_ws = std::wstring(token.begin(), token.end()); + for (auto token_ws : analyse_result) { lucene::index::Term* term = _CLNEW lucene::index::Term(field_ws.c_str(), token_ws.c_str()); static_cast<lucene::search::BooleanQuery*>(query.get()) diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h index 70a21f3e77..dca374a9a2 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h @@ -75,7 +75,7 @@ public: virtual InvertedIndexReaderType type() = 0; bool indexExists(io::Path& index_file_path); - uint32_t get_index_id() { return _index_id; } + uint32_t get_index_id() const { return _index_id; } protected: bool _is_match_query(InvertedIndexQueryType query_type); @@ -103,10 +103,10 @@ public: } InvertedIndexReaderType type() override; - std::vector<std::string> get_analyse_result(const std::wstring& field_name, - const std::wstring& value, - InvertedIndexQueryType query_type, - InvertedIndexParserType analyser_type); + std::vector<std::wstring> get_analyse_result(const std::wstring& field_name, + const std::string& value, + InvertedIndexQueryType query_type, + InvertedIndexParserType analyser_type); }; class StringTypeInvertedIndexReader : public InvertedIndexReader { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org