This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
     new 111957401b [improvement](invert index) Added lucene9.5 unicode tokenizer (#22217)
111957401b is described below

commit 111957401bd420a7fa134808f74c44215b97f3a2
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Wed Jul 26 00:50:24 2023 +0800

    [improvement](invert index) Added lucene9.5 unicode tokenizer (#22217)
---
 be/src/clucene                                      |  2 +-
 .../rowset/segment_v2/inverted_index_reader.cpp     | 21 ++++++++++++++-------
 .../rowset/segment_v2/inverted_index_writer.cpp     | 11 +++++------
 3 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/be/src/clucene b/be/src/clucene
index 5dd6fca31d..313ae23c47 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 5dd6fca31d1a0226a29abfea7c03c9694401ec32
+Subproject commit 313ae23c47ea6f73289e79364a259e404458ac7f
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 0b7340bb40..8a9a08df0a 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -46,6 +46,7 @@
 #include <roaring/roaring.hh>
 #include <set>

+#include "CLucene/analysis/standard95/StandardAnalyzer.h"
 #include "common/config.h"
 #include "common/logging.h"
 #include "io/fs/file_system.h"
@@ -105,10 +106,8 @@ std::vector<std::wstring> InvertedIndexReader::get_analyse_result(
         reader.reset(
                 (new lucene::util::StringReader(std::wstring(value.begin(), value.end()).c_str())));
     } else if (analyser_type == InvertedIndexParserType::PARSER_UNICODE) {
-        analyzer = std::make_shared<lucene::analysis::standard::StandardAnalyzer>();
-        reader.reset(new lucene::util::SimpleInputStreamReader(
-                new lucene::util::AStringReader(value.c_str()),
-                lucene::util::SimpleInputStreamReader::UTF8));
+        analyzer = std::make_shared<lucene::analysis::standard95::StandardAnalyzer>();
+        reader.reset(new lucene::util::SStringReader<char>(value.data(), value.size(), false));
     } else if (analyser_type == InvertedIndexParserType::PARSER_CHINESE) {
         auto chinese_analyzer =
                 std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);
@@ -139,9 +138,17 @@ std::vector<std::wstring> InvertedIndexReader::get_analyse_result(

     lucene::analysis::Token token;
     while (token_stream->next(&token)) {
-        if (token.termLength<TCHAR>() != 0) {
-            analyse_result.emplace_back(
-                    std::wstring(token.termBuffer<TCHAR>(), token.termLength<TCHAR>()));
+        if (analyser_type == InvertedIndexParserType::PARSER_UNICODE) {
+            if (token.termLength<char>() != 0) {
+                std::string_view term(token.termBuffer<char>(), token.termLength<char>());
+                std::wstring ws_term = lucene_utf8stows(term);
+                analyse_result.emplace_back(ws_term);
+            }
+        } else {
+            if (token.termLength<TCHAR>() != 0) {
+                analyse_result.emplace_back(
+                        std::wstring(token.termBuffer<TCHAR>(), token.termLength<TCHAR>()));
+            }
         }
     }

diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index e6a5be6315..1762a60189 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -30,6 +30,7 @@
 #include <roaring/roaring.hh>
 #include <vector>

+#include "CLucene/analysis/standard95/StandardAnalyzer.h"
 #include "common/config.h"
 #include "olap/field.h"
 #include "olap/inverted_index_parser.h"
@@ -154,9 +155,10 @@ public:
         _doc = std::make_unique<lucene::document::Document>();
         _dir.reset(DorisCompoundDirectory::getDirectory(_fs, index_path.c_str(), true));

-        if (_parser_type == InvertedIndexParserType::PARSER_STANDARD ||
-            _parser_type == InvertedIndexParserType::PARSER_UNICODE) {
+        if (_parser_type == InvertedIndexParserType::PARSER_STANDARD) {
             _analyzer = std::make_unique<lucene::analysis::standard::StandardAnalyzer>();
+        } else if (_parser_type == InvertedIndexParserType::PARSER_UNICODE) {
+            _analyzer = std::make_unique<lucene::analysis::standard95::StandardAnalyzer>();
         } else if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH) {
             _analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
         } else if (_parser_type == InvertedIndexParserType::PARSER_CHINESE) {
@@ -224,10 +226,7 @@ public:
                     _parser_type == InvertedIndexParserType::PARSER_CHINESE) {
                     new_char_token_stream(field_value_data, field_value_size, _field);
                 } else if (_parser_type == InvertedIndexParserType::PARSER_UNICODE) {
-                    auto stringReader = _CLNEW lucene::util::SimpleInputStreamReader(
-                            new lucene::util::AStringReader(field_value_data, field_value_size),
-                            lucene::util::SimpleInputStreamReader::UTF8);
-                    _field->setValue(stringReader);
+                    new_char_token_stream(field_value_data, field_value_size, _field);
                 } else if (_parser_type == InvertedIndexParserType::PARSER_STANDARD) {
                     new_field_value(field_value_data, field_value_size, _field);
                 } else {

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org