This is an automated email from the ASF dual-hosted git repository. jianliangqi pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
     new 171f374f56 [improvement](invert index) Change the loading method of keyword type (#21893)
171f374f56 is described below

commit 171f374f5629752266e0be15525292a972f3256e
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Wed Jul 19 15:26:49 2023 +0800

    [improvement](invert index) Change the loading method of keyword type (#21893)

    1. fix can not index Chinese
    2. optimized invert index load
---
 be/src/clucene | 2 +-
 be/src/olap/rowset/segment_v2/inverted_index_reader.cpp | 3 ++-
 be/src/olap/rowset/segment_v2/inverted_index_writer.cpp | 10 ++++++++--
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/be/src/clucene b/be/src/clucene
index 103e88a8a3..5dd6fca31d 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 103e88a8a3b24da9ae2a0d9908a3ceb3f7808a61
+Subproject commit 5dd6fca31d1a0226a29abfea7c03c9694401ec32
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 0a935bec6a..d382d74aab 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -22,6 +22,7 @@
 #include <CLucene/analysis/LanguageBasedAnalyzer.h>
 #include <CLucene/analysis/standard/StandardAnalyzer.h>
 #include <CLucene/clucene-config.h>
+#include <CLucene/config/repl_wchar.h>
 #include <CLucene/debug/error.h>
 #include <CLucene/debug/mem.h>
 #include <CLucene/index/IndexReader.h>
@@ -425,7 +426,7 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats,
 VLOG_DEBUG << "begin to query the inverted index from clucene"
 << ", column_name: " << column_name << ", search_str: " << search_str;
 std::wstring column_name_ws = std::wstring(column_name.begin(), column_name.end());
- std::wstring search_str_ws = std::wstring(search_str.begin(), search_str.end());
+ std::wstring search_str_ws = lucene_utf8stows(search_str);
 // unique_ptr with custom deleter
 std::unique_ptr<lucene::index::Term, void (*)(lucene::index::Term*)> term {
 _CLNEW lucene::index::Term(column_name_ws.c_str(), search_str_ws.c_str()),
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index fcf125b2fa..e6a5be6315 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -172,7 +172,7 @@ public:
 _analyzer.reset(chinese_analyzer);
 } else {
 // ANALYSER_NOT_SET, ANALYSER_NONE use default SimpleAnalyzer
- _analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<TCHAR>>();
+ _analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
 }
 _index_writer = std::make_unique<lucene::index::IndexWriter>(_dir.get(), _analyzer.get(),
 create, true);
@@ -228,8 +228,10 @@ public:
 new lucene::util::AStringReader(field_value_data, field_value_size),
 lucene::util::SimpleInputStreamReader::UTF8);
 _field->setValue(stringReader);
- } else {
+ } else if (_parser_type == InvertedIndexParserType::PARSER_STANDARD) {
 new_field_value(field_value_data, field_value_size, _field);
+ } else {
+ new_field_char_value(field_value_data, field_value_size, _field);
 }
 }
@@ -246,6 +248,10 @@ public:
 //_CLDELETE_ARRAY(field_value)
 }
+ void new_field_char_value(const char* s, size_t len, lucene::document::Field* field) {
+ field->setValue((char*)s, len);
+ }
+
 Status add_values(const std::string fn, const void* values, size_t count) override {
 if constexpr (field_is_slice_type(field_type)) {
 if (_field == nullptr || _index_writer == nullptr) {

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org