This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 171f374f56 [improvement](invert index) Change the loading method of keyword type (#21893)
171f374f56 is described below

commit 171f374f5629752266e0be15525292a972f3256e
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Wed Jul 19 15:26:49 2023 +0800

    [improvement](invert index) Change the loading method of keyword type (#21893)
    
    1. Fix failure to index Chinese text
    2. Optimize inverted index loading
---
 be/src/clucene                                          |  2 +-
 be/src/olap/rowset/segment_v2/inverted_index_reader.cpp |  3 ++-
 be/src/olap/rowset/segment_v2/inverted_index_writer.cpp | 10 ++++++++--
 3 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/be/src/clucene b/be/src/clucene
index 103e88a8a3..5dd6fca31d 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 103e88a8a3b24da9ae2a0d9908a3ceb3f7808a61
+Subproject commit 5dd6fca31d1a0226a29abfea7c03c9694401ec32
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 0a935bec6a..d382d74aab 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -22,6 +22,7 @@
 #include <CLucene/analysis/LanguageBasedAnalyzer.h>
 #include <CLucene/analysis/standard/StandardAnalyzer.h>
 #include <CLucene/clucene-config.h>
+#include <CLucene/config/repl_wchar.h>
 #include <CLucene/debug/error.h>
 #include <CLucene/debug/mem.h>
 #include <CLucene/index/IndexReader.h>
@@ -425,7 +426,7 @@ Status 
StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats,
     VLOG_DEBUG << "begin to query the inverted index from clucene"
                << ", column_name: " << column_name << ", search_str: " << 
search_str;
     std::wstring column_name_ws = std::wstring(column_name.begin(), 
column_name.end());
-    std::wstring search_str_ws = std::wstring(search_str.begin(), 
search_str.end());
+    std::wstring search_str_ws = lucene_utf8stows(search_str);
     // unique_ptr with custom deleter
     std::unique_ptr<lucene::index::Term, void (*)(lucene::index::Term*)> term {
             _CLNEW lucene::index::Term(column_name_ws.c_str(), 
search_str_ws.c_str()),
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index fcf125b2fa..e6a5be6315 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -172,7 +172,7 @@ public:
             _analyzer.reset(chinese_analyzer);
         } else {
             // ANALYSER_NOT_SET, ANALYSER_NONE use default SimpleAnalyzer
-            _analyzer = 
std::make_unique<lucene::analysis::SimpleAnalyzer<TCHAR>>();
+            _analyzer = 
std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
         }
         _index_writer = 
std::make_unique<lucene::index::IndexWriter>(_dir.get(), _analyzer.get(),
                                                                      create, 
true);
@@ -228,8 +228,10 @@ public:
                     new lucene::util::AStringReader(field_value_data, 
field_value_size),
                     lucene::util::SimpleInputStreamReader::UTF8);
             _field->setValue(stringReader);
-        } else {
+        } else if (_parser_type == InvertedIndexParserType::PARSER_STANDARD) {
             new_field_value(field_value_data, field_value_size, _field);
+        } else {
+            new_field_char_value(field_value_data, field_value_size, _field);
         }
     }
 
@@ -246,6 +248,10 @@ public:
         //_CLDELETE_ARRAY(field_value)
     }
 
+    void new_field_char_value(const char* s, size_t len, 
lucene::document::Field* field) {
+        field->setValue((char*)s, len);
+    }
+
     Status add_values(const std::string fn, const void* values, size_t count) 
override {
         if constexpr (field_is_slice_type(field_type)) {
             if (_field == nullptr || _index_writer == nullptr) {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to