[doris] branch master updated: [improvement](invert index) Added lucene9.5 unicode tokenizer (#22217)

kxiao Tue, 25 Jul 2023 09:50:55 -0700

This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/master by this push:
     new 111957401b [improvement](invert index) Added lucene9.5 unicode tokenizer (#22217)
111957401b is described below

commit 111957401bd420a7fa134808f74c44215b97f3a2
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Wed Jul 26 00:50:24 2023 +0800

    [improvement](invert index) Added lucene9.5 unicode tokenizer (#22217)
---
 be/src/clucene                                      |  2 +-
 .../rowset/segment_v2/inverted_index_reader.cpp     | 21 ++++++++++++++-------
 .../rowset/segment_v2/inverted_index_writer.cpp     | 11 +++++------
 3 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/be/src/clucene b/be/src/clucene
index 5dd6fca31d..313ae23c47 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 5dd6fca31d1a0226a29abfea7c03c9694401ec32
+Subproject commit 313ae23c47ea6f73289e79364a259e404458ac7f
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 0b7340bb40..8a9a08df0a 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -46,6 +46,7 @@
 #include <roaring/roaring.hh>
 #include <set>
 
+#include "CLucene/analysis/standard95/StandardAnalyzer.h"
 #include "common/config.h"
 #include "common/logging.h"
 #include "io/fs/file_system.h"
@@ -105,10 +106,8 @@ std::vector<std::wstring> 
InvertedIndexReader::get_analyse_result(
         reader.reset(
                 (new lucene::util::StringReader(std::wstring(value.begin(), 
value.end()).c_str())));
     } else if (analyser_type == InvertedIndexParserType::PARSER_UNICODE) {
-        analyzer = 
std::make_shared<lucene::analysis::standard::StandardAnalyzer>();
-        reader.reset(new lucene::util::SimpleInputStreamReader(
-                new lucene::util::AStringReader(value.c_str()),
-                lucene::util::SimpleInputStreamReader::UTF8));
+        analyzer = 
std::make_shared<lucene::analysis::standard95::StandardAnalyzer>();
+        reader.reset(new lucene::util::SStringReader<char>(value.data(), 
value.size(), false));
     } else if (analyser_type == InvertedIndexParserType::PARSER_CHINESE) {
         auto chinese_analyzer =
                 
std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);
@@ -139,9 +138,17 @@ std::vector<std::wstring> 
InvertedIndexReader::get_analyse_result(
     lucene::analysis::Token token;
 
     while (token_stream->next(&token)) {
-        if (token.termLength<TCHAR>() != 0) {
-            analyse_result.emplace_back(
-                    std::wstring(token.termBuffer<TCHAR>(), 
token.termLength<TCHAR>()));
+        if (analyser_type == InvertedIndexParserType::PARSER_UNICODE) {
+            if (token.termLength<char>() != 0) {
+                std::string_view term(token.termBuffer<char>(), 
token.termLength<char>());
+                std::wstring ws_term = lucene_utf8stows(term);
+                analyse_result.emplace_back(ws_term);
+            }
+        } else {
+            if (token.termLength<TCHAR>() != 0) {
+                analyse_result.emplace_back(
+                        std::wstring(token.termBuffer<TCHAR>(), 
token.termLength<TCHAR>()));
+            }
         }
     }
 
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index e6a5be6315..1762a60189 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -30,6 +30,7 @@
 #include <roaring/roaring.hh>
 #include <vector>
 
+#include "CLucene/analysis/standard95/StandardAnalyzer.h"
 #include "common/config.h"
 #include "olap/field.h"
 #include "olap/inverted_index_parser.h"
@@ -154,9 +155,10 @@ public:
         _doc = std::make_unique<lucene::document::Document>();
         _dir.reset(DorisCompoundDirectory::getDirectory(_fs, 
index_path.c_str(), true));
 
-        if (_parser_type == InvertedIndexParserType::PARSER_STANDARD ||
-            _parser_type == InvertedIndexParserType::PARSER_UNICODE) {
+        if (_parser_type == InvertedIndexParserType::PARSER_STANDARD) {
             _analyzer = 
std::make_unique<lucene::analysis::standard::StandardAnalyzer>();
+        } else if (_parser_type == InvertedIndexParserType::PARSER_UNICODE) {
+            _analyzer = 
std::make_unique<lucene::analysis::standard95::StandardAnalyzer>();
         } else if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH) {
             _analyzer = 
std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
         } else if (_parser_type == InvertedIndexParserType::PARSER_CHINESE) {
@@ -224,10 +226,7 @@ public:
             _parser_type == InvertedIndexParserType::PARSER_CHINESE) {
             new_char_token_stream(field_value_data, field_value_size, _field);
         } else if (_parser_type == InvertedIndexParserType::PARSER_UNICODE) {
-            auto stringReader = _CLNEW lucene::util::SimpleInputStreamReader(
-                    new lucene::util::AStringReader(field_value_data, 
field_value_size),
-                    lucene::util::SimpleInputStreamReader::UTF8);
-            _field->setValue(stringReader);
+            new_char_token_stream(field_value_data, field_value_size, _field);
         } else if (_parser_type == InvertedIndexParserType::PARSER_STANDARD) {
             new_field_value(field_value_data, field_value_size, _field);
         } else {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

[doris] branch master updated: [improvement](invert index) Added lucene9.5 unicode tokenizer (#22217)

Reply via email to