This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new d3f897789c [improvement](invert index) Added lucene9.5 unicode tokenizer (#22217)
d3f897789c is described below
commit d3f897789c9c9bc1b4e36bf940b1761353e38061
Author: zzzxl <[email protected]>
AuthorDate: Wed Jul 26 00:50:24 2023 +0800
[improvement](invert index) Added lucene9.5 unicode tokenizer (#22217)
---
be/src/clucene | 2 +-
.../rowset/segment_v2/inverted_index_reader.cpp | 21 ++++++++++++++-------
.../rowset/segment_v2/inverted_index_writer.cpp | 11 +++++------
3 files changed, 20 insertions(+), 14 deletions(-)
diff --git a/be/src/clucene b/be/src/clucene
index 5dd6fca31d..313ae23c47 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 5dd6fca31d1a0226a29abfea7c03c9694401ec32
+Subproject commit 313ae23c47ea6f73289e79364a259e404458ac7f
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 0b7340bb40..8a9a08df0a 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -46,6 +46,7 @@
#include <roaring/roaring.hh>
#include <set>
+#include "CLucene/analysis/standard95/StandardAnalyzer.h"
#include "common/config.h"
#include "common/logging.h"
#include "io/fs/file_system.h"
@@ -105,10 +106,8 @@ std::vector<std::wstring> InvertedIndexReader::get_analyse_result(
reader.reset(
(new lucene::util::StringReader(std::wstring(value.begin(),
value.end()).c_str())));
} else if (analyser_type == InvertedIndexParserType::PARSER_UNICODE) {
- analyzer =
std::make_shared<lucene::analysis::standard::StandardAnalyzer>();
- reader.reset(new lucene::util::SimpleInputStreamReader(
- new lucene::util::AStringReader(value.c_str()),
- lucene::util::SimpleInputStreamReader::UTF8));
+ analyzer =
std::make_shared<lucene::analysis::standard95::StandardAnalyzer>();
+ reader.reset(new lucene::util::SStringReader<char>(value.data(),
value.size(), false));
} else if (analyser_type == InvertedIndexParserType::PARSER_CHINESE) {
auto chinese_analyzer =
std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);
@@ -139,9 +138,17 @@ std::vector<std::wstring> InvertedIndexReader::get_analyse_result(
lucene::analysis::Token token;
while (token_stream->next(&token)) {
- if (token.termLength<TCHAR>() != 0) {
- analyse_result.emplace_back(
- std::wstring(token.termBuffer<TCHAR>(),
token.termLength<TCHAR>()));
+ if (analyser_type == InvertedIndexParserType::PARSER_UNICODE) {
+ if (token.termLength<char>() != 0) {
+ std::string_view term(token.termBuffer<char>(),
token.termLength<char>());
+ std::wstring ws_term = lucene_utf8stows(term);
+ analyse_result.emplace_back(ws_term);
+ }
+ } else {
+ if (token.termLength<TCHAR>() != 0) {
+ analyse_result.emplace_back(
+ std::wstring(token.termBuffer<TCHAR>(),
token.termLength<TCHAR>()));
+ }
}
}
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index e6a5be6315..1762a60189 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -30,6 +30,7 @@
#include <roaring/roaring.hh>
#include <vector>
+#include "CLucene/analysis/standard95/StandardAnalyzer.h"
#include "common/config.h"
#include "olap/field.h"
#include "olap/inverted_index_parser.h"
@@ -154,9 +155,10 @@ public:
_doc = std::make_unique<lucene::document::Document>();
_dir.reset(DorisCompoundDirectory::getDirectory(_fs,
index_path.c_str(), true));
- if (_parser_type == InvertedIndexParserType::PARSER_STANDARD ||
- _parser_type == InvertedIndexParserType::PARSER_UNICODE) {
+ if (_parser_type == InvertedIndexParserType::PARSER_STANDARD) {
_analyzer =
std::make_unique<lucene::analysis::standard::StandardAnalyzer>();
+ } else if (_parser_type == InvertedIndexParserType::PARSER_UNICODE) {
+ _analyzer =
std::make_unique<lucene::analysis::standard95::StandardAnalyzer>();
} else if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH) {
_analyzer =
std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
} else if (_parser_type == InvertedIndexParserType::PARSER_CHINESE) {
@@ -224,10 +226,7 @@ public:
_parser_type == InvertedIndexParserType::PARSER_CHINESE) {
new_char_token_stream(field_value_data, field_value_size, _field);
} else if (_parser_type == InvertedIndexParserType::PARSER_UNICODE) {
- auto stringReader = _CLNEW lucene::util::SimpleInputStreamReader(
- new lucene::util::AStringReader(field_value_data,
field_value_size),
- lucene::util::SimpleInputStreamReader::UTF8);
- _field->setValue(stringReader);
+ new_char_token_stream(field_value_data, field_value_size, _field);
} else if (_parser_type == InvertedIndexParserType::PARSER_STANDARD) {
new_field_value(field_value_data, field_value_size, _field);
} else {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]