This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new e6a5d3375e [Feature-WIP](inverted index) add chinese analyzer for inverted index reader (#15998)
e6a5d3375e is described below

commit e6a5d3375e3587c69fafd53ef3f62416959e3f36
Author: YueW <45946325+tany...@users.noreply.github.com>
AuthorDate: Tue Jan 17 20:20:40 2023 +0800

    [Feature-WIP](inverted index) add chinese analyzer for inverted index reader (#15998)
    
    add chinese analyzer for inverted index reader
    dependency pr: #14211 #15807 #15823
---
 .../rowset/segment_v2/inverted_index_reader.cpp    | 44 +++++++++++++---------
 .../olap/rowset/segment_v2/inverted_index_reader.h | 10 ++---
 2 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index a11c076df2..5671f268c9 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -17,6 +17,7 @@
 
 #include "olap/rowset/segment_v2/inverted_index_reader.h"
 
+#include <CLucene/analysis/LanguageBasedAnalyzer.h>
 #include <CLucene/search/BooleanQuery.h>
 #include <CLucene/search/PhraseQuery.h>
 
@@ -47,29 +48,41 @@ bool InvertedIndexReader::indexExists(io::Path& 
index_file_path) {
     return exists;
 }
 
-std::vector<std::string> FullTextIndexReader::get_analyse_result(
-        const std::wstring& field_name, const std::wstring& value,
-        InvertedIndexQueryType query_type, InvertedIndexParserType 
analyser_type) {
-    std::vector<std::string> analyse_result;
+std::vector<std::wstring> FullTextIndexReader::get_analyse_result(
+        const std::wstring& field_name, const std::string& value, 
InvertedIndexQueryType query_type,
+        InvertedIndexParserType analyser_type) {
+    std::vector<std::wstring> analyse_result;
     std::shared_ptr<lucene::analysis::Analyzer> analyzer;
+    std::unique_ptr<lucene::util::Reader> reader;
     if (analyser_type == InvertedIndexParserType::PARSER_STANDARD) {
         analyzer = 
std::make_shared<lucene::analysis::standard::StandardAnalyzer>();
+        reader.reset(
+                (new lucene::util::StringReader(std::wstring(value.begin(), 
value.end()).c_str())));
+    } else if (analyser_type == InvertedIndexParserType::PARSER_CHINESE) {
+        auto chinese_analyzer =
+                
std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);
+        chinese_analyzer->initDict(config::inverted_index_dict_path);
+        analyzer = chinese_analyzer;
+        reader.reset(new lucene::util::SimpleInputStreamReader(
+                new lucene::util::AStringReader(value.c_str()),
+                lucene::util::SimpleInputStreamReader::UTF8));
     } else {
         // default
         analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<TCHAR>>();
+        reader.reset(
+                (new lucene::util::StringReader(std::wstring(value.begin(), 
value.end()).c_str())));
     }
 
-    std::unique_ptr<lucene::util::StringReader> reader(
-            new lucene::util::StringReader(value.c_str()));
     std::unique_ptr<lucene::analysis::TokenStream> token_stream(
             analyzer->tokenStream(field_name.c_str(), reader.get()));
 
     lucene::analysis::Token token;
 
     while (token_stream->next(&token)) {
-        std::string tk =
-                lucene::util::Misc::toString(token.termBuffer<TCHAR>(), 
token.termLength<TCHAR>());
-        analyse_result.emplace_back(tk);
+        if (token.termLength<TCHAR>() != 0) {
+            analyse_result.emplace_back(
+                    std::wstring(token.termBuffer<TCHAR>(), 
token.termLength<TCHAR>()));
+        }
     }
 
     if (token_stream != nullptr) {
@@ -78,7 +91,7 @@ std::vector<std::string> 
FullTextIndexReader::get_analyse_result(
 
     if (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY ||
         query_type == InvertedIndexQueryType::MATCH_ALL_QUERY) {
-        std::set<std::string> unrepeated_result(analyse_result.begin(), 
analyse_result.end());
+        std::set<std::wstring> unrepeated_result(analyse_result.begin(), 
analyse_result.end());
         analyse_result.assign(unrepeated_result.begin(), 
unrepeated_result.end());
     }
 
@@ -100,10 +113,9 @@ Status FullTextIndexReader::query(const std::string& 
column_name, const void* qu
                << " begin to load the fulltext index from clucene, query_str=" 
<< search_str;
     std::unique_ptr<lucene::search::Query> query;
     std::wstring field_ws = std::wstring(column_name.begin(), 
column_name.end());
-    std::wstring search_str_ws = std::wstring(search_str.begin(), 
search_str.end());
     try {
-        std::vector<std::string> analyse_result =
-                get_analyse_result(field_ws, search_str_ws, query_type, 
analyser_type);
+        std::vector<std::wstring> analyse_result =
+                get_analyse_result(field_ws, search_str, query_type, 
analyser_type);
 
         if (analyse_result.empty()) {
             LOG(WARNING) << "invalid input query_str: " << search_str
@@ -114,8 +126,7 @@ Status FullTextIndexReader::query(const std::string& 
column_name, const void* qu
         switch (query_type) {
         case InvertedIndexQueryType::MATCH_ANY_QUERY: {
             query.reset(_CLNEW lucene::search::BooleanQuery());
-            for (auto token : analyse_result) {
-                std::wstring token_ws = std::wstring(token.begin(), 
token.end());
+            for (auto token_ws : analyse_result) {
                 lucene::index::Term* term =
                         _CLNEW lucene::index::Term(field_ws.c_str(), 
token_ws.c_str());
                 static_cast<lucene::search::BooleanQuery*>(query.get())
@@ -127,8 +138,7 @@ Status FullTextIndexReader::query(const std::string& 
column_name, const void* qu
         }
         case InvertedIndexQueryType::MATCH_ALL_QUERY: {
             query.reset(_CLNEW lucene::search::BooleanQuery());
-            for (auto token : analyse_result) {
-                std::wstring token_ws = std::wstring(token.begin(), 
token.end());
+            for (auto token_ws : analyse_result) {
                 lucene::index::Term* term =
                         _CLNEW lucene::index::Term(field_ws.c_str(), 
token_ws.c_str());
                 static_cast<lucene::search::BooleanQuery*>(query.get())
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
index 70a21f3e77..dca374a9a2 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
@@ -75,7 +75,7 @@ public:
     virtual InvertedIndexReaderType type() = 0;
     bool indexExists(io::Path& index_file_path);
 
-    uint32_t get_index_id() { return _index_id; }
+    uint32_t get_index_id() const { return _index_id; }
 
 protected:
     bool _is_match_query(InvertedIndexQueryType query_type);
@@ -103,10 +103,10 @@ public:
     }
 
     InvertedIndexReaderType type() override;
-    std::vector<std::string> get_analyse_result(const std::wstring& field_name,
-                                                const std::wstring& value,
-                                                InvertedIndexQueryType 
query_type,
-                                                InvertedIndexParserType 
analyser_type);
+    std::vector<std::wstring> get_analyse_result(const std::wstring& 
field_name,
+                                                 const std::string& value,
+                                                 InvertedIndexQueryType 
query_type,
+                                                 InvertedIndexParserType 
analyser_type);
 };
 
 class StringTypeInvertedIndexReader : public InvertedIndexReader {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to