This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git


The following commit(s) were added to refs/heads/clucene by this push:
     new 63ae98a8bc [fix](chinese) fix the issue where the be crashes due to the missing Chinese dict (#182)
63ae98a8bc is described below

commit 63ae98a8bc280dc4728dca744c3fe06e7a38caf1
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Thu Feb 1 18:04:26 2024 +0800

    [fix](chinese) fix the issue where the be crashes due to the missing Chinese dict (#182)
---
 .../CLucene/analysis/LanguageBasedAnalyzer.cpp     | 14 +++++++++++-
 .../CLucene/analysis/jieba/ChineseTokenizer.cpp    |  8 +++----
 .../CLucene/analysis/jieba/ChineseTokenizer.h      | 25 ++++++++++++++++------
 3 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
index 2a32ff04fa..6adfcf1e34 100644
--- a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
+++ b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
@@ -4,6 +4,7 @@
 * Distributable under the terms of either the Apache License (Version 2.0) or
 * the GNU Lesser General Public License, as specified in the COPYING file.
 ------------------------------------------------------------------------------*/
+#include <fstream>
 #include "CLucene/_ApiHeader.h"
 
 #include "CLucene/analysis/Analyzers.h"
@@ -64,7 +65,18 @@ void LanguageBasedAnalyzer::setMode(AnalyzerMode m) {
 
 void LanguageBasedAnalyzer::initDict(const std::string &dictPath) {
     if (_tcscmp(lang, _T("chinese")) == 0) {
-        CL_NS2(analysis, jieba)::ChineseTokenizer::init(dictPath);
+        ChineseDict chineseDict;
+        chineseDict.dictPath_ = dictPath;
+
+        for (const auto& file : chineseDict.files_) {
+            std::string path = dictPath + "/" + file;
+            std::ifstream in(path);
+            if (!in.good()) {
+                _CLTHROWA(CL_ERR_IO, std::string("chinese tokenizer dict file not found: " + path).c_str());
+            }
+        }
+
+        CL_NS2(analysis, jieba)::ChineseTokenizer::init(&chineseDict);
     }
 }
 
diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
index 9a7f5eddfd..ef46315ff5 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
@@ -17,11 +17,11 @@ ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode m,
     Tokenizer::lowercase = lowercase;
 }
 
-void ChineseTokenizer::init(const std::string &dictPath) {
-    JiebaSingleton::getInstance(dictPath);
+void ChineseTokenizer::init(const ChineseDict* chineseDict) {
+    JiebaSingleton::getInstance(chineseDict);
 }
 
-CL_NS(analysis)::Token *ChineseTokenizer::next(lucene::analysis::Token *token) {
+CL_NS(analysis)::Token* ChineseTokenizer::next(lucene::analysis::Token* token) {
     if (bufferIndex >= dataLen) {
         return nullptr;
     }
@@ -29,7 +29,7 @@ CL_NS(analysis)::Token *ChineseTokenizer::next(lucene::analysis::Token *token) {
     std::string_view& token_text = tokens_text[bufferIndex++];
     size_t size = std::min(token_text.size(), static_cast<size_t>(LUCENE_MAX_WORD_LEN));
     if (Tokenizer::lowercase) {
-        if (!token_text.empty() && token_text[0] < 0x80) {
+        if (!token_text.empty() && static_cast<uint8_t>(token_text[0]) < 0x80) {
             std::transform(token_text.begin(), token_text.end(),
                            const_cast<char*>(token_text.data()),
                            [](char c) { return to_lower(c); });
diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
index 9fe33f5805..09760b7b1c 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
+++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
@@ -14,14 +14,25 @@
 CL_NS_DEF2(analysis,jieba)
 CL_NS_USE(analysis)
 
+struct ChineseDict {
+    std::string dictPath_;
+    std::vector<std::string> files_ = {
+        "jieba.dict.utf8",
+        "hmm_model.utf8",
+        "user.dict.utf8",
+        "idf.utf8",
+        "stop_words.utf8"
+    };
+};
+
 class JiebaSingleton {
 public:
-    static cppjieba::Jieba& getInstance(const std::string& dictPath = "") {
-        static cppjieba::Jieba instance(dictPath + "/" + "jieba.dict.utf8",
-                                        dictPath + "/" + "hmm_model.utf8",
-                                        dictPath + "/" + "user.dict.utf8",
-                                        dictPath + "/" + "idf.utf8",
-                                        dictPath + "/" + "stop_words.utf8");
+    static cppjieba::Jieba& getInstance(const ChineseDict* dict = nullptr) {
+        static cppjieba::Jieba instance(dict->dictPath_ + "/" + dict->files_[0],
+                                        dict->dictPath_ + "/" + dict->files_[1],
+                                        dict->dictPath_ + "/" + dict->files_[2],
+                                        dict->dictPath_ + "/" + dict->files_[3],
+                                        dict->dictPath_ + "/" + dict->files_[4]);
         return instance;
     }
 
@@ -46,7 +57,7 @@ public:
     // Constructor
     explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode);
     explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode, bool lowercase);
-    static void init(const std::string& dictPath="");
+    static void init(const ChineseDict* chineseDict);
 
     // Destructor
     ~ChineseTokenizer() override = default;
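
For readers skimming the diff: the crash fix is essentially a pre-flight existence check on the five jieba dictionary files before cppjieba::Jieba is constructed, so a missing dictionary now surfaces as a CL_ERR_IO error instead of a BE crash. Below is a minimal standalone sketch of the same pattern; the checkChineseDict helper name is illustrative only and not part of this patch, which throws via _CLTHROWA inside LanguageBasedAnalyzer::initDict.

    // Sketch only: verify every jieba dictionary file is readable before
    // building any tokenizer, mirroring the check added above.
    #include <fstream>
    #include <stdexcept>
    #include <string>
    #include <vector>

    void checkChineseDict(const std::string& dictPath) {
        const std::vector<std::string> files = {
            "jieba.dict.utf8", "hmm_model.utf8", "user.dict.utf8",
            "idf.utf8", "stop_words.utf8"};
        for (const auto& file : files) {
            const std::string path = dictPath + "/" + file;
            std::ifstream in(path);
            if (!in.good()) {
                // The patch raises CL_ERR_IO via _CLTHROWA; a plain
                // exception stands in for it in this sketch.
                throw std::runtime_error("chinese tokenizer dict file not found: " + path);
            }
        }
    }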


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org
