This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
     new 63ae98a8bc [fix](chinese) fix the issue where the be crashes due to the missing Chinese dict (#182)
63ae98a8bc is described below

commit 63ae98a8bc280dc4728dca744c3fe06e7a38caf1
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Thu Feb 1 18:04:26 2024 +0800

    [fix](chinese) fix the issue where the be crashes due to the missing Chinese dict (#182)
---
 .../CLucene/analysis/LanguageBasedAnalyzer.cpp    | 14 +++++++++++-
 .../CLucene/analysis/jieba/ChineseTokenizer.cpp   |  8 +++----
 .../CLucene/analysis/jieba/ChineseTokenizer.h     | 25 ++++++++++++++++------
 3 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
index 2a32ff04fa..6adfcf1e34 100644
--- a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
+++ b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
@@ -4,6 +4,7 @@
  * Distributable under the terms of either the Apache License (Version 2.0) or
  * the GNU Lesser General Public License, as specified in the COPYING file.
 ------------------------------------------------------------------------------*/
+#include <fstream>
 
 #include "CLucene/_ApiHeader.h"
 #include "CLucene/analysis/Analyzers.h"
@@ -64,7 +65,18 @@ void LanguageBasedAnalyzer::setMode(AnalyzerMode m) {
 
 void LanguageBasedAnalyzer::initDict(const std::string &dictPath) {
     if (_tcscmp(lang, _T("chinese")) == 0) {
-        CL_NS2(analysis, jieba)::ChineseTokenizer::init(dictPath);
+        ChineseDict chineseDict;
+        chineseDict.dictPath_ = dictPath;
+
+        for (const auto& file : chineseDict.files_) {
+            std::string path = dictPath + "/" + file;
+            std::ifstream in(path);
+            if (!in.good()) {
+                _CLTHROWA(CL_ERR_IO, std::string("chinese tokenizer dict file not found: " + path).c_str());
+            }
+        }
+
+        CL_NS2(analysis, jieba)::ChineseTokenizer::init(&chineseDict);
     }
 }

diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
index 9a7f5eddfd..ef46315ff5 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
@@ -17,11 +17,11 @@ ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode m,
     Tokenizer::lowercase = lowercase;
 }
 
-void ChineseTokenizer::init(const std::string &dictPath) {
-    JiebaSingleton::getInstance(dictPath);
+void ChineseTokenizer::init(const ChineseDict* chineseDict) {
+    JiebaSingleton::getInstance(chineseDict);
 }
 
-CL_NS(analysis)::Token *ChineseTokenizer::next(lucene::analysis::Token *token) {
+CL_NS(analysis)::Token* ChineseTokenizer::next(lucene::analysis::Token* token) {
     if (bufferIndex >= dataLen) {
         return nullptr;
     }
@@ -29,7 +29,7 @@ CL_NS(analysis)::Token *ChineseTokenizer::next(lucene::analysis::Token *token) {
     std::string_view& token_text = tokens_text[bufferIndex++];
     size_t size = std::min(token_text.size(), static_cast<size_t>(LUCENE_MAX_WORD_LEN));
     if (Tokenizer::lowercase) {
-        if (!token_text.empty() && token_text[0] < 0x80) {
+        if (!token_text.empty() && static_cast<uint8_t>(token_text[0]) < 0x80) {
             std::transform(token_text.begin(), token_text.end(), const_cast<char*>(token_text.data()),
                            [](char c) { return to_lower(c); });
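A note on the cast added in ChineseTokenizer::next: on platforms where plain char is signed, UTF-8 lead bytes such as 0xE4 are stored as negative values, so the old `token_text[0] < 0x80` test also accepted multi-byte Chinese tokens and let the ASCII lowercase transform mangle them. Comparing through uint8_t makes the test operate on the byte value 0..255 regardless of char signedness. A minimal standalone sketch of the pitfall (illustrative names, not Doris code):

    #include <cstdint>
    #include <iostream>
    #include <string_view>

    // Where plain char is signed, a UTF-8 lead byte such as 0xE4
    // ("中" is encoded 0xE4 0xB8 0xAD) is stored as a negative value,
    // so a plain `c < 0x80` test also accepts non-ASCII bytes.
    bool is_ascii_unsafe(std::string_view s) {
        return !s.empty() && s[0] < 0x80;                        // negative bytes slip through
    }

    bool is_ascii_safe(std::string_view s) {
        return !s.empty() && static_cast<uint8_t>(s[0]) < 0x80;  // compare on 0..255
    }

    int main() {
        std::string_view ascii = "doris";
        std::string_view chinese = "\xE4\xB8\xAD";               // UTF-8 for "中"
        std::cout << is_ascii_unsafe(chinese) << '\n';           // 1 on signed-char platforms
        std::cout << is_ascii_safe(chinese) << '\n';             // 0, as intended
        std::cout << is_ascii_safe(ascii) << '\n';               // 1
    }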
diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
index 9fe33f5805..09760b7b1c 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
+++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
@@ -14,14 +14,25 @@ CL_NS_DEF2(analysis,jieba)
 CL_NS_USE(analysis)
 
+struct ChineseDict {
+    std::string dictPath_;
+    std::vector<std::string> files_ = {
+        "jieba.dict.utf8",
+        "hmm_model.utf8",
+        "user.dict.utf8",
+        "idf.utf8",
+        "stop_words.utf8"
+    };
+};
+
 class JiebaSingleton {
 public:
-    static cppjieba::Jieba& getInstance(const std::string& dictPath = "") {
-        static cppjieba::Jieba instance(dictPath + "/" + "jieba.dict.utf8",
-                                        dictPath + "/" + "hmm_model.utf8",
-                                        dictPath + "/" + "user.dict.utf8",
-                                        dictPath + "/" + "idf.utf8",
-                                        dictPath + "/" + "stop_words.utf8");
+    static cppjieba::Jieba& getInstance(const ChineseDict* dict = nullptr) {
+        static cppjieba::Jieba instance(dict->dictPath_ + "/" + dict->files_[0],
+                                        dict->dictPath_ + "/" + dict->files_[1],
+                                        dict->dictPath_ + "/" + dict->files_[2],
+                                        dict->dictPath_ + "/" + dict->files_[3],
+                                        dict->dictPath_ + "/" + dict->files_[4]);
         return instance;
     }
 
@@ -46,7 +57,7 @@ public:
     // Constructor
     explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode);
    explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode, bool lowercase);
-    static void init(const std::string& dictPath="");
+    static void init(const ChineseDict* chineseDict);
 
     // Destructor
     ~ChineseTokenizer() override = default;

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org
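Why the pre-flight check in LanguageBasedAnalyzer::initDict fixes the crash: JiebaSingleton::getInstance is a Meyers singleton, so the cppjieba::Jieba constructor runs exactly once, on the first call; before this patch a missing dictionary file made that constructor fail inside the BE process. Probing every file first turns the failure into a catchable CLucene IO error. A simplified sketch of the validate-before-construct pattern (DictConfig and validateDict are hypothetical stand-ins, not the patch's API):

    #include <fstream>
    #include <iostream>
    #include <stdexcept>
    #include <string>
    #include <vector>

    // Stand-in for the dictionary list hard-coded in ChineseDict::files_.
    struct DictConfig {
        std::string dir;
        std::vector<std::string> files = {"jieba.dict.utf8", "hmm_model.utf8",
                                          "user.dict.utf8", "idf.utf8",
                                          "stop_words.utf8"};
    };

    // Probe every dictionary file up front so a missing file surfaces as a
    // recoverable exception instead of failing inside the one-shot static
    // construction of the tokenizer.
    void validateDict(const DictConfig& cfg) {
        for (const auto& f : cfg.files) {
            const std::string path = cfg.dir + "/" + f;
            if (!std::ifstream(path).good()) {
                throw std::runtime_error("chinese tokenizer dict file not found: " + path);
            }
        }
    }

    int main() {
        try {
            validateDict({"/nonexistent/dict/dir"});
        } catch (const std::exception& e) {
            std::cerr << e.what() << '\n';  // caller can recover; process survives
        }
    }

One caveat remains in the patched header itself: getInstance(const ChineseDict* dict = nullptr) dereferences dict without a null check, so correctness still depends on init() always running before the first tokenization.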