This is an automated email from the ASF dual-hosted git repository. jianliangqi pushed a commit to branch clucene in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push: new a23a45e6e1 [Fix](analyzer) add reader ownership for chinese and standard analyzer (#223) a23a45e6e1 is described below commit a23a45e6e1846a8e82194a94f1678e006d638c31 Author: airborne12 <airborn...@gmail.com> AuthorDate: Thu Jun 13 10:54:12 2024 +0800 [Fix](analyzer) add reader ownership for chinese and standard analyzer (#223) --- src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp | 2 +- src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp | 4 +++- src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h | 2 +- src/core/CLucene/analysis/standard95/StandardAnalyzer.h | 3 ++- src/core/CLucene/analysis/standard95/StandardTokenizer.h | 4 +++- 5 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp index 6adfcf1e34..2f2af354d5 100644 --- a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp +++ b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp @@ -121,7 +121,7 @@ TokenStream *LanguageBasedAnalyzer::tokenStream(const TCHAR *fieldName, Reader * if (_tcscmp(lang, _T("cjk")) == 0) { ret = _CLNEW CL_NS2(analysis, cjk)::CJKTokenizer(reader); } else if (_tcscmp(lang, _T("chinese")) == 0) { - ret = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader, mode, Analyzer::_lowercase); + ret = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader, mode, Analyzer::_lowercase, Analyzer::_ownReader); } else { CL_NS(util)::BufferedReader* bufferedReader = reader->__asBufferedReader(); diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp index ef46315ff5..3aa5e32a60 100644 --- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp +++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp @@ -10,11 +10,13 @@ CL_NS_USE(util) 
ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode m) : Tokenizer(reader), mode(m) { reset(reader); Tokenizer::lowercase = false; + Tokenizer::ownReader = false; } -ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode m, bool lowercase) : Tokenizer(reader), mode(m) { +ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode m, bool lowercase, bool ownReader) : Tokenizer(reader), mode(m) { reset(reader); Tokenizer::lowercase = lowercase; + Tokenizer::ownReader = ownReader; } void ChineseTokenizer::init(const ChineseDict* chineseDict) { diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h index 09760b7b1c..b973aabc1d 100644 --- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h +++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h @@ -56,7 +56,7 @@ private: public: // Constructor explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode); - explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode, bool lowercase); + explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode, bool lowercase, bool ownReader=false); static void init(const ChineseDict* chineseDict); // Destructor diff --git a/src/core/CLucene/analysis/standard95/StandardAnalyzer.h b/src/core/CLucene/analysis/standard95/StandardAnalyzer.h index ccfd1030e1..60764abb41 100644 --- a/src/core/CLucene/analysis/standard95/StandardAnalyzer.h +++ b/src/core/CLucene/analysis/standard95/StandardAnalyzer.h @@ -8,6 +8,7 @@ class StandardAnalyzer : public Analyzer { public: StandardAnalyzer() : Analyzer() { _lowercase = true; + _ownReader = false; _stopwords = nullptr; } @@ -15,7 +16,7 @@ class StandardAnalyzer : public Analyzer { TokenStream* tokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) override { - return _CLNEW StandardTokenizer(reader, _lowercase, _stopwords); + return _CLNEW 
StandardTokenizer(reader, _lowercase, _stopwords, _ownReader); } TokenStream* reusableTokenStream(const TCHAR* fieldName, diff --git a/src/core/CLucene/analysis/standard95/StandardTokenizer.h b/src/core/CLucene/analysis/standard95/StandardTokenizer.h index 431673f00e..62c8b2d0ad 100644 --- a/src/core/CLucene/analysis/standard95/StandardTokenizer.h +++ b/src/core/CLucene/analysis/standard95/StandardTokenizer.h @@ -23,13 +23,15 @@ class StandardTokenizer : public Tokenizer { : Tokenizer(in) { scanner_ = std::make_unique<StandardTokenizerImpl>(in); Tokenizer::lowercase = true; + Tokenizer::ownReader = false; Tokenizer::stopwords = nullptr; } - StandardTokenizer(lucene::util::Reader* in, bool lowercase, std::unordered_set<std::string_view>* stopwords) + StandardTokenizer(lucene::util::Reader* in, bool lowercase, std::unordered_set<std::string_view>* stopwords, bool ownReader=false) : Tokenizer(in) { scanner_ = std::make_unique<StandardTokenizerImpl>(in); Tokenizer::lowercase = lowercase; Tokenizer::stopwords = stopwords; + Tokenizer::ownReader = ownReader; } Token* next(Token* token) override { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org