This is an automated email from the ASF dual-hosted git repository. airborne pushed a commit to branch clucene-2.0 in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene-2.0 by this push: new cb7245de50 [Pick 2.0](analyzer) add reader ownership for chinese and standard analyzer (#225) cb7245de50 is described below commit cb7245de50a05ff7e99c888673138775e4f59491 Author: airborne12 <airborn...@gmail.com> AuthorDate: Wed Jun 19 11:49:58 2024 +0800 [Pick 2.0](analyzer) add reader ownership for chinese and standard analyzer (#225) * [Fix](analyzer) add ownership flag to Field's TokenStream value and Analyzer's Reader (#222) * [Fix](analyzer) add reader ownership for chinese and standard analyzer (#223) --- .../CLucene/analysis/LanguageBasedAnalyzer.cpp | 2 +- .../CLucene/analysis/jieba/ChineseTokenizer.cpp | 4 +++- .../CLucene/analysis/jieba/ChineseTokenizer.h | 2 +- src/core/CLucene/analysis/AnalysisHeader.h | 16 +++++++++++++--- src/core/CLucene/analysis/Analyzers.cpp | 4 +++- src/core/CLucene/analysis/Analyzers.h | 5 +++-- src/core/CLucene/analysis/standard95/StandardAnalyzer.h | 3 ++- src/core/CLucene/analysis/standard95/StandardTokenizer.h | 4 +++- src/core/CLucene/document/Field.cpp | 8 ++++++-- src/core/CLucene/document/Field.h | 3 ++- 10 files changed, 37 insertions(+), 14 deletions(-) diff --git a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp index 6adfcf1e34..2f2af354d5 100644 --- a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp +++ b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp @@ -121,7 +121,7 @@ TokenStream *LanguageBasedAnalyzer::tokenStream(const TCHAR *fieldName, Reader * if (_tcscmp(lang, _T("cjk")) == 0) { ret = _CLNEW CL_NS2(analysis, cjk)::CJKTokenizer(reader); } else if (_tcscmp(lang, _T("chinese")) == 0) { - ret = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader, mode, Analyzer::_lowercase); + ret = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader, mode, Analyzer::_lowercase, Analyzer::_ownReader); } else { CL_NS(util)::BufferedReader* 
bufferedReader = reader->__asBufferedReader(); diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp index ef46315ff5..3aa5e32a60 100644 --- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp +++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp @@ -10,11 +10,13 @@ CL_NS_USE(util) ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode m) : Tokenizer(reader), mode(m) { reset(reader); Tokenizer::lowercase = false; + Tokenizer::ownReader = false; } -ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode m, bool lowercase) : Tokenizer(reader), mode(m) { +ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode m, bool lowercase, bool ownReader) : Tokenizer(reader), mode(m) { reset(reader); Tokenizer::lowercase = lowercase; + Tokenizer::ownReader = ownReader; } void ChineseTokenizer::init(const ChineseDict* chineseDict) { diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h index 09760b7b1c..b973aabc1d 100644 --- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h +++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h @@ -56,7 +56,7 @@ private: public: // Constructor explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode); - explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode, bool lowercase); + explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode, bool lowercase, bool ownReader=false); static void init(const ChineseDict* chineseDict); // Destructor diff --git a/src/core/CLucene/analysis/AnalysisHeader.h b/src/core/CLucene/analysis/AnalysisHeader.h index 74aca5a5b6..8235eb5e6f 100644 --- a/src/core/CLucene/analysis/AnalysisHeader.h +++ b/src/core/CLucene/analysis/AnalysisHeader.h @@ -10,6 +10,7 @@ #include "CLucene/index/Payload.h" #include 
"CLucene/util/VoidList.h" #include "CLucene/LuceneThreads.h" +#include "CLucene/util/CLStreams.h" #include <unordered_set> @@ -304,6 +305,10 @@ public: _stopwords = stopwords; } + virtual void set_ownReader(bool ownReader) { + _ownReader = ownReader; + } + private: DEFINE_MUTEX(THIS_LOCK) @@ -322,6 +327,7 @@ protected: virtual void setPreviousTokenStream(TokenStream* obj); bool _lowercase = false; + bool _ownReader = false; std::unordered_set<std::string_view>* _stopwords = nullptr; public: @@ -359,19 +365,23 @@ protected: /** The text source for this Tokenizer. */ CL_NS(util)::Reader* input; bool lowercase = false; + bool ownReader = false; std::unordered_set<std::string_view>* stopwords = nullptr; public: /** Construct a tokenizer with null input. */ Tokenizer():input(nullptr){} /** Construct a token stream processing the given input. */ - explicit Tokenizer(CL_NS(util)::Reader* _input):input(_input){} + explicit Tokenizer(CL_NS(util)::Reader* _input, bool _ownReader = false):input(_input), ownReader(_ownReader){} /** By default, closes the input Reader. */ virtual void close() { if (input != NULL) { - // ? 
delete input; - input = NULL; + if (ownReader) { + _CLDELETE(input); + } else { + input = NULL; + } } }; diff --git a/src/core/CLucene/analysis/Analyzers.cpp b/src/core/CLucene/analysis/Analyzers.cpp index 05a1c9e6f8..fde703e44a 100644 --- a/src/core/CLucene/analysis/Analyzers.cpp +++ b/src/core/CLucene/analysis/Analyzers.cpp @@ -51,11 +51,13 @@ template class LowerCaseTokenizer<TCHAR>; template<typename T> SimpleTokenizer<T>::SimpleTokenizer(CL_NS(util)::Reader *in) : LowerCaseTokenizer<T>(in) { Tokenizer::lowercase = true; + Tokenizer::ownReader = false; } template<typename T> -SimpleTokenizer<T>::SimpleTokenizer(CL_NS(util)::Reader *in, bool lowercase) : LowerCaseTokenizer<T>(in) { +SimpleTokenizer<T>::SimpleTokenizer(CL_NS(util)::Reader *in, bool lowercase, bool ownReader) : LowerCaseTokenizer<T>(in) { Tokenizer::lowercase = lowercase; + Tokenizer::ownReader = ownReader; } template<typename T> diff --git a/src/core/CLucene/analysis/Analyzers.h b/src/core/CLucene/analysis/Analyzers.h index a06263cfcf..22231ef5e4 100644 --- a/src/core/CLucene/analysis/Analyzers.h +++ b/src/core/CLucene/analysis/Analyzers.h @@ -140,7 +140,7 @@ class CLUCENE_EXPORT SimpleTokenizer:public LowerCaseTokenizer<T> { public: /** Construct a new SimpleTokenizer. 
*/ explicit SimpleTokenizer(CL_NS(util)::Reader* in); - SimpleTokenizer(CL_NS(util)::Reader* in, bool lowercase); + SimpleTokenizer(CL_NS(util)::Reader* in, bool lowercase, bool ownReader = false); virtual ~SimpleTokenizer(); Token* next(Token* token) override { @@ -182,12 +182,13 @@ class CLUCENE_EXPORT SimpleAnalyzer: public Analyzer { public: SimpleAnalyzer(){ _lowercase = true; + _ownReader = false; } bool isSDocOpt() override { return true; } TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) override{ - return _CLNEW SimpleTokenizer<T>(reader, _lowercase); + return _CLNEW SimpleTokenizer<T>(reader, _lowercase, _ownReader); } TokenStream* reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) override{ if (tokenizer_ == nullptr) { diff --git a/src/core/CLucene/analysis/standard95/StandardAnalyzer.h b/src/core/CLucene/analysis/standard95/StandardAnalyzer.h index ccfd1030e1..60764abb41 100644 --- a/src/core/CLucene/analysis/standard95/StandardAnalyzer.h +++ b/src/core/CLucene/analysis/standard95/StandardAnalyzer.h @@ -8,6 +8,7 @@ class StandardAnalyzer : public Analyzer { public: StandardAnalyzer() : Analyzer() { _lowercase = true; + _ownReader = false; _stopwords = nullptr; } @@ -15,7 +16,7 @@ class StandardAnalyzer : public Analyzer { TokenStream* tokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) override { - return _CLNEW StandardTokenizer(reader, _lowercase, _stopwords); + return _CLNEW StandardTokenizer(reader, _lowercase, _stopwords, _ownReader); } TokenStream* reusableTokenStream(const TCHAR* fieldName, diff --git a/src/core/CLucene/analysis/standard95/StandardTokenizer.h b/src/core/CLucene/analysis/standard95/StandardTokenizer.h index 431673f00e..62c8b2d0ad 100644 --- a/src/core/CLucene/analysis/standard95/StandardTokenizer.h +++ b/src/core/CLucene/analysis/standard95/StandardTokenizer.h @@ -23,13 +23,15 @@ class StandardTokenizer : public Tokenizer { : Tokenizer(in) { scanner_ = 
std::make_unique<StandardTokenizerImpl>(in); Tokenizer::lowercase = true; + Tokenizer::lowercase = false; Tokenizer::stopwords = nullptr; } - StandardTokenizer(lucene::util::Reader* in, bool lowercase, std::unordered_set<std::string_view>* stopwords) + StandardTokenizer(lucene::util::Reader* in, bool lowercase, std::unordered_set<std::string_view>* stopwords, bool ownReader=false) : Tokenizer(in) { scanner_ = std::make_unique<StandardTokenizerImpl>(in); Tokenizer::lowercase = lowercase; Tokenizer::stopwords = stopwords; + Tokenizer::ownReader = ownReader; } Token* next(Token* token) override { diff --git a/src/core/CLucene/document/Field.cpp b/src/core/CLucene/document/Field.cpp index 13bdf54d3d..5ec73be7e0 100644 --- a/src/core/CLucene/document/Field.cpp +++ b/src/core/CLucene/document/Field.cpp @@ -176,7 +176,8 @@ void Field::setValue(ValueArray<uint8_t>* value) { } /** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */ -void Field::setValue(CL_NS(analysis)::TokenStream* value) { +void Field::setValue(CL_NS(analysis)::TokenStream* value, bool own_stream) { + ownStream = own_stream; _resetValue(); fieldsData = value; valueType = VALUE_TOKENSTREAM; @@ -340,7 +341,10 @@ void Field::_resetValue() { } else if (valueType & VALUE_BINARY) { ValueArray<uint8_t>* v = static_cast<ValueArray<uint8_t>*>(fieldsData); _CLDELETE(v); - } + } else if (valueType & VALUE_TOKENSTREAM && ownStream) { + auto* v = static_cast<CL_NS(analysis)::TokenStream*>(fieldsData); + _CLDELETE(v); + } valueType=VALUE_NONE; } const char* Field::getObjectName() const{ diff --git a/src/core/CLucene/document/Field.h b/src/core/CLucene/document/Field.h index 23c0ad17f5..eac8043999 100644 --- a/src/core/CLucene/document/Field.h +++ b/src/core/CLucene/document/Field.h @@ -305,7 +305,7 @@ public: void setValue(CL_NS(util)::ValueArray<uint8_t>* value) ; /** Expert: change the value of this field. See <a href="#setValue(TCHAR*)">setValue(TCHAR*)</a>. 
*/ - void setValue(CL_NS(analysis)::TokenStream* value); + void setValue(CL_NS(analysis)::TokenStream* value, bool own_stream = false); //void setValue(CL_NS(analysis)::STokenStream* value); @@ -334,6 +334,7 @@ protected: float_t boost; IndexVersion indexVersion_ = IndexVersion::kV1; + bool ownStream = false; }; CL_NS_END #endif --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org For additional commands, e-mail: commits-help@doris.apache.org