This is an automated email from the ASF dual-hosted git repository. jianliangqi pushed a commit to branch clucene in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push: new 25324632ba [Fix](analyzer) add ownership flag to Field's TokenStream value and Analyzer's Reader (#222) 25324632ba is described below commit 25324632babc0e5da28048ebbe9adcbdfc73c281 Author: airborne12 <airborn...@gmail.com> AuthorDate: Wed Jun 12 10:37:05 2024 +0800 [Fix](analyzer) add ownership flag to Field's TokenStream value and Analyzer's Reader (#222) --- src/core/CLucene/analysis/AnalysisHeader.h | 16 +++++++++++++--- src/core/CLucene/analysis/Analyzers.cpp | 4 +++- src/core/CLucene/analysis/Analyzers.h | 5 +++-- src/core/CLucene/document/Field.cpp | 8 ++++++-- src/core/CLucene/document/Field.h | 3 ++- 5 files changed, 27 insertions(+), 9 deletions(-) diff --git a/src/core/CLucene/analysis/AnalysisHeader.h b/src/core/CLucene/analysis/AnalysisHeader.h index fe10e396bc..e1528d2f94 100644 --- a/src/core/CLucene/analysis/AnalysisHeader.h +++ b/src/core/CLucene/analysis/AnalysisHeader.h @@ -10,6 +10,7 @@ #include "CLucene/index/Payload.h" #include "CLucene/util/VoidList.h" #include "CLucene/LuceneThreads.h" +#include "CLucene/util/CLStreams.h" #include <unordered_set> @@ -304,6 +305,10 @@ public: _stopwords = stopwords; } + virtual void set_ownReader(bool ownReader) { + _ownReader = ownReader; + } + private: DEFINE_MUTEX(THIS_LOCK) @@ -322,6 +327,7 @@ protected: virtual void setPreviousTokenStream(TokenStream* obj); bool _lowercase = false; + bool _ownReader = false; std::unordered_set<std::string_view>* _stopwords = nullptr; public: @@ -359,19 +365,23 @@ protected: /** The text source for this Tokenizer. */ CL_NS(util)::Reader* input; bool lowercase = false; + bool ownReader = false; std::unordered_set<std::string_view>* stopwords = nullptr; public: /** Construct a tokenizer with null input. */ Tokenizer():input(nullptr){} /** Construct a token stream processing the given input. */ - explicit Tokenizer(CL_NS(util)::Reader* _input):input(_input){} + explicit Tokenizer(CL_NS(util)::Reader* _input, bool _ownReader = false):input(_input), ownReader(_ownReader){} /** By default, closes the input Reader. */ virtual void close() { if (input != NULL) { - // ? delete input; - input = NULL; + if (ownReader) { + _CLDELETE(input); + } else { + input = NULL; + } } }; diff --git a/src/core/CLucene/analysis/Analyzers.cpp b/src/core/CLucene/analysis/Analyzers.cpp index 05a1c9e6f8..fde703e44a 100644 --- a/src/core/CLucene/analysis/Analyzers.cpp +++ b/src/core/CLucene/analysis/Analyzers.cpp @@ -51,11 +51,13 @@ template class LowerCaseTokenizer<TCHAR>; template<typename T> SimpleTokenizer<T>::SimpleTokenizer(CL_NS(util)::Reader *in) : LowerCaseTokenizer<T>(in) { Tokenizer::lowercase = true; + Tokenizer::ownReader = false; } template<typename T> -SimpleTokenizer<T>::SimpleTokenizer(CL_NS(util)::Reader *in, bool lowercase) : LowerCaseTokenizer<T>(in) { +SimpleTokenizer<T>::SimpleTokenizer(CL_NS(util)::Reader *in, bool lowercase, bool ownReader) : LowerCaseTokenizer<T>(in) { Tokenizer::lowercase = lowercase; + Tokenizer::ownReader = ownReader; } template<typename T> diff --git a/src/core/CLucene/analysis/Analyzers.h b/src/core/CLucene/analysis/Analyzers.h index a06263cfcf..22231ef5e4 100644 --- a/src/core/CLucene/analysis/Analyzers.h +++ b/src/core/CLucene/analysis/Analyzers.h @@ -140,7 +140,7 @@ class CLUCENE_EXPORT SimpleTokenizer:public LowerCaseTokenizer<T> { public: /** Construct a new SimpleTokenizer. */ explicit SimpleTokenizer(CL_NS(util)::Reader* in); - SimpleTokenizer(CL_NS(util)::Reader* in, bool lowercase); + SimpleTokenizer(CL_NS(util)::Reader* in, bool lowercase, bool ownReader = false); virtual ~SimpleTokenizer(); Token* next(Token* token) override { @@ -182,12 +182,13 @@ class CLUCENE_EXPORT SimpleAnalyzer: public Analyzer { public: SimpleAnalyzer(){ _lowercase = true; + _ownReader = false; } bool isSDocOpt() override { return true; } TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) override{ - return _CLNEW SimpleTokenizer<T>(reader, _lowercase); + return _CLNEW SimpleTokenizer<T>(reader, _lowercase, _ownReader); } TokenStream* reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) override{ if (tokenizer_ == nullptr) { diff --git a/src/core/CLucene/document/Field.cpp b/src/core/CLucene/document/Field.cpp index 13bdf54d3d..5ec73be7e0 100644 --- a/src/core/CLucene/document/Field.cpp +++ b/src/core/CLucene/document/Field.cpp @@ -176,7 +176,8 @@ void Field::setValue(ValueArray<uint8_t>* value) { } /** Expert: change the value of this field. See <a href="#setValue(java.lang.String)">setValue(String)</a>. */ -void Field::setValue(CL_NS(analysis)::TokenStream* value) { +void Field::setValue(CL_NS(analysis)::TokenStream* value, bool own_stream) { + ownStream = own_stream; _resetValue(); fieldsData = value; valueType = VALUE_TOKENSTREAM; @@ -340,7 +341,10 @@ void Field::_resetValue() { } else if (valueType & VALUE_BINARY) { ValueArray<uint8_t>* v = static_cast<ValueArray<uint8_t>*>(fieldsData); _CLDELETE(v); - } + } else if (valueType & VALUE_TOKENSTREAM && ownStream) { + auto* v = static_cast<CL_NS(analysis)::TokenStream*>(fieldsData); + _CLDELETE(v); + } valueType=VALUE_NONE; } const char* Field::getObjectName() const{ diff --git a/src/core/CLucene/document/Field.h b/src/core/CLucene/document/Field.h index 23c0ad17f5..eac8043999 100644 --- a/src/core/CLucene/document/Field.h +++ b/src/core/CLucene/document/Field.h @@ -305,7 +305,7 @@ public: void setValue(CL_NS(util)::ValueArray<uint8_t>* value) ; /** Expert: change the value of this field. See <a href="#setValue(TCHAR*)">setValue(TCHAR*)</a>. */ - void setValue(CL_NS(analysis)::TokenStream* value); + void setValue(CL_NS(analysis)::TokenStream* value, bool own_stream = false); //void setValue(CL_NS(analysis)::STokenStream* value); @@ -334,6 +334,7 @@ protected: float_t boost; IndexVersion indexVersion_ = IndexVersion::kV1; + bool ownStream = false; }; CL_NS_END #endif --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org