This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
     new d95d6be9 [Feature](tokenizer) add lowercase option for tokenizer (#157)
d95d6be9 is described below

commit d95d6be91ecd4e471306caa57b580ba548605962
Author: airborne12 <airborn...@gmail.com>
AuthorDate: Wed Dec 20 14:43:55 2023 +0800

    [Feature](tokenizer) add lowercase option for tokenizer (#157)
---
 src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp |  5 +++--
 .../CLucene/analysis/jieba/ChineseTokenizer.cpp             | 13 +++++++++++++
 src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h  |  1 +
 src/core/CLucene/analysis/AnalysisHeader.h                  |  7 +++++++
 src/core/CLucene/analysis/Analyzers.cpp                     | 12 +++++++++++-
 src/core/CLucene/analysis/Analyzers.h                       | 13 ++++++++-----
 src/core/CLucene/analysis/standard95/StandardAnalyzer.h     |  5 +++--
 src/core/CLucene/analysis/standard95/StandardTokenizer.h    | 13 ++++++++++---
 8 files changed, 56 insertions(+), 13 deletions(-)

diff --git a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
index 23de239d..0bc03443 100644
--- a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
+++ b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp
@@ -31,6 +31,7 @@ LanguageBasedAnalyzer::LanguageBasedAnalyzer(const TCHAR *language, bool stem, A
     _tcsncpy(lang, language, 100);
     this->stem = stem;
     this->mode = mode;
+    Analyzer::_lowercase = false;
 }
 
 LanguageBasedAnalyzer::~LanguageBasedAnalyzer() {
@@ -78,7 +79,7 @@ TokenStream *LanguageBasedAnalyzer::reusableTokenStream(const TCHAR * /*fieldNam
             streams->filteredTokenStream = _CLNEW StopFilter(streams->tokenStream, true, stopSet);
         } else if (_tcscmp(lang, _T("chinese")) == 0) {
-            streams->tokenStream = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader, mode);
+            streams->tokenStream = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader, mode, Analyzer::_lowercase);
             streams->filteredTokenStream = streams->tokenStream;
         } else {
             CL_NS(util)::BufferedReader* bufferedReader = reader->__asBufferedReader();
@@ -111,7 +112,7 @@ TokenStream *LanguageBasedAnalyzer::tokenStream(const TCHAR *fieldName, Reader *
     if (_tcscmp(lang, _T("cjk")) == 0) {
         ret = _CLNEW CL_NS2(analysis, cjk)::CJKTokenizer(reader);
     } else if (_tcscmp(lang, _T("chinese")) == 0) {
-        ret = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader, mode);
+        ret = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader, mode, Analyzer::_lowercase);
     } else {
         CL_NS(util)::BufferedReader* bufferedReader = reader->__asBufferedReader();
diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
index 2fd6f0a3..9a7f5edd 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
+++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp
@@ -9,6 +9,12 @@ CL_NS_USE(util)
 
 ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode m) : Tokenizer(reader), mode(m) {
     reset(reader);
+    Tokenizer::lowercase = false;
+}
+
+ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode m, bool lowercase) : Tokenizer(reader), mode(m) {
+    reset(reader);
+    Tokenizer::lowercase = lowercase;
 }
 
 void ChineseTokenizer::init(const std::string &dictPath) {
@@ -22,6 +28,13 @@ CL_NS(analysis)::Token *ChineseTokenizer::next(lucene::analysis::Token *token) {
         std::string_view& token_text = tokens_text[bufferIndex++];
         size_t size = std::min(token_text.size(), static_cast<size_t>(LUCENE_MAX_WORD_LEN));
+        if (Tokenizer::lowercase) {
+            if (!token_text.empty() && token_text[0] < 0x80) {
+                std::transform(token_text.begin(), token_text.end(),
+                               const_cast<char*>(token_text.data()),
+                               [](char c) { return to_lower(c); });
+            }
+        }
         token->setNoCopy(token_text.data(), 0, size);
         return token;
     }
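For illustration only (not part of this commit): the guard above lowercases a token only when its first byte is ASCII, so the multi-byte UTF-8 terms that jieba emits pass through unchanged. A minimal standalone sketch of that rule, using std::tolower in place of CLucene's to_lower and a hypothetical lowercase_ascii_token helper:

    #include <algorithm>
    #include <cctype>
    #include <iostream>
    #include <string>

    // Lowercase in place, but only for tokens that start with an ASCII byte;
    // a CJK token's first byte is >= 0x80, so it is left untouched.
    static void lowercase_ascii_token(std::string& token) {
        // The unsigned cast keeps the comparison well-defined when char is
        // signed; the committed code writes `token_text[0] < 0x80` directly.
        if (!token.empty() && static_cast<unsigned char>(token[0]) < 0x80) {
            std::transform(token.begin(), token.end(), token.begin(),
                           [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
        }
    }

    int main() {
        std::string ascii = "Doris";
        std::string cjk = "\xE4\xB8\xAD\xE6\x96\x87"; // UTF-8 bytes of a CJK word
        lowercase_ascii_token(ascii); // -> "doris"
        lowercase_ascii_token(cjk);   // unchanged: first byte is 0xE4
        std::cout << ascii << "\n" << cjk << "\n";
    }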
diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
index 9bd34fb7..9fe33f58 100644
--- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
+++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h
@@ -45,6 +45,7 @@ private:
 public:
     // Constructor
     explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode);
+    explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode, bool lowercase);
     static void init(const std::string& dictPath="");
 
     // Destructor
diff --git a/src/core/CLucene/analysis/AnalysisHeader.h b/src/core/CLucene/analysis/AnalysisHeader.h
index 46ab0020..578d8e00 100644
--- a/src/core/CLucene/analysis/AnalysisHeader.h
+++ b/src/core/CLucene/analysis/AnalysisHeader.h
@@ -293,6 +293,10 @@ public:
     * performance.
     */
     virtual TokenStream* reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
+
+    virtual void set_lowercase(bool lowercase) {
+        _lowercase = lowercase;
+    }
 
 private:
     DEFINE_MUTEX(THIS_LOCK)
@@ -309,6 +313,8 @@ protected:
     * to save a TokenStream for later re-use by the same
     * thread. */
     virtual void setPreviousTokenStream(TokenStream* obj);
+    bool _lowercase = false;
+
 public:
     /**
     * Invoked before indexing a Field instance if
@@ -343,6 +349,7 @@ class CLUCENE_EXPORT Tokenizer:public TokenStream {
 protected:
     /** The text source for this Tokenizer. */
     CL_NS(util)::Reader* input;
+    bool lowercase = false;
 
 public:
     /** Construct a tokenizer with null input. */
diff --git a/src/core/CLucene/analysis/Analyzers.cpp b/src/core/CLucene/analysis/Analyzers.cpp
index 3ea3e8b0..05a1c9e6 100644
--- a/src/core/CLucene/analysis/Analyzers.cpp
+++ b/src/core/CLucene/analysis/Analyzers.cpp
@@ -50,6 +50,12 @@ template class LowerCaseTokenizer<TCHAR>;
 
 template<typename T>
 SimpleTokenizer<T>::SimpleTokenizer(CL_NS(util)::Reader *in) : LowerCaseTokenizer<T>(in) {
+    Tokenizer::lowercase = true;
+}
+
+template<typename T>
+SimpleTokenizer<T>::SimpleTokenizer(CL_NS(util)::Reader *in, bool lowercase) : LowerCaseTokenizer<T>(in) {
+    Tokenizer::lowercase = lowercase;
 }
 
 template<typename T>
@@ -86,7 +92,11 @@ Token *SimpleTokenizer<char>::next(Token *token) {
             if (length == 0)// start of token
                 start = offset - 1;
 
-            buffer[length++] = to_lower(c); // buffer it, normalized
+            if (lowercase) {
+                buffer[length++] = to_lower(c); // buffer it, normalized
+            } else {
+                buffer[length++] = c; // buffer it, normalized
+            }
 
             if (length == LUCENE_MAX_WORD_LEN)// buffer overflow!
                 break;
diff --git a/src/core/CLucene/analysis/Analyzers.h b/src/core/CLucene/analysis/Analyzers.h
index 432dde01..a06263cf 100644
--- a/src/core/CLucene/analysis/Analyzers.h
+++ b/src/core/CLucene/analysis/Analyzers.h
@@ -138,8 +138,9 @@ protected:
 template<typename T>
 class CLUCENE_EXPORT SimpleTokenizer:public LowerCaseTokenizer<T> {
 public:
-    /** Construct a new SimpleTokenizer. */
-    SimpleTokenizer(CL_NS(util)::Reader* in);
+    /** Construct a new SimpleTokenizer. */
+    explicit SimpleTokenizer(CL_NS(util)::Reader* in);
+    SimpleTokenizer(CL_NS(util)::Reader* in, bool lowercase);
     virtual ~SimpleTokenizer();
 
     Token* next(Token* token) override {
@@ -179,16 +180,18 @@ public:
 template <typename T>
 class CLUCENE_EXPORT SimpleAnalyzer: public Analyzer {
 public:
-    SimpleAnalyzer(){}
+    SimpleAnalyzer(){
+        _lowercase = true;
+    }
 
     bool isSDocOpt() override { return true; }
 
     TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) override{
-        return _CLNEW SimpleTokenizer<T>(reader);
+        return _CLNEW SimpleTokenizer<T>(reader, _lowercase);
     }
 
     TokenStream* reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) override{
         if (tokenizer_ == nullptr) {
-            tokenizer_ = new SimpleTokenizer<T>(reader);
+            tokenizer_ = new SimpleTokenizer<T>(reader, _lowercase);
         } else {
            tokenizer_->reset(reader);
         }
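For illustration only (not part of this commit): with the pieces above, an analyzer now carries a _lowercase flag that set_lowercase() flips and that is forwarded to the tokenizer it builds. A usage sketch under assumed include paths and a caller-supplied Reader; SimpleAnalyzer defaults the flag to true, so the call below opts out:

    #include "CLucene/analysis/Analyzers.h"  // assumed include path

    using namespace lucene::analysis;

    void demo(lucene::util::Reader* reader) {
        SimpleAnalyzer<char> analyzer;   // constructor now sets _lowercase = true
        analyzer.set_lowercase(false);   // new Analyzer virtual: keep original case
        TokenStream* ts = analyzer.tokenStream(_T("body"), reader);
        Token t;
        while (ts->next(&t) != nullptr) {
            // tokens keep their casing: SimpleTokenizer<char>::next() buffers
            // `c` verbatim instead of to_lower(c) when the flag is off
        }
        _CLDELETE(ts);
    }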
diff --git a/src/core/CLucene/analysis/standard95/StandardAnalyzer.h b/src/core/CLucene/analysis/standard95/StandardAnalyzer.h
index 7e29eec8..7460c811 100644
--- a/src/core/CLucene/analysis/standard95/StandardAnalyzer.h
+++ b/src/core/CLucene/analysis/standard95/StandardAnalyzer.h
@@ -6,17 +6,18 @@ namespace lucene::analysis::standard95 {
 
 class StandardAnalyzer : public Analyzer {
  public:
+  StandardAnalyzer() : Analyzer() { _lowercase = true; }
   bool isSDocOpt() override { return true; }
 
   TokenStream* tokenStream(const TCHAR* fieldName,
                            lucene::util::Reader* reader) override {
-    return _CLNEW StandardTokenizer(reader, useStopWords_);
+    return _CLNEW StandardTokenizer(reader, useStopWords_, _lowercase);
   }
 
   TokenStream* reusableTokenStream(const TCHAR* fieldName,
                                    lucene::util::Reader* reader) override {
     if (tokenizer_ == nullptr) {
-      tokenizer_ = new StandardTokenizer(reader, useStopWords_);
+      tokenizer_ = new StandardTokenizer(reader, useStopWords_, _lowercase);
     } else {
       tokenizer_->reset(reader);
     }
diff --git a/src/core/CLucene/analysis/standard95/StandardTokenizer.h b/src/core/CLucene/analysis/standard95/StandardTokenizer.h
index 67403ae8..1aac8671 100644
--- a/src/core/CLucene/analysis/standard95/StandardTokenizer.h
+++ b/src/core/CLucene/analysis/standard95/StandardTokenizer.h
@@ -22,6 +22,12 @@ class StandardTokenizer : public Tokenizer {
   StandardTokenizer(lucene::util::Reader* in, bool useStopWords)
       : Tokenizer(in), useStopWords_(useStopWords) {
     scanner_ = std::make_unique<StandardTokenizerImpl>(in);
+    Tokenizer::lowercase = true;
+  }
+  StandardTokenizer(lucene::util::Reader* in, bool useStopWords, bool lowercase)
+      : Tokenizer(in), useStopWords_(useStopWords) {
+    scanner_ = std::make_unique<StandardTokenizerImpl>(in);
+    Tokenizer::lowercase = lowercase;
   }
 
   Token* next(Token* token) override {
@@ -37,9 +43,10 @@ class StandardTokenizer : public Tokenizer {
       if (scanner_->yylength() <= maxTokenLength) {
         std::string_view term = scanner_->getText();
         if (tokenType == StandardTokenizerImpl::WORD_TYPE) {
-          std::transform(term.begin(), term.end(),
-                         const_cast<char*>(term.data()),
-                         [](char c) { return to_lower(c); });
+          if (Tokenizer::lowercase) {
+            std::transform(term.begin(), term.end(), const_cast<char*>(term.data()),
+                           [](char c) { return to_lower(c); });
+          }
           if (useStopWords_ && stop_words.count(term)) {
             skippedPositions++;
             continue;
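For illustration only (not part of this commit): StandardAnalyzer keeps lowercasing on by default, while the Chinese path defaults the other way; LanguageBasedAnalyzer's constructor sets _lowercase = false, so callers opt in. A sketch assuming the contribs-lib include path and defaulted stem/mode constructor arguments (both are assumptions, not shown in this diff):

    #include "CLucene/analysis/LanguageBasedAnalyzer.h"  // assumed include path

    using namespace lucene::analysis;

    void chinese_tokens(lucene::util::Reader* reader) {
        LanguageBasedAnalyzer analyzer(_T("chinese"));  // ctor sets _lowercase = false
        analyzer.set_lowercase(true);                   // opt in to case folding
        TokenStream* ts = analyzer.tokenStream(_T("content"), reader);
        // ChineseTokenizer(reader, mode, /*lowercase=*/true) is built internally:
        // ASCII terms come back folded, CJK terms are untouched.
        _CLDELETE(ts);
    }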