This is an automated email from the ASF dual-hosted git repository. jianliangqi pushed a commit to branch clucene in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push: new ba5731e8 [Feature] add chinese analyzer mode for Jieba (#76) ba5731e8 is described below commit ba5731e8e49c1bd98cc5a70304832c4ba4afe87d Author: airborne12 <airborn...@gmail.com> AuthorDate: Fri May 26 19:07:58 2023 +0800 [Feature] add chinese analyzer mode for Jieba (#76) --- .../CLucene/analysis/LanguageBasedAnalyzer.cpp | 97 ++++++++------ .../CLucene/analysis/LanguageBasedAnalyzer.h | 27 +++- .../CLucene/analysis/jieba/ChineseTokenizer.cpp | 21 ++- .../CLucene/analysis/jieba/ChineseTokenizer.h | 13 +- src/test/contribs-lib/analysis/testChinese.cpp | 148 ++++++++++++++++++++- 5 files changed, 256 insertions(+), 50 deletions(-) diff --git a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp index 2db2e941..8d7d8674 100644 --- a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp +++ b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.cpp @@ -22,15 +22,23 @@ CL_NS_USE2(analysis, snowball) CL_NS_DEF(analysis) -LanguageBasedAnalyzer::LanguageBasedAnalyzer(const TCHAR *language, bool stem) { +LanguageBasedAnalyzer::LanguageBasedAnalyzer(const TCHAR *language, bool stem, AnalyzerMode mode) { + stopSet = _CLNEW CLTCSetList; + if (language == NULL) _tcsncpy(lang, LUCENE_BLANK_STRING, 100); else _tcsncpy(lang, language, 100); this->stem = stem; + this->mode = mode; } LanguageBasedAnalyzer::~LanguageBasedAnalyzer() = default; + +void LanguageBasedAnalyzer::setStopWords(const TCHAR** stopwords) { + StopFilter::fillStopTable(stopSet, stopwords); +} + void LanguageBasedAnalyzer::setLanguage(const TCHAR *language) { _tcsncpy(lang, language, 100); } @@ -39,6 +47,10 @@ void LanguageBasedAnalyzer::setStem(bool s) { this->stem = s; } +void LanguageBasedAnalyzer::setMode(AnalyzerMode m) { + this->mode = m; +} + void LanguageBasedAnalyzer::initDict(const std::string &dictPath) { if (_tcscmp(lang, _T("chinese")) == 0) { 
CL_NS2(analysis, jieba)::ChineseTokenizer::init(dictPath); @@ -46,65 +58,68 @@ void LanguageBasedAnalyzer::initDict(const std::string &dictPath) { } TokenStream *LanguageBasedAnalyzer::reusableTokenStream(const TCHAR * /*fieldName*/, CL_NS(util)::Reader *reader) { - TokenStream *tokenizer = getPreviousTokenStream(); - if (tokenizer == nullptr) { + SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream()); + + if (streams == nullptr) { + streams = _CLNEW SavedStreams(); if (_tcscmp(lang, _T("cjk")) == 0) { - tokenizer = _CLNEW CL_NS2(analysis, cjk)::CJKTokenizer(reader); + streams->tokenStream = _CLNEW CL_NS2(analysis, cjk)::CJKTokenizer(reader); + streams->filteredTokenStream = + _CLNEW StopFilter(streams->tokenStream, true, stopSet); } else if (_tcscmp(lang, _T("chinese")) == 0) { - tokenizer = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader); + streams->tokenStream = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader, mode); + streams->filteredTokenStream = + _CLNEW StopFilter(streams->tokenStream, true, stopSet); } else { - BufferedReader *bufferedReader = reader->__asBufferedReader(); - if (bufferedReader == NULL) - tokenizer = _CLNEW StandardTokenizer(_CLNEW FilteredBufferedReader(reader, false), true); - else - tokenizer = _CLNEW StandardTokenizer(bufferedReader); - - tokenizer = _CLNEW StandardFilter(tokenizer, true); - - if (stem) - tokenizer = _CLNEW SnowballFilter(tokenizer, lang, true);//todo: should check whether snowball supports the language - - if (stem) //hmm... this could be configured seperately from stem - tokenizer = _CLNEW ISOLatin1AccentFilter(tokenizer, true);//todo: this should really only be applied to latin languages... 
- - //lower case after the latin1 filter - tokenizer = _CLNEW LowerCaseFilter(tokenizer, true); + CL_NS(util)::BufferedReader* bufferedReader = reader->__asBufferedReader(); + + if (bufferedReader == nullptr) { + streams->tokenStream = _CLNEW StandardTokenizer( + _CLNEW CL_NS(util)::FilteredBufferedReader(reader, false), true); + } else { + streams->tokenStream = _CLNEW StandardTokenizer(bufferedReader); + } + + streams->filteredTokenStream = _CLNEW StandardFilter(streams->tokenStream, true); + if (stem) { + streams->filteredTokenStream = _CLNEW SnowballFilter( streams->filteredTokenStream, lang, true);//todo: should check whether snowball supports the language + } + streams->filteredTokenStream = + _CLNEW LowerCaseFilter(streams->filteredTokenStream, true); + streams->filteredTokenStream = + _CLNEW StopFilter(streams->filteredTokenStream, true, stopSet); } - setPreviousTokenStream(tokenizer); + setPreviousTokenStream(streams); } else { - auto t = dynamic_cast<Tokenizer *>(tokenizer); - if (t != nullptr) { - t->reset(reader); - } + streams->tokenStream->reset(reader); } - return tokenizer; + + return streams->filteredTokenStream; } TokenStream *LanguageBasedAnalyzer::tokenStream(const TCHAR *fieldName, Reader *reader) { - TokenStream *ret = NULL; + TokenStream *ret = nullptr; if (_tcscmp(lang, _T("cjk")) == 0) { ret = _CLNEW CL_NS2(analysis, cjk)::CJKTokenizer(reader); } else if (_tcscmp(lang, _T("chinese")) == 0) { - ret = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader); + ret = _CLNEW CL_NS2(analysis, jieba)::ChineseTokenizer(reader, mode); } else { - BufferedReader *bufferedReader = reader->__asBufferedReader(); - if (bufferedReader == NULL) - ret = _CLNEW StandardTokenizer(_CLNEW FilteredBufferedReader(reader, false), true); - else + CL_NS(util)::BufferedReader* bufferedReader = reader->__asBufferedReader(); + + if (bufferedReader == nullptr) { + ret = _CLNEW StandardTokenizer( + _CLNEW CL_NS(util)::FilteredBufferedReader(reader, false), true); + } 
else { ret = _CLNEW StandardTokenizer(bufferedReader); + } ret = _CLNEW StandardFilter(ret, true); - - if (stem) + if (stem) { ret = _CLNEW SnowballFilter(ret, lang, true);//todo: should check whether snowball supports the language - - if (stem) //hmm... this could be configured seperately from stem - ret = _CLNEW ISOLatin1AccentFilter(ret, true);//todo: this should really only be applied to latin languages... - - //lower case after the latin1 filter + } ret = _CLNEW LowerCaseFilter(ret, true); } - //todo: could add a stop filter based on the language - need to fix the stoplist loader first + ret = _CLNEW StopFilter(ret, true, stopSet); return ret; } diff --git a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.h b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.h index 22fc3dd9..7c07a882 100644 --- a/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.h +++ b/src/contribs-lib/CLucene/analysis/LanguageBasedAnalyzer.h @@ -11,15 +11,40 @@ CL_NS_DEF(analysis) +enum class AnalyzerMode { + Default, + All, + Search +}; + class CLUCENE_CONTRIBS_EXPORT LanguageBasedAnalyzer : public CL_NS(analysis)::Analyzer { + class SavedStreams : public TokenStream { + public: + Tokenizer* tokenStream; + TokenStream* filteredTokenStream; + + SavedStreams():tokenStream(NULL), filteredTokenStream(NULL) + { + } + + void close(){} + Token* next(Token* token) {return NULL;} + }; + /** + * Contains the stopwords used with the StopFilter. 
+ */ + CL_NS(analysis)::CLTCSetList* stopSet; TCHAR lang[100]{}; bool stem; + AnalyzerMode mode{}; public: - explicit LanguageBasedAnalyzer(const TCHAR *language = nullptr, bool stem = true); + explicit LanguageBasedAnalyzer(const TCHAR *language = nullptr, bool stem = true, AnalyzerMode mode = AnalyzerMode::All); ~LanguageBasedAnalyzer() override; + void setStopWords(const TCHAR** stopwords); void setLanguage(const TCHAR *language); void setStem(bool s); + void setMode(AnalyzerMode m); void initDict(const std::string &dictPath); TokenStream *tokenStream(const TCHAR *fieldName, CL_NS(util)::Reader *reader) override; TokenStream *reusableTokenStream(const TCHAR * /*fieldName*/, CL_NS(util)::Reader *reader) override; diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp index 0d44e376..ca371958 100644 --- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp +++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp @@ -7,7 +7,7 @@ CL_NS_DEF2(analysis, jieba) CL_NS_USE(analysis) CL_NS_USE(util) -ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader) : Tokenizer(reader) { +ChineseTokenizer::ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode m) : Tokenizer(reader), mode(m) { buffer[0] = 0; } @@ -22,7 +22,7 @@ CL_NS(analysis)::Token *ChineseTokenizer::next(lucene::analysis::Token *token) { int totalLen = 0; do { auto bufferLen = input->read((const void**)&ioBuffer, 1, LUCENE_IO_BUFFER_SIZE); - if (bufferLen == -1) { + if (bufferLen <= 0) { dataLen = 0; bufferIndex = 0; break; @@ -35,17 +35,26 @@ CL_NS(analysis)::Token *ChineseTokenizer::next(lucene::analysis::Token *token) { char tmp_buffer[4 * totalLen]; lucene_wcsntoutf8(tmp_buffer, initBuffer, totalLen, 4 * totalLen); - JiebaSingleton::getInstance().Cut(tmp_buffer, tokens_text, true); + switch (mode) { + case AnalyzerMode::Search: + JiebaSingleton::getInstance().CutForSearch(tmp_buffer, tokens_text, 
true); + break; + case AnalyzerMode::All: + JiebaSingleton::getInstance().CutAll(tmp_buffer, tokens_text); + break; + case AnalyzerMode::Default: + JiebaSingleton::getInstance().Cut(tmp_buffer, tokens_text, true); + break; + } dataLen = tokens_text.size(); } if (bufferIndex < dataLen) { - auto token_text = tokens_text[bufferIndex]; - bufferIndex++; + auto token_text = tokens_text[bufferIndex++]; lucene_utf8towcs(buffer, token_text.c_str(), LUCENE_MAX_WORD_LEN); auto length = _tcslen(buffer); token->set(buffer, 0, length); return token; } - return NULL; + return nullptr; } CL_NS_END2 \ No newline at end of file diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h index e642be86..276b138b 100644 --- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h +++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.h @@ -7,6 +7,8 @@ #include "Jieba.hpp" #include "CLucene/analysis/AnalysisHeader.h" +#include "CLucene/analysis/LanguageBasedAnalyzer.h" + CL_NS_DEF2(analysis,jieba) @@ -27,6 +29,7 @@ private: class ChineseTokenizer : public lucene::analysis::Tokenizer { private: + AnalyzerMode mode{}; /** word offset, used to imply which character(in ) is parsed */ int32_t offset{}; @@ -52,7 +55,7 @@ private: public: // Constructor - explicit ChineseTokenizer(lucene::util::Reader *reader); + explicit ChineseTokenizer(lucene::util::Reader *reader, AnalyzerMode mode); static void init(const std::string& dictPath=""); // Destructor @@ -60,6 +63,14 @@ public: // Override the next method to tokenize Chinese text using Jieba lucene::analysis::Token* next(lucene::analysis::Token* token) override; + + void reset(lucene::util::Reader *reader) override { + this->input = reader; + this->offset = 0; + this->bufferIndex = 0; + this->dataLen = 0; + this->tokens_text.clear(); + } }; CL_NS_END2 diff --git a/src/test/contribs-lib/analysis/testChinese.cpp b/src/test/contribs-lib/analysis/testChinese.cpp 
index 3f54d537..c4210e4d 100644 --- a/src/test/contribs-lib/analysis/testChinese.cpp +++ b/src/test/contribs-lib/analysis/testChinese.cpp @@ -149,6 +149,142 @@ std::string get_dict_path() { return ""; } +void testSimpleJiebaSearchModeTokenizer2(CuTest* tc) { + LanguageBasedAnalyzer a; + CL_NS(util)::StringReader reader(_T("冰咒龙")); + reader.mark(50); + TokenStream* ts; + Token t; + + //test with chinese + a.setLanguage(_T("chinese")); + a.setStem(false); + a.setMode(lucene::analysis::AnalyzerMode::Search); + a.initDict(get_dict_path()); + ts = a.tokenStream(_T("contents"), &reader); + + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("冰咒")) == 0); + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("龙")) == 0); + CLUCENE_ASSERT(ts->next(&t) == NULL); + _CLDELETE(ts); +} + +void testSimpleJiebaAllModeTokenizer2(CuTest* tc) { + LanguageBasedAnalyzer a; + CL_NS(util)::StringReader reader(_T("冰咒龙")); + reader.mark(50); + TokenStream* ts; + Token t; + + //test with chinese + a.setLanguage(_T("chinese")); + a.setStem(false); + a.setMode(lucene::analysis::AnalyzerMode::All); + a.initDict(get_dict_path()); + ts = a.tokenStream(_T("contents"), &reader); + + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("冰")) == 0); + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("咒")) == 0); + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("龙")) == 0); + CLUCENE_ASSERT(ts->next(&t) == NULL); + _CLDELETE(ts); +} + +void testSimpleJiebaAllModeTokenizer(CuTest* tc) { + LanguageBasedAnalyzer a; + CL_NS(util)::StringReader reader(_T("我来到北京清华大学")); + reader.mark(50); + TokenStream* ts; + Token t; + + //test with chinese + a.setLanguage(_T("chinese")); + a.setStem(false); + a.setMode(lucene::analysis::AnalyzerMode::All); + a.initDict(get_dict_path()); + ts = a.tokenStream(_T("contents"), 
&reader); + + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("我")) == 0); + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("来到")) == 0); + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("北京")) == 0); + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("清华")) == 0); + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("清华大学")) == 0); + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("华大")) == 0); + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("大学")) == 0); + CLUCENE_ASSERT(ts->next(&t) == NULL); + _CLDELETE(ts); +} + +void testSimpleJiebaDefaultModeTokenizer(CuTest* tc) { + LanguageBasedAnalyzer a; + CL_NS(util)::StringReader reader(_T("我来到北京清华大学")); + reader.mark(50); + TokenStream* ts; + Token t; + + //test with chinese + a.setLanguage(_T("chinese")); + a.setStem(false); + a.setMode(lucene::analysis::AnalyzerMode::Default); + a.initDict(get_dict_path()); + ts = a.tokenStream(_T("contents"), &reader); + + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("我")) == 0); + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("来到")) == 0); + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("北京")) == 0); + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("清华大学")) == 0); + CLUCENE_ASSERT(ts->next(&t) == NULL); + _CLDELETE(ts); +} + +void testSimpleJiebaSearchModeTokenizer(CuTest* tc) { + LanguageBasedAnalyzer a; + CL_NS(util)::StringReader reader(_T("我来到北京清华大学")); + reader.mark(50); + TokenStream* ts; + Token t; + + //test with chinese + a.setLanguage(_T("chinese")); + a.setStem(false); + 
a.setMode(lucene::analysis::AnalyzerMode::Search); + a.initDict(get_dict_path()); + ts = a.tokenStream(_T("contents"), &reader); + + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("我")) == 0); + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("来到")) == 0); + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("北京")) == 0); + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("清华")) == 0); + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("华大")) == 0); + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("大学")) == 0); + CLUCENE_ASSERT(ts->next(&t) != NULL); + CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("清华大学")) == 0); + CLUCENE_ASSERT(ts->next(&t) == NULL); + _CLDELETE(ts); +} + void testSimpleJiebaTokenizer(CuTest* tc) { LanguageBasedAnalyzer a; CL_NS(util)::StringReader reader(_T("我爱你中国")); @@ -159,6 +295,7 @@ void testSimpleJiebaTokenizer(CuTest* tc) { //test with chinese a.setLanguage(_T("chinese")); a.setStem(false); + a.setMode(lucene::analysis::AnalyzerMode::Default); a.initDict(get_dict_path()); ts = a.tokenStream(_T("contents"), &reader); @@ -180,6 +317,7 @@ void testSimpleJiebaTokenizer2(CuTest* tc) { //test with chinese a.setLanguage(_T("chinese")); a.setStem(false); + a.setMode(lucene::analysis::AnalyzerMode::Default); ts = a.tokenStream(_T("contents"), &reader); CLUCENE_ASSERT(ts->next(&t) != NULL); @@ -208,6 +346,7 @@ void testSimpleJiebaTokenizer3(CuTest* tc) { //test with chinese a.setLanguage(_T("chinese")); a.setStem(false); + a.setMode(lucene::analysis::AnalyzerMode::Default); ts = a.tokenStream(_T("contents"), &reader); CLUCENE_ASSERT(ts->next(&t) != NULL); @@ -316,7 +455,7 @@ void testJiebaMatch(CuTest* tc) { auto analyzer = _CLNEW lucene::analysis::LanguageBasedAnalyzer(); analyzer->setLanguage(L"chinese"); 
- + analyzer->setMode(lucene::analysis::AnalyzerMode::Default); IndexWriter w(&dir, analyzer, true); auto field_name = lucene::util::Misc::_charToWide("chinese"); @@ -395,6 +534,7 @@ void testJiebaMatch2(CuTest* tc) { auto analyzer = _CLNEW lucene::analysis::LanguageBasedAnalyzer(); analyzer->setLanguage(L"chinese"); + analyzer->setMode(lucene::analysis::AnalyzerMode::Default); IndexWriter w(&dir, analyzer, true); auto field_name = lucene::util::Misc::_charToWide("chinese"); @@ -474,6 +614,7 @@ void testJiebaMatchHuge(CuTest* tc) { auto analyzer = _CLNEW lucene::analysis::LanguageBasedAnalyzer(); analyzer->setLanguage(L"chinese"); + analyzer->setMode(lucene::analysis::AnalyzerMode::Default); analyzer->initDict(get_dict_path()); IndexWriter w(&dir, analyzer, true); @@ -1127,6 +1268,11 @@ CuSuite *testchinese(void) { SUITE_ADD_TEST(suite, testJiebaMatch); SUITE_ADD_TEST(suite, testJiebaMatch2); SUITE_ADD_TEST(suite, testJiebaMatchHuge); + SUITE_ADD_TEST(suite, testSimpleJiebaAllModeTokenizer); + SUITE_ADD_TEST(suite, testSimpleJiebaDefaultModeTokenizer); + SUITE_ADD_TEST(suite, testSimpleJiebaSearchModeTokenizer); + SUITE_ADD_TEST(suite, testSimpleJiebaAllModeTokenizer2); + SUITE_ADD_TEST(suite, testSimpleJiebaSearchModeTokenizer2); return suite; } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org