This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 6ac0bfeceb [Feature](inverted index) add unicode parser for inverted index (#21035) 6ac0bfeceb is described below commit 6ac0bfecebc12da011bb63ee1f1ae3f445f108ee Author: airborne12 <airborn...@gmail.com> AuthorDate: Wed Jun 21 20:14:06 2023 +0800 [Feature](inverted index) add unicode parser for inverted index (#21035) --- be/src/clucene | 2 +- be/src/olap/inverted_index_parser.cpp | 6 +++++- be/src/olap/inverted_index_parser.h | 2 ++ .../olap/rowset/segment_v2/inverted_index_reader.cpp | 5 +++++ .../olap/rowset/segment_v2/inverted_index_writer.cpp | 14 ++++++++++---- docs/en/docs/data-table/index/inverted-index.md | 16 +++++++++------- docs/zh-CN/docs/data-table/index/inverted-index.md | 19 ++++++++++--------- .../org/apache/doris/analysis/InvertedIndexUtil.java | 6 ++++-- .../data/inverted_index_p0/test_chinese_analyzer.out | 6 ------ .../inverted_index_p0/test_chinese_analyzer.groovy | 2 -- 10 files changed, 46 insertions(+), 32 deletions(-) diff --git a/be/src/clucene b/be/src/clucene index 60f5eab7ac..103e88a8a3 160000 --- a/be/src/clucene +++ b/be/src/clucene @@ -1 +1 @@ -Subproject commit 60f5eab7ac6294493a2e7e290297000c3c39875c +Subproject commit 103e88a8a3b24da9ae2a0d9908a3ceb3f7808a61 diff --git a/be/src/olap/inverted_index_parser.cpp b/be/src/olap/inverted_index_parser.cpp index d1e04e9df1..b0ab8c9d1a 100644 --- a/be/src/olap/inverted_index_parser.cpp +++ b/be/src/olap/inverted_index_parser.cpp @@ -27,6 +27,8 @@ std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_ return INVERTED_INDEX_PARSER_NONE; case InvertedIndexParserType::PARSER_STANDARD: return INVERTED_INDEX_PARSER_STANDARD; + case InvertedIndexParserType::PARSER_UNICODE: + return INVERTED_INDEX_PARSER_UNICODE; case InvertedIndexParserType::PARSER_ENGLISH: return INVERTED_INDEX_PARSER_ENGLISH; case InvertedIndexParserType::PARSER_CHINESE: @@ -44,6 +46,8 @@ InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::st return InvertedIndexParserType::PARSER_NONE; } else if (parser_str_lower == INVERTED_INDEX_PARSER_STANDARD) { return InvertedIndexParserType::PARSER_STANDARD; + } else if (parser_str_lower == INVERTED_INDEX_PARSER_UNICODE) { + return InvertedIndexParserType::PARSER_UNICODE; } else if (parser_str_lower == INVERTED_INDEX_PARSER_ENGLISH) { return InvertedIndexParserType::PARSER_ENGLISH; } else if (parser_str_lower == INVERTED_INDEX_PARSER_CHINESE) { @@ -67,7 +71,7 @@ std::string get_parser_mode_string_from_properties( if (properties.find(INVERTED_INDEX_PARSER_MODE_KEY) != properties.end()) { return properties.at(INVERTED_INDEX_PARSER_MODE_KEY); } else { - return INVERTED_INDEX_PARSER_FINE_GRANULARITY; + return INVERTED_INDEX_PARSER_COARSE_GRANULARITY; } } diff --git a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h index 87e2ef991a..eb4c414308 100644 --- a/be/src/olap/inverted_index_parser.h +++ b/be/src/olap/inverted_index_parser.h @@ -29,6 +29,7 @@ enum class InvertedIndexParserType { PARSER_STANDARD = 2, PARSER_ENGLISH = 3, PARSER_CHINESE = 4, + PARSER_UNICODE = 5, }; struct InvertedIndexCtx { @@ -46,6 +47,7 @@ const std::string INVERTED_INDEX_PARSER_KEY = "parser"; const std::string INVERTED_INDEX_PARSER_UNKNOWN = "unknown"; const std::string INVERTED_INDEX_PARSER_NONE = "none"; const std::string INVERTED_INDEX_PARSER_STANDARD = "standard"; +const std::string INVERTED_INDEX_PARSER_UNICODE = "unicode"; const std::string INVERTED_INDEX_PARSER_ENGLISH = "english"; const std::string INVERTED_INDEX_PARSER_CHINESE = "chinese"; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index f198cea229..c311bcec8f 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -103,6 +103,11 @@ std::vector<std::wstring> InvertedIndexReader::get_analyse_result( analyzer = std::make_shared<lucene::analysis::standard::StandardAnalyzer>(); reader.reset( (new lucene::util::StringReader(std::wstring(value.begin(), value.end()).c_str()))); + } else if (analyser_type == InvertedIndexParserType::PARSER_UNICODE) { + analyzer = std::make_shared<lucene::analysis::standard::StandardAnalyzer>(); + reader.reset(new lucene::util::SimpleInputStreamReader( + new lucene::util::AStringReader(value.c_str()), + lucene::util::SimpleInputStreamReader::UTF8)); } else if (analyser_type == InvertedIndexParserType::PARSER_CHINESE) { auto chinese_analyzer = std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false); diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 8e1dfc0b05..ab5d3548df 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -154,7 +154,8 @@ public: _doc = std::make_unique<lucene::document::Document>(); _dir.reset(DorisCompoundDirectory::getDirectory(_fs, index_path.c_str(), true)); - if (_parser_type == InvertedIndexParserType::PARSER_STANDARD) { + if (_parser_type == InvertedIndexParserType::PARSER_STANDARD || + _parser_type == InvertedIndexParserType::PARSER_UNICODE) { _analyzer = std::make_unique<lucene::analysis::standard::StandardAnalyzer>(); } else if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH) { _analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>(); @@ -163,10 +164,10 @@ public: chinese_analyzer->setLanguage(L"chinese"); chinese_analyzer->initDict(config::inverted_index_dict_path); auto mode = get_parser_mode_string_from_properties(_index_meta->properties()); - if (mode == INVERTED_INDEX_PARSER_COARSE_GRANULARITY) { - chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default); - } else { + if (mode == INVERTED_INDEX_PARSER_FINE_GRANULARITY) { chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All); + } else { + chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default); } _analyzer.reset(chinese_analyzer); } else { @@ -222,6 +223,11 @@ public: if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH || _parser_type == InvertedIndexParserType::PARSER_CHINESE) { new_char_token_stream(field_value_data, field_value_size, _field); + } else if (_parser_type == InvertedIndexParserType::PARSER_UNICODE) { + auto stringReader = _CLNEW lucene::util::SimpleInputStreamReader( + new lucene::util::AStringReader(field_value_data, field_value_size), + lucene::util::SimpleInputStreamReader::UTF8); + _field->setValue(stringReader); } else { new_field_value(field_value_data, field_value_size, _field); } diff --git a/docs/en/docs/data-table/index/inverted-index.md b/docs/en/docs/data-table/index/inverted-index.md index 6b3ad7a647..57216d8ad4 100644 --- a/docs/en/docs/data-table/index/inverted-index.md +++ b/docs/en/docs/data-table/index/inverted-index.md @@ -52,7 +52,7 @@ The features for inverted index is as follows: - add fulltext search on text(string, varchar, char) field - MATCH_ALL matches all keywords, MATCH_ANY matches any keywords - support fulltext on array of text field - - support english and chinese word parser + - support english, chinese and mixed unicode word parser - accelerate normal equal, range query, replacing bitmap index in the future - suport =, !=, >, >=, <, <= on text, numeric, datetime types - suport =, !=, >, >=, <, <= on array of text, numeric, datetime types @@ -74,10 +74,12 @@ The features for inverted index is as follows: - missing stands for no parser, the whole field is considered to be a term - "english" stands for english parser - "chinese" stands for chinese parser + - "unicode" stands for mixed-type word segmentation suitable for situations with a mix of Chinese and English. It can segment email prefixes and suffixes, IP addresses, and mixed characters and numbers, and can also segment Chinese characters into 1-gram. + - "parser_mode" is utilized to set the tokenizer/parser type for Chinese word segmentation. - in "fine_grained" mode, the system will meticulously tokenize each possible segment. - in "coarse_grained" mode, the system follows the maximization principle, performing accurate and comprehensive tokenization. - - default mode is "fine_grained". + - default mode is "coarse_grained". - "support_phrase" is utilized to specify if the index requires support for phrase mode. - "true" indicates that support is needed. - "false" indicates that support is not needed. @@ -88,10 +90,10 @@ The features for inverted index is as follows: CREATE TABLE table_name ( columns_difinition, - INDEX idx_name1(column_name1) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment'] - INDEX idx_name2(column_name2) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment'] + INDEX idx_name1(column_name1) USING INVERTED [PROPERTIES("parser" = "english|chinese|unicode")] [COMMENT 'your comment'] + INDEX idx_name2(column_name2) USING INVERTED [PROPERTIES("parser" = "english|chinese|unicode")] [COMMENT 'your comment'] INDEX idx_name3(column_name3) USING INVERTED [PROPERTIES("parser" = "chinese", "parser_mode" = "fine_grained|coarse_grained")] [COMMENT 'your comment'] - INDEX idx_name4(column_name4) USING INVERTED [PROPERTIES("parser" = "english|chinese", "support_phrase" = "true|false")] [COMMENT 'your comment'] + INDEX idx_name4(column_name4) USING INVERTED [PROPERTIES("parser" = "english|chinese|unicode", "support_phrase" = "true|false")] [COMMENT 'your comment'] ) table_properties; ``` @@ -99,9 +101,9 @@ table_properties; - add an inverted index to existed table ```sql -- syntax 1 -CREATE INDEX idx_name ON table_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment']; +CREATE INDEX idx_name ON table_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|chinese|unicode")] [COMMENT 'your comment']; -- syntax 2 -ALTER TABLE table_name ADD INDEX idx_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment']; +ALTER TABLE table_name ADD INDEX idx_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|chinese|unicode")] [COMMENT 'your comment']; ``` - drop an inverted index diff --git a/docs/zh-CN/docs/data-table/index/inverted-index.md b/docs/zh-CN/docs/data-table/index/inverted-index.md index 768e29f856..3ac4992519 100644 --- a/docs/zh-CN/docs/data-table/index/inverted-index.md +++ b/docs/zh-CN/docs/data-table/index/inverted-index.md @@ -52,7 +52,7 @@ Doris倒排索引的功能简要介绍如下: - 增加了字符串类型的全文检索 - 支持字符串全文检索,包括同时匹配多个关键字MATCH_ALL、匹配任意一个关键字MATCH_ANY、匹配短语词组MATCH_PHRASE - 支持字符串数组类型的全文检索 - - 支持英文、中文分词 + - 支持英文、中文以及混合类型分词 - 加速普通等值、范围查询,覆盖bitmap索引的功能,未来会代替bitmap索引 - 支持字符串、数值、日期时间类型的 =, !=, >, >=, <, <= 快速过滤 - 支持字符串、数字、日期时间数组类型的 =, !=, >, >=, <, <= @@ -72,11 +72,12 @@ Doris倒排索引的功能简要介绍如下: - parser指定分词器 - 默认不指定代表不分词 - english是英文分词,适合被索引列是英文的情况,用空格和标点符号分词,性能高 - - chinese是中文分词,适合被索引列有中文或者中英文混合的情况,采用jieba分词库,性能比english分词低 + - chinese是中文分词,适合被索引列有中文或者中英文混合的情况,性能比english分词低 + - unicode是混合类型分词,适用于中英文混合的情况。它能够对邮箱前缀和后缀、IP地址以及字符数字混合进行分词,并且可以对中文字符进行1-gram分词。 - parser_mode用于指定中文分词的模式 - fine_grained模式,系统将对可以进行分词的部分都进行详尽的分词处理 - coarse_grained模式,系统则依据最大化原则,执行精确且全面的分词操作 - - 默认find_grained模式 + - 默认coarse_grained模式 - support_phrase用于指定索引是否需要支持短语模式 - true为需要 - false为不需要 @@ -87,10 +88,10 @@ Doris倒排索引的功能简要介绍如下: CREATE TABLE table_name ( columns_difinition, - INDEX idx_name1(column_name1) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment'] - INDEX idx_name2(column_name2) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment'] + INDEX idx_name1(column_name1) USING INVERTED [PROPERTIES("parser" = "english|unicode|chinese")] [COMMENT 'your comment'] + INDEX idx_name2(column_name2) USING INVERTED [PROPERTIES("parser" = "english|unicode|chinese")] [COMMENT 'your comment'] INDEX idx_name3(column_name3) USING INVERTED [PROPERTIES("parser" = "chinese", "parser_mode" = "fine_grained|coarse_grained")] [COMMENT 'your comment'] - INDEX idx_name4(column_name4) USING INVERTED [PROPERTIES("parser" = "english|chinese", "support_phrase" = "true|false")] [COMMENT 'your comment'] + INDEX idx_name4(column_name4) USING INVERTED [PROPERTIES("parser" = "english|unicode|chinese", "support_phrase" = "true|false")] [COMMENT 'your comment'] ) table_properties; ``` @@ -98,9 +99,9 @@ table_properties; - 已有表增加倒排索引 ```sql -- 语法1 -CREATE INDEX idx_name ON table_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment']; +CREATE INDEX idx_name ON table_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|unicode|chinese")] [COMMENT 'your comment']; -- 语法2 -ALTER TABLE table_name ADD INDEX idx_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment']; +ALTER TABLE table_name ADD INDEX idx_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|unicode|chinese")] [COMMENT 'your comment']; ``` - 删除倒排索引 @@ -149,7 +150,7 @@ USE test_inverted_index; -- 创建表的同时创建了comment的倒排索引idx_comment -- USING INVERTED 指定索引类型是倒排索引 --- PROPERTIES("parser" = "english") 指定采用english分词,还支持"chinese"中文分词,如果不指定"parser"参数表示不分词 +-- PROPERTIES("parser" = "english") 指定采用english分词,还支持"chinese"中文分词和"unicode"中英文混合分词,如果不指定"parser"参数表示不分词 CREATE TABLE hackernews_1m ( `id` BIGINT, diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java index 8b8c57b95b..294f71dff6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java @@ -28,6 +28,7 @@ public class InvertedIndexUtil { public static String INVERTED_INDEX_PARSER_UNKNOWN = "unknown"; public static String INVERTED_INDEX_PARSER_NONE = "none"; public static String INVERTED_INDEX_PARSER_STANDARD = "standard"; + public static String INVERTED_INDEX_PARSER_UNICODE = "unicode"; public static String INVERTED_INDEX_PARSER_ENGLISH = "english"; public static String INVERTED_INDEX_PARSER_CHINESE = "chinese"; @@ -53,8 +54,9 @@ public class InvertedIndexUtil { if (colType.isStringType()) { if (!(parser.equals(INVERTED_INDEX_PARSER_NONE) || parser.equals(INVERTED_INDEX_PARSER_STANDARD) - || parser.equals(INVERTED_INDEX_PARSER_ENGLISH) - || parser.equals(INVERTED_INDEX_PARSER_CHINESE))) { + || parser.equals(INVERTED_INDEX_PARSER_UNICODE) + || parser.equals(INVERTED_INDEX_PARSER_ENGLISH) + || parser.equals(INVERTED_INDEX_PARSER_CHINESE))) { throw new AnalysisException("INVERTED index parser: " + parser + " is invalid for column: " + indexColName + " of type " + colType); } diff --git a/regression-test/data/inverted_index_p0/test_chinese_analyzer.out b/regression-test/data/inverted_index_p0/test_chinese_analyzer.out index 71489df784..dfb1cd2ccb 100644 --- a/regression-test/data/inverted_index_p0/test_chinese_analyzer.out +++ b/regression-test/data/inverted_index_p0/test_chinese_analyzer.out @@ -11,18 +11,12 @@ -- !sql -- 1 我来到北京清华大学 --- !sql -- -1 我来到北京清华大学 - -- !sql -- 3 人民可以得到更多实惠 -- !sql -- 2 我爱你中国 --- !sql -- -1 我来到北京清华大学 - -- !sql -- -- !sql -- diff --git a/regression-test/suites/inverted_index_p0/test_chinese_analyzer.groovy b/regression-test/suites/inverted_index_p0/test_chinese_analyzer.groovy index f779e0bfce..1acf8ffa6a 100644 --- a/regression-test/suites/inverted_index_p0/test_chinese_analyzer.groovy +++ b/regression-test/suites/inverted_index_p0/test_chinese_analyzer.groovy @@ -48,7 +48,6 @@ suite("test_chinese_analyzer"){ sql "INSERT INTO $indexTblName VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠');" qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '我爱你' ORDER BY id;" - qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '我' ORDER BY id;" qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '清华' ORDER BY id;" qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '大学' ORDER BY id;" qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '清华大学' ORDER BY id;" @@ -74,7 +73,6 @@ suite("test_chinese_analyzer"){ sql "INSERT INTO $indexTblName2 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠');" qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '我爱你' ORDER BY id;" - qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '我' ORDER BY id;" qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '清华' ORDER BY id;" qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '大学' ORDER BY id;" qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '清华大学' ORDER BY id;" --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org