This is an automated email from the ASF dual-hosted git repository. airborne pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 7834ff5ad8b [feature](inverted index) add icu analyzer for minority language tokenization (#47289) 7834ff5ad8b is described below commit 7834ff5ad8ba3f576cae4be00967d3fdfd7cbaf3 Author: zzzxl <yangs...@selectdb.com> AuthorDate: Fri Feb 14 10:24:20 2025 +0800 [feature](inverted index) add icu analyzer for minority language tokenization (#47289) Problem Summary: This pull request introduces support for the ICU (International Components for Unicode) library in the project, enhancing the capabilities for text analysis and processing. The most important changes include adding ICU as a dependency, updating the inverted index parser to support ICU, and including new test cases for the ICU analyzer. --- be/CMakeLists.txt | 28 +++++++++ be/src/clucene | 2 +- be/src/olap/inverted_index_parser.cpp | 4 ++ be/src/olap/inverted_index_parser.h | 2 + .../inverted_index/analyzer/analyzer.cpp | 4 ++ .../apache/doris/analysis/InvertedIndexUtil.java | 8 ++- .../data/inverted_index_p0/test_icu_analyzer.out | Bin 0 -> 371 bytes .../data/inverted_index_p0/test_tokenize.out | Bin 2012 -> 2288 bytes .../inverted_index_p0/test_icu_analyzer.groovy | 51 +++++++++++++++ .../suites/inverted_index_p0/test_tokenize.groovy | 69 +++++++++++---------- 10 files changed, 131 insertions(+), 37 deletions(-) diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt index 8c209f847ce..e40b0f64089 100644 --- a/be/CMakeLists.txt +++ b/be/CMakeLists.txt @@ -204,6 +204,30 @@ set(ZLIB_ROOT "$ENV{DORIS_THIRDPARTY}/installed") set(Roaring_ROOT "$ENV{DORIS_THIRDPARTY}/installed") set(USE_STAT64 0) +set(ICU_ROOT "$ENV{DORIS_THIRDPARTY}/installed") +find_package(ICU COMPONENTS uc i18n data) + +if (ICU_FOUND) + add_library(icu INTERFACE) + + target_link_libraries(icu INTERFACE + ICU::uc + ICU::i18n + ICU::data + ) + + set(COMMON_THIRDPARTY + ${COMMON_THIRDPARTY} + icu + ) + + add_definitions(-DUSE_ICU) + + message(STATUS "ICU found and linked successfully!") +else() + message(WARNING "ICU not found! Please install ICU first.") +endif() + # disable clucene bthread supported. set(USE_BTHREAD OFF) @@ -225,6 +249,10 @@ install(DIRECTORY ${SRC_DIR}/clucene/src/contribs-lib/CLucene/analysis/jieba/dict DESTINATION ${OUTPUT_DIR}) +install(DIRECTORY + ${SRC_DIR}/clucene/src/core/CLucene/analysis/icu/data/uax29 + DESTINATION ${OUTPUT_DIR}/dict/icu) + # Check if functions are supported in this platform. All flags will generated # in gensrc/build/common/env_config.h. # You can check funcion here which depends on platform. Don't forget add this diff --git a/be/src/clucene b/be/src/clucene index 835c1ab0a39..467b1b5a5b1 160000 --- a/be/src/clucene +++ b/be/src/clucene @@ -1 +1 @@ -Subproject commit 835c1ab0a39a0e4594f7acf4fcca31a614debe8e +Subproject commit 467b1b5a5b1c736546ef77965a88f0d8948a3ded diff --git a/be/src/olap/inverted_index_parser.cpp b/be/src/olap/inverted_index_parser.cpp index f1de5a5e0c1..44b170617f1 100644 --- a/be/src/olap/inverted_index_parser.cpp +++ b/be/src/olap/inverted_index_parser.cpp @@ -34,6 +34,8 @@ std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_ return INVERTED_INDEX_PARSER_ENGLISH; case InvertedIndexParserType::PARSER_CHINESE: return INVERTED_INDEX_PARSER_CHINESE; + case InvertedIndexParserType::PARSER_ICU: + return INVERTED_INDEX_PARSER_ICU; default: return INVERTED_INDEX_PARSER_UNKNOWN; } @@ -51,6 +53,8 @@ InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::st return InvertedIndexParserType::PARSER_ENGLISH; } else if (parser_str_lower == INVERTED_INDEX_PARSER_CHINESE) { return InvertedIndexParserType::PARSER_CHINESE; + } else if (parser_str_lower == INVERTED_INDEX_PARSER_ICU) { + return InvertedIndexParserType::PARSER_ICU; } return InvertedIndexParserType::PARSER_UNKNOWN; diff --git a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h index f1f85995a20..d70cfa395f4 100644 --- a/be/src/olap/inverted_index_parser.h +++ b/be/src/olap/inverted_index_parser.h @@ -38,6 +38,7 @@ enum class InvertedIndexParserType { PARSER_ENGLISH = 3, PARSER_CHINESE = 4, PARSER_UNICODE = 5, + PARSER_ICU = 6 }; using CharFilterMap = std::map<std::string, std::string>; @@ -67,6 +68,7 @@ const std::string INVERTED_INDEX_PARSER_STANDARD = "standard"; const std::string INVERTED_INDEX_PARSER_UNICODE = "unicode"; const std::string INVERTED_INDEX_PARSER_ENGLISH = "english"; const std::string INVERTED_INDEX_PARSER_CHINESE = "chinese"; +const std::string INVERTED_INDEX_PARSER_ICU = "icu"; const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY = "support_phrase"; const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES = "true"; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp index 94ba8fce0bc..44bd24651af 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp @@ -28,6 +28,7 @@ #ifdef __clang__ #pragma clang diagnostic pop #endif +#include "CLucene/analysis/icu/ICUAnalyzer.h" #include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h" namespace doris::segment_v2::inverted_index { @@ -65,6 +66,9 @@ std::unique_ptr<lucene::analysis::Analyzer> InvertedIndexAnalyzer::create_analyz chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All); } analyzer = std::move(chinese_analyzer); + } else if (analyser_type == InvertedIndexParserType::PARSER_ICU) { + analyzer = std::make_unique<lucene::analysis::ICUAnalyzer>(); + analyzer->initDict(config::inverted_index_dict_path + "/icu"); } else { // default analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java index bb32fd24029..88ecc83337a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java @@ -36,6 +36,7 @@ public class InvertedIndexUtil { public static String INVERTED_INDEX_PARSER_UNICODE = "unicode"; public static String INVERTED_INDEX_PARSER_ENGLISH = "english"; public static String INVERTED_INDEX_PARSER_CHINESE = "chinese"; + public static String INVERTED_INDEX_PARSER_ICU = "icu"; public static String INVERTED_INDEX_PARSER_MODE_KEY = "parser_mode"; public static String INVERTED_INDEX_PARSER_FINE_GRANULARITY = "fine_grained"; @@ -141,7 +142,8 @@ public class InvertedIndexUtil { || parser.equals(INVERTED_INDEX_PARSER_STANDARD) || parser.equals(INVERTED_INDEX_PARSER_UNICODE) || parser.equals(INVERTED_INDEX_PARSER_ENGLISH) - || parser.equals(INVERTED_INDEX_PARSER_CHINESE))) { + || parser.equals(INVERTED_INDEX_PARSER_CHINESE) + || parser.equals(INVERTED_INDEX_PARSER_ICU))) { throw new AnalysisException("INVERTED index parser: " + parser + " is invalid for column: " + indexColName + " of type " + colType); } @@ -182,9 +184,9 @@ public class InvertedIndexUtil { String stopWords = properties.get(INVERTED_INDEX_PARSER_STOPWORDS_KEY); String dictCompression = properties.get(INVERTED_INDEX_DICT_COMPRESSION_KEY); - if (parser != null && !parser.matches("none|english|unicode|chinese|standard")) { + if (parser != null && !parser.matches("none|english|unicode|chinese|standard|icu")) { throw new AnalysisException("Invalid inverted index 'parser' value: " + parser - + ", parser must be none, english, unicode or chinese"); + + ", parser must be none, english, unicode, chinese or icu"); } if (!"chinese".equals(parser) && parserMode != null) { diff --git a/regression-test/data/inverted_index_p0/test_icu_analyzer.out b/regression-test/data/inverted_index_p0/test_icu_analyzer.out new file mode 100644 index 00000000000..2c5978b17e6 Binary files /dev/null and b/regression-test/data/inverted_index_p0/test_icu_analyzer.out differ diff --git a/regression-test/data/inverted_index_p0/test_tokenize.out b/regression-test/data/inverted_index_p0/test_tokenize.out index ae22daffe15..e4e271fb730 100644 Binary files a/regression-test/data/inverted_index_p0/test_tokenize.out and b/regression-test/data/inverted_index_p0/test_tokenize.out differ diff --git a/regression-test/suites/inverted_index_p0/test_icu_analyzer.groovy b/regression-test/suites/inverted_index_p0/test_icu_analyzer.groovy new file mode 100644 index 00000000000..2fa943b9ca9 --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_icu_analyzer.groovy @@ -0,0 +1,51 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +suite("test_icu_analyzer", "p0"){ + def indexTbName1 = "test_icu_analyzer" + + sql "DROP TABLE IF EXISTS ${indexTbName1}" + + sql """ + CREATE TABLE ${indexTbName1} ( + `a` int(11) NULL COMMENT "", + `b` text NULL COMMENT "", + INDEX b_idx (`b`) USING INVERTED PROPERTIES("parser" = "icu") COMMENT '', + ) ENGINE=OLAP + DUPLICATE KEY(`a`) + COMMENT "OLAP" + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql """ INSERT INTO ${indexTbName1} VALUES (1, "GET /images/hm_bg.jpg HTTP/1.0"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (2, "มนไมเปนไปตามความตองการมนมหมายเลขอยในเนอหา"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (3, "在启动新的 BE 节点前,需要先在 FE 集群中注册新的 BE 节点"); """ + + try { + sql "sync" + sql """ set enable_common_expr_pushdown = true; """ + + qt_sql """ select * from ${indexTbName1} where b match_phrase 'images hm_bg.jpg'; """ + qt_sql """ select * from ${indexTbName1} where b match_phrase 'อย ใน'; """ + qt_sql """ select * from ${indexTbName1} where b match_phrase '新的 be'; """ + } finally { + } +} \ No newline at end of file diff --git a/regression-test/suites/inverted_index_p0/test_tokenize.groovy b/regression-test/suites/inverted_index_p0/test_tokenize.groovy index 4672a39cedb..3258f5de710 100644 --- a/regression-test/suites/inverted_index_p0/test_tokenize.groovy +++ b/regression-test/suites/inverted_index_p0/test_tokenize.groovy @@ -31,17 +31,17 @@ suite("test_tokenize"){ sql "DROP TABLE IF EXISTS ${indexTblName}" // create 1 replica table sql """ - CREATE TABLE IF NOT EXISTS ${indexTblName}( - `id`int(11)NULL, - `c` text NULL, - INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="chinese") COMMENT '' - ) ENGINE=OLAP - DUPLICATE KEY(`id`) - COMMENT 'OLAP' - DISTRIBUTED BY HASH(`id`) BUCKETS 1 - PROPERTIES( - "replication_allocation" = "tag.location.default: 1" - ); + CREATE TABLE IF NOT EXISTS ${indexTblName}( + `id`int(11)NULL, + `c` text NULL, + INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="chinese") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES( + "replication_allocation" = "tag.location.default: 1" + ); """ def var_result = sql "show variables" @@ -56,17 +56,17 @@ suite("test_tokenize"){ sql "DROP TABLE IF EXISTS ${indexTblName2}" // create 1 replica table sql """ - CREATE TABLE IF NOT EXISTS ${indexTblName2}( - `id`int(11)NULL, - `c` text NULL, - INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="unicode") COMMENT '' - ) ENGINE=OLAP - DUPLICATE KEY(`id`) - COMMENT 'OLAP' - DISTRIBUTED BY HASH(`id`) BUCKETS 1 - PROPERTIES( - "replication_allocation" = "tag.location.default: 1" - ); + CREATE TABLE IF NOT EXISTS ${indexTblName2}( + `id`int(11)NULL, + `c` text NULL, + INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="unicode") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES( + "replication_allocation" = "tag.location.default: 1" + ); """ sql "INSERT INTO $indexTblName2 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠'), (4, '陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345...@qq.com,,ip是1.1.1.1,this information is created automatically.');" @@ -77,17 +77,17 @@ suite("test_tokenize"){ sql "DROP TABLE IF EXISTS ${indexTblName3}" // create 1 replica table sql """ - CREATE TABLE IF NOT EXISTS ${indexTblName3}( - `id`int(11)NULL, - `c` text NULL, - INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="unicode") COMMENT '' - ) ENGINE=OLAP - DUPLICATE KEY(`id`) - COMMENT 'OLAP' - DISTRIBUTED BY HASH(`id`) BUCKETS 1 - PROPERTIES( - "replication_allocation" = "tag.location.default: 1" - ); + CREATE TABLE IF NOT EXISTS ${indexTblName3}( + `id`int(11)NULL, + `c` text NULL, + INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="unicode") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES( + "replication_allocation" = "tag.location.default: 1" + ); """ sql "INSERT INTO $indexTblName3 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠'), (4, '陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345...@qq.com,,ip是1.1.1.1,this information is created automatically.');" @@ -109,4 +109,7 @@ suite("test_tokenize"){ throw e } } + + qt_tokenize_sql """SELECT TOKENIZE('华夏智胜新税股票A', '"parser"="icu"');""" + qt_tokenize_sql """SELECT TOKENIZE('มนไมเปนไปตามความตองการมนมหมายเลขอยในเนอหา', '"parser"="icu"');""" } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org