(doris) branch master updated: [feature](inverted index) add icu analyzer for minority language tokenization (#47289)

airborne Thu, 13 Feb 2025 18:25:40 -0800

This is an automated email from the ASF dual-hosted git repository.

airborne pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/master by this push:
     new 7834ff5ad8b [feature](inverted index) add icu analyzer for minority language tokenization (#47289)
7834ff5ad8b is described below

commit 7834ff5ad8ba3f576cae4be00967d3fdfd7cbaf3
Author: zzzxl <yangs...@selectdb.com>
AuthorDate: Fri Feb 14 10:24:20 2025 +0800

    [feature](inverted index) add icu analyzer for minority language tokenization (#47289)
    
    Problem Summary:
    This pull request introduces support for the ICU (International
    Components for Unicode) library in the project, enhancing the
    capabilities for text analysis and processing. The most important
    changes include adding ICU as a dependency, updating the inverted index
    parser to support ICU, and including new test cases for the ICU
    analyzer.
---
 be/CMakeLists.txt                                  |  28 +++++++++
 be/src/clucene                                     |   2 +-
 be/src/olap/inverted_index_parser.cpp              |   4 ++
 be/src/olap/inverted_index_parser.h                |   2 +
 .../inverted_index/analyzer/analyzer.cpp           |   4 ++
 .../apache/doris/analysis/InvertedIndexUtil.java   |   8 ++-
 .../data/inverted_index_p0/test_icu_analyzer.out   | Bin 0 -> 371 bytes
 .../data/inverted_index_p0/test_tokenize.out       | Bin 2012 -> 2288 bytes
 .../inverted_index_p0/test_icu_analyzer.groovy     |  51 +++++++++++++++
 .../suites/inverted_index_p0/test_tokenize.groovy  |  69 +++++++++++----------
 10 files changed, 131 insertions(+), 37 deletions(-)

diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
index 8c209f847ce..e40b0f64089 100644
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -204,6 +204,30 @@ set(ZLIB_ROOT "$ENV{DORIS_THIRDPARTY}/installed")
 set(Roaring_ROOT "$ENV{DORIS_THIRDPARTY}/installed")
 set(USE_STAT64 0)
 
+set(ICU_ROOT "$ENV{DORIS_THIRDPARTY}/installed")
+find_package(ICU COMPONENTS uc i18n data)
+
+if (ICU_FOUND)
+    add_library(icu INTERFACE)
+    
+    target_link_libraries(icu INTERFACE
+        ICU::uc
+        ICU::i18n
+        ICU::data
+    )
+
+    set(COMMON_THIRDPARTY
+        ${COMMON_THIRDPARTY}
+        icu
+    )
+
+    add_definitions(-DUSE_ICU)
+
+    message(STATUS "ICU found and linked successfully!")
+else()
+    message(WARNING "ICU not found! Please install ICU first.")
+endif()
+
 # disable clucene bthread supported.
 set(USE_BTHREAD OFF)
 
@@ -225,6 +249,10 @@ install(DIRECTORY
     ${SRC_DIR}/clucene/src/contribs-lib/CLucene/analysis/jieba/dict
     DESTINATION ${OUTPUT_DIR})
 
+install(DIRECTORY
+    ${SRC_DIR}/clucene/src/core/CLucene/analysis/icu/data/uax29
+    DESTINATION ${OUTPUT_DIR}/dict/icu)
+
 # Check if functions are supported in this platform. All flags will generated
 # in gensrc/build/common/env_config.h.
 # You can check funcion here which depends on platform. Don't forget add this
diff --git a/be/src/clucene b/be/src/clucene
index 835c1ab0a39..467b1b5a5b1 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 835c1ab0a39a0e4594f7acf4fcca31a614debe8e
+Subproject commit 467b1b5a5b1c736546ef77965a88f0d8948a3ded
diff --git a/be/src/olap/inverted_index_parser.cpp 
b/be/src/olap/inverted_index_parser.cpp
index f1de5a5e0c1..44b170617f1 100644
--- a/be/src/olap/inverted_index_parser.cpp
+++ b/be/src/olap/inverted_index_parser.cpp
@@ -34,6 +34,8 @@ std::string 
inverted_index_parser_type_to_string(InvertedIndexParserType parser_
         return INVERTED_INDEX_PARSER_ENGLISH;
     case InvertedIndexParserType::PARSER_CHINESE:
         return INVERTED_INDEX_PARSER_CHINESE;
+    case InvertedIndexParserType::PARSER_ICU:
+        return INVERTED_INDEX_PARSER_ICU;
     default:
         return INVERTED_INDEX_PARSER_UNKNOWN;
     }
@@ -51,6 +53,8 @@ InvertedIndexParserType 
get_inverted_index_parser_type_from_string(const std::st
         return InvertedIndexParserType::PARSER_ENGLISH;
     } else if (parser_str_lower == INVERTED_INDEX_PARSER_CHINESE) {
         return InvertedIndexParserType::PARSER_CHINESE;
+    } else if (parser_str_lower == INVERTED_INDEX_PARSER_ICU) {
+        return InvertedIndexParserType::PARSER_ICU;
     }
 
     return InvertedIndexParserType::PARSER_UNKNOWN;
diff --git a/be/src/olap/inverted_index_parser.h 
b/be/src/olap/inverted_index_parser.h
index f1f85995a20..d70cfa395f4 100644
--- a/be/src/olap/inverted_index_parser.h
+++ b/be/src/olap/inverted_index_parser.h
@@ -38,6 +38,7 @@ enum class InvertedIndexParserType {
     PARSER_ENGLISH = 3,
     PARSER_CHINESE = 4,
     PARSER_UNICODE = 5,
+    PARSER_ICU = 6
 };
 
 using CharFilterMap = std::map<std::string, std::string>;
@@ -67,6 +68,7 @@ const std::string INVERTED_INDEX_PARSER_STANDARD = "standard";
 const std::string INVERTED_INDEX_PARSER_UNICODE = "unicode";
 const std::string INVERTED_INDEX_PARSER_ENGLISH = "english";
 const std::string INVERTED_INDEX_PARSER_CHINESE = "chinese";
+const std::string INVERTED_INDEX_PARSER_ICU = "icu";
 
 const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY = "support_phrase";
 const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES = "true";
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
index 94ba8fce0bc..44bd24651af 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
@@ -28,6 +28,7 @@
 #ifdef __clang__
 #pragma clang diagnostic pop
 #endif
+#include "CLucene/analysis/icu/ICUAnalyzer.h"
 #include 
"olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h"
 
 namespace doris::segment_v2::inverted_index {
@@ -65,6 +66,9 @@ std::unique_ptr<lucene::analysis::Analyzer> 
InvertedIndexAnalyzer::create_analyz
             chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All);
         }
         analyzer = std::move(chinese_analyzer);
+    } else if (analyser_type == InvertedIndexParserType::PARSER_ICU) {
+        analyzer = std::make_unique<lucene::analysis::ICUAnalyzer>();
+        analyzer->initDict(config::inverted_index_dict_path + "/icu");
     } else {
         // default
         analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java 
b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
index bb32fd24029..88ecc83337a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
@@ -36,6 +36,7 @@ public class InvertedIndexUtil {
     public static String INVERTED_INDEX_PARSER_UNICODE = "unicode";
     public static String INVERTED_INDEX_PARSER_ENGLISH = "english";
     public static String INVERTED_INDEX_PARSER_CHINESE = "chinese";
+    public static String INVERTED_INDEX_PARSER_ICU = "icu";
 
     public static String INVERTED_INDEX_PARSER_MODE_KEY = "parser_mode";
     public static String INVERTED_INDEX_PARSER_FINE_GRANULARITY = 
"fine_grained";
@@ -141,7 +142,8 @@ public class InvertedIndexUtil {
                     || parser.equals(INVERTED_INDEX_PARSER_STANDARD)
                         || parser.equals(INVERTED_INDEX_PARSER_UNICODE)
                             || parser.equals(INVERTED_INDEX_PARSER_ENGLISH)
-                                || 
parser.equals(INVERTED_INDEX_PARSER_CHINESE))) {
+                                || parser.equals(INVERTED_INDEX_PARSER_CHINESE)
+                                    || 
parser.equals(INVERTED_INDEX_PARSER_ICU))) {
                 throw new AnalysisException("INVERTED index parser: " + parser
                     + " is invalid for column: " + indexColName + " of type " 
+ colType);
             }
@@ -182,9 +184,9 @@ public class InvertedIndexUtil {
         String stopWords = properties.get(INVERTED_INDEX_PARSER_STOPWORDS_KEY);
         String dictCompression = 
properties.get(INVERTED_INDEX_DICT_COMPRESSION_KEY);
 
-        if (parser != null && 
!parser.matches("none|english|unicode|chinese|standard")) {
+        if (parser != null && 
!parser.matches("none|english|unicode|chinese|standard|icu")) {
             throw new AnalysisException("Invalid inverted index 'parser' 
value: " + parser
-                    + ", parser must be none, english, unicode or chinese");
+                    + ", parser must be none, english, unicode, chinese or 
icu");
         }
 
         if (!"chinese".equals(parser) && parserMode != null) {
diff --git a/regression-test/data/inverted_index_p0/test_icu_analyzer.out 
b/regression-test/data/inverted_index_p0/test_icu_analyzer.out
new file mode 100644
index 00000000000..2c5978b17e6
Binary files /dev/null and 
b/regression-test/data/inverted_index_p0/test_icu_analyzer.out differ
diff --git a/regression-test/data/inverted_index_p0/test_tokenize.out 
b/regression-test/data/inverted_index_p0/test_tokenize.out
index ae22daffe15..e4e271fb730 100644
Binary files a/regression-test/data/inverted_index_p0/test_tokenize.out and 
b/regression-test/data/inverted_index_p0/test_tokenize.out differ
diff --git a/regression-test/suites/inverted_index_p0/test_icu_analyzer.groovy 
b/regression-test/suites/inverted_index_p0/test_icu_analyzer.groovy
new file mode 100644
index 00000000000..2fa943b9ca9
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_icu_analyzer.groovy
@@ -0,0 +1,51 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_icu_analyzer", "p0"){
+    def indexTbName1 = "test_icu_analyzer"
+
+    sql "DROP TABLE IF EXISTS ${indexTbName1}"
+
+    sql """
+      CREATE TABLE ${indexTbName1} (
+      `a` int(11) NULL COMMENT "",
+      `b` text NULL COMMENT "",
+      INDEX b_idx (`b`) USING INVERTED PROPERTIES("parser" = "icu") COMMENT '',
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`a`)
+      COMMENT "OLAP"
+      DISTRIBUTED BY RANDOM BUCKETS 1
+      PROPERTIES (
+      "replication_allocation" = "tag.location.default: 1"
+      );
+    """
+
+    sql """ INSERT INTO ${indexTbName1} VALUES (1, "GET /images/hm_bg.jpg 
HTTP/1.0"); """
+    sql """ INSERT INTO ${indexTbName1} VALUES (2, 
"มนไมเปนไปตามความตองการมนมหมายเลขอยในเนอหา"); """
+    sql """ INSERT INTO ${indexTbName1} VALUES (3, "在启动新的 BE 节点前，需要先在 FE 
集群中注册新的 BE 节点"); """
+
+    try {
+        sql "sync"
+        sql """ set enable_common_expr_pushdown = true; """
+
+        qt_sql """ select * from ${indexTbName1} where b match_phrase 'images 
hm_bg.jpg'; """
+        qt_sql """ select * from ${indexTbName1} where b match_phrase 'อย ใน'; 
"""
+        qt_sql """ select * from ${indexTbName1} where b match_phrase '新的 be'; 
"""
+    } finally {
+    }
+}
\ No newline at end of file
diff --git a/regression-test/suites/inverted_index_p0/test_tokenize.groovy 
b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
index 4672a39cedb..3258f5de710 100644
--- a/regression-test/suites/inverted_index_p0/test_tokenize.groovy
+++ b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
@@ -31,17 +31,17 @@ suite("test_tokenize"){
     sql "DROP TABLE IF EXISTS ${indexTblName}"
     // create 1 replica table
     sql """
-       CREATE TABLE IF NOT EXISTS ${indexTblName}(
-               `id`int(11)NULL,
-               `c` text NULL,
-               INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="chinese") 
COMMENT ''
-       ) ENGINE=OLAP
-       DUPLICATE KEY(`id`)
-       COMMENT 'OLAP'
-       DISTRIBUTED BY HASH(`id`) BUCKETS 1
-       PROPERTIES(
-               "replication_allocation" = "tag.location.default: 1"
-       );
+    CREATE TABLE IF NOT EXISTS ${indexTblName}(
+      `id`int(11)NULL,
+      `c` text NULL,
+      INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="chinese") COMMENT ''
+    ) ENGINE=OLAP
+    DUPLICATE KEY(`id`)
+    COMMENT 'OLAP'
+    DISTRIBUTED BY HASH(`id`) BUCKETS 1
+    PROPERTIES(
+      "replication_allocation" = "tag.location.default: 1"
+    );
     """
     
     def var_result = sql "show variables"
@@ -56,17 +56,17 @@ suite("test_tokenize"){
     sql "DROP TABLE IF EXISTS ${indexTblName2}"
     // create 1 replica table
     sql """
-       CREATE TABLE IF NOT EXISTS ${indexTblName2}(
-               `id`int(11)NULL,
-               `c` text NULL,
-               INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="unicode") 
COMMENT ''
-       ) ENGINE=OLAP
-       DUPLICATE KEY(`id`)
-       COMMENT 'OLAP'
-       DISTRIBUTED BY HASH(`id`) BUCKETS 1
-       PROPERTIES(
-                "replication_allocation" = "tag.location.default: 1"
-        );
+    CREATE TABLE IF NOT EXISTS ${indexTblName2}(
+      `id`int(11)NULL,
+      `c` text NULL,
+      INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="unicode") COMMENT ''
+    ) ENGINE=OLAP
+    DUPLICATE KEY(`id`)
+    COMMENT 'OLAP'
+    DISTRIBUTED BY HASH(`id`) BUCKETS 1
+    PROPERTIES(
+      "replication_allocation" = "tag.location.default: 1"
+    );
     """
 
     sql "INSERT INTO $indexTblName2 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, 
'人民可以得到更多实惠'), (4, 
'陕西省西安市高新区创业大厦A座，我的手机号码是12345678901,邮箱是12345...@qq.com，,ip是1.1.1.1，this 
information is created automatically.');"
@@ -77,17 +77,17 @@ suite("test_tokenize"){
     sql "DROP TABLE IF EXISTS ${indexTblName3}"
     // create 1 replica table
     sql """
-       CREATE TABLE IF NOT EXISTS ${indexTblName3}(
-               `id`int(11)NULL,
-               `c` text NULL,
-               INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="unicode") 
COMMENT ''
-       ) ENGINE=OLAP
-       DUPLICATE KEY(`id`)
-       COMMENT 'OLAP'
-       DISTRIBUTED BY HASH(`id`) BUCKETS 1
-       PROPERTIES(
-                "replication_allocation" = "tag.location.default: 1"
-        );
+    CREATE TABLE IF NOT EXISTS ${indexTblName3}(
+      `id`int(11)NULL,
+      `c` text NULL,
+      INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="unicode") COMMENT ''
+    ) ENGINE=OLAP
+    DUPLICATE KEY(`id`)
+    COMMENT 'OLAP'
+    DISTRIBUTED BY HASH(`id`) BUCKETS 1
+    PROPERTIES(
+        "replication_allocation" = "tag.location.default: 1"
+    );
     """
 
     sql "INSERT INTO $indexTblName3 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, 
'人民可以得到更多实惠'), (4, 
'陕西省西安市高新区创业大厦A座，我的手机号码是12345678901,邮箱是12345...@qq.com，,ip是1.1.1.1，this 
information is created automatically.');"
@@ -109,4 +109,7 @@ suite("test_tokenize"){
         throw e
       }
     }
+
+    qt_tokenize_sql """SELECT TOKENIZE('华夏智胜新税股票A', '"parser"="icu"');"""
+    qt_tokenize_sql """SELECT 
TOKENIZE('มนไมเปนไปตามความตองการมนมหมายเลขอยในเนอหา', '"parser"="icu"');"""
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

(doris) branch master updated: [feature](inverted index) add icu analyzer for minority language tokenization (#47289)

Reply via email to