This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new b98f27a4892 [opt](inverted index) Refactor ICU tokenizer code location 
for better organization and maintainability. (#48079)
b98f27a4892 is described below

commit b98f27a48927d1b45fb313fe1bd50fcb9eb29d39
Author: zzzxl <yangs...@selectdb.com>
AuthorDate: Fri Feb 21 10:20:43 2025 +0800

    [opt](inverted index) Refactor ICU tokenizer code location for better 
organization and maintainability. (#48079)
    
    Problem Summary:
    This pull request includes several changes to the ICU (International
    Components for Unicode) integration within the project. The primary
    focus is on updating the build configuration, refactoring the ICU
    analyzer, and adding new ICU-related files.
---
 be/CMakeLists.txt                                  |  26 +-
 be/cmake/thirdparty.cmake                          |   4 +
 be/dict/icu/uax29/Default.txt                      | 152 ++++++
 be/dict/icu/uax29/MyanmarSyllable.txt              |  35 ++
 be/src/clucene                                     |   2 +-
 .../inverted_index/analyzer/analyzer.cpp           |   4 +-
 .../analyzer/icu/BreakIteratorWrapper.cpp          | 107 ++++
 .../analyzer/icu/BreakIteratorWrapper.h            |  54 ++
 .../analyzer/icu/CompositeBreakIterator.cpp        |  83 +++
 .../analyzer/icu/CompositeBreakIterator.h          |  58 +++
 .../analyzer/icu/DefaultICUTokenizerConfig.cpp     | 128 +++++
 .../analyzer/icu/DefaultICUTokenizerConfig.h       |  44 ++
 .../inverted_index/analyzer/icu/ICUAnalyzer.h      |  61 +++
 .../inverted_index/analyzer/icu/ICUCommon.h        |  48 ++
 .../inverted_index/analyzer/icu/ICUTokenizer.cpp   |  81 +++
 .../inverted_index/analyzer/icu/ICUTokenizer.h     |  50 ++
 .../analyzer/icu/ICUTokenizerConfig.h              |  37 ++
 .../inverted_index/analyzer/icu/ScriptIterator.cpp | 121 +++++
 .../inverted_index/analyzer/icu/ScriptIterator.h   |  64 +++
 .../inverted_index/analyzer/icu_anzlyzer_test.cpp  | 575 +++++++++++++++++++++
 20 files changed, 1706 insertions(+), 28 deletions(-)

diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
index 901acf4975d..14216adfadb 100644
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -205,30 +205,6 @@ set(ZLIB_ROOT "$ENV{DORIS_THIRDPARTY}/installed")
 set(Roaring_ROOT "$ENV{DORIS_THIRDPARTY}/installed")
 set(USE_STAT64 0)
 
-set(ICU_ROOT "$ENV{DORIS_THIRDPARTY}/installed")
-find_library(ICU_UC_LIBRARY NAMES libicuuc.a PATHS ${ICU_ROOT}/lib)
-find_library(ICU_I18N_LIBRARY NAMES libicui18n.a PATHS ${ICU_ROOT}/lib)
-find_library(ICU_DATA_LIBRARY NAMES libicudata.a PATHS ${ICU_ROOT}/lib)
-message("ICU_UC_LIBRARY: " ${ICU_UC_LIBRARY})
-message("ICU_I18N_LIBRARY: " ${ICU_I18N_LIBRARY})
-message("ICU_DATA_LIBRARY: " ${ICU_DATA_LIBRARY})
-if (ICU_UC_LIBRARY AND ICU_I18N_LIBRARY AND ICU_DATA_LIBRARY)
-    add_library(icu INTERFACE IMPORTED)
-    target_link_libraries(icu INTERFACE
-        ${ICU_UC_LIBRARY}
-        ${ICU_I18N_LIBRARY}
-        ${ICU_DATA_LIBRARY}
-    )
-    set(COMMON_THIRDPARTY
-        ${COMMON_THIRDPARTY}
-        icu
-    )
-    add_definitions(-DUSE_ICU)
-    message(STATUS "ICU found and linked successfully!")
-else()
-    message(WARNING "ICU not found! Please install ICU first.")
-endif()
-
 # disable clucene bthread supported.
 set(USE_BTHREAD OFF)
 
@@ -251,7 +227,7 @@ install(DIRECTORY
     DESTINATION ${OUTPUT_DIR})
 
 install(DIRECTORY
-    ${SRC_DIR}/clucene/src/core/CLucene/analysis/icu/data/uax29
+    ${BASE_DIR}/dict/icu/uax29
     DESTINATION ${OUTPUT_DIR}/dict/icu)
 
 # Check if functions are supported in this platform. All flags will generated
diff --git a/be/cmake/thirdparty.cmake b/be/cmake/thirdparty.cmake
index 19f2a00012a..a165c4ab203 100644
--- a/be/cmake/thirdparty.cmake
+++ b/be/cmake/thirdparty.cmake
@@ -171,3 +171,7 @@ endif()
 if ("${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86" OR 
"${CMAKE_BUILD_TARGET_ARCH}" STREQUAL "x86_64")
     add_thirdparty(deflate)
 endif()
+
+add_thirdparty(icuuc LIB64)
+add_thirdparty(icui18n LIB64)
+add_thirdparty(icudata LIB64)
diff --git a/be/dict/icu/uax29/Default.txt b/be/dict/icu/uax29/Default.txt
new file mode 100644
index 00000000000..763758e3fe3
--- /dev/null
+++ b/be/dict/icu/uax29/Default.txt
@@ -0,0 +1,152 @@
+#  Character class definitions from TR 29
+
+!!chain;
+!!quoted_literals_only;
+
+
+#
+#  Character Class Definitions.
+#
+
+$Han                = [:Han:];
+
+$CR                 = [\p{Word_Break = CR}];
+$LF                 = [\p{Word_Break = LF}];
+$Newline            = [\p{Word_Break = Newline}];
+$Extend             = [\p{Word_Break = Extend}-$Han];
+$ZWJ                = [\p{Word_Break = ZWJ}];
+$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
+$Format             = [\p{Word_Break = Format}];
+$Katakana           = [\p{Word_Break = Katakana}];
+$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
+$ALetter            = [\p{Word_Break = ALetter}];
+$Single_Quote       = [\p{Word_Break = Single_Quote}];
+$Double_Quote       = [\p{Word_Break = Double_Quote}];
+$MidNumLet          = [\p{Word_Break = MidNumLet}];
+$MidLetter          = [\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]];
+$MidNum             = [\p{Word_Break = MidNum}];
+$Numeric            = [\p{Word_Break = Numeric}];
+$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
+$WSegSpace          = [\p{Word_Break = WSegSpace}];
+$Extended_Pict      = [\p{Extended_Pictographic}];
+
+$Hiragana           = [:Hiragana:];
+$Ideographic        = [\p{Ideographic}];
+
+
+#   Dictionary character set, for triggering language-based break engines. 
Currently
+#   limited to LineBreak=Complex_Context. Note that this set only works in 
Unicode
+#   5.0 or later as the definition of Complex_Context was corrected to include 
all
+#   characters requiring dictionary break.
+
+$Control        = [\p{Grapheme_Cluster_Break = Control}];
+$HangulSyllable = [\uac00-\ud7a3];
+$ComplexContext = [:LineBreak = Complex_Context:];
+$KanaKanji      = [$Han $Hiragana $Katakana];
+$dictionaryCJK  = [$KanaKanji $HangulSyllable];
+$dictionary     = [$ComplexContext $dictionaryCJK];
+
+# TODO: check if handling of katakana in dictionary makes rules incorrect/void
+
+# leave CJK scripts out of ALetterPlus
+$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
+
+
+## -------------------------------------------------
+
+# Rule 3 - CR x LF
+#
+$CR $LF;
+
+# Rule 3c   Do not break within emoji zwj sequences.
+#             ZWJ ×  \p{Extended_Pictographic}.  Precedes WB4, so no 
intervening Extend chars allowed.
+#
+$ZWJ $Extended_Pict;
+
+# Rule 3d - Keep horizontal whitespace together.
+#
+$WSegSpace $WSegSpace;
+
+# Rule 4 - ignore Format and Extend characters, except when they appear at the 
beginning
+#          of a region of Text.
+
+$ExFm  = [$Extend $Format $ZWJ];
+
+^$ExFm+;            # This rule fires only when there are format or extend 
characters at the
+                    # start of text, or immediately following another 
boundary. It groups them, in
+                    # the event there are more than one.
+
+[^$CR $LF $Newline $ExFm] $ExFm*;   # This rule rule attaches trailing 
format/extends to words,
+                                    # with no special rule status value.
+
+$Numeric $ExFm* {100};              # This group of rules also attach trailing 
format/extends, but
+$ALetterPlus $ExFm* {200};          # with rule status set based on the word's 
final base character.
+$HangulSyllable {200};
+$Hebrew_Letter $ExFm* {200};
+$Katakana $ExFm* {300};             # note:  these status values override 
those from rule 5
+$Hiragana $ExFm* {300};             #        by virtue of being numerically 
larger.
+$Ideographic $ExFm* {400};          #
+
+#
+# rule 5
+#    Do not break between most letters.
+#
+($ALetterPlus | $Hebrew_Letter) $ExFm* ($ALetterPlus | $Hebrew_Letter);
+
+# rule 6 and 7
+($ALetterPlus | $Hebrew_Letter)  $ExFm* ($MidLetter | $MidNumLet | 
$Single_Quote) $ExFm* ($ALetterPlus | $Hebrew_Letter) {200};
+
+# rule 7a
+$Hebrew_Letter $ExFm* $Single_Quote {200};
+
+# rule 7b and 7c
+$Hebrew_Letter $ExFm* $Double_Quote $ExFm* $Hebrew_Letter;
+
+# rule 8
+
+$Numeric $ExFm* $Numeric;
+
+# rule 9
+
+($ALetterPlus | $Hebrew_Letter)  $ExFm* $Numeric;
+
+# rule 10
+
+$Numeric $ExFm* ($ALetterPlus | $Hebrew_Letter);
+
+# rule 11 and 12
+
+$Numeric $ExFm* ($MidNum | $MidNumLet | $Single_Quote) $ExFm* $Numeric;
+
+# rule 13
+# to be consistent with $KanaKanji $KanaKanhi, changed
+# from 300 to 400.
+# See also TestRuleStatus in intltest/rbbiapts.cpp
+$Katakana $ExFm*  $Katakana {300};
+
+# rule 13a/b
+
+$ALetterPlus   $ExFm* $ExtendNumLet {200};    #  (13a)
+$Hebrew_Letter $ExFm* $ExtendNumLet {200};    #  (13a)
+$Numeric       $ExFm* $ExtendNumLet {100};    #  (13a)
+$Katakana      $ExFm* $ExtendNumLet {300};    #  (13a)
+$ExtendNumLet  $ExFm* $ExtendNumLet {200};    #  (13a)
+
+$ExtendNumLet  $ExFm* $ALetterPlus  {200};    #  (13b)
+$ExtendNumLet  $ExFm* $Hebrew_Letter {200};    #  (13b)
+$ExtendNumLet  $ExFm* $Numeric      {100};    #  (13b)
+$ExtendNumLet  $ExFm* $Katakana     {300};    #  (13b)
+
+# rules 15 - 17
+#    Pairs of Regional Indicators stay together.
+#    With incoming rule chaining disabled by ^, this rule will match exactly 
two of them.
+#    No other rule begins with a Regional_Indicator, so chaining cannot extend 
the match.
+#
+^$Regional_Indicator $ExFm* $Regional_Indicator;
+
+# special handling for CJK characters: chain for later dictionary segmentation
+$HangulSyllable $HangulSyllable {200};
+
+# Rule 999
+#     Match a single code point if no other rule applies.
+.;
diff --git a/be/dict/icu/uax29/MyanmarSyllable.txt 
b/be/dict/icu/uax29/MyanmarSyllable.txt
new file mode 100644
index 00000000000..aaef16f4931
--- /dev/null
+++ b/be/dict/icu/uax29/MyanmarSyllable.txt
@@ -0,0 +1,35 @@
+# Parses Myanmar text, with syllable as token. 
+
+$Consonant            = [:Indic_Syllabic_Category = Consonant:];
+$ConsonantPlaceholder = [:Indic_Syllabic_Category = Consonant_Placeholder:];
+$VowelIndependent     = [:Indic_Syllabic_Category = Vowel_Independent:];
+$Virama               = [:Indic_Syllabic_Category = Invisible_Stacker:];
+$Asat                 = [:Indic_Syllabic_Category = Pure_Killer:];
+# for our purposes, $Cons means 'base'
+$Cons = $Consonant | $ConsonantPlaceholder | $VowelIndependent;
+$WordJoin = [:Line_Break=Word_Joiner:]; 
+
+#
+# default numerical definitions
+#
+$Extend       = [\p{Word_Break = Extend}];
+$Format       = [\p{Word_Break = Format}];
+$MidNumLet    = [\p{Word_Break = MidNumLet}];
+$MidNum       = [\p{Word_Break = MidNum}];
+$Numeric      = [\p{Word_Break = Numeric}];
+$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];                               
                           
+$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
+$MidNumEx       = $MidNum       ($Extend |  $Format)*;
+$NumericEx      = $Numeric      ($Extend |  $Format)*;
+$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
+
+$ConsEx = $Cons ($Extend | $Format)*;
+$AsatEx = $Cons $Asat ($Virama $ConsEx)? ($Extend | $Format)*;
+$MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)? ($AsatEx)*;
+$MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*;
+
+!!forward;
+$MyanmarJoinedSyllableEx {200};
+
+# default numeric rules
+$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx 
$ExtendNumLetEx?)*  {100};
diff --git a/be/src/clucene b/be/src/clucene
index 467b1b5a5b1..3236e18d93b 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 467b1b5a5b1c736546ef77965a88f0d8948a3ded
+Subproject commit 3236e18d93bf96481493d88c34b6c2515f3b0b75
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
index 44bd24651af..28f68932fe1 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
@@ -28,7 +28,7 @@
 #ifdef __clang__
 #pragma clang diagnostic pop
 #endif
-#include "CLucene/analysis/icu/ICUAnalyzer.h"
+#include "olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUAnalyzer.h"
 #include 
"olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h"
 
 namespace doris::segment_v2::inverted_index {
@@ -67,7 +67,7 @@ std::unique_ptr<lucene::analysis::Analyzer> 
InvertedIndexAnalyzer::create_analyz
         }
         analyzer = std::move(chinese_analyzer);
     } else if (analyser_type == InvertedIndexParserType::PARSER_ICU) {
-        analyzer = std::make_unique<lucene::analysis::ICUAnalyzer>();
+        analyzer = std::make_unique<ICUAnalyzer>();
         analyzer->initDict(config::inverted_index_dict_path + "/icu");
     } else {
         // default
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/BreakIteratorWrapper.cpp
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/BreakIteratorWrapper.cpp
new file mode 100644
index 00000000000..094aa93c4e2
--- /dev/null
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/BreakIteratorWrapper.cpp
@@ -0,0 +1,107 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "BreakIteratorWrapper.h"
+
+#include <unicode/unistr.h>
+
+#include <mutex>
+#include <string>
+
+#include "ICUCommon.h"
+#include "ICUTokenizerConfig.h"
+
+namespace doris::segment_v2 {
+
+icu::UnicodeSet BreakIteratorWrapper::EMOJI_RK;
+icu::UnicodeSet BreakIteratorWrapper::EMOJI;
+
+BreakIteratorWrapper::BreakIteratorWrapper(icu::BreakIterator* rbbi) : 
rbbi_(rbbi) {}
+
+void BreakIteratorWrapper::initialize() {
+    static std::once_flag once_flag;
+    std::call_once(once_flag, []() {
+        UErrorCode status = U_ZERO_ERROR;
+        EMOJI_RK.applyPattern("[*#0-9©®™〰〽]", status);
+        if (U_FAILURE(status)) {
+            std::string error_msg = "EMOJI RK failed to initialize: ";
+            error_msg += u_errorName(status);
+            _CLTHROWT(CL_ERR_IllegalArgument, error_msg.c_str());
+        }
+        EMOJI.applyPattern("[[:Emoji:][:Extended_Pictographic:]]", status);
+        if (U_FAILURE(status)) {
+            std::string error_msg = "EMOJI failed to initialize: ";
+            error_msg += u_errorName(status);
+            _CLTHROWT(CL_ERR_IllegalArgument, error_msg.c_str());
+        }
+    });
+}
+
+int32_t BreakIteratorWrapper::next() {
+    int32_t current = rbbi_->current();
+    int32_t next = rbbi_->next();
+    status_ = calc_status(current, next);
+    return next;
+}
+
+int32_t BreakIteratorWrapper::calc_status(int32_t current, int32_t next) {
+    if (next != UBRK_DONE && is_emoji(current, next)) {
+        return ICUTokenizerConfig::EMOJI_SEQUENCE_STATUS;
+    } else {
+        return rbbi_->getRuleStatus();
+    }
+}
+
+bool BreakIteratorWrapper::is_emoji(int32_t current, int32_t next) {
+    int32_t begin = start_ + current;
+    int32_t end = start_ + next;
+    UChar32 codepoint = 0;
+    U16_GET(text_, 0, begin, end, codepoint);
+    if (EMOJI.contains(codepoint)) {
+        if (EMOJI_RK.contains(codepoint)) {
+            int32_t trailer = begin + U16_LENGTH(codepoint);
+            return trailer < end && (text_[trailer] == 0xFE0F || 
text_[trailer] == 0x20E3);
+        } else {
+            return true;
+        }
+    }
+    return false;
+}
+
+void BreakIteratorWrapper::set_text(const UChar* text, int32_t start, int32_t 
length) {
+    text_ = text;
+    start_ = start;
+
+    UErrorCode status = U_ZERO_ERROR;
+    UTextPtr utext(utext_openUChars(nullptr, text + start, length, &status));
+    if (U_FAILURE(status)) {
+        std::string error_msg = "Failed to create UText: ";
+        error_msg += u_errorName(status);
+        _CLTHROWT(CL_ERR_Runtime, error_msg.c_str());
+    }
+
+    rbbi_->setText(utext.get(), status);
+    if (U_FAILURE(status)) {
+        std::string error_msg = "Failed to set text: ";
+        error_msg += u_errorName(status);
+        _CLTHROWT(CL_ERR_Runtime, error_msg.c_str());
+    }
+
+    status_ = UBRK_WORD_NONE;
+}
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/BreakIteratorWrapper.h
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/BreakIteratorWrapper.h
new file mode 100644
index 00000000000..0bee1be9efa
--- /dev/null
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/BreakIteratorWrapper.h
@@ -0,0 +1,54 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unicode/umachine.h>
+#include <unicode/utext.h>
+
+#include <memory>
+#include <unordered_set>
+
+#include "ICUCommon.h"
+
+namespace doris::segment_v2 {
+
+class BreakIteratorWrapper {
+public:
+    BreakIteratorWrapper(icu::BreakIterator* rbbi);
+    ~BreakIteratorWrapper() = default;
+
+    void initialize();
+    int32_t current() { return rbbi_->current(); }
+    int32_t get_rule_status() const { return status_; }
+    int32_t next();
+    int32_t calc_status(int32_t current, int32_t next);
+    bool is_emoji(int32_t current, int32_t next);
+    void set_text(const UChar* text, int32_t start, int32_t length);
+
+private:
+    static icu::UnicodeSet EMOJI_RK;
+    static icu::UnicodeSet EMOJI;
+
+    BreakIteratorPtr rbbi_;
+    const UChar* text_ = nullptr;
+    int32_t start_ = 0;
+    int32_t status_ = UBRK_WORD_NONE;
+};
+using BreakIteratorWrapperPtr = std::unique_ptr<BreakIteratorWrapper>;
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/CompositeBreakIterator.cpp
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/CompositeBreakIterator.cpp
new file mode 100644
index 00000000000..35f7f499cc5
--- /dev/null
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/CompositeBreakIterator.cpp
@@ -0,0 +1,83 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "CompositeBreakIterator.h"
+
+#include <unicode/unistr.h>
+
+#include <memory>
+
+namespace doris::segment_v2 {
+
+CompositeBreakIterator::CompositeBreakIterator(const ICUTokenizerConfigPtr& 
config)
+        : config_(config) {
+    scriptIterator_ = std::make_unique<ScriptIterator>(config->combine_cj());
+    word_breakers_.resize(u_getIntPropertyMaxValue(UCHAR_SCRIPT) + 1);
+}
+
+void CompositeBreakIterator::initialize() {
+    scriptIterator_->initialize();
+}
+
+int32_t CompositeBreakIterator::next() {
+    int32_t next = rbbi_->next();
+    while (next == UBRK_DONE && scriptIterator_->next()) {
+        rbbi_ = get_break_iterator(scriptIterator_->get_script_code());
+        rbbi_->set_text(text_, scriptIterator_->get_script_start(),
+                        scriptIterator_->get_script_limit() - 
scriptIterator_->get_script_start());
+        next = rbbi_->next();
+    }
+    return (next == UBRK_DONE) ? UBRK_DONE : next + 
scriptIterator_->get_script_start();
+}
+
+int32_t CompositeBreakIterator::current() {
+    int32_t current = rbbi_->current();
+    return (current == UBRK_DONE) ? UBRK_DONE : current + 
scriptIterator_->get_script_start();
+}
+
+int32_t CompositeBreakIterator::get_rule_status() {
+    return rbbi_->get_rule_status();
+}
+
+int32_t CompositeBreakIterator::get_script_code() {
+    return scriptIterator_->get_script_code();
+}
+
+void CompositeBreakIterator::set_text(const UChar* text, int32_t start, 
int32_t length) {
+    text_ = text;
+    scriptIterator_->set_text(text_, start, length);
+    if (scriptIterator_->next()) {
+        rbbi_ = get_break_iterator(scriptIterator_->get_script_code());
+        rbbi_->set_text(text_, scriptIterator_->get_script_start(),
+                        scriptIterator_->get_script_limit() - 
scriptIterator_->get_script_start());
+    } else {
+        rbbi_ = get_break_iterator(USCRIPT_COMMON);
+        rbbi_->set_text(text_, 0, 0);
+    }
+}
+
+BreakIteratorWrapper* CompositeBreakIterator::get_break_iterator(int32_t 
scriptCode) {
+    if (!word_breakers_[scriptCode]) {
+        auto wordBreak =
+                
std::make_unique<BreakIteratorWrapper>(config_->get_break_iterator(scriptCode));
+        wordBreak->initialize();
+        word_breakers_[scriptCode].swap(wordBreak);
+    }
+    return word_breakers_[scriptCode].get();
+}
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/CompositeBreakIterator.h
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/CompositeBreakIterator.h
new file mode 100644
index 00000000000..251c37b91b2
--- /dev/null
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/CompositeBreakIterator.h
@@ -0,0 +1,58 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unicode/umachine.h>
+#include <unicode/unistr.h>
+#include <unicode/utext.h>
+
+#include <memory>
+#include <vector>
+
+#include "BreakIteratorWrapper.h"
+#include "ICUCommon.h"
+#include "ICUTokenizerConfig.h"
+#include "ScriptIterator.h"
+
+namespace doris::segment_v2 {
+
+class CompositeBreakIterator {
+public:
+    CompositeBreakIterator(const ICUTokenizerConfigPtr& config);
+    ~CompositeBreakIterator() = default;
+
+    void initialize();
+    int32_t next();
+    int32_t current();
+    int32_t get_rule_status();
+    int32_t get_script_code();
+    void set_text(const UChar* text, int32_t start, int32_t length);
+
+private:
+    BreakIteratorWrapper* get_break_iterator(int32_t scriptCode);
+
+    const UChar* text_ = nullptr;
+
+    ICUTokenizerConfigPtr config_;
+    std::vector<BreakIteratorWrapperPtr> word_breakers_;
+    BreakIteratorWrapper* rbbi_ = nullptr;
+    ScriptIteratorPtr scriptIterator_;
+};
+using CompositeBreakIteratorPtr = std::unique_ptr<CompositeBreakIterator>;
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/DefaultICUTokenizerConfig.cpp
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/DefaultICUTokenizerConfig.cpp
new file mode 100644
index 00000000000..7da5d4df377
--- /dev/null
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/DefaultICUTokenizerConfig.cpp
@@ -0,0 +1,128 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "DefaultICUTokenizerConfig.h"
+
+#include <atomic>
+#include <fstream>
+#include <memory>
+#include <mutex>
+#include <sstream>
+#include <string>
+
+namespace doris::segment_v2 {
+
+BreakIteratorPtr DefaultICUTokenizerConfig::cjk_break_iterator_;
+BreakIteratorPtr DefaultICUTokenizerConfig::default_break_iterator_;
+BreakIteratorPtr DefaultICUTokenizerConfig::myanmar_syllable_iterator_;
+
+DefaultICUTokenizerConfig::DefaultICUTokenizerConfig(bool cjk_as_words, bool 
myanmar_as_words) {
+    cjk_as_words_ = cjk_as_words;
+    myanmar_as_words_ = myanmar_as_words;
+}
+
+void DefaultICUTokenizerConfig::initialize(const std::string& dictPath) {
+    static std::atomic<bool> initialized_(false);
+    if (!initialized_) {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+
+        if (!initialized_) {
+            try {
+                UErrorCode status = U_ZERO_ERROR;
+                cjk_break_iterator_.reset(
+                        
icu::BreakIterator::createWordInstance(icu::Locale::getRoot(), status));
+                if (U_FAILURE(status)) {
+                    std::string error_msg = "Failed to create CJK 
BreakIterator: ";
+                    error_msg += u_errorName(status);
+                    _CLTHROWT(CL_ERR_IllegalArgument, error_msg.c_str());
+                }
+
+                read_break_iterator(default_break_iterator_, dictPath + 
"/uax29/Default.txt");
+                read_break_iterator(myanmar_syllable_iterator_,
+                                    dictPath + "/uax29/MyanmarSyllable.txt");
+
+                initialized_ = true;
+            } catch (...) {
+                cjk_break_iterator_.reset();
+                default_break_iterator_.reset();
+                myanmar_syllable_iterator_.reset();
+                throw; // Clean up resources and rethrow the original 
exception to the caller
+            }
+        }
+    }
+}
+
+icu::BreakIterator* DefaultICUTokenizerConfig::get_break_iterator(int32_t 
script) {
+    UErrorCode status = U_ZERO_ERROR;
+    icu::BreakIterator* clone = nullptr;
+    switch (script) {
+    case USCRIPT_JAPANESE:
+        clone = cjk_break_iterator_->clone();
+        break;
+    case USCRIPT_MYANMAR:
+        if (myanmar_as_words_) {
+            clone = default_break_iterator_->clone();
+        } else {
+            clone = myanmar_syllable_iterator_->clone();
+        }
+        break;
+    default:
+        clone = default_break_iterator_->clone();
+        break;
+    }
+    if (clone == nullptr) {
+        std::string error_msg = "UBreakIterator clone failed: ";
+        error_msg += u_errorName(status);
+        _CLTHROWT(CL_ERR_Runtime, error_msg.c_str());
+    }
+    return clone;
+}
+
+void DefaultICUTokenizerConfig::read_break_iterator(BreakIteratorPtr& rbbi,
+                                                    const std::string& 
filename) {
+    std::ifstream in(filename, std::ios::binary);
+    if (!in) {
+        std::string error_msg = "Unable to open the file: " + filename;
+        _CLTHROWT(CL_ERR_IO, error_msg.c_str());
+    }
+
+    std::ostringstream ss;
+    ss << in.rdbuf();
+    in.close();
+
+    std::string utf8Content = ss.str();
+    icu::UnicodeString rulesData(utf8Content.data());
+
+    UParseError parseErr;
+    UErrorCode status = U_ZERO_ERROR;
+    rbbi = std::make_unique<icu::RuleBasedBreakIterator>(rulesData, parseErr, 
status);
+    if (U_FAILURE(status)) {
+        std::string error_msg = "ubrk_openRules failed: ";
+        error_msg += u_errorName(status);
+        _CLTHROWT(CL_ERR_IllegalArgument, error_msg.c_str());
+    }
+    if (parseErr.line != 0 || parseErr.offset != 0) {
+        std::string error_msg = "Syntax error in break rules at line ";
+        error_msg += std::to_string(parseErr.line);
+        error_msg += ", offset ";
+        error_msg += std::to_string(parseErr.offset);
+        _CLTHROWT(CL_ERR_IllegalArgument, error_msg.c_str());
+    }
+}
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/DefaultICUTokenizerConfig.h
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/DefaultICUTokenizerConfig.h
new file mode 100644
index 00000000000..e3673cd543e
--- /dev/null
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/DefaultICUTokenizerConfig.h
@@ -0,0 +1,44 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "ICUTokenizerConfig.h"
+
+namespace doris::segment_v2 {
+
+class DefaultICUTokenizerConfig : public ICUTokenizerConfig {
+public:
+    DefaultICUTokenizerConfig(bool cjkAsWords, bool myanmarAsWords);
+    ~DefaultICUTokenizerConfig() override = default;
+
+    void initialize(const std::string& dictPath) override;
+    bool combine_cj() override { return cjk_as_words_; }
+    icu::BreakIterator* get_break_iterator(int32_t script) override;
+
+private:
+    static void read_break_iterator(BreakIteratorPtr& rbbi, const std::string& 
filename);
+
+    static BreakIteratorPtr cjk_break_iterator_;
+    static BreakIteratorPtr default_break_iterator_;
+    static BreakIteratorPtr myanmar_syllable_iterator_;
+
+    bool cjk_as_words_ = false;
+    bool myanmar_as_words_ = false;
+};
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUAnalyzer.h 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUAnalyzer.h
new file mode 100644
index 00000000000..f3a7554f13f
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUAnalyzer.h
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "ICUTokenizer.h"
+
+namespace doris::segment_v2 {
+
+class ICUAnalyzer : public Analyzer {
+public:
+    ICUAnalyzer() {
+        _lowercase = true;
+        _ownReader = false;
+    }
+
+    ~ICUAnalyzer() override = default;
+
+    bool isSDocOpt() override { return true; }
+
+    void initDict(const std::string& dictPath) override { dictPath_ = 
dictPath; }
+
+    TokenStream* tokenStream(const TCHAR* fieldName, lucene::util::Reader* 
reader) override {
+        auto* tokenizer = _CLNEW ICUTokenizer(_lowercase, _ownReader);
+        tokenizer->initialize(dictPath_);
+        tokenizer->reset(reader);
+        return (TokenStream*)tokenizer;
+    }
+
+    TokenStream* reusableTokenStream(const TCHAR* fieldName,
+                                     lucene::util::Reader* reader) override {
+        if (tokenizer_ == nullptr) {
+            tokenizer_ = std::make_unique<ICUTokenizer>(_lowercase, 
_ownReader);
+            tokenizer_->initialize(dictPath_);
+        }
+        tokenizer_->reset(reader);
+        return (TokenStream*)tokenizer_.get();
+    };
+
+private:
+    std::string dictPath_;
+    std::unique_ptr<ICUTokenizer> tokenizer_;
+};
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUCommon.h 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUCommon.h
new file mode 100644
index 00000000000..1cdffab48d3
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUCommon.h
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "CLucene.h"
+#include "CLucene/debug/error.h"
+#include "unicode/brkiter.h"
+#include "unicode/rbbi.h"
+#include "unicode/ubrk.h"
+#include "unicode/uchar.h"
+#include "unicode/uniset.h"
+#include "unicode/unistr.h"
+#include "unicode/uscript.h"
+#include "unicode/utext.h"
+#include "unicode/utf8.h"
+
+namespace doris::segment_v2 {
+
// Owning smart-pointer alias for ICU break iterators.
using BreakIteratorPtr = std::unique_ptr<icu::BreakIterator>;

// Custom deleter releasing a UText through utext_close(); null-safe so a
// default-constructed UTextPtr destructs cleanly.
struct UTextDeleter {
    void operator()(UText* utext) const {
        if (utext != nullptr) {
            utext_close(utext);
        }
    }
};

// Owning smart-pointer alias for UText objects.
using UTextPtr = std::unique_ptr<UText, UTextDeleter>;
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.cpp
new file mode 100644
index 00000000000..6297c901796
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.cpp
@@ -0,0 +1,81 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "ICUTokenizer.h"
+
+#include <unicode/unistr.h>
+
+#include <memory>
+#include <string>
+
+namespace doris::segment_v2 {
+
// Default construction: lowercasing off, reader not owned; the tokenizer
// config enables both CJK combining and Myanmar-as-words (true, true).
ICUTokenizer::ICUTokenizer() {
    this->lowercase = false;
    this->ownReader = false;

    config_ = std::make_shared<DefaultICUTokenizerConfig>(true, true);
    breaker_ = std::make_unique<CompositeBreakIterator>(config_);
}
+
// Delegates to the default constructor for config/breaker setup, then
// overrides the lowercasing and reader-ownership flags.
ICUTokenizer::ICUTokenizer(bool lower_case, bool own_reader) : ICUTokenizer() {
    this->lowercase = lower_case;
    this->ownReader = own_reader;
}
+
// Loads the ICU rule data from dictPath and builds the underlying break
// iterators; callers (e.g. ICUAnalyzer) invoke this before reset()/next().
void ICUTokenizer::initialize(const std::string& dictPath) {
    config_->initialize(dictPath);
    breaker_->initialize();
}
+
// Produces the next token, or nullptr at end of input.
// Boundaries come from the composite break iterator; boundary spans whose
// rule status is 0 (ICU's "none"/non-word status) are skipped.
Token* ICUTokenizer::next(Token* token) {
    int32_t start = breaker_->current();
    assert(start != UBRK_DONE);

    // Advance until the boundary that closes a real (status != 0) token.
    int32_t end = breaker_->next();
    while (end != UBRK_DONE && breaker_->get_rule_status() == 0) {
        start = end;
        end = breaker_->next();
    }

    if (end == UBRK_DONE) {
        return nullptr;
    }

    // Convert the UTF-16 span [start, end) back to UTF-8, lowercasing on demand.
    utf8Str_.clear();
    auto subString = buffer_.tempSubString(start, end - start);
    if (this->lowercase) {
        subString.toLower().toUTF8String(utf8Str_);
    } else {
        subString.toUTF8String(utf8Str_);
    }

    // setNoCopy: the token aliases utf8Str_, which remains valid only until
    // the next call to next() clears it.
    token->setNoCopy(utf8Str_.data(), 0, utf8Str_.size());
    return token;
}
+
+void ICUTokenizer::reset(lucene::util::Reader* reader) {
+    const char* buf = nullptr;
+    int32_t len = reader->read((const void**)&buf, 0, reader->size());
+    buffer_ = icu::UnicodeString::fromUTF8(icu::StringPiece(buf, len));
+    if (!buffer_.isEmpty() && buffer_.isBogus()) {
+        _CLTHROWT(CL_ERR_Runtime, "Failed to convert UTF-8 string to 
UnicodeString.");
+    }
+    breaker_->set_text(buffer_.getBuffer(), 0, buffer_.length());
+}
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.h 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.h
new file mode 100644
index 00000000000..f703f677806
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.h
@@ -0,0 +1,50 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unicode/utext.h>
+
+#include "CLucene.h"
+#include "CLucene/analysis/AnalysisHeader.h"
+#include "CompositeBreakIterator.h"
+#include "DefaultICUTokenizerConfig.h"
+#include "ICUCommon.h"
+
+using namespace lucene::analysis;
+
+namespace doris::segment_v2 {
+
// ICU-based tokenizer (modeled on Lucene's ICUTokenizer): converts the input
// to UTF-16 and segments it with per-script break iterators supplied by the
// ICUTokenizerConfig held in config_.
class ICUTokenizer : public Tokenizer {
public:
    ICUTokenizer();
    // lowercase: emit lowercased terms; ownReader: tokenizer owns the reader.
    ICUTokenizer(bool lowercase, bool ownReader);
    ~ICUTokenizer() override = default;

    // Loads the ICU rule data from dictPath; call before tokenizing.
    void initialize(const std::string& dictPath);
    // Returns the next token, or nullptr at end of input.
    Token* next(Token* token) override;
    // Reads the full input from reader and restarts segmentation.
    void reset(lucene::util::Reader* reader) override;

private:
    std::string utf8Str_;       // UTF-8 scratch buffer backing the current token
    icu::UnicodeString buffer_; // entire input converted to UTF-16

    ICUTokenizerConfigPtr config_;
    CompositeBreakIteratorPtr breaker_;
};
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizerConfig.h
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizerConfig.h
new file mode 100644
index 00000000000..33accf72c51
--- /dev/null
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizerConfig.h
@@ -0,0 +1,37 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "ICUCommon.h"
+
+namespace doris::segment_v2 {
+
// Strategy interface supplying ICU break iterators per script (mirrors
// Lucene's ICUTokenizerConfig).
class ICUTokenizerConfig {
public:
    ICUTokenizerConfig() = default;
    virtual ~ICUTokenizerConfig() = default;

    // Loads any rule/dictionary data from dictPath.
    virtual void initialize(const std::string& dictPath) = 0;
    // Returns the break iterator for the given UScriptCode value.
    virtual icu::BreakIterator* get_break_iterator(int32_t script) = 0;
    // True when CJK scripts should be tokenized as combined runs.
    virtual bool combine_cj() = 0;

    // Rule-status tag for emoji sequences — NOTE(review): presumably matches
    // a tag assigned in the UAX#29 rule files; confirm against the rules.
    static const int32_t EMOJI_SEQUENCE_STATUS = 299;
};
using ICUTokenizerConfigPtr = std::shared_ptr<ICUTokenizerConfig>;
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ScriptIterator.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ScriptIterator.cpp
new file mode 100644
index 00000000000..5ca81d2a954
--- /dev/null
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ScriptIterator.cpp
@@ -0,0 +1,121 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "ScriptIterator.h"
+
+#include <unicode/unistr.h>
+
+#include <mutex>
+#include <string>
+
+namespace doris::segment_v2 {
+
// Script codes for the 128 ASCII code points; zero-filled here and populated
// once by initialize().
std::vector<int32_t> ScriptIterator::k_basic_latin(128);
+
// combine_cj: merge Han/Hiragana/Katakana into one run (see get_script()).
ScriptIterator::ScriptIterator(bool combine_cj) : combine_cj_(combine_cj) {}
+
+void ScriptIterator::initialize() {
+    static std::once_flag once_flag;
+    std::call_once(once_flag, []() {
+        UErrorCode status = U_ZERO_ERROR;
+        for (int32_t i = 0; i < 128; i++) {
+            k_basic_latin[i] = uscript_getScript(i, &status);
+            if (U_FAILURE(status)) {
+                std::string error_msg = "Get script failed: ";
+                error_msg += u_errorName(status);
+                _CLTHROWT(CL_ERR_IllegalArgument, error_msg.c_str());
+            }
+        }
+        return k_basic_latin;
+    });
+}
+
// Advances to the next maximal run of a single script. On success,
// [script_start_, script_limit_) bounds the run and script_code_ holds its
// resolved script; returns false once the bound text is exhausted.
bool ScriptIterator::next() {
    if (script_limit_ >= limit_) {
        return false;
    }

    // A run starts as COMMON; the first concrete script encountered claims it.
    script_code_ = USCRIPT_COMMON;
    script_start_ = script_limit_;

    while (index_ < limit_) {
        UChar32 ch = 0;
        // Read the code point at index_ (surrogate-pair aware).
        U16_GET(text_, start_, index_, limit_, ch);
        int32_t script = get_script(ch);
        // Extend the run through compatible scripts and combining marks.
        if (is_same_script(script_code_, script, ch) || is_combining_mark(ch)) {
            index_ += U16_LENGTH(ch);
            // Upgrade COMMON/INHERITED to the first concrete script seen.
            if (script_code_ <= USCRIPT_INHERITED && script > USCRIPT_INHERITED) {
                script_code_ = script;
            }
        } else {
            break;
        }
    }

    script_limit_ = index_;
    return true;
}
+
// Rebinds the iterator to the UTF-16 range [start, start + length) and
// resets all run-tracking state so the next next() starts from the beginning.
void ScriptIterator::set_text(const UChar* text, int32_t start, int32_t length) {
    text_ = text;
    start_ = start;
    index_ = start;
    limit_ = start + length;
    script_start_ = start;
    script_limit_ = start;
    script_code_ = USCRIPT_INVALID_CODE;
}
+
// Resolves the UScriptCode for a code point, with an ASCII fast path and
// optional CJK merging when combine_cj_ is set.
int32_t ScriptIterator::get_script(UChar32 codepoint) const {
    if (0 <= codepoint && codepoint < 128) {
        // ASCII: served from the cache built by initialize().
        return k_basic_latin[codepoint];
    } else {
        UErrorCode err = U_ZERO_ERROR;
        int32_t script = uscript_getScript(codepoint, &err);
        if (U_FAILURE(err)) {
            std::string error_msg = "Get Script error: ";
            error_msg += u_errorName(err);
            error_msg += ", script: " + std::to_string(script);
            _CLTHROWT(CL_ERR_Runtime, error_msg.c_str());
        }
        if (combine_cj_) {
            // Merge Chinese/Japanese scripts so they form one run.
            if (script == USCRIPT_HAN || script == USCRIPT_HIRAGANA || script == USCRIPT_KATAKANA) {
                return USCRIPT_JAPANESE;
            } else if (codepoint >= 0xFF10 && codepoint <= 0xFF19) {
                // Fullwidth digits U+FF10..U+FF19 are mapped to Latin.
                return USCRIPT_LATIN;
            } else {
                return script;
            }
        } else {
            return script;
        }
    }
}
+
// Two scripts are compatible when they are equal, when either side is
// COMMON/INHERITED, or when the code point's Script_Extensions include the
// current run's script (uscript_hasScript).
bool ScriptIterator::is_same_script(int32_t current_script, int32_t script, UChar32 codepoint) {
    return (current_script == script) || (current_script <= USCRIPT_INHERITED) ||
           (script <= USCRIPT_INHERITED) ||
           uscript_hasScript(codepoint, (UScriptCode)current_script);
}
+
+bool ScriptIterator::is_combining_mark(UChar32 codepoint) {
+    auto type = (UCharCategory)u_charType(codepoint);
+    return (type == U_COMBINING_SPACING_MARK || type == U_NON_SPACING_MARK ||
+            type == U_ENCLOSING_MARK);
+}
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ScriptIterator.h 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ScriptIterator.h
new file mode 100644
index 00000000000..1cc67c4350c
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ScriptIterator.h
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <unicode/umachine.h>
+#include <unicode/utext.h>
+
+#include <memory>
+#include <vector>
+
+#include "ICUCommon.h"
+
+namespace doris::segment_v2 {
+
// Splits UTF-16 text into maximal runs of a single script (mirrors Lucene's
// ScriptIterator); callers use the run's script to choose a break iterator.
class ScriptIterator {
public:
    // combine_cj: merge Han/Hiragana/Katakana into one combined run.
    ScriptIterator(bool combine_cj);
    ~ScriptIterator() = default;

    // One-time population of the ASCII script cache; call before first use.
    void initialize();

    // Bounds and script of the run found by the last successful next().
    int32_t get_script_start() const { return script_start_; }
    int32_t get_script_limit() const { return script_limit_; }
    int32_t get_script_code() const { return script_code_; }

    // Advances to the next run; returns false once the text is exhausted.
    bool next();
    // Rebinds the iterator to [start, start + length) of text.
    void set_text(const UChar* text, int32_t start, int32_t length);

private:
    int32_t get_script(UChar32 codepoint) const;
    static bool is_same_script(int32_t current_script, int32_t script, UChar32 codepoint);
    static bool is_combining_mark(UChar32 codepoint);

    // Script codes for ASCII code points, filled once by initialize().
    static std::vector<int32_t> k_basic_latin;

    const UChar* text_ = nullptr; // non-owning view of the bound text
    int32_t start_ = 0;
    int32_t index_ = 0; // current scan position
    int32_t limit_ = 0; // end of the bound range

    int32_t script_start_ = 0;
    int32_t script_limit_ = 0;
    int32_t script_code_ = USCRIPT_INVALID_CODE;

    bool combine_cj_ = false;
};
using ScriptIteratorPtr = std::unique_ptr<ScriptIterator>;
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_anzlyzer_test.cpp 
b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_anzlyzer_test.cpp
new file mode 100644
index 00000000000..98fa722be2c
--- /dev/null
+++ 
b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_anzlyzer_test.cpp
@@ -0,0 +1,575 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUAnalyzer.h"
+
+using namespace lucene::analysis;
+
+namespace doris::segment_v2 {
+
class ICUTokenizerTest : public ::testing::Test {
protected:
    // Runs `s` through a freshly configured ICUAnalyzer (lowercasing disabled)
    // and appends every produced term to `datas`. Rethrows CLucene errors
    // after logging so failures are visible in the test output.
    void tokenize(const std::string& s, std::vector<std::string>& datas) {
        try {
            ICUAnalyzer analyzer;
            // Dictionary path is relative to the working dir of the test runner.
            analyzer.initDict("./be/dict/icu");
            analyzer.set_lowercase(false);

            lucene::util::SStringReader<char> reader;
            reader.init(s.data(), s.size(), false);

            // tokenStream() transfers ownership of the tokenizer to us.
            std::unique_ptr<ICUTokenizer> tokenizer;
            tokenizer.reset((ICUTokenizer*)analyzer.tokenStream(L"", &reader));

            Token t;
            while (tokenizer->next(&t)) {
                std::string term(t.termBuffer<char>(), t.termLength<char>());
                datas.emplace_back(term);
            }
        } catch (CLuceneError& e) {
            std::cout << e.what() << std::endl;
            throw;
        }
    }
};
+
// Count-only smoke test across many scripts: asserts the number of tokens
// produced for each language sample (exact terms are checked by later tests).
TEST_F(ICUTokenizerTest, TestICUTokenizer) {
    std::vector<std::string> datas;

    // Chinese text
    std::string chineseText =
            "今天天气真好,我们一起去公园散步吧。人工智能正在改变我们的生活方式。这本书的内容非常有"
            "趣,我推荐给你。";
    tokenize(chineseText, datas);
    ASSERT_EQ(datas.size(), 27);
    datas.clear();

    // English text
    std::string englishText =
            "The quick brown fox jumps over the lazy dog. Artificial intelligence is transforming "
            "various industries. Reading books can significantly enhance your knowledge.";
    tokenize(englishText, datas);
    ASSERT_EQ(datas.size(), 22);
    datas.clear();

    // Vietnamese text
    std::string vietnameseText =
            "Hôm nay thời tiết thật đẹp, chúng ta cùng đi dạo công viên nhé. Trí tuệ nhân tạo đang "
            "thay đổi cách sống của chúng ta. Cuốn sách này rất thú vị, tôi muốn giới thiệu cho "
            "bạn.";
    tokenize(vietnameseText, datas);
    ASSERT_EQ(datas.size(), 38);
    datas.clear();

    // Portuguese text
    std::string portugueseText =
            "O tempo está ótimo hoje, vamos dar um passeio no parque. A inteligência artificial "
            "está transformando nossas vidas. Este livro é muito interessante, eu recomendo para "
            "você.";
    tokenize(portugueseText, datas);
    ASSERT_EQ(datas.size(), 27);
    datas.clear();

    // Indonesian text
    std::string indonesianText =
            "Hari ini cuaca sangat bagus, mari kita jalan-jalan ke taman. Kecerdasan buatan sedang "
            "mengubah cara hidup kita. Buku ini sangat menarik, mari kita rekomendasikan.";
    tokenize(indonesianText, datas);
    ASSERT_EQ(datas.size(), 25);
    datas.clear();

    // Spanish text
    std::string spanishText =
            "Hoy hace muy buen tiempo, vamos a pasear por el parque. La inteligencia artificial "
            "está cambiando nuestras vidas. Este libro es muy interesante, te lo recomiendo.";
    tokenize(spanishText, datas);
    ASSERT_EQ(datas.size(), 26);
    datas.clear();

    // Thai text
    std::string thaiText =
            "วันนี้อากาศดีมาก "
            "เราไปเดินเล่นที่สวนสาธารณะกันเถอะปัญญาประดิษฐ์กำลังเปลี่ยนวิถีชีวิตของเราหนังสือเล่มนี้น่าสนใจมาก "
            "ฉันอยากแนะนำให้คุณอ่าน";
    tokenize(thaiText, datas);
    ASSERT_EQ(datas.size(), 34);
    datas.clear();

    // Hindi text
    std::string hindiText =
            "आज मौसम बहुत अच्छा है, चलो पार्क में टहलने चलते हैं। कृत्रिम बुद्धिमत्ता हमारे जीवन को बदल रही है। यह "
            "किताब बहुत दिलचस्प है, मैं इसे आपको सुझाता हूं।";
    tokenize(hindiText, datas);
    ASSERT_EQ(datas.size(), 29);
    datas.clear();
}
+
// An empty input must yield no tokens.
TEST_F(ICUTokenizerTest, TestICUTokenizerEmptyText) {
    std::vector<std::string> datas;
    std::string emptyText;
    tokenize(emptyText, datas);
    ASSERT_EQ(datas.size(), 0);
}
+
// A lone word in each script yields exactly one token.
TEST_F(ICUTokenizerTest, TestICUTokenizerSingleWord) {
    std::vector<std::string> datas;

    // Chinese word
    std::string chineseText = "天气";
    tokenize(chineseText, datas);
    ASSERT_EQ(datas.size(), 1);
    datas.clear();

    // English word
    std::string englishText = "weather";
    tokenize(englishText, datas);
    ASSERT_EQ(datas.size(), 1);
    datas.clear();

    // Arabic word
    std::string arabicText = "الذكاء";
    tokenize(arabicText, datas);
    ASSERT_EQ(datas.size(), 1);
}
+
// Runs of consecutive spaces must not produce empty tokens.
TEST_F(ICUTokenizerTest, TestICUTokenizerMultipleSpaces) {
    std::vector<std::string> datas;
    std::string multipleSpacesText = "The    quick    brown   fox";
    tokenize(multipleSpacesText, datas);
    ASSERT_EQ(datas.size(), 4);
}
+
// Punctuation is dropped; 5 tokens implies the apostrophe keeps "How's" whole.
TEST_F(ICUTokenizerTest, TestICUTokenizerPunctuation) {
    std::vector<std::string> datas;
    std::string textWithPunctuation = "Hello, world! How's it going?";
    tokenize(textWithPunctuation, datas);
    ASSERT_EQ(datas.size(), 5);
}
+
// Mixed English/Chinese input tokenizes both parts.
TEST_F(ICUTokenizerTest, TestICUTokenizerMixedLanguage) {
    std::vector<std::string> datas;
    std::string mixedText = "Hello, 今天天气真好!";
    tokenize(mixedText, datas);
    ASSERT_EQ(datas.size(), 4);
}
+
// Unicode input including emoji still tokenizes (4 tokens total).
TEST_F(ICUTokenizerTest, TestICUTokenizerUnicode) {
    std::vector<std::string> datas;
    std::string unicodeText = "你好,世界! 😊🌍";
    tokenize(unicodeText, datas);
    ASSERT_EQ(datas.size(), 4);
}
+
// Digits form their own token; the trailing period is dropped.
TEST_F(ICUTokenizerTest, TestICUTokenizerNumericText) {
    std::vector<std::string> datas;
    std::string numericText = "The price is 100 dollars.";
    tokenize(numericText, datas);
    ASSERT_EQ(datas.size(), 5);
}
+
// Longer multi-sentence English input: only the token count is checked.
TEST_F(ICUTokenizerTest, TestICUTokenizerLongText) {
    std::vector<std::string> datas;
    std::string longText =
            "Artificial intelligence is rapidly changing various industries around the world. "
            "From healthcare to finance, it is transforming the way we work, live, and interact "
            "with technology.";
    tokenize(longText, datas);
    ASSERT_EQ(datas.size(), 26);
}
+
// Pure punctuation/symbol input produces no tokens at all.
TEST_F(ICUTokenizerTest, TestICUTokenizerSpecialCharacters) {
    std::vector<std::string> datas;
    std::string specialCharsText = "@#$%^&*()_+{}[]|:;\"'<>,.?/~`";
    tokenize(specialCharsText, datas);
    ASSERT_EQ(datas.size(), 0);
}
+
// A single very long word stays one token.
TEST_F(ICUTokenizerTest, TestICUTokenizerLongWords) {
    std::vector<std::string> datas;
    std::string longWordText = "hippopotomonstrosesquipedaliophobia";
    tokenize(longWordText, datas);
    ASSERT_EQ(datas.size(), 1);
}
+
+TEST_F(ICUTokenizerTest, TestICUArmenian) {
+    std::vector<std::string> datas;
+    std::string longWordText =
+            "Վիքիպեդիայի 13 միլիոն հոդվածները (4,600` հայերեն վիքիպեդիայում) 
գրվել են կամավորների "
+            "կողմից ու համարյա բոլոր հոդվածները կարող է խմբագրել ցանկաց մարդ 
ով կարող է բացել "
+            "Վիքիպեդիայի կայքը։";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {
+            "Վիքիպեդիայի",   "13",    "միլիոն",     "հոդվածները",  "4,600",  
"հայերեն",
+            "վիքիպեդիայում", "գրվել", "են",         "կամավորների", "կողմից", 
"ու",
+            "համարյա",       "բոլոր", "հոդվածները", "կարող",       "է",      
"խմբագրել",
+            "ցանկաց",        "մարդ",  "ով",         "կարող",       "է",      
"բացել",
+            "Վիքիպեդիայի",   "կայքը"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUAmharic) {
+    std::vector<std::string> datas;
+    std::string longWordText = "ዊኪፔድያ የባለ ብዙ ቋንቋ የተሟላ ትክክለኛና ነጻ መዝገበ ዕውቀት 
(ኢንሳይክሎፒዲያ) ነው። ማንኛውም";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"ዊኪፔድያ", "የባለ",  "ብዙ",   "ቋንቋ",       
"የተሟላ", "ትክክለኛና",
+                                       "ነጻ",    "መዝገበ", "ዕውቀት", "ኢንሳይክሎፒዲያ", 
"ነው",   "ማንኛውም"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUArabic) {
+    std::vector<std::string> datas;
+    std::string longWordText =
+            "الفيلم الوثائقي الأول عن ويكيبيديا يسمى \"الحقيقة بالأرقام: قصة 
ويكيبيديا\" "
+            "(بالإنجليزية: Truth in Numbers: The Wikipedia Story)، سيتم إطلاقه 
في 2008.";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {
+            "الفيلم",   "الوثائقي",  "الأول",     "عن",          "ويكيبيديا", 
"يسمى", "الحقيقة",
+            "بالأرقام", "قصة",       "ويكيبيديا", "بالإنجليزية", "Truth",     
"in",   "Numbers",
+            "The",      "Wikipedia", "Story",     "سيتم",        "إطلاقه",    
"في",   "2008"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUAramaic) {
+    std::vector<std::string> datas;
+    std::string longWordText =
+            "ܘܝܩܝܦܕܝܐ (ܐܢܓܠܝܐ: Wikipedia) ܗܘ ܐܝܢܣܩܠܘܦܕܝܐ ܚܐܪܬܐ ܕܐܢܛܪܢܛ ܒܠܫܢ̈ܐ 
ܣܓܝܐ̈ܐ܂ ܫܡܗ ܐܬܐ ܡܢ "
+            "ܡ̈ܠܬܐ ܕ\"ܘܝܩܝ\" ܘ\"ܐܝܢܣܩܠܘܦܕܝܐ\"܀";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {
+            "ܘܝܩܝܦܕܝܐ", "ܐܢܓܠܝܐ", "Wikipedia", "ܗܘ",  "ܐܝܢܣܩܠܘܦܕܝܐ", "ܚܐܪܬܐ",
+            "ܕܐܢܛܪܢܛ",  "ܒܠܫܢ̈ܐ",  "ܣܓܝܐ̈ܐ",     "ܫܡܗ", "ܐܬܐ",         "ܡܢ",
+            "ܡ̈ܠܬܐ",     "ܕ",      "ܘܝܩܝ",      "ܘ",   "ܐܝܢܣܩܠܘܦܕܝܐ"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUBengali) {
+    std::vector<std::string> datas;
+    std::string longWordText =
+            "এই বিশ্বকোষ পরিচালনা করে উইকিমিডিয়া ফাউন্ডেশন (একটি অলাভজনক 
সংস্থা)। উইকিপিডিয়ার শুরু ১৫ "
+            "জানুয়ারি, ২০০১ সালে। এখন পর্যন্ত ২০০টিরও বেশী ভাষায় উইকিপিডিয়া 
রয়েছে।܀";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"এই",         "বিশ্বকোষ", "পরিচালনা", 
"করে",   "উইকিমিডিয়া",
+                                       "ফাউন্ডেশন",   "একটি",    "অলাভজনক",  
"সংস্থা", "উইকিপিডিয়ার",
+                                       "শুরু",         "১৫",      "জানুয়ারি", 
 "২০০১",  "সালে",
+                                       "এখন",        "পর্যন্ত",   "২০০টিরও",  
"বেশী",  "ভাষায়",
+                                       "উইকিপিডিয়া", "রয়েছে"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUFarsi) {
+    std::vector<std::string> datas;
+    std::string longWordText =
+            "ویکی پدیای انگلیسی در تاریخ ۲۵ دی ۱۳۷۹ به صورت مکملی برای 
دانشنامهٔ تخصصی نوپدیا نوشته "
+            "شد.";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"ویکی",     "پدیای", "انگلیسی", "در",   
 "تاریخ", "۲۵",
+                                       "دی",       "۱۳۷۹",  "به",      "صورت", 
 "مکملی", "برای",
+                                       "دانشنامهٔ", "تخصصی", "نوپدیا",  
"نوشته", "شد"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUGreek) {
+    std::vector<std::string> datas;
+    std::string longWordText =
+            "Γράφεται σε συνεργασία από εθελοντές με το λογισμικό wiki, κάτι 
που σημαίνει ότι "
+            "άρθρα μπορεί να προστεθούν ή να αλλάξουν από τον καθένα.";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"Γράφεται", "σε",         "συνεργασία", 
"από",   "εθελοντές",
+                                       "με",       "το",         "λογισμικό",  
"wiki",  "κάτι",
+                                       "που",      "σημαίνει",   "ότι",        
"άρθρα", "μπορεί",
+                                       "να",       "προστεθούν", "ή",          
"να",    "αλλάξουν",
+                                       "από",      "τον",        "καθένα"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUKhmer) {
+    std::vector<std::string> datas;
+    std::string longWordText = "ផ្ទះស្កឹមស្កៃបីបួនខ្នងនេះ";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"ផ្ទះ", "ស្កឹមស្កៃ", "បី", "បួន", 
"ខ្នង", "នេះ"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICULao) {
+    std::vector<std::string> datas;
+    std::string longWordText = "ກວ່າດອກ ພາສາລາວ";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"ກວ່າ", "ດອກ", "ພາສາ", "ລາວ"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUMyanmar) {
+    std::vector<std::string> datas;
+    std::string longWordText = "သက်ဝင်လှုပ်ရှားစေပြီး";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"သက်ဝင်", "လှုပ်ရှား", "စေ", "ပြီး"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUThai) {
+    std::vector<std::string> datas;
+    std::string longWordText = "การที่ได้ต้องแสดงว่างานดี. แล้วเธอจะไปไหน? 
๑๒๓๔";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"การ", "ที่",   "ได้",  "ต้อง", "แสดง", 
"ว่า",  "งาน",
+                                       "ดี",   "แล้ว", "เธอ", "จะ",  "ไป",   
"ไหน", "๑๒๓๔"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUTibetan) {
+    std::vector<std::string> datas;
+    std::string longWordText = 
"སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། 
།";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"སྣོན", "མཛོད", "དང", "ལས",  "འདིས", 
"བོད",  "ཡིག",
+                                       "མི",  "ཉམས", "གོང", "འཕེལ", "དུ",   
"གཏོང", "བར",
+                                       "ཧ",  "ཅང",  "དགེ", "མཚན", "མཆིས", 
"སོ"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUChinese) {
+    std::vector<std::string> datas;
+    std::string longWordText = "我是中国人。 1234 Tests ";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"我是", "中国人", "1234", "Tests"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUHebrew) {
+    {
+        std::vector<std::string> datas;
+        std::string longWordText = "דנקנר תקף את הדו\"ח";
+        tokenize(longWordText, datas);
+        std::vector<std::string> result = {"דנקנר", "תקף", "את", "הדו\"ח"};
+        for (size_t i = 0; i < datas.size(); i++) {
+            ASSERT_EQ(datas[i], result[i]);
+        }
+    }
+    {
+        std::vector<std::string> datas;
+        std::string longWordText = "חברת בת של מודי'ס";
+        tokenize(longWordText, datas);
+        std::vector<std::string> result = {"חברת", "בת", "של", "מודי'ס"};
+        for (size_t i = 0; i < datas.size(); i++) {
+            ASSERT_EQ(datas[i], result[i]);
+        }
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUEmpty) {
+    std::vector<std::string> datas;
+    std::string longWordText = " . ";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICULUCENE1545) {
+    std::vector<std::string> datas;
+    std::string longWordText = "moͤchte";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"moͤchte"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUAlphanumericSA) {
+    std::vector<std::string> datas;
+    std::string longWordText = "B2B 2B";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"B2B", "2B"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUDelimitersSA) {
+    std::vector<std::string> datas;
+    std::string longWordText = "some-dashed-phrase dogs,chase,cats ac/dc";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"some",  "dashed", "phrase", "dogs",
+                                       "chase", "cats",   "ac",     "dc"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUApostrophesSA) {
+    std::vector<std::string> datas;
+    std::string longWordText = "O'Reilly you're she's Jim's don't O'Reilly's";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"O'Reilly", "you're", "she's",
+                                       "Jim's",    "don't",  "O'Reilly's"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUNumericSA) {
+    std::vector<std::string> datas;
+    std::string longWordText = "21.35 R2D2 C3PO 216.239.63.104 216.239.63.104";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"21.35", "R2D2", "C3PO", 
"216.239.63.104", "216.239.63.104"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUTextWithNumbersSA) {
+    std::vector<std::string> datas;
+    std::string longWordText = "David has 5000 bones";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"David", "has", "5000", "bones"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUVariousTextSA) {
+    std::vector<std::string> datas;
+    std::string longWordText =
+            "C embedded developers wanted foo bar FOO BAR foo      bar .  FOO 
<> BAR \"QUOTED\" "
+            "word";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"C",   "embedded", "developers", 
"wanted", "foo",
+                                       "bar", "FOO",      "BAR",        "foo", 
   "bar",
+                                       "FOO", "BAR",      "QUOTED",     
"word"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUKoreanSA) {
+    std::vector<std::string> datas;
+    std::string longWordText = "안녕하세요 한글입니다";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"안녕하세요", "한글입니다"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUReusableTokenStream) {
+    std::vector<std::string> datas;
+    std::string longWordText = 
"སྣོན་མཛོད་དང་ལས་འདིས་བོད་ཡིག་མི་ཉམས་གོང་འཕེལ་དུ་གཏོང་བར་ཧ་ཅང་དགེ་མཚན་མཆིས་སོ། 
།";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"སྣོན", "མཛོད", "དང", "ལས",  "འདིས", 
"བོད",  "ཡིག",
+                                       "མི",  "ཉམས", "གོང", "འཕེལ", "དུ",   
"གཏོང", "བར",
+                                       "ཧ",  "ཅང",  "དགེ", "མཚན", "མཆིས", 
"སོ"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUOffsets) {
+    std::vector<std::string> datas;
+    std::string longWordText = "David has 5000 bones";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"David", "has", "5000", "bones"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUKorean) {
+    std::vector<std::string> datas;
+    std::string longWordText = "훈민정음";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"훈민정음"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUJapanese) {
+    std::vector<std::string> datas;
+    std::string longWordText = "仮名遣い カタカナ";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"仮名遣い", "カタカナ"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUEmoji) {
+    std::vector<std::string> datas;
+    std::string longWordText =
+            "💩 💩💩 👩‍❤️‍👩 👨🏼‍⚕️ 🇺🇸🇺🇸 #️⃣ 3️⃣ "
+            "🏴";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {
+            "💩", "💩", "💩", "👩‍❤️‍👩", "👨🏼‍⚕️", "🇺🇸", "🇺🇸",
+            "#️⃣",  "3️⃣",  "🏴"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUEmojiTokenization) {
+    std::vector<std::string> datas;
+    std::string longWordText = "poo💩poo 💩中國💩";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"poo", "💩", "poo", "💩", "中國", "💩"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+TEST_F(ICUTokenizerTest, TestICUScriptExtensions) {
+    std::vector<std::string> datas;
+    std::string longWordText = "𑅗० 𑅗ा 𑅗᪾";
+    tokenize(longWordText, datas);
+    std::vector<std::string> result = {"𑅗०", "𑅗ा", "𑅗᪾"};
+    for (size_t i = 0; i < datas.size(); i++) {
+        ASSERT_EQ(datas[i], result[i]);
+    }
+}
+
+} // namespace doris::segment_v2


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to