This is an automated email from the ASF dual-hosted git repository.
morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new 2a1adc65c32 branch-3.1: [feature](inverted index) add char group
tokenizer #54479 (#54616)
2a1adc65c32 is described below
commit 2a1adc65c32ffef37baab6946acd23ab23450926
Author: zzzxl <[email protected]>
AuthorDate: Wed Aug 13 12:11:23 2025 +0800
branch-3.1: [feature](inverted index) add char group tokenizer #54479
(#54616)
pick #54479
---
.../inverted_index/analysis_factory_mgr.cpp | 3 +
.../char/char_group_tokenizer_factory.cpp | 158 +++++++++++
.../tokenizer/char/char_group_tokenizer_factory.h | 51 ++++
.../tokenizer/char/char_tokenizer.cpp | 93 +++++++
.../inverted_index/tokenizer/char/char_tokenizer.h | 50 ++++
.../tokenizer/keyword/keyword_tokenizer.h | 2 -
.../segment_v2/inverted_index/util/string_helper.h | 37 +++
.../char_group_tokenizer_factory_test.cpp | 292 +++++++++++++++++++++
.../indexpolicy/CharGroupTokenizerValidator.java | 92 +++++++
.../apache/doris/indexpolicy/IndexPolicyMgr.java | 3 +
.../doris/indexpolicy/PolicyValidatorTests.java | 20 ++
.../analyzer/test_char_group_tokenizer.out | Bin 0 -> 999 bytes
.../analyzer/test_char_group_tokenizer.groovy | 131 +++++++++
13 files changed, 930 insertions(+), 2 deletions(-)
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.cpp
index 6b2904ec7f0..51585e5580b 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.cpp
@@ -20,6 +20,7 @@
#include
"olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter_factory.h"
#include
"olap/rowset/segment_v2/inverted_index/token_filter/lower_case_filter_factory.h"
#include
"olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory.h"
+#include
"olap/rowset/segment_v2/inverted_index/tokenizer/char/char_group_tokenizer_factory.h"
#include
"olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer_factory.h"
#include
"olap/rowset/segment_v2/inverted_index/tokenizer/ngram/edge_ngram_tokenizer_factory.h"
#include
"olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer_factory.h"
@@ -35,6 +36,8 @@ void AnalysisFactoryMgr::initialise() {
registerFactory("ngram", []() { return
std::make_shared<NGramTokenizerFactory>(); });
registerFactory("edge_ngram",
[]() { return
std::make_shared<EdgeNGramTokenizerFactory>(); });
+ registerFactory("char_group",
+ []() { return
std::make_shared<CharGroupTokenizerFactory>(); });
// token_filter
registerFactory("lowercase", []() { return
std::make_shared<LowerCaseFilterFactory>(); });
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_group_tokenizer_factory.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_group_tokenizer_factory.cpp
new file mode 100644
index 00000000000..7528cb5e4f4
--- /dev/null
+++
b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_group_tokenizer_factory.cpp
@@ -0,0 +1,158 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "char_group_tokenizer_factory.h"
+
+#include <unicode/uscript.h>
+
+#include "common/exception.h"
+#include
"olap/rowset/segment_v2/inverted_index/tokenizer/char/char_tokenizer.h"
+#include "olap/rowset/segment_v2/inverted_index/util/string_helper.h"
+
+namespace doris::segment_v2::inverted_index {
+#include "common/compile_check_begin.h"
+
+void CharGroupTokenizerFactory::initialize(const Settings& settings) {
+ _max_token_length = settings.get_int("max_token_length",
CharTokenizer::DEFAULT_MAX_WORD_LEN);
+
+ for (const auto& str : settings.get_entry_list("tokenize_on_chars")) {
+ if (str.length() == 0) {
+ throw Exception(ErrorCode::INVALID_ARGUMENT,
+ "[tokenize_on_chars] cannot contain empty
characters");
+ }
+
+ icu::UnicodeString unicode_str = icu::UnicodeString::fromUTF8(str);
+
+ if (unicode_str.countChar32() == 1) {
+ _tokenize_on_chars.insert(unicode_str.char32At(0));
+ } else if (unicode_str.length() > 0 && unicode_str.charAt(0) == '\\') {
+ _tokenize_on_chars.insert(parse_escaped_char(unicode_str));
+ } else {
+ if (str == "letter") {
+ _tokenize_on_letter = true;
+ } else if (str == "digit") {
+ _tokenize_on_digit = true;
+ } else if (str == "whitespace") {
+ _tokenize_on_space = true;
+ } else if (str == "punctuation") {
+ _tokenize_on_punctuation = true;
+ } else if (str == "symbol") {
+ _tokenize_on_symbol = true;
+ } else if (str == "cjk") {
+ _tokenize_on_cjk = true;
+ } else {
+ throw Exception(ErrorCode::INVALID_ARGUMENT,
+ "Invalid escaped char in [" + str + "]");
+ }
+ }
+ }
+}
+
+UChar32 CharGroupTokenizerFactory::parse_escaped_char(const
icu::UnicodeString& unicode_str) {
+ icu::UnicodeString unescaped = unicode_str.unescape();
+
+ if (unescaped.countChar32() != 1) {
+ std::string s;
+ unicode_str.toUTF8String(s);
+ throw Exception(ErrorCode::INVALID_ARGUMENT, "Invalid escaped char ["
+ s + "]");
+ }
+
+ return unescaped.char32At(0);
+}
+
+TokenizerPtr CharGroupTokenizerFactory::create() {
+ struct CharGroupConfig {
+ bool tokenize_on_space = false;
+ bool tokenize_on_letter = false;
+ bool tokenize_on_digit = false;
+ bool tokenize_on_punctuation = false;
+ bool tokenize_on_symbol = false;
+ bool tokenize_on_cjk = false;
+ std::unordered_set<UChar32> tokenize_on_chars;
+
+ CharGroupConfig(bool space, bool letter, bool digit, bool punct, bool
symbol, bool cjk,
+ std::unordered_set<UChar32> chars)
+ : tokenize_on_space(space),
+ tokenize_on_letter(letter),
+ tokenize_on_digit(digit),
+ tokenize_on_punctuation(punct),
+ tokenize_on_symbol(symbol),
+ tokenize_on_cjk(cjk),
+ tokenize_on_chars(std::move(chars)) {}
+ };
+
+ class CharGroupTokenizerImpl : public CharTokenizer {
+ public:
+ CharGroupTokenizerImpl(CharGroupConfig config) :
_config(std::move(config)) {}
+ ~CharGroupTokenizerImpl() override = default;
+
+ bool is_cjk_char(UChar32 c) override {
+ if (!_config.tokenize_on_cjk) {
+ return false;
+ }
+
+ UErrorCode status = U_ZERO_ERROR;
+ UScriptCode script = uscript_getScript(c, &status);
+ if (!U_SUCCESS(status)) {
+ return false;
+ }
+
+ return script == USCRIPT_HAN || script == USCRIPT_HIRAGANA ||
+ script == USCRIPT_KATAKANA || script == USCRIPT_HANGUL;
+ }
+
+ bool is_token_char(UChar32 c) override {
+ if (_config.tokenize_on_space && u_isspace(c)) {
+ return false;
+ }
+ if (_config.tokenize_on_letter && u_isalpha(c)) {
+ return false;
+ }
+ if (_config.tokenize_on_digit && u_isdigit(c)) {
+ return false;
+ }
+ if (_config.tokenize_on_punctuation && u_ispunct(c)) {
+ return false;
+ }
+ if (_config.tokenize_on_symbol) {
+ int8_t char_type = u_charType(c);
+ if (char_type == U_MATH_SYMBOL || char_type ==
U_CURRENCY_SYMBOL ||
+ char_type == U_MODIFIER_SYMBOL || char_type ==
U_OTHER_SYMBOL) {
+ return false;
+ }
+ }
+ if (_config.tokenize_on_chars.contains(c)) {
+ return false;
+ }
+ return true;
+ }
+
+ private:
+ CharGroupConfig _config;
+ };
+
+ CharGroupConfig config(_tokenize_on_space, _tokenize_on_letter,
_tokenize_on_digit,
+ _tokenize_on_punctuation, _tokenize_on_symbol,
_tokenize_on_cjk,
+ _tokenize_on_chars);
+
+    auto tokenizer =
std::make_shared<CharGroupTokenizerImpl>(std::move(config));
+    tokenizer->initialize(_max_token_length);
+    return tokenizer;
+}
+
+#include "common/compile_check_end.h"
+} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_group_tokenizer_factory.h
b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_group_tokenizer_factory.h
new file mode 100644
index 00000000000..528e42d9383
--- /dev/null
+++
b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_group_tokenizer_factory.h
@@ -0,0 +1,51 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer_factory.h"
+
+namespace doris::segment_v2::inverted_index {
+#include "common/compile_check_begin.h"
+
+class CharGroupTokenizerFactory : public TokenizerFactory {
+public:
+ CharGroupTokenizerFactory() = default;
+ ~CharGroupTokenizerFactory() override = default;
+
+ void initialize(const Settings& settings) override;
+
+ TokenizerPtr create() override;
+
+private:
+ static UChar32 parse_escaped_char(const icu::UnicodeString& unicode_str);
+
+ std::unordered_set<UChar32> _tokenize_on_chars;
+
+ int32_t _max_token_length = 0;
+
+ bool _tokenize_on_space = false;
+ bool _tokenize_on_letter = false;
+ bool _tokenize_on_digit = false;
+ bool _tokenize_on_punctuation = false;
+ bool _tokenize_on_symbol = false;
+ bool _tokenize_on_cjk = false;
+};
+using CharGroupTokenizerFactoryPtr =
std::shared_ptr<CharGroupTokenizerFactory>;
+
+#include "common/compile_check_end.h"
+} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_tokenizer.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_tokenizer.cpp
new file mode 100644
index 00000000000..f4c14195086
--- /dev/null
+++
b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_tokenizer.cpp
@@ -0,0 +1,93 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "char_tokenizer.h"
+
+#include "common/exception.h"
+
+namespace doris::segment_v2::inverted_index {
+#include "common/compile_check_begin.h"
+
+void CharTokenizer::initialize(int32_t max_token_len) {
+ if (max_token_len > MAX_TOKEN_LENGTH_LIMIT || max_token_len <= 0) {
+ throw Exception(ErrorCode::INVALID_ARGUMENT,
+ "maxTokenLen must be greater than 0 and less than " +
+ std::to_string(MAX_TOKEN_LENGTH_LIMIT) +
+ " passed: " + std::to_string(max_token_len));
+ }
+ _max_token_len = max_token_len;
+}
+
+Token* CharTokenizer::next(Token* token) {
+ if (!token) {
+ return nullptr;
+ }
+
+ int32_t start = -1;
+ int32_t end = -1;
+ while (true) {
+ if (_buffer_index >= _data_len) {
+ if (start == -1) {
+ return nullptr;
+ }
+ break;
+ }
+
+ UChar32 c = U_UNASSIGNED;
+ const int32_t prev_i = _buffer_index;
+ U8_NEXT(_char_buffer, _buffer_index, _data_len, c);
+ if (c < 0) {
+ continue;
+ }
+
+ if (is_cjk_char(c)) {
+ if (start == -1) {
+ start = prev_i;
+ end = _buffer_index - 1;
+ } else {
+ _buffer_index = prev_i;
+ }
+ break;
+ } else if (is_token_char(c)) {
+ if (start == -1) {
+ start = prev_i;
+ }
+ end = _buffer_index - 1;
+ int32_t current_length = end - start + 1;
+ if (current_length >= _max_token_len) {
+ break;
+ }
+ } else if (start != -1) {
+ break;
+ }
+ }
+
+ int32_t length = end - start + 1;
+ std::string_view term(_char_buffer + start, length);
+ set(token, term);
+ return token;
+}
+
+void CharTokenizer::reset() {
+ DorisTokenizer::reset();
+
+ _buffer_index = 0;
+ _data_len = _in->read((const void**)&_char_buffer, 0,
static_cast<int32_t>(_in->size()));
+}
+
+#include "common/compile_check_end.h"
+} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_tokenizer.h
b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_tokenizer.h
new file mode 100644
index 00000000000..e2701d91afa
--- /dev/null
+++
b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_tokenizer.h
@@ -0,0 +1,50 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer.h"
+
+namespace doris::segment_v2::inverted_index {
+#include "common/compile_check_begin.h"
+
+class CharTokenizer : public DorisTokenizer {
+public:
+ CharTokenizer() = default;
+ ~CharTokenizer() override = default;
+
+ void initialize(int32_t max_token_len);
+ Token* next(Token* token) override;
+ void reset() override;
+
+ virtual bool is_cjk_char(UChar32 c) = 0;
+ virtual bool is_token_char(UChar32 c) = 0;
+
+ static constexpr int32_t DEFAULT_MAX_WORD_LEN = 255;
+
+private:
+ static constexpr int32_t MAX_TOKEN_LENGTH_LIMIT = 16383;
+
+ int32_t _max_token_len = 0;
+
+ int32_t _buffer_index = 0;
+ int32_t _data_len = 0;
+ const char* _char_buffer = nullptr;
+};
+
+#include "common/compile_check_end.h"
+} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer.h
b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer.h
index dbe05087ca7..708c060e97a 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer.h
@@ -19,8 +19,6 @@
#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer.h"
-using namespace lucene::analysis;
-
namespace doris::segment_v2::inverted_index {
#include "common/compile_check_begin.h"
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/util/string_helper.h
b/be/src/olap/rowset/segment_v2/inverted_index/util/string_helper.h
new file mode 100644
index 00000000000..a695ebc5e8b
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/util/string_helper.h
@@ -0,0 +1,37 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <boost/locale.hpp>
+
+namespace doris::segment_v2::inverted_index {
+#include "common/compile_check_begin.h"
+
+class StringHelper {
+public:
+ static std::wstring to_wstring(const std::string& str) {
+ return boost::locale::conv::utf_to_utf<wchar_t>(str);
+ }
+
+ static std::string to_string(const std::wstring& wstr) {
+ return boost::locale::conv::utf_to_utf<char>(wstr);
+ }
+};
+
+#include "common/compile_check_end.h"
+} // namespace doris::segment_v2::inverted_index
diff --git
a/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/char_group_tokenizer_factory_test.cpp
b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/char_group_tokenizer_factory_test.cpp
new file mode 100644
index 00000000000..7d2782098a1
--- /dev/null
+++
b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/char_group_tokenizer_factory_test.cpp
@@ -0,0 +1,292 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include
"olap/rowset/segment_v2/inverted_index/tokenizer/char/char_group_tokenizer_factory.h"
+
+#include <gtest/gtest.h>
+
+#include "olap/rowset/segment_v2/inverted_index/setting.h"
+
+namespace doris::segment_v2::inverted_index {
+
+class CharGroupTokenizerTest : public ::testing::Test {
+protected:
+ std::vector<std::string> tokenize(CharGroupTokenizerFactory& factory,
const std::string& text) {
+ std::vector<std::string> tokens;
+ auto tokenizer = factory.create();
+ {
+ lucene::util::SStringReader<char> reader;
+ reader.init(text.data(), text.size(), false);
+ tokenizer->set_reader(&reader);
+ tokenizer->reset();
+
+ Token t;
+ while (tokenizer->next(&t)) {
+ std::string term(t.termBuffer<char>(), t.termLength<char>());
+ tokens.emplace_back(term);
+ }
+ }
+ return tokens;
+ }
+};
+
+TEST_F(CharGroupTokenizerTest, DefaultConfiguration) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "Hello World 123!");
+ std::vector<std::string> expected = {"Hello World 123!"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, TokenizeOnSpace) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[whitespace]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "Hello World\tTab\nNewline");
+ std::vector<std::string> expected = {"Hello", "World", "Tab", "Newline"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, TokenizeOnLetter) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[letter]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "Hello123World");
+ std::vector<std::string> expected = {"123"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, TokenizeOnDigit) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[digit]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "abc123def456");
+ std::vector<std::string> expected = {"abc", "def"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, TokenizeOnPunctuation) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[punctuation]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "Hello,World!Test?End");
+ std::vector<std::string> expected = {"Hello", "World", "Test", "End"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, TokenizeOnSymbol) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[symbol]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "Hello$World+Test@End");
+ std::vector<std::string> expected = {"Hello", "World", "Test@End"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, TokenizeOnCustomChars) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[-], [_]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "hello-world_test");
+ std::vector<std::string> expected = {"hello", "world", "test"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, EscapedChars) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[\\n], [\\t], [\\r]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "hello\nworld\ttest\rend");
+ std::vector<std::string> expected = {"hello", "world", "test", "end"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, MaxTokenLength) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("max_token_length", "5");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "hello verylongword world");
+ std::vector<std::string> expected = {"hello", " very", "longw", "ord w",
"orld"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, CombinedConfiguration) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[-], [_]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "hello-world test_case, end!");
+ std::vector<std::string> expected = {"hello", "world test", "case, end!"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, UnicodeCharacters) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[whitespace]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "你好 世界 测试");
+ std::vector<std::string> expected = {"你好", "世界", "测试"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, ChinesePunctuation) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[,], [。], [!], [?]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "你好,世界!测试?结束。");
+ std::vector<std::string> expected = {"你好", "世界", "测试", "结束"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, EmptyString) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "");
+ ASSERT_TRUE(tokens.empty());
+}
+
+TEST_F(CharGroupTokenizerTest, OnlyDelimiters) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[whitespace], [punctuation]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, " !!! ??? ");
+ ASSERT_TRUE(tokens.empty());
+}
+
+TEST_F(CharGroupTokenizerTest, SingleCharacter) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "a");
+ std::vector<std::string> expected = {"a"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, LongText) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[whitespace]");
+ factory.initialize(settings);
+
+ std::string long_text;
+ for (int i = 0; i < 1000; ++i) {
+ long_text += "word" + std::to_string(i) + " ";
+ }
+
+ auto tokens = tokenize(factory, long_text);
+ ASSERT_EQ(tokens.size(), 1000);
+ ASSERT_EQ(tokens[0], "word0");
+ ASSERT_EQ(tokens[999], "word999");
+}
+
+TEST_F(CharGroupTokenizerTest, ConsecutiveDelimiters) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[whitespace], [-]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "hello---world test");
+ std::vector<std::string> expected = {"hello", "world", "test"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, VeryLongToken) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("max_token_length", "10");
+ factory.initialize(settings);
+
+ std::string very_long_word(50, 'a');
+ auto tokens = tokenize(factory, very_long_word);
+ ASSERT_EQ(tokens.size(), 5);
+ ASSERT_EQ(tokens[0].length(), 10);
+ ASSERT_EQ(tokens[0], std::string(10, 'a'));
+}
+
+TEST_F(CharGroupTokenizerTest, SpecialUnicodeSymbols) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[symbol]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "hello©world®test™end");
+ std::vector<std::string> expected = {"hello", "world", "test", "end"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, MathSymbols) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[symbol]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "a+b=c×d÷e∑f");
+ std::vector<std::string> expected = {"a", "b", "c", "d", "e", "f"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, CurrencySymbols) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[symbol]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "100$200€300¥400");
+ std::vector<std::string> expected = {"100", "200", "300", "400"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, CJKWithEnglishAndDigits) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[cjk]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "abc中文123");
+ std::vector<std::string> expected = {"abc", "中", "文", "123"};
+ ASSERT_EQ(tokens, expected);
+}
+
+} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/CharGroupTokenizerValidator.java
b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/CharGroupTokenizerValidator.java
new file mode 100644
index 00000000000..389c576365e
--- /dev/null
+++
b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/CharGroupTokenizerValidator.java
@@ -0,0 +1,92 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.indexpolicy;
+
+import org.apache.doris.common.DdlException;
+
+import com.google.common.collect.ImmutableSet;
+
+import java.util.Map;
+import java.util.Set;
+
+public class CharGroupTokenizerValidator extends BasePolicyValidator {
+ private static final Set<String> ALLOWED_PROPS = ImmutableSet.of(
+ "type", "max_token_length", "tokenize_on_chars");
+
+ private static final Set<String> VALID_CHAR_TYPES = ImmutableSet.of(
+ "letter", "digit", "whitespace", "punctuation", "symbol", "cjk");
+
+ public CharGroupTokenizerValidator() {
+ super(ALLOWED_PROPS);
+ }
+
+ @Override
+ protected String getTypeName() {
+ return "char group tokenizer";
+ }
+
+ @Override
+ protected void validateSpecific(Map<String, String> props) throws
DdlException {
+ // max_token_length
+ if (props.containsKey("max_token_length")) {
+ try {
+ int v = Integer.parseInt(props.get("max_token_length"));
+ if (v <= 0) {
+ throw new DdlException("max_token_length must be a
positive integer (default: 255)");
+ }
+ } catch (NumberFormatException e) {
+ throw new DdlException("max_token_length must be a positive
integer (default: 255)");
+ }
+ }
+
+ // tokenize_on_chars: only check bracketed format and non-empty content
+ if (props.containsKey("tokenize_on_chars")) {
+ String raw = props.get("tokenize_on_chars");
+ if (raw == null || raw.trim().isEmpty()) {
+ throw new DdlException("tokenize_on_chars cannot be empty if
specified");
+ }
+ String[] items = raw.split("\\s*,\\s*");
+ for (String item : items) {
+ String trimmed = item.trim();
+ if (!trimmed.startsWith("[") || !trimmed.endsWith("]")) {
+ throw new DdlException("Each item in tokenize_on_chars
must be enclosed in square brackets. "
+ + "Invalid item: " + item);
+ }
+ String content = trimmed.substring(1, trimmed.length() - 1);
+ if (content.length() == 0) {
+ throw new DdlException("tokenize_on_chars cannot contain
empty items: " + item);
+ }
+ validateTokenizeOnCharsContent(content, item);
+ }
+ }
+ }
+
+ private void validateTokenizeOnCharsContent(String content, String
originalItem) throws DdlException {
+ if (VALID_CHAR_TYPES.contains(content)) {
+ return;
+ }
+ if (content.startsWith("\\")) {
+ return;
+ }
+ if (content.codePointCount(0, content.length()) != 1) {
+ throw new DdlException("Invalid tokenize_on_chars item: " +
originalItem + ". "
+ + "Content must be either a valid character type (" +
VALID_CHAR_TYPES + "), "
+ + "an escaped character (starting with \\), or a single
unicode character.");
+ }
+ }
+}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
index b410487dc39..d3be0f48f85 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
@@ -242,6 +242,9 @@ public class IndexPolicyMgr implements Writable,
GsonPostProcessable {
case "keyword":
validator = new KeywordTokenizerValidator();
break;
+ case "char_group":
+ validator = new CharGroupTokenizerValidator();
+ break;
default:
throw new DdlException("Unsupported tokenizer type: " + type
+ ". Supported types: " +
IndexPolicy.BUILTIN_TOKENIZERS);
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/indexpolicy/PolicyValidatorTests.java
b/fe/fe-core/src/test/java/org/apache/doris/indexpolicy/PolicyValidatorTests.java
index d2161b6a61c..e48432bd98f 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/indexpolicy/PolicyValidatorTests.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/indexpolicy/PolicyValidatorTests.java
@@ -200,4 +200,24 @@ public class PolicyValidatorTests {
() -> validator.validate(props));
Assertions.assertTrue(exception.getMessage().contains("does not
support parameter"));
}
+
+ @Test
+ public void testCharGroupTokenizer_ValidProperties() throws Exception {
+ CharGroupTokenizerValidator validator = new
CharGroupTokenizerValidator();
+ Map<String, String> props = new HashMap<>();
+ props.put("max_token_length", "255");
+ props.put("tokenize_on_chars", "[whitespace], [punctuation]");
+ validator.validate(props); // Should not throw
+ }
+
+ @Test
+ public void testCharGroupTokenizer_InvalidTokenizeOnChars_NoBrackets() {
+ CharGroupTokenizerValidator validator = new
CharGroupTokenizerValidator();
+ Map<String, String> props = new HashMap<>();
+ props.put("tokenize_on_chars", "[whitespace], punctuation"); // second
item missing brackets
+
+ Exception exception = Assertions.assertThrows(DdlException.class,
+ () -> validator.validate(props));
+ Assertions.assertTrue(exception.getMessage().contains("enclosed in
square brackets"));
+ }
}
diff --git
a/regression-test/data/inverted_index_p0/analyzer/test_char_group_tokenizer.out
b/regression-test/data/inverted_index_p0/analyzer/test_char_group_tokenizer.out
new file mode 100644
index 00000000000..29f00892a5c
Binary files /dev/null and
b/regression-test/data/inverted_index_p0/analyzer/test_char_group_tokenizer.out
differ
diff --git
a/regression-test/suites/inverted_index_p0/analyzer/test_char_group_tokenizer.groovy
b/regression-test/suites/inverted_index_p0/analyzer/test_char_group_tokenizer.groovy
new file mode 100644
index 00000000000..4eaa52664c5
--- /dev/null
+++
b/regression-test/suites/inverted_index_p0/analyzer/test_char_group_tokenizer.groovy
@@ -0,0 +1,131 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import java.sql.SQLException
+
suite("test_char_group_tokenizer", "p0") {
    // Regression test for the char_group tokenizer: creates tokenizers/analyzers,
    // verifies tokenize() output, then exercises them through an inverted index.
    def tbl = "test_char_group_tokenizer_tbl"

    // 1) Basic whitespace + punctuation splitting
    sql """
        CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS char_group_ws_punct_tokenizer
        PROPERTIES
        (
            "type" = "char_group",
            "tokenize_on_chars" = "[whitespace], [punctuation]"
        );
    """

    sql """
        CREATE INVERTED INDEX ANALYZER IF NOT EXISTS char_group_ws_punct_analyzer
        PROPERTIES
        (
            "tokenizer" = "char_group_ws_punct_tokenizer"
        );
    """

    // 2) Split CJK characters (each CJK char becomes a token when enabled)
    sql """
        CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS char_group_cjk_tokenizer
        PROPERTIES
        (
            "type" = "char_group",
            "tokenize_on_chars" = "[cjk]"
        );
    """

    sql """
        CREATE INVERTED INDEX ANALYZER IF NOT EXISTS char_group_cjk_analyzer
        PROPERTIES
        (
            "tokenizer" = "char_group_cjk_tokenizer"
        );
    """

    // 3) Custom chars and escaped chars: split on '-' and '_' and also newline/tab/carriage return
    sql """
        CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS char_group_custom_tokenizer
        PROPERTIES
        (
            "type" = "char_group",
            "tokenize_on_chars" = "[-], [_], [\\n], [\\t], [\\r]"
        );
    """

    sql """
        CREATE INVERTED INDEX ANALYZER IF NOT EXISTS char_group_custom_analyzer
        PROPERTIES
        (
            "tokenizer" = "char_group_custom_tokenizer"
        );
    """

    // Wait for analyzers to be ready
    // NOTE(review): fixed 10s sleep — presumably enough for metadata propagation
    // in CI; confirm there is no explicit readiness check to use instead.
    sql """ select sleep(10) """

    // Tokenize checks for whitespace + punctuation
    qt_tokenize_sql """ select tokenize("Hello,World!Test?End", '"analyzer"="char_group_ws_punct_analyzer"'); """
    // Chinese punctuation should split with punctuation enabled
    qt_tokenize_sql """ select tokenize("你好,世界!测试?结束。", '"analyzer"="char_group_ws_punct_analyzer"'); """

    // CJK split behavior mixed with ASCII and digits
    qt_tokenize_sql """ select tokenize("abc中文123", '"analyzer"="char_group_cjk_analyzer"'); """

    // Custom and escaped characters
    qt_tokenize_sql """ select tokenize("hello-world_test", '"analyzer"="char_group_custom_analyzer"'); """
    qt_tokenize_sql """ select tokenize("hello\nworld\ttest\rend", '"analyzer"="char_group_custom_analyzer"'); """

    // Create a table to validate integration with inverted index + analyzer
    sql "DROP TABLE IF EXISTS ${tbl}"
    sql """
        CREATE TABLE ${tbl} (
            `id` bigint NOT NULL AUTO_INCREMENT(1),
            `ch` text NULL,
            INDEX idx_ch (`ch`) USING INVERTED PROPERTIES("support_phrase" = "true", "analyzer" = "char_group_ws_punct_analyzer")
        ) ENGINE=OLAP
        DUPLICATE KEY(`id`)
        DISTRIBUTED BY RANDOM BUCKETS 1
        PROPERTIES (
            "replication_allocation" = "tag.location.default: 1"
        );
    """

    sql """ insert into ${tbl} values (1, "Hello,World!Test?End"); """
    sql """ insert into ${tbl} values (2, "你好,世界!测试?结束。"); """
    sql """ insert into ${tbl} values (3, "abc中文123"); """

    try {
        sql "sync"
        // Force expression pushdown so MATCH is evaluated via the inverted index.
        sql """ set enable_common_expr_pushdown = true; """

        // Match queries leveraging the analyzer
        qt_sql """ select id, ch from ${tbl} where ch match 'World'; """
        qt_sql """ select id, ch from ${tbl} where ch match '世界'; """
        qt_sql """ select id, ch from ${tbl} where ch match 'Test'; """
    } finally {
        // keep objects for further cases if needed
    }

    // Optional cleanup for analyzers (skip if used by index)
    try {
        sql "drop inverted index analyzer char_group_ws_punct_analyzer"
        sql "drop inverted index analyzer char_group_cjk_analyzer"
        sql "drop inverted index analyzer char_group_custom_analyzer"
    } catch (SQLException e) {
        // It may be used by index; ignore
    }
}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]