This is an automated email from the ASF dual-hosted git repository.
morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new 2a1adc65c32 branch-3.1: [feature](inverted index) add char group
tokenizer #54479 (#54616)
2a1adc65c32 is described below
commit 2a1adc65c32ffef37baab6946acd23ab23450926
Author: zzzxl <[email protected]>
AuthorDate: Wed Aug 13 12:11:23 2025 +0800
branch-3.1: [feature](inverted index) add char group tokenizer #54479
(#54616)
pick #54479
---
.../inverted_index/analysis_factory_mgr.cpp | 3 +
.../char/char_group_tokenizer_factory.cpp | 158 +++++++++++
.../tokenizer/char/char_group_tokenizer_factory.h | 51 ++++
.../tokenizer/char/char_tokenizer.cpp | 93 +++++++
.../inverted_index/tokenizer/char/char_tokenizer.h | 50 ++++
.../tokenizer/keyword/keyword_tokenizer.h | 2 -
.../segment_v2/inverted_index/util/string_helper.h | 37 +++
.../char_group_tokenizer_factory_test.cpp | 292 +++++++++++++++++++++
.../indexpolicy/CharGroupTokenizerValidator.java | 92 +++++++
.../apache/doris/indexpolicy/IndexPolicyMgr.java | 3 +
.../doris/indexpolicy/PolicyValidatorTests.java | 20 ++
.../analyzer/test_char_group_tokenizer.out | Bin 0 -> 999 bytes
.../analyzer/test_char_group_tokenizer.groovy | 131 +++++++++
13 files changed, 930 insertions(+), 2 deletions(-)
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.cpp
index 6b2904ec7f0..51585e5580b 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.cpp
@@ -20,6 +20,7 @@
#include
"olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter_factory.h"
#include
"olap/rowset/segment_v2/inverted_index/token_filter/lower_case_filter_factory.h"
#include
"olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory.h"
+#include
"olap/rowset/segment_v2/inverted_index/tokenizer/char/char_group_tokenizer_factory.h"
#include
"olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer_factory.h"
#include
"olap/rowset/segment_v2/inverted_index/tokenizer/ngram/edge_ngram_tokenizer_factory.h"
#include
"olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer_factory.h"
@@ -35,6 +36,8 @@ void AnalysisFactoryMgr::initialise() {
registerFactory("ngram", []() { return
std::make_shared<NGramTokenizerFactory>(); });
registerFactory("edge_ngram",
[]() { return
std::make_shared<EdgeNGramTokenizerFactory>(); });
+ registerFactory("char_group",
+ []() { return
std::make_shared<CharGroupTokenizerFactory>(); });
// token_filter
registerFactory("lowercase", []() { return
std::make_shared<LowerCaseFilterFactory>(); });
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_group_tokenizer_factory.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_group_tokenizer_factory.cpp
new file mode 100644
index 00000000000..7528cb5e4f4
--- /dev/null
+++
b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_group_tokenizer_factory.cpp
@@ -0,0 +1,158 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "char_group_tokenizer_factory.h"
+
+#include <unicode/uscript.h>
+
+#include "common/exception.h"
+#include
"olap/rowset/segment_v2/inverted_index/tokenizer/char/char_tokenizer.h"
+#include "olap/rowset/segment_v2/inverted_index/util/string_helper.h"
+
+namespace doris::segment_v2::inverted_index {
+#include "common/compile_check_begin.h"
+
+void CharGroupTokenizerFactory::initialize(const Settings& settings) {
+ _max_token_length = settings.get_int("max_token_length",
CharTokenizer::DEFAULT_MAX_WORD_LEN);
+
+ for (const auto& str : settings.get_entry_list("tokenize_on_chars")) {
+ if (str.length() == 0) {
+ throw Exception(ErrorCode::INVALID_ARGUMENT,
+ "[tokenize_on_chars] cannot contain empty
characters");
+ }
+
+ icu::UnicodeString unicode_str = icu::UnicodeString::fromUTF8(str);
+
+ if (unicode_str.countChar32() == 1) {
+ _tokenize_on_chars.insert(unicode_str.char32At(0));
+ } else if (unicode_str.length() > 0 && unicode_str.charAt(0) == '\\') {
+ _tokenize_on_chars.insert(parse_escaped_char(unicode_str));
+ } else {
+ if (str == "letter") {
+ _tokenize_on_letter = true;
+ } else if (str == "digit") {
+ _tokenize_on_digit = true;
+ } else if (str == "whitespace") {
+ _tokenize_on_space = true;
+ } else if (str == "punctuation") {
+ _tokenize_on_punctuation = true;
+ } else if (str == "symbol") {
+ _tokenize_on_symbol = true;
+ } else if (str == "cjk") {
+ _tokenize_on_cjk = true;
+ } else {
+ throw Exception(ErrorCode::INVALID_ARGUMENT,
+ "Invalid escaped char in [" + str + "]");
+ }
+ }
+ }
+}
+
+UChar32 CharGroupTokenizerFactory::parse_escaped_char(const
icu::UnicodeString& unicode_str) {
+ icu::UnicodeString unescaped = unicode_str.unescape();
+
+ if (unescaped.countChar32() != 1) {
+ std::string s;
+ unicode_str.toUTF8String(s);
+ throw Exception(ErrorCode::INVALID_ARGUMENT, "Invalid escaped char ["
+ s + "]");
+ }
+
+ return unescaped.char32At(0);
+}
+
+TokenizerPtr CharGroupTokenizerFactory::create() {
+ struct CharGroupConfig {
+ bool tokenize_on_space = false;
+ bool tokenize_on_letter = false;
+ bool tokenize_on_digit = false;
+ bool tokenize_on_punctuation = false;
+ bool tokenize_on_symbol = false;
+ bool tokenize_on_cjk = false;
+ std::unordered_set<UChar32> tokenize_on_chars;
+
+ CharGroupConfig(bool space, bool letter, bool digit, bool punct, bool
symbol, bool cjk,
+ std::unordered_set<UChar32> chars)
+ : tokenize_on_space(space),
+ tokenize_on_letter(letter),
+ tokenize_on_digit(digit),
+ tokenize_on_punctuation(punct),
+ tokenize_on_symbol(symbol),
+ tokenize_on_cjk(cjk),
+ tokenize_on_chars(std::move(chars)) {}
+ };
+
+ class CharGroupTokenizerImpl : public CharTokenizer {
+ public:
+ CharGroupTokenizerImpl(CharGroupConfig config) :
_config(std::move(config)) {}
+ ~CharGroupTokenizerImpl() override = default;
+
+ bool is_cjk_char(UChar32 c) override {
+ if (!_config.tokenize_on_cjk) {
+ return false;
+ }
+
+ UErrorCode status = U_ZERO_ERROR;
+ UScriptCode script = uscript_getScript(c, &status);
+ if (!U_SUCCESS(status)) {
+ return false;
+ }
+
+ return script == USCRIPT_HAN || script == USCRIPT_HIRAGANA ||
+ script == USCRIPT_KATAKANA || script == USCRIPT_HANGUL;
+ }
+
+ bool is_token_char(UChar32 c) override {
+ if (_config.tokenize_on_space && u_isspace(c)) {
+ return false;
+ }
+ if (_config.tokenize_on_letter && u_isalpha(c)) {
+ return false;
+ }
+ if (_config.tokenize_on_digit && u_isdigit(c)) {
+ return false;
+ }
+ if (_config.tokenize_on_punctuation && u_ispunct(c)) {
+ return false;
+ }
+ if (_config.tokenize_on_symbol) {
+ int8_t char_type = u_charType(c);
+ if (char_type == U_MATH_SYMBOL || char_type ==
U_CURRENCY_SYMBOL ||
+ char_type == U_MODIFIER_SYMBOL || char_type ==
U_OTHER_SYMBOL) {
+ return false;
+ }
+ }
+ if (_config.tokenize_on_chars.contains(c)) {
+ return false;
+ }
+ return true;
+ }
+
+ private:
+ CharGroupConfig _config;
+ };
+
+ CharGroupConfig config(_tokenize_on_space, _tokenize_on_letter,
_tokenize_on_digit,
+ _tokenize_on_punctuation, _tokenize_on_symbol,
_tokenize_on_cjk,
+ _tokenize_on_chars);
+
+    auto tokenizer =
std::make_shared<CharGroupTokenizerImpl>(std::move(config));
+    tokenizer->initialize(_max_token_length);
+    return tokenizer;
+}
+
+#include "common/compile_check_end.h"
+} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_group_tokenizer_factory.h
b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_group_tokenizer_factory.h
new file mode 100644
index 00000000000..528e42d9383
--- /dev/null
+++
b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_group_tokenizer_factory.h
@@ -0,0 +1,51 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer_factory.h"
+
+namespace doris::segment_v2::inverted_index {
+#include "common/compile_check_begin.h"
+
+class CharGroupTokenizerFactory : public TokenizerFactory {
+public:
+ CharGroupTokenizerFactory() = default;
+ ~CharGroupTokenizerFactory() override = default;
+
+ void initialize(const Settings& settings) override;
+
+ TokenizerPtr create() override;
+
+private:
+ static UChar32 parse_escaped_char(const icu::UnicodeString& unicode_str);
+
+ std::unordered_set<UChar32> _tokenize_on_chars;
+
+ int32_t _max_token_length = 0;
+
+ bool _tokenize_on_space = false;
+ bool _tokenize_on_letter = false;
+ bool _tokenize_on_digit = false;
+ bool _tokenize_on_punctuation = false;
+ bool _tokenize_on_symbol = false;
+ bool _tokenize_on_cjk = false;
+};
+using CharGroupTokenizerFactoryPtr =
std::shared_ptr<CharGroupTokenizerFactory>;
+
+#include "common/compile_check_end.h"
+} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_tokenizer.cpp
b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_tokenizer.cpp
new file mode 100644
index 00000000000..f4c14195086
--- /dev/null
+++
b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_tokenizer.cpp
@@ -0,0 +1,93 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "char_tokenizer.h"
+
+#include "common/exception.h"
+
+namespace doris::segment_v2::inverted_index {
+#include "common/compile_check_begin.h"
+
+void CharTokenizer::initialize(int32_t max_token_len) {
+ if (max_token_len > MAX_TOKEN_LENGTH_LIMIT || max_token_len <= 0) {
+ throw Exception(ErrorCode::INVALID_ARGUMENT,
+ "maxTokenLen must be greater than 0 and less than " +
+ std::to_string(MAX_TOKEN_LENGTH_LIMIT) +
+ " passed: " + std::to_string(max_token_len));
+ }
+ _max_token_len = max_token_len;
+}
+
+Token* CharTokenizer::next(Token* token) {
+ if (!token) {
+ return nullptr;
+ }
+
+ int32_t start = -1;
+ int32_t end = -1;
+ while (true) {
+ if (_buffer_index >= _data_len) {
+ if (start == -1) {
+ return nullptr;
+ }
+ break;
+ }
+
+ UChar32 c = U_UNASSIGNED;
+ const int32_t prev_i = _buffer_index;
+ U8_NEXT(_char_buffer, _buffer_index, _data_len, c);
+ if (c < 0) {
+ continue;
+ }
+
+ if (is_cjk_char(c)) {
+ if (start == -1) {
+ start = prev_i;
+ end = _buffer_index - 1;
+ } else {
+ _buffer_index = prev_i;
+ }
+ break;
+ } else if (is_token_char(c)) {
+ if (start == -1) {
+ start = prev_i;
+ }
+ end = _buffer_index - 1;
+ int32_t current_length = end - start + 1;
+ if (current_length >= _max_token_len) {
+ break;
+ }
+ } else if (start != -1) {
+ break;
+ }
+ }
+
+ int32_t length = end - start + 1;
+ std::string_view term(_char_buffer + start, length);
+ set(token, term);
+ return token;
+}
+
+void CharTokenizer::reset() {
+ DorisTokenizer::reset();
+
+ _buffer_index = 0;
+ _data_len = _in->read((const void**)&_char_buffer, 0,
static_cast<int32_t>(_in->size()));
+}
+
+#include "common/compile_check_end.h"
+} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_tokenizer.h
b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_tokenizer.h
new file mode 100644
index 00000000000..e2701d91afa
--- /dev/null
+++
b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/char/char_tokenizer.h
@@ -0,0 +1,50 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer.h"
+
+namespace doris::segment_v2::inverted_index {
+#include "common/compile_check_begin.h"
+
+class CharTokenizer : public DorisTokenizer {
+public:
+ CharTokenizer() = default;
+ ~CharTokenizer() override = default;
+
+ void initialize(int32_t max_token_len);
+ Token* next(Token* token) override;
+ void reset() override;
+
+ virtual bool is_cjk_char(UChar32 c) = 0;
+ virtual bool is_token_char(UChar32 c) = 0;
+
+ static constexpr int32_t DEFAULT_MAX_WORD_LEN = 255;
+
+private:
+ static constexpr int32_t MAX_TOKEN_LENGTH_LIMIT = 16383;
+
+ int32_t _max_token_len = 0;
+
+ int32_t _buffer_index = 0;
+ int32_t _data_len = 0;
+ const char* _char_buffer = nullptr;
+};
+
+#include "common/compile_check_end.h"
+} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git
a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer.h
b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer.h
index dbe05087ca7..708c060e97a 100644
---
a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer.h
+++
b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer.h
@@ -19,8 +19,6 @@
#include "olap/rowset/segment_v2/inverted_index/tokenizer/tokenizer.h"
-using namespace lucene::analysis;
-
namespace doris::segment_v2::inverted_index {
#include "common/compile_check_begin.h"
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/util/string_helper.h
b/be/src/olap/rowset/segment_v2/inverted_index/util/string_helper.h
new file mode 100644
index 00000000000..a695ebc5e8b
--- /dev/null
+++ b/be/src/olap/rowset/segment_v2/inverted_index/util/string_helper.h
@@ -0,0 +1,37 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <boost/locale.hpp>
+
+namespace doris::segment_v2::inverted_index {
+#include "common/compile_check_begin.h"
+
+class StringHelper {
+public:
+ static std::wstring to_wstring(const std::string& str) {
+ return boost::locale::conv::utf_to_utf<wchar_t>(str);
+ }
+
+ static std::string to_string(const std::wstring& wstr) {
+ return boost::locale::conv::utf_to_utf<char>(wstr);
+ }
+};
+
+#include "common/compile_check_end.h"
+} // namespace doris::segment_v2::inverted_index
diff --git
a/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/char_group_tokenizer_factory_test.cpp
b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/char_group_tokenizer_factory_test.cpp
new file mode 100644
index 00000000000..7d2782098a1
--- /dev/null
+++
b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/char_group_tokenizer_factory_test.cpp
@@ -0,0 +1,292 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include
"olap/rowset/segment_v2/inverted_index/tokenizer/char/char_group_tokenizer_factory.h"
+
+#include <gtest/gtest.h>
+
+#include "olap/rowset/segment_v2/inverted_index/setting.h"
+
+namespace doris::segment_v2::inverted_index {
+
+class CharGroupTokenizerTest : public ::testing::Test {
+protected:
+ std::vector<std::string> tokenize(CharGroupTokenizerFactory& factory,
const std::string& text) {
+ std::vector<std::string> tokens;
+ auto tokenizer = factory.create();
+ {
+ lucene::util::SStringReader<char> reader;
+ reader.init(text.data(), text.size(), false);
+ tokenizer->set_reader(&reader);
+ tokenizer->reset();
+
+ Token t;
+ while (tokenizer->next(&t)) {
+ std::string term(t.termBuffer<char>(), t.termLength<char>());
+ tokens.emplace_back(term);
+ }
+ }
+ return tokens;
+ }
+};
+
+TEST_F(CharGroupTokenizerTest, DefaultConfiguration) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "Hello World 123!");
+ std::vector<std::string> expected = {"Hello World 123!"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, TokenizeOnSpace) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[whitespace]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "Hello World\tTab\nNewline");
+ std::vector<std::string> expected = {"Hello", "World", "Tab", "Newline"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, TokenizeOnLetter) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[letter]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "Hello123World");
+ std::vector<std::string> expected = {"123"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, TokenizeOnDigit) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[digit]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "abc123def456");
+ std::vector<std::string> expected = {"abc", "def"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, TokenizeOnPunctuation) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[punctuation]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "Hello,World!Test?End");
+ std::vector<std::string> expected = {"Hello", "World", "Test", "End"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, TokenizeOnSymbol) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[symbol]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "Hello$World+Test@End");
+ std::vector<std::string> expected = {"Hello", "World", "Test@End"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, TokenizeOnCustomChars) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[-], [_]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "hello-world_test");
+ std::vector<std::string> expected = {"hello", "world", "test"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, EscapedChars) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[\\n], [\\t], [\\r]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "hello\nworld\ttest\rend");
+ std::vector<std::string> expected = {"hello", "world", "test", "end"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, MaxTokenLength) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("max_token_length", "5");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "hello verylongword world");
+ std::vector<std::string> expected = {"hello", " very", "longw", "ord w",
"orld"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, CombinedConfiguration) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[-], [_]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "hello-world test_case, end!");
+ std::vector<std::string> expected = {"hello", "world test", "case, end!"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, UnicodeCharacters) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[whitespace]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "你好 世界 测试");
+ std::vector<std::string> expected = {"你好", "世界", "测试"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, ChinesePunctuation) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[,], [。], [!], [?]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "你好,世界!测试?结束。");
+ std::vector<std::string> expected = {"你好", "世界", "测试", "结束"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, EmptyString) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "");
+ ASSERT_TRUE(tokens.empty());
+}
+
+TEST_F(CharGroupTokenizerTest, OnlyDelimiters) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[whitespace], [punctuation]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, " !!! ??? ");
+ ASSERT_TRUE(tokens.empty());
+}
+
+TEST_F(CharGroupTokenizerTest, SingleCharacter) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "a");
+ std::vector<std::string> expected = {"a"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, LongText) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[whitespace]");
+ factory.initialize(settings);
+
+ std::string long_text;
+ for (int i = 0; i < 1000; ++i) {
+ long_text += "word" + std::to_string(i) + " ";
+ }
+
+ auto tokens = tokenize(factory, long_text);
+ ASSERT_EQ(tokens.size(), 1000);
+ ASSERT_EQ(tokens[0], "word0");
+ ASSERT_EQ(tokens[999], "word999");
+}
+
+TEST_F(CharGroupTokenizerTest, ConsecutiveDelimiters) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[whitespace], [-]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "hello---world test");
+ std::vector<std::string> expected = {"hello", "world", "test"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, VeryLongToken) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("max_token_length", "10");
+ factory.initialize(settings);
+
+ std::string very_long_word(50, 'a');
+ auto tokens = tokenize(factory, very_long_word);
+ ASSERT_EQ(tokens.size(), 5);
+ ASSERT_EQ(tokens[0].length(), 10);
+ ASSERT_EQ(tokens[0], std::string(10, 'a'));
+}
+
+TEST_F(CharGroupTokenizerTest, SpecialUnicodeSymbols) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[symbol]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "hello©world®test™end");
+ std::vector<std::string> expected = {"hello", "world", "test", "end"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, MathSymbols) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[symbol]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "a+b=c×d÷e∑f");
+ std::vector<std::string> expected = {"a", "b", "c", "d", "e", "f"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, CurrencySymbols) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[symbol]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "100$200€300¥400");
+ std::vector<std::string> expected = {"100", "200", "300", "400"};
+ ASSERT_EQ(tokens, expected);
+}
+
+TEST_F(CharGroupTokenizerTest, CJKWithEnglishAndDigits) {
+ CharGroupTokenizerFactory factory;
+ Settings settings;
+ settings.set("tokenize_on_chars", "[cjk]");
+ factory.initialize(settings);
+
+ auto tokens = tokenize(factory, "abc中文123");
+ std::vector<std::string> expected = {"abc", "中", "文", "123"};
+ ASSERT_EQ(tokens, expected);
+}
+
+} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/CharGroupTokenizerValidator.java
b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/CharGroupTokenizerValidator.java
new file mode 100644
index 00000000000..389c576365e
--- /dev/null
+++
b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/CharGroupTokenizerValidator.java
@@ -0,0 +1,92 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.indexpolicy;
+
+import org.apache.doris.common.DdlException;
+
+import com.google.common.collect.ImmutableSet;
+
+import java.util.Map;
+import java.util.Set;
+
+public class CharGroupTokenizerValidator extends BasePolicyValidator {
+ private static final Set<String> ALLOWED_PROPS = ImmutableSet.of(
+ "type", "max_token_length", "tokenize_on_chars");
+
+ private static final Set<String> VALID_CHAR_TYPES = ImmutableSet.of(
+ "letter", "digit", "whitespace", "punctuation", "symbol", "cjk");
+
+ public CharGroupTokenizerValidator() {
+ super(ALLOWED_PROPS);
+ }
+
+ @Override
+ protected String getTypeName() {
+ return "char group tokenizer";
+ }
+
+ @Override
+ protected void validateSpecific(Map<String, String> props) throws
DdlException {
+ // max_token_length
+ if (props.containsKey("max_token_length")) {
+ try {
+ int v = Integer.parseInt(props.get("max_token_length"));
+ if (v <= 0) {
+ throw new DdlException("max_token_length must be a
positive integer (default: 255)");
+ }
+ } catch (NumberFormatException e) {
+ throw new DdlException("max_token_length must be a positive
integer (default: 255)");
+ }
+ }
+
+ // tokenize_on_chars: only check bracketed format and non-empty content
+ if (props.containsKey("tokenize_on_chars")) {
+ String raw = props.get("tokenize_on_chars");
+ if (raw == null || raw.trim().isEmpty()) {
+ throw new DdlException("tokenize_on_chars cannot be empty if
specified");
+ }
+ String[] items = raw.split("\\s*,\\s*");
+ for (String item : items) {
+ String trimmed = item.trim();
+ if (!trimmed.startsWith("[") || !trimmed.endsWith("]")) {
+ throw new DdlException("Each item in tokenize_on_chars
must be enclosed in square brackets. "
+ + "Invalid item: " + item);
+ }
+ String content = trimmed.substring(1, trimmed.length() - 1);
+ if (content.length() == 0) {
+ throw new DdlException("tokenize_on_chars cannot contain
empty items: " + item);
+ }
+ validateTokenizeOnCharsContent(content, item);
+ }
+ }
+ }
+
+ private void validateTokenizeOnCharsContent(String content, String
originalItem) throws DdlException {
+ if (VALID_CHAR_TYPES.contains(content)) {
+ return;
+ }
+ if (content.startsWith("\\")) {
+ return;
+ }
+ if (content.codePointCount(0, content.length()) != 1) {
+ throw new DdlException("Invalid tokenize_on_chars item: " +
originalItem + ". "
+ + "Content must be either a valid character type (" +
VALID_CHAR_TYPES + "), "
+ + "an escaped character (starting with \\), or a single
unicode character.");
+ }
+ }
+}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
index b410487dc39..d3be0f48f85 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/IndexPolicyMgr.java
@@ -242,6 +242,9 @@ public class IndexPolicyMgr implements Writable,
GsonPostProcessable {
case "keyword":
validator = new KeywordTokenizerValidator();
break;
+ case "char_group":
+ validator = new CharGroupTokenizerValidator();
+ break;
default:
throw new DdlException("Unsupported tokenizer type: " + type
+ ". Supported types: " +
IndexPolicy.BUILTIN_TOKENIZERS);
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/indexpolicy/PolicyValidatorTests.java
b/fe/fe-core/src/test/java/org/apache/doris/indexpolicy/PolicyValidatorTests.java
index d2161b6a61c..e48432bd98f 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/indexpolicy/PolicyValidatorTests.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/indexpolicy/PolicyValidatorTests.java
@@ -200,4 +200,24 @@ public class PolicyValidatorTests {
() -> validator.validate(props));
Assertions.assertTrue(exception.getMessage().contains("does not
support parameter"));
}
+
+ @Test
+ public void testCharGroupTokenizer_ValidProperties() throws Exception {
+ CharGroupTokenizerValidator validator = new
CharGroupTokenizerValidator();
+ Map<String, String> props = new HashMap<>();
+ props.put("max_token_length", "255");
+ props.put("tokenize_on_chars", "[whitespace], [punctuation]");
+ validator.validate(props); // Should not throw
+ }
+
+ @Test
+ public void testCharGroupTokenizer_InvalidTokenizeOnChars_NoBrackets() {
+ CharGroupTokenizerValidator validator = new
CharGroupTokenizerValidator();
+ Map<String, String> props = new HashMap<>();
+ props.put("tokenize_on_chars", "[whitespace], punctuation"); // second
item missing brackets
+
+ Exception exception = Assertions.assertThrows(DdlException.class,
+ () -> validator.validate(props));
+ Assertions.assertTrue(exception.getMessage().contains("enclosed in
square brackets"));
+ }
}
diff --git
a/regression-test/data/inverted_index_p0/analyzer/test_char_group_tokenizer.out
b/regression-test/data/inverted_index_p0/analyzer/test_char_group_tokenizer.out
new file mode 100644
index 00000000000..29f00892a5c
Binary files /dev/null and
b/regression-test/data/inverted_index_p0/analyzer/test_char_group_tokenizer.out
differ
diff --git
a/regression-test/suites/inverted_index_p0/analyzer/test_char_group_tokenizer.groovy
b/regression-test/suites/inverted_index_p0/analyzer/test_char_group_tokenizer.groovy
new file mode 100644
index 00000000000..4eaa52664c5
--- /dev/null
+++
b/regression-test/suites/inverted_index_p0/analyzer/test_char_group_tokenizer.groovy
@@ -0,0 +1,131 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import java.sql.SQLException
+
suite("test_char_group_tokenizer", "p0") {
    // Regression test for the char_group tokenizer: creates tokenizers/analyzers,
    // verifies tokenize() output, then exercises them through an inverted index.
    def tbl = "test_char_group_tokenizer_tbl"

    // 1) Basic whitespace + punctuation splitting
    sql """
        CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS char_group_ws_punct_tokenizer
        PROPERTIES
        (
            "type" = "char_group",
            "tokenize_on_chars" = "[whitespace], [punctuation]"
        );
    """

    sql """
        CREATE INVERTED INDEX ANALYZER IF NOT EXISTS char_group_ws_punct_analyzer
        PROPERTIES
        (
            "tokenizer" = "char_group_ws_punct_tokenizer"
        );
    """

    // 2) Split CJK characters (each CJK char becomes a token when enabled)
    sql """
        CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS char_group_cjk_tokenizer
        PROPERTIES
        (
            "type" = "char_group",
            "tokenize_on_chars" = "[cjk]"
        );
    """

    sql """
        CREATE INVERTED INDEX ANALYZER IF NOT EXISTS char_group_cjk_analyzer
        PROPERTIES
        (
            "tokenizer" = "char_group_cjk_tokenizer"
        );
    """

    // 3) Custom chars and escaped chars: split on '-' and '_' and also newline/tab/carriage return
    sql """
        CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS char_group_custom_tokenizer
        PROPERTIES
        (
            "type" = "char_group",
            "tokenize_on_chars" = "[-], [_], [\\n], [\\t], [\\r]"
        );
    """

    sql """
        CREATE INVERTED INDEX ANALYZER IF NOT EXISTS char_group_custom_analyzer
        PROPERTIES
        (
            "tokenizer" = "char_group_custom_tokenizer"
        );
    """

    // Wait for analyzers to be ready
    // NOTE(review): fixed 10s sleep — presumably enough for metadata propagation
    // in CI; confirm there is no explicit readiness check to use instead.
    sql """ select sleep(10) """

    // Tokenize checks for whitespace + punctuation
    qt_tokenize_sql """ select tokenize("Hello,World!Test?End", '"analyzer"="char_group_ws_punct_analyzer"'); """
    // Chinese punctuation should split with punctuation enabled
    qt_tokenize_sql """ select tokenize("你好,世界!测试?结束。", '"analyzer"="char_group_ws_punct_analyzer"'); """

    // CJK split behavior mixed with ASCII and digits
    qt_tokenize_sql """ select tokenize("abc中文123", '"analyzer"="char_group_cjk_analyzer"'); """

    // Custom and escaped characters
    qt_tokenize_sql """ select tokenize("hello-world_test", '"analyzer"="char_group_custom_analyzer"'); """
    qt_tokenize_sql """ select tokenize("hello\nworld\ttest\rend", '"analyzer"="char_group_custom_analyzer"'); """

    // Create a table to validate integration with inverted index + analyzer
    sql "DROP TABLE IF EXISTS ${tbl}"
    sql """
        CREATE TABLE ${tbl} (
            `id` bigint NOT NULL AUTO_INCREMENT(1),
            `ch` text NULL,
            INDEX idx_ch (`ch`) USING INVERTED PROPERTIES("support_phrase" = "true", "analyzer" = "char_group_ws_punct_analyzer")
        ) ENGINE=OLAP
        DUPLICATE KEY(`id`)
        DISTRIBUTED BY RANDOM BUCKETS 1
        PROPERTIES (
            "replication_allocation" = "tag.location.default: 1"
        );
    """

    sql """ insert into ${tbl} values (1, "Hello,World!Test?End"); """
    sql """ insert into ${tbl} values (2, "你好,世界!测试?结束。"); """
    sql """ insert into ${tbl} values (3, "abc中文123"); """

    try {
        sql "sync"
        // Force expression pushdown so MATCH is evaluated via the inverted index.
        sql """ set enable_common_expr_pushdown = true; """

        // Match queries leveraging the analyzer
        qt_sql """ select id, ch from ${tbl} where ch match 'World'; """
        qt_sql """ select id, ch from ${tbl} where ch match '世界'; """
        qt_sql """ select id, ch from ${tbl} where ch match 'Test'; """
    } finally {
        // keep objects for further cases if needed
    }

    // Optional cleanup for analyzers (skip if used by index)
    try {
        sql "drop inverted index analyzer char_group_ws_punct_analyzer"
        sql "drop inverted index analyzer char_group_cjk_analyzer"
        sql "drop inverted index analyzer char_group_custom_analyzer"
    } catch (SQLException e) {
        // It may be used by index; ignore
    }
}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]