This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new e47082dcc51 branch-4.0: [fix](inverted index) fix pinyin bug #57756 (#58852)
e47082dcc51 is described below
commit e47082dcc514baba080279fd738f6d00e2709dab
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Wed Dec 10 10:50:04 2025 +0800
branch-4.0: [fix](inverted index) fix pinyin bug #57756 (#58852)
Cherry-picked from #57756
Co-authored-by: Ryan19929 <[email protected]>
---
.../inverted_index/token_filter/pinyin_filter.cpp | 76 +++-
.../token_filter/pinyin_filter_factory.cpp | 8 -
.../tokenizer/pinyin/pinyin_formatter.cpp | 37 +-
.../tokenizer/pinyin/pinyin_tokenizer.cpp | 58 +--
.../tokenizer/pinyin/pinyin_tokenizer.h | 2 +-
.../token_filter/pinyin_filter_test.cpp | 355 +++++++++++++++++
.../tokenizer/pinyin_analysis_test.cpp | 437 +++++++++++++++++++++
.../inverted_index/tokenizer/pinyin_util_test.cpp | 122 ++++++
.../indexpolicy/PinyinTokenFilterValidator.java | 35 +-
.../indexpolicy/PinyinTokenizerValidator.java | 35 +-
.../analyzer/test_custom_analyzer.out | 38 +-
.../analyzer/test_custom_analyzer.groovy | 93 ++++-
12 files changed, 1168 insertions(+), 128 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter.cpp b/be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter.cpp
index 7c9b8579a80..13bf212b21c 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter.cpp
@@ -48,14 +48,6 @@ PinyinFilter::PinyinFilter(const TokenStreamPtr& in, std::shared_ptr<PinyinConfi
if (!config_) {
config_ = std::make_shared<PinyinConfig>();
}
-
- // Validate configuration (same as Java validation)
-    if (!(config_->keepFirstLetter || config_->keepSeparateFirstLetter || config_->keepFullPinyin ||
- config_->keepJoinedFullPinyin || config_->keepSeparateChinese)) {
- throw Exception(ErrorCode::INVALID_ARGUMENT,
- "pinyin config error, can't disable
separate_first_letter, "
- "first_letter and full_pinyin at the same time.");
- }
}
void PinyinFilter::initialize() {
@@ -143,10 +135,26 @@ bool PinyinFilter::readTerm(Token* token) {
}
}
- // Process original text if needed
- if (config_->keepOriginal && !processed_original_) {
+ // Preserve original text if configured or if no candidates were generated
+ // This ensures Unicode symbols (emoji, etc.) are preserved even without keep_original setting
+ // matching Elasticsearch behavior
+ // NOTE: Must be AFTER processCurrentToken() but BEFORE first_letters to maintain correct order
+ if (!processed_original_ && has_current_token_) {
+ bool should_add_original = config_->keepOriginal;
+
+ // For emoji/symbol fallback: check if ANY content was generated (candidates OR pending letters)
+ // If nothing was generated, this is likely an emoji/symbol that should be preserved
+ if (!should_add_original && candidate_.empty() && first_letters_.empty() &&
+ full_pinyin_letters_.empty()) {
+ // No candidates and no pending letters, this is emoji/symbol
+ should_add_original = true;
+ }
+
processed_original_ = true;
- addCandidate(TermItem(current_source_, 0, static_cast<int>(current_source_.length()), 1));
+ if (should_add_original) {
+ addCandidate(
+ TermItem(current_source_, 0, static_cast<int>(current_source_.length()), 1));
+ }
}
// Process joined full pinyin if needed
@@ -231,8 +239,22 @@ bool PinyinFilter::processCurrentToken() {
PinyinUtil::instance().convert(source_codepoints,
PinyinFormat::TONELESS_PINYIN_FORMAT);
auto chinese_list = ChineseUtil::segmentChinese(source_codepoints);
+ // Early return optimization: if no Chinese characters found
if (pinyin_list.empty() && chinese_list.empty()) {
- return false;
+ // Check if there are non-ASCII Unicode characters (like emoji) to preserve
+ bool has_unicode_symbols = false;
+ for (const auto& cp : source_codepoints) {
+ if (cp >= 128) { // Non-ASCII character
+ has_unicode_symbols = true;
+ break;
+ }
+ }
+
+ // If no Unicode symbols, return false and let other filters handle it
+ if (!has_unicode_symbols) {
+ return false;
+ }
+ // Otherwise, continue processing to preserve Unicode symbols
}
// Process each character and generate candidates
@@ -240,7 +262,7 @@ bool PinyinFilter::processCurrentToken() {
std::string first_letters_buffer;
std::string full_pinyin_buffer;
- // Buffer for accumulating ASCII characters (like Java's buff)
+ // Buffer for accumulating ASCII characters
std::string ascii_buffer;
int ascii_buffer_start_pos = -1;
@@ -256,14 +278,27 @@ bool PinyinFilter::processCurrentToken() {
(codepoint >= '0' && codepoint <= '9');
if (is_ascii && is_alnum) {
- // Initialize ASCII buffer if needed
- if (ascii_buffer.empty()) {
- ascii_buffer_start_pos = static_cast<int>(i);
+ // Check if we should process ASCII characters individually
+ if (!config_->keepNoneChineseTogether && config_->keepNoneChinese) {
+ // Process accumulated ASCII buffer before processing individual character
+ if (!ascii_buffer.empty()) {
+ processAsciiBuffer(ascii_buffer, ascii_buffer_start_pos, static_cast<int>(i));
+ ascii_buffer.clear();
+ ascii_buffer_start_pos = -1;
+ }
+ // Process individual ASCII character immediately
+ position_++;
+ std::string single_char(1, static_cast<char>(codepoint));
+ addCandidate(TermItem(single_char, static_cast<int>(i), static_cast<int>(i + 1),
+ position_));
+ } else {
+ // Accumulate ASCII characters for later processing
+ if (ascii_buffer.empty()) {
+ ascii_buffer_start_pos = static_cast<int>(i);
+ }
+ ascii_buffer += static_cast<char>(codepoint);
}
- // Accumulate ASCII characters
- ascii_buffer += static_cast<char>(codepoint);
-
// Handle ASCII alphanumeric characters for first letters
if (config_->keepNoneChineseInFirstLetter) {
first_letters_buffer += static_cast<char>(codepoint);
@@ -311,6 +346,9 @@ bool PinyinFilter::processCurrentToken() {
}
}
}
+ // For non-ASCII, non-Chinese characters (e.g., emoji, symbols),
+ // we don't add them to candidate. They will only be kept if the fallback
+ // mechanism is triggered (when candidate_ is empty).
}
}
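The fallback described in the comments above reduces to a small predicate. A minimal standalone sketch of that rule (hypothetical helper, not code from this patch): a token is preserved verbatim only when it produced no pinyin output and contains at least one non-ASCII codepoint.

    // Sketch only; mirrors the fallback comments in the hunk above.
    bool should_preserve_original(const std::vector<UChar32>& codepoints, bool generated_output) {
        if (generated_output) {
            return false; // pinyin candidates or pending letters exist; no fallback needed
        }
        for (UChar32 cp : codepoints) {
            if (cp >= 128) {
                return true; // non-ASCII (emoji, symbol): keep the original token
            }
        }
        return false; // pure ASCII: drop here and let other filters handle it
    }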
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_factory.cpp b/be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_factory.cpp
index 8603b74ce5d..68cb927aa43 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_factory.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_factory.cpp
@@ -52,14 +52,6 @@ void PinyinFilterFactory::initialize(const Settings& settings) {
// Integer parameters
config_->limitFirstLetterLength = settings.get_int("limit_first_letter_length", 16);
-
- // Validate configuration (same validation as Java)
- if (!(config_->keepFirstLetter || config_->keepSeparateFirstLetter || config_->keepFullPinyin ||
- config_->keepJoinedFullPinyin || config_->keepSeparateChinese)) {
- throw Exception(ErrorCode::INVALID_ARGUMENT,
- "pinyin config error, can't disable separate_first_letter, "
- "first_letter and full_pinyin at the same time.");
- }
}
TokenFilterPtr PinyinFilterFactory::create(const TokenStreamPtr& in) {
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_formatter.cpp b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_formatter.cpp
index 8bae9514faa..577613e29fc 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_formatter.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_formatter.cpp
@@ -38,6 +38,25 @@ const std::unordered_map<char, std::array<const char*, 4>> TONE_MARKS = {
};
constexpr const char* VOWELS = "aeiouv";
+
+// Pre-compiled regex patterns (compile once, reuse many times for performance)
+// Only use regex for patterns that require pattern matching (character classes, etc.)
+const std::regex TONE_NUMBER_REGEX("[1-5]");
+const std::regex PINYIN_VALIDATION_REGEX("[a-z]*[1-5]?");
+const std::regex PINYIN_WITH_TONE_REGEX("[a-z]*[1-5]");
+
+// Helper function for simple string replacement (faster than regex for fixed strings)
+inline std::string replaceAll(const std::string& str, const std::string& from,
+ const std::string& to) {
+ if (from.empty()) return str;
+ std::string result = str;
+ size_t pos = 0;
+ while ((pos = result.find(from, pos)) != std::string::npos) {
+ result.replace(pos, from.length(), to);
+ pos += to.length();
+ }
+ return result;
+}
} // namespace
std::string PinyinFormatter::formatPinyin(const std::string& pinyin_str,
@@ -62,10 +81,10 @@ std::string PinyinFormatter::formatPinyin(const std::string& pinyin_str,
switch (working_format.getToneType()) {
case ToneType::WITHOUT_TONE:
- result = std::regex_replace(result, std::regex("[1-5]"), "");
+ result = std::regex_replace(result, TONE_NUMBER_REGEX, "");
break;
case ToneType::WITH_TONE_MARK:
- result = std::regex_replace(result, std::regex("u:"), "v");
+ result = replaceAll(result, "u:", "v");
result = convertToneNumber2ToneMark(result);
break;
case ToneType::WITH_TONE_NUMBER:
@@ -76,10 +95,10 @@ std::string PinyinFormatter::formatPinyin(const std::string& pinyin_str,
if (working_format.getToneType() != ToneType::WITH_TONE_MARK) {
switch (working_format.getYuCharType()) {
case YuCharType::WITH_V:
- result = std::regex_replace(result, std::regex("u:"), "v");
+ result = replaceAll(result, "u:", "v");
break;
case YuCharType::WITH_U_UNICODE:
- result = std::regex_replace(result, std::regex("u:"), "ü");
+ result = replaceAll(result, "u:", "ü");
break;
case YuCharType::WITH_U_AND_COLON:
default:
@@ -143,7 +162,7 @@ std::string PinyinFormatter::convertToneNumber2ToneMark(const std::string& pinyi
std::transform(lower_case_pinyin.begin(), lower_case_pinyin.end(),
lower_case_pinyin.begin(),
[](unsigned char c) { return std::tolower(c); });
- if (!std::regex_match(lower_case_pinyin, std::regex("[a-z]*[1-5]?"))) {
+ if (!std::regex_match(lower_case_pinyin, PINYIN_VALIDATION_REGEX)) {
return lower_case_pinyin;
}
@@ -153,7 +172,7 @@ std::string PinyinFormatter::convertToneNumber2ToneMark(const std::string& pinyi
char unmarked_vowel = DEFAULT_CHAR_VALUE;
int index_of_unmarked_vowel = DEFAULT_INDEX_VALUE;
- if (std::regex_match(lower_case_pinyin, std::regex("[a-z]*[1-5]"))) {
+ if (std::regex_match(lower_case_pinyin, PINYIN_WITH_TONE_REGEX)) {
int tune_number = lower_case_pinyin.back() - '0';
size_t index_of_a = lower_case_pinyin.find('a');
@@ -190,7 +209,7 @@ std::string PinyinFormatter::convertToneNumber2ToneMark(const std::string& pinyi
std::string result;
std::string prefix = lower_case_pinyin.substr(0, index_of_unmarked_vowel);
- result += std::regex_replace(prefix, std::regex("v"), "ü");
+ result += replaceAll(prefix, "v", "ü");
result += marked_vowel;
if (index_of_unmarked_vowel + 1 <
@@ -198,14 +217,14 @@ std::string PinyinFormatter::convertToneNumber2ToneMark(const std::string& pinyi
std::string suffix = lower_case_pinyin.substr(
index_of_unmarked_vowel + 1,
lower_case_pinyin.length() - 1 -
index_of_unmarked_vowel - 1);
- result += std::regex_replace(suffix, std::regex("v"), "ü");
+ result += replaceAll(suffix, "v", "ü");
}
return result;
}
}
} else {
- return std::regex_replace(lower_case_pinyin, std::regex("v"), "ü");
+ return replaceAll(lower_case_pinyin, "v", "ü");
}
return lower_case_pinyin;
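The hunks above replace per-call std::regex construction with file-scope pre-compiled patterns plus a fixed-string splice. A standalone sketch of the trade-off (names are illustrative, not from this patch; assumes the search string is non-empty):

    #include <cassert>
    #include <regex>
    #include <string>

    // Compiled once at namespace scope, like the patch's TONE_NUMBER_REGEX etc.
    static const std::regex U_COLON_REGEX("u:");

    // Fixed-string replacement: no regex engine involved.
    std::string replace_fixed(std::string s, const std::string& from, const std::string& to) {
        for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.length()) {
            s.replace(pos, from.length(), to);
        }
        return s;
    }

    int main() {
        // Both paths agree; the fixed-string one avoids per-call regex compilation.
        assert(replace_fixed("nu:3", "u:", "v") == "nv3");
        assert(std::regex_replace(std::string("nu:3"), U_COLON_REGEX, "v") == "nv3");
        return 0;
    }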
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_tokenizer.cpp b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_tokenizer.cpp
index 57fcff01ce5..c46709c2ab0 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_tokenizer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_tokenizer.cpp
@@ -40,12 +40,6 @@ PinyinTokenizer::PinyinTokenizer(std::shared_ptr<doris::segment_v2::PinyinConfig
if (!config_) {
config_ = std::make_shared<doris::segment_v2::PinyinConfig>();
}
- if (!(config_->keepFirstLetter || config_->keepSeparateFirstLetter || config_->keepFullPinyin ||
- config_->keepJoinedFullPinyin || config_->keepSeparateChinese)) {
- throw Exception(ErrorCode::INVALID_ARGUMENT,
- "pinyin config error, can't disable separate_first_letter, first_letter "
- "and full_pinyin at the same time.");
- "and full_pinyin at the same time.");
- }
candidate_.clear();
terms_filter_.clear();
first_letters_.clear();
@@ -93,7 +87,7 @@ void PinyinTokenizer::processInput() {
}
position_ = 0;
- int ascii_buff_start = -1;
+ int ascii_buff_start_byte = -1;
std::string ascii_buff;
int char_index = 0;
@@ -109,18 +103,15 @@ void PinyinTokenizer::processInput() {
(r.cp >= '0' && r.cp <= '9');
if (is_ascii_context) {
- if (ascii_buff_start < 0) ascii_buff_start = r.byte_start;
+ if (ascii_buff_start_byte < 0) ascii_buff_start_byte = r.byte_start;
if (is_alnum && config_->keepNoneChinese) {
if (config_->keepNoneChineseTogether) {
ascii_buff.push_back(static_cast<char>(r.cp));
} else {
position_++;
- int char_position = position_;
-
- char_position = char_index + 1;
-
std::string single_char(1, static_cast<char>(r.cp));
- addCandidate(single_char, r.byte_start, r.byte_end, char_position);
+ // Use byte offset for single ASCII character
+ addCandidate(single_char, r.byte_start, r.byte_end, char_index + 1);
}
}
if (is_alnum && config_->keepNoneChineseInFirstLetter) {
@@ -131,7 +122,7 @@ void PinyinTokenizer::processInput() {
}
} else {
if (!ascii_buff.empty()) {
- parseBuff(ascii_buff, ascii_buff_start);
+ parseBuff(ascii_buff, ascii_buff_start_byte);
}
bool incr_position = false;
if (!pinyin.empty() && !chinese.empty()) {
@@ -159,20 +150,21 @@ void PinyinTokenizer::processInput() {
}
if (!ascii_buff.empty()) {
- parseBuff(ascii_buff, ascii_buff_start);
+ parseBuff(ascii_buff, ascii_buff_start_byte);
}
}
+ int total_byte_length = runes_.empty() ? 0 : runes_.back().byte_end;
+
if (config_->keepOriginal && !processed_original_) {
processed_original_ = true;
std::string source_utf8 = codepointsToUtf8(source_codepoints_);
- addCandidate(source_utf8, 0, static_cast<int>(source_utf8.length()), 1);
+ addCandidate(source_utf8, 0, total_byte_length, 1);
}
if (config_->keepJoinedFullPinyin && !processed_full_pinyin_letter_ &&
!full_pinyin_letters_.empty()) {
processed_full_pinyin_letter_ = true;
- std::string source_utf8 = codepointsToUtf8(source_codepoints_);
- addCandidate(full_pinyin_letters_, 0, static_cast<int>(source_utf8.length()), 1);
+ addCandidate(full_pinyin_letters_, 0, total_byte_length, 1);
full_pinyin_letters_.clear();
}
if (config_->keepFirstLetter && !first_letters_.empty() && !processed_first_letter_) {
@@ -188,7 +180,7 @@ void PinyinTokenizer::processInput() {
[](unsigned char x) { return static_cast<char>(std::tolower(x)); });
}
if (!(config_->keepSeparateFirstLetter && fl.length() <= 1)) {
- addCandidate(fl, 0, static_cast<int>(fl.length()), 1);
+ addCandidate(fl, 0, total_byte_length, 1);
}
}
@@ -214,8 +206,14 @@ Token* PinyinTokenizer::next(Token* token) {
size_t size = std::min(text.size(), static_cast<size_t>(LUCENE_MAX_WORD_LEN));
token->setNoCopy(text.data(), 0, static_cast<int32_t>(size));
- token->setStartOffset(item.start_offset);
- token->setEndOffset(item.end_offset);
+ if (config_->ignorePinyinOffset) {
+ int total_byte_length = runes_.empty() ? 0 : runes_.back().byte_end;
+ token->setStartOffset(0);
+ token->setEndOffset(total_byte_length);
+ } else {
+ token->setStartOffset(item.start_offset);
+ token->setEndOffset(item.end_offset);
+ }
int offset = item.position - last_increment_position_;
if (offset < 0) offset = 0;
@@ -297,18 +295,22 @@ void PinyinTokenizer::decode_to_runes() {
}
}
-void PinyinTokenizer::parseBuff(std::string& ascii_buff, int& ascii_buff_start) {
+void PinyinTokenizer::parseBuff(std::string& ascii_buff, int& ascii_buff_start_byte) {
if (ascii_buff.empty()) return;
if (!config_->keepNoneChinese) {
ascii_buff.clear();
- ascii_buff_start = -1;
+ ascii_buff_start_byte = -1;
return;
}
- int32_t seg_start = ascii_buff_start;
- int32_t seg_end = seg_start + static_cast<int32_t>(ascii_buff.size());
+
+ // Use byte offset for ASCII buffer
+ // ascii_buff_start_byte is the byte position where the buffer started
+ int32_t buff_byte_size = static_cast<int32_t>(ascii_buff.size());
+ int32_t buff_end_byte = ascii_buff_start_byte + buff_byte_size;
+
if (config_->noneChinesePinyinTokenize) {
std::vector<std::string> result = PinyinAlphabetTokenizer::walk(ascii_buff);
- int32_t start = seg_start;
+ int32_t start = ascii_buff_start_byte;
for (const std::string& t : result) {
int32_t end = config_->fixedPinyinOffset ? start + 1
: start + static_cast<int32_t>(t.length());
@@ -319,10 +321,10 @@ void PinyinTokenizer::parseBuff(std::string& ascii_buff, int& ascii_buff_start)
} else if (config_->keepFirstLetter || config_->keepSeparateFirstLetter ||
config_->keepFullPinyin || !config_->keepNoneChineseInJoinedFullPinyin) {
position_++;
- addCandidate(ascii_buff, seg_start, seg_end, position_);
+ addCandidate(ascii_buff, ascii_buff_start_byte, buff_end_byte, position_);
}
ascii_buff.clear();
- ascii_buff_start = -1;
+ ascii_buff_start_byte = -1;
}
std::string PinyinTokenizer::codepointsToUtf8(const std::vector<UChar32>& codepoints) const {
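The rename above matters because candidate offsets are UTF-8 byte positions, not character indices. A quick standalone check of the convention the tests below rely on (assumes UTF-8 source encoding; not code from this patch):

    #include <cstdio>
    #include <string>

    int main() {
        std::string s = "刘德华"; // three CJK codepoints, three UTF-8 bytes each
        std::printf("%zu\n", s.size()); // prints 9; byte ranges: 刘 [0,3) 德 [3,6) 华 [6,9)
        return 0;
    }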
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_tokenizer.h b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_tokenizer.h
index 5a09b0d3715..ef65874af07 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_tokenizer.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_tokenizer.h
@@ -76,7 +76,7 @@ private:
void decode_to_runes();
void processInput();
- void parseBuff(std::string& ascii_buff, int& ascii_buff_start);
+ void parseBuff(std::string& ascii_buff, int& ascii_buff_start_byte);
std::string codepointsToUtf8(const std::vector<UChar32>& codepoints) const;
};
diff --git a/be/test/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_test.cpp
index 9ffb3462ec1..8662cace8ce 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_test.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_test.cpp
@@ -406,4 +406,359 @@ TEST_F(PinyinFilterTest, TestTokenFilter_OnlyFullPinyin) {
assertTokens(tokens, expected, "Only full pinyin test");
}
+// Test emoji preservation without keep_original setting
+TEST_F(PinyinFilterTest, TestTokenFilter_EmojiPreservation) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false"; // Emoji should still be preserved via
fallback
+ config["keep_none_chinese"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ auto tokens = tokenizeWithFilter("⭐白菜", "standard", config);
+
+ // Standard tokenizer outputs "⭐" and "白菜" as separate tokens
+ // "⭐" -> preserved via fallback (no pinyin candidates)
+ // "白菜" -> "bai", "b", "cai", "c", "bc"
+ std::vector<std::string> expected = {"⭐", "bai", "b", "cai", "c"};
+ assertTokens(tokens, expected, "StandardTokenizer + Emoji preservation");
+}
+
+// Test pure emoji input
+TEST_F(PinyinFilterTest, TestTokenFilter_PureEmoji) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false";
+ config["keep_none_chinese"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ auto tokens = tokenizeWithFilter("⭐", "standard", config);
+
+ std::vector<std::string> expected = {"⭐"};
+ assertTokens(tokens, expected, "Pure emoji should be preserved");
+}
+
+// Test multiple emojis
+TEST_F(PinyinFilterTest, TestTokenFilter_MultipleEmojis) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false";
+ config["keep_none_chinese"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ auto tokens = tokenizeWithFilter("🎉中国🚀", "standard", config);
+
+ std::vector<std::string> expected = {"🎉", "zhong", "z", "guo", "g", "🚀"};
+ assertTokens(tokens, expected, "Multiple emojis with Chinese");
+}
+
+// Test keepNoneChineseTogether = false with letters and digits
+TEST_F(PinyinFilterTest, TestTokenFilter_KeepNoneChineseTogetherFalse) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "true";
+ config["keep_none_chinese"] = "true";
+ config["keep_none_chinese_together"] = "false";
+ config["none_chinese_pinyin_tokenize"] = "true";
+ config["lowercase"] = "true";
+ config["remove_duplicated_term"] = "true";
+ config["ignore_pinyin_offset"] = "false";
+
+ auto tokens = tokenizeWithFilter("刘德华ABC123", "keyword", config);
+
+ // Letters are split individually, digits are split individually too
+ std::vector<std::string> expected = {
+ "liu", "刘德华abc123", "ldhabc123", "de", "hua", "a", "b", "c", "1",
"2", "3"};
+ assertTokens(tokens, expected, "KeepNoneChineseTogether=false with mixed
content");
+}
+
+// Test keepNoneChineseTogether = false with pure letters
+TEST_F(PinyinFilterTest, TestTokenFilter_KeepNoneChineseTogetherFalse_PureLetters) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true"; // Need at least one output format enabled
+ config["keep_full_pinyin"] = "false";
+ config["keep_original"] = "false";
+ config["keep_none_chinese"] = "true";
+ config["keep_none_chinese_together"] = "false";
+ config["none_chinese_pinyin_tokenize"] = "true";
+ config["lowercase"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ auto tokens = tokenizeWithFilter("DEL", "keyword", config);
+
+ // All letters should be split individually, plus the result from PinyinAlphabetTokenizer
+ // With keep_first_letter=true, we get the combined first letter too
+ std::vector<std::string> expected = {"D", "DEL", "E", "L"};
+ assertTokens(tokens, expected, "KeepNoneChineseTogether=false pure letters");
+}
+
+// Test Unicode symbols fallback when no candidates generated
+TEST_F(PinyinFilterTest, TestTokenFilter_UnicodeFallback) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false"; // No keep_original but symbols should
still be preserved
+ config["keep_none_chinese"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ // Use keyword tokenizer to ensure the input is passed as-is to the filter
+ auto tokens = tokenizeWithFilter("①②③", "keyword", config);
+
+ // Unicode symbols should be preserved even without keep_original via fallback
+ std::vector<std::string> expected = {"①②③"};
+ assertTokens(tokens, expected, "Unicode symbols preserved without keep_original");
+}
+
+TEST_F(PinyinFilterTest, TestTokenFilter_NullConfig) {
+ std::string text = "测试"; // Keep string alive
+ auto reader = std::make_shared<lucene::util::SStringReader<char>>();
+ reader->init(text.data(), text.size(), false);
+
+ StandardTokenizerFactory tokenizer_factory;
+ Settings tokenizer_settings;
+ tokenizer_factory.initialize(tokenizer_settings);
+ auto tokenizer = tokenizer_factory.create();
+ tokenizer->set_reader(reader);
+ tokenizer->reset();
+
+ // Create filter with nullptr config directly
+ PinyinFilter filter(tokenizer, nullptr);
+ filter.initialize();
+
+ Token token;
+ // Should work with default config without crashing
+ EXPECT_NE(filter.next(&token), nullptr);
+}
+
+TEST_F(PinyinFilterTest, TestTokenFilter_KeepNoneChineseFalse) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false";
+ config["keep_none_chinese"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ auto tokens = tokenizeWithFilter("测试ABC123", "standard", config);
+ std::vector<std::string> expected = {"ce", "c", "shi", "s", "abc123"};
+ assertTokens(tokens, expected, "KeepNoneChineseFalse test");
+}
+
+TEST_F(PinyinFilterTest, TestTokenFilter_NoneChinesePinyinTokenizeFalse) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false";
+ config["keep_none_chinese"] = "true";
+ config["none_chinese_pinyin_tokenize"] = "false"; // Don't tokenize ASCII
+ config["ignore_pinyin_offset"] = "false";
+
+ auto tokens = tokenizeWithFilter("刘德华ABC123", "standard", config);
+ std::vector<std::string> expected = {"liu", "l", "de", "d", "hua", "h",
"abc123"};
+ assertTokens(tokens, expected, "NoneChinesePinyinTokenizeFalse test");
+}
+
+TEST_F(PinyinFilterTest, TestTokenFilter_WhitespaceOnly) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "true";
+ config["keep_none_chinese"] = "true";
+ config["ignore_pinyin_offset"] = "false";
+
+ // Test with whitespace-only and mixed content
+ auto tokens = tokenizeWithFilter(" 测试 ", "keyword", config);
+ std::vector<std::string> expected = {"ce", "测试", "cs", "shi"};
+ assertTokens(tokens, expected, "WhitespaceOnly test");
+}
+
+TEST_F(PinyinFilterTest, TestTokenFilter_PositionIncrement) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "true";
+ config["keep_none_chinese"] = "true";
+ config["keep_none_chinese_together"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ // Use multiple Chinese characters to trigger position increment logic
+ auto tokens = tokenizeWithFilter("刘德华", "standard", config);
+
+ std::vector<std::string> expected = {"liu", "刘", "l", "de", "德", "d",
"hua", "华", "h"};
+ assertTokens(tokens, expected, "PositionIncrement test");
+}
+
+// Test reset() method - verifies PinyinFilter::reset() correctly resets state
+// This tests the code path in pinyin_filter.cpp:99-110
+TEST_F(PinyinFilterTest, TestTokenFilter_Reset) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ // First tokenization - tests that filter works correctly
+ auto tokens1 = tokenizeWithFilter("刘德华", "keyword", config);
+
+ EXPECT_GT(tokens1.size(), 0) << "First tokenization should produce tokens";
+ bool has_liu = std::find(tokens1.begin(), tokens1.end(), "liu") != tokens1.end();
+ EXPECT_TRUE(has_liu) << "First tokenization should contain 'liu'";
+
+ // Second tokenization with different text - tests that filter state is independent
+ auto tokens2 = tokenizeWithFilter("张学友", "keyword", config);
+
+ EXPECT_GT(tokens2.size(), 0) << "Second tokenization should produce tokens";
+ bool has_zhang = std::find(tokens2.begin(), tokens2.end(), "zhang") != tokens2.end();
+ EXPECT_TRUE(has_zhang) << "Second tokenization should contain 'zhang'";
+
+ // Ensure tokens from first text are not in second result (state isolation)
+ bool has_liu_in_second = std::find(tokens2.begin(), tokens2.end(), "liu") != tokens2.end();
+ EXPECT_FALSE(has_liu_in_second)
+ << "Second tokenization should NOT contain 'liu' from first text";
+}
+
+// Test reset() with empty input after valid input
+TEST_F(PinyinFilterTest, TestTokenFilter_ResetWithEmptyInput) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ // First tokenization with valid text
+ auto tokens1 = tokenizeWithFilter("测试", "keyword", config);
+ EXPECT_GT(tokens1.size(), 0) << "First tokenization should produce tokens";
+
+ // Tokenization with empty text
+ auto tokens2 = tokenizeWithFilter("", "keyword", config);
+ EXPECT_EQ(tokens2.size(), 0) << "Empty input should produce no tokens";
+}
+
+// Test Unicode symbol preservation fallback - tests pinyin_filter.cpp:243-259
+// When pinyin_list and chinese_list are empty but there are non-ASCII Unicode chars,
+// the filter should preserve the original token
+TEST_F(PinyinFilterTest, TestTokenFilter_UnicodeSymbolPreservation) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false"; // Even without keep_original, Unicode
should be preserved
+ config["keep_none_chinese"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ // Test circled numbers - these are Unicode symbols that cannot be converted to pinyin
+ auto tokens1 = tokenizeWithFilter("①②③", "keyword", config);
+ EXPECT_EQ(tokens1.size(), 1) << "Circled numbers should be preserved as one token";
+ EXPECT_EQ(tokens1[0], "①②③") << "Circled numbers should be preserved";
+
+ // Test other Unicode symbols
+ auto tokens2 = tokenizeWithFilter("★☆♠♥", "keyword", config);
+ EXPECT_EQ(tokens2.size(), 1) << "Card suit symbols should be preserved";
+ EXPECT_EQ(tokens2[0], "★☆♠♥") << "Card suit symbols should be preserved";
+
+ // Test mathematical symbols
+ auto tokens3 = tokenizeWithFilter("∑∏∫∂", "keyword", config);
+ EXPECT_EQ(tokens3.size(), 1) << "Math symbols should be preserved";
+ EXPECT_EQ(tokens3[0], "∑∏∫∂") << "Math symbols should be preserved";
+}
+
+// Test Unicode symbols mixed with Chinese characters
+// Note: Standard tokenizer may filter out some Unicode symbols, so we test with keyword tokenizer
+TEST_F(PinyinFilterTest, TestTokenFilter_UnicodeSymbolsWithChinese) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false";
+ config["keep_none_chinese"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ // Use keyword tokenizer to keep the whole input as one token
+ // This tests the fallback logic for mixed Unicode symbols and Chinese
+ auto tokens = tokenizeWithFilter("①中国", "keyword", config);
+
+ // With keyword tokenizer, pinyin filter should process the whole string
+ // Chinese characters get converted to pinyin, but we need to check fallback behavior
+ bool has_zhong = std::find(tokens.begin(), tokens.end(), "zhong") != tokens.end();
+ bool has_guo = std::find(tokens.begin(), tokens.end(), "guo") != tokens.end();
+
+ EXPECT_TRUE(has_zhong) << "Should have pinyin 'zhong'";
+ EXPECT_TRUE(has_guo) << "Should have pinyin 'guo'";
+
+ // Test that Chinese pinyin is correctly generated even with Unicode prefix
+ EXPECT_GT(tokens.size(), 0) << "Should produce tokens";
+}
+
+// Test pure emoji preservation
+TEST_F(PinyinFilterTest, TestTokenFilter_PureEmojiPreservation) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false";
+ config["keep_none_chinese"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ // Pure emoji should be preserved via the fallback logic
+ auto tokens = tokenizeWithFilter("😀😁😂", "keyword", config);
+ EXPECT_EQ(tokens.size(), 1) << "Pure emoji should be preserved as one
token";
+ EXPECT_EQ(tokens[0], "😀😁😂") << "Emoji string should be preserved";
+}
+
+// Test that ASCII-only input without Unicode symbols returns false
+// This tests the code path: if (!has_unicode_symbols) { return false; }
+TEST_F(PinyinFilterTest, TestTokenFilter_AsciiOnlyFallbackHandling) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "false";
+ config["keep_full_pinyin"] = "false";
+ config["keep_original"] = "false";
+ config["keep_none_chinese"] = "false"; // This will cause ASCII to have no
candidates
+ config["keep_joined_full_pinyin"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ // With all options disabled, pure ASCII should not produce tokens
+ // because it has no Unicode symbols to preserve via fallback
+ auto tokens = tokenizeWithFilter("abc123", "keyword", config);
+ // For pure ASCII input the Unicode fallback is not triggered:
+ // processCurrentToken() returns false when no candidates are generated and
+ // no non-ASCII codepoints are present, so at most the original token survives.
+ EXPECT_LE(tokens.size(), 1)
+ << "Pure ASCII with no output options should produce minimal tokens";
+}
+
+// Test currency and special Unicode symbols
+TEST_F(PinyinFilterTest, TestTokenFilter_CurrencySymbols) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false";
+ config["keep_none_chinese"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ // Currency symbols are Unicode but not Chinese
+ auto tokens = tokenizeWithFilter("€£¥₹", "keyword", config);
+ EXPECT_EQ(tokens.size(), 1) << "Currency symbols should be preserved";
+ EXPECT_EQ(tokens[0], "€£¥₹") << "Currency symbols should be preserved
as-is";
+}
+
+// Test Japanese/Korean characters (CJK but not in Chinese pinyin dict)
+TEST_F(PinyinFilterTest, TestTokenFilter_NonChineseCJK) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false";
+ config["keep_none_chinese"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ // Japanese hiragana - these are non-ASCII Unicode but not Chinese
+ auto tokens1 = tokenizeWithFilter("あいう", "keyword", config);
+ EXPECT_EQ(tokens1.size(), 1) << "Japanese hiragana should be preserved";
+ EXPECT_EQ(tokens1[0], "あいう") << "Japanese hiragana should be preserved
as-is";
+
+ // Korean hangul
+ auto tokens2 = tokenizeWithFilter("한글", "keyword", config);
+ EXPECT_EQ(tokens2.size(), 1) << "Korean hangul should be preserved";
+ EXPECT_EQ(tokens2[0], "한글") << "Korean hangul should be preserved as-is";
+}
+
} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git a/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin_analysis_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin_analysis_test.cpp
index a21fc7cb290..b7b124fa83f 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin_analysis_test.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin_analysis_test.cpp
@@ -1370,3 +1370,440 @@ TEST_F(PinyinAnalysisTest, TestRepeatedCharacters) {
}
EXPECT_LE(de_count, 1) << "Should have at most one 'de' with removeDuplicatedTerm";
}
+
+// Test emoji handling in PinyinTokenizer - emojis should be dropped
+TEST_F(PinyinAnalysisTest, TestTokenizer_EmojiShouldBeDropped) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = false;
+ config.keepNoneChinese = false;
+ config.ignorePinyinOffset = false;
+
+ // When using PinyinTokenizer (not filter), emojis should be dropped
+ verifyTokens("⭐白菜", config, {"bai", "bc", "cai"});
+}
+
+// Test pure emoji input in PinyinTokenizer - should return empty
+TEST_F(PinyinAnalysisTest, TestTokenizer_PureEmojiDropped) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = false;
+ config.keepNoneChinese = false;
+ config.ignorePinyinOffset = false;
+
+ verifyTokens("⭐🎉", config, {});
+}
+
+// Test multiple emojis with Chinese in PinyinTokenizer - emojis dropped
+TEST_F(PinyinAnalysisTest, TestTokenizer_MultipleEmojisDropped) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = false;
+ config.keepNoneChinese = false;
+ config.ignorePinyinOffset = false;
+
+ // Emojis should be dropped, only Chinese pinyin remains
+ verifyTokens("🎉中国🚀", config, {"zhong", "zg", "guo"});
+}
+
+// Test keepNoneChineseTogether = false with PinyinTokenizer
+TEST_F(PinyinAnalysisTest, TestTokenizer_KeepNoneChineseTogetherFalse) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = true;
+ config.keepNoneChinese = true;
+ config.keepNoneChineseTogether = false;
+ config.noneChinesePinyinTokenize = true;
+ config.lowercase = true;
+ config.removeDuplicatedTerm = true;
+ config.ignorePinyinOffset = false;
+
+ // Both letters and digits should be split individually
+ verifyTokens("刘德华ABC123", config,
+ {"liu", "刘德华abc123", "ldhabc123", "de", "hua", "a", "b", "c",
"1", "2", "3"});
+}
+
+// Test Unicode symbols in PinyinTokenizer - should be dropped like emojis
+TEST_F(PinyinAnalysisTest, TestTokenizer_UnicodeSymbolsDropped) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = false;
+ config.keepNoneChinese = false;
+ config.ignorePinyinOffset = false;
+
+ // Unicode symbols like circled numbers should be dropped
+ verifyTokens("①②③中国", config, {"zhong", "zg", "guo"});
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_True_SingleChinese) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = false;
+ config.keepJoinedFullPinyin = false;
+ config.keepSeparateFirstLetter = false;
+ config.ignorePinyinOffset = true;
+
+ std::string text = "刘德华";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ int total_bytes = 9;
+ std::vector<std::string> expected_terms = {"liu", "ldh", "de", "hua"};
+ EXPECT_EQ(tokens.size(), expected_terms.size());
+ for (size_t i = 0; i < tokens.size(); ++i) {
+ EXPECT_EQ(tokens[i].term, expected_terms[i]) << "Token mismatch at position " << i;
+ EXPECT_EQ(tokens[i].startOffset, 0) << "Token: " << tokens[i].term;
+ EXPECT_EQ(tokens[i].endOffset, total_bytes) << "Token: " << tokens[i].term;
+ }
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_True_KeepOriginal) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = true;
+ config.ignorePinyinOffset = true;
+
+ std::string text = "你好";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ int total_bytes = 6;
+ for (const auto& token : tokens) {
+ EXPECT_EQ(token.startOffset, 0) << "Token: " << token.term;
+ EXPECT_EQ(token.endOffset, total_bytes) << "Token: " << token.term;
+ }
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_True_JoinedFullPinyin) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepJoinedFullPinyin = true;
+ config.keepOriginal = false;
+ config.ignorePinyinOffset = true;
+
+ std::string text = "中国";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ int total_bytes = 6;
+ for (const auto& token : tokens) {
+ EXPECT_EQ(token.startOffset, 0) << "Token: " << token.term;
+ EXPECT_EQ(token.endOffset, total_bytes) << "Token: " << token.term;
+ }
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_True_AsciiOnly) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepNoneChinese = true;
+ config.ignorePinyinOffset = true;
+
+ std::string text = "hello";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ int total_bytes = 5;
+ for (const auto& token : tokens) {
+ EXPECT_EQ(token.startOffset, 0) << "Token: " << token.term;
+ EXPECT_EQ(token.endOffset, total_bytes) << "Token: " << token.term;
+ }
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_True_WithNumbers) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepNoneChinese = true;
+ config.ignorePinyinOffset = true;
+
+ std::string text = "北京2008";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ int total_bytes = 10;
+ for (const auto& token : tokens) {
+ EXPECT_EQ(token.startOffset, 0) << "Token: " << token.term;
+ EXPECT_EQ(token.endOffset, total_bytes) << "Token: " << token.term;
+ }
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_True_SpecialChars) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepNoneChinese = true;
+ config.ignorePinyinOffset = true;
+
+ std::string text = "A股B股";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ int total_bytes = 8;
+ for (const auto& token : tokens) {
+ EXPECT_EQ(token.startOffset, 0) << "Token: " << token.term;
+ EXPECT_EQ(token.endOffset, total_bytes) << "Token: " << token.term;
+ }
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_True_Polyphone) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.ignorePinyinOffset = true;
+
+ std::string text = "银行";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ int total_bytes = 6;
+ for (const auto& token : tokens) {
+ EXPECT_EQ(token.startOffset, 0) << "Token: " << token.term;
+ EXPECT_EQ(token.endOffset, total_bytes) << "Token: " << token.term;
+ }
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_True_LongText) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.ignorePinyinOffset = true;
+
+ std::string text = "我爱北京天安门";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ int total_bytes = 21;
+ for (const auto& token : tokens) {
+ EXPECT_EQ(token.startOffset, 0) << "Token: " << token.term;
+ EXPECT_EQ(token.endOffset, total_bytes) << "Token: " << token.term;
+ }
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_False_SingleChinese) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = false;
+ config.keepJoinedFullPinyin = false;
+ config.keepSeparateFirstLetter = false;
+ config.ignorePinyinOffset = false;
+
+ std::string text = "刘德华";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ for (const auto& token : tokens) {
+ if (token.term == "liu") {
+ EXPECT_EQ(token.startOffset, 0);
+ EXPECT_EQ(token.endOffset, 3);
+ } else if (token.term == "de") {
+ EXPECT_EQ(token.startOffset, 3);
+ EXPECT_EQ(token.endOffset, 6);
+ } else if (token.term == "hua") {
+ EXPECT_EQ(token.startOffset, 6);
+ EXPECT_EQ(token.endOffset, 9);
+ } else if (token.term == "ldh") {
+ EXPECT_EQ(token.startOffset, 0);
+ EXPECT_EQ(token.endOffset, 9);
+ }
+ }
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_False_MultiByteCharacters) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = false;
+ config.ignorePinyinOffset = false;
+
+ std::string text = "你好";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ bool found_ni = false;
+ bool found_hao = false;
+ bool found_nh = false;
+
+ for (const auto& token : tokens) {
+ if (token.term == "ni") {
+ EXPECT_EQ(token.startOffset, 0);
+ EXPECT_EQ(token.endOffset, 3);
+ found_ni = true;
+ } else if (token.term == "hao") {
+ EXPECT_EQ(token.startOffset, 3);
+ EXPECT_EQ(token.endOffset, 6);
+ found_hao = true;
+ } else if (token.term == "nh") {
+ EXPECT_EQ(token.startOffset, 0);
+ EXPECT_EQ(token.endOffset, 6);
+ found_nh = true;
+ }
+ }
+
+ EXPECT_TRUE(found_ni) << "Token 'ni' not found";
+ EXPECT_TRUE(found_hao) << "Token 'hao' not found";
+ EXPECT_TRUE(found_nh) << "Token 'nh' not found";
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_True_MixedContent) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = false;
+ config.ignorePinyinOffset = true;
+
+ std::string text = "刘a德";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ int total_bytes = 7;
+ std::vector<std::string> expected_terms = {"liu", "lad", "a", "de"};
+ EXPECT_EQ(tokens.size(), expected_terms.size());
+ for (size_t i = 0; i < tokens.size(); ++i) {
+ EXPECT_EQ(tokens[i].term, expected_terms[i]) << "Token mismatch at position " << i;
+ EXPECT_EQ(tokens[i].startOffset, 0) << "Token: " << tokens[i].term;
+ EXPECT_EQ(tokens[i].endOffset, total_bytes) << "Token: " << tokens[i].term;
+ }
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_False_MixedContent) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = false;
+ config.ignorePinyinOffset = false;
+
+ std::string text = "刘a德";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ bool found_liu = false;
+ bool found_a = false;
+ bool found_de = false;
+
+ for (const auto& token : tokens) {
+ if (token.term == "liu") {
+ EXPECT_EQ(token.startOffset, 0);
+ EXPECT_EQ(token.endOffset, 3);
+ found_liu = true;
+ } else if (token.term == "a") {
+ EXPECT_EQ(token.startOffset, 3);
+ EXPECT_EQ(token.endOffset, 4);
+ found_a = true;
+ } else if (token.term == "de") {
+ EXPECT_EQ(token.startOffset, 4);
+ EXPECT_EQ(token.endOffset, 7);
+ found_de = true;
+ } else if (token.term == "lad") {
+ EXPECT_EQ(token.startOffset, 0);
+ EXPECT_EQ(token.endOffset, 7);
+ }
+ }
+
+ EXPECT_TRUE(found_liu) << "Token 'liu' not found";
+ EXPECT_TRUE(found_a) << "Token 'a' not found";
+ EXPECT_TRUE(found_de) << "Token 'de' not found";
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_False_PolyphoneWords) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = false;
+ config.ignorePinyinOffset = false;
+
+ std::string text = "银行";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ bool found_yin = false;
+ bool found_xing_or_hang = false;
+
+ for (const auto& token : tokens) {
+ if (token.term == "yin") {
+ EXPECT_EQ(token.startOffset, 0);
+ EXPECT_EQ(token.endOffset, 3);
+ found_yin = true;
+ } else if (token.term == "xing" || token.term == "hang") {
+ EXPECT_EQ(token.startOffset, 3);
+ EXPECT_EQ(token.endOffset, 6);
+ found_xing_or_hang = true;
+ } else if (token.term == "yx" || token.term == "yh") {
+ EXPECT_EQ(token.startOffset, 0);
+ EXPECT_EQ(token.endOffset, 6);
+ }
+ }
+
+ EXPECT_TRUE(found_yin) << "Token 'yin' not found";
+ EXPECT_TRUE(found_xing_or_hang) << "Token 'xing' or 'hang' not found";
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_False_JoinedFullPinyin) {
+ PinyinConfig config;
+ config.keepFirstLetter = false;
+ config.keepFullPinyin = false;
+ config.keepJoinedFullPinyin = true;
+ config.keepOriginal = false;
+ config.ignorePinyinOffset = false;
+
+ std::string text = "刘德华";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_EQ(tokens.size(), 1);
+ EXPECT_EQ(tokens[0].term, "liudehua");
+ EXPECT_EQ(tokens[0].startOffset, 0);
+ EXPECT_EQ(tokens[0].endOffset, 9);
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_False_KeepOriginal) {
+ PinyinConfig config;
+ config.keepFirstLetter = false;
+ config.keepFullPinyin = false;
+ config.keepOriginal = true;
+ config.ignorePinyinOffset = false;
+
+ std::string text = "刘德华";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_EQ(tokens.size(), 1);
+ EXPECT_EQ(tokens[0].term, "刘德华");
+ EXPECT_EQ(tokens[0].startOffset, 0);
+ EXPECT_EQ(tokens[0].endOffset, 9);
+}
diff --git a/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin_util_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin_util_test.cpp
index 605beafbfdd..8badf98a272 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin_util_test.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin_util_test.cpp
@@ -27,6 +27,7 @@
#include "common/config.h"
#include "common/logging.h"
#include "olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_format.h"
+#include "olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_formatter.h"
#include "unicode/utf8.h"
namespace doris::segment_v2::inverted_index {
@@ -682,4 +683,125 @@ TEST_F(PinyinUtilTest, TestUtf8CharCountVariousInputs) {
EXPECT_EQ(getUtf8CharCount("中国abc"), 5);
}
+// Test YuCharType::WITH_U_UNICODE - tests PinyinFormatter replaceAll(result, "u:", "ü")
+// Note: "绿" returns "lv" from polyphone dict matching, while "女"/"律" return "u:" format
+TEST_F(PinyinUtilTest, TestYuCharTypeWithUUnicode) {
+ auto& pinyin_util = PinyinUtil::instance();
+
+ // This tests the code path: case YuCharType::WITH_U_UNICODE: result = replaceAll(result, "u:", "ü");
+ PinyinFormat unicode_u_format(YuCharType::WITH_U_UNICODE, ToneType::WITHOUT_TONE,
+ CaseType::LOWERCASE);
+
+ // Test character "女" - returns "nu:3" from main dictionary, should become "nü"
+ std::vector<std::string> result1 = pinyin_util.convert(stringToCodepoints("女"), unicode_u_format);
+ EXPECT_EQ(result1.size(), 1);
+ EXPECT_EQ(result1[0], "nü") << "女 should be 'nü' with WITH_U_UNICODE format";
+
+ // Test character "律" - returns "lu:4" from main dictionary, should become "lü"
+ std::vector<std::string> result2 = pinyin_util.convert(stringToCodepoints("律"), unicode_u_format);
+ EXPECT_EQ(result2.size(), 1);
+ EXPECT_EQ(result2[0], "lü") << "律 should be 'lü' with WITH_U_UNICODE format";
+
+ // Test character "绿" - returns "lv" from polyphone dict (no u: to convert)
+ std::vector<std::string> result3 = pinyin_util.convert(stringToCodepoints("绿"), unicode_u_format);
+ EXPECT_EQ(result3.size(), 1);
+ EXPECT_EQ(result3[0], "lv") << "绿 returns 'lv' from polyphone dict";
+
+ // Test character "旅" - check what format it returns
+ std::vector<std::string> result4 = pinyin_util.convert(stringToCodepoints("旅"), unicode_u_format);
+ EXPECT_EQ(result4.size(), 1);
+ EXPECT_TRUE(result4[0] == "lü" || result4[0] == "lv")
+ << "旅 should be 'lü' or 'lv', got: " << result4[0];
+}
+
+// Test YuCharType::WITH_V - tests PinyinFormatter replaceAll(result, "u:", "v")
+TEST_F(PinyinUtilTest, TestYuCharTypeWithV) {
+ auto& pinyin_util = PinyinUtil::instance();
+
+ PinyinFormat v_format(YuCharType::WITH_V, ToneType::WITHOUT_TONE, CaseType::LOWERCASE);
+
+ // Test character "女" - should convert u: to v
+ std::vector<std::string> result1 = pinyin_util.convert(stringToCodepoints("女"), v_format);
+ EXPECT_EQ(result1.size(), 1);
+ EXPECT_EQ(result1[0], "nv") << "女 should be 'nv' with WITH_V format";
+
+ // Test character "绿"
+ std::vector<std::string> result2 = pinyin_util.convert(stringToCodepoints("绿"), v_format);
+ EXPECT_EQ(result2.size(), 1);
+ EXPECT_EQ(result2[0], "lv") << "绿 should be 'lv' with WITH_V format";
+}
+
+// Test YuCharType::WITH_U_AND_COLON (default) - keeps "u:" as is
+TEST_F(PinyinUtilTest, TestYuCharTypeWithUAndColon) {
+ auto& pinyin_util = PinyinUtil::instance();
+
+ PinyinFormat colon_format(YuCharType::WITH_U_AND_COLON, ToneType::WITHOUT_TONE,
+ CaseType::LOWERCASE);
+
+ // Test character "女" - returns "nu:3" from main dict, should keep "nu:"
+ std::vector<std::string> result1 = pinyin_util.convert(stringToCodepoints("女"), colon_format);
+ EXPECT_EQ(result1.size(), 1);
+ EXPECT_EQ(result1[0], "nu:") << "女 should be 'nu:' with WITH_U_AND_COLON format";
+
+ // Test character "律" - returns "lu:4" from main dict, should keep "lu:"
+ std::vector<std::string> result2 = pinyin_util.convert(stringToCodepoints("律"), colon_format);
+ EXPECT_EQ(result2.size(), 1);
+ EXPECT_EQ(result2[0], "lu:") << "律 should be 'lu:' with WITH_U_AND_COLON format";
+}
+
+// Test YuCharType with different CaseTypes
+TEST_F(PinyinUtilTest, TestYuCharTypeWithCaseTypes) {
+ auto& pinyin_util = PinyinUtil::instance();
+
+ // Test WITH_U_UNICODE + UPPERCASE
+ PinyinFormat unicode_upper(YuCharType::WITH_U_UNICODE, ToneType::WITHOUT_TONE,
+ CaseType::UPPERCASE);
+ std::vector<std::string> result1 = pinyin_util.convert(stringToCodepoints("女"), unicode_upper);
+ EXPECT_EQ(result1.size(), 1);
+ // Note: ü is a multi-byte UTF-8 character, uppercase might not work as expected
+ // The important thing is that u: was replaced with ü before uppercasing
+
+ // Test WITH_V + UPPERCASE
+ PinyinFormat v_upper(YuCharType::WITH_V, ToneType::WITHOUT_TONE, CaseType::UPPERCASE);
+ std::vector<std::string> result2 = pinyin_util.convert(stringToCodepoints("女"), v_upper);
+ EXPECT_EQ(result2.size(), 1);
+ EXPECT_EQ(result2[0], "NV") << "女 should be 'NV' with WITH_V + UPPERCASE";
+
+ // Test WITH_V + CAPITALIZE
+ PinyinFormat v_capitalize(YuCharType::WITH_V, ToneType::WITHOUT_TONE, CaseType::CAPITALIZE);
+ std::vector<std::string> result3 = pinyin_util.convert(stringToCodepoints("女"), v_capitalize);
+ EXPECT_EQ(result3.size(), 1);
+ EXPECT_EQ(result3[0], "Nv") << "女 should be 'Nv' with WITH_V + CAPITALIZE";
+}
+
+// Test PinyinFormatter directly to ensure u: conversion logic works correctly
+// This tests the code path regardless of dictionary format
+TEST_F(PinyinUtilTest, TestPinyinFormatterUColonConversion) {
+ // Test PinyinFormatter directly to verify u: -> ü conversion
+ PinyinFormat unicode_u_format(YuCharType::WITH_U_UNICODE, ToneType::WITHOUT_TONE,
+ CaseType::LOWERCASE);
+
+ // Test with known u: input strings
+ std::string result1 = PinyinFormatter::formatPinyin("nu:3",
unicode_u_format);
+ EXPECT_EQ(result1, "nü") << "nu:3 should become 'nü' with WITH_U_UNICODE
format";
+
+ std::string result2 = PinyinFormatter::formatPinyin("lu:4",
unicode_u_format);
+ EXPECT_EQ(result2, "lü") << "lu:4 should become 'lü' with WITH_U_UNICODE
format";
+
+ // Test WITH_V format
+ PinyinFormat v_format(YuCharType::WITH_V, ToneType::WITHOUT_TONE, CaseType::LOWERCASE);
+ std::string result3 = PinyinFormatter::formatPinyin("nu:3", v_format);
+ EXPECT_EQ(result3, "nv") << "nu:3 should become 'nv' with WITH_V format";
+
+ // Test WITH_U_AND_COLON format
+ PinyinFormat colon_format(YuCharType::WITH_U_AND_COLON, ToneType::WITHOUT_TONE,
+ CaseType::LOWERCASE);
+ std::string result4 = PinyinFormatter::formatPinyin("nu:3", colon_format);
+ EXPECT_EQ(result4, "nu:") << "nu:3 should become 'nu:' with
WITH_U_AND_COLON format";
+}
+
} // namespace doris::segment_v2::inverted_index
diff --git a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/PinyinTokenFilterValidator.java b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/PinyinTokenFilterValidator.java
index 278a1d3e343..c6fac8622c1 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/PinyinTokenFilterValidator.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/PinyinTokenFilterValidator.java
@@ -108,42 +108,17 @@ public class PinyinTokenFilterValidator extends BasePolicyValidator {
*/
private void validateConfigurationLogic(Map<String, String> props) throws
DdlException {
// ensure at least one output format is enabled
- boolean keepOriginal = getBooleanValue(props, "keep_original", false);
boolean keepFirstLetter = getBooleanValue(props, "keep_first_letter",
true);
boolean keepFullPinyin = getBooleanValue(props, "keep_full_pinyin",
true);
boolean keepJoinedFullPinyin = getBooleanValue(props,
"keep_joined_full_pinyin", false);
boolean keepSeparateFirstLetter = getBooleanValue(props,
"keep_separate_first_letter", false);
boolean keepSeparateChinese = getBooleanValue(props,
"keep_separate_chinese", false);
- if (!keepOriginal && !keepFirstLetter && !keepFullPinyin
- && !keepJoinedFullPinyin && !keepSeparateFirstLetter &&
!keepSeparateChinese) {
- throw new DdlException("At least one output format must be
enabled: "
- + "keep_original, keep_first_letter, keep_full_pinyin,
keep_joined_full_pinyin, "
- + "keep_separate_first_letter, or keep_separate_chinese");
- }
-
- // validate keep_separate_first_letter and keep_first_letter
relationship
- if (keepSeparateFirstLetter && !keepFirstLetter) {
- throw new DdlException("keep_separate_first_letter requires
keep_first_letter to be enabled");
- }
-
- // validate keep_none_chinese_in_first_letter and keep_first_letter
relationship
- boolean keepNoneChineseInFirstLetter = getBooleanValue(props,
"keep_none_chinese_in_first_letter", true);
- if (keepNoneChineseInFirstLetter && !keepFirstLetter) {
- throw new DdlException("keep_none_chinese_in_first_letter requires
keep_first_letter to be enabled");
- }
-
- // validate keep_none_chinese_in_joined_full_pinyin and keep_joined_full_pinyin relationship
- boolean keepNoneChineseInJoinedFullPinyin = getBooleanValue(props, "keep_none_chinese_in_joined_full_pinyin",
- false);
- if (keepNoneChineseInJoinedFullPinyin && !keepJoinedFullPinyin) {
- throw new DdlException("keep_none_chinese_in_joined_full_pinyin
requires keep_joined_full_pinyin "
- + "to be enabled");
- }
-
- // validate limit_first_letter_length and keep_first_letter relationship
- if (props.containsKey("limit_first_letter_length") && !keepFirstLetter) {
- throw new DdlException("limit_first_letter_length is only valid when keep_first_letter is enabled");
+ if (!keepFirstLetter && !keepFullPinyin && !keepJoinedFullPinyin
+ && !keepSeparateFirstLetter && !keepSeparateChinese) {
+ throw new DdlException("pinyin config error, at least one output format must be enabled "
+ + "(keep_first_letter, keep_separate_first_letter, keep_full_pinyin, "
+ + "keep_joined_full_pinyin, or keep_separate_chinese).");
}
}
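The hunk above (and the identical one in PinyinTokenizerValidator that follows) relaxes the FE-side validation to a single rule: at least one pinyin output format must be enabled. keep_original no longer counts toward that rule, since the BE filter now preserves the original token as a fallback anyway, and the old per-option dependency checks are dropped. A standalone sketch of the relaxed rule, with hypothetical helper names (flag/requireSomeOutput; the real validator uses getBooleanValue and throws DdlException):

    import java.util.Map;

    public class PinyinConfigCheckSketch {
        // Parse a boolean property, falling back to its documented default.
        static boolean flag(Map<String, String> props, String key, boolean def) {
            return Boolean.parseBoolean(props.getOrDefault(key, Boolean.toString(def)));
        }

        // keep_first_letter and keep_full_pinyin default to true, so the error
        // only fires when all five formats are explicitly disabled.
        static void requireSomeOutput(Map<String, String> props) {
            boolean any = flag(props, "keep_first_letter", true)
                    || flag(props, "keep_full_pinyin", true)
                    || flag(props, "keep_joined_full_pinyin", false)
                    || flag(props, "keep_separate_first_letter", false)
                    || flag(props, "keep_separate_chinese", false);
            if (!any) {
                throw new IllegalArgumentException(
                        "pinyin config error, at least one output format must be enabled.");
            }
        }

        public static void main(String[] args) {
            requireSomeOutput(Map.of("keep_first_letter", "false")); // ok: full pinyin still on
            requireSomeOutput(Map.of(
                    "keep_first_letter", "false", "keep_full_pinyin", "false",
                    "keep_joined_full_pinyin", "false", "keep_separate_first_letter", "false",
                    "keep_separate_chinese", "false")); // throws
        }
    }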
diff --git a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/PinyinTokenizerValidator.java b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/PinyinTokenizerValidator.java
index cf2010af6e3..96a485a527f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/PinyinTokenizerValidator.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/PinyinTokenizerValidator.java
@@ -106,42 +106,17 @@ public class PinyinTokenizerValidator extends BasePolicyValidator {
*/
private void validateConfigurationLogic(Map<String, String> props) throws DdlException {
// ensure at least one output format is enabled
- boolean keepOriginal = getBooleanValue(props, "keep_original", false);
boolean keepFirstLetter = getBooleanValue(props, "keep_first_letter", true);
boolean keepFullPinyin = getBooleanValue(props, "keep_full_pinyin", true);
boolean keepJoinedFullPinyin = getBooleanValue(props, "keep_joined_full_pinyin", false);
boolean keepSeparateFirstLetter = getBooleanValue(props, "keep_separate_first_letter", false);
boolean keepSeparateChinese = getBooleanValue(props, "keep_separate_chinese", false);
- if (!keepOriginal && !keepFirstLetter && !keepFullPinyin
- && !keepJoinedFullPinyin && !keepSeparateFirstLetter &&
!keepSeparateChinese) {
- throw new DdlException("At least one output format must be
enabled: "
- + "keep_original, keep_first_letter, keep_full_pinyin,
keep_joined_full_pinyin, "
- + "keep_separate_first_letter, or keep_separate_chinese");
- }
-
- // validate keep_separate_first_letter and keep_first_letter relationship
- if (keepSeparateFirstLetter && !keepFirstLetter) {
- throw new DdlException("keep_separate_first_letter requires
keep_first_letter to be enabled");
- }
-
- // validate keep_none_chinese_in_first_letter and keep_first_letter relationship
- boolean keepNoneChineseInFirstLetter = getBooleanValue(props, "keep_none_chinese_in_first_letter", true);
- if (keepNoneChineseInFirstLetter && !keepFirstLetter) {
- throw new DdlException("keep_none_chinese_in_first_letter requires
keep_first_letter to be enabled");
- }
-
- // validate keep_none_chinese_in_joined_full_pinyin and keep_joined_full_pinyin relationship
- boolean keepNoneChineseInJoinedFullPinyin = getBooleanValue(props,
- "keep_none_chinese_in_joined_full_pinyin", false);
- if (keepNoneChineseInJoinedFullPinyin && !keepJoinedFullPinyin) {
- throw new DdlException("keep_none_chinese_in_joined_full_pinyin "
- + "requires keep_joined_full_pinyin to be enabled");
- }
-
- // validate limit_first_letter_length and keep_first_letter relationship
- if (props.containsKey("limit_first_letter_length") && !keepFirstLetter) {
- throw new DdlException("limit_first_letter_length is only valid when keep_first_letter is enabled");
+ if (!keepFirstLetter && !keepFullPinyin && !keepJoinedFullPinyin
+ && !keepSeparateFirstLetter && !keepSeparateChinese) {
+ throw new DdlException("pinyin config error, at least one output
format must be enabled "
+ + "(keep_first_letter, keep_separate_first_letter,
keep_full_pinyin, "
+ + "keep_joined_full_pinyin, or keep_separate_chinese).");
}
}
diff --git a/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out b/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out
index e6617f25a1c..44852a591e3 100644
--- a/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out
+++ b/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out
@@ -87,7 +87,7 @@
[{\n "token": "d"\n }, {\n "token": "dj音乐家"\n }, {\n
"token": "djyyj"\n }, {\n "token": "j"\n }, {\n "token":
"yin"\n }, {\n "token": "yue"\n }, {\n "token": "jia"\n
}]
-- !tokenize_pinyin13 --
-[{\n "token": "liu"\n }, {\n "token": "刘德华abc123"\n }, {\n
"token": "ldhabc123"\n }, {\n "token": "de"\n }, {\n
"token": "hua"\n }, {\n "token": "a"\n }, {\n "token":
"b"\n }, {\n "token": "c"\n }, {\n "token": "123"\n }]
+[{\n "token": "liu"\n }, {\n "token": "刘德华abc123"\n }, {\n
"token": "ldhabc123"\n }, {\n "token": "de"\n }, {\n
"token": "hua"\n }, {\n "token": "a"\n }, {\n "token":
"b"\n }, {\n "token": "c"\n }, {\n "token": "1"\n }, {\n
"token": "2"\n }, {\n "token": "3"\n }]
-- !sql --
1 abcDEF
@@ -266,3 +266,39 @@
-- !sql_standard_pinyin4 --
2 刘德华 张学友
+-- !sql_ignore_offset_true_1 --
+[{\n "token": "liu"\n }, {\n "token": "ldh"\n }, {\n
"token": "de"\n }, {\n "token": "hua"\n }]
+
+-- !sql_ignore_offset_true_2 --
+[{\n "token": "ni"\n }, {\n "token": "nh"\n }, {\n
"token": "hao"\n }]
+
+-- !sql_ignore_offset_true_3 --
+[{\n "token": "yin"\n }, {\n "token": "yx"\n }, {\n
"token": "xing"\n }]
+
+-- !sql_ignore_offset_false_1 --
+[{\n "token": "liu"\n }, {\n "token": "ldh"\n }, {\n
"token": "de"\n }, {\n "token": "hua"\n }]
+
+-- !sql_ignore_offset_false_2 --
+[{\n "token": "ni"\n }, {\n "token": "nh"\n }, {\n
"token": "hao"\n }]
+
+-- !sql_ignore_offset_false_3 --
+[{\n "token": "yin"\n }, {\n "token": "yx"\n }, {\n
"token": "xing"\n }]
+
+-- !sql_ignore_offset_true_mixed --
+[{\n "token": "liu"\n }, {\n "token": "lad"\n }, {\n
"token": "a"\n }, {\n "token": "de"\n }]
+
+-- !sql_ignore_offset_false_mixed --
+[{\n "token": "liu"\n }, {\n "token": "lad"\n }, {\n
"token": "a"\n }, {\n "token": "de"\n }]
+
+-- !sql_table_ignore_offset_1 --
+1 刘德华
+
+-- !sql_table_ignore_offset_2 --
+1 刘德华
+
+-- !sql_table_ignore_offset_3 --
+3 银行卡
+
+-- !sql_table_ignore_offset_4 --
+3 银行卡
+
diff --git a/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy b/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy
index 0b03b785d2b..478fade7fc5 100644
--- a/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy
+++ b/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy
@@ -63,8 +63,8 @@ suite("test_custom_analyzer", "p0") {
CREATE INVERTED INDEX ANALYZER IF NOT EXISTS keyword_lowercase
PROPERTIES
(
- "tokenizer" = "keyword",
- "token_filter" = "asciifolding, lowercase"
+ "tokenizer" = "keyword",
+ "token_filter" = "asciifolding, lowercase"
);
"""
@@ -583,4 +583,93 @@ suite("test_custom_analyzer", "p0") {
} finally {
sql "DROP TABLE IF EXISTS ${indexTbName6}"
}
+
+ // Test ignore_pinyin_offset parameter
+ // Create tokenizer with ignore_pinyin_offset=true (default)
+ sql """
+ CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS pinyin_tokenizer_ignore_true
+ PROPERTIES (
+ "type" = "pinyin",
+ "keep_first_letter" = "true",
+ "keep_full_pinyin" = "true",
+ "ignore_pinyin_offset" = "true"
+ );
+ """
+
+ // Create tokenizer with ignore_pinyin_offset=false
+ sql """
+ CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS pinyin_tokenizer_ignore_false
+ PROPERTIES (
+ "type" = "pinyin",
+ "keep_first_letter" = "true",
+ "keep_full_pinyin" = "true",
+ "ignore_pinyin_offset" = "false"
+ );
+ """
+
+ // Create analyzers
+ sql """
+ CREATE INVERTED INDEX ANALYZER IF NOT EXISTS pinyin_analyzer_ignore_true
+ PROPERTIES (
+ "tokenizer" = "pinyin_tokenizer_ignore_true"
+ );
+ """
+
+ sql """
+ CREATE INVERTED INDEX ANALYZER IF NOT EXISTS pinyin_analyzer_ignore_false
+ PROPERTIES (
+ "tokenizer" = "pinyin_tokenizer_ignore_false"
+ );
+ """
+
+ // Wait for all analyzers to be ready - increased timeout due to many objects
+ sql """ select sleep(15) """
+
+ // Test with ignore_pinyin_offset=true - all tokens should have same offset
+ qt_sql_ignore_offset_true_1 """ select tokenize('刘德华', '"analyzer"="pinyin_analyzer_ignore_true"'); """
+ qt_sql_ignore_offset_true_2 """ select tokenize('你好', '"analyzer"="pinyin_analyzer_ignore_true"'); """
+ qt_sql_ignore_offset_true_3 """ select tokenize('银行', '"analyzer"="pinyin_analyzer_ignore_true"'); """
+
+ // Test with ignore_pinyin_offset=false - tokens should have independent offsets
+ qt_sql_ignore_offset_false_1 """ select tokenize('刘德华', '"analyzer"="pinyin_analyzer_ignore_false"'); """
+ qt_sql_ignore_offset_false_2 """ select tokenize('你好', '"analyzer"="pinyin_analyzer_ignore_false"'); """
+ qt_sql_ignore_offset_false_3 """ select tokenize('银行', '"analyzer"="pinyin_analyzer_ignore_false"'); """
+
+ // Test with mixed content
+ qt_sql_ignore_offset_true_mixed """ select tokenize('刘a德', '"analyzer"="pinyin_analyzer_ignore_true"'); """
+ qt_sql_ignore_offset_false_mixed """ select tokenize('刘a德', '"analyzer"="pinyin_analyzer_ignore_false"'); """
+
+ // Test table creation and queries with ignore_pinyin_offset
+ def indexTbName7 = "test_custom_analyzer_pinyin_offset"
+ sql "DROP TABLE IF EXISTS ${indexTbName7}"
+ sql """
+ CREATE TABLE ${indexTbName7} (
+ `a` bigint NOT NULL AUTO_INCREMENT(1),
+ `content` text NULL,
+ INDEX idx_content (`content`) USING INVERTED PROPERTIES("support_phrase" = "true", "analyzer" = "pinyin_analyzer_ignore_true")
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`a`)
+ DISTRIBUTED BY RANDOM BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+
+ sql """ INSERT INTO ${indexTbName7} VALUES (1, "刘德华"); """
+ sql """ INSERT INTO ${indexTbName7} VALUES (2, "你好世界"); """
+ sql """ INSERT INTO ${indexTbName7} VALUES (3, "银行卡"); """
+
+ try {
+ sql "sync"
+ sql """ set enable_common_expr_pushdown = true; """
+
+ // Test queries with ignore_pinyin_offset=true
+ qt_sql_table_ignore_offset_1 """ select * from ${indexTbName7} where content match 'liu' order by a; """
+ qt_sql_table_ignore_offset_2 """ select * from ${indexTbName7} where content match 'ldh' order by a; """
+ qt_sql_table_ignore_offset_3 """ select * from ${indexTbName7} where content match 'yin' order by a; """
+ qt_sql_table_ignore_offset_4 """ select * from ${indexTbName7} where content match 'hang' order by a; """
+
+ } finally {
+ sql "DROP TABLE IF EXISTS ${indexTbName7}"
+ }
}
\ No newline at end of file
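A note on the expected output added above: tokenize() prints only the token text, so the ignore_pinyin_offset=true and =false cases legitimately produce identical .out blocks; the setting only changes the offsets recorded in the index. For poking at the new analyzer outside the regression framework, a rough JDBC sketch (endpoint, credentials, and database name are assumptions, and pinyin_analyzer_ignore_true must already have been created as in the suite above):

    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.ResultSet;
    import java.sql.Statement;

    public class PinyinAnalyzerDemo {
        public static void main(String[] args) throws Exception {
            // Assumed FE MySQL-protocol endpoint and credentials; adjust for your cluster.
            try (Connection conn = DriverManager.getConnection(
                         "jdbc:mysql://127.0.0.1:9030/demo", "root", "");
                 Statement stmt = conn.createStatement();
                 ResultSet rs = stmt.executeQuery(
                         "select tokenize('刘德华', '\"analyzer\"=\"pinyin_analyzer_ignore_true\"')")) {
                while (rs.next()) {
                    // Expect liu / ldh / de / hua, matching sql_ignore_offset_true_1 above.
                    System.out.println(rs.getString(1));
                }
            }
        }
    }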
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]