This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new e47082dcc51 branch-4.0: [fix](inverted index) fix pinyin bug #57756 (#58852)
e47082dcc51 is described below
commit e47082dcc514baba080279fd738f6d00e2709dab
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Wed Dec 10 10:50:04 2025 +0800
branch-4.0: [fix](inverted index) fix pinyin bug #57756 (#58852)
Cherry-picked from #57756
Co-authored-by: Ryan19929 <[email protected]>
---
.../inverted_index/token_filter/pinyin_filter.cpp | 76 +++-
.../token_filter/pinyin_filter_factory.cpp | 8 -
.../tokenizer/pinyin/pinyin_formatter.cpp | 37 +-
.../tokenizer/pinyin/pinyin_tokenizer.cpp | 58 +--
.../tokenizer/pinyin/pinyin_tokenizer.h | 2 +-
.../token_filter/pinyin_filter_test.cpp | 355 +++++++++++++++++
.../tokenizer/pinyin_analysis_test.cpp | 437 +++++++++++++++++++++
.../inverted_index/tokenizer/pinyin_util_test.cpp | 122 ++++++
.../indexpolicy/PinyinTokenFilterValidator.java | 35 +-
.../indexpolicy/PinyinTokenizerValidator.java | 35 +-
.../analyzer/test_custom_analyzer.out | 38 +-
.../analyzer/test_custom_analyzer.groovy | 93 ++++-
12 files changed, 1168 insertions(+), 128 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter.cpp b/be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter.cpp
index 7c9b8579a80..13bf212b21c 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter.cpp
@@ -48,14 +48,6 @@ PinyinFilter::PinyinFilter(const TokenStreamPtr& in, std::shared_ptr<PinyinConfi
if (!config_) {
config_ = std::make_shared<PinyinConfig>();
}
-
- // Validate configuration (same as Java validation)
-    if (!(config_->keepFirstLetter || config_->keepSeparateFirstLetter || config_->keepFullPinyin ||
- config_->keepJoinedFullPinyin || config_->keepSeparateChinese)) {
- throw Exception(ErrorCode::INVALID_ARGUMENT,
- "pinyin config error, can't disable
separate_first_letter, "
- "first_letter and full_pinyin at the same time.");
- }
}
void PinyinFilter::initialize() {
@@ -143,10 +135,26 @@ bool PinyinFilter::readTerm(Token* token) {
}
}
- // Process original text if needed
- if (config_->keepOriginal && !processed_original_) {
+ // Preserve original text if configured or if no candidates were generated
+ // This ensures Unicode symbols (emoji, etc.) are preserved even without keep_original setting
+ // matching Elasticsearch behavior
+ // NOTE: Must be AFTER processCurrentToken() but BEFORE first_letters to maintain correct order
+ if (!processed_original_ && has_current_token_) {
+ bool should_add_original = config_->keepOriginal;
+
+ // For emoji/symbol fallback: check if ANY content was generated (candidates OR pending letters)
+ // If nothing was generated, this is likely an emoji/symbol that should be preserved
+ if (!should_add_original && candidate_.empty() && first_letters_.empty() &&
+ full_pinyin_letters_.empty()) {
+ // No candidates and no pending letters, this is emoji/symbol
+ should_add_original = true;
+ }
+
processed_original_ = true;
- addCandidate(TermItem(current_source_, 0, static_cast<int>(current_source_.length()), 1));
+ if (should_add_original) {
+ addCandidate(
+ TermItem(current_source_, 0, static_cast<int>(current_source_.length()), 1));
+ }
}
// Process joined full pinyin if needed
@@ -231,8 +239,22 @@ bool PinyinFilter::processCurrentToken() {
PinyinUtil::instance().convert(source_codepoints,
PinyinFormat::TONELESS_PINYIN_FORMAT);
auto chinese_list = ChineseUtil::segmentChinese(source_codepoints);
+ // Early return optimization: if no Chinese characters found
if (pinyin_list.empty() && chinese_list.empty()) {
- return false;
+ // Check if there are non-ASCII Unicode characters (like emoji) to preserve
+ bool has_unicode_symbols = false;
+ for (const auto& cp : source_codepoints) {
+ if (cp >= 128) { // Non-ASCII character
+ has_unicode_symbols = true;
+ break;
+ }
+ }
+
+ // If no Unicode symbols, return false and let other filters handle it
+ if (!has_unicode_symbols) {
+ return false;
+ }
+ // Otherwise, continue processing to preserve Unicode symbols
}
// Process each character and generate candidates
@@ -240,7 +262,7 @@ bool PinyinFilter::processCurrentToken() {
std::string first_letters_buffer;
std::string full_pinyin_buffer;
- // Buffer for accumulating ASCII characters (like Java's buff)
+ // Buffer for accumulating ASCII characters
std::string ascii_buffer;
int ascii_buffer_start_pos = -1;
@@ -256,14 +278,27 @@ bool PinyinFilter::processCurrentToken() {
(codepoint >= '0' && codepoint <= '9');
if (is_ascii && is_alnum) {
- // Initialize ASCII buffer if needed
- if (ascii_buffer.empty()) {
- ascii_buffer_start_pos = static_cast<int>(i);
+ // Check if we should process ASCII characters individually
+ if (!config_->keepNoneChineseTogether && config_->keepNoneChinese) {
+ // Process accumulated ASCII buffer before processing individual character
+ if (!ascii_buffer.empty()) {
+ processAsciiBuffer(ascii_buffer, ascii_buffer_start_pos, static_cast<int>(i));
+ ascii_buffer.clear();
+ ascii_buffer_start_pos = -1;
+ }
+ // Process individual ASCII character immediately
+ position_++;
+ std::string single_char(1, static_cast<char>(codepoint));
+ addCandidate(TermItem(single_char, static_cast<int>(i), static_cast<int>(i + 1),
+ position_));
+ } else {
+ // Accumulate ASCII characters for later processing
+ if (ascii_buffer.empty()) {
+ ascii_buffer_start_pos = static_cast<int>(i);
+ }
+ ascii_buffer += static_cast<char>(codepoint);
}
- // Accumulate ASCII characters
- ascii_buffer += static_cast<char>(codepoint);
-
// Handle ASCII alphanumeric characters for first letters
if (config_->keepNoneChineseInFirstLetter) {
first_letters_buffer += static_cast<char>(codepoint);
@@ -311,6 +346,9 @@ bool PinyinFilter::processCurrentToken() {
}
}
}
+ // For non-ASCII, non-Chinese characters (e.g., emoji, symbols),
+ // we don't add them to candidate. They will only be kept if the fallback
+ // mechanism is triggered (when candidate_ is empty).
}
}
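The fallback described in the comments above reduces to a small predicate. A minimal standalone sketch of that rule (hypothetical helper, not code from this patch): a token is preserved verbatim only when it produced no pinyin output and contains at least one non-ASCII codepoint.

    // Sketch only; mirrors the fallback comments in the hunk above.
    bool should_preserve_original(const std::vector<UChar32>& codepoints, bool generated_output) {
        if (generated_output) {
            return false; // pinyin candidates or pending letters exist; no fallback needed
        }
        for (UChar32 cp : codepoints) {
            if (cp >= 128) {
                return true; // non-ASCII (emoji, symbol): keep the original token
            }
        }
        return false; // pure ASCII: drop here and let other filters handle it
    }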
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_factory.cpp b/be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_factory.cpp
index 8603b74ce5d..68cb927aa43 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_factory.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_factory.cpp
@@ -52,14 +52,6 @@ void PinyinFilterFactory::initialize(const Settings& settings) {
// Integer parameters
config_->limitFirstLetterLength = settings.get_int("limit_first_letter_length", 16);
-
- // Validate configuration (same validation as Java)
- if (!(config_->keepFirstLetter || config_->keepSeparateFirstLetter || config_->keepFullPinyin ||
- config_->keepJoinedFullPinyin || config_->keepSeparateChinese)) {
- throw Exception(ErrorCode::INVALID_ARGUMENT,
- "pinyin config error, can't disable separate_first_letter, "
- "first_letter and full_pinyin at the same time.");
- }
}
TokenFilterPtr PinyinFilterFactory::create(const TokenStreamPtr& in) {
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_formatter.cpp b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_formatter.cpp
index 8bae9514faa..577613e29fc 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_formatter.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_formatter.cpp
@@ -38,6 +38,25 @@ const std::unordered_map<char, std::array<const char*, 4>> TONE_MARKS = {
};
constexpr const char* VOWELS = "aeiouv";
+
+// Pre-compiled regex patterns (compile once, reuse many times for performance)
+// Only use regex for patterns that require pattern matching (character classes, etc.)
+const std::regex TONE_NUMBER_REGEX("[1-5]");
+const std::regex PINYIN_VALIDATION_REGEX("[a-z]*[1-5]?");
+const std::regex PINYIN_WITH_TONE_REGEX("[a-z]*[1-5]");
+
+// Helper function for simple string replacement (faster than regex for fixed strings)
+inline std::string replaceAll(const std::string& str, const std::string& from,
+ const std::string& to) {
+ if (from.empty()) return str;
+ std::string result = str;
+ size_t pos = 0;
+ while ((pos = result.find(from, pos)) != std::string::npos) {
+ result.replace(pos, from.length(), to);
+ pos += to.length();
+ }
+ return result;
+}
} // namespace
std::string PinyinFormatter::formatPinyin(const std::string& pinyin_str,
@@ -62,10 +81,10 @@ std::string PinyinFormatter::formatPinyin(const std::string& pinyin_str,
switch (working_format.getToneType()) {
case ToneType::WITHOUT_TONE:
- result = std::regex_replace(result, std::regex("[1-5]"), "");
+ result = std::regex_replace(result, TONE_NUMBER_REGEX, "");
break;
case ToneType::WITH_TONE_MARK:
- result = std::regex_replace(result, std::regex("u:"), "v");
+ result = replaceAll(result, "u:", "v");
result = convertToneNumber2ToneMark(result);
break;
case ToneType::WITH_TONE_NUMBER:
@@ -76,10 +95,10 @@ std::string PinyinFormatter::formatPinyin(const std::string& pinyin_str,
if (working_format.getToneType() != ToneType::WITH_TONE_MARK) {
switch (working_format.getYuCharType()) {
case YuCharType::WITH_V:
- result = std::regex_replace(result, std::regex("u:"), "v");
+ result = replaceAll(result, "u:", "v");
break;
case YuCharType::WITH_U_UNICODE:
- result = std::regex_replace(result, std::regex("u:"), "ü");
+ result = replaceAll(result, "u:", "ü");
break;
case YuCharType::WITH_U_AND_COLON:
default:
@@ -143,7 +162,7 @@ std::string PinyinFormatter::convertToneNumber2ToneMark(const std::string& pinyi
std::transform(lower_case_pinyin.begin(), lower_case_pinyin.end(),
lower_case_pinyin.begin(),
[](unsigned char c) { return std::tolower(c); });
- if (!std::regex_match(lower_case_pinyin, std::regex("[a-z]*[1-5]?"))) {
+ if (!std::regex_match(lower_case_pinyin, PINYIN_VALIDATION_REGEX)) {
return lower_case_pinyin;
}
@@ -153,7 +172,7 @@ std::string PinyinFormatter::convertToneNumber2ToneMark(const std::string& pinyi
char unmarked_vowel = DEFAULT_CHAR_VALUE;
int index_of_unmarked_vowel = DEFAULT_INDEX_VALUE;
- if (std::regex_match(lower_case_pinyin, std::regex("[a-z]*[1-5]"))) {
+ if (std::regex_match(lower_case_pinyin, PINYIN_WITH_TONE_REGEX)) {
int tune_number = lower_case_pinyin.back() - '0';
size_t index_of_a = lower_case_pinyin.find('a');
@@ -190,7 +209,7 @@ std::string PinyinFormatter::convertToneNumber2ToneMark(const std::string& pinyi
std::string result;
std::string prefix = lower_case_pinyin.substr(0, index_of_unmarked_vowel);
- result += std::regex_replace(prefix, std::regex("v"), "ü");
+ result += replaceAll(prefix, "v", "ü");
result += marked_vowel;
if (index_of_unmarked_vowel + 1 <
@@ -198,14 +217,14 @@ std::string PinyinFormatter::convertToneNumber2ToneMark(const std::string& pinyi
std::string suffix = lower_case_pinyin.substr(
index_of_unmarked_vowel + 1,
lower_case_pinyin.length() - 1 -
index_of_unmarked_vowel - 1);
- result += std::regex_replace(suffix, std::regex("v"), "ü");
+ result += replaceAll(suffix, "v", "ü");
}
return result;
}
}
} else {
- return std::regex_replace(lower_case_pinyin, std::regex("v"), "ü");
+ return replaceAll(lower_case_pinyin, "v", "ü");
}
return lower_case_pinyin;
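The hunks above replace per-call std::regex construction with file-scope pre-compiled patterns plus a fixed-string splice. A standalone sketch of the trade-off (names are illustrative, not from this patch; assumes the search string is non-empty):

    #include <cassert>
    #include <regex>
    #include <string>

    // Compiled once at namespace scope, like the patch's TONE_NUMBER_REGEX etc.
    static const std::regex U_COLON_REGEX("u:");

    // Fixed-string replacement: no regex engine involved.
    std::string replace_fixed(std::string s, const std::string& from, const std::string& to) {
        for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.length()) {
            s.replace(pos, from.length(), to);
        }
        return s;
    }

    int main() {
        // Both paths agree; the fixed-string one avoids per-call regex compilation.
        assert(replace_fixed("nu:3", "u:", "v") == "nv3");
        assert(std::regex_replace(std::string("nu:3"), U_COLON_REGEX, "v") == "nv3");
        return 0;
    }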
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_tokenizer.cpp b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_tokenizer.cpp
index 57fcff01ce5..c46709c2ab0 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_tokenizer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_tokenizer.cpp
@@ -40,12 +40,6 @@ PinyinTokenizer::PinyinTokenizer(std::shared_ptr<doris::segment_v2::PinyinConfig
if (!config_) {
config_ = std::make_shared<doris::segment_v2::PinyinConfig>();
}
- if (!(config_->keepFirstLetter || config_->keepSeparateFirstLetter || config_->keepFullPinyin ||
- config_->keepJoinedFullPinyin || config_->keepSeparateChinese)) {
- throw Exception(ErrorCode::INVALID_ARGUMENT,
- "pinyin config error, can't disable separate_first_letter, first_letter "
- "and full_pinyin at the same time.");
- "and full_pinyin at the same time.");
- }
candidate_.clear();
terms_filter_.clear();
first_letters_.clear();
@@ -93,7 +87,7 @@ void PinyinTokenizer::processInput() {
}
position_ = 0;
- int ascii_buff_start = -1;
+ int ascii_buff_start_byte = -1;
std::string ascii_buff;
int char_index = 0;
@@ -109,18 +103,15 @@ void PinyinTokenizer::processInput() {
(r.cp >= '0' && r.cp <= '9');
if (is_ascii_context) {
- if (ascii_buff_start < 0) ascii_buff_start = r.byte_start;
+ if (ascii_buff_start_byte < 0) ascii_buff_start_byte = r.byte_start;
if (is_alnum && config_->keepNoneChinese) {
if (config_->keepNoneChineseTogether) {
ascii_buff.push_back(static_cast<char>(r.cp));
} else {
position_++;
- int char_position = position_;
-
- char_position = char_index + 1;
-
std::string single_char(1, static_cast<char>(r.cp));
- addCandidate(single_char, r.byte_start, r.byte_end, char_position);
+ // Use byte offset for single ASCII character
+ addCandidate(single_char, r.byte_start, r.byte_end, char_index + 1);
}
}
if (is_alnum && config_->keepNoneChineseInFirstLetter) {
@@ -131,7 +122,7 @@ void PinyinTokenizer::processInput() {
}
} else {
if (!ascii_buff.empty()) {
- parseBuff(ascii_buff, ascii_buff_start);
+ parseBuff(ascii_buff, ascii_buff_start_byte);
}
bool incr_position = false;
if (!pinyin.empty() && !chinese.empty()) {
@@ -159,20 +150,21 @@ void PinyinTokenizer::processInput() {
}
if (!ascii_buff.empty()) {
- parseBuff(ascii_buff, ascii_buff_start);
+ parseBuff(ascii_buff, ascii_buff_start_byte);
}
}
+ int total_byte_length = runes_.empty() ? 0 : runes_.back().byte_end;
+
if (config_->keepOriginal && !processed_original_) {
processed_original_ = true;
std::string source_utf8 = codepointsToUtf8(source_codepoints_);
- addCandidate(source_utf8, 0, static_cast<int>(source_utf8.length()), 1);
+ addCandidate(source_utf8, 0, total_byte_length, 1);
}
if (config_->keepJoinedFullPinyin && !processed_full_pinyin_letter_ &&
!full_pinyin_letters_.empty()) {
processed_full_pinyin_letter_ = true;
- std::string source_utf8 = codepointsToUtf8(source_codepoints_);
- addCandidate(full_pinyin_letters_, 0, static_cast<int>(source_utf8.length()), 1);
+ addCandidate(full_pinyin_letters_, 0, total_byte_length, 1);
full_pinyin_letters_.clear();
}
if (config_->keepFirstLetter && !first_letters_.empty() && !processed_first_letter_) {
@@ -188,7 +180,7 @@ void PinyinTokenizer::processInput() {
[](unsigned char x) { return static_cast<char>(std::tolower(x)); });
}
if (!(config_->keepSeparateFirstLetter && fl.length() <= 1)) {
- addCandidate(fl, 0, static_cast<int>(fl.length()), 1);
+ addCandidate(fl, 0, total_byte_length, 1);
}
}
@@ -214,8 +206,14 @@ Token* PinyinTokenizer::next(Token* token) {
size_t size = std::min(text.size(), static_cast<size_t>(LUCENE_MAX_WORD_LEN));
token->setNoCopy(text.data(), 0, static_cast<int32_t>(size));
- token->setStartOffset(item.start_offset);
- token->setEndOffset(item.end_offset);
+ if (config_->ignorePinyinOffset) {
+ int total_byte_length = runes_.empty() ? 0 : runes_.back().byte_end;
+ token->setStartOffset(0);
+ token->setEndOffset(total_byte_length);
+ } else {
+ token->setStartOffset(item.start_offset);
+ token->setEndOffset(item.end_offset);
+ }
int offset = item.position - last_increment_position_;
if (offset < 0) offset = 0;
@@ -297,18 +295,22 @@ void PinyinTokenizer::decode_to_runes() {
}
}
-void PinyinTokenizer::parseBuff(std::string& ascii_buff, int& ascii_buff_start) {
+void PinyinTokenizer::parseBuff(std::string& ascii_buff, int& ascii_buff_start_byte) {
if (ascii_buff.empty()) return;
if (!config_->keepNoneChinese) {
ascii_buff.clear();
- ascii_buff_start = -1;
+ ascii_buff_start_byte = -1;
return;
}
- int32_t seg_start = ascii_buff_start;
- int32_t seg_end = seg_start + static_cast<int32_t>(ascii_buff.size());
+
+ // Use byte offset for ASCII buffer
+ // ascii_buff_start_byte is the byte position where the buffer started
+ int32_t buff_byte_size = static_cast<int32_t>(ascii_buff.size());
+ int32_t buff_end_byte = ascii_buff_start_byte + buff_byte_size;
+
if (config_->noneChinesePinyinTokenize) {
std::vector<std::string> result = PinyinAlphabetTokenizer::walk(ascii_buff);
- int32_t start = seg_start;
+ int32_t start = ascii_buff_start_byte;
for (const std::string& t : result) {
int32_t end = config_->fixedPinyinOffset ? start + 1
: start + static_cast<int32_t>(t.length());
@@ -319,10 +321,10 @@ void PinyinTokenizer::parseBuff(std::string& ascii_buff, int& ascii_buff_start)
} else if (config_->keepFirstLetter || config_->keepSeparateFirstLetter ||
config_->keepFullPinyin || !config_->keepNoneChineseInJoinedFullPinyin) {
position_++;
- addCandidate(ascii_buff, seg_start, seg_end, position_);
+ addCandidate(ascii_buff, ascii_buff_start_byte, buff_end_byte, position_);
}
ascii_buff.clear();
- ascii_buff_start = -1;
+ ascii_buff_start_byte = -1;
}
std::string PinyinTokenizer::codepointsToUtf8(const std::vector<UChar32>& codepoints) const {
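The rename above matters because candidate offsets are UTF-8 byte positions, not character indices. A quick standalone check of the convention the tests below rely on (assumes UTF-8 source encoding; not code from this patch):

    #include <cstdio>
    #include <string>

    int main() {
        std::string s = "刘德华"; // three CJK codepoints, three UTF-8 bytes each
        std::printf("%zu\n", s.size()); // prints 9; byte ranges: 刘 [0,3) 德 [3,6) 华 [6,9)
        return 0;
    }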
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_tokenizer.h b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_tokenizer.h
index 5a09b0d3715..ef65874af07 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_tokenizer.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_tokenizer.h
@@ -76,7 +76,7 @@ private:
void decode_to_runes();
void processInput();
- void parseBuff(std::string& ascii_buff, int& ascii_buff_start);
+ void parseBuff(std::string& ascii_buff, int& ascii_buff_start_byte);
std::string codepointsToUtf8(const std::vector<UChar32>& codepoints) const;
};
diff --git a/be/test/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_test.cpp
index 9ffb3462ec1..8662cace8ce 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_test.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index/token_filter/pinyin_filter_test.cpp
@@ -406,4 +406,359 @@ TEST_F(PinyinFilterTest, TestTokenFilter_OnlyFullPinyin) {
assertTokens(tokens, expected, "Only full pinyin test");
}
+// Test emoji preservation without keep_original setting
+TEST_F(PinyinFilterTest, TestTokenFilter_EmojiPreservation) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false"; // Emoji should still be preserved via
fallback
+ config["keep_none_chinese"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ auto tokens = tokenizeWithFilter("⭐白菜", "standard", config);
+
+ // Standard tokenizer outputs "⭐" and "白菜" as separate tokens
+ // "⭐" -> preserved via fallback (no pinyin candidates)
+ // "白菜" -> "bai", "b", "cai", "c", "bc"
+ std::vector<std::string> expected = {"⭐", "bai", "b", "cai", "c"};
+ assertTokens(tokens, expected, "StandardTokenizer + Emoji preservation");
+}
+
+// Test pure emoji input
+TEST_F(PinyinFilterTest, TestTokenFilter_PureEmoji) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false";
+ config["keep_none_chinese"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ auto tokens = tokenizeWithFilter("⭐", "standard", config);
+
+ std::vector<std::string> expected = {"⭐"};
+ assertTokens(tokens, expected, "Pure emoji should be preserved");
+}
+
+// Test multiple emojis
+TEST_F(PinyinFilterTest, TestTokenFilter_MultipleEmojis) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false";
+ config["keep_none_chinese"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ auto tokens = tokenizeWithFilter("🎉中国🚀", "standard", config);
+
+ std::vector<std::string> expected = {"🎉", "zhong", "z", "guo", "g", "🚀"};
+ assertTokens(tokens, expected, "Multiple emojis with Chinese");
+}
+
+// Test keepNoneChineseTogether = false with letters and digits
+TEST_F(PinyinFilterTest, TestTokenFilter_KeepNoneChineseTogetherFalse) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "true";
+ config["keep_none_chinese"] = "true";
+ config["keep_none_chinese_together"] = "false";
+ config["none_chinese_pinyin_tokenize"] = "true";
+ config["lowercase"] = "true";
+ config["remove_duplicated_term"] = "true";
+ config["ignore_pinyin_offset"] = "false";
+
+ auto tokens = tokenizeWithFilter("刘德华ABC123", "keyword", config);
+
+ // Letters are split individually, digits are split individually too
+ std::vector<std::string> expected = {
+ "liu", "刘德华abc123", "ldhabc123", "de", "hua", "a", "b", "c", "1",
"2", "3"};
+ assertTokens(tokens, expected, "KeepNoneChineseTogether=false with mixed
content");
+}
+
+// Test keepNoneChineseTogether = false with pure letters
+TEST_F(PinyinFilterTest, TestTokenFilter_KeepNoneChineseTogetherFalse_PureLetters) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true"; // Need at least one output format enabled
+ config["keep_full_pinyin"] = "false";
+ config["keep_original"] = "false";
+ config["keep_none_chinese"] = "true";
+ config["keep_none_chinese_together"] = "false";
+ config["none_chinese_pinyin_tokenize"] = "true";
+ config["lowercase"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ auto tokens = tokenizeWithFilter("DEL", "keyword", config);
+
+ // All letters should be split individually, plus the result from PinyinAlphabetTokenizer
+ // With keep_first_letter=true, we get the combined first letter too
+ std::vector<std::string> expected = {"D", "DEL", "E", "L"};
+ assertTokens(tokens, expected, "KeepNoneChineseTogether=false pure letters");
+}
+
+// Test Unicode symbols fallback when no candidates generated
+TEST_F(PinyinFilterTest, TestTokenFilter_UnicodeFallback) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false"; // No keep_original but symbols should
still be preserved
+ config["keep_none_chinese"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ // Use keyword tokenizer to ensure the input is passed as-is to the filter
+ auto tokens = tokenizeWithFilter("①②③", "keyword", config);
+
+ // Unicode symbols should be preserved even without keep_original via fallback
+ std::vector<std::string> expected = {"①②③"};
+ assertTokens(tokens, expected, "Unicode symbols preserved without keep_original");
+}
+
+TEST_F(PinyinFilterTest, TestTokenFilter_NullConfig) {
+ std::string text = "测试"; // Keep string alive
+ auto reader = std::make_shared<lucene::util::SStringReader<char>>();
+ reader->init(text.data(), text.size(), false);
+
+ StandardTokenizerFactory tokenizer_factory;
+ Settings tokenizer_settings;
+ tokenizer_factory.initialize(tokenizer_settings);
+ auto tokenizer = tokenizer_factory.create();
+ tokenizer->set_reader(reader);
+ tokenizer->reset();
+
+ // Create filter with nullptr config directly
+ PinyinFilter filter(tokenizer, nullptr);
+ filter.initialize();
+
+ Token token;
+ // Should work with default config without crashing
+ EXPECT_NE(filter.next(&token), nullptr);
+}
+
+TEST_F(PinyinFilterTest, TestTokenFilter_KeepNoneChineseFalse) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false";
+ config["keep_none_chinese"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ auto tokens = tokenizeWithFilter("测试ABC123", "standard", config);
+ std::vector<std::string> expected = {"ce", "c", "shi", "s", "abc123"};
+ assertTokens(tokens, expected, "KeepNoneChineseFalse test");
+}
+
+TEST_F(PinyinFilterTest, TestTokenFilter_NoneChinesePinyinTokenizeFalse) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false";
+ config["keep_none_chinese"] = "true";
+ config["none_chinese_pinyin_tokenize"] = "false"; // Don't tokenize ASCII
+ config["ignore_pinyin_offset"] = "false";
+
+ auto tokens = tokenizeWithFilter("刘德华ABC123", "standard", config);
+ std::vector<std::string> expected = {"liu", "l", "de", "d", "hua", "h",
"abc123"};
+ assertTokens(tokens, expected, "NoneChinesePinyinTokenizeFalse test");
+}
+
+TEST_F(PinyinFilterTest, TestTokenFilter_WhitespaceOnly) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "true";
+ config["keep_none_chinese"] = "true";
+ config["ignore_pinyin_offset"] = "false";
+
+ // Test with whitespace-only and mixed content
+ auto tokens = tokenizeWithFilter(" 测试 ", "keyword", config);
+ std::vector<std::string> expected = {"ce", "测试", "cs", "shi"};
+ assertTokens(tokens, expected, "WhitespaceOnly test");
+}
+
+TEST_F(PinyinFilterTest, TestTokenFilter_PositionIncrement) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "true";
+ config["keep_none_chinese"] = "true";
+ config["keep_none_chinese_together"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ // Use multiple Chinese characters to trigger position increment logic
+ auto tokens = tokenizeWithFilter("刘德华", "standard", config);
+
+ std::vector<std::string> expected = {"liu", "刘", "l", "de", "德", "d",
"hua", "华", "h"};
+ assertTokens(tokens, expected, "PositionIncrement test");
+}
+
+// Test reset() method - verifies PinyinFilter::reset() correctly resets state
+// This tests the code path in pinyin_filter.cpp:99-110
+TEST_F(PinyinFilterTest, TestTokenFilter_Reset) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ // First tokenization - tests that filter works correctly
+ auto tokens1 = tokenizeWithFilter("刘德华", "keyword", config);
+
+ EXPECT_GT(tokens1.size(), 0) << "First tokenization should produce tokens";
+ bool has_liu = std::find(tokens1.begin(), tokens1.end(), "liu") != tokens1.end();
+ EXPECT_TRUE(has_liu) << "First tokenization should contain 'liu'";
+
+ // Second tokenization with different text - tests that filter state is independent
+ auto tokens2 = tokenizeWithFilter("张学友", "keyword", config);
+
+ EXPECT_GT(tokens2.size(), 0) << "Second tokenization should produce tokens";
+ bool has_zhang = std::find(tokens2.begin(), tokens2.end(), "zhang") != tokens2.end();
+ EXPECT_TRUE(has_zhang) << "Second tokenization should contain 'zhang'";
+
+ // Ensure tokens from first text are not in second result (state isolation)
+ bool has_liu_in_second = std::find(tokens2.begin(), tokens2.end(), "liu") != tokens2.end();
+ EXPECT_FALSE(has_liu_in_second)
+ << "Second tokenization should NOT contain 'liu' from first text";
+}
+
+// Test reset() with empty input after valid input
+TEST_F(PinyinFilterTest, TestTokenFilter_ResetWithEmptyInput) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ // First tokenization with valid text
+ auto tokens1 = tokenizeWithFilter("测试", "keyword", config);
+ EXPECT_GT(tokens1.size(), 0) << "First tokenization should produce tokens";
+
+ // Tokenization with empty text
+ auto tokens2 = tokenizeWithFilter("", "keyword", config);
+ EXPECT_EQ(tokens2.size(), 0) << "Empty input should produce no tokens";
+}
+
+// Test Unicode symbol preservation fallback - tests pinyin_filter.cpp:243-259
+// When pinyin_list and chinese_list are empty but there are non-ASCII Unicode chars,
+// the filter should preserve the original token
+TEST_F(PinyinFilterTest, TestTokenFilter_UnicodeSymbolPreservation) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false"; // Even without keep_original, Unicode
should be preserved
+ config["keep_none_chinese"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ // Test circled numbers - these are Unicode symbols that cannot be converted to pinyin
+ auto tokens1 = tokenizeWithFilter("①②③", "keyword", config);
+ EXPECT_EQ(tokens1.size(), 1) << "Circled numbers should be preserved as one token";
+ EXPECT_EQ(tokens1[0], "①②③") << "Circled numbers should be preserved";
+
+ // Test other Unicode symbols
+ auto tokens2 = tokenizeWithFilter("★☆♠♥", "keyword", config);
+ EXPECT_EQ(tokens2.size(), 1) << "Card suit symbols should be preserved";
+ EXPECT_EQ(tokens2[0], "★☆♠♥") << "Card suit symbols should be preserved";
+
+ // Test mathematical symbols
+ auto tokens3 = tokenizeWithFilter("∑∏∫∂", "keyword", config);
+ EXPECT_EQ(tokens3.size(), 1) << "Math symbols should be preserved";
+ EXPECT_EQ(tokens3[0], "∑∏∫∂") << "Math symbols should be preserved";
+}
+
+// Test Unicode symbols mixed with Chinese characters
+// Note: Standard tokenizer may filter out some Unicode symbols, so we test with keyword tokenizer
+TEST_F(PinyinFilterTest, TestTokenFilter_UnicodeSymbolsWithChinese) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false";
+ config["keep_none_chinese"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ // Use keyword tokenizer to keep the whole input as one token
+ // This tests the fallback logic for mixed Unicode symbols and Chinese
+ auto tokens = tokenizeWithFilter("①中国", "keyword", config);
+
+ // With keyword tokenizer, pinyin filter should process the whole string
+ // Chinese characters get converted to pinyin, but we need to check fallback behavior
+ bool has_zhong = std::find(tokens.begin(), tokens.end(), "zhong") != tokens.end();
+ bool has_guo = std::find(tokens.begin(), tokens.end(), "guo") != tokens.end();
+
+ EXPECT_TRUE(has_zhong) << "Should have pinyin 'zhong'";
+ EXPECT_TRUE(has_guo) << "Should have pinyin 'guo'";
+
+ // Test that Chinese pinyin is correctly generated even with Unicode prefix
+ EXPECT_GT(tokens.size(), 0) << "Should produce tokens";
+}
+
+// Test pure emoji preservation
+TEST_F(PinyinFilterTest, TestTokenFilter_PureEmojiPreservation) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false";
+ config["keep_none_chinese"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ // Pure emoji should be preserved via the fallback logic
+ auto tokens = tokenizeWithFilter("😀😁😂", "keyword", config);
+ EXPECT_EQ(tokens.size(), 1) << "Pure emoji should be preserved as one
token";
+ EXPECT_EQ(tokens[0], "😀😁😂") << "Emoji string should be preserved";
+}
+
+// Test that ASCII-only input without Unicode symbols returns false
+// This tests the code path: if (!has_unicode_symbols) { return false; }
+TEST_F(PinyinFilterTest, TestTokenFilter_AsciiOnlyFallbackHandling) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "false";
+ config["keep_full_pinyin"] = "false";
+ config["keep_original"] = "false";
+ config["keep_none_chinese"] = "false"; // This will cause ASCII to have no
candidates
+ config["keep_joined_full_pinyin"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ // With all options disabled, pure ASCII should not produce tokens
+ // because it has no Unicode symbols to preserve via fallback
+ auto tokens = tokenizeWithFilter("abc123", "keyword", config);
+ // For pure ASCII input the Unicode fallback is not triggered:
+ // processCurrentToken() returns false when no candidates are generated and
+ // no non-ASCII codepoints are present, so at most the original token survives.
+ EXPECT_LE(tokens.size(), 1)
+ << "Pure ASCII with no output options should produce minimal tokens";
+}
+
+// Test currency and special Unicode symbols
+TEST_F(PinyinFilterTest, TestTokenFilter_CurrencySymbols) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false";
+ config["keep_none_chinese"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ // Currency symbols are Unicode but not Chinese
+ auto tokens = tokenizeWithFilter("€£¥₹", "keyword", config);
+ EXPECT_EQ(tokens.size(), 1) << "Currency symbols should be preserved";
+ EXPECT_EQ(tokens[0], "€£¥₹") << "Currency symbols should be preserved
as-is";
+}
+
+// Test Japanese/Korean characters (CJK but not in Chinese pinyin dict)
+TEST_F(PinyinFilterTest, TestTokenFilter_NonChineseCJK) {
+ std::unordered_map<std::string, std::string> config;
+ config["keep_first_letter"] = "true";
+ config["keep_full_pinyin"] = "true";
+ config["keep_original"] = "false";
+ config["keep_none_chinese"] = "false";
+ config["ignore_pinyin_offset"] = "false";
+
+ // Japanese hiragana - these are non-ASCII Unicode but not Chinese
+ auto tokens1 = tokenizeWithFilter("あいう", "keyword", config);
+ EXPECT_EQ(tokens1.size(), 1) << "Japanese hiragana should be preserved";
+ EXPECT_EQ(tokens1[0], "あいう") << "Japanese hiragana should be preserved
as-is";
+
+ // Korean hangul
+ auto tokens2 = tokenizeWithFilter("한글", "keyword", config);
+ EXPECT_EQ(tokens2.size(), 1) << "Korean hangul should be preserved";
+ EXPECT_EQ(tokens2[0], "한글") << "Korean hangul should be preserved as-is";
+}
+
} // namespace doris::segment_v2::inverted_index
\ No newline at end of file
diff --git a/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin_analysis_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin_analysis_test.cpp
index a21fc7cb290..b7b124fa83f 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin_analysis_test.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin_analysis_test.cpp
@@ -1370,3 +1370,440 @@ TEST_F(PinyinAnalysisTest, TestRepeatedCharacters) {
}
EXPECT_LE(de_count, 1) << "Should have at most one 'de' with removeDuplicatedTerm";
}
+
+// Test emoji handling in PinyinTokenizer - emojis should be dropped
+TEST_F(PinyinAnalysisTest, TestTokenizer_EmojiShouldBeDropped) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = false;
+ config.keepNoneChinese = false;
+ config.ignorePinyinOffset = false;
+
+ // When using PinyinTokenizer (not filter), emojis should be dropped
+ verifyTokens("⭐白菜", config, {"bai", "bc", "cai"});
+}
+
+// Test pure emoji input in PinyinTokenizer - should return empty
+TEST_F(PinyinAnalysisTest, TestTokenizer_PureEmojiDropped) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = false;
+ config.keepNoneChinese = false;
+ config.ignorePinyinOffset = false;
+
+ verifyTokens("⭐🎉", config, {});
+}
+
+// Test multiple emojis with Chinese in PinyinTokenizer - emojis dropped
+TEST_F(PinyinAnalysisTest, TestTokenizer_MultipleEmojisDropped) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = false;
+ config.keepNoneChinese = false;
+ config.ignorePinyinOffset = false;
+
+ // Emojis should be dropped, only Chinese pinyin remains
+ verifyTokens("🎉中国🚀", config, {"zhong", "zg", "guo"});
+}
+
+// Test keepNoneChineseTogether = false with PinyinTokenizer
+TEST_F(PinyinAnalysisTest, TestTokenizer_KeepNoneChineseTogetherFalse) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = true;
+ config.keepNoneChinese = true;
+ config.keepNoneChineseTogether = false;
+ config.noneChinesePinyinTokenize = true;
+ config.lowercase = true;
+ config.removeDuplicatedTerm = true;
+ config.ignorePinyinOffset = false;
+
+ // Both letters and digits should be split individually
+ verifyTokens("刘德华ABC123", config,
+ {"liu", "刘德华abc123", "ldhabc123", "de", "hua", "a", "b", "c",
"1", "2", "3"});
+}
+
+// Test Unicode symbols in PinyinTokenizer - should be dropped like emojis
+TEST_F(PinyinAnalysisTest, TestTokenizer_UnicodeSymbolsDropped) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = false;
+ config.keepNoneChinese = false;
+ config.ignorePinyinOffset = false;
+
+ // Unicode symbols like circled numbers should be dropped
+ verifyTokens("①②③中国", config, {"zhong", "zg", "guo"});
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_True_SingleChinese) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = false;
+ config.keepJoinedFullPinyin = false;
+ config.keepSeparateFirstLetter = false;
+ config.ignorePinyinOffset = true;
+
+ std::string text = "刘德华";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ int total_bytes = 9;
+ std::vector<std::string> expected_terms = {"liu", "ldh", "de", "hua"};
+ EXPECT_EQ(tokens.size(), expected_terms.size());
+ for (size_t i = 0; i < tokens.size(); ++i) {
+ EXPECT_EQ(tokens[i].term, expected_terms[i]) << "Token mismatch at position " << i;
+ EXPECT_EQ(tokens[i].startOffset, 0) << "Token: " << tokens[i].term;
+ EXPECT_EQ(tokens[i].endOffset, total_bytes) << "Token: " << tokens[i].term;
+ }
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_True_KeepOriginal) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = true;
+ config.ignorePinyinOffset = true;
+
+ std::string text = "你好";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ int total_bytes = 6;
+ for (const auto& token : tokens) {
+ EXPECT_EQ(token.startOffset, 0) << "Token: " << token.term;
+ EXPECT_EQ(token.endOffset, total_bytes) << "Token: " << token.term;
+ }
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_True_JoinedFullPinyin) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepJoinedFullPinyin = true;
+ config.keepOriginal = false;
+ config.ignorePinyinOffset = true;
+
+ std::string text = "中国";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ int total_bytes = 6;
+ for (const auto& token : tokens) {
+ EXPECT_EQ(token.startOffset, 0) << "Token: " << token.term;
+ EXPECT_EQ(token.endOffset, total_bytes) << "Token: " << token.term;
+ }
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_True_AsciiOnly) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepNoneChinese = true;
+ config.ignorePinyinOffset = true;
+
+ std::string text = "hello";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ int total_bytes = 5;
+ for (const auto& token : tokens) {
+ EXPECT_EQ(token.startOffset, 0) << "Token: " << token.term;
+ EXPECT_EQ(token.endOffset, total_bytes) << "Token: " << token.term;
+ }
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_True_WithNumbers) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepNoneChinese = true;
+ config.ignorePinyinOffset = true;
+
+ std::string text = "北京2008";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ int total_bytes = 10;
+ for (const auto& token : tokens) {
+ EXPECT_EQ(token.startOffset, 0) << "Token: " << token.term;
+ EXPECT_EQ(token.endOffset, total_bytes) << "Token: " << token.term;
+ }
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_True_SpecialChars) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepNoneChinese = true;
+ config.ignorePinyinOffset = true;
+
+ std::string text = "A股B股";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ int total_bytes = 8;
+ for (const auto& token : tokens) {
+ EXPECT_EQ(token.startOffset, 0) << "Token: " << token.term;
+ EXPECT_EQ(token.endOffset, total_bytes) << "Token: " << token.term;
+ }
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_True_Polyphone) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.ignorePinyinOffset = true;
+
+ std::string text = "银行";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ int total_bytes = 6;
+ for (const auto& token : tokens) {
+ EXPECT_EQ(token.startOffset, 0) << "Token: " << token.term;
+ EXPECT_EQ(token.endOffset, total_bytes) << "Token: " << token.term;
+ }
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_True_LongText) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.ignorePinyinOffset = true;
+
+ std::string text = "我爱北京天安门";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ int total_bytes = 21;
+ for (const auto& token : tokens) {
+ EXPECT_EQ(token.startOffset, 0) << "Token: " << token.term;
+ EXPECT_EQ(token.endOffset, total_bytes) << "Token: " << token.term;
+ }
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_False_SingleChinese) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = false;
+ config.keepJoinedFullPinyin = false;
+ config.keepSeparateFirstLetter = false;
+ config.ignorePinyinOffset = false;
+
+ std::string text = "刘德华";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ for (const auto& token : tokens) {
+ if (token.term == "liu") {
+ EXPECT_EQ(token.startOffset, 0);
+ EXPECT_EQ(token.endOffset, 3);
+ } else if (token.term == "de") {
+ EXPECT_EQ(token.startOffset, 3);
+ EXPECT_EQ(token.endOffset, 6);
+ } else if (token.term == "hua") {
+ EXPECT_EQ(token.startOffset, 6);
+ EXPECT_EQ(token.endOffset, 9);
+ } else if (token.term == "ldh") {
+ EXPECT_EQ(token.startOffset, 0);
+ EXPECT_EQ(token.endOffset, 9);
+ }
+ }
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_False_MultiByteCharacters) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = false;
+ config.ignorePinyinOffset = false;
+
+ std::string text = "你好";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ bool found_ni = false;
+ bool found_hao = false;
+ bool found_nh = false;
+
+ for (const auto& token : tokens) {
+ if (token.term == "ni") {
+ EXPECT_EQ(token.startOffset, 0);
+ EXPECT_EQ(token.endOffset, 3);
+ found_ni = true;
+ } else if (token.term == "hao") {
+ EXPECT_EQ(token.startOffset, 3);
+ EXPECT_EQ(token.endOffset, 6);
+ found_hao = true;
+ } else if (token.term == "nh") {
+ EXPECT_EQ(token.startOffset, 0);
+ EXPECT_EQ(token.endOffset, 6);
+ found_nh = true;
+ }
+ }
+
+ EXPECT_TRUE(found_ni) << "Token 'ni' not found";
+ EXPECT_TRUE(found_hao) << "Token 'hao' not found";
+ EXPECT_TRUE(found_nh) << "Token 'nh' not found";
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_True_MixedContent) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = false;
+ config.ignorePinyinOffset = true;
+
+ std::string text = "刘a德";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ int total_bytes = 7;
+ std::vector<std::string> expected_terms = {"liu", "lad", "a", "de"};
+ EXPECT_EQ(tokens.size(), expected_terms.size());
+ for (size_t i = 0; i < tokens.size(); ++i) {
+ EXPECT_EQ(tokens[i].term, expected_terms[i]) << "Token mismatch at position " << i;
+ EXPECT_EQ(tokens[i].startOffset, 0) << "Token: " << tokens[i].term;
+ EXPECT_EQ(tokens[i].endOffset, total_bytes) << "Token: " << tokens[i].term;
+ }
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_False_MixedContent) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = false;
+ config.ignorePinyinOffset = false;
+
+ std::string text = "刘a德";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ bool found_liu = false;
+ bool found_a = false;
+ bool found_de = false;
+
+ for (const auto& token : tokens) {
+ if (token.term == "liu") {
+ EXPECT_EQ(token.startOffset, 0);
+ EXPECT_EQ(token.endOffset, 3);
+ found_liu = true;
+ } else if (token.term == "a") {
+ EXPECT_EQ(token.startOffset, 3);
+ EXPECT_EQ(token.endOffset, 4);
+ found_a = true;
+ } else if (token.term == "de") {
+ EXPECT_EQ(token.startOffset, 4);
+ EXPECT_EQ(token.endOffset, 7);
+ found_de = true;
+ } else if (token.term == "lad") {
+ EXPECT_EQ(token.startOffset, 0);
+ EXPECT_EQ(token.endOffset, 7);
+ }
+ }
+
+ EXPECT_TRUE(found_liu) << "Token 'liu' not found";
+ EXPECT_TRUE(found_a) << "Token 'a' not found";
+ EXPECT_TRUE(found_de) << "Token 'de' not found";
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_False_PolyphoneWords) {
+ PinyinConfig config;
+ config.keepFirstLetter = true;
+ config.keepFullPinyin = true;
+ config.keepOriginal = false;
+ config.ignorePinyinOffset = false;
+
+ std::string text = "银行";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_GT(tokens.size(), 0);
+
+ bool found_yin = false;
+ bool found_xing_or_hang = false;
+
+ for (const auto& token : tokens) {
+ if (token.term == "yin") {
+ EXPECT_EQ(token.startOffset, 0);
+ EXPECT_EQ(token.endOffset, 3);
+ found_yin = true;
+ } else if (token.term == "xing" || token.term == "hang") {
+ EXPECT_EQ(token.startOffset, 3);
+ EXPECT_EQ(token.endOffset, 6);
+ found_xing_or_hang = true;
+ } else if (token.term == "yx" || token.term == "yh") {
+ EXPECT_EQ(token.startOffset, 0);
+ EXPECT_EQ(token.endOffset, 6);
+ }
+ }
+
+ EXPECT_TRUE(found_yin) << "Token 'yin' not found";
+ EXPECT_TRUE(found_xing_or_hang) << "Token 'xing' or 'hang' not found";
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_False_JoinedFullPinyin) {
+ PinyinConfig config;
+ config.keepFirstLetter = false;
+ config.keepFullPinyin = false;
+ config.keepJoinedFullPinyin = true;
+ config.keepOriginal = false;
+ config.ignorePinyinOffset = false;
+
+ std::string text = "刘德华";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_EQ(tokens.size(), 1);
+ EXPECT_EQ(tokens[0].term, "liudehua");
+ EXPECT_EQ(tokens[0].startOffset, 0);
+ EXPECT_EQ(tokens[0].endOffset, 9);
+}
+
+TEST_F(PinyinAnalysisTest, TestIgnorePinyinOffset_False_KeepOriginal) {
+ PinyinConfig config;
+ config.keepFirstLetter = false;
+ config.keepFullPinyin = false;
+ config.keepOriginal = true;
+ config.ignorePinyinOffset = false;
+
+ std::string text = "刘德华";
+ auto result = getStringArrayListHashMap({text}, config);
+ auto& tokens = result[text];
+
+ ASSERT_EQ(tokens.size(), 1);
+ EXPECT_EQ(tokens[0].term, "刘德华");
+ EXPECT_EQ(tokens[0].startOffset, 0);
+ EXPECT_EQ(tokens[0].endOffset, 9);
+}
diff --git a/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin_util_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin_util_test.cpp
index 605beafbfdd..8badf98a272 100644
--- a/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin_util_test.cpp
+++ b/be/test/olap/rowset/segment_v2/inverted_index/tokenizer/pinyin_util_test.cpp
@@ -27,6 +27,7 @@
#include "common/config.h"
#include "common/logging.h"
#include "olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_format.h"
+#include "olap/rowset/segment_v2/inverted_index/tokenizer/pinyin/pinyin_formatter.h"
#include "unicode/utf8.h"
namespace doris::segment_v2::inverted_index {
@@ -682,4 +683,125 @@ TEST_F(PinyinUtilTest, TestUtf8CharCountVariousInputs) {
EXPECT_EQ(getUtf8CharCount("中国abc"), 5);
}
+// Test YuCharType::WITH_U_UNICODE - tests PinyinFormatter replaceAll(result, "u:", "ü")
+// Note: "绿" returns "lv" from polyphone dict matching, while "女"/"律" return "u:" format
+TEST_F(PinyinUtilTest, TestYuCharTypeWithUUnicode) {
+ auto& pinyin_util = PinyinUtil::instance();
+
+ // This tests the code path: case YuCharType::WITH_U_UNICODE: result = replaceAll(result, "u:", "ü");
+ PinyinFormat unicode_u_format(YuCharType::WITH_U_UNICODE, ToneType::WITHOUT_TONE,
+ CaseType::LOWERCASE);
+
+ // Test character "女" - returns "nu:3" from main dictionary, should become "nü"
+ std::vector<std::string> result1 = pinyin_util.convert(stringToCodepoints("女"), unicode_u_format);
+ EXPECT_EQ(result1.size(), 1);
+ EXPECT_EQ(result1[0], "nü") << "女 should be 'nü' with WITH_U_UNICODE format";
+
+ // Test character "律" - returns "lu:4" from main dictionary, should become "lü"
+ std::vector<std::string> result2 = pinyin_util.convert(stringToCodepoints("律"), unicode_u_format);
+ EXPECT_EQ(result2.size(), 1);
+ EXPECT_EQ(result2[0], "lü") << "律 should be 'lü' with WITH_U_UNICODE format";
+
+ // Test character "绿" - returns "lv" from polyphone dict (no u: to convert)
+ std::vector<std::string> result3 = pinyin_util.convert(stringToCodepoints("绿"), unicode_u_format);
+ EXPECT_EQ(result3.size(), 1);
+ EXPECT_EQ(result3[0], "lv") << "绿 returns 'lv' from polyphone dict";
+
+ // Test character "旅" - check what format it returns
+ std::vector<std::string> result4 = pinyin_util.convert(stringToCodepoints("旅"), unicode_u_format);
+ EXPECT_EQ(result4.size(), 1);
+ EXPECT_TRUE(result4[0] == "lü" || result4[0] == "lv")
+ << "旅 should be 'lü' or 'lv', got: " << result4[0];
+}
+
+// Test YuCharType::WITH_V - tests PinyinFormatter replaceAll(result, "u:", "v")
+TEST_F(PinyinUtilTest, TestYuCharTypeWithV) {
+ auto& pinyin_util = PinyinUtil::instance();
+
+ PinyinFormat v_format(YuCharType::WITH_V, ToneType::WITHOUT_TONE, CaseType::LOWERCASE);
+
+ // Test character "女" - should convert u: to v
+ std::vector<std::string> result1 = pinyin_util.convert(stringToCodepoints("女"), v_format);
+ EXPECT_EQ(result1.size(), 1);
+ EXPECT_EQ(result1[0], "nv") << "女 should be 'nv' with WITH_V format";
+
+ // Test character "绿"
+ std::vector<std::string> result2 = pinyin_util.convert(stringToCodepoints("绿"), v_format);
+ EXPECT_EQ(result2.size(), 1);
+ EXPECT_EQ(result2[0], "lv") << "绿 should be 'lv' with WITH_V format";
+}
+
+// Test YuCharType::WITH_U_AND_COLON (default) - keeps "u:" as is
+TEST_F(PinyinUtilTest, TestYuCharTypeWithUAndColon) {
+ auto& pinyin_util = PinyinUtil::instance();
+
+ PinyinFormat colon_format(YuCharType::WITH_U_AND_COLON, ToneType::WITHOUT_TONE,
+ CaseType::LOWERCASE);
+
+ // Test character "女" - returns "nu:3" from main dict, should keep "nu:"
+ std::vector<std::string> result1 = pinyin_util.convert(stringToCodepoints("女"), colon_format);
+ EXPECT_EQ(result1.size(), 1);
+ EXPECT_EQ(result1[0], "nu:") << "女 should be 'nu:' with WITH_U_AND_COLON format";
+
+ // Test character "律" - returns "lu:4" from main dict, should keep "lu:"
+ std::vector<std::string> result2 = pinyin_util.convert(stringToCodepoints("律"), colon_format);
+ EXPECT_EQ(result2.size(), 1);
+ EXPECT_EQ(result2[0], "lu:") << "律 should be 'lu:' with WITH_U_AND_COLON format";
+}
+
+// Test YuCharType with different CaseTypes
+TEST_F(PinyinUtilTest, TestYuCharTypeWithCaseTypes) {
+ auto& pinyin_util = PinyinUtil::instance();
+
+ // Test WITH_U_UNICODE + UPPERCASE
+ PinyinFormat unicode_upper(YuCharType::WITH_U_UNICODE, ToneType::WITHOUT_TONE,
+ CaseType::UPPERCASE);
+ std::vector<std::string> result1 = pinyin_util.convert(stringToCodepoints("女"), unicode_upper);
+ EXPECT_EQ(result1.size(), 1);
+ // Note: ü is a multi-byte UTF-8 character, uppercase might not work as expected
+ // The important thing is that u: was replaced with ü before uppercasing
+
+ // Test WITH_V + UPPERCASE
+ PinyinFormat v_upper(YuCharType::WITH_V, ToneType::WITHOUT_TONE, CaseType::UPPERCASE);
+ std::vector<std::string> result2 = pinyin_util.convert(stringToCodepoints("女"), v_upper);
+ EXPECT_EQ(result2.size(), 1);
+ EXPECT_EQ(result2[0], "NV") << "女 should be 'NV' with WITH_V + UPPERCASE";
+
+ // Test WITH_V + CAPITALIZE
+ PinyinFormat v_capitalize(YuCharType::WITH_V, ToneType::WITHOUT_TONE, CaseType::CAPITALIZE);
+ std::vector<std::string> result3 = pinyin_util.convert(stringToCodepoints("女"), v_capitalize);
+ EXPECT_EQ(result3.size(), 1);
+ EXPECT_EQ(result3[0], "Nv") << "女 should be 'Nv' with WITH_V + CAPITALIZE";
+}
+
+// Test PinyinFormatter directly to ensure u: conversion logic works correctly
+// This tests the code path regardless of dictionary format
+TEST_F(PinyinUtilTest, TestPinyinFormatterUColonConversion) {
+ // Test PinyinFormatter directly to verify u: -> ü conversion
+ PinyinFormat unicode_u_format(YuCharType::WITH_U_UNICODE, ToneType::WITHOUT_TONE,
+ CaseType::LOWERCASE);
+
+ // Test with known u: input strings
+ std::string result1 = PinyinFormatter::formatPinyin("nu:3",
unicode_u_format);
+ EXPECT_EQ(result1, "nü") << "nu:3 should become 'nü' with WITH_U_UNICODE
format";
+
+ std::string result2 = PinyinFormatter::formatPinyin("lu:4",
unicode_u_format);
+ EXPECT_EQ(result2, "lü") << "lu:4 should become 'lü' with WITH_U_UNICODE
format";
+
+ // Test WITH_V format
+ PinyinFormat v_format(YuCharType::WITH_V, ToneType::WITHOUT_TONE, CaseType::LOWERCASE);
+ std::string result3 = PinyinFormatter::formatPinyin("nu:3", v_format);
+ EXPECT_EQ(result3, "nv") << "nu:3 should become 'nv' with WITH_V format";
+
+ // Test WITH_U_AND_COLON format
+ PinyinFormat colon_format(YuCharType::WITH_U_AND_COLON, ToneType::WITHOUT_TONE,
+ CaseType::LOWERCASE);
+ std::string result4 = PinyinFormatter::formatPinyin("nu:3", colon_format);
+ EXPECT_EQ(result4, "nu:") << "nu:3 should become 'nu:' with
WITH_U_AND_COLON format";
+}
+
} // namespace doris::segment_v2::inverted_index
diff --git a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/PinyinTokenFilterValidator.java b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/PinyinTokenFilterValidator.java
index 278a1d3e343..c6fac8622c1 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/PinyinTokenFilterValidator.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/PinyinTokenFilterValidator.java
@@ -108,42 +108,17 @@ public class PinyinTokenFilterValidator extends BasePolicyValidator {
*/
private void validateConfigurationLogic(Map<String, String> props) throws
DdlException {
// ensure at least one output format is enabled
- boolean keepOriginal = getBooleanValue(props, "keep_original", false);
boolean keepFirstLetter = getBooleanValue(props, "keep_first_letter",
true);
boolean keepFullPinyin = getBooleanValue(props, "keep_full_pinyin",
true);
boolean keepJoinedFullPinyin = getBooleanValue(props,
"keep_joined_full_pinyin", false);
boolean keepSeparateFirstLetter = getBooleanValue(props,
"keep_separate_first_letter", false);
boolean keepSeparateChinese = getBooleanValue(props,
"keep_separate_chinese", false);
- if (!keepOriginal && !keepFirstLetter && !keepFullPinyin
- && !keepJoinedFullPinyin && !keepSeparateFirstLetter &&
!keepSeparateChinese) {
- throw new DdlException("At least one output format must be
enabled: "
- + "keep_original, keep_first_letter, keep_full_pinyin,
keep_joined_full_pinyin, "
- + "keep_separate_first_letter, or keep_separate_chinese");
- }
-
- // validate keep_separate_first_letter and keep_first_letter
relationship
- if (keepSeparateFirstLetter && !keepFirstLetter) {
- throw new DdlException("keep_separate_first_letter requires
keep_first_letter to be enabled");
- }
-
- // validate keep_none_chinese_in_first_letter and keep_first_letter
relationship
- boolean keepNoneChineseInFirstLetter = getBooleanValue(props,
"keep_none_chinese_in_first_letter", true);
- if (keepNoneChineseInFirstLetter && !keepFirstLetter) {
- throw new DdlException("keep_none_chinese_in_first_letter requires
keep_first_letter to be enabled");
- }
-
- // validate keep_none_chinese_in_joined_full_pinyin and keep_joined_full_pinyin relationship
- boolean keepNoneChineseInJoinedFullPinyin = getBooleanValue(props, "keep_none_chinese_in_joined_full_pinyin",
- false);
- if (keepNoneChineseInJoinedFullPinyin && !keepJoinedFullPinyin) {
- throw new DdlException("keep_none_chinese_in_joined_full_pinyin
requires keep_joined_full_pinyin "
- + "to be enabled");
- }
-
- // validate limit_first_letter_length and keep_first_letter relationship
- if (props.containsKey("limit_first_letter_length") && !keepFirstLetter) {
- throw new DdlException("limit_first_letter_length is only valid when keep_first_letter is enabled");
+ if (!keepFirstLetter && !keepFullPinyin && !keepJoinedFullPinyin
+ && !keepSeparateFirstLetter && !keepSeparateChinese) {
+ throw new DdlException("pinyin config error, at least one output format must be enabled "
+ + "(keep_first_letter, keep_separate_first_letter, keep_full_pinyin, "
+ + "keep_joined_full_pinyin, or keep_separate_chinese).");
}
}
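The hunk above (and the identical one in PinyinTokenizerValidator that follows) relaxes the FE-side validation to a single rule: at least one pinyin output format must be enabled. keep_original no longer counts toward that rule, since the BE filter now preserves the original token as a fallback anyway, and the old per-option dependency checks are dropped. A standalone sketch of the relaxed rule, with hypothetical helper names (flag/requireSomeOutput; the real validator uses getBooleanValue and throws DdlException):

    import java.util.Map;

    public class PinyinConfigCheckSketch {
        // Parse a boolean property, falling back to its documented default.
        static boolean flag(Map<String, String> props, String key, boolean def) {
            return Boolean.parseBoolean(props.getOrDefault(key, Boolean.toString(def)));
        }

        // keep_first_letter and keep_full_pinyin default to true, so the error
        // only fires when all five formats are explicitly disabled.
        static void requireSomeOutput(Map<String, String> props) {
            boolean any = flag(props, "keep_first_letter", true)
                    || flag(props, "keep_full_pinyin", true)
                    || flag(props, "keep_joined_full_pinyin", false)
                    || flag(props, "keep_separate_first_letter", false)
                    || flag(props, "keep_separate_chinese", false);
            if (!any) {
                throw new IllegalArgumentException(
                        "pinyin config error, at least one output format must be enabled.");
            }
        }

        public static void main(String[] args) {
            requireSomeOutput(Map.of("keep_first_letter", "false")); // ok: full pinyin still on
            requireSomeOutput(Map.of(
                    "keep_first_letter", "false", "keep_full_pinyin", "false",
                    "keep_joined_full_pinyin", "false", "keep_separate_first_letter", "false",
                    "keep_separate_chinese", "false")); // throws
        }
    }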
diff --git a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/PinyinTokenizerValidator.java b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/PinyinTokenizerValidator.java
index cf2010af6e3..96a485a527f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/PinyinTokenizerValidator.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/indexpolicy/PinyinTokenizerValidator.java
@@ -106,42 +106,17 @@ public class PinyinTokenizerValidator extends BasePolicyValidator {
*/
private void validateConfigurationLogic(Map<String, String> props) throws DdlException {
// ensure at least one output format is enabled
- boolean keepOriginal = getBooleanValue(props, "keep_original", false);
boolean keepFirstLetter = getBooleanValue(props, "keep_first_letter", true);
boolean keepFullPinyin = getBooleanValue(props, "keep_full_pinyin", true);
boolean keepJoinedFullPinyin = getBooleanValue(props, "keep_joined_full_pinyin", false);
boolean keepSeparateFirstLetter = getBooleanValue(props, "keep_separate_first_letter", false);
boolean keepSeparateChinese = getBooleanValue(props, "keep_separate_chinese", false);
- if (!keepOriginal && !keepFirstLetter && !keepFullPinyin
- && !keepJoinedFullPinyin && !keepSeparateFirstLetter &&
!keepSeparateChinese) {
- throw new DdlException("At least one output format must be
enabled: "
- + "keep_original, keep_first_letter, keep_full_pinyin,
keep_joined_full_pinyin, "
- + "keep_separate_first_letter, or keep_separate_chinese");
- }
-
- // validate keep_separate_first_letter and keep_first_letter relationship
- if (keepSeparateFirstLetter && !keepFirstLetter) {
- throw new DdlException("keep_separate_first_letter requires
keep_first_letter to be enabled");
- }
-
- // validate keep_none_chinese_in_first_letter and keep_first_letter relationship
- boolean keepNoneChineseInFirstLetter = getBooleanValue(props, "keep_none_chinese_in_first_letter", true);
- if (keepNoneChineseInFirstLetter && !keepFirstLetter) {
- throw new DdlException("keep_none_chinese_in_first_letter requires
keep_first_letter to be enabled");
- }
-
- // validate keep_none_chinese_in_joined_full_pinyin and keep_joined_full_pinyin relationship
- boolean keepNoneChineseInJoinedFullPinyin = getBooleanValue(props,
- "keep_none_chinese_in_joined_full_pinyin", false);
- if (keepNoneChineseInJoinedFullPinyin && !keepJoinedFullPinyin) {
- throw new DdlException("keep_none_chinese_in_joined_full_pinyin "
- + "requires keep_joined_full_pinyin to be enabled");
- }
-
- // validate limit_first_letter_length and keep_first_letter relationship
- if (props.containsKey("limit_first_letter_length") && !keepFirstLetter) {
- throw new DdlException("limit_first_letter_length is only valid when keep_first_letter is enabled");
+ if (!keepFirstLetter && !keepFullPinyin && !keepJoinedFullPinyin
+ && !keepSeparateFirstLetter && !keepSeparateChinese) {
+ throw new DdlException("pinyin config error, at least one output
format must be enabled "
+ + "(keep_first_letter, keep_separate_first_letter,
keep_full_pinyin, "
+ + "keep_joined_full_pinyin, or keep_separate_chinese).");
}
}
diff --git a/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out b/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out
index e6617f25a1c..44852a591e3 100644
--- a/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out
+++ b/regression-test/data/inverted_index_p0/analyzer/test_custom_analyzer.out
@@ -87,7 +87,7 @@
[{\n "token": "d"\n }, {\n "token": "dj音乐家"\n }, {\n
"token": "djyyj"\n }, {\n "token": "j"\n }, {\n "token":
"yin"\n }, {\n "token": "yue"\n }, {\n "token": "jia"\n
}]
-- !tokenize_pinyin13 --
-[{\n "token": "liu"\n }, {\n "token": "刘德华abc123"\n }, {\n
"token": "ldhabc123"\n }, {\n "token": "de"\n }, {\n
"token": "hua"\n }, {\n "token": "a"\n }, {\n "token":
"b"\n }, {\n "token": "c"\n }, {\n "token": "123"\n }]
+[{\n "token": "liu"\n }, {\n "token": "刘德华abc123"\n }, {\n
"token": "ldhabc123"\n }, {\n "token": "de"\n }, {\n
"token": "hua"\n }, {\n "token": "a"\n }, {\n "token":
"b"\n }, {\n "token": "c"\n }, {\n "token": "1"\n }, {\n
"token": "2"\n }, {\n "token": "3"\n }]
-- !sql --
1 abcDEF
@@ -266,3 +266,39 @@
-- !sql_standard_pinyin4 --
2 刘德华 张学友
+-- !sql_ignore_offset_true_1 --
+[{\n "token": "liu"\n }, {\n "token": "ldh"\n }, {\n
"token": "de"\n }, {\n "token": "hua"\n }]
+
+-- !sql_ignore_offset_true_2 --
+[{\n "token": "ni"\n }, {\n "token": "nh"\n }, {\n
"token": "hao"\n }]
+
+-- !sql_ignore_offset_true_3 --
+[{\n "token": "yin"\n }, {\n "token": "yx"\n }, {\n
"token": "xing"\n }]
+
+-- !sql_ignore_offset_false_1 --
+[{\n "token": "liu"\n }, {\n "token": "ldh"\n }, {\n
"token": "de"\n }, {\n "token": "hua"\n }]
+
+-- !sql_ignore_offset_false_2 --
+[{\n "token": "ni"\n }, {\n "token": "nh"\n }, {\n
"token": "hao"\n }]
+
+-- !sql_ignore_offset_false_3 --
+[{\n "token": "yin"\n }, {\n "token": "yx"\n }, {\n
"token": "xing"\n }]
+
+-- !sql_ignore_offset_true_mixed --
+[{\n "token": "liu"\n }, {\n "token": "lad"\n }, {\n
"token": "a"\n }, {\n "token": "de"\n }]
+
+-- !sql_ignore_offset_false_mixed --
+[{\n "token": "liu"\n }, {\n "token": "lad"\n }, {\n
"token": "a"\n }, {\n "token": "de"\n }]
+
+-- !sql_table_ignore_offset_1 --
+1 刘德华
+
+-- !sql_table_ignore_offset_2 --
+1 刘德华
+
+-- !sql_table_ignore_offset_3 --
+3 银行卡
+
+-- !sql_table_ignore_offset_4 --
+3 银行卡
+
diff --git a/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy b/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy
index 0b03b785d2b..478fade7fc5 100644
--- a/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy
+++ b/regression-test/suites/inverted_index_p0/analyzer/test_custom_analyzer.groovy
@@ -63,8 +63,8 @@ suite("test_custom_analyzer", "p0") {
CREATE INVERTED INDEX ANALYZER IF NOT EXISTS keyword_lowercase
PROPERTIES
(
- "tokenizer" = "keyword",
- "token_filter" = "asciifolding, lowercase"
+ "tokenizer" = "keyword",
+ "token_filter" = "asciifolding, lowercase"
);
"""
@@ -583,4 +583,93 @@ suite("test_custom_analyzer", "p0") {
} finally {
sql "DROP TABLE IF EXISTS ${indexTbName6}"
}
+
+ // Test ignore_pinyin_offset parameter
+ // Create tokenizer with ignore_pinyin_offset=true (default)
+ sql """
+ CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS pinyin_tokenizer_ignore_true
+ PROPERTIES (
+ "type" = "pinyin",
+ "keep_first_letter" = "true",
+ "keep_full_pinyin" = "true",
+ "ignore_pinyin_offset" = "true"
+ );
+ """
+
+ // Create tokenizer with ignore_pinyin_offset=false
+ sql """
+ CREATE INVERTED INDEX TOKENIZER IF NOT EXISTS pinyin_tokenizer_ignore_false
+ PROPERTIES (
+ "type" = "pinyin",
+ "keep_first_letter" = "true",
+ "keep_full_pinyin" = "true",
+ "ignore_pinyin_offset" = "false"
+ );
+ """
+
+ // Create analyzers
+ sql """
+ CREATE INVERTED INDEX ANALYZER IF NOT EXISTS pinyin_analyzer_ignore_true
+ PROPERTIES (
+ "tokenizer" = "pinyin_tokenizer_ignore_true"
+ );
+ """
+
+ sql """
+ CREATE INVERTED INDEX ANALYZER IF NOT EXISTS pinyin_analyzer_ignore_false
+ PROPERTIES (
+ "tokenizer" = "pinyin_tokenizer_ignore_false"
+ );
+ """
+
+ // Wait for all analyzers to be ready - increased timeout due to many objects
+ sql """ select sleep(15) """
+
+ // Test with ignore_pinyin_offset=true - all tokens should have same offset
+ qt_sql_ignore_offset_true_1 """ select tokenize('刘德华', '"analyzer"="pinyin_analyzer_ignore_true"'); """
+ qt_sql_ignore_offset_true_2 """ select tokenize('你好', '"analyzer"="pinyin_analyzer_ignore_true"'); """
+ qt_sql_ignore_offset_true_3 """ select tokenize('银行', '"analyzer"="pinyin_analyzer_ignore_true"'); """
+
+ // Test with ignore_pinyin_offset=false - tokens should have independent offsets
+ qt_sql_ignore_offset_false_1 """ select tokenize('刘德华', '"analyzer"="pinyin_analyzer_ignore_false"'); """
+ qt_sql_ignore_offset_false_2 """ select tokenize('你好', '"analyzer"="pinyin_analyzer_ignore_false"'); """
+ qt_sql_ignore_offset_false_3 """ select tokenize('银行', '"analyzer"="pinyin_analyzer_ignore_false"'); """
+
+ // Test with mixed content
+ qt_sql_ignore_offset_true_mixed """ select tokenize('刘a德', '"analyzer"="pinyin_analyzer_ignore_true"'); """
+ qt_sql_ignore_offset_false_mixed """ select tokenize('刘a德', '"analyzer"="pinyin_analyzer_ignore_false"'); """
+
+ // Test table creation and queries with ignore_pinyin_offset
+ def indexTbName7 = "test_custom_analyzer_pinyin_offset"
+ sql "DROP TABLE IF EXISTS ${indexTbName7}"
+ sql """
+ CREATE TABLE ${indexTbName7} (
+ `a` bigint NOT NULL AUTO_INCREMENT(1),
+ `content` text NULL,
+ INDEX idx_content (`content`) USING INVERTED PROPERTIES("support_phrase" = "true", "analyzer" = "pinyin_analyzer_ignore_true")
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`a`)
+ DISTRIBUTED BY RANDOM BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1"
+ );
+ """
+
+ sql """ INSERT INTO ${indexTbName7} VALUES (1, "刘德华"); """
+ sql """ INSERT INTO ${indexTbName7} VALUES (2, "你好世界"); """
+ sql """ INSERT INTO ${indexTbName7} VALUES (3, "银行卡"); """
+
+ try {
+ sql "sync"
+ sql """ set enable_common_expr_pushdown = true; """
+
+ // Test queries with ignore_pinyin_offset=true
+ qt_sql_table_ignore_offset_1 """ select * from ${indexTbName7} where content match 'liu' order by a; """
+ qt_sql_table_ignore_offset_2 """ select * from ${indexTbName7} where content match 'ldh' order by a; """
+ qt_sql_table_ignore_offset_3 """ select * from ${indexTbName7} where content match 'yin' order by a; """
+ qt_sql_table_ignore_offset_4 """ select * from ${indexTbName7} where content match 'hang' order by a; """
+
+ } finally {
+ sql "DROP TABLE IF EXISTS ${indexTbName7}"
+ }
}
\ No newline at end of file
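A note on the expected output added above: tokenize() prints only the token text, so the ignore_pinyin_offset=true and =false cases legitimately produce identical .out blocks; the setting only changes the offsets recorded in the index. For poking at the new analyzer outside the regression framework, a rough JDBC sketch (endpoint, credentials, and database name are assumptions, and pinyin_analyzer_ignore_true must already have been created as in the suite above):

    import java.sql.Connection;
    import java.sql.DriverManager;
    import java.sql.ResultSet;
    import java.sql.Statement;

    public class PinyinAnalyzerDemo {
        public static void main(String[] args) throws Exception {
            // Assumed FE MySQL-protocol endpoint and credentials; adjust for your cluster.
            try (Connection conn = DriverManager.getConnection(
                         "jdbc:mysql://127.0.0.1:9030/demo", "root", "");
                 Statement stmt = conn.createStatement();
                 ResultSet rs = stmt.executeQuery(
                         "select tokenize('刘德华', '\"analyzer\"=\"pinyin_analyzer_ignore_true\"')")) {
                while (rs.next()) {
                    // Expect liu / ldh / de / hua, matching sql_ignore_offset_true_1 above.
                    System.out.println(rs.getString(1));
                }
            }
        }
    }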
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]