(doris-thirdparty) branch clucene updated: [test](unicode) add more ut (#288)

airborne Wed, 05 Mar 2025 07:08:35 -0800

This is an automated email from the ASF dual-hosted git repository.

airborne pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git



The following commit(s) were added to refs/heads/clucene by this push:
     new 74926996d64 [test](unicode) add more ut (#288)
74926996d64 is described below

commit 74926996d64e262eea70376f5807335b40dc5c37
Author: airborne12 <jiang...@selectdb.com>
AuthorDate: Wed Mar 5 23:08:23 2025 +0800

    [test](unicode) add more ut (#288)
---
 src/test/store/TestUTF8Chars.cpp | 726 +++++++++++++++++++++++++++++----------
 1 file changed, 538 insertions(+), 188 deletions(-)

diff --git a/src/test/store/TestUTF8Chars.cpp b/src/test/store/TestUTF8Chars.cpp
index 76eca31664c..22695f1b560 100644
--- a/src/test/store/TestUTF8Chars.cpp
+++ b/src/test/store/TestUTF8Chars.cpp
@@ -4,22 +4,29 @@
 #include "CLucene/store/IndexOutput.h"
 #include "CLucene/store/RAMDirectory.h"
 #include "CuTest.h"
+#include <codecvt>
 #include <ctime>
+#include <locale>
 #include <string>
 #include <vector>
 #include <utility>
 #include <iostream>
+#include "CLucene/index/_TermInfosWriter.h"
+#include "CLucene/index/_TermInfosReader.h"
+#include "CLucene/index/_FieldInfos.h"
+#include "CLucene/index/_TermInfo.h"
+#include "CLucene/index/Term.h"
 
 using namespace lucene::store;
 
 // Add a helper macro for printing more detailed error messages when 
assertions fail
 #define CuAssertTrueWithMessage(tc, message, condition) \
-    do { \
-        if (!(condition)) { \
-            printf("Assertion failed: %s\n", message); \
-        } \
-        CuAssertTrue(tc, condition); \
-    } while(0)
+    do {                                                \
+        if (!(condition)) {                             \
+            printf("Assertion failed: %s\n", message);  \
+        }                                               \
+        CuAssertTrue(tc, condition);                    \
+    } while (0)
 
 static void TestUTF8WriteAndReadChars(CuTest* tc) {
     RAMDirectory* dir = _CLNEW RAMDirectory();
@@ -33,8 +40,8 @@ static void TestUTF8WriteAndReadChars(CuTest* tc) {
     testString.push_back(L'你');            // 3 bytes
     testString.push_back(L'好');            // 3 bytes
     testString.push_back((wchar_t)0x1F600); // 4 bytes
-
-    output->writeString(testString);
+    output->writeVInt(testString.length());
+    output->writeSChars<TCHAR>(testString.c_str(), testString.length());
 
     output->close();
     _CLDELETE(output);
@@ -54,10 +61,10 @@ static void TestUTF8WriteAndReadChars(CuTest* tc) {
 
     for (size_t i = 0; i < testString.size(); i++) {
         printf("Character #%zu: 0x%04X -> Readback: 0x%04X, %s\n", i, 
(unsigned int)testString[i],
-                   (unsigned int)readBackString[i],
-                   (testString[i] == readBackString[i] ? "Success" : 
"Failed"));
+               (unsigned int)readBackString[i],
+               (testString[i] == readBackString[i] ? "Success" : "Failed"));
         char errorMsg[256];
-        sprintf(errorMsg, "Character mismatch - Position: %zu, Original: 
0x%04X, Readback: 0x%04X", 
+        sprintf(errorMsg, "Character mismatch - Position: %zu, Original: 
0x%04X, Readback: 0x%04X",
                 i, (unsigned int)testString[i], (unsigned 
int)readBackString[i]);
         CuAssertTrueWithMessage(tc, errorMsg, testString[i] == 
readBackString[i]);
     }
@@ -109,7 +116,7 @@ static void TestUnicodeRanges(CuTest* tc) {
     // Create a test string containing characters from various Unicode ranges
     std::vector<std::pair<wchar_t, const char*>> unicodeTestChars = {
             // ASCII range (U+0000 - U+007F) - 1 byte UTF-8
-//            {0x0000, "NULL character (U+0000)"},
+            //            {0x0000, "NULL character (U+0000)"},
             {0x0001, "Start of Heading (U+0001)"},
             {0x007F, "Delete (U+007F)"},
             {L'A', "Latin letter A (U+0041)"},
@@ -148,56 +155,47 @@ static void TestUnicodeRanges(CuTest* tc) {
             {0x10348, "Gothic letter 𐍈 (U+10348)"},
             {0x10400, "Deseret letter 𐐀 (U+10400)"},
             {0x10FFFF, "Unicode maximum value (U+10FFFF)"},
-            
-            // 数学符号
+
             {0x2200, "For All ∀ (U+2200)"},
             {0x2211, "Summation ∑ (U+2211)"},
             {0x221E, "Infinity ∞ (U+221E)"},
             {0x2248, "Almost Equal To ≈ (U+2248)"},
-            
-            // 货币符号
+
             {0x20AC, "Euro € (U+20AC)"},
             {0x20BD, "Russian Ruble ₽ (U+20BD)"},
             {0x20B9, "Indian Rupee ₹ (U+20B9)"},
             {0x20A9, "Won Sign ₩ (U+20A9)"},
-            
-            // 箭头符号
+
             {0x2190, "Left Arrow ← (U+2190)"},
             {0x2192, "Right Arrow → (U+2192)"},
             {0x2191, "Up Arrow ↑ (U+2191)"},
             {0x2193, "Down Arrow ↓ (U+2193)"},
-            
-            // 框线符号
+
             {0x2550, "Box Drawing ═ (U+2550)"},
             {0x2551, "Box Drawing ║ (U+2551)"},
             {0x2554, "Box Drawing ╔ (U+2554)"},
             {0x2557, "Box Drawing ╗ (U+2557)"},
-            
-            // 字母符号
+
             {0x2122, "Trade Mark ™ (U+2122)"},
             {0x2105, "Care Of ℅ (U+2105)"},
             {0x2113, "Script Small L ℓ (U+2113)"},
             {0x2116, "Numero Sign № (U+2116)"},
-            
-            // 装饰符号
+
             {0x2600, "Black Sun with Rays ☀ (U+2600)"},
             {0x2602, "Umbrella ☂ (U+2602)"},
             {0x2614, "Umbrella with Rain Drops ☔ (U+2614)"},
             {0x2665, "Black Heart Suit ♥ (U+2665)"},
-            
-            // 更多补充平面字符
+
             {0x1D400, "Mathematical Bold Capital A 𝐀 (U+1D400)"},
             {0x1D538, "Mathematical Double-Struck Capital A 𝔸 (U+1D538)"},
             {0x1F300, "Cyclone 🌀 (U+1F300)"},
             {0x1F431, "Cat Face 🐱 (U+1F431)"},
             {0x1F52B, "Pistol 🔫 (U+1F52B)"},
             {0x1F697, "Automobile 🚗 (U+1F697)"},
-            
-            // 额外CJK字符
+
             {0x20000, "CJK Unified Ideograph 𠀀 (U+20000)"},
             {0x2A700, "CJK Unified Ideograph 𪜀 (U+2A700)"},
-            
-            // 其他技术符号
+
             {0x2300, "Diameter Sign ⌀ (U+2300)"},
             {0x231B, "Hourglass ⌛ (U+231B)"},
             {0x2328, "Keyboard ⌨ (U+2328)"},
@@ -240,12 +238,14 @@ static void TestUnicodeRanges(CuTest* tc) {
             wchar_t original = testString[i];
             wchar_t readBack = readBackString[i];
 
-            printf("Character #%zu: 0x%04X (%s) -> Readback: 0x%04X, %s\n", i, 
(unsigned int)original,
-                   unicodeTestChars[i].second, (unsigned int)readBack,
+            printf("Character #%zu: 0x%04X (%s) -> Readback: 0x%04X, %s\n", i,
+                   (unsigned int)original, unicodeTestChars[i].second, 
(unsigned int)readBack,
                    (original == readBack ? "Success" : "Failed"));
 
             char errorMsg[256];
-            sprintf(errorMsg, "Unicode character mismatch - Position: %zu, 
Character: %s, Original: 0x%04X, Readback: 0x%04X",
+            sprintf(errorMsg,
+                    "Unicode character mismatch - Position: %zu, Character: 
%s, Original: 0x%04X, "
+                    "Readback: 0x%04X",
                     i, unicodeTestChars[i].second, (unsigned int)original, 
(unsigned int)readBack);
             CuAssertTrueWithMessage(tc, errorMsg, original == readBack);
         }
@@ -265,13 +265,14 @@ static void TestEdgeCases(CuTest* tc) {
 
     // Create a test string containing edge cases
     std::wstring testString;
-    
+
     // 1. Empty string test
     IndexOutput* emptyOutput = dir->createOutput("empty_string.dat");
-    emptyOutput->writeString(L"");
+    emptyOutput->writeVInt(0);
+    emptyOutput->writeSChars<TCHAR>(L"", 0);
     emptyOutput->close();
     _CLDELETE(emptyOutput);
-    
+
     IndexInput* emptyInput = nullptr;
     CLuceneError emptyError;
     dir->openInput("empty_string.dat", emptyInput, emptyError);
@@ -281,7 +282,7 @@ static void TestEdgeCases(CuTest* tc) {
     CuAssertIntEquals(tc, _T("Empty string length should be 0"), 0, 
(int)emptyReadBack.size());
     emptyInput->close();
     _CLDELETE(emptyInput);
-    
+
     // 2. Long string test (containing various characters)
     std::wstring longString;
     // Add 1000 mixed characters
@@ -291,47 +292,51 @@ static void TestEdgeCases(CuTest* tc) {
         longString.push_back(0x4E00 + i % 100); // Chinese
         longString.push_back(0x1F600 + i % 50); // Emoji
     }
-    
+
     IndexOutput* longOutput = dir->createOutput("long_string.dat");
-    longOutput->writeString(longString);
+    longOutput->writeVInt(longString.length());
+    longOutput->writeSChars<TCHAR>(longString.c_str(), longString.length());
     longOutput->close();
     _CLDELETE(longOutput);
-    
+
     IndexInput* longInput = nullptr;
     CLuceneError longError;
     dir->openInput("long_string.dat", longInput, longError);
     TCHAR* longStr = longInput->readString();
     std::wstring longReadBack(longStr);
     _CLDELETE_LARRAY(longStr);
-    
-    CuAssertIntEquals(tc, _T("Long string length mismatch"), 
(int)longString.size(), (int)longReadBack.size());
-    
+
+    CuAssertIntEquals(tc, _T("Long string length mismatch"), 
(int)longString.size(),
+                      (int)longReadBack.size());
+
     // Only check some characters to avoid too much output
     printf("\n=== Long String Test (showing first 10 characters) ===\n");
     for (size_t i = 0; i < 10 && i < longString.size(); i++) {
-        printf("Character #%zu: 0x%04X -> Readback: 0x%04X, %s\n", 
-               i, 
-               (unsigned int)longString[i],
-               (unsigned int)longReadBack[i], 
+        printf("Character #%zu: 0x%04X -> Readback: 0x%04X, %s\n", i, 
(unsigned int)longString[i],
+               (unsigned int)longReadBack[i],
                (longString[i] == longReadBack[i] ? "Success" : "Failed"));
-        
+
         char errorMsg[256];
-        sprintf(errorMsg, "Long string character mismatch - Position: %zu, 
Original: 0x%04X, Readback: 0x%04X", 
+        sprintf(errorMsg,
+                "Long string character mismatch - Position: %zu, Original: 
0x%04X, Readback: "
+                "0x%04X",
                 i, (unsigned int)longString[i], (unsigned int)longReadBack[i]);
         CuAssertTrueWithMessage(tc, errorMsg, longString[i] == 
longReadBack[i]);
     }
-    
+
     // Random sample check
     for (size_t i = 0; i < longString.size(); i += 100) {
         char errorMsg[256];
-        sprintf(errorMsg, "Long string sample check failed - Position: %zu, 
Original: 0x%04X, Readback: 0x%04X", 
+        sprintf(errorMsg,
+                "Long string sample check failed - Position: %zu, Original: 
0x%04X, Readback: "
+                "0x%04X",
                 i, (unsigned int)longString[i], (unsigned int)longReadBack[i]);
         CuAssertTrueWithMessage(tc, errorMsg, longString[i] == 
longReadBack[i]);
     }
-    
+
     longInput->close();
     _CLDELETE(longInput);
-    
+
     _CLDELETE(dir);
 }
 
@@ -347,23 +352,21 @@ static void TestUTF8EncodingBoundaries(CuTest* tc) {
         int expectedBytes;
     };
 
-    std::vector<BoundaryTest> boundaryTests = {
-        // 1-byte boundaries
-        //{0x0000, "NULL (U+0000) - 1-byte lower bound", 1},
-        {0x007F, "DELETE (U+007F) - 1-byte upper bound", 1},
-        
-        // 2-byte boundaries
-        {0x0080, "PAD (U+0080) - 2-byte lower bound", 2},
-        {0x07FF, "2-byte upper bound (U+07FF)", 2},
-        
-        // 3-byte boundaries
-        {0x0800, "3-byte lower bound (U+0800)", 3},
-        {0xFFFF, "BMP upper bound (U+FFFF)", 3},
-        
-        // 4-byte boundaries (supplementary planes)
-        {0x10000, "SMP lower bound (U+10000)", 4},
-        {0x10FFFF, "Unicode upper bound (U+10FFFF)", 4}
-    };
+    std::vector<BoundaryTest> boundaryTests = {// 1-byte boundaries
+                                               //{0x0000, "NULL (U+0000) - 
1-byte lower bound", 1},
+                                               {0x007F, "DELETE (U+007F) - 
1-byte upper bound", 1},
+
+                                               // 2-byte boundaries
+                                               {0x0080, "PAD (U+0080) - 2-byte 
lower bound", 2},
+                                               {0x07FF, "2-byte upper bound 
(U+07FF)", 2},
+
+                                               // 3-byte boundaries
+                                               {0x0800, "3-byte lower bound 
(U+0800)", 3},
+                                               {0xFFFF, "BMP upper bound 
(U+FFFF)", 3},
+
+                                               // 4-byte boundaries 
(supplementary planes)
+                                               {0x10000, "SMP lower bound 
(U+10000)", 4},
+                                               {0x10FFFF, "Unicode upper bound 
(U+10FFFF)", 4}};
 
     // Build test string
     std::wstring testString;
@@ -373,20 +376,20 @@ static void TestUTF8EncodingBoundaries(CuTest* tc) {
 
     // Write test string
     IndexOutput* output = dir->createOutput(testFileName);
-    
+
     // Manually write each character and record byte count
     output->writeVInt(testString.length());
-    
+
     std::vector<int> actualBytes;
     int64_t startPos, endPos;
-    
+
     for (size_t i = 0; i < testString.size(); i++) {
         startPos = output->getFilePointer();
         output->writeChars(&testString[i], 1);
         endPos = output->getFilePointer();
         actualBytes.push_back((int)(endPos - startPos));
     }
-    
+
     output->close();
     _CLDELETE(output);
 
@@ -394,7 +397,7 @@ static void TestUTF8EncodingBoundaries(CuTest* tc) {
     IndexInput* input = nullptr;
     CLuceneError error;
     dir->openInput(testFileName, input, error);
-    
+
     const int32_t len = input->readVInt();
     TCHAR* buffer = _CL_NEWARRAY(TCHAR, len + 1);
     input->readChars(buffer, 0, len);
@@ -403,31 +406,31 @@ static void TestUTF8EncodingBoundaries(CuTest* tc) {
     _CLDELETE_LARRAY(buffer);
 
     // Verify length
-    CuAssertIntEquals(tc, _T("Boundary test string length mismatch"), 
(int)testString.size(), (int)readBackString.size());
+    CuAssertIntEquals(tc, _T("Boundary test string length mismatch"), 
(int)testString.size(),
+                      (int)readBackString.size());
 
     // Verify each character and their byte counts
     printf("\n=== UTF-8 Encoding Boundary Tests ===\n");
     for (size_t i = 0; i < testString.size(); i++) {
         wchar_t original = testString[i];
         wchar_t readBack = readBackString[i];
-        
-        printf("Character #%zu: U+%04X (%s)\n", 
-               i, 
-               (unsigned int)original, 
+
+        printf("Character #%zu: U+%04X (%s)\n", i, (unsigned int)original,
                boundaryTests[i].description);
-        printf("  - Expected bytes: %d, Actual bytes: %d, %s\n", 
-               boundaryTests[i].expectedBytes, 
+        printf("  - Expected bytes: %d, Actual bytes: %d, %s\n", 
boundaryTests[i].expectedBytes,
                actualBytes[i],
                (boundaryTests[i].expectedBytes == actualBytes[i] ? "Success" : 
"Failed"));
-        printf("  - Readback: U+%04X, %s\n", 
-               (unsigned int)readBack, 
+        printf("  - Readback: U+%04X, %s\n", (unsigned int)readBack,
                (original == readBack ? "Success" : "Failed"));
-        
+
         char errorMsg[256];
-        sprintf(errorMsg, "Boundary character mismatch - Position: %zu, 
Description: %s, Original: 0x%04X, Readback: 0x%04X", 
+        sprintf(errorMsg,
+                "Boundary character mismatch - Position: %zu, Description: %s, 
Original: 0x%04X, "
+                "Readback: 0x%04X",
                 i, boundaryTests[i].description, (unsigned int)original, 
(unsigned int)readBack);
         CuAssertTrueWithMessage(tc, errorMsg, original == readBack);
-        CuAssertIntEquals(tc, _T("UTF-8 encoding byte count mismatch"), 
boundaryTests[i].expectedBytes, actualBytes[i]);
+        CuAssertIntEquals(tc, _T("UTF-8 encoding byte count mismatch"),
+                          boundaryTests[i].expectedBytes, actualBytes[i]);
     }
 
     input->close();
@@ -438,81 +441,83 @@ static void TestUTF8EncodingBoundaries(CuTest* tc) {
 // Test special UTF-8 sequences
 static void TestSpecialUTF8Sequences(CuTest* tc) {
     RAMDirectory* dir = _CLNEW RAMDirectory();
-    
+
     // Test some special UTF-8 sequences
     struct SpecialSequenceTest {
         std::wstring str;
         const char* description;
     };
-    
+
     std::vector<SpecialSequenceTest> specialTests = {
-        {L"", "Empty string"},
-        {L"Hello", "Pure ASCII string"},
-        {L"你好世界", "Pure Chinese string"},
-        {L"Hello, 世界!", "Mixed ASCII and Chinese"},
-        {L"🌍🌎🌏", "Pure Emoji (4-byte characters)"},
-        {L"Earth: 🌍 🌎 🌏", "Mixed Chinese and Emoji"},
-        {L"A\u0000B", "String containing NULL character"},
-        {L"سلام دنیا", "Arabic/Persian text"},
-        {L"こんにちは世界", "Japanese and Chinese"},
-        {L"Hello\nWorld", "String with newline"},
-        {L"Tab\tCharacter", "String with tab character"},
-        {std::wstring(1000, L'A'), "1000 identical characters"},
-        {L"😀😃😄😁😆😅😂🤣", "Consecutive Emoji"}
-    };
-    
+            {L"", "Empty string"},
+            {L"Hello", "Pure ASCII string"},
+            {L"你好世界", "Pure Chinese string"},
+            {L"Hello, 世界!", "Mixed ASCII and Chinese"},
+            {L"🌍🌎🌏", "Pure Emoji (4-byte characters)"},
+            {L"Earth: 🌍 🌎 🌏", "Mixed Chinese and Emoji"},
+            {L"A\u0000B", "String containing NULL character"},
+            {L"سلام دنیا", "Arabic/Persian text"},
+            {L"こんにちは世界", "Japanese and Chinese"},
+            {L"Hello\nWorld", "String with newline"},
+            {L"Tab\tCharacter", "String with tab character"},
+            {std::wstring(1000, L'A'), "1000 identical characters"},
+            {L"😀😃😄😁😆😅😂🤣", "Consecutive Emoji"}};
+
     for (size_t testIndex = 0; testIndex < specialTests.size(); testIndex++) {
         const auto& test = specialTests[testIndex];
         std::string testFileName = "special_test_" + std::to_string(testIndex) 
+ ".dat";
-        
+
         // Write test string
         IndexOutput* output = dir->createOutput(testFileName.c_str());
-        output->writeString(test.str);
+        output->writeVInt(test.str.length());
+        output->writeSChars<TCHAR>(test.str.c_str(), test.str.length());
         output->close();
         _CLDELETE(output);
-        
+
         // Read and verify
         IndexInput* input = nullptr;
         CLuceneError error;
         dir->openInput(testFileName.c_str(), input, error);
-        
+
         TCHAR* readBackStr = input->readString();
         std::wstring readBackString(readBackStr);
         _CLDELETE_LARRAY(readBackStr);
-        
+
         printf("\n=== Special UTF-8 Sequence Test #%zu: %s ===\n", testIndex, 
test.description);
-        printf("  - Original length: %zu, Readback length: %zu, %s\n", 
-               test.str.length(), 
+        printf("  - Original length: %zu, Readback length: %zu, %s\n", 
test.str.length(),
                readBackString.length(),
                (test.str.length() == readBackString.length() ? "Success" : 
"Failed"));
-        
-        CuAssertIntEquals(tc, _T("Special sequence length mismatch"), 
(int)test.str.length(), (int)readBackString.length());
-        
+
+        CuAssertIntEquals(tc, _T("Special sequence length mismatch"), 
(int)test.str.length(),
+                          (int)readBackString.length());
+
         // For shorter strings, print each character for comparison
         if (test.str.length() <= 20) {
             for (size_t i = 0; i < test.str.length(); i++) {
-                printf("  Character #%zu: U+%04X -> Readback: U+%04X, %s\n", 
-                       i, 
-                       (unsigned int)test.str[i],
-                       (unsigned int)readBackString[i], 
+                printf("  Character #%zu: U+%04X -> Readback: U+%04X, %s\n", i,
+                       (unsigned int)test.str[i], (unsigned 
int)readBackString[i],
                        (test.str[i] == readBackString[i] ? "Success" : 
"Failed"));
-                
+
                 char errorMsg[256];
-                sprintf(errorMsg, "Special sequence character mismatch - Test: 
%s, Position: %zu, Original: 0x%04X, Readback: 0x%04X", 
-                        test.description, i, (unsigned int)test.str[i], 
(unsigned int)readBackString[i]);
+                sprintf(errorMsg,
+                        "Special sequence character mismatch - Test: %s, 
Position: %zu, Original: "
+                        "0x%04X, Readback: 0x%04X",
+                        test.description, i, (unsigned int)test.str[i],
+                        (unsigned int)readBackString[i]);
                 CuAssertTrueWithMessage(tc, errorMsg, test.str[i] == 
readBackString[i]);
             }
         } else {
             // For longer strings, just check equality
             char errorMsg[256];
-            sprintf(errorMsg, "Long special sequence mismatch - Test: %s, 
Length: %zu", test.description, test.str.length());
+            sprintf(errorMsg, "Long special sequence mismatch - Test: %s, 
Length: %zu",
+                    test.description, test.str.length());
             CuAssertTrueWithMessage(tc, errorMsg, test.str == readBackString);
         }
-        
+
         input->close();
         _CLDELETE(input);
     }
-    
+
     _CLDELETE(dir);
 }
 
@@ -520,66 +525,69 @@ static void TestSpecialUTF8Sequences(CuTest* tc) {
 static void TestUTF8Performance(CuTest* tc) {
     RAMDirectory* dir = _CLNEW RAMDirectory();
     const char* testFileName = "test_utf8_performance.dat";
-    
+
     // Create a large test string with various types of characters
     std::wstring testString;
     const int testSize = 100000; // 100,000 characters
-    
+
     // Add different types of characters
     for (int i = 0; i < testSize / 4; i++) {
-        testString.push_back(L'A' + (i % 26));  // ASCII characters
+        testString.push_back(L'A' + (i % 26));     // ASCII characters
         testString.push_back(0x00A0 + (i % 128));  // Latin extended
-        testString.push_back(0x4E00 + (i % 1000));  // Chinese characters
+        testString.push_back(0x4E00 + (i % 1000)); // Chinese characters
         testString.push_back(0x1F600 + (i % 50));  // Emoji (4-byte characters)
     }
-    
+
     printf("\n=== UTF-8 Encoding Performance Test (String length: %zu) ===\n", 
testString.size());
-    
+
     // Measure write time
     clock_t writeStart = clock();
-    
+
     IndexOutput* output = dir->createOutput(testFileName);
-    output->writeString(testString);
+    output->writeVInt(testString.length());
+    output->writeSChars<TCHAR>(testString.c_str(), testString.length());
     output->close();
     _CLDELETE(output);
-    
+
     clock_t writeEnd = clock();
     double writeTime = ((double)(writeEnd - writeStart)) / CLOCKS_PER_SEC;
     printf("Write time: %.6f seconds\n", writeTime);
-    
+
     // Measure read time
     clock_t readStart = clock();
-    
+
     IndexInput* input = nullptr;
     CLuceneError error;
     dir->openInput(testFileName, input, error);
-    
+
     TCHAR* readBackStr = input->readString();
     std::wstring readBackString(readBackStr);
     _CLDELETE_LARRAY(readBackStr);
-    
+
     clock_t readEnd = clock();
     double readTime = ((double)(readEnd - readStart)) / CLOCKS_PER_SEC;
     printf("Read time: %.6f seconds\n", readTime);
-    
+
     // Verify results
     char lengthErrorMsg[256];
-    sprintf(lengthErrorMsg, "Performance test string length mismatch - 
Original: %zu, Readback: %zu", 
+    sprintf(lengthErrorMsg,
+            "Performance test string length mismatch - Original: %zu, 
Readback: %zu",
             testString.size(), readBackString.size());
-    CuAssertIntEquals(tc, _T("Performance test string length mismatch"), 
(int)testString.size(), (int)readBackString.size());
+    CuAssertIntEquals(tc, _T("Performance test string length mismatch"), 
(int)testString.size(),
+                      (int)readBackString.size());
     printf("  %s\n", lengthErrorMsg); // Print error message directly instead 
of using macro
-    
+
     char contentErrorMsg[256];
     sprintf(contentErrorMsg, "Performance test string content mismatch");
     CuAssertTrueWithMessage(tc, contentErrorMsg, testString == readBackString);
-    
+
     // Calculate characters processed per second
     double writeCharsPerSec = testString.size() / writeTime;
     double readCharsPerSec = readBackString.size() / readTime;
-    
+
     printf("Write speed: %.2f characters/second\n", writeCharsPerSec);
     printf("Read speed: %.2f characters/second\n", readCharsPerSec);
-    
+
     input->close();
     _CLDELETE(input);
     _CLDELETE(dir);
@@ -591,26 +599,26 @@ static void TestUTF8Compatibility(CuTest* tc) {
 
     // Test characters covering different ranges
     const std::vector<wchar_t> testChars = {
-        // Basic 1-byte
-        L'A',    
-        // Common 3-byte Chinese characters
-        L'你', L'好',
-        // 4-byte characters from different planes
-        0x1F600, // 😀 Grinning Face (Emoji)
-        0xF600, // 丽
-        0x1F64F, // 🙏 Folded Hands
-        0xF64F, // 碌
-        0x20021, // 𠀡 CJK Unified Ideographs Extension B
-        0x0021, //  ! Basic Latin
-        0x2A6D6, // 𪛖 CJK Unified Ideographs Extension C
-        0xA6D6, // ꛖ Hangul Jamo Extended-B
-        0x10123, // 𐄣 Ancient Greek Numbers
-        0x0123, //  Cuneiform
-        0x10348, // 𐍈 Gothic Letter Hwair
-        0x0348, //  Ȉ
-        // Boundary cases
-        //0x10000, // Minimum 4-byte character
-        0x10FFFF  // Maximum valid Unicode code point
+            // Basic 1-byte
+            L'A',
+            // Common 3-byte Chinese characters
+            L'你', L'好',
+            // 4-byte characters from different planes
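+            0x1F600, // 😀 Grinning Face (Emoji)
+            0xF600,  // low 16 bits of U+1F600 (Private Use Area)
+            0x1F64F, // 🙏 Folded Hands
+            0xF64F,  // low 16 bits of U+1F64F (Private Use Area)
+            0x20021, // 𠀡 CJK Unified Ideographs Extension B
+            0x0021,  // ! Basic Latin (low 16 bits of U+20021)
+            0x2A6D6, // 𪛖 CJK Unified Ideographs Extension B
+            0xA6D6,  // ꛖ Bamum (low 16 bits of U+2A6D6)
+            0x10123, // 𐄣 Aegean Numbers
+            0x0123,  // ģ Latin Extended-A (low 16 bits of U+10123)
+            0x10348, // 𐍈 Gothic Letter Hwair
+            0x0348,  // Combining Diacritical Marks (low 16 bits of U+10348)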
+            // Boundary cases
+            //0x10000, // Minimum 4-byte character
+            0x10FFFF // Maximum valid Unicode code point
     };
 
     // Build test string with mixed character types
@@ -637,10 +645,8 @@ static void TestUTF8Compatibility(CuTest* tc) {
         buffer[len] = 0;
 
         for (int32_t i = 0; i < len; i++) {
-            printf("  Character #%d: U+%04X -> Readback: U+%04X, %s\n", 
-                   i, 
-                   (unsigned int)testString[i],
-                   (unsigned int)buffer[i], 
+            printf("  Character #%d: U+%04X -> Readback: U+%04X, %s\n", i,
+                   (unsigned int)testString[i], (unsigned int)buffer[i],
                    (testString[i] == buffer[i] ? "Success" : "Failed"));
         }
         CuAssertTrue(tc, buffer[3] != testString[3]); // old code cannot parse 
4-byte character
@@ -655,13 +661,13 @@ static void TestUTF8Compatibility(CuTest* tc) {
         // Old implementation write (only handles 3 bytes)
         IndexOutput* output = dir->createOutput("old_write_new_read.dat");
         output->writeVInt(testString.length());
-        WriteCharsLegacy(output, testString.c_str(), testString.length());
+        output->writeSCharsOrigin<TCHAR>(testString.c_str(), 
testString.length());
         output->close();
         _CLDELETE(output);
 
         // Read using both methods for comparison
-        std::wstring oldResult;  // Old method read result
-        std::wstring newResult;  // New method read result
+        std::wstring oldResult; // Old method read result
+        std::wstring newResult; // New method read result
 
         // Old method read
         {
@@ -697,37 +703,379 @@ static void TestUTF8Compatibility(CuTest* tc) {
             _CLDELETE(input);
         }
         printf("newResult.size() = %zu\n", newResult.size());
-        printf("oldResult.size() = %zu, newResult.size() = %zu\n", 
oldResult.size(), newResult.size());
+        printf("oldResult.size() = %zu, newResult.size() = %zu\n", 
oldResult.size(),
+               newResult.size());
         // Compare results
         char lengthErrorMsg[256];
-        sprintf(lengthErrorMsg, "Compatibility test length mismatch - Old 
method: %zu, New method: %zu", 
+        sprintf(lengthErrorMsg,
+                "Compatibility test length mismatch - Old method: %zu, New 
method: %zu",
                 oldResult.size(), newResult.size());
         CuAssertTrueWithMessage(tc, lengthErrorMsg, oldResult.size() == 
newResult.size());
-        
+
         // Verify each character
         for (size_t i = 0; i < oldResult.size(); i++) {
             wchar_t oldChar = oldResult[i];
             wchar_t newChar = newResult[i];
-            
-            printf("Character #%zu: Old method: U+%04X, New method: U+%04X, 
%s, Original: U+%04X\n", 
-                   i, 
-                   (unsigned int)oldChar,
-                   (unsigned int)newChar, 
-                   (oldChar == newChar ? "Match" : "Mismatch"),
-                   (unsigned int)testString[i]);
-            
+
+            printf("Character #%zu: Old method: U+%04X, New method: U+%04X, 
%s, Original: U+%04X\n",
+                   i, (unsigned int)oldChar, (unsigned int)newChar,
+                   (oldChar == newChar ? "Match" : "Mismatch"), (unsigned 
int)testString[i]);
+
             char errorMsg[256];
-            sprintf(errorMsg, "Character mismatch - Position: %zu, Old method: 
U+%04X, New method: U+%04X", 
-                    i, (unsigned int)oldChar, (unsigned int)newChar);
-            
+            sprintf(errorMsg,
+                    "Character mismatch - Position: %zu, Old method: U+%04X, 
New method: U+%04X", i,
+                    (unsigned int)oldChar, (unsigned int)newChar);
+
             CuAssertTrueWithMessage(tc, errorMsg, oldChar == newChar);
         }
-        
-        printf("\nCompatibility test: %s\n", 
-               (oldResult == newResult ? "PASSED - Methods are compatible" : 
"FAILED - Methods are not compatible"));
+
+        printf("\nCompatibility test: %s\n",
+               (oldResult == newResult ? "PASSED - Methods are compatible"
+                                       : "FAILED - Methods are not 
compatible"));
+    }
+
+    _CLDELETE(dir);
+}
+
+// Prints `wstr` to std::cout as UTF-8, followed by std::endl (newline + flush).
+//
+// The converter is constructed with a byte-error string. With that constructor
+// to_bytes() returns the marker text when `wstr` cannot be encoded, instead of
+// throwing std::range_error. The tests in this file print terms that they expect
+// to be corrupted, so a logging helper must not be able to abort the run.
+void print_wstring(const std::wstring& wstr) {
+    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter("<unencodable wide string>");
+    std::cout << converter.to_bytes(wstr) << std::endl;
+}
+// Test STermInfosWriter with various Unicode characters (write and read).
+//
+// Writes every entry of `testTerms` through STermInfosWriter<TCHAR> with
+// setEnableCorrectTermWrite(true) into a RAMDirectory, then reopens the segment
+// with TermInfosReader and checks that:
+//   * for each term the reader reports as found, docFreq, freqPointer and
+//     proxPointer equal the values handed to the writer;
+//   * the term enumerator yields the written term texts, in the order written,
+//     and exactly as many of them as were written.
+//
+// NOTE(review): a term for which reader->get() returns null is only logged as
+// "NOT FOUND"; it does not fail the test. Confirm that this is intended.
+static void TestSTermInfosWriterUnicode(CuTest* tc) {
+    printf("\n=== Testing STermInfosWriter<TCHAR> with Unicode (Write and Read) ===\n");
+
+    // Create a RAM directory for testing
+    Directory* dir = _CLNEW RAMDirectory();
+
+    // Create field infos
+    FieldInfos* fieldInfos = _CLNEW FieldInfos();
+    fieldInfos->add(_T("content"), false);
+
+    const char* segmentName = "test_unicode";
+
+    // Define Unicode test cases: the term text plus a label used only in log output
+    struct UnicodeTermTest {
+        std::wstring str;
+        const char* description;
+    };
+
+    std::vector<UnicodeTermTest> testTerms = {
+            {L"A", "Basic Latin A (U+0041)"},
+            {L"z", "Basic Latin z (U+007A)"},
+            {L"©", "Copyright Sign (U+00A9)"},
+            {L"é", "Latin Small E with acute (U+00E9)"},
+            {L"œ", "Latin Small Ligature OE (U+0153)"},
+            {L"Ω", "Greek Capital Omega (U+03A9)"},
+            {L"π", "Greek Small Pi (U+03C0)"},
+            {L"Я", "Cyrillic Capital Letter Ya (U+042F)"},
+            {L"я", "Cyrillic Small Letter Ya (U+044F)"},
+            {L"א", "Hebrew Letter Alef (U+05D0)"},
+            {L"ا", "Arabic Letter Alef (U+0627)"},
+            {L"अ", "Devanagari Letter A (U+0905)"},
+            {L"一", "CJK Unified Ideograph (U+4E00)"},
+            {L"中", "CJK Unified Ideograph (U+4E2D)"},
+            {L"文", "CJK Unified Ideograph (U+6587)"},
+            {L"青", "CJK Unified Ideograph (U+9752)"},
+            {L"あ", "Hiragana Letter A (U+3042)"},
+            {L"ア", "Katakana Letter A (U+30A2)"},
+            {L"가", "Hangul Syllable GA (U+AC00)"},
+
+            {L"∀", "For All (U+2200)"},
+            {L"∑", "Summation (U+2211)"},
+            {L"∞", "Infinity (U+221E)"},
+            {L"≈", "Almost Equal To (U+2248)"},
+            {L"€", "Euro (U+20AC)"},
+            {L"₽", "Russian Ruble (U+20BD)"},
+            {L"₹", "Indian Rupee (U+20B9)"},
+            {L"₩", "Won Sign (U+20A9)"},
+            {L"←", "Left Arrow (U+2190)"},
+            {L"→", "Right Arrow (U+2192)"},
+            {L"═", "Box Drawing (U+2550)"},
+            {L"║", "Box Drawing (U+2551)"},
+            {L"☀", "Black Sun with Rays (U+2600)"},
+            {L"♥", "Black Heart Suit (U+2665)"},
+
+            {L"Hello世界", "Mixed English and Chinese"},
+            {L"Café☕", "Latin with accent and emoji"},
+            {L"Москва🏙", "Cyrillic with emoji"},
+            {L"こんにちは🌸", "Japanese with emoji"},
+            {L"안녕하세요🇰🇷", "Korean with flag emoji"},
+            {L"αβγδεζηθ", "Greek alphabet sequence"},
+            {L"∀x∈ℝ∃y≥x", "Mathematical expression"},
+            {L"♠♥♦♣", "Card suits"},
+            {L"←↑→↓", "Arrow directions"},
+            {L"╔═══╗║   ║╚═══╝", "Box drawing frame"},
+
+            {L"苹果🍎Apple", "Mixed Chinese, emoji and English"},
+            {L"数学∫f(x)dx=F(x)+C", "Mathematical formula with Chinese"},
+            {L"こんにちは世界", "Japanese and Chinese"},
+            {L"тест тест 测试 테스트", "Mixed Cyrillic, Chinese and Korean"},
+            {L"😀😁😂🤣😃😄😅😆", "Multiple emoji sequence"}};
+
+    // === WRITE PHASE ===
+    printf("\n--- Write Phase ---\n");
+
+    // Create a TermInfosWriter
+    STermInfosWriter<TCHAR>* writer =
+            _CLNEW STermInfosWriter<TCHAR>(dir, segmentName, fieldInfos, 128);
+    writer->setEnableCorrectTermWrite(true);
+
+    // Synthetic pointer values, advanced after every term so each term gets its own pair
+    int64_t freqPointer = 0;
+    int64_t proxPointer = 1000;
+
+    // Store terms and their expected infos for verification in the read phase
+    std::vector<Term*> terms;
+    std::vector<TermInfo*> termInfos;
+
+    // Add all test terms to the writer
+    for (size_t i = 0; i < testTerms.size(); i++) {
+        // Create term
+        Term* term = _CLNEW Term(_T("content"), testTerms[i].str.c_str());
+        terms.push_back(term);
+
+        // Create and store the expected term info
+        TermInfo* currentTi = _CLNEW TermInfo();
+        currentTi->docFreq = i + 1;
+        currentTi->freqPointer = freqPointer;
+        currentTi->proxPointer = proxPointer;
+        currentTi->skipOffset = (i % 10 == 0) ? 10 : 0;
+        termInfos.push_back(currentTi);
+
+        // The writer receives a stack copy carrying the same four values
+        TermInfo tempTi;
+        tempTi.docFreq = currentTi->docFreq;
+        tempTi.freqPointer = currentTi->freqPointer;
+        tempTi.proxPointer = currentTi->proxPointer;
+        tempTi.skipOffset = currentTi->skipOffset;
+        writer->add(term->field(), term->text(), term->textLength(), &tempTi);
+
+        printf("Added Term #%zu: \"", i);
+        print_wstring(testTerms[i].str);
+        printf("\" - %s\n", testTerms[i].description);
+
+        // Increment pointers for next term
+        freqPointer += 100 + i * 10;
+        proxPointer += 200 + i * 20;
+    }
+
+    // Close the writer
+    writer->close();
+    _CLDELETE(writer);
+
+    // === READ PHASE ===
+    printf("\n--- Read Phase ---\n");
+
+    // Create a TermInfosReader
+    TermInfosReader* reader = _CLNEW TermInfosReader(dir, segmentName, fieldInfos);
+
+    // Verify each term can be read back correctly
+    for (size_t i = 0; i < terms.size(); i++) {
+        Term* originalTerm = terms[i];
+        TermInfo* originalInfo = termInfos[i];
+
+        // Read term info
+        auto readInfo = reader->get(originalTerm, nullptr);
+
+        printf("Term #%zu: \"", i);
+        print_wstring(testTerms[i].str);
+        printf("\" - %s\n", readInfo ? "Found" : "NOT FOUND");
+
+        if (readInfo) {
+            // Verify values match
+            CuAssertTrue(tc, originalInfo->docFreq == readInfo->docFreq);
+            CuAssertTrue(tc, originalInfo->freqPointer == readInfo->freqPointer);
+            CuAssertTrue(tc, originalInfo->proxPointer == readInfo->proxPointer);
+        }
+
+        _CLDELETE(readInfo);
+    }
+
+    // Verify term enumeration
+    SegmentTermEnum* termEnum = reader->terms();
+    size_t count = 0;
+
+    printf("\n--- Term Enumeration ---\n");
+    while (termEnum->next()) {
+        Term* term = termEnum->term(false);
+        printf("Enum #%zu: ", count);
+        print_wstring(term->text());
+        if (count >= testTerms.size()) {
+            // More terms than were written: fail here rather than index past testTerms
+            CuAssertTrueWithMessage(tc, "Enumerated more terms than were written", false);
+            break;
+        }
+        CuAssertTrueWithMessage(tc, "Term text mismatch",
+                                _tcscmp(term->text(), testTerms[count].str.c_str()) == 0);
+        count++;
+    }
+
+    printf("Total enumerated terms: %zu (expected: %zu)\n", count, terms.size());
+    CuAssertTrue(tc, (int)terms.size() == (int)count);
+
+    _CLDELETE(termEnum);
+    _CLDELETE(reader);
+
+    // Clean up
+    for (size_t i = 0; i < terms.size(); i++) {
+        _CLDELETE(terms[i]);
+        _CLDELETE(termInfos[i]);
     }
 
+    _CLDELETE(fieldInfos);
     _CLDELETE(dir);
+
+    printf("STermInfosWriter/Reader Unicode test completed successfully\n");
+}
+
+// Test STermInfosWriter with Unicode characters when correctTermWrite is disabled.
+//
+// Writes two single-character terms through STermInfosWriter<TCHAR> with
+// setEnableCorrectTermWrite(false): one code point inside the BMP and one above
+// U+FFFF. It then reopens the segment with TermInfosReader and checks that:
+//   * for each term the reader reports as found, docFreq, freqPointer and
+//     proxPointer equal the values handed to the writer;
+//   * on enumeration the BMP term text equals the original, while the text of
+//     the term above U+FFFF differs from the original (expected corruption);
+//   * exactly as many terms are enumerated as were written.
+//
+// NOTE(review): the literal for U+2070E is treated as a single TCHAR, which
+// presumably requires a 32-bit wchar_t - confirm for the supported platforms.
+static void TestSTermInfosWriterUnicodeDisabled(CuTest* tc) {
+    printf("\n=== Testing STermInfosWriter<TCHAR> with Unicode (Disabled Correct Term Write) "
+           "===\n");
+
+    // Create a RAM directory for testing
+    Directory* dir = _CLNEW RAMDirectory();
+
+    // Create field infos
+    FieldInfos* fieldInfos = _CLNEW FieldInfos();
+    fieldInfos->add(_T("content"), false);
+
+    const char* segmentName = "test_unicode_disabled";
+
+    // Define Unicode test cases - one normal char and one > 0xFFFF.
+    // shouldCorruptOnRead marks terms whose enumerated text must NOT equal `str`.
+    struct UnicodeTermTest {
+        std::wstring str;
+        const char* description;
+        bool shouldCorruptOnRead;
+    };
+
+    std::vector<UnicodeTermTest> testTerms = {
+            {L"中", "CJK Unified Ideograph (U+4E2D)", false}, // Regular Unicode character
+            {L"𠜎", "CJK Unified Ideograph Extension B (U+2070E)",
+             true} // Character beyond BMP (> 0xFFFF)
+    };
+
+    // === WRITE PHASE ===
+    printf("\n--- Write Phase ---\n");
+
+    // Create a TermInfosWriter with correctTermWrite disabled
+    STermInfosWriter<TCHAR>* writer =
+            _CLNEW STermInfosWriter<TCHAR>(dir, segmentName, fieldInfos, 128);
+    writer->setEnableCorrectTermWrite(false); // Disable correct term write
+
+    // Synthetic pointer values, advanced after every term so each term gets its own pair
+    int64_t freqPointer = 0;
+    int64_t proxPointer = 1000;
+
+    // Store terms and their expected infos for verification in the read phase
+    std::vector<Term*> terms;
+    std::vector<TermInfo*> termInfos;
+
+    // Add all test terms to the writer
+    for (size_t i = 0; i < testTerms.size(); i++) {
+        // Create term
+        Term* term = _CLNEW Term(_T("content"), testTerms[i].str.c_str());
+        terms.push_back(term);
+
+        // Create and store the expected term info
+        TermInfo* currentTi = _CLNEW TermInfo();
+        currentTi->docFreq = i + 1;
+        currentTi->freqPointer = freqPointer;
+        currentTi->proxPointer = proxPointer;
+        currentTi->skipOffset = 0;
+        termInfos.push_back(currentTi);
+
+        // The writer receives a stack copy carrying the same four values
+        TermInfo tempTi;
+        tempTi.docFreq = currentTi->docFreq;
+        tempTi.freqPointer = currentTi->freqPointer;
+        tempTi.proxPointer = currentTi->proxPointer;
+        tempTi.skipOffset = currentTi->skipOffset;
+        writer->add(term->field(), term->text(), term->textLength(), &tempTi);
+
+        printf("Added Term #%zu: \"", i);
+        print_wstring(testTerms[i].str);
+        printf("\" - %s\n", testTerms[i].description);
+
+        // Increment pointers for next term
+        freqPointer += 100 + i * 10;
+        proxPointer += 200 + i * 20;
+    }
+
+    // Close the writer
+    writer->close();
+    _CLDELETE(writer);
+
+    // === READ PHASE ===
+    printf("\n--- Read Phase ---\n");
+
+    // Create a TermInfosReader
+    TermInfosReader* reader = _CLNEW TermInfosReader(dir, segmentName, fieldInfos);
+
+    // Verify each term's read behavior
+    for (size_t i = 0; i < terms.size(); i++) {
+        Term* originalTerm = terms[i];
+        TermInfo* originalInfo = termInfos[i];
+
+        // Read term info
+        auto readInfo = reader->get(originalTerm, nullptr);
+
+        printf("Term #%zu: \"", i);
+        print_wstring(testTerms[i].str);
+        printf("\" - %s\n", readInfo ? "Found" : "NOT FOUND");
+
+        if (readInfo) {
+            // Verify values match
+            CuAssertTrue(tc, originalInfo->docFreq == readInfo->docFreq);
+            CuAssertTrue(tc, originalInfo->freqPointer == readInfo->freqPointer);
+            CuAssertTrue(tc, originalInfo->proxPointer == readInfo->proxPointer);
+        }
+
+        _CLDELETE(readInfo);
+    }
+
+    // Verify term enumeration
+    SegmentTermEnum* termEnum = reader->terms();
+    size_t count = 0;
+
+    printf("\n--- Term Enumeration ---\n");
+    while (termEnum->next()) {
+        Term* term = termEnum->term(false);
+        printf("Enum #%zu: ", count);
+        print_wstring(term->text());
+
+        if (count >= testTerms.size()) {
+            // More terms than were written: fail here rather than index past testTerms
+            CuAssertTrueWithMessage(tc, "Enumerated more terms than were written", false);
+            break;
+        }
+
+        const std::wstring& expectedText = testTerms[count].str;
+
+        // For regular Unicode characters, they should be preserved correctly
+        if (!testTerms[count].shouldCorruptOnRead) {
+            CuAssertTrueWithMessage(tc, "Regular Unicode term text should match",
+                                    _tcscmp(term->text(), expectedText.c_str()) == 0);
+        } else {
+            // For characters beyond BMP (>0xFFFF), they should NOT match due to disabled correctTermWrite
+            printf(" - Expected corruption for high Unicode character\n");
+            CuAssertTrueWithMessage(tc, "High Unicode term should be corrupted",
+                                    _tcscmp(term->text(), expectedText.c_str()) != 0);
+        }
+        count++;
+    }
+
+    printf("Total enumerated terms: %zu (expected: %zu)\n", count, terms.size());
+    CuAssertTrue(tc, (int)terms.size() == (int)count);
+
+    _CLDELETE(termEnum);
+    _CLDELETE(reader);
+
+    // Clean up
+    for (size_t i = 0; i < terms.size(); i++) {
+        _CLDELETE(terms[i]);
+        _CLDELETE(termInfos[i]);
+    }
+
+    _CLDELETE(fieldInfos);
+    _CLDELETE(dir);
+
+    printf("STermInfosWriter/Reader Unicode Disabled test completed\n");
 }
 
 CuSuite* testUTF8CharsSuite() {
@@ -740,6 +1088,8 @@ CuSuite* testUTF8CharsSuite() {
     SUITE_ADD_TEST(suite, TestSpecialUTF8Sequences);
     SUITE_ADD_TEST(suite, TestUTF8Performance);
     SUITE_ADD_TEST(suite, TestUTF8Compatibility);
+    SUITE_ADD_TEST(suite, TestSTermInfosWriterUnicode);
+    SUITE_ADD_TEST(suite, TestSTermInfosWriterUnicodeDisabled);
 
     return suite;
 }
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

(doris-thirdparty) branch clucene updated: [test](unicode) add more ut (#288)

Reply via email to