This is an automated email from the ASF dual-hosted git repository. airborne pushed a commit to branch clucene in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push: new 74926996d64 [test](unicode) add more ut (#288) 74926996d64 is described below commit 74926996d64e262eea70376f5807335b40dc5c37 Author: airborne12 <jiang...@selectdb.com> AuthorDate: Wed Mar 5 23:08:23 2025 +0800 [test](unicode) add more ut (#288) --- src/test/store/TestUTF8Chars.cpp | 726 +++++++++++++++++++++++++++++---------- 1 file changed, 538 insertions(+), 188 deletions(-) diff --git a/src/test/store/TestUTF8Chars.cpp b/src/test/store/TestUTF8Chars.cpp index 76eca31664c..22695f1b560 100644 --- a/src/test/store/TestUTF8Chars.cpp +++ b/src/test/store/TestUTF8Chars.cpp @@ -4,22 +4,29 @@ #include "CLucene/store/IndexOutput.h" #include "CLucene/store/RAMDirectory.h" #include "CuTest.h" +#include <codecvt> #include <ctime> +#include <locale> #include <string> #include <vector> #include <utility> #include <iostream> +#include "CLucene/index/_TermInfosWriter.h" +#include "CLucene/index/_TermInfosReader.h" +#include "CLucene/index/_FieldInfos.h" +#include "CLucene/index/_TermInfo.h" +#include "CLucene/index/Term.h" using namespace lucene::store; // Add a helper macro for printing more detailed error messages when assertions fail #define CuAssertTrueWithMessage(tc, message, condition) \ - do { \ - if (!(condition)) { \ - printf("Assertion failed: %s\n", message); \ - } \ - CuAssertTrue(tc, condition); \ - } while(0) + do { \ + if (!(condition)) { \ + printf("Assertion failed: %s\n", message); \ + } \ + CuAssertTrue(tc, condition); \ + } while (0) static void TestUTF8WriteAndReadChars(CuTest* tc) { RAMDirectory* dir = _CLNEW RAMDirectory(); @@ -33,8 +40,8 @@ static void TestUTF8WriteAndReadChars(CuTest* tc) { testString.push_back(L'你'); // 3 bytes testString.push_back(L'好'); // 3 bytes testString.push_back((wchar_t)0x1F600); // 4 bytes - - output->writeString(testString); + output->writeVInt(testString.length()); + output->writeSChars<TCHAR>(testString.c_str(), testString.length()); output->close(); _CLDELETE(output); @@ -54,10 +61,10 @@ static void TestUTF8WriteAndReadChars(CuTest* tc) { for (size_t i = 0; i < testString.size(); i++) { printf("Character #%zu: 0x%04X -> Readback: 0x%04X, %s\n", i, (unsigned int)testString[i], - (unsigned int)readBackString[i], - (testString[i] == readBackString[i] ? "Success" : "Failed")); + (unsigned int)readBackString[i], + (testString[i] == readBackString[i] ? "Success" : "Failed")); char errorMsg[256]; - sprintf(errorMsg, "Character mismatch - Position: %zu, Original: 0x%04X, Readback: 0x%04X", + sprintf(errorMsg, "Character mismatch - Position: %zu, Original: 0x%04X, Readback: 0x%04X", i, (unsigned int)testString[i], (unsigned int)readBackString[i]); CuAssertTrueWithMessage(tc, errorMsg, testString[i] == readBackString[i]); } @@ -109,7 +116,7 @@ static void TestUnicodeRanges(CuTest* tc) { // Create a test string containing characters from various Unicode ranges std::vector<std::pair<wchar_t, const char*>> unicodeTestChars = { // ASCII range (U+0000 - U+007F) - 1 byte UTF-8 -// {0x0000, "NULL character (U+0000)"}, + // {0x0000, "NULL character (U+0000)"}, {0x0001, "Start of Heading (U+0001)"}, {0x007F, "Delete (U+007F)"}, {L'A', "Latin letter A (U+0041)"}, @@ -148,56 +155,47 @@ static void TestUnicodeRanges(CuTest* tc) { {0x10348, "Gothic letter 𐍈 (U+10348)"}, {0x10400, "Deseret letter 𐐀 (U+10400)"}, {0x10FFFF, "Unicode maximum value (U+10FFFF)"}, - - // 数学符号 + {0x2200, "For All ∀ (U+2200)"}, {0x2211, "Summation ∑ (U+2211)"}, {0x221E, "Infinity ∞ (U+221E)"}, {0x2248, "Almost Equal To ≈ (U+2248)"}, - - // 货币符号 + {0x20AC, "Euro € (U+20AC)"}, {0x20BD, "Russian Ruble ₽ (U+20BD)"}, {0x20B9, "Indian Rupee ₹ (U+20B9)"}, {0x20A9, "Won Sign ₩ (U+20A9)"}, - - // 箭头符号 + {0x2190, "Left Arrow ← (U+2190)"}, {0x2192, "Right Arrow → (U+2192)"}, {0x2191, "Up Arrow ↑ (U+2191)"}, {0x2193, "Down Arrow ↓ (U+2193)"}, - - // 框线符号 + {0x2550, "Box Drawing ═ (U+2550)"}, {0x2551, "Box Drawing ║ (U+2551)"}, {0x2554, "Box Drawing ╔ (U+2554)"}, {0x2557, "Box Drawing ╗ (U+2557)"}, - - // 字母符号 + {0x2122, "Trade Mark ™ (U+2122)"}, {0x2105, "Care Of ℅ (U+2105)"}, {0x2113, "Script Small L ℓ (U+2113)"}, {0x2116, "Numero Sign № (U+2116)"}, - - // 装饰符号 + {0x2600, "Black Sun with Rays ☀ (U+2600)"}, {0x2602, "Umbrella ☂ (U+2602)"}, {0x2614, "Umbrella with Rain Drops ☔ (U+2614)"}, {0x2665, "Black Heart Suit ♥ (U+2665)"}, - - // 更多补充平面字符 + {0x1D400, "Mathematical Bold Capital A 𝐀 (U+1D400)"}, {0x1D538, "Mathematical Double-Struck Capital A 𝔸 (U+1D538)"}, {0x1F300, "Cyclone 🌀 (U+1F300)"}, {0x1F431, "Cat Face 🐱 (U+1F431)"}, {0x1F52B, "Pistol 🔫 (U+1F52B)"}, {0x1F697, "Automobile 🚗 (U+1F697)"}, - - // 额外CJK字符 + {0x20000, "CJK Unified Ideograph 𠀀 (U+20000)"}, {0x2A700, "CJK Unified Ideograph 𪜀 (U+2A700)"}, - - // 其他技术符号 + {0x2300, "Diameter Sign ⌀ (U+2300)"}, {0x231B, "Hourglass ⌛ (U+231B)"}, {0x2328, "Keyboard ⌨ (U+2328)"}, @@ -240,12 +238,14 @@ static void TestUnicodeRanges(CuTest* tc) { wchar_t original = testString[i]; wchar_t readBack = readBackString[i]; - printf("Character #%zu: 0x%04X (%s) -> Readback: 0x%04X, %s\n", i, (unsigned int)original, - unicodeTestChars[i].second, (unsigned int)readBack, + printf("Character #%zu: 0x%04X (%s) -> Readback: 0x%04X, %s\n", i, + (unsigned int)original, unicodeTestChars[i].second, (unsigned int)readBack, (original == readBack ? "Success" : "Failed")); char errorMsg[256]; - sprintf(errorMsg, "Unicode character mismatch - Position: %zu, Character: %s, Original: 0x%04X, Readback: 0x%04X", + sprintf(errorMsg, + "Unicode character mismatch - Position: %zu, Character: %s, Original: 0x%04X, " + "Readback: 0x%04X", i, unicodeTestChars[i].second, (unsigned int)original, (unsigned int)readBack); CuAssertTrueWithMessage(tc, errorMsg, original == readBack); } @@ -265,13 +265,14 @@ static void TestEdgeCases(CuTest* tc) { // Create a test string containing edge cases std::wstring testString; - + // 1. Empty string test IndexOutput* emptyOutput = dir->createOutput("empty_string.dat"); - emptyOutput->writeString(L""); + emptyOutput->writeVInt(0); + emptyOutput->writeSChars<TCHAR>(L"", 0); emptyOutput->close(); _CLDELETE(emptyOutput); - + IndexInput* emptyInput = nullptr; CLuceneError emptyError; dir->openInput("empty_string.dat", emptyInput, emptyError); @@ -281,7 +282,7 @@ static void TestEdgeCases(CuTest* tc) { CuAssertIntEquals(tc, _T("Empty string length should be 0"), 0, (int)emptyReadBack.size()); emptyInput->close(); _CLDELETE(emptyInput); - + // 2. Long string test (containing various characters) std::wstring longString; // Add 1000 mixed characters @@ -291,47 +292,51 @@ static void TestEdgeCases(CuTest* tc) { longString.push_back(0x4E00 + i % 100); // Chinese longString.push_back(0x1F600 + i % 50); // Emoji } - + IndexOutput* longOutput = dir->createOutput("long_string.dat"); - longOutput->writeString(longString); + longOutput->writeVInt(longString.length()); + longOutput->writeSChars<TCHAR>(longString.c_str(), longString.length()); longOutput->close(); _CLDELETE(longOutput); - + IndexInput* longInput = nullptr; CLuceneError longError; dir->openInput("long_string.dat", longInput, longError); TCHAR* longStr = longInput->readString(); std::wstring longReadBack(longStr); _CLDELETE_LARRAY(longStr); - - CuAssertIntEquals(tc, _T("Long string length mismatch"), (int)longString.size(), (int)longReadBack.size()); - + + CuAssertIntEquals(tc, _T("Long string length mismatch"), (int)longString.size(), + (int)longReadBack.size()); + // Only check some characters to avoid too much output printf("\n=== Long String Test (showing first 10 characters) ===\n"); for (size_t i = 0; i < 10 && i < longString.size(); i++) { - printf("Character #%zu: 0x%04X -> Readback: 0x%04X, %s\n", - i, - (unsigned int)longString[i], - (unsigned int)longReadBack[i], + printf("Character #%zu: 0x%04X -> Readback: 0x%04X, %s\n", i, (unsigned int)longString[i], + (unsigned int)longReadBack[i], (longString[i] == longReadBack[i] ? "Success" : "Failed")); - + char errorMsg[256]; - sprintf(errorMsg, "Long string character mismatch - Position: %zu, Original: 0x%04X, Readback: 0x%04X", + sprintf(errorMsg, + "Long string character mismatch - Position: %zu, Original: 0x%04X, Readback: " + "0x%04X", i, (unsigned int)longString[i], (unsigned int)longReadBack[i]); CuAssertTrueWithMessage(tc, errorMsg, longString[i] == longReadBack[i]); } - + // Random sample check for (size_t i = 0; i < longString.size(); i += 100) { char errorMsg[256]; - sprintf(errorMsg, "Long string sample check failed - Position: %zu, Original: 0x%04X, Readback: 0x%04X", + sprintf(errorMsg, + "Long string sample check failed - Position: %zu, Original: 0x%04X, Readback: " + "0x%04X", i, (unsigned int)longString[i], (unsigned int)longReadBack[i]); CuAssertTrueWithMessage(tc, errorMsg, longString[i] == longReadBack[i]); } - + longInput->close(); _CLDELETE(longInput); - + _CLDELETE(dir); } @@ -347,23 +352,21 @@ static void TestUTF8EncodingBoundaries(CuTest* tc) { int expectedBytes; }; - std::vector<BoundaryTest> boundaryTests = { - // 1-byte boundaries - //{0x0000, "NULL (U+0000) - 1-byte lower bound", 1}, - {0x007F, "DELETE (U+007F) - 1-byte upper bound", 1}, - - // 2-byte boundaries - {0x0080, "PAD (U+0080) - 2-byte lower bound", 2}, - {0x07FF, "2-byte upper bound (U+07FF)", 2}, - - // 3-byte boundaries - {0x0800, "3-byte lower bound (U+0800)", 3}, - {0xFFFF, "BMP upper bound (U+FFFF)", 3}, - - // 4-byte boundaries (supplementary planes) - {0x10000, "SMP lower bound (U+10000)", 4}, - {0x10FFFF, "Unicode upper bound (U+10FFFF)", 4} - }; + std::vector<BoundaryTest> boundaryTests = {// 1-byte boundaries + //{0x0000, "NULL (U+0000) - 1-byte lower bound", 1}, + {0x007F, "DELETE (U+007F) - 1-byte upper bound", 1}, + + // 2-byte boundaries + {0x0080, "PAD (U+0080) - 2-byte lower bound", 2}, + {0x07FF, "2-byte upper bound (U+07FF)", 2}, + + // 3-byte boundaries + {0x0800, "3-byte lower bound (U+0800)", 3}, + {0xFFFF, "BMP upper bound (U+FFFF)", 3}, + + // 4-byte boundaries (supplementary planes) + {0x10000, "SMP lower bound (U+10000)", 4}, + {0x10FFFF, "Unicode upper bound (U+10FFFF)", 4}}; // Build test string std::wstring testString; @@ -373,20 +376,20 @@ static void TestUTF8EncodingBoundaries(CuTest* tc) { // Write test string IndexOutput* output = dir->createOutput(testFileName); - + // Manually write each character and record byte count output->writeVInt(testString.length()); - + std::vector<int> actualBytes; int64_t startPos, endPos; - + for (size_t i = 0; i < testString.size(); i++) { startPos = output->getFilePointer(); output->writeChars(&testString[i], 1); endPos = output->getFilePointer(); actualBytes.push_back((int)(endPos - startPos)); } - + output->close(); _CLDELETE(output); @@ -394,7 +397,7 @@ static void TestUTF8EncodingBoundaries(CuTest* tc) { IndexInput* input = nullptr; CLuceneError error; dir->openInput(testFileName, input, error); - + const int32_t len = input->readVInt(); TCHAR* buffer = _CL_NEWARRAY(TCHAR, len + 1); input->readChars(buffer, 0, len); @@ -403,31 +406,31 @@ static void TestUTF8EncodingBoundaries(CuTest* tc) { _CLDELETE_LARRAY(buffer); // Verify length - CuAssertIntEquals(tc, _T("Boundary test string length mismatch"), (int)testString.size(), (int)readBackString.size()); + CuAssertIntEquals(tc, _T("Boundary test string length mismatch"), (int)testString.size(), + (int)readBackString.size()); // Verify each character and their byte counts printf("\n=== UTF-8 Encoding Boundary Tests ===\n"); for (size_t i = 0; i < testString.size(); i++) { wchar_t original = testString[i]; wchar_t readBack = readBackString[i]; - - printf("Character #%zu: U+%04X (%s)\n", - i, - (unsigned int)original, + + printf("Character #%zu: U+%04X (%s)\n", i, (unsigned int)original, boundaryTests[i].description); - printf(" - Expected bytes: %d, Actual bytes: %d, %s\n", - boundaryTests[i].expectedBytes, + printf(" - Expected bytes: %d, Actual bytes: %d, %s\n", boundaryTests[i].expectedBytes, actualBytes[i], (boundaryTests[i].expectedBytes == actualBytes[i] ? "Success" : "Failed")); - printf(" - Readback: U+%04X, %s\n", - (unsigned int)readBack, + printf(" - Readback: U+%04X, %s\n", (unsigned int)readBack, (original == readBack ? "Success" : "Failed")); - + char errorMsg[256]; - sprintf(errorMsg, "Boundary character mismatch - Position: %zu, Description: %s, Original: 0x%04X, Readback: 0x%04X", + sprintf(errorMsg, + "Boundary character mismatch - Position: %zu, Description: %s, Original: 0x%04X, " + "Readback: 0x%04X", i, boundaryTests[i].description, (unsigned int)original, (unsigned int)readBack); CuAssertTrueWithMessage(tc, errorMsg, original == readBack); - CuAssertIntEquals(tc, _T("UTF-8 encoding byte count mismatch"), boundaryTests[i].expectedBytes, actualBytes[i]); + CuAssertIntEquals(tc, _T("UTF-8 encoding byte count mismatch"), + boundaryTests[i].expectedBytes, actualBytes[i]); } input->close(); @@ -438,81 +441,83 @@ static void TestUTF8EncodingBoundaries(CuTest* tc) { // Test special UTF-8 sequences static void TestSpecialUTF8Sequences(CuTest* tc) { RAMDirectory* dir = _CLNEW RAMDirectory(); - + // Test some special UTF-8 sequences struct SpecialSequenceTest { std::wstring str; const char* description; }; - + std::vector<SpecialSequenceTest> specialTests = { - {L"", "Empty string"}, - {L"Hello", "Pure ASCII string"}, - {L"你好世界", "Pure Chinese string"}, - {L"Hello, 世界!", "Mixed ASCII and Chinese"}, - {L"🌍🌎🌏", "Pure Emoji (4-byte characters)"}, - {L"Earth: 🌍 🌎 🌏", "Mixed Chinese and Emoji"}, - {L"A\u0000B", "String containing NULL character"}, - {L"سلام دنیا", "Arabic/Persian text"}, - {L"こんにちは世界", "Japanese and Chinese"}, - {L"Hello\nWorld", "String with newline"}, - {L"Tab\tCharacter", "String with tab character"}, - {std::wstring(1000, L'A'), "1000 identical characters"}, - {L"😀😃😄😁😆😅😂🤣", "Consecutive Emoji"} - }; - + {L"", "Empty string"}, + {L"Hello", "Pure ASCII string"}, + {L"你好世界", "Pure Chinese string"}, + {L"Hello, 世界!", "Mixed ASCII and Chinese"}, + {L"🌍🌎🌏", "Pure Emoji (4-byte characters)"}, + {L"Earth: 🌍 🌎 🌏", "Mixed Chinese and Emoji"}, + {L"A\u0000B", "String containing NULL character"}, + {L"سلام دنیا", "Arabic/Persian text"}, + {L"こんにちは世界", "Japanese and Chinese"}, + {L"Hello\nWorld", "String with newline"}, + {L"Tab\tCharacter", "String with tab character"}, + {std::wstring(1000, L'A'), "1000 identical characters"}, + {L"😀😃😄😁😆😅😂🤣", "Consecutive Emoji"}}; + for (size_t testIndex = 0; testIndex < specialTests.size(); testIndex++) { const auto& test = specialTests[testIndex]; std::string testFileName = "special_test_" + std::to_string(testIndex) + ".dat"; - + // Write test string IndexOutput* output = dir->createOutput(testFileName.c_str()); - output->writeString(test.str); + output->writeVInt(test.str.length()); + output->writeSChars<TCHAR>(test.str.c_str(), test.str.length()); output->close(); _CLDELETE(output); - + // Read and verify IndexInput* input = nullptr; CLuceneError error; dir->openInput(testFileName.c_str(), input, error); - + TCHAR* readBackStr = input->readString(); std::wstring readBackString(readBackStr); _CLDELETE_LARRAY(readBackStr); - + printf("\n=== Special UTF-8 Sequence Test #%zu: %s ===\n", testIndex, test.description); - printf(" - Original length: %zu, Readback length: %zu, %s\n", - test.str.length(), + printf(" - Original length: %zu, Readback length: %zu, %s\n", test.str.length(), readBackString.length(), (test.str.length() == readBackString.length() ? "Success" : "Failed")); - - CuAssertIntEquals(tc, _T("Special sequence length mismatch"), (int)test.str.length(), (int)readBackString.length()); - + + CuAssertIntEquals(tc, _T("Special sequence length mismatch"), (int)test.str.length(), + (int)readBackString.length()); + // For shorter strings, print each character for comparison if (test.str.length() <= 20) { for (size_t i = 0; i < test.str.length(); i++) { - printf(" Character #%zu: U+%04X -> Readback: U+%04X, %s\n", - i, - (unsigned int)test.str[i], - (unsigned int)readBackString[i], + printf(" Character #%zu: U+%04X -> Readback: U+%04X, %s\n", i, + (unsigned int)test.str[i], (unsigned int)readBackString[i], (test.str[i] == readBackString[i] ? "Success" : "Failed")); - + char errorMsg[256]; - sprintf(errorMsg, "Special sequence character mismatch - Test: %s, Position: %zu, Original: 0x%04X, Readback: 0x%04X", - test.description, i, (unsigned int)test.str[i], (unsigned int)readBackString[i]); + sprintf(errorMsg, + "Special sequence character mismatch - Test: %s, Position: %zu, Original: " + "0x%04X, Readback: 0x%04X", + test.description, i, (unsigned int)test.str[i], + (unsigned int)readBackString[i]); CuAssertTrueWithMessage(tc, errorMsg, test.str[i] == readBackString[i]); } } else { // For longer strings, just check equality char errorMsg[256]; - sprintf(errorMsg, "Long special sequence mismatch - Test: %s, Length: %zu", test.description, test.str.length()); + sprintf(errorMsg, "Long special sequence mismatch - Test: %s, Length: %zu", + test.description, test.str.length()); CuAssertTrueWithMessage(tc, errorMsg, test.str == readBackString); } - + input->close(); _CLDELETE(input); } - + _CLDELETE(dir); } @@ -520,66 +525,69 @@ static void TestSpecialUTF8Sequences(CuTest* tc) { static void TestUTF8Performance(CuTest* tc) { RAMDirectory* dir = _CLNEW RAMDirectory(); const char* testFileName = "test_utf8_performance.dat"; - + // Create a large test string with various types of characters std::wstring testString; const int testSize = 100000; // 100,000 characters - + // Add different types of characters for (int i = 0; i < testSize / 4; i++) { - testString.push_back(L'A' + (i % 26)); // ASCII characters + testString.push_back(L'A' + (i % 26)); // ASCII characters testString.push_back(0x00A0 + (i % 128)); // Latin extended - testString.push_back(0x4E00 + (i % 1000)); // Chinese characters + testString.push_back(0x4E00 + (i % 1000)); // Chinese characters testString.push_back(0x1F600 + (i % 50)); // Emoji (4-byte characters) } - + printf("\n=== UTF-8 Encoding Performance Test (String length: %zu) ===\n", testString.size()); - + // Measure write time clock_t writeStart = clock(); - + IndexOutput* output = dir->createOutput(testFileName); - output->writeString(testString); + output->writeVInt(testString.length()); + output->writeSChars<TCHAR>(testString.c_str(), testString.length()); output->close(); _CLDELETE(output); - + clock_t writeEnd = clock(); double writeTime = ((double)(writeEnd - writeStart)) / CLOCKS_PER_SEC; printf("Write time: %.6f seconds\n", writeTime); - + // Measure read time clock_t readStart = clock(); - + IndexInput* input = nullptr; CLuceneError error; dir->openInput(testFileName, input, error); - + TCHAR* readBackStr = input->readString(); std::wstring readBackString(readBackStr); _CLDELETE_LARRAY(readBackStr); - + clock_t readEnd = clock(); double readTime = ((double)(readEnd - readStart)) / CLOCKS_PER_SEC; printf("Read time: %.6f seconds\n", readTime); - + // Verify results char lengthErrorMsg[256]; - sprintf(lengthErrorMsg, "Performance test string length mismatch - Original: %zu, Readback: %zu", + sprintf(lengthErrorMsg, + "Performance test string length mismatch - Original: %zu, Readback: %zu", testString.size(), readBackString.size()); - CuAssertIntEquals(tc, _T("Performance test string length mismatch"), (int)testString.size(), (int)readBackString.size()); + CuAssertIntEquals(tc, _T("Performance test string length mismatch"), (int)testString.size(), + (int)readBackString.size()); printf(" %s\n", lengthErrorMsg); // Print error message directly instead of using macro - + char contentErrorMsg[256]; sprintf(contentErrorMsg, "Performance test string content mismatch"); CuAssertTrueWithMessage(tc, contentErrorMsg, testString == readBackString); - + // Calculate characters processed per second double writeCharsPerSec = testString.size() / writeTime; double readCharsPerSec = readBackString.size() / readTime; - + printf("Write speed: %.2f characters/second\n", writeCharsPerSec); printf("Read speed: %.2f characters/second\n", readCharsPerSec); - + input->close(); _CLDELETE(input); _CLDELETE(dir); @@ -591,26 +599,26 @@ static void TestUTF8Compatibility(CuTest* tc) { // Test characters covering different ranges const std::vector<wchar_t> testChars = { - // Basic 1-byte - L'A', - // Common 3-byte Chinese characters - L'你', L'好', - // 4-byte characters from different planes - 0x1F600, // 😀 Grinning Face (Emoji) - 0xF600, // 丽 - 0x1F64F, // 🙏 Folded Hands - 0xF64F, // 碌 - 0x20021, // 𠀡 CJK Unified Ideographs Extension B - 0x0021, // ! Basic Latin - 0x2A6D6, // 𪛖 CJK Unified Ideographs Extension C - 0xA6D6, // ꛖ Hangul Jamo Extended-B - 0x10123, // 𐄣 Ancient Greek Numbers - 0x0123, // Cuneiform - 0x10348, // 𐍈 Gothic Letter Hwair - 0x0348, // Ȉ - // Boundary cases - //0x10000, // Minimum 4-byte character - 0x10FFFF // Maximum valid Unicode code point + // Basic 1-byte + L'A', + // Common 3-byte Chinese characters + L'你', L'好', + // 4-byte characters from different planes + 0x1F600, // 😀 Grinning Face (Emoji) + 0xF600, // 丽 + 0x1F64F, // 🙏 Folded Hands + 0xF64F, // 碌 + 0x20021, // 𠀡 CJK Unified Ideographs Extension B + 0x0021, // ! Basic Latin + 0x2A6D6, // 𪛖 CJK Unified Ideographs Extension C + 0xA6D6, // ꛖ Hangul Jamo Extended-B + 0x10123, // 𐄣 Ancient Greek Numbers + 0x0123, // Cuneiform + 0x10348, // 𐍈 Gothic Letter Hwair + 0x0348, // Ȉ + // Boundary cases + //0x10000, // Minimum 4-byte character + 0x10FFFF // Maximum valid Unicode code point }; // Build test string with mixed character types @@ -637,10 +645,8 @@ static void TestUTF8Compatibility(CuTest* tc) { buffer[len] = 0; for (int32_t i = 0; i < len; i++) { - printf(" Character #%d: U+%04X -> Readback: U+%04X, %s\n", - i, - (unsigned int)testString[i], - (unsigned int)buffer[i], + printf(" Character #%d: U+%04X -> Readback: U+%04X, %s\n", i, + (unsigned int)testString[i], (unsigned int)buffer[i], (testString[i] == buffer[i] ? "Success" : "Failed")); } CuAssertTrue(tc, buffer[3] != testString[3]); // old code cannot parse 4-byte character @@ -655,13 +661,13 @@ static void TestUTF8Compatibility(CuTest* tc) { // Old implementation write (only handles 3 bytes) IndexOutput* output = dir->createOutput("old_write_new_read.dat"); output->writeVInt(testString.length()); - WriteCharsLegacy(output, testString.c_str(), testString.length()); + output->writeSCharsOrigin<TCHAR>(testString.c_str(), testString.length()); output->close(); _CLDELETE(output); // Read using both methods for comparison - std::wstring oldResult; // Old method read result - std::wstring newResult; // New method read result + std::wstring oldResult; // Old method read result + std::wstring newResult; // New method read result // Old method read { @@ -697,37 +703,379 @@ static void TestUTF8Compatibility(CuTest* tc) { _CLDELETE(input); } printf("newResult.size() = %zu\n", newResult.size()); - printf("oldResult.size() = %zu, newResult.size() = %zu\n", oldResult.size(), newResult.size()); + printf("oldResult.size() = %zu, newResult.size() = %zu\n", oldResult.size(), + newResult.size()); // Compare results char lengthErrorMsg[256]; - sprintf(lengthErrorMsg, "Compatibility test length mismatch - Old method: %zu, New method: %zu", + sprintf(lengthErrorMsg, + "Compatibility test length mismatch - Old method: %zu, New method: %zu", oldResult.size(), newResult.size()); CuAssertTrueWithMessage(tc, lengthErrorMsg, oldResult.size() == newResult.size()); - + // Verify each character for (size_t i = 0; i < oldResult.size(); i++) { wchar_t oldChar = oldResult[i]; wchar_t newChar = newResult[i]; - - printf("Character #%zu: Old method: U+%04X, New method: U+%04X, %s, Original: U+%04X\n", - i, - (unsigned int)oldChar, - (unsigned int)newChar, - (oldChar == newChar ? "Match" : "Mismatch"), - (unsigned int)testString[i]); - + + printf("Character #%zu: Old method: U+%04X, New method: U+%04X, %s, Original: U+%04X\n", + i, (unsigned int)oldChar, (unsigned int)newChar, + (oldChar == newChar ? "Match" : "Mismatch"), (unsigned int)testString[i]); + char errorMsg[256]; - sprintf(errorMsg, "Character mismatch - Position: %zu, Old method: U+%04X, New method: U+%04X", - i, (unsigned int)oldChar, (unsigned int)newChar); - + sprintf(errorMsg, + "Character mismatch - Position: %zu, Old method: U+%04X, New method: U+%04X", i, + (unsigned int)oldChar, (unsigned int)newChar); + CuAssertTrueWithMessage(tc, errorMsg, oldChar == newChar); } - - printf("\nCompatibility test: %s\n", - (oldResult == newResult ? "PASSED - Methods are compatible" : "FAILED - Methods are not compatible")); + + printf("\nCompatibility test: %s\n", + (oldResult == newResult ? "PASSED - Methods are compatible" + : "FAILED - Methods are not compatible")); + } + + _CLDELETE(dir); +} + +void print_wstring(const std::wstring& wstr) { + std::wstring_convert<std::codecvt_utf8<wchar_t>> converter; + std::string str = converter.to_bytes(wstr); + std::cout << str << std::endl; +} +// Test STermInfosWriter with various Unicode characters (write and read) +static void TestSTermInfosWriterUnicode(CuTest* tc) { + printf("\n=== Testing STermInfosWriter<TCHAR> with Unicode (Write and Read) ===\n"); + + // Create a RAM directory for testing + Directory* dir = _CLNEW RAMDirectory(); + + // Create field infos + FieldInfos* fieldInfos = _CLNEW FieldInfos(); + fieldInfos->add(_T("content"), false); + + const char* segmentName = "test_unicode"; + + // Define Unicode test cases + struct UnicodeTermTest { + std::wstring str; + const char* description; + }; + + std::vector<UnicodeTermTest> testTerms = { + {L"A", "Basic Latin A (U+0041)"}, + {L"z", "Basic Latin z (U+007A)"}, + {L"©", "Copyright Sign (U+00A9)"}, + {L"é", "Latin Small E with acute (U+00E9)"}, + {L"œ", "Latin Small Ligature OE (U+0153)"}, + {L"Ω", "Greek Capital Omega (U+03A9)"}, + {L"π", "Greek Small Pi (U+03C0)"}, + {L"Я", "Cyrillic Capital Letter Ya (U+042F)"}, + {L"я", "Cyrillic Small Letter Ya (U+044F)"}, + {L"א", "Hebrew Letter Alef (U+05D0)"}, + {L"ا", "Arabic Letter Alef (U+0627)"}, + {L"अ", "Devanagari Letter A (U+0905)"}, + {L"一", "CJK Unified Ideograph (U+4E00)"}, + {L"中", "CJK Unified Ideograph (U+4E2D)"}, + {L"文", "CJK Unified Ideograph (U+6587)"}, + {L"青", "CJK Unified Ideograph (U+9752)"}, + {L"あ", "Hiragana Letter A (U+3042)"}, + {L"ア", "Katakana Letter A (U+30A2)"}, + {L"가", "Hangul Syllable GA (U+AC00)"}, + + {L"∀", "For All (U+2200)"}, + {L"∑", "Summation (U+2211)"}, + {L"∞", "Infinity (U+221E)"}, + {L"≈", "Almost Equal To (U+2248)"}, + {L"€", "Euro (U+20AC)"}, + {L"₽", "Russian Ruble (U+20BD)"}, + {L"₹", "Indian Rupee (U+20B9)"}, + {L"₩", "Won Sign (U+20A9)"}, + {L"←", "Left Arrow (U+2190)"}, + {L"→", "Right Arrow (U+2192)"}, + {L"═", "Box Drawing (U+2550)"}, + {L"║", "Box Drawing (U+2551)"}, + {L"☀", "Black Sun with Rays (U+2600)"}, + {L"♥", "Black Heart Suit (U+2665)"}, + + {L"Hello世界", "Mixed English and Chinese"}, + {L"Café☕", "Latin with accent and emoji"}, + {L"Москва🏙", "Cyrillic with emoji"}, + {L"こんにちは🌸", "Japanese with emoji"}, + {L"안녕하세요🇰🇷", "Korean with flag emoji"}, + {L"αβγδεζηθ", "Greek alphabet sequence"}, + {L"∀x∈ℝ∃y≥x", "Mathematical expression"}, + {L"♠♥♦♣", "Card suits"}, + {L"←↑→↓", "Arrow directions"}, + {L"╔═══╗║ ║╚═══╝", "Box drawing frame"}, + + {L"苹果🍎Apple", "Mixed Chinese, emoji and English"}, + {L"数学∫f(x)dx=F(x)+C", "Mathematical formula with Chinese"}, + {L"こんにちは世界", "Japanese and Chinese"}, + {L"тест тест 测试 테스트", "Mixed Cyrillic, Chinese and Korean"}, + {L"😀😁😂🤣😃😄😅😆", "Multiple emoji sequence"}}; + + // === WRITE PHASE === + printf("\n--- Write Phase ---\n"); + + // Create a TermInfosWriter + STermInfosWriter<TCHAR>* writer = + _CLNEW STermInfosWriter<TCHAR>(dir, segmentName, fieldInfos, 128); + writer->setEnableCorrectTermWrite(true); + + // Add terms to the writer + TermInfo* ti = _CLNEW TermInfo(); + int64_t freqPointer = 0; + int64_t proxPointer = 1000; + + // Store terms for verification + std::vector<Term*> terms; + std::vector<TermInfo*> termInfos; + + // Add all test terms to the writer + for (size_t i = 0; i < testTerms.size(); i++) { + // Create term + Term* term = _CLNEW Term(_T("content"), testTerms[i].str.c_str()); + terms.push_back(term); + + // Create and store term info + TermInfo* currentTi = _CLNEW TermInfo(); + currentTi->docFreq = i + 1; + currentTi->freqPointer = freqPointer; + currentTi->proxPointer = proxPointer; + currentTi->skipOffset = (i % 10 == 0) ? 10 : 0; + termInfos.push_back(currentTi); + + // Add term to writer + TermInfo tempTi; + tempTi.docFreq = currentTi->docFreq; + tempTi.freqPointer = currentTi->freqPointer; + tempTi.proxPointer = currentTi->proxPointer; + tempTi.skipOffset = currentTi->skipOffset; + writer->add(term->field(), term->text(), term->textLength(), &tempTi); + + printf("Added Term #%zu: \"", i); + print_wstring(testTerms[i].str); + printf("\" - %s\n", testTerms[i].description); + + // Increment pointers for next term + freqPointer += 100 + i * 10; + proxPointer += 200 + i * 20; + } + + // Close the writer + writer->close(); + _CLDELETE(writer); + _CLDELETE(ti); + + // === READ PHASE === + printf("\n--- Read Phase ---\n"); + + // Create a TermInfosReader + TermInfosReader* reader = _CLNEW TermInfosReader(dir, segmentName, fieldInfos); + + // Verify each term can be read back correctly + for (size_t i = 0; i < terms.size(); i++) { + Term* originalTerm = terms[i]; + TermInfo* originalInfo = termInfos[i]; + + // Read term info + auto readInfo = reader->get(originalTerm, nullptr); + + printf("Term #%zu: \"", i); + print_wstring(testTerms[i].str); + printf("\" - %s\n", readInfo ? "Found" : "NOT FOUND"); + + if (readInfo) { + // Verify values match + CuAssertTrue(tc, originalInfo->docFreq == readInfo->docFreq); + CuAssertTrue(tc, originalInfo->freqPointer == readInfo->freqPointer); + CuAssertTrue(tc, originalInfo->proxPointer == readInfo->proxPointer); + } + + _CLDELETE(readInfo); + } + + // Verify term enumeration + SegmentTermEnum* termEnum = reader->terms(); + size_t count = 0; + + printf("\n--- Term Enumeration ---\n"); + while (termEnum->next()) { + Term* term = termEnum->term(false); + printf("Enum #%zu: ", count); + print_wstring(term->text()); + CuAssertTrueWithMessage(tc, "Term text mismatch", + _tcscmp(term->text(), testTerms[count].str.c_str()) == 0); + count++; + } + + printf("Total enumerated terms: %zu (expected: %zu)\n", count, terms.size()); + CuAssertTrue(tc, (int)terms.size() == (int)count); + + _CLDELETE(termEnum); + _CLDELETE(reader); + + // Clean up + for (size_t i = 0; i < terms.size(); i++) { + _CLDELETE(terms[i]); + _CLDELETE(termInfos[i]); } + _CLDELETE(fieldInfos); _CLDELETE(dir); + + printf("STermInfosWriter/Reader Unicode test completed successfully\n"); +} + +// Test STermInfosWriter with Unicode characters when correctTermWrite is disabled +static void TestSTermInfosWriterUnicodeDisabled(CuTest* tc) { + printf("\n=== Testing STermInfosWriter<TCHAR> with Unicode (Disabled Correct Term Write) " + "===\n"); + + // Create a RAM directory for testing + Directory* dir = _CLNEW RAMDirectory(); + + // Create field infos + FieldInfos* fieldInfos = _CLNEW FieldInfos(); + fieldInfos->add(_T("content"), false); + + const char* segmentName = "test_unicode_disabled"; + + // Define Unicode test cases - one normal char and one > 0xFFFF + struct UnicodeTermTest { + std::wstring str; + const char* description; + bool shouldCorruptOnRead; + }; + + std::vector<UnicodeTermTest> testTerms = { + {L"中", "CJK Unified Ideograph (U+4E2D)", false}, // Regular Unicode character + {L"𠜎", "CJK Unified Ideograph Extension B (U+2070E)", + true} // Character beyond BMP (> 0xFFFF) + }; + + // === WRITE PHASE === + printf("\n--- Write Phase ---\n"); + + // Create a TermInfosWriter with correctTermWrite disabled + STermInfosWriter<TCHAR>* writer = + _CLNEW STermInfosWriter<TCHAR>(dir, segmentName, fieldInfos, 128); + writer->setEnableCorrectTermWrite(false); // Disable correct term write + + // Add terms to the writer + int64_t freqPointer = 0; + int64_t proxPointer = 1000; + + // Store terms for verification + std::vector<Term*> terms; + std::vector<TermInfo*> termInfos; + std::vector<std::wstring> originalStrings; + + // Add all test terms to the writer + for (size_t i = 0; i < testTerms.size(); i++) { + // Store original string for later comparison + originalStrings.push_back(testTerms[i].str); + + // Create term + Term* term = _CLNEW Term(_T("content"), testTerms[i].str.c_str()); + terms.push_back(term); + + // Create and store term info + TermInfo* currentTi = _CLNEW TermInfo(); + currentTi->docFreq = i + 1; + currentTi->freqPointer = freqPointer; + currentTi->proxPointer = proxPointer; + currentTi->skipOffset = 0; + termInfos.push_back(currentTi); + + // Add term to writer + TermInfo tempTi; + tempTi.docFreq = currentTi->docFreq; + tempTi.freqPointer = currentTi->freqPointer; + tempTi.proxPointer = currentTi->proxPointer; + tempTi.skipOffset = currentTi->skipOffset; + writer->add(term->field(), term->text(), term->textLength(), &tempTi); + + printf("Added Term #%zu: \"", i); + print_wstring(testTerms[i].str); + printf("\" - %s\n", testTerms[i].description); + + // Increment pointers for next term + freqPointer += 100 + i * 10; + proxPointer += 200 + i * 20; + } + + // Close the writer + writer->close(); + _CLDELETE(writer); + + // === READ PHASE === + printf("\n--- Read Phase ---\n"); + + // Create a TermInfosReader + TermInfosReader* reader = _CLNEW TermInfosReader(dir, segmentName, fieldInfos); + + // Verify each term's read behavior + for (size_t i = 0; i < terms.size(); i++) { + Term* originalTerm = terms[i]; + TermInfo* originalInfo = termInfos[i]; + + // Read term info + auto readInfo = reader->get(originalTerm, nullptr); + + printf("Term #%zu: \"", i); + print_wstring(testTerms[i].str); + printf("\" - %s\n", readInfo ? "Found" : "NOT FOUND"); + + if (readInfo) { + // Verify values match + CuAssertTrue(tc, originalInfo->docFreq == readInfo->docFreq); + CuAssertTrue(tc, originalInfo->freqPointer == readInfo->freqPointer); + CuAssertTrue(tc, originalInfo->proxPointer == readInfo->proxPointer); + } + + _CLDELETE(readInfo); + } + + // Verify term enumeration + SegmentTermEnum* termEnum = reader->terms(); + size_t count = 0; + + printf("\n--- Term Enumeration ---\n"); + while (termEnum->next()) { + Term* term = termEnum->term(false); + printf("Enum #%zu: ", count); + print_wstring(term->text()); + + // For regular Unicode characters, they should be preserved correctly + if (!testTerms[count].shouldCorruptOnRead) { + CuAssertTrueWithMessage(tc, "Regular Unicode term text should match", + _tcscmp(term->text(), originalStrings[count].c_str()) == 0); + } else { + // For characters beyond BMP (>0xFFFF), they should NOT match due to disabled correctTermWrite + printf(" - Expected corruption for high Unicode character\n"); + CuAssertTrueWithMessage(tc, "High Unicode term should be corrupted", + _tcscmp(term->text(), originalStrings[count].c_str()) != 0); + } + count++; + } + + printf("Total enumerated terms: %zu (expected: %zu)\n", count, terms.size()); + CuAssertTrue(tc, (int)terms.size() == (int)count); + + _CLDELETE(termEnum); + _CLDELETE(reader); + + // Clean up + for (size_t i = 0; i < terms.size(); i++) { + _CLDELETE(terms[i]); + _CLDELETE(termInfos[i]); + } + + _CLDELETE(fieldInfos); + _CLDELETE(dir); + + printf("STermInfosWriter/Reader Unicode Disabled test completed\n"); } CuSuite* testUTF8CharsSuite() { @@ -740,6 +1088,8 @@ CuSuite* testUTF8CharsSuite() { SUITE_ADD_TEST(suite, TestSpecialUTF8Sequences); SUITE_ADD_TEST(suite, TestUTF8Performance); SUITE_ADD_TEST(suite, TestUTF8Compatibility); + SUITE_ADD_TEST(suite, TestSTermInfosWriterUnicode); + SUITE_ADD_TEST(suite, TestSTermInfosWriterUnicodeDisabled); return suite; } \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org