This is an automated email from the ASF dual-hosted git repository.

airborne pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git


The following commit(s) were added to refs/heads/clucene by this push:
     new 3aa17be9e4a [fix](unicode) fix 4 bytes unicode read and write bug (#255)
3aa17be9e4a is described below

commit 3aa17be9e4a496e7e8ddf9e114e56addf8b536c2
Author: airborne12 <jiang...@selectdb.com>
AuthorDate: Thu Nov 28 11:08:06 2024 +0800

    [fix](unicode) fix 4 bytes unicode read and write bug (#255)
    
    * [fix](unicode) fix 4 bytes unicode read and write bug
---
 src/core/CLucene/store/IndexInput.cpp  | 22 ++++++------
 src/core/CLucene/store/IndexOutput.cpp | 62 ++++++++++++++++++++++------------
 2 files changed, 51 insertions(+), 33 deletions(-)

diff --git a/src/core/CLucene/store/IndexInput.cpp b/src/core/CLucene/store/IndexInput.cpp
index 930b16392ae..82c5165e53c 100644
--- a/src/core/CLucene/store/IndexInput.cpp
+++ b/src/core/CLucene/store/IndexInput.cpp
@@ -135,23 +135,23 @@ CL_NS_USE(util)
     for (int32_t i = start; i < end; ++i) {
       b = readByte();
       if ((b & 0x80) == 0) {
+        // 1-byte sequence: 0xxxxxxx
         b = (b & 0x7F);
-      } else if ((b & 0xE0) != 0xE0) {
-        b = (((b & 0x1F) << 6)
-          | (readByte() & 0x3F));
-      } else {
-                 b = ((b & 0x0F) << 12) | ((readByte() & 0x3F) << 6);
-                 b |= (readByte() & 0x3F);
+      } else if ((b & 0xE0) == 0xC0) {
+        // 2-byte sequence: 110xxxxx 10xxxxxx
+        b = (((b & 0x1F) << 6) | (readByte() & 0x3F));
+      } else if ((b & 0xF0) == 0xE0) {
+          // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
+          b = ((b & 0x0F) << 12) | ((readByte() & 0x3F) << 6) | (readByte() & 0x3F);
+      } else if ((b & 0xF8) == 0xF0) {
+          // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+          b = ((b & 0x07) << 18) | ((readByte() & 0x3F) << 12) |
+              ((readByte() & 0x3F) << 6) | (readByte() & 0x3F);
       }
       buffer[i] = b;
        }
   }
 
-
-
-
-
-
 BufferedIndexInput::BufferedIndexInput(int32_t _bufferSize):
                buffer(NULL),
                bufferSize(_bufferSize>=0?_bufferSize:CL_NS(store)::BufferedIndexOutput::BUFFER_SIZE),
diff --git a/src/core/CLucene/store/IndexOutput.cpp b/src/core/CLucene/store/IndexOutput.cpp
index 77c37400d8e..f28ca07b3c3 100644
--- a/src/core/CLucene/store/IndexOutput.cpp
+++ b/src/core/CLucene/store/IndexOutput.cpp
@@ -165,16 +165,25 @@ CL_NS_DEF(store)
 
       const int32_t end = length;
       for (int32_t i = 0; i < end; ++i) {
-          const int32_t code = (int32_t)s[i];
-          if (code >= 0x01 && code <= 0x7F)
+          auto code = (uint32_t)s[i];
+          if (code >= 0x00 && code <= 0x7F) {
               writeByte((uint8_t)code);
-          else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) {
+          } else if (code <= 0x7FF) {
               writeByte((uint8_t)(0xC0 | (code >> 6)));
               writeByte((uint8_t)(0x80 | (code & 0x3F)));
-          } else {
-              writeByte((uint8_t)(0xE0 | (((uint32_t)code) >> 12))); //unsigned shift
+          } else if (code <= 0xFFFF) {
+              writeByte((uint8_t)(0xE0 | (code >> 12)));
+              writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F)));
+              writeByte((uint8_t)(0x80 | (code & 0x3F)));
+          } else if (code <= 0x10FFFF) {
+              writeByte((uint8_t)(0xF0 | (code >> 18)));
+              writeByte((uint8_t)(0x80 | ((code >> 12) & 0x3F)));
               writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F)));
               writeByte((uint8_t)(0x80 | (code & 0x3F)));
+          } else {
+              writeByte(0xEF);
+              writeByte(0xBF);
+              writeByte(0xBD);
           }
       }
   }
@@ -188,23 +197,32 @@ CL_NS_DEF(store)
   }
 
   void IndexOutput::writeChars(const TCHAR* s, const int32_t length){
-    if ( length < 0 )
-      _CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a positive value.");
-
-    const int32_t end = length;
-    for (int32_t i = 0; i < end; ++i) {
-        const int32_t code = (int32_t)s[i];
-        if (code >= 0x01 && code <= 0x7F)
-                                       writeByte((uint8_t)code);
-        else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) {
-                                       writeByte((uint8_t)(0xC0 | (code >> 6)));
-                                       writeByte((uint8_t)(0x80 | (code & 0x3F)));
-        } else {
-                                       writeByte((uint8_t)(0xE0 | (((uint32_t)code) >> 12))); //unsigned shift
-                                       writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F)));
-                                       writeByte((uint8_t)(0x80 | (code & 0x3F)));
-        }
-    }
+      if ( length < 0 )
+          _CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a positive value.");
+
+      const int32_t end = length;
+      for (int32_t i = 0; i < end; ++i) {
+          auto code = (uint32_t)s[i];
+          if (code >= 0x00 && code <= 0x7F) {
+              writeByte((uint8_t)code);
+          } else if (code <= 0x7FF) {
+              writeByte((uint8_t)(0xC0 | (code >> 6)));
+              writeByte((uint8_t)(0x80 | (code & 0x3F)));
+          } else if (code <= 0xFFFF) {
+              writeByte((uint8_t)(0xE0 | (code >> 12)));
+              writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F)));
+              writeByte((uint8_t)(0x80 | (code & 0x3F)));
+          } else if (code <= 0x10FFFF) {
+              writeByte((uint8_t)(0xF0 | (code >> 18)));
+              writeByte((uint8_t)(0x80 | ((code >> 12) & 0x3F)));
+              writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F)));
+              writeByte((uint8_t)(0x80 | (code & 0x3F)));
+          } else {
+              writeByte(0xEF);
+              writeByte(0xBF);
+              writeByte(0xBD);
+          }
+      }
   }
 
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to