This is an automated email from the ASF dual-hosted git repository. airborne pushed a commit to branch clucene in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push: new 3aa17be9e4a [fix](unicode) fix 4 bytes unicode read and write bug (#255) 3aa17be9e4a is described below commit 3aa17be9e4a496e7e8ddf9e114e56addf8b536c2 Author: airborne12 <jiang...@selectdb.com> AuthorDate: Thu Nov 28 11:08:06 2024 +0800 [fix](unicode) fix 4 bytes unicode read and write bug (#255) * [fix](unicode) fix 4 bytes unicode read and write bug --- src/core/CLucene/store/IndexInput.cpp | 22 ++++++------ src/core/CLucene/store/IndexOutput.cpp | 62 ++++++++++++++++++++++------------ 2 files changed, 51 insertions(+), 33 deletions(-) diff --git a/src/core/CLucene/store/IndexInput.cpp b/src/core/CLucene/store/IndexInput.cpp index 930b16392ae..82c5165e53c 100644 --- a/src/core/CLucene/store/IndexInput.cpp +++ b/src/core/CLucene/store/IndexInput.cpp @@ -135,23 +135,23 @@ CL_NS_USE(util) for (int32_t i = start; i < end; ++i) { b = readByte(); if ((b & 0x80) == 0) { + // 1-byte sequence: 0xxxxxxx b = (b & 0x7F); - } else if ((b & 0xE0) != 0xE0) { - b = (((b & 0x1F) << 6) - | (readByte() & 0x3F)); - } else { - b = ((b & 0x0F) << 12) | ((readByte() & 0x3F) << 6); - b |= (readByte() & 0x3F); + } else if ((b & 0xE0) == 0xC0) { + // 2-byte sequence: 110xxxxx 10xxxxxx + b = (((b & 0x1F) << 6) | (readByte() & 0x3F)); + } else if ((b & 0xF0) == 0xE0) { + // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx + b = ((b & 0x0F) << 12) | ((readByte() & 0x3F) << 6) | (readByte() & 0x3F); + } else if ((b & 0xF8) == 0xF0) { + // 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + b = ((b & 0x07) << 18) | ((readByte() & 0x3F) << 12) | + ((readByte() & 0x3F) << 6) | (readByte() & 0x3F); } buffer[i] = b; } } - - - - - BufferedIndexInput::BufferedIndexInput(int32_t _bufferSize): buffer(NULL), bufferSize(_bufferSize>=0?_bufferSize:CL_NS(store)::BufferedIndexOutput::BUFFER_SIZE), diff --git a/src/core/CLucene/store/IndexOutput.cpp b/src/core/CLucene/store/IndexOutput.cpp index 77c37400d8e..f28ca07b3c3 100644 --- a/src/core/CLucene/store/IndexOutput.cpp +++ b/src/core/CLucene/store/IndexOutput.cpp @@ -165,16 +165,25 @@ CL_NS_DEF(store) const int32_t end = length; for (int32_t i = 0; i < end; ++i) { - const int32_t code = (int32_t)s[i]; - if (code >= 0x01 && code <= 0x7F) + auto code = (uint32_t)s[i]; + if (code >= 0x00 && code <= 0x7F) { writeByte((uint8_t)code); - else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { + } else if (code <= 0x7FF) { writeByte((uint8_t)(0xC0 | (code >> 6))); writeByte((uint8_t)(0x80 | (code & 0x3F))); - } else { - writeByte((uint8_t)(0xE0 | (((uint32_t)code) >> 12))); //unsigned shift + } else if (code <= 0xFFFF) { + writeByte((uint8_t)(0xE0 | (code >> 12))); + writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F))); + writeByte((uint8_t)(0x80 | (code & 0x3F))); + } else if (code <= 0x10FFFF) { + writeByte((uint8_t)(0xF0 | (code >> 18))); + writeByte((uint8_t)(0x80 | ((code >> 12) & 0x3F))); writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F))); writeByte((uint8_t)(0x80 | (code & 0x3F))); + } else { + writeByte(0xEF); + writeByte(0xBF); + writeByte(0xBD); } } } @@ -188,23 +197,32 @@ CL_NS_DEF(store) } void IndexOutput::writeChars(const TCHAR* s, const int32_t length){ - if ( length < 0 ) - _CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a positive value."); - - const int32_t end = length; - for (int32_t i = 0; i < end; ++i) { - const int32_t code = (int32_t)s[i]; - if (code >= 0x01 && code <= 0x7F) - writeByte((uint8_t)code); - else if (((code >= 0x80) && (code <= 0x7FF)) || code == 0) { - writeByte((uint8_t)(0xC0 | (code >> 6))); - writeByte((uint8_t)(0x80 | (code & 0x3F))); - } else { - writeByte((uint8_t)(0xE0 | (((uint32_t)code) >> 12))); //unsigned shift - writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F))); - writeByte((uint8_t)(0x80 | (code & 0x3F))); - } - } + if ( length < 0 ) + _CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a positive value."); + + const int32_t end = length; + for (int32_t i = 0; i < end; ++i) { + auto code = (uint32_t)s[i]; + if (code >= 0x00 && code <= 0x7F) { + writeByte((uint8_t)code); + } else if (code <= 0x7FF) { + writeByte((uint8_t)(0xC0 | (code >> 6))); + writeByte((uint8_t)(0x80 | (code & 0x3F))); + } else if (code <= 0xFFFF) { + writeByte((uint8_t)(0xE0 | (code >> 12))); + writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F))); + writeByte((uint8_t)(0x80 | (code & 0x3F))); + } else if (code <= 0x10FFFF) { + writeByte((uint8_t)(0xF0 | (code >> 18))); + writeByte((uint8_t)(0x80 | ((code >> 12) & 0x3F))); + writeByte((uint8_t)(0x80 | ((code >> 6) & 0x3F))); + writeByte((uint8_t)(0x80 | (code & 0x3F))); + } else { + writeByte(0xEF); + writeByte(0xBF); + writeByte(0xBD); + } + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org