This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch clucene in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push: new 4caf1086 [improvement](keyword) keyword type uses the SDocument process (#97) 4caf1086 is described below commit 4caf10866a7a35358d19e3831298c4a6b29d62a8 Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com> AuthorDate: Thu Jul 6 13:09:07 2023 +0800 [improvement](keyword) keyword type uses the SDocument process (#97) --- src/core/CLucene/index/SDocumentWriter.cpp | 6 +-- src/core/CLucene/index/TermInfosWriter.cpp | 69 ++++++++++++++++++++++++------ src/core/CLucene/store/IndexOutput.cpp | 14 ++++++ src/core/CLucene/store/IndexOutput.h | 2 + src/core/CLucene/util/stringUtil.h | 20 +++++++++ 5 files changed, 96 insertions(+), 15 deletions(-) diff --git a/src/core/CLucene/index/SDocumentWriter.cpp b/src/core/CLucene/index/SDocumentWriter.cpp index 3b22fdad..33da7a67 100644 --- a/src/core/CLucene/index/SDocumentWriter.cpp +++ b/src/core/CLucene/index/SDocumentWriter.cpp @@ -739,8 +739,8 @@ int32_t SDocumentsWriter<T>::ThreadState::comparePostings(Posting *p1, Posting * const T *pos1 = scharPool->buffers[p1->textStart >> CHAR_BLOCK_SHIFT] + (p1->textStart & CHAR_BLOCK_MASK); const T *pos2 = scharPool->buffers[p2->textStart >> CHAR_BLOCK_SHIFT] + (p2->textStart & CHAR_BLOCK_MASK); while (true) { - const T c1 = *pos1++; - const T c2 = *pos2++; + const auto c1 = static_cast<typename std::make_unsigned<T>::type>(*pos1++); + const auto c2 = static_cast<typename std::make_unsigned<T>::type>(*pos2++); if (c1 < c2) if (CLUCENE_END_OF_WORD == c2) return 1; @@ -753,8 +753,8 @@ int32_t SDocumentsWriter<T>::ThreadState::comparePostings(Posting *p1, Posting * return 1; else if (CLUCENE_END_OF_WORD == c1) return 0; + } } -} template<typename T> void SDocumentsWriter<T>::ThreadState::quickSort(Posting **postings, int32_t lo, int32_t hi) { diff --git a/src/core/CLucene/index/TermInfosWriter.cpp b/src/core/CLucene/index/TermInfosWriter.cpp index 6b9060ec..6a457407 100644 --- a/src/core/CLucene/index/TermInfosWriter.cpp +++ b/src/core/CLucene/index/TermInfosWriter.cpp @@ -15,6 +15,7 @@ #include "_FieldInfos.h" #include "_TermInfosWriter.h" #include <assert.h> +#include <iostream> CL_NS_USE(util) CL_NS_USE(store) @@ -176,20 +177,64 @@ void STermInfosWriter<T>::close() { template <typename T> void STermInfosWriter<T>::writeTerm(int32_t fieldNumber, const T *termText, int32_t termTextLength) { - int32_t start = 0; - const int32_t limit = termTextLength < lastTermTextLength ? termTextLength : lastTermTextLength; - while (start < limit) { - if (termText[start] != lastTermText.values[start]) - break; - start++; - } + if constexpr (std::is_same_v<T, char>) { + std::string_view utf8Str(termText, termTextLength); + int32_t utf8Length = 0; + { + size_t i = 0; + for (; i < utf8Str.size();) { + int32_t n = StringUtil::utf8_byte_count(utf8Str[i]); + i += n; + utf8Length++; + } + assert(i == utf8Str.size()); + } - int32_t length = termTextLength - start; + int32_t start = 0; + int32_t utf8Start = 0; + int32_t limit = termTextLength < lastTermTextLength ? termTextLength : lastTermTextLength; + auto prefixCompare = [this, &utf8Str, &termText](int32_t& start, int32_t& utf8Start, int32_t limit) { + while (start < limit) { + int32_t n = StringUtil::utf8_byte_count(utf8Str[start]); + for (int32_t j = 0; j < n; j++) { + int32_t cur = start + j; + if (termText[cur] != lastTermText.values[cur]) { + return; + } + } + start += n; + utf8Start++; + } + }; + + prefixCompare(start, utf8Start, limit); + assert(start <= termTextLength); + assert(utf8Start <= utf8Length); + int32_t length = termTextLength - start; + utf8Length -= utf8Start; + + // std::cout << "term: " << utf8Str << ", utf8Start: " << utf8Start << ", utf8Length: " << utf8Length << ", length: " << length << std::endl; + + output->writeVInt(utf8Start); + output->writeVInt(utf8Length); + output->writeU8SChars(termText + start, length); + output->writeVInt(fieldNumber); + } else { + int32_t start = 0; + const int32_t limit = termTextLength < lastTermTextLength ? termTextLength : lastTermTextLength; + while (start < limit) { + if (termText[start] != lastTermText.values[start]) + break; + start++; + } - output->writeVInt(start); // write shared prefix length - output->writeVInt(length); // write delta length - output->writeSChars(termText + start, length);// write delta chars - output->writeVInt(fieldNumber); // write field num + int32_t length = termTextLength - start; + + output->writeVInt(start); // write shared prefix length + output->writeVInt(length); // write delta length + output->writeSChars(termText + start, length);// write delta chars + output->writeVInt(fieldNumber); // write field num + } } template class STermInfosWriter<char>; diff --git a/src/core/CLucene/store/IndexOutput.cpp b/src/core/CLucene/store/IndexOutput.cpp index 92fd4d9c..1d44aff1 100644 --- a/src/core/CLucene/store/IndexOutput.cpp +++ b/src/core/CLucene/store/IndexOutput.cpp @@ -8,6 +8,7 @@ #include "IndexOutput.h" #include "IndexInput.h" #include "CLucene/util/Misc.h" +#include "CLucene/util/stringUtil.h" CL_NS_USE(util) CL_NS_DEF(store) @@ -185,6 +186,19 @@ CL_NS_DEF(store) writeBytes((const uint8_t*)s, length); } + void IndexOutput::writeU8SChars(const char* s, const int32_t length) { + if ( length < 0 ) + _CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a positive value."); + + for (int32_t i = 0; i < length;) { + auto* chars = (const uint8_t*)s + i; + int32_t n = StringUtil::utf8_byte_count(*chars); + assert(n >= 1 && n <= 4); + writeBytes(chars, (n > 2 ? 3 : n)); + i += n; + } + } + void IndexOutput::writeChars(const TCHAR* s, const int32_t length){ if ( length < 0 ) _CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a positive value."); diff --git a/src/core/CLucene/store/IndexOutput.h b/src/core/CLucene/store/IndexOutput.h index 6b6ca321..e8eff025 100644 --- a/src/core/CLucene/store/IndexOutput.h +++ b/src/core/CLucene/store/IndexOutput.h @@ -83,6 +83,8 @@ public: void writeChars(const TCHAR* s, const int32_t length); template<typename T> void writeSChars(const T* s, int32_t length); + + void writeU8SChars(const char* s, const int32_t length); /** Closes this stream to further operations. */ virtual void close() = 0; diff --git a/src/core/CLucene/util/stringUtil.h b/src/core/CLucene/util/stringUtil.h index da0547d6..5a715445 100644 --- a/src/core/CLucene/util/stringUtil.h +++ b/src/core/CLucene/util/stringUtil.h @@ -203,6 +203,26 @@ public: } #endif + + static inline int32_t utf8_byte_count(uint8_t c) { + static constexpr int32_t LUT[256] = { + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, -1, -1, -1, -1, -1, -1, -1}; + return LUT[c]; + } }; #endif//_lucene_util__stringutil_H --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org