This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git


The following commit(s) were added to refs/heads/clucene by this push:
     new 4caf1086 [improvement](keyword) keyword type uses the SDocument process (#97)
4caf1086 is described below

commit 4caf10866a7a35358d19e3831298c4a6b29d62a8
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Thu Jul 6 13:09:07 2023 +0800

    [improvement](keyword) keyword type uses the SDocument process (#97)
---
 src/core/CLucene/index/SDocumentWriter.cpp |  6 +--
 src/core/CLucene/index/TermInfosWriter.cpp | 69 ++++++++++++++++++++++++------
 src/core/CLucene/store/IndexOutput.cpp     | 14 ++++++
 src/core/CLucene/store/IndexOutput.h       |  2 +
 src/core/CLucene/util/stringUtil.h         | 20 +++++++++
 5 files changed, 96 insertions(+), 15 deletions(-)

diff --git a/src/core/CLucene/index/SDocumentWriter.cpp b/src/core/CLucene/index/SDocumentWriter.cpp
index 3b22fdad..33da7a67 100644
--- a/src/core/CLucene/index/SDocumentWriter.cpp
+++ b/src/core/CLucene/index/SDocumentWriter.cpp
@@ -739,8 +739,8 @@ int32_t SDocumentsWriter<T>::ThreadState::comparePostings(Posting *p1, Posting *
     const T *pos1 = scharPool->buffers[p1->textStart >> CHAR_BLOCK_SHIFT] + 
(p1->textStart & CHAR_BLOCK_MASK);
     const T *pos2 = scharPool->buffers[p2->textStart >> CHAR_BLOCK_SHIFT] + 
(p2->textStart & CHAR_BLOCK_MASK);
     while (true) {
-        const T c1 = *pos1++;
-        const T c2 = *pos2++;
+        const auto c1 = static_cast<typename 
std::make_unsigned<T>::type>(*pos1++);
+        const auto c2 = static_cast<typename 
std::make_unsigned<T>::type>(*pos2++);
         if (c1 < c2)
             if (CLUCENE_END_OF_WORD == c2)
                 return 1;
@@ -753,8 +753,8 @@ int32_t SDocumentsWriter<T>::ThreadState::comparePostings(Posting *p1, Posting *
                 return 1;
         else if (CLUCENE_END_OF_WORD == c1)
             return 0;
+        }
     }
-}
 
 template<typename T>
 void SDocumentsWriter<T>::ThreadState::quickSort(Posting **postings, int32_t 
lo, int32_t hi) {
diff --git a/src/core/CLucene/index/TermInfosWriter.cpp b/src/core/CLucene/index/TermInfosWriter.cpp
index 6b9060ec..6a457407 100644
--- a/src/core/CLucene/index/TermInfosWriter.cpp
+++ b/src/core/CLucene/index/TermInfosWriter.cpp
@@ -15,6 +15,7 @@
 #include "_FieldInfos.h"
 #include "_TermInfosWriter.h"
 #include <assert.h>
+#include <iostream>
 
 CL_NS_USE(util)
 CL_NS_USE(store)
@@ -176,20 +177,64 @@ void STermInfosWriter<T>::close() {
 
 template <typename T>
 void STermInfosWriter<T>::writeTerm(int32_t fieldNumber, const T *termText, 
int32_t termTextLength) { // Writes one term entry: shared-prefix length, delta length, delta chars, then the field number.
-    int32_t start = 0;
-    const int32_t limit = termTextLength < lastTermTextLength ? termTextLength 
: lastTermTextLength;
-    while (start < limit) {
-        if (termText[start] != lastTermText.values[start])
-            break;
-        start++;
-    }
+    if constexpr (std::is_same_v<T, char>) { // char terms: the two lengths written below count UTF-8 sequences, not bytes
+        std::string_view utf8Str(termText, termTextLength);
+        int32_t utf8Length = 0; // number of UTF-8 sequences in the whole term
+        {
+            size_t i = 0;
+            for (; i < utf8Str.size();) {
+                int32_t n = StringUtil::utf8_byte_count(utf8Str[i]); // NOTE(review): the lookup table holds -1 for some byte values; confirm termText is always valid UTF-8
+                i += n; // NOTE(review): with n <= 0 this does not advance; only the assert after the loop checks the outcome
+                utf8Length++;
+            }
+            assert(i == utf8Str.size()); // the last sequence must end exactly at the end of the term (debug builds only)
+        }
 
-    int32_t length = termTextLength - start;
+        int32_t start = 0; // shared prefix length in bytes
+        int32_t utf8Start = 0; // shared prefix length in UTF-8 sequences
+        int32_t limit = termTextLength < lastTermTextLength ? termTextLength : 
lastTermTextLength;
+        auto prefixCompare = [this, &utf8Str, &termText](int32_t& start, 
int32_t& utf8Start, int32_t limit) { // advances start and utf8Start over the sequences shared with lastTermText
+            while (start < limit) {
+                int32_t n = StringUtil::utf8_byte_count(utf8Str[start]); // byte length of the sequence beginning at start
+                for (int32_t j = 0; j < n; j++) {
+                    int32_t cur = start + j; // NOTE(review): cur can reach start + n - 1, which is not bounded by limit; confirm both buffers are long enough
+                    if (termText[cur] != lastTermText.values[cur]) {
+                        return; // mismatch inside this sequence: the counters stay at the last fully matched sequence
+                    }
+                }
+                start += n; // NOTE(review): does not advance when n <= 0
+                utf8Start++;
+            }
+        };
+
+        prefixCompare(start, utf8Start, limit);
+        assert(start <= termTextLength);
+        assert(utf8Start <= utf8Length);
+        int32_t length = termTextLength - start; // bytes after the shared prefix
+        utf8Length -= utf8Start; // sequences after the shared prefix
+
+        // std::cout << "term: " << utf8Str << ", utf8Start: " << utf8Start << 
", utf8Length: " << utf8Length << ", length: " << length << std::endl;
+
+        output->writeVInt(utf8Start); // write shared prefix length, in sequences
+        output->writeVInt(utf8Length); // write delta length, in sequences
+        output->writeU8SChars(termText + start, length); // write delta bytes; length is a byte count here
+        output->writeVInt(fieldNumber); // write field num
+    } else { // any other T: element-wise prefix comparison, lengths counted in elements of T
+        int32_t start = 0;
+        const int32_t limit = termTextLength < lastTermTextLength ? 
termTextLength : lastTermTextLength;
+        while (start < limit) {
+            if (termText[start] != lastTermText.values[start])
+                break;
+            start++;
+        }
 
-    output->writeVInt(start);                    // write shared prefix length
-    output->writeVInt(length);                   // write delta length
-    output->writeSChars(termText + start, length);// write delta chars
-    output->writeVInt(fieldNumber);              // write field num
+        int32_t length = termTextLength - start;
+
+        output->writeVInt(start);                    // write shared prefix 
length
+        output->writeVInt(length);                   // write delta length
+        output->writeSChars(termText + start, length);// write delta chars
+        output->writeVInt(fieldNumber);              // write field num
+    }
 }
 
 template class STermInfosWriter<char>;
diff --git a/src/core/CLucene/store/IndexOutput.cpp b/src/core/CLucene/store/IndexOutput.cpp
index 92fd4d9c..1d44aff1 100644
--- a/src/core/CLucene/store/IndexOutput.cpp
+++ b/src/core/CLucene/store/IndexOutput.cpp
@@ -8,6 +8,7 @@
 #include "IndexOutput.h"
 #include "IndexInput.h"
 #include "CLucene/util/Misc.h"
+#include "CLucene/util/stringUtil.h"
 
 CL_NS_USE(util)
 CL_NS_DEF(store)
@@ -185,6 +186,19 @@ CL_NS_DEF(store)
       writeBytes((const uint8_t*)s, length);
   }
 
+  void IndexOutput::writeU8SChars(const char* s, const int32_t length) { // Writes the bytes of s[0, length) one UTF-8 sequence at a time via writeBytes.
+    if ( length < 0 ) // a negative byte count is rejected through _CLTHROWA with CL_ERR_IllegalArgument
+      _CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a 
positive value.");
+
+    for (int32_t i = 0; i < length;) { // i is a byte offset into s; it advances by one whole sequence per iteration
+      auto* chars = (const uint8_t*)s + i; // first byte of the current sequence
+      int32_t n = StringUtil::utf8_byte_count(*chars); // sequence length looked up from that first byte
+      assert(n >= 1 && n <= 4); // NOTE(review): assert is compiled out under NDEBUG; with n <= 0 the i += n below does not advance. n is also not checked against length - i. Confirm callers always pass complete, valid UTF-8.
+      writeBytes(chars, (n > 2 ? 3 : n)); // at most 3 bytes are written per sequence - NOTE(review): a 4-byte sequence loses its last byte; presumably matches the reader, confirm
+      i += n; // advances by the full sequence length even when only 3 bytes were written
+    }
+  }
+
   void IndexOutput::writeChars(const TCHAR* s, const int32_t length){
     if ( length < 0 )
       _CLTHROWA(CL_ERR_IllegalArgument, "IO Argument Error. Value must be a 
positive value.");
diff --git a/src/core/CLucene/store/IndexOutput.h b/src/core/CLucene/store/IndexOutput.h
index 6b6ca321..e8eff025 100644
--- a/src/core/CLucene/store/IndexOutput.h
+++ b/src/core/CLucene/store/IndexOutput.h
@@ -83,6 +83,8 @@ public:
        void writeChars(const TCHAR* s, const int32_t length);
     template<typename T>
     void writeSChars(const T* s, int32_t length);
+       
+       void writeU8SChars(const char* s, const int32_t length);
 
     /** Closes this stream to further operations. */
        virtual void close() = 0;
diff --git a/src/core/CLucene/util/stringUtil.h b/src/core/CLucene/util/stringUtil.h
index da0547d6..5a715445 100644
--- a/src/core/CLucene/util/stringUtil.h
+++ b/src/core/CLucene/util/stringUtil.h
@@ -203,6 +203,26 @@ public:
     }
 
 #endif
+
+    static inline int32_t utf8_byte_count(uint8_t c) { // Returns the table entry for byte value c: a sequence length of 1 to 4, or -1 for byte values the table marks as invalid first bytes.
+        static constexpr int32_t LUT[256] = { // indexed by byte value, 17 entries per row
+            2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, // NOTE(review): index 0x00 maps to 2 while 0x01-0x7F map to 1 - confirm this is intentional
+            1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+            1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+            1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+            1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+            1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+            1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
+            1,  1,  1,  1,  1,  1,  1,  1,  1,  -1, -1, -1, -1, -1, -1, -1, -1, // the -1 run starts at index 0x80
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+            -1, -1, -1, -1, -1, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, // the 2 run starts at index 0xC0
+            2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
+            2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3, // the 3 run starts at index 0xE0
+            3,  3,  4,  4,  4,  4,  4,  4,  4,  4,  -1, -1, -1, -1, -1, -1, 
-1}; // the 4 run starts at 0xF0 and -1 resumes at 0xF8. NOTE(review): by my count there are 255 initializers for 256 slots, so LUT[0xFF] would be value-initialized to 0 rather than -1 - confirm against the repository
+        return LUT[c]; // callers add this value to a byte offset, so a result <= 0 does not move them forward
+    }
 };
 
 #endif//_lucene_util__stringutil_H


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to