This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git


The following commit(s) were added to refs/heads/clucene by this push:
     new 04ed43c3 [optimize](reader) optimize the tii, tis file structure (#146)
04ed43c3 is described below

commit 04ed43c3c70f2c976e95260b07f08b197e1b40ae
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Thu Dec 7 20:30:35 2023 +0800

    [optimize](reader) optimize the tii, tis file structure (#146)
---
 src/core/CLucene/index/SegmentTermEnum.cpp | 85 +++++++++++++++++++++---------
 src/core/CLucene/index/TermInfosReader.cpp | 27 ++++++----
 src/core/CLucene/index/TermInfosWriter.cpp | 15 ++++--
 src/core/CLucene/index/_SegmentTermEnum.h  | 15 ++++--
 src/core/CLucene/index/_TermInfosWriter.h  |  7 +--
 src/core/CLucene/store/IndexInput.h        |  4 +-
 6 files changed, 102 insertions(+), 51 deletions(-)

diff --git a/src/core/CLucene/index/SegmentTermEnum.cpp 
b/src/core/CLucene/index/SegmentTermEnum.cpp
index 1383451c..574d9396 100644
--- a/src/core/CLucene/index/SegmentTermEnum.cpp
+++ b/src/core/CLucene/index/SegmentTermEnum.cpp
@@ -17,7 +17,7 @@
 CL_NS_USE(store)
 CL_NS_DEF(index)
 
-       SegmentTermEnum::SegmentTermEnum(IndexInput* i, FieldInfos* fis, const 
bool isi):
+       SegmentTermEnum::SegmentTermEnum(IndexInput* i, FieldInfos* fis, const 
bool isi, int32_t in_format):
                fieldInfos(fis){
        //Func - Constructor
        //Pre  - i holds a reference to an instance of IndexInput
@@ -40,8 +40,8 @@ CL_NS_DEF(index)
                //Set isClone to false as the instance is not clone of another 
instance
                isClone      = false;
 
+               int32_t firstInt = in_format == -4 ? in_format : 
input->readInt();
 
-               int32_t firstInt = input->readInt();
     if (firstInt >= 0) {
          // original-format file, without explicit format version number
          format = 0;
@@ -62,30 +62,47 @@ CL_NS_DEF(index)
             _CLTHROWT(CL_ERR_CorruptIndex,err);
          }
 
-         size = input->readLong();                    // read the size
-         if (size < 0) {                              // read the size at file 
footer, if size < 0
-             auto pos = input->getFilePointer();
-             input->seek(input->length() - 8);
-             size = input->readLong();
-             input->seek(pos);
-         }
-         
-         if(format == -1){
-            if (!isIndex) {
-               indexInterval = input->readInt();
-               formatM1SkipInterval = input->readInt();
-            }
-            // switch off skipTo optimization for file format prior to 1.4rc2 
in order to avoid a bug in 
-            // skipTo implementation of these versions
-            skipInterval = LUCENE_INT32_MAX_SHOULDBE;
-         }else{
-            indexInterval = input->readInt();
-            skipInterval = input->readInt();
-            if ( format == -3 ) {
-               // this new format introduces multi-level skipping
-               maxSkipLevels = input->readInt();
-            }
-         }
+                                if (format == -4) {
+                                               if (isIndex) {
+                                                       size = 
input->readLong();
+                                                       if (size < 0) {
+                                                               auto pos = 
input->getFilePointer();
+                                                               
input->seek(input->length() - 16);
+                                                               size = 
input->readLong();
+                                                               tisSize = 
input->readLong();
+                                                               
input->seek(pos);
+                                                       }
+                                                       
+                                                       indexInterval = 
input->readInt();
+                                                       skipInterval = 
input->readInt();
+                                                       maxSkipLevels = 
input->readInt();
+                                               }
+                                } else {
+                                               size = input->readLong();       
             // read the size
+                                               if (size < 0) {                 
             // read the size at file footer, if size < 0
+                                                               auto pos = 
input->getFilePointer();
+                                                               
input->seek(input->length() - 8);
+                                                               size = 
input->readLong();
+                                                               
input->seek(pos);
+                                               }
+                                               
+                                               if(format == -1){
+                                                               if (!isIndex) {
+                                                                       
indexInterval = input->readInt();
+                                                                       
formatM1SkipInterval = input->readInt();
+                                                               }
+                                                               // switch off 
skipTo optimization for file format prior to 1.4rc2 in order to avoid a bug in 
+                                                               // skipTo 
implementation of these versions
+                                                               skipInterval = 
LUCENE_INT32_MAX_SHOULDBE;
+                                               }else{
+                                                               indexInterval = 
input->readInt();
+                                                               skipInterval = 
input->readInt();
+                                                               if ( format == 
-3 ) {
+                                                                       // this 
new format introduces multi-level skipping
+                                                                       
maxSkipLevels = input->readInt();
+                                                               }
+                                               }
+                                }
       }
        }
 
@@ -113,6 +130,7 @@ CL_NS_DEF(index)
                bufferLength = clone.bufferLength;
                prev         = clone.prev==NULL?NULL:_CLNEW 
Term(clone.prev->field(),clone.prev->text(),false);
                size         = clone.size;
+               tisSize     = clone.tisSize;
 
       format       = clone.format;
       indexInterval= clone.indexInterval;
@@ -156,6 +174,21 @@ CL_NS_DEF(index)
                        }
        }
 
+       //Func - Initializes this enumeration from another enumeration (tii) instead of
+       //       reading the values from this enumeration's own input
+       //Pre  - tii is dereferenced without a NULL check, so it must be a valid instance
+       //Post - If format is -4: size, indexInterval, skipInterval and maxSkipLevels have
+       //       been copied from tii and input has been positioned at offset `header`.
+       //       For any other format nothing is changed.
+       void SegmentTermEnum::initByTii(SegmentTermEnum* tii) {
+               if (format == -4) {
+                       // The size of this enumeration is taken from tii->tisSize, which the
+                       // constructor reads from the end of the index input (format -4, isIndex,
+                       // negative leading size field)
+                       size = tii->tisSize;
+                       indexInterval = tii->indexInterval;
+                       skipInterval = tii->skipInterval;
+                       maxSkipLevels = tii->maxSkipLevels;
+                       // header is the sum of the in-memory sizes of the five members below.
+                       // NOTE(review): presumably this equals the on-disk header length of the
+                       // tis file that is being skipped - confirm against the writer
+                       size_t header = sizeof(format) +
+                                                                               
        sizeof(size) +
+                                                                               
        sizeof(indexInterval) + 
+                                                                               
        sizeof(skipInterval) + 
+                                                                               
        sizeof(maxSkipLevels);
+                       input->seek(header);
+               }
+       }
+
        const char* SegmentTermEnum::getObjectName() const{ return 
getClassName(); }
        const char* SegmentTermEnum::getClassName(){ return "SegmentTermEnum"; }
 
diff --git a/src/core/CLucene/index/TermInfosReader.cpp 
b/src/core/CLucene/index/TermInfosReader.cpp
index b28bb7ee..7996d4d4 100644
--- a/src/core/CLucene/index/TermInfosReader.cpp
+++ b/src/core/CLucene/index/TermInfosReader.cpp
@@ -50,20 +50,25 @@ CL_NS_DEF(index)
           indexIsRead = false;
 
          try {
-                 //Create an SegmentTermEnum for storing all the terms read of 
the segment
-                 origEnum = _CLNEW SegmentTermEnum( directory->openInput( 
tisFile.c_str(), readBufferSize ), fieldInfos, false);
-                 _size =  origEnum->size;
-                 totalIndexInterval = origEnum->indexInterval;
-                 indexEnum = _CLNEW SegmentTermEnum( directory->openInput( 
tiiFile.c_str(), readBufferSize ), fieldInfos, true);
+      //Create an SegmentTermEnum for storing all the terms read of the segment
 
-                 //Check if enumerator points to a valid instance
-                 CND_CONDITION(origEnum != NULL, "No memory could be allocated 
for orig enumerator");
-                 CND_CONDITION(indexEnum != NULL, "No memory could be 
allocated for index enumerator");
+      // tii
+      auto tiiStream = directory->openInput( tiiFile.c_str(), readBufferSize );
+      indexEnum = _CLNEW SegmentTermEnum(tiiStream, fieldInfos, true, -1);
+      CND_CONDITION(indexEnum != NULL, "No memory could be allocated for index 
enumerator");
 
-                 //call ensureIndexIsRead to load data to memory right now
-                 ensureIndexIsRead();
+      // tis
+      auto tisStream = directory->openInput( tisFile.c_str(), readBufferSize );
+      origEnum = _CLNEW SegmentTermEnum(tisStream, fieldInfos, false, 
indexEnum->getFormat());
+      origEnum->initByTii(indexEnum);
+      CND_CONDITION(origEnum != NULL, "No memory could be allocated for index 
enumerator");
+      _size = origEnum->size;
+      totalIndexInterval = origEnum->indexInterval;
 
-                 success = true;
+      //call ensureIndexIsRead to load data to memory right now
+      ensureIndexIsRead();
+
+      success = true;
          } _CLFINALLY({
                  // With lock-less commits, it's entirely possible (and
                  // fine) to hit a FileNotFound exception above. In
diff --git a/src/core/CLucene/index/TermInfosWriter.cpp 
b/src/core/CLucene/index/TermInfosWriter.cpp
index 32b6a100..b6a45d7d 100644
--- a/src/core/CLucene/index/TermInfosWriter.cpp
+++ b/src/core/CLucene/index/TermInfosWriter.cpp
@@ -159,9 +159,18 @@ void STermInfosWriter<T>::add(int32_t fieldNumber, const T 
*termText, int32_t te
 template <typename T>
 void STermInfosWriter<T>::close() {
     if (output) {
-        //write size at start
-        //output->seek(4);          // write size after format
-        output->writeLong(size);// do not seek now, directly write size at 
file footer
+        if (FORMAT == -4) {
+            output->writeLong(size);
+            if (!isIndex) {
+                other->tisSize = size;
+            } else {
+                output->writeLong(tisSize);
+            }
+        } else {
+            //write size at start
+            //output->seek(4);          // write size after format
+            output->writeLong(size);// do not seek now, directly write size at 
file footer
+        }
         output->close();
         _CLDELETE(output);
 
diff --git a/src/core/CLucene/index/_SegmentTermEnum.h 
b/src/core/CLucene/index/_SegmentTermEnum.h
index a2559082..b5fa419d 100644
--- a/src/core/CLucene/index/_SegmentTermEnum.h
+++ b/src/core/CLucene/index/_SegmentTermEnum.h
@@ -34,13 +34,14 @@ private:
 
        CL_NS(store)::IndexInput* input;    ///The IndexInput that reads from 
the Term Infos File
        FieldInfos* fieldInfos; ///contains the Field Infos for the segment
-       int64_t size;                   ///The size of the enumeration
+       int64_t size = 0;                       ///The size of the enumeration
+       int64_t tisSize = 0;
        int64_t position;               ///The position of the current (term) 
in the enumeration
        int64_t indexPointer;
        Term* prev;                             ///The previous current
-       int32_t indexInterval;
-       int32_t skipInterval;
-       int32_t maxSkipLevels;
+       int32_t indexInterval = 0;
+       int32_t skipInterval = 0;
+       int32_t maxSkipLevels = 0;
 
        friend class TermInfosReader;
        friend class SegmentTermDocs;
@@ -54,11 +55,13 @@ protected:
 
 public:
        ///Constructor
-       SegmentTermEnum(CL_NS(store)::IndexInput* i, FieldInfos* fis, const 
bool isi );
+       SegmentTermEnum(CL_NS(store)::IndexInput* i, FieldInfos* fis, const 
bool isi, int32_t in_format = -1);
 
        ///Destructor
        ~SegmentTermEnum();
 
+       void initByTii(SegmentTermEnum* tii);
+
        /**
         * Moves the current of the set to the next in the set
         */
@@ -117,6 +120,8 @@ public:
        const char* getObjectName() const;
        static const char* getClassName();
 
+       ///Returns the current value of the format member of this enumeration
+       int32_t getFormat() { return format; }
+
 private:
        /**
         * Reads the next term in the enumeration
diff --git a/src/core/CLucene/index/_TermInfosWriter.h 
b/src/core/CLucene/index/_TermInfosWriter.h
index 2bd7713a..3acc2abe 100644
--- a/src/core/CLucene/index/_TermInfosWriter.h
+++ b/src/core/CLucene/index/_TermInfosWriter.h
@@ -24,7 +24,8 @@ private:
     FieldInfos *fieldInfos;
     CL_NS(store)::IndexOutput *output;
     TermInfo *lastTi;
-    int64_t size;
+    int64_t size = 0;
+    int64_t tisSize = 0;
 
     int64_t lastIndexPointer;
     bool isIndex;
@@ -44,7 +45,7 @@ private:
 public:
     int32_t maxSkipLevels;
 
-    LUCENE_STATIC_CONSTANT(int32_t, FORMAT = -3);
+    LUCENE_STATIC_CONSTANT(int32_t, FORMAT = -4);
     LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_TERMDOCS_SKIP_INTERVAL = 
PFOR_BLOCK_SIZE);
 
     int32_t indexInterval;// = 128
@@ -96,7 +97,7 @@ public:
     int32_t maxSkipLevels;
 
     /** The file format version, a negative number. */
-    LUCENE_STATIC_CONSTANT(int32_t, FORMAT = -3);
+    LUCENE_STATIC_CONSTANT(int32_t, FORMAT = -4);
 
     //Expert: The fraction of {@link TermDocs} entries stored in skip tables,
     //used to accellerate {@link TermDocs#skipTo(int)}.  Larger values result 
in
diff --git a/src/core/CLucene/store/IndexInput.h 
b/src/core/CLucene/store/IndexInput.h
index 4d5b24d8..e17f9eb3 100644
--- a/src/core/CLucene/store/IndexInput.h
+++ b/src/core/CLucene/store/IndexInput.h
@@ -132,9 +132,7 @@ CL_NS_DEF(store)
                virtual const char* getObjectName() const = 0;
         short readShort();
 
-               virtual void setIdxFileCache(bool index) {
-                               
_CLTHROWA(CL_ERR_UnsupportedOperation,"UnsupportedOperationException: 
IndexInput::setIdxFileCache");
-               }
+               // Default implementation does nothing; the index argument is ignored.
+               // NOTE(review): presumably overridden by inputs that support an index file cache - confirm
+               virtual void setIdxFileCache(bool index) {}
 
     };
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to