This is an automated email from the ASF dual-hosted git repository. jianliangqi pushed a commit to branch clucene in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push: new 04ed43c3 [optimize](reader) optimize the tii, tis file structure (#146) 04ed43c3 is described below commit 04ed43c3c70f2c976e95260b07f08b197e1b40ae Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com> AuthorDate: Thu Dec 7 20:30:35 2023 +0800 [optimize](reader) optimize the tii, tis file structure (#146) --- src/core/CLucene/index/SegmentTermEnum.cpp | 85 +++++++++++++++++++++--------- src/core/CLucene/index/TermInfosReader.cpp | 27 ++++++---- src/core/CLucene/index/TermInfosWriter.cpp | 15 ++++-- src/core/CLucene/index/_SegmentTermEnum.h | 15 ++++-- src/core/CLucene/index/_TermInfosWriter.h | 7 +-- src/core/CLucene/store/IndexInput.h | 4 +- 6 files changed, 102 insertions(+), 51 deletions(-) diff --git a/src/core/CLucene/index/SegmentTermEnum.cpp b/src/core/CLucene/index/SegmentTermEnum.cpp index 1383451c..574d9396 100644 --- a/src/core/CLucene/index/SegmentTermEnum.cpp +++ b/src/core/CLucene/index/SegmentTermEnum.cpp @@ -17,7 +17,7 @@ CL_NS_USE(store) CL_NS_DEF(index) - SegmentTermEnum::SegmentTermEnum(IndexInput* i, FieldInfos* fis, const bool isi): + SegmentTermEnum::SegmentTermEnum(IndexInput* i, FieldInfos* fis, const bool isi, int32_t in_format): fieldInfos(fis){ //Func - Constructor //Pre - i holds a reference to an instance of IndexInput @@ -40,8 +40,8 @@ CL_NS_DEF(index) //Set isClone to false as the instance is not clone of another instance isClone = false; + int32_t firstInt = in_format == -4 ? in_format : input->readInt(); - int32_t firstInt = input->readInt(); if (firstInt >= 0) { // original-format file, without explicit format version number format = 0; @@ -62,30 +62,47 @@ CL_NS_DEF(index) _CLTHROWT(CL_ERR_CorruptIndex,err); } - size = input->readLong(); // read the size - if (size < 0) { // read the size at file footer, if size < 0 - auto pos = input->getFilePointer(); - input->seek(input->length() - 8); - size = input->readLong(); - input->seek(pos); - } - - if(format == -1){ - if (!isIndex) { - indexInterval = input->readInt(); - formatM1SkipInterval = input->readInt(); - } - // switch off skipTo optimization for file format prior to 1.4rc2 in order to avoid a bug in - // skipTo implementation of these versions - skipInterval = LUCENE_INT32_MAX_SHOULDBE; - }else{ - indexInterval = input->readInt(); - skipInterval = input->readInt(); - if ( format == -3 ) { - // this new format introduces multi-level skipping - maxSkipLevels = input->readInt(); - } - } + if (format == -4) { + if (isIndex) { + size = input->readLong(); + if (size < 0) { + auto pos = input->getFilePointer(); + input->seek(input->length() - 16); + size = input->readLong(); + tisSize = input->readLong(); + input->seek(pos); + } + + indexInterval = input->readInt(); + skipInterval = input->readInt(); + maxSkipLevels = input->readInt(); + } + } else { + size = input->readLong(); // read the size + if (size < 0) { // read the size at file footer, if size < 0 + auto pos = input->getFilePointer(); + input->seek(input->length() - 8); + size = input->readLong(); + input->seek(pos); + } + + if(format == -1){ + if (!isIndex) { + indexInterval = input->readInt(); + formatM1SkipInterval = input->readInt(); + } + // switch off skipTo optimization for file format prior to 1.4rc2 in order to avoid a bug in + // skipTo implementation of these versions + skipInterval = LUCENE_INT32_MAX_SHOULDBE; + }else{ + indexInterval = input->readInt(); + skipInterval = input->readInt(); + if ( format == -3 ) { + // this new format introduces multi-level skipping + maxSkipLevels = input->readInt(); + } + } + } } } @@ -113,6 +130,7 @@ CL_NS_DEF(index) bufferLength = clone.bufferLength; prev = clone.prev==NULL?NULL:_CLNEW Term(clone.prev->field(),clone.prev->text(),false); size = clone.size; + tisSize = clone.tisSize; format = clone.format; indexInterval= clone.indexInterval; @@ -156,6 +174,21 @@ CL_NS_DEF(index) } } + void SegmentTermEnum::initByTii(SegmentTermEnum* tii) { + if (format == -4) { + size = tii->tisSize; + indexInterval = tii->indexInterval; + skipInterval = tii->skipInterval; + maxSkipLevels = tii->maxSkipLevels; + size_t header = sizeof(format) + + sizeof(size) + + sizeof(indexInterval) + + sizeof(skipInterval) + + sizeof(maxSkipLevels); + input->seek(header); + } + } + const char* SegmentTermEnum::getObjectName() const{ return getClassName(); } const char* SegmentTermEnum::getClassName(){ return "SegmentTermEnum"; } diff --git a/src/core/CLucene/index/TermInfosReader.cpp b/src/core/CLucene/index/TermInfosReader.cpp index b28bb7ee..7996d4d4 100644 --- a/src/core/CLucene/index/TermInfosReader.cpp +++ b/src/core/CLucene/index/TermInfosReader.cpp @@ -50,20 +50,25 @@ CL_NS_DEF(index) indexIsRead = false; try { - //Create an SegmentTermEnum for storing all the terms read of the segment - origEnum = _CLNEW SegmentTermEnum( directory->openInput( tisFile.c_str(), readBufferSize ), fieldInfos, false); - _size = origEnum->size; - totalIndexInterval = origEnum->indexInterval; - indexEnum = _CLNEW SegmentTermEnum( directory->openInput( tiiFile.c_str(), readBufferSize ), fieldInfos, true); + //Create an SegmentTermEnum for storing all the terms read of the segment - //Check if enumerator points to a valid instance - CND_CONDITION(origEnum != NULL, "No memory could be allocated for orig enumerator"); - CND_CONDITION(indexEnum != NULL, "No memory could be allocated for index enumerator"); + // tii + auto tiiStream = directory->openInput( tiiFile.c_str(), readBufferSize ); + indexEnum = _CLNEW SegmentTermEnum(tiiStream, fieldInfos, true, -1); + CND_CONDITION(indexEnum != NULL, "No memory could be allocated for index enumerator"); - //call ensureIndexIsRead to load data to memory right now - ensureIndexIsRead(); + // tis + auto tisStream = directory->openInput( tisFile.c_str(), readBufferSize ); + origEnum = _CLNEW SegmentTermEnum(tisStream, fieldInfos, false, indexEnum->getFormat()); + origEnum->initByTii(indexEnum); + CND_CONDITION(origEnum != NULL, "No memory could be allocated for index enumerator"); + _size = origEnum->size; + totalIndexInterval = origEnum->indexInterval; - success = true; + //call ensureIndexIsRead to load data to memory right now + ensureIndexIsRead(); + + success = true; } _CLFINALLY({ // With lock-less commits, it's entirely possible (and // fine) to hit a FileNotFound exception above. In diff --git a/src/core/CLucene/index/TermInfosWriter.cpp b/src/core/CLucene/index/TermInfosWriter.cpp index 32b6a100..b6a45d7d 100644 --- a/src/core/CLucene/index/TermInfosWriter.cpp +++ b/src/core/CLucene/index/TermInfosWriter.cpp @@ -159,9 +159,18 @@ void STermInfosWriter<T>::add(int32_t fieldNumber, const T *termText, int32_t te template <typename T> void STermInfosWriter<T>::close() { if (output) { - //write size at start - //output->seek(4); // write size after format - output->writeLong(size);// do not seek now, directly write size at file footer + if (FORMAT == -4) { + output->writeLong(size); + if (!isIndex) { + other->tisSize = size; + } else { + output->writeLong(tisSize); + } + } else { + //write size at start + //output->seek(4); // write size after format + output->writeLong(size);// do not seek now, directly write size at file footer + } output->close(); _CLDELETE(output); diff --git a/src/core/CLucene/index/_SegmentTermEnum.h b/src/core/CLucene/index/_SegmentTermEnum.h index a2559082..b5fa419d 100644 --- a/src/core/CLucene/index/_SegmentTermEnum.h +++ b/src/core/CLucene/index/_SegmentTermEnum.h @@ -34,13 +34,14 @@ private: CL_NS(store)::IndexInput* input; ///The IndexInput that reads from the Term Infos File FieldInfos* fieldInfos; ///contains the Field Infos for the segment - int64_t size; ///The size of the enumeration + int64_t size = 0; ///The size of the enumeration + int64_t tisSize = 0; int64_t position; ///The position of the current (term) in the enumeration int64_t indexPointer; Term* prev; ///The previous current - int32_t indexInterval; - int32_t skipInterval; - int32_t maxSkipLevels; + int32_t indexInterval = 0; + int32_t skipInterval = 0; + int32_t maxSkipLevels = 0; friend class TermInfosReader; friend class SegmentTermDocs; @@ -54,11 +55,13 @@ protected: public: ///Constructor - SegmentTermEnum(CL_NS(store)::IndexInput* i, FieldInfos* fis, const bool isi ); + SegmentTermEnum(CL_NS(store)::IndexInput* i, FieldInfos* fis, const bool isi, int32_t in_format = -1); ///Destructor ~SegmentTermEnum(); + void initByTii(SegmentTermEnum* tii); + /** * Moves the current of the set to the next in the set */ @@ -117,6 +120,8 @@ public: const char* getObjectName() const; static const char* getClassName(); + int32_t getFormat() { return format; } + private: /** * Reads the next term in the enumeration diff --git a/src/core/CLucene/index/_TermInfosWriter.h b/src/core/CLucene/index/_TermInfosWriter.h index 2bd7713a..3acc2abe 100644 --- a/src/core/CLucene/index/_TermInfosWriter.h +++ b/src/core/CLucene/index/_TermInfosWriter.h @@ -24,7 +24,8 @@ private: FieldInfos *fieldInfos; CL_NS(store)::IndexOutput *output; TermInfo *lastTi; - int64_t size; + int64_t size = 0; + int64_t tisSize = 0; int64_t lastIndexPointer; bool isIndex; @@ -44,7 +45,7 @@ private: public: int32_t maxSkipLevels; - LUCENE_STATIC_CONSTANT(int32_t, FORMAT = -3); + LUCENE_STATIC_CONSTANT(int32_t, FORMAT = -4); LUCENE_STATIC_CONSTANT(int32_t, DEFAULT_TERMDOCS_SKIP_INTERVAL = PFOR_BLOCK_SIZE); int32_t indexInterval;// = 128 @@ -96,7 +97,7 @@ public: int32_t maxSkipLevels; /** The file format version, a negative number. */ - LUCENE_STATIC_CONSTANT(int32_t, FORMAT = -3); + LUCENE_STATIC_CONSTANT(int32_t, FORMAT = -4); //Expert: The fraction of {@link TermDocs} entries stored in skip tables, //used to accellerate {@link TermDocs#skipTo(int)}. Larger values result in diff --git a/src/core/CLucene/store/IndexInput.h b/src/core/CLucene/store/IndexInput.h index 4d5b24d8..e17f9eb3 100644 --- a/src/core/CLucene/store/IndexInput.h +++ b/src/core/CLucene/store/IndexInput.h @@ -132,9 +132,7 @@ CL_NS_DEF(store) virtual const char* getObjectName() const = 0; short readShort(); - virtual void setIdxFileCache(bool index) { - _CLTHROWA(CL_ERR_UnsupportedOperation,"UnsupportedOperationException: IndexInput::setIdxFileCache"); - } + virtual void setIdxFileCache(bool index) {} }; --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org