This is an automated email from the ASF dual-hosted git repository. jianliangqi pushed a commit to branch clucene-2.0 in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene-2.0 by this push: new ab319e51 [fix](memory leak) fix MultiPhraseQuery memory leak (#176) ab319e51 is described below commit ab319e51dfabdf85e6ba33f8d1754cacff8c5cd8 Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com> AuthorDate: Wed Jan 10 14:37:07 2024 +0800 [fix](memory leak) fix MultiPhraseQuery memory leak (#176) * [Fix](memory leak) fix memory leak found in fault injection case (#170) * [Fix](multi segment) fix multisegment doc overflow (#174) * [fix](MultiPhrase) fix MultiPhraseQuery memory leak (#175) --------- Co-authored-by: airborne12 <airborn...@gmail.com> --- src/core/CLucene/index/MultiSegmentReader.cpp | 4 + src/core/CLucene/index/SegmentInfos.cpp | 3 + src/core/CLucene/index/SegmentReader.cpp | 7 +- src/core/CLucene/index/SegmentTermEnum.cpp | 801 +++++++++++++------------- src/core/CLucene/index/TermInfosReader.cpp | 737 ++++++++++++------------ src/core/CLucene/index/_SegmentTermEnum.h | 3 +- src/core/CLucene/search/MultiPhraseQuery.cpp | 4 +- src/core/CLucene/store/IndexOutput.cpp | 13 +- src/test/CMakeLists.txt | 1 + src/test/query/TestMultiPhraseQuery.cpp | 163 ++++++ src/test/test.h | 1 + src/test/tests.cpp | 1 + 12 files changed, 950 insertions(+), 788 deletions(-) diff --git a/src/core/CLucene/index/MultiSegmentReader.cpp b/src/core/CLucene/index/MultiSegmentReader.cpp index ad37807e..b4be5f01 100644 --- a/src/core/CLucene/index/MultiSegmentReader.cpp +++ b/src/core/CLucene/index/MultiSegmentReader.cpp @@ -561,6 +561,10 @@ int32_t MultiTermDocs::docFreq() { int32_t MultiTermDocs::doc() const { CND_PRECONDITION(current!=NULL,"current==NULL, check that next() was called"); + // if not found term, current will return INT_MAX, we could not add base, otherwise it will overflow. + if (current->doc() == LUCENE_INT32_MAX_SHOULDBE) { + return LUCENE_INT32_MAX_SHOULDBE; + } return base + current->doc(); } int32_t MultiTermDocs::freq() const { diff --git a/src/core/CLucene/index/SegmentInfos.cpp b/src/core/CLucene/index/SegmentInfos.cpp index 03532129..60a36954 100644 --- a/src/core/CLucene/index/SegmentInfos.cpp +++ b/src/core/CLucene/index/SegmentInfos.cpp @@ -826,6 +826,9 @@ string SegmentInfo::segString(Directory* dir) { // Try not to leave a truncated segments_N file in // the index: directory->deleteFile(segmentFileName.c_str()); + if (output != nullptr) { + _CLDELETE(output); + } } ) ) diff --git a/src/core/CLucene/index/SegmentReader.cpp b/src/core/CLucene/index/SegmentReader.cpp index f7741a9f..72126366 100644 --- a/src/core/CLucene/index/SegmentReader.cpp +++ b/src/core/CLucene/index/SegmentReader.cpp @@ -257,7 +257,12 @@ SegmentReader *SegmentReader::get(Directory *dir, SegmentInfo *si, instance->init(dir, sis, closeDir); // TODO: make this configurable... bool fieldsReaderExist = false; - instance->initialize(si, readBufferSize == -1 ? BufferedIndexInput::BUFFER_SIZE : readBufferSize, doOpenStores, fieldsReaderExist); + try { + instance->initialize(si, readBufferSize == -1 ? BufferedIndexInput::BUFFER_SIZE : readBufferSize, doOpenStores, fieldsReaderExist); + } catch (CLuceneError& e) { + _CLDELETE(instance) + throw e; + } return instance; } diff --git a/src/core/CLucene/index/SegmentTermEnum.cpp b/src/core/CLucene/index/SegmentTermEnum.cpp index 574d9396..8179c7b7 100644 --- a/src/core/CLucene/index/SegmentTermEnum.cpp +++ b/src/core/CLucene/index/SegmentTermEnum.cpp @@ -5,424 +5,419 @@ * the GNU Lesser General Public License, as specified in the COPYING file. ------------------------------------------------------------------------------*/ #include "CLucene/_ApiHeader.h" -#include "_SegmentHeader.h" -#include "_SegmentTermEnum.h" - +#include "Term.h" #include "Terms.h" #include "_FieldInfos.h" -#include "Term.h" +#include "_SegmentHeader.h" +#include "_SegmentTermEnum.h" #include "_TermInfo.h" #include "_TermInfosWriter.h" CL_NS_USE(store) CL_NS_DEF(index) - SegmentTermEnum::SegmentTermEnum(IndexInput* i, FieldInfos* fis, const bool isi, int32_t in_format): - fieldInfos(fis){ - //Func - Constructor - //Pre - i holds a reference to an instance of IndexInput - // fis holds a reference to an instance of FieldInfos - // isi - //Post - An instance of SegmentTermEnum has been created - input = i; - position = -1; - //Instantiate a Term with empty field, empty text and which is interned (see term.h what interned means) - _term = _CLNEW Term; - isIndex = isi; - termInfo = _CLNEW TermInfo(); - indexPointer = 0; - buffer = NULL; - bufferLength = 0; - prev = NULL; - formatM1SkipInterval = 0; - maxSkipLevels = 1; - - //Set isClone to false as the instance is not clone of another instance - isClone = false; - - int32_t firstInt = in_format == -4 ? in_format : input->readInt(); +SegmentTermEnum::SegmentTermEnum(IndexInput* i, FieldInfos* fis, const bool isi) + : fieldInfos(fis) { + //Func - Constructor + //Pre - i holds a reference to an instance of IndexInput + // fis holds a reference to an instance of FieldInfos + // isi + //Post - An instance of SegmentTermEnum has been created + input = i; + position = -1; + //Instantiate a Term with empty field, empty text and which is interned (see term.h what interned means) + _term = _CLNEW Term; + isIndex = isi; + termInfo = _CLNEW TermInfo(); + indexPointer = 0; + buffer = NULL; + bufferLength = 0; + prev = NULL; + formatM1SkipInterval = 0; + maxSkipLevels = 1; + + //Set isClone to false as the instance is not clone of another instance + isClone = false; +} + +void SegmentTermEnum::init(int32_t in_format) { + int32_t firstInt = in_format == -4 ? in_format : input->readInt(); if (firstInt >= 0) { - // original-format file, without explicit format version number - format = 0; - size = firstInt; + // original-format file, without explicit format version number + format = 0; + size = firstInt; - // back-compatible settings - indexInterval = 128; - skipInterval = LUCENE_INT32_MAX_SHOULDBE; // switch off skipTo optimization + // back-compatible settings + indexInterval = 128; + skipInterval = LUCENE_INT32_MAX_SHOULDBE; // switch off skipTo optimization - } else { - // we have a format version number - format = firstInt; + } else { + // we have a format version number + format = firstInt; - // check that it is a format we can understand - if (format < TermInfosWriter::FORMAT){ + // check that it is a format we can understand + if (format < TermInfosWriter::FORMAT) { TCHAR err[30]; - _sntprintf(err,30,_T("Unknown format version: %d"), format); - _CLTHROWT(CL_ERR_CorruptIndex,err); - } - - if (format == -4) { - if (isIndex) { - size = input->readLong(); - if (size < 0) { - auto pos = input->getFilePointer(); - input->seek(input->length() - 16); - size = input->readLong(); - tisSize = input->readLong(); - input->seek(pos); - } - - indexInterval = input->readInt(); - skipInterval = input->readInt(); - maxSkipLevels = input->readInt(); - } - } else { - size = input->readLong(); // read the size - if (size < 0) { // read the size at file footer, if size < 0 - auto pos = input->getFilePointer(); - input->seek(input->length() - 8); - size = input->readLong(); - input->seek(pos); - } - - if(format == -1){ - if (!isIndex) { - indexInterval = input->readInt(); - formatM1SkipInterval = input->readInt(); - } - // switch off skipTo optimization for file format prior to 1.4rc2 in order to avoid a bug in - // skipTo implementation of these versions - skipInterval = LUCENE_INT32_MAX_SHOULDBE; - }else{ - indexInterval = input->readInt(); - skipInterval = input->readInt(); - if ( format == -3 ) { - // this new format introduces multi-level skipping - maxSkipLevels = input->readInt(); - } - } - } - } - } - - SegmentTermEnum::SegmentTermEnum(const SegmentTermEnum& clone): - fieldInfos(clone.fieldInfos) - { - //Func - Constructor - // The instance is created by cloning all properties of clone - //Pre - clone holds a valid reference to SegmentTermEnum - //Post - An instance of SegmentTermEnum with the same properties as clone - - input = clone.input->clone(); - //Copy the postion from the clone - position = clone.position; - - if ( clone._term != NULL ){ - _term = _CLNEW Term; - _term->set(clone._term,clone._term->text()); - }else - _term = NULL; - isIndex = clone.isIndex; - termInfo = _CLNEW TermInfo(clone.termInfo); - indexPointer = clone.indexPointer; - buffer = clone.buffer==NULL?NULL:(TCHAR*)malloc(sizeof(TCHAR) * (clone.bufferLength+1)); - bufferLength = clone.bufferLength; - prev = clone.prev==NULL?NULL:_CLNEW Term(clone.prev->field(),clone.prev->text(),false); - size = clone.size; - tisSize = clone.tisSize; - - format = clone.format; - indexInterval= clone.indexInterval; - skipInterval = clone.skipInterval; - formatM1SkipInterval = clone.formatM1SkipInterval; - maxSkipLevels = clone.maxSkipLevels; - - //Set isClone to true as this instance is a clone of another instance - isClone = true; - - //Copy the contents of buffer of clone to the buffer of this instance - if ( clone.buffer != NULL ) - memcpy(buffer,clone.buffer,bufferLength * sizeof(TCHAR)); - } - - SegmentTermEnum::~SegmentTermEnum(){ - //Func - Destructor - //Pre - true - //Post - The instance has been destroyed. If this instance was a clone - // then the inputstream is closed and deleted too. - - //todo: revisit this... close() should clean up most of everything. - - //Finalize prev - _CLDECDELETE(prev ); - //Finalize term - _CLDECDELETE( _term ); - - - //Delete the buffer if necessary - if ( buffer != NULL ) free(buffer); - //Delete termInfo if necessary - _CLDELETE(termInfo); - - //Check if this instance is a clone - if ( isClone ){ - //Close the inputstream - input->close(); - //delete the inputstream - _CLDELETE(input); - } - } - - void SegmentTermEnum::initByTii(SegmentTermEnum* tii) { - if (format == -4) { - size = tii->tisSize; - indexInterval = tii->indexInterval; - skipInterval = tii->skipInterval; - maxSkipLevels = tii->maxSkipLevels; - size_t header = sizeof(format) + - sizeof(size) + - sizeof(indexInterval) + - sizeof(skipInterval) + - sizeof(maxSkipLevels); - input->seek(header); - } - } - - const char* SegmentTermEnum::getObjectName() const{ return getClassName(); } - const char* SegmentTermEnum::getClassName(){ return "SegmentTermEnum"; } - - bool SegmentTermEnum::next(){ - //Func - Moves the current of the set to the next in the set - //Pre - true - //Post - If the end has been reached NULL is returned otherwise the term has - // become the next Term in the enumeration - - //Increase position by and and check if the end has been reached - if (position++ >= size-1) { - //delete term - _CLDECDELETE(_term); - return false; - } - - //delete the previous enumerated term - Term* tmp=NULL; - if ( prev != NULL ){ - if ( _LUCENE_ATOMIC_INT_GET(prev->__cl_refcount) > 1 ){ - _CLDECDELETE(prev); //todo: tune other places try and delete its term - }else - tmp = prev; //we are going to re-use this term - } - //prev becomes the current enumerated term - prev = _term; - //term becomes the next term read from inputStream input - _term = readTerm(tmp); - - //Read docFreq, the number of documents which contain the term. - termInfo->docFreq = input->readVInt(); - //Read freqPointer, a pointer into the TermFreqs file (.frq) - termInfo->freqPointer += input->readVLong(); - - //Read proxPointer, a pointer into the TermPosition file (.prx). - termInfo->proxPointer += input->readVLong(); - - if(format == -1){ - // just read skipOffset in order to increment file pointer; - // value is never used since skipTo is switched off - if (!isIndex) { + _sntprintf(err, 30, _T("Unknown format version: %d"), format); + _CLTHROWT(CL_ERR_CorruptIndex, err); + } + + if (format == -4) { + if (isIndex) { + size = input->readLong(); + if (size < 0) { + auto pos = input->getFilePointer(); + input->seek(input->length() - 16); + size = input->readLong(); + tisSize = input->readLong(); + input->seek(pos); + } + + indexInterval = input->readInt(); + skipInterval = input->readInt(); + maxSkipLevels = input->readInt(); + } + } else { + size = input->readLong(); // read the size + if (size < 0) { // read the size at file footer, if size < 0 + auto pos = input->getFilePointer(); + input->seek(input->length() - 8); + size = input->readLong(); + input->seek(pos); + } + + if (format == -1) { + if (!isIndex) { + indexInterval = input->readInt(); + formatM1SkipInterval = input->readInt(); + } + // switch off skipTo optimization for file format prior to 1.4rc2 in order to avoid a bug in + // skipTo implementation of these versions + skipInterval = LUCENE_INT32_MAX_SHOULDBE; + } else { + indexInterval = input->readInt(); + skipInterval = input->readInt(); + if (format == -3) { + // this new format introduces multi-level skipping + maxSkipLevels = input->readInt(); + } + } + } + } +} + +SegmentTermEnum::SegmentTermEnum(const SegmentTermEnum& clone) : fieldInfos(clone.fieldInfos) { + //Func - Constructor + // The instance is created by cloning all properties of clone + //Pre - clone holds a valid reference to SegmentTermEnum + //Post - An instance of SegmentTermEnum with the same properties as clone + + input = clone.input->clone(); + //Copy the postion from the clone + position = clone.position; + + if (clone._term != NULL) { + _term = _CLNEW Term; + _term->set(clone._term, clone._term->text()); + } else + _term = NULL; + isIndex = clone.isIndex; + termInfo = _CLNEW TermInfo(clone.termInfo); + indexPointer = clone.indexPointer; + buffer = clone.buffer == NULL ? NULL : (TCHAR*)malloc(sizeof(TCHAR) * (clone.bufferLength + 1)); + bufferLength = clone.bufferLength; + prev = clone.prev == NULL ? NULL : _CLNEW Term(clone.prev->field(), clone.prev->text(), false); + size = clone.size; + tisSize = clone.tisSize; + + format = clone.format; + indexInterval = clone.indexInterval; + skipInterval = clone.skipInterval; + formatM1SkipInterval = clone.formatM1SkipInterval; + maxSkipLevels = clone.maxSkipLevels; + + //Set isClone to true as this instance is a clone of another instance + isClone = true; + + //Copy the contents of buffer of clone to the buffer of this instance + if (clone.buffer != NULL) memcpy(buffer, clone.buffer, bufferLength * sizeof(TCHAR)); +} + +SegmentTermEnum::~SegmentTermEnum() { + //Func - Destructor + //Pre - true + //Post - The instance has been destroyed. If this instance was a clone + // then the inputstream is closed and deleted too. + + //todo: revisit this... close() should clean up most of everything. + + //Finalize prev + _CLDECDELETE(prev); + //Finalize term + _CLDECDELETE(_term); + + //Delete the buffer if necessary + if (buffer != NULL) free(buffer); + //Delete termInfo if necessary + _CLDELETE(termInfo); + + //Check if this instance is a clone + if (isClone) { + //Close the inputstream + input->close(); + //delete the inputstream + _CLDELETE(input); + } +} + +void SegmentTermEnum::initByTii(SegmentTermEnum* tii) { + if (format == -4) { + size = tii->tisSize; + indexInterval = tii->indexInterval; + skipInterval = tii->skipInterval; + maxSkipLevels = tii->maxSkipLevels; + size_t header = sizeof(format) + sizeof(size) + sizeof(indexInterval) + + sizeof(skipInterval) + sizeof(maxSkipLevels); + input->seek(header); + } +} + +const char* SegmentTermEnum::getObjectName() const { + return getClassName(); +} +const char* SegmentTermEnum::getClassName() { + return "SegmentTermEnum"; +} + +bool SegmentTermEnum::next() { + //Func - Moves the current of the set to the next in the set + //Pre - true + //Post - If the end has been reached NULL is returned otherwise the term has + // become the next Term in the enumeration + + //Increase position by and and check if the end has been reached + if (position++ >= size - 1) { + //delete term + _CLDECDELETE(_term); + return false; + } + + //delete the previous enumerated term + Term* tmp = NULL; + if (prev != NULL) { + if (_LUCENE_ATOMIC_INT_GET(prev->__cl_refcount) > 1) { + _CLDECDELETE(prev); //todo: tune other places try and delete its term + } else + tmp = prev; //we are going to re-use this term + } + //prev becomes the current enumerated term + prev = _term; + //term becomes the next term read from inputStream input + _term = readTerm(tmp); + + //Read docFreq, the number of documents which contain the term. + termInfo->docFreq = input->readVInt(); + //Read freqPointer, a pointer into the TermFreqs file (.frq) + termInfo->freqPointer += input->readVLong(); + + //Read proxPointer, a pointer into the TermPosition file (.prx). + termInfo->proxPointer += input->readVLong(); + + if (format == -1) { + // just read skipOffset in order to increment file pointer; + // value is never used since skipTo is switched off + if (!isIndex) { if (termInfo->docFreq > formatM1SkipInterval) { - termInfo->skipOffset = input->readVInt(); + termInfo->skipOffset = input->readVInt(); } - } - }else{ - if (termInfo->docFreq >= skipInterval) - termInfo->skipOffset = input->readVInt(); - } - - //Check if the enumeration is an index - if (isIndex) - //read index pointer - indexPointer += input->readVLong(); - - return true; - } - - Term* SegmentTermEnum::term(bool pointer) { - if ( pointer ) - return _CL_POINTER(_term); - else - return _term; - } - - void SegmentTermEnum::scanTo(const Term *term){ - //Func - Scan for Term without allocating new Terms - //Pre - term != NULL - //Post - The iterator term has been moved to the position where Term is expected to be - // in the enumeration - while ( term->compareTo(this->_term) > 0 && next()) - { - } - } - - void SegmentTermEnum::close() { - //Func - Closes the enumeration to further activity, freeing resources. - //Pre - true - //Post - The inputStream input has been closed - - input->close(); - } - - int32_t SegmentTermEnum::docFreq() const { - //Func - Returns the document frequency of the current term in the set - //Pre - termInfo != NULL - // next() must have been called once - //Post - The document frequency of the current enumerated term has been returned - - return termInfo->docFreq; - } - - void SegmentTermEnum::seek(const int64_t pointer, const int32_t p, Term* t, TermInfo* ti) { - //Func - Repositions term and termInfo within the enumeration - //Pre - pointer >= 0 - // p >= 0 and contains the new position within the enumeration - // t is a valid reference to a Term and is the new current term in the enumeration - // ti is a valid reference to a TermInfo and is corresponding TermInfo form the new - // current Term - //Post - term and terminfo have been repositioned within the enumeration - - //Reset the IndexInput input to pointer - input->seek(pointer); - //Assign the new position - position = p; - - //finalize the current term - if ( _term == NULL || _LUCENE_ATOMIC_INT_GET(_term->__cl_refcount) > 1 ){ - _CLDECDELETE(_term); - //Get a pointer from t and increase the reference counter of t - _term = _CLNEW Term; //cannot use reference, because TermInfosReader uses non ref-counted array - } - _term->set(t,t->text()); - - //finalize prev - _CLDECDELETE(prev); - - //Change the current termInfo so it matches the new current term - termInfo->set(ti); - - //Have the buffer grown if needed - if ( bufferLength <= _term->textLength() ) - growBuffer(_term->textLength(), true ); // copy term text into buffer - else - _tcsncpy(buffer,_term->text(),bufferLength); //just copy the buffer - } - - TermInfo* SegmentTermEnum::getTermInfo()const { - //Func - Returns a clone of the current termInfo - //Pre - termInfo != NULL - // next() must have been called once - //Post - A clone of the current termInfo has been returned - - return _CLNEW TermInfo(*termInfo); //clone - } - - void SegmentTermEnum::getTermInfo(TermInfo* ti)const { - //Func - Retrieves a clone of termInfo through the reference ti - //Pre - ti contains a valid reference to TermInfo - // termInfo != NULL - // next() must have been called once - //Post - ti contains a clone of termInfo - - ti->set(termInfo); - } - - int64_t SegmentTermEnum::freqPointer()const { - //Func - Returns the freqpointer of the current termInfo - //Pre - termInfo != NULL - // next() must have been called once - //Post - The freqpointer of the current termInfo has been returned - - return termInfo->freqPointer; - } - - int64_t SegmentTermEnum::proxPointer()const { - //Func - Returns the proxPointer of the current termInfo - //Pre - termInfo != NULL - // next() must have been called once - //Post - the proxPointer of the current termInfo has been returned - - return termInfo->proxPointer; - } - - SegmentTermEnum* SegmentTermEnum::clone() const { - //Func - Returns a clone of this instance - //Pre - true - //Post - An clone of this instance has been returned - - return _CLNEW SegmentTermEnum(*this); - } - - Term* SegmentTermEnum::readTerm(Term* reuse) { - //Func - Reads the next term in the enumeration - //Pre - true - //Post - The next Term in the enumeration has been read and returned - - //Read the start position from the inputStream input - int32_t start = input->readVInt(); - //Read the length of term in the inputStream input - int32_t length = input->readVInt(); - - //Calculated the total lenght of bytes that buffer must be to contain the current - //chars in buffer and the new ones yet to be read - uint32_t totalLength = start + length; - - if (static_cast<uint32_t>(bufferLength) < totalLength+1) - growBuffer(totalLength, false); //dont copy the buffer over. - - //Read a length number of characters into the buffer from position start in the inputStream input - input->readChars(buffer, start, length); - //Null terminate the string - buffer[totalLength] = 0; - - //Return a new Term - int32_t field = input->readVInt(); - const TCHAR* fieldname = fieldInfos->fieldName(field); - if ( reuse == NULL ) - reuse = _CLNEW Term; - - reuse->set(fieldname, buffer, false); - return reuse; - } - - void SegmentTermEnum::growBuffer(const uint32_t length, bool force_copy) { - //Func - Instantiate a buffer of length length+1 - //Pre - length > 0 - //Post - pre(buffer) has been deleted with its contents. A new buffer - // has been allocated of length length+1 and the text of term has been copied - // to buffer - //todo: we could guess that we will need to re-grow this - //buffer a few times...so start off with a reasonable grow - //value... - if ( bufferLength > length ) - return; - - //Store the new bufferLength - if ( length - bufferLength < 8 ) - bufferLength = length+8; - else - bufferLength = length+1; - - bool copy = buffer==NULL; - - //Instantiate the new buffer + 1 is needed for terminator '\0' - if ( buffer == NULL ) - buffer = (TCHAR*)malloc(sizeof(TCHAR) * (bufferLength+1)); - else - buffer = (TCHAR*)realloc(buffer, sizeof(TCHAR) * (bufferLength+1)); - - if ( copy || force_copy){ - //Copy the text of term into buffer - _tcsncpy(buffer,_term->text(),bufferLength); - } - } + } + } else { + if (termInfo->docFreq >= skipInterval) termInfo->skipOffset = input->readVInt(); + } + + //Check if the enumeration is an index + if (isIndex) + //read index pointer + indexPointer += input->readVLong(); + + return true; +} + +Term* SegmentTermEnum::term(bool pointer) { + if (pointer) + return _CL_POINTER(_term); + else + return _term; +} + +void SegmentTermEnum::scanTo(const Term* term) { + //Func - Scan for Term without allocating new Terms + //Pre - term != NULL + //Post - The iterator term has been moved to the position where Term is expected to be + // in the enumeration + while (term->compareTo(this->_term) > 0 && next()) { + } +} + +void SegmentTermEnum::close() { + //Func - Closes the enumeration to further activity, freeing resources. + //Pre - true + //Post - The inputStream input has been closed + + input->close(); +} + +int32_t SegmentTermEnum::docFreq() const { + //Func - Returns the document frequency of the current term in the set + //Pre - termInfo != NULL + // next() must have been called once + //Post - The document frequency of the current enumerated term has been returned + + return termInfo->docFreq; +} + +void SegmentTermEnum::seek(const int64_t pointer, const int32_t p, Term* t, TermInfo* ti) { + //Func - Repositions term and termInfo within the enumeration + //Pre - pointer >= 0 + // p >= 0 and contains the new position within the enumeration + // t is a valid reference to a Term and is the new current term in the enumeration + // ti is a valid reference to a TermInfo and is corresponding TermInfo form the new + // current Term + //Post - term and terminfo have been repositioned within the enumeration + + //Reset the IndexInput input to pointer + input->seek(pointer); + //Assign the new position + position = p; + + //finalize the current term + if (_term == NULL || _LUCENE_ATOMIC_INT_GET(_term->__cl_refcount) > 1) { + _CLDECDELETE(_term); + //Get a pointer from t and increase the reference counter of t + _term = _CLNEW + Term; //cannot use reference, because TermInfosReader uses non ref-counted array + } + _term->set(t, t->text()); + + //finalize prev + _CLDECDELETE(prev); + + //Change the current termInfo so it matches the new current term + termInfo->set(ti); + + //Have the buffer grown if needed + if (bufferLength <= _term->textLength()) + growBuffer(_term->textLength(), true); // copy term text into buffer + else + _tcsncpy(buffer, _term->text(), bufferLength); //just copy the buffer +} + +TermInfo* SegmentTermEnum::getTermInfo() const { + //Func - Returns a clone of the current termInfo + //Pre - termInfo != NULL + // next() must have been called once + //Post - A clone of the current termInfo has been returned + + return _CLNEW TermInfo(*termInfo); //clone +} + +void SegmentTermEnum::getTermInfo(TermInfo* ti) const { + //Func - Retrieves a clone of termInfo through the reference ti + //Pre - ti contains a valid reference to TermInfo + // termInfo != NULL + // next() must have been called once + //Post - ti contains a clone of termInfo + + ti->set(termInfo); +} + +int64_t SegmentTermEnum::freqPointer() const { + //Func - Returns the freqpointer of the current termInfo + //Pre - termInfo != NULL + // next() must have been called once + //Post - The freqpointer of the current termInfo has been returned + + return termInfo->freqPointer; +} + +int64_t SegmentTermEnum::proxPointer() const { + //Func - Returns the proxPointer of the current termInfo + //Pre - termInfo != NULL + // next() must have been called once + //Post - the proxPointer of the current termInfo has been returned + + return termInfo->proxPointer; +} + +SegmentTermEnum* SegmentTermEnum::clone() const { + //Func - Returns a clone of this instance + //Pre - true + //Post - An clone of this instance has been returned + + return _CLNEW SegmentTermEnum(*this); +} + +Term* SegmentTermEnum::readTerm(Term* reuse) { + //Func - Reads the next term in the enumeration + //Pre - true + //Post - The next Term in the enumeration has been read and returned + + //Read the start position from the inputStream input + int32_t start = input->readVInt(); + //Read the length of term in the inputStream input + int32_t length = input->readVInt(); + + //Calculated the total lenght of bytes that buffer must be to contain the current + //chars in buffer and the new ones yet to be read + uint32_t totalLength = start + length; + + if (static_cast<uint32_t>(bufferLength) < totalLength + 1) + growBuffer(totalLength, false); //dont copy the buffer over. + + //Read a length number of characters into the buffer from position start in the inputStream input + input->readChars(buffer, start, length); + //Null terminate the string + buffer[totalLength] = 0; + + //Return a new Term + int32_t field = input->readVInt(); + const TCHAR* fieldname = fieldInfos->fieldName(field); + if (reuse == NULL) reuse = _CLNEW Term; + + reuse->set(fieldname, buffer, false); + return reuse; +} + +void SegmentTermEnum::growBuffer(const uint32_t length, bool force_copy) { + //Func - Instantiate a buffer of length length+1 + //Pre - length > 0 + //Post - pre(buffer) has been deleted with its contents. A new buffer + // has been allocated of length length+1 and the text of term has been copied + // to buffer + //todo: we could guess that we will need to re-grow this + //buffer a few times...so start off with a reasonable grow + //value... + if (bufferLength > length) return; + + //Store the new bufferLength + if (length - bufferLength < 8) + bufferLength = length + 8; + else + bufferLength = length + 1; + + bool copy = buffer == NULL; + + //Instantiate the new buffer + 1 is needed for terminator '\0' + if (buffer == NULL) + buffer = (TCHAR*)malloc(sizeof(TCHAR) * (bufferLength + 1)); + else + buffer = (TCHAR*)realloc(buffer, sizeof(TCHAR) * (bufferLength + 1)); + + if (copy || force_copy) { + //Copy the text of term into buffer + _tcsncpy(buffer, _term->text(), bufferLength); + } +} CL_NS_END diff --git a/src/core/CLucene/index/TermInfosReader.cpp b/src/core/CLucene/index/TermInfosReader.cpp index 7996d4d4..6cf8b42f 100644 --- a/src/core/CLucene/index/TermInfosReader.cpp +++ b/src/core/CLucene/index/TermInfosReader.cpp @@ -24,207 +24,206 @@ CL_NS_USE(store) CL_NS_USE(util) CL_NS_DEF(index) - - TermInfosReader::TermInfosReader(Directory* dir, const char* seg, FieldInfos* fis, const int32_t readBufferSize): - directory (dir),fieldInfos (fis), indexTerms(NULL), indexInfos(NULL), indexPointers(NULL), indexDivisor(1) - { - //Func - Constructor. - // Reads the TermInfos file (.tis) and eventually the Term Info Index file (.tii) - //Pre - dir is a reference to a valid Directory - // Fis contains a valid reference to an FieldInfos instance - // seg != NULL and contains the name of the segment - //Post - An instance has been created and the index named seg has been read. (Remember - // a segment is nothing more then an independently readable index) - - CND_PRECONDITION(seg != NULL, "seg is NULL"); - - //Initialize the name of the segment - segment = seg; - - //Create a filname fo a Term Info File - string tisFile = Misc::segmentname(segment,".tis"); - string tiiFile = Misc::segmentname(segment,".tii"); - bool success = false; +TermInfosReader::TermInfosReader(Directory* dir, const char* seg, FieldInfos* fis, + const int32_t readBufferSize) + : directory(dir), + fieldInfos(fis), + indexTerms(NULL), + indexInfos(NULL), + indexPointers(NULL), + indexDivisor(1) { + //Func - Constructor. + // Reads the TermInfos file (.tis) and eventually the Term Info Index file (.tii) + //Pre - dir is a reference to a valid Directory + // Fis contains a valid reference to an FieldInfos instance + // seg != NULL and contains the name of the segment + //Post - An instance has been created and the index named seg has been read. (Remember + // a segment is nothing more then an independently readable index) + + CND_PRECONDITION(seg != NULL, "seg is NULL"); + + //Initialize the name of the segment + segment = seg; + + //Create a filname fo a Term Info File + string tisFile = Misc::segmentname(segment, ".tis"); + string tiiFile = Misc::segmentname(segment, ".tii"); + bool success = false; origEnum = indexEnum = NULL; _size = indexTermsLength = totalIndexInterval = 0; - indexIsRead = false; - - try { - //Create an SegmentTermEnum for storing all the terms read of the segment - - // tii - auto tiiStream = directory->openInput( tiiFile.c_str(), readBufferSize ); - indexEnum = _CLNEW SegmentTermEnum(tiiStream, fieldInfos, true, -1); - CND_CONDITION(indexEnum != NULL, "No memory could be allocated for index enumerator"); - - // tis - auto tisStream = directory->openInput( tisFile.c_str(), readBufferSize ); - origEnum = _CLNEW SegmentTermEnum(tisStream, fieldInfos, false, indexEnum->getFormat()); - origEnum->initByTii(indexEnum); - CND_CONDITION(origEnum != NULL, "No memory could be allocated for index enumerator"); - _size = origEnum->size; - totalIndexInterval = origEnum->indexInterval; - - //call ensureIndexIsRead to load data to memory right now - ensureIndexIsRead(); - - success = true; - } _CLFINALLY({ - // With lock-less commits, it's entirely possible (and - // fine) to hit a FileNotFound exception above. In - // this case, we want to explicitly close any subset - // of things that were opened so that we don't have to - // wait for a GC to do so. - if (!success) { - close(); - } - }); - - } - - TermInfosReader::~TermInfosReader(){ - //Func - Destructor - //Pre - true - //Post - The instance has been destroyed - - //Close the TermInfosReader to be absolutly sure that enumerator has been closed - //and the arrays indexTerms, indexPointers and indexInfos and their elements - //have been destroyed - close(); - } - int32_t TermInfosReader::getSkipInterval() const { + indexIsRead = false; + + try { + //Create an SegmentTermEnum for storing all the terms read of the segment + + // tii + auto tiiStream = directory->openInput(tiiFile.c_str(), readBufferSize); + indexEnum = _CLNEW SegmentTermEnum(tiiStream, fieldInfos, true); + indexEnum->init(-1); + CND_CONDITION(indexEnum != NULL, "No memory could be allocated for index enumerator"); + + // tis + auto tisStream = directory->openInput(tisFile.c_str(), readBufferSize); + origEnum = _CLNEW SegmentTermEnum(tisStream, fieldInfos, false); + origEnum->init(indexEnum->getFormat()); + origEnum->initByTii(indexEnum); + CND_CONDITION(origEnum != NULL, "No memory could be allocated for index enumerator"); + _size = origEnum->size; + totalIndexInterval = origEnum->indexInterval; + + //call ensureIndexIsRead to load data to memory right now + ensureIndexIsRead(); + + success = true; + } + _CLFINALLY({ + // With lock-less commits, it's entirely possible (and + // fine) to hit a FileNotFound exception above. In + // this case, we want to explicitly close any subset + // of things that were opened so that we don't have to + // wait for a GC to do so. + if (!success) { + close(); + } + }); +} + +TermInfosReader::~TermInfosReader() { + //Func - Destructor + //Pre - true + //Post - The instance has been destroyed + + //Close the TermInfosReader to be absolutly sure that enumerator has been closed + //and the arrays indexTerms, indexPointers and indexInfos and their elements + //have been destroyed + close(); +} +int32_t TermInfosReader::getSkipInterval() const { return origEnum->skipInterval; - } +} - int32_t TermInfosReader::getMaxSkipLevels() const { +int32_t TermInfosReader::getMaxSkipLevels() const { return origEnum->maxSkipLevels; - } - - void TermInfosReader::setIndexDivisor(const int32_t _indexDivisor) { - if (indexDivisor < 1) - _CLTHROWA(CL_ERR_IllegalArgument, "indexDivisor must be > 0"); +} - if (indexTerms != NULL) - _CLTHROWA(CL_ERR_IllegalArgument, "index terms are already loaded"); +void TermInfosReader::setIndexDivisor(const int32_t _indexDivisor) { + if (indexDivisor < 1) _CLTHROWA(CL_ERR_IllegalArgument, "indexDivisor must be > 0"); - this->indexDivisor = _indexDivisor; - totalIndexInterval = origEnum->indexInterval * _indexDivisor; - } + if (indexTerms != NULL) _CLTHROWA(CL_ERR_IllegalArgument, "index terms are already loaded"); - int32_t TermInfosReader::getIndexDivisor() const { return indexDivisor; } - void TermInfosReader::close() { + this->indexDivisor = _indexDivisor; + totalIndexInterval = origEnum->indexInterval * _indexDivisor; +} - //Check if indexTerms and indexInfos exist - if (indexTerms && indexInfos){ - //Iterate through arrays indexTerms and indexPointer to - //destroy their elements +int32_t TermInfosReader::getIndexDivisor() const { + return indexDivisor; +} +void TermInfosReader::close() { + //Check if indexTerms and indexInfos exist + if (indexTerms && indexInfos) { + //Iterate through arrays indexTerms and indexPointer to + //destroy their elements #ifdef _DEBUG - for ( int32_t i=0; i<indexTermsLength;++i ){ + for (int32_t i = 0; i < indexTermsLength; ++i) { indexTerms[i].__cl_refcount--; - } + } #endif - } - //Delete the arrays - if (indexTerms){ - delete [] indexTerms; - indexTerms = NULL; - } - if (indexInfos){ - _CLDELETE_ARRAY(indexInfos); - indexInfos = NULL; - } - - //Delete the arrays - if (indexPointers) { + } + //Delete the arrays + if (indexTerms) { + delete[] indexTerms; + indexTerms = NULL; + } + if (indexInfos) { + _CLDELETE_ARRAY(indexInfos); + indexInfos = NULL; + } + + //Delete the arrays + if (indexPointers) { _CLDELETE_ARRAY(indexPointers); indexPointers = NULL; - } + } - if (origEnum != NULL){ + if (origEnum != NULL) { origEnum->close(); - //Get a pointer to IndexInput used by the enumeration but - //instantiated in the constructor by directory.open( tisFile ) - IndexInput *is = origEnum->input; + //Get a pointer to IndexInput used by the enumeration but + //instantiated in the constructor by directory.open( tisFile ) + IndexInput* is = origEnum->input; //Delete the enumuration enumerator _CLDELETE(origEnum); //Delete the IndexInput _CLDELETE(is); - } + } - if (indexEnum != NULL){ + if (indexEnum != NULL) { indexEnum->close(); - //Get a pointer to IndexInput used by the enumeration but - //instantiated in the constructor by directory.open( tiiFile ) - IndexInput *is = indexEnum->input; + //Get a pointer to IndexInput used by the enumeration but + //instantiated in the constructor by directory.open( tiiFile ) + IndexInput* is = indexEnum->input; //Delete the enumuration enumerator _CLDELETE(indexEnum); - indexEnum = NULL; + indexEnum = NULL; //Delete the IndexInput _CLDELETE(is); - } - enumerators.setNull(); - } - - int64_t TermInfosReader::size() const{ - //Func - Return the size of the enumeration of TermInfos - //Pre - true - //Post - size has been returened + } + enumerators.setNull(); +} - return _size; - } +int64_t TermInfosReader::size() const { + //Func - Return the size of the enumeration of TermInfos + //Pre - true + //Post - size has been returened + return _size; +} - Term* TermInfosReader::get(const int32_t position) { - //Func - Returns the nth term in the set - //Pre - position > = 0 - //Post - The n-th term in the set has been returned +Term* TermInfosReader::get(const int32_t position) { + //Func - Returns the nth term in the set + //Pre - position > = 0 + //Post - The n-th term in the set has been returned - //Check if the size is 0 because then there are no terms - if (_size == 0) - return NULL; + //Check if the size is 0 because then there are no terms + if (_size == 0) return NULL; - SegmentTermEnum* enumerator = getEnum(); + SegmentTermEnum* enumerator = getEnum(); - if ( - enumerator != NULL //an enumeration exists - && enumerator->term(false) != NULL // term is at or past current - && position >= enumerator->position - && position < (enumerator->position + totalIndexInterval) - ) - { - return scanEnum(position); // can avoid seek - } + if (enumerator != NULL //an enumeration exists + && enumerator->term(false) != NULL // term is at or past current + && position >= enumerator->position && + position < (enumerator->position + totalIndexInterval)) { + return scanEnum(position); // can avoid seek + } //random-access: must seek seekEnum(position / totalIndexInterval); - //Get the Term at position + //Get the Term at position return scanEnum(position); - } +} - SegmentTermEnum* TermInfosReader::getEnum(){ +SegmentTermEnum* TermInfosReader::getEnum() { SegmentTermEnum* termEnum = enumerators.get(); - if (termEnum == NULL){ - termEnum = terms(); - enumerators.set(termEnum); + if (termEnum == NULL) { + termEnum = terms(); + enumerators.set(termEnum); } return termEnum; - } +} - TermInfo* TermInfosReader::get(const Term* term){ - //Func - Returns a TermInfo for a term - //Pre - term holds a valid reference to term - //Post - if term can be found its TermInfo has been returned otherwise NULL +TermInfo* TermInfosReader::get(const Term* term) { + //Func - Returns a TermInfo for a term + //Pre - term holds a valid reference to term + //Post - if term can be found its TermInfo has been returned otherwise NULL //If the size of the enumeration is 0 then no Terms have been read - if (_size == 0) - return NULL; + if (_size == 0) return NULL; ensureIndexIsRead(); @@ -233,250 +232,238 @@ CL_NS_DEF(index) // optimize sequential access: first try scanning cached enumerator w/o seeking if ( - //the current term of the enumeration enumerator is not at the end AND - enumerator->term(false) != NULL && - ( - //there exists a previous current called prev and term is positioned after this prev OR - ( enumerator->prev != NULL && term->compareTo(enumerator->prev) > 0) || - //term is positioned at the same position as the current of enumerator or at a higher position - term->compareTo(enumerator->term(false)) >= 0 ) - ) - { - - //Calculate the offset for the position - int32_t _enumOffset = (int32_t)(enumerator->position/totalIndexInterval)+1; - - // but before end of block - if ( - //the length of indexTerms (the number of terms in enumerator) equals - //_enum_offset OR - indexTermsLength == _enumOffset || - //term is positioned in front of term found at _enumOffset in indexTerms - term->compareTo(&indexTerms[_enumOffset]) < 0){ - - //no need to seek, retrieve the TermInfo for term - return scanEnum(term); + //the current term of the enumeration enumerator is not at the end AND + enumerator->term(false) != NULL && + ( + //there exists a previous current called prev and term is positioned after this prev OR + (enumerator->prev != NULL && term->compareTo(enumerator->prev) > 0) || + //term is positioned at the same position as the current of enumerator or at a higher position + term->compareTo(enumerator->term(false)) >= 0)) { + //Calculate the offset for the position + int32_t _enumOffset = (int32_t)(enumerator->position / totalIndexInterval) + 1; + + // but before end of block + if ( + //the length of indexTerms (the number of terms in enumerator) equals + //_enum_offset OR + indexTermsLength == _enumOffset || + //term is positioned in front of term found at _enumOffset in indexTerms + term->compareTo(&indexTerms[_enumOffset]) < 0) { + //no need to seek, retrieve the TermInfo for term + return scanEnum(term); } } //Reposition current term in the enumeration seekEnum(getIndexOffset(term)); - //Return the TermInfo for term + //Return the TermInfo for term return scanEnum(term); - } - +} - int64_t TermInfosReader::getPosition(const Term* term) { - //Func - Returns the position of a Term in the set - //Pre - term holds a valid reference to a Term - // enumerator != NULL - //Post - If term was found then its position is returned otherwise -1 +int64_t TermInfosReader::getPosition(const Term* term) { + //Func - Returns the position of a Term in the set + //Pre - term holds a valid reference to a Term + // enumerator != NULL + //Post - If term was found then its position is returned otherwise -1 - //if the enumeration is empty then return -1 - if (_size == 0) - return -1; + //if the enumeration is empty then return -1 + if (_size == 0) return -1; - ensureIndexIsRead(); - - //Retrieve the indexOffset for term - int32_t indexOffset = getIndexOffset(term); - seekEnum(indexOffset); + ensureIndexIsRead(); - SegmentTermEnum* enumerator = getEnum(); + //Retrieve the indexOffset for term + int32_t indexOffset = getIndexOffset(term); + seekEnum(indexOffset); - while(term->compareTo(enumerator->term(false)) > 0 && enumerator->next()) {} + SegmentTermEnum* enumerator = getEnum(); - if ( term->equals(enumerator->term(false)) ){ - return enumerator->position; - }else - return -1; - } + while (term->compareTo(enumerator->term(false)) > 0 && enumerator->next()) { + } - SegmentTermEnum* TermInfosReader::terms(const Term* term) { - //Func - Returns an enumeration of terms starting at or after the named term. - // If term is null then enumerator is set to the beginning - //Pre - term holds a valid reference to a Term - // enumerator != NULL - //Post - An enumeration of terms starting at or after the named term has been returned + if (term->equals(enumerator->term(false))) { + return enumerator->position; + } else + return -1; +} + +SegmentTermEnum* TermInfosReader::terms(const Term* term) { + //Func - Returns an enumeration of terms starting at or after the named term. + // If term is null then enumerator is set to the beginning + //Pre - term holds a valid reference to a Term + // enumerator != NULL + //Post - An enumeration of terms starting at or after the named term has been returned + + SegmentTermEnum* enumerator = NULL; + if (term != NULL) { + //Seek enumerator to term; delete the new TermInfo that's returned. + TermInfo* ti = get(term); + _CLLDELETE(ti); + enumerator = getEnum(); + } else + enumerator = origEnum; + + //Clone the entire enumeration + SegmentTermEnum* cln = enumerator->clone(); + + //Check if cln points to a valid instance + CND_CONDITION(cln != NULL, "cln is NULL"); + + return cln; +} + +void TermInfosReader::ensureIndexIsRead() { + //Func - Reads the term info index file or .tti file. + // This file contains every IndexInterval-th entry from the .tis file, + // along with its location in the "tis" file. This is designed to be read entirely + // into memory and used to provide random access to the "tis" file. + //Pre - indexTerms = NULL + // indexInfos = NULL + // indexPointers = NULL + //Post - The term info index file has been read into memory - SegmentTermEnum* enumerator = NULL; - if ( term != NULL ){ - //Seek enumerator to term; delete the new TermInfo that's returned. - TermInfo* ti = get(term); - _CLLDELETE(ti); - enumerator = getEnum(); - }else - enumerator = origEnum; + SCOPED_LOCK_MUTEX(THIS_LOCK) - //Clone the entire enumeration - SegmentTermEnum* cln = enumerator->clone(); + if (indexIsRead) return; + + //https://jira.qianxin-inc.cn/browse/XHBUG-2921 + //https://jira.qianxin-inc.cn/browse/XHBUG-3053 + if (indexEnum == NULL) _CLTHROWA(CL_ERR_NullPointer, "indexEnum is NULL"); + + try { + indexTermsLength = (size_t)indexEnum->size; + + //Instantiate an block of Term's,so that each one doesn't have to be new'd + indexTerms = new Term[indexTermsLength]; + CND_CONDITION( + indexTerms != NULL, + "No memory could be allocated for indexTerms"); //Check if is indexTerms is a valid array + + //Instantiate an big block of TermInfo's, so that each one doesn't have to be new'd + indexInfos = _CL_NEWARRAY(TermInfo, indexTermsLength); + CND_CONDITION( + indexInfos != NULL, + "No memory could be allocated for indexInfos"); //Check if is indexInfos is a valid array + + //Instantiate an array indexPointers that contains pointers to the term info index file + indexPointers = _CL_NEWARRAY(int64_t, indexTermsLength); + CND_CONDITION( + indexPointers != NULL, + "No memory could be allocated for indexPointers"); //Check if is indexPointers is a valid array + + //Iterate through the terms of indexEnum + for (int32_t i = 0; indexEnum->next(); ++i) { + indexTerms[i].set(indexEnum->term(false), indexEnum->term(false)->text()); + indexEnum->getTermInfo(&indexInfos[i]); + indexPointers[i] = indexEnum->indexPointer; + + for (int32_t j = 1; j < indexDivisor; j++) + if (!indexEnum->next()) break; + } + indexIsRead = true; + } + _CLFINALLY(indexEnum->close(); + //Close and delete the IndexInput is. The close is done by the destructor. + _CLDELETE(indexEnum->input); _CLDELETE(indexEnum); indexEnum = NULL;); +} + +int32_t TermInfosReader::getIndexOffset(const Term* term) { + //Func - Returns the offset of the greatest index entry which is less than or equal to term. + //Pre - term holds a reference to a valid term + // indexTerms != NULL + //Post - The new offset has been returned + + //Check if is indexTerms is a valid array + CND_PRECONDITION(indexTerms != NULL, "indexTerms is NULL"); + + int32_t lo = 0; + int32_t hi = indexTermsLength - 1; + int32_t mid; + int32_t delta; + + while (hi >= lo) { + //Start in the middle betwee hi and lo + mid = (lo + hi) >> 1; + + //Check if is indexTerms[mid] is a valid instance of Term + CND_PRECONDITION(&indexTerms[mid] != NULL, "indexTerms[mid] is NULL"); + CND_PRECONDITION(mid < indexTermsLength, "mid >= indexTermsLength"); + + //Determine if term is before mid or after mid + delta = term->compareTo(&indexTerms[mid]); + if (delta < 0) { + //Calculate the new hi + hi = mid - 1; + } else if (delta > 0) { + //Calculate the new lo + lo = mid + 1; + } else { + //term has been found so return its position + return mid; + } + } + // the new starting offset + return hi; +} + +void TermInfosReader::seekEnum(const int32_t indexOffset) { + //Func - Reposition the current Term and TermInfo to indexOffset + //Pre - indexOffset >= 0 + // indexTerms != NULL + // indexInfos != NULL + // indexPointers != NULL + //Post - The current Term and Terminfo have been repositioned to indexOffset + + CND_PRECONDITION(indexOffset >= 0, "indexOffset contains a negative number"); + CND_PRECONDITION(indexTerms != NULL, "indexTerms is NULL"); + CND_PRECONDITION(indexInfos != NULL, "indexInfos is NULL"); + CND_PRECONDITION(indexPointers != NULL, "indexPointers is NULL"); - //Check if cln points to a valid instance - CND_CONDITION(cln != NULL,"cln is NULL"); + SegmentTermEnum* enumerator = getEnum(); + enumerator->seek(indexPointers[indexOffset], (indexOffset * totalIndexInterval) - 1, + &indexTerms[indexOffset], &indexInfos[indexOffset]); +} + +TermInfo* TermInfosReader::scanEnum(const Term* term) { + //Func - Scans the Enumeration of terms for term and returns the corresponding TermInfo instance if found. + // The search is started from the current term. + //Pre - term contains a valid reference to a Term + // enumerator != NULL + //Post - if term has been found the corresponding TermInfo has been returned otherwise NULL + // has been returned - return cln; - } + SegmentTermEnum* enumerator = getEnum(); + enumerator->scanTo(term); + + //Check if the at the position the Term term can be found + if (enumerator->term(false) != NULL && term->equals(enumerator->term(false))) { + //Return the TermInfo instance about term + return enumerator->getTermInfo(); + } else { + //term was not found so no TermInfo can be returned + return NULL; + } +} +Term* TermInfosReader::scanEnum(const int32_t position) { + //Func - Scans the enumeration to the requested position and returns the + // Term located at that position + //Pre - position > = 0 + // enumerator != NULL + //Post - The Term at the requested position has been returned - void TermInfosReader::ensureIndexIsRead() { - //Func - Reads the term info index file or .tti file. - // This file contains every IndexInterval-th entry from the .tis file, - // along with its location in the "tis" file. This is designed to be read entirely - // into memory and used to provide random access to the "tis" file. - //Pre - indexTerms = NULL - // indexInfos = NULL - // indexPointers = NULL - //Post - The term info index file has been read into memory + SegmentTermEnum* enumerator = getEnum(); - SCOPED_LOCK_MUTEX(THIS_LOCK) + //As long the position of the enumeration enumerator is smaller than the requested one + while (enumerator->position < position) { + //Move the current of enumerator to the next + if (!enumerator->next()) { + //If there is no next it means that the requested position was to big + return NULL; + } + } - if (indexIsRead) - return; - - //https://jira.qianxin-inc.cn/browse/XHBUG-2921 - //https://jira.qianxin-inc.cn/browse/XHBUG-3053 - if (indexEnum == NULL) - _CLTHROWA(CL_ERR_NullPointer, "indexEnum is NULL"); - - try { - indexTermsLength = (size_t)indexEnum->size; - - //Instantiate an block of Term's,so that each one doesn't have to be new'd - indexTerms = new Term[indexTermsLength]; - CND_CONDITION(indexTerms != NULL,"No memory could be allocated for indexTerms");//Check if is indexTerms is a valid array - - //Instantiate an big block of TermInfo's, so that each one doesn't have to be new'd - indexInfos = _CL_NEWARRAY(TermInfo,indexTermsLength); - CND_CONDITION(indexInfos != NULL,"No memory could be allocated for indexInfos"); //Check if is indexInfos is a valid array - - //Instantiate an array indexPointers that contains pointers to the term info index file - indexPointers = _CL_NEWARRAY(int64_t,indexTermsLength); - CND_CONDITION(indexPointers != NULL,"No memory could be allocated for indexPointers");//Check if is indexPointers is a valid array - - //Iterate through the terms of indexEnum - for (int32_t i = 0; indexEnum->next(); ++i){ - indexTerms[i].set(indexEnum->term(false),indexEnum->term(false)->text()); - indexEnum->getTermInfo(&indexInfos[i]); - indexPointers[i] = indexEnum->indexPointer; - - for (int32_t j = 1; j < indexDivisor; j++) - if (!indexEnum->next()) - break; - } - indexIsRead = true; - }_CLFINALLY( - indexEnum->close(); - //Close and delete the IndexInput is. The close is done by the destructor. - _CLDELETE( indexEnum->input ); - _CLDELETE( indexEnum ); - indexEnum = NULL; - ); - } - - - int32_t TermInfosReader::getIndexOffset(const Term* term){ - //Func - Returns the offset of the greatest index entry which is less than or equal to term. - //Pre - term holds a reference to a valid term - // indexTerms != NULL - //Post - The new offset has been returned - - //Check if is indexTerms is a valid array - CND_PRECONDITION(indexTerms != NULL,"indexTerms is NULL"); - - int32_t lo = 0; - int32_t hi = indexTermsLength - 1; - int32_t mid; - int32_t delta; - - while (hi >= lo) { - //Start in the middle betwee hi and lo - mid = (lo + hi) >> 1; - - //Check if is indexTerms[mid] is a valid instance of Term - CND_PRECONDITION(&indexTerms[mid] != NULL,"indexTerms[mid] is NULL"); - CND_PRECONDITION(mid < indexTermsLength,"mid >= indexTermsLength"); - - //Determine if term is before mid or after mid - delta = term->compareTo(&indexTerms[mid]); - if (delta < 0){ - //Calculate the new hi - hi = mid - 1; - }else if (delta > 0){ - //Calculate the new lo - lo = mid + 1; - }else{ - //term has been found so return its position - return mid; - } - } - // the new starting offset - return hi; - } - - void TermInfosReader::seekEnum(const int32_t indexOffset) { - //Func - Reposition the current Term and TermInfo to indexOffset - //Pre - indexOffset >= 0 - // indexTerms != NULL - // indexInfos != NULL - // indexPointers != NULL - //Post - The current Term and Terminfo have been repositioned to indexOffset - - CND_PRECONDITION(indexOffset >= 0, "indexOffset contains a negative number"); - CND_PRECONDITION(indexTerms != NULL, "indexTerms is NULL"); - CND_PRECONDITION(indexInfos != NULL, "indexInfos is NULL"); - CND_PRECONDITION(indexPointers != NULL, "indexPointers is NULL"); - - SegmentTermEnum* enumerator = getEnum(); - enumerator->seek( - indexPointers[indexOffset], - (indexOffset * totalIndexInterval) - 1, - &indexTerms[indexOffset], - &indexInfos[indexOffset] - ); - } - - - TermInfo* TermInfosReader::scanEnum(const Term* term) { - //Func - Scans the Enumeration of terms for term and returns the corresponding TermInfo instance if found. - // The search is started from the current term. - //Pre - term contains a valid reference to a Term - // enumerator != NULL - //Post - if term has been found the corresponding TermInfo has been returned otherwise NULL - // has been returned - - SegmentTermEnum* enumerator = getEnum(); - enumerator->scanTo(term); - - //Check if the at the position the Term term can be found - if (enumerator->term(false) != NULL && term->equals(enumerator->term(false)) ){ - //Return the TermInfo instance about term - return enumerator->getTermInfo(); - }else{ - //term was not found so no TermInfo can be returned - return NULL; - } - } - - Term* TermInfosReader::scanEnum(const int32_t position) { - //Func - Scans the enumeration to the requested position and returns the - // Term located at that position - //Pre - position > = 0 - // enumerator != NULL - //Post - The Term at the requested position has been returned - - SegmentTermEnum* enumerator = getEnum(); - - //As long the position of the enumeration enumerator is smaller than the requested one - while(enumerator->position < position){ - //Move the current of enumerator to the next - if (!enumerator->next()){ - //If there is no next it means that the requested position was to big - return NULL; - } - } - - //Return the Term a the requested position - return enumerator->term(); - } + //Return the Term a the requested position + return enumerator->term(); +} CL_NS_END diff --git a/src/core/CLucene/index/_SegmentTermEnum.h b/src/core/CLucene/index/_SegmentTermEnum.h index b5fa419d..3dd2c8c5 100644 --- a/src/core/CLucene/index/_SegmentTermEnum.h +++ b/src/core/CLucene/index/_SegmentTermEnum.h @@ -55,7 +55,8 @@ protected: public: ///Constructor - SegmentTermEnum(CL_NS(store)::IndexInput* i, FieldInfos* fis, const bool isi, int32_t in_format = -1); + SegmentTermEnum(CL_NS(store)::IndexInput* i, FieldInfos* fis, const bool isi); + void init(int32_t in_format = -1); ///Destructor ~SegmentTermEnum(); diff --git a/src/core/CLucene/search/MultiPhraseQuery.cpp b/src/core/CLucene/search/MultiPhraseQuery.cpp index 54273702..107c8b11 100644 --- a/src/core/CLucene/search/MultiPhraseQuery.cpp +++ b/src/core/CLucene/search/MultiPhraseQuery.cpp @@ -211,8 +211,8 @@ Query* MultiPhraseQuery::rewrite(IndexReader* /*reader*/) { ArrayBase<Term*>* terms = termArrays->at(0); BooleanQuery* boq = _CLNEW BooleanQuery(true); for ( size_t i=0;i<terms->length;i++ ){ - boq->add(_CLNEW TermQuery((*terms)[i]), BooleanClause::SHOULD); - } + boq->add(_CLNEW TermQuery((*terms)[i]), true, BooleanClause::SHOULD); + } boq->setBoost(getBoost()); return boq; } else { diff --git a/src/core/CLucene/store/IndexOutput.cpp b/src/core/CLucene/store/IndexOutput.cpp index 05e7695f..77c37400 100644 --- a/src/core/CLucene/store/IndexOutput.cpp +++ b/src/core/CLucene/store/IndexOutput.cpp @@ -35,12 +35,13 @@ CL_NS_DEF(store) close(); } - void BufferedIndexOutput::close(){ - flush(); - _CLDELETE_ARRAY( buffer ); - - bufferStart = 0; - bufferPosition = 0; + void BufferedIndexOutput::close() { + // flush may throw error here, if we do not delete buffer for all circumstances, + // we may close again in destructor above, that would cause pure virtual function call for flushBuffer + try { + flush(); + } + _CLFINALLY(_CLDELETE_ARRAY(buffer); bufferStart = 0; bufferPosition = 0;) } void BufferedIndexOutput::writeByte(const uint8_t b) { diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index a487f343..b06f7f7d 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -102,6 +102,7 @@ SET(test_files ./tests.cpp ./util/TestStringBuffer.cpp ./util/English.cpp ./util/TestStrConvert.cpp + ./query/TestMultiPhraseQuery.cpp ${test_HEADERS}) IF (USE_SHARED_OBJECT_FILES) GET_SHARED_FILES(clucene_shared_Files) diff --git a/src/test/query/TestMultiPhraseQuery.cpp b/src/test/query/TestMultiPhraseQuery.cpp new file mode 100644 index 00000000..ccc4fe7f --- /dev/null +++ b/src/test/query/TestMultiPhraseQuery.cpp @@ -0,0 +1,163 @@ +#include <CLucene.h> + +#include <iostream> +#include <memory> +#include <vector> + +#include "CLucene/debug/error.h" +#include "CLucene/index/IndexReader.h" +#include "CLucene/index/Term.h" +#include "CLucene/search/MultiPhraseQuery.h" +#include "CLucene/store/Directory.h" +#include "CLucene/store/FSDirectory.h" +#include "CLucene/store/RAMDirectory.h" +#include "test.h" + +CL_NS_USE(util) +CL_NS_USE(store) +CL_NS_USE(search) +CL_NS_USE(index) + +void testSimple1Add(CuTest* tc) { + RAMDirectory dir; + + SimpleAnalyzer<char> analyzer; + IndexWriter w(&dir, &analyzer, true); + w.setUseCompoundFile(false); + auto field_name = lucene::util::Misc::_charToWide("name"); + std::string value = "value"; + + Document doc; + auto field = _CLNEW Field(field_name, Field::INDEX_TOKENIZED | Field::STORE_NO); + + auto char_string_reader = std::make_unique<lucene::util::SStringReader<char>>(); + char_string_reader->init(value.data(), value.size(), true); + auto stream = analyzer.tokenStream(field->name(), char_string_reader.get()); + field->setValue(stream); + doc.add(*field); + + w.addDocument(&doc); + w.close(); + + IndexSearcher index_searcher(&dir); + { + MultiPhraseQuery query; + + Term* t1 = _CLNEW Term(_T( "name" ), _T( "t1" )); + query.add(t1); + _CLLDECDELETE(t1); + + std::vector<int32_t> result; + index_searcher._search(&query, [&result](const int32_t docid, const float_t /*score*/) { + result.push_back(docid); + }); + CLUCENE_ASSERT(result.size() == 0); + } + + _CLDELETE(stream) + _CLDELETE_ARRAY(field_name) +} + +void testSimple2Add(CuTest* tc) { + RAMDirectory dir; + + SimpleAnalyzer<char> analyzer; + IndexWriter w(&dir, &analyzer, true); + w.setUseCompoundFile(false); + auto field_name = lucene::util::Misc::_charToWide("name"); + std::string value = "value"; + + Document doc; + auto field = _CLNEW Field(field_name, Field::INDEX_TOKENIZED | Field::STORE_NO); + + auto char_string_reader = std::make_unique<lucene::util::SStringReader<char>>(); + char_string_reader->init(value.data(), value.size(), true); + auto stream = analyzer.tokenStream(field->name(), char_string_reader.get()); + field->setValue(stream); + doc.add(*field); + + w.addDocument(&doc); + w.close(); + + IndexSearcher index_searcher(&dir); + { + MultiPhraseQuery query; + + std::vector<Term*> terms; + terms.push_back(_CLNEW Term(_T( "name" ), _T( "t2" ))); + terms.push_back(_CLNEW Term(_T( "name" ), _T( "t3" ))); + terms.push_back(_CLNEW Term(_T( "name" ), _T( "t4" ))); + query.add(terms); + for (int32_t i = 0; i < terms.size(); i++) { + _CLLDECDELETE(terms[i]); + } + + std::vector<int32_t> result; + index_searcher._search(&query, [&result](const int32_t docid, const float_t /*score*/) { + result.push_back(docid); + }); + CLUCENE_ASSERT(result.size() == 0); + } + + _CLDELETE(stream) + _CLDELETE_ARRAY(field_name) +} + +void testMultiAdd(CuTest* tc) { + RAMDirectory dir; + + SimpleAnalyzer<char> analyzer; + IndexWriter w(&dir, &analyzer, true); + w.setUseCompoundFile(false); + auto field_name = lucene::util::Misc::_charToWide("name"); + std::string value = "value"; + + Document doc; + auto field = _CLNEW Field(field_name, Field::INDEX_TOKENIZED | Field::STORE_NO); + + auto char_string_reader = std::make_unique<lucene::util::SStringReader<char>>(); + char_string_reader->init(value.data(), value.size(), true); + auto stream = analyzer.tokenStream(field->name(), char_string_reader.get()); + field->setValue(stream); + doc.add(*field); + + w.addDocument(&doc); + w.close(); + + IndexSearcher index_searcher(&dir); + { + MultiPhraseQuery query; + + Term* t1 = _CLNEW Term(_T( "name" ), _T( "t1" )); + query.add(t1); + _CLLDECDELETE(t1); + + std::vector<Term*> terms; + terms.push_back(_CLNEW Term(_T( "name" ), _T( "t2" ))); + terms.push_back(_CLNEW Term(_T( "name" ), _T( "t3" ))); + terms.push_back(_CLNEW Term(_T( "name" ), _T( "t4" ))); + query.add(terms); + for (int32_t i = 0; i < terms.size(); i++) { + _CLLDECDELETE(terms[i]); + } + + std::vector<int32_t> result; + index_searcher._search(&query, [&result](const int32_t docid, const float_t /*score*/) { + result.push_back(docid); + }); + CLUCENE_ASSERT(result.size() == 0); + } + + _CLDELETE(stream) + _CLDELETE_ARRAY(field_name) +} + +CuSuite* testMultiPhraseQuery(void) { + CuSuite* suite = CuSuiteNew(_T("CLucene MultiPhraseQuery Test")); + + SUITE_ADD_TEST(suite, testSimple1Add); + SUITE_ADD_TEST(suite, testSimple2Add); + SUITE_ADD_TEST(suite, testMultiAdd); + + return suite; +} \ No newline at end of file diff --git a/src/test/test.h b/src/test/test.h index cbf08803..da1fde00 100644 --- a/src/test/test.h +++ b/src/test/test.h @@ -82,6 +82,7 @@ CuSuite *testTermVectorsReader(void); CuSuite *teststandard95(void); CuSuite *testStrConvert(void); CuSuite *testSearchRange(void); +CuSuite *testMultiPhraseQuery(void); #ifdef TEST_CONTRIB_LIBS //CuSuite *testGermanAnalyzer(void); diff --git a/src/test/tests.cpp b/src/test/tests.cpp index 5d5421cb..5ca803b9 100644 --- a/src/test/tests.cpp +++ b/src/test/tests.cpp @@ -48,6 +48,7 @@ unittest tests[] = { // {"termvectorsreader", testTermVectorsReader}, {"strconvert", testStrConvert}, {"searchRange", testSearchRange}, + {"MultiPhraseQuery", testMultiPhraseQuery}, #ifdef TEST_CONTRIB_LIBS {"chinese", testchinese}, #endif --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org