This is an automated email from the ASF dual-hosted git repository. jianliangqi pushed a commit to branch clucene in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push: new e15a89a5 [Optimize](search) Optimizemultiple terms conjunction query (#117) e15a89a5 is described below commit e15a89a5627b1a2e88ef924bee6e0a3c6c6495a3 Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com> AuthorDate: Mon Sep 4 16:38:32 2023 +0800 [Optimize](search) Optimizemultiple terms conjunction query (#117) --- .../CLucene/analysis/jieba/ChineseTokenizer.cpp | 5 +- src/core/CLucene/document/Field.h | 7 + src/core/CLucene/index/CodeMode.h | 3 +- src/core/CLucene/index/DocRange.h | 7 +- src/core/CLucene/index/FieldInfos.cpp | 31 +- src/core/CLucene/index/IndexReader.cpp | 4 + src/core/CLucene/index/IndexReader.h | 3 + src/core/CLucene/index/IndexVersion.h | 8 + src/core/CLucene/index/IndexWriter.cpp | 53 ++-- src/core/CLucene/index/MultiSegmentReader.cpp | 24 +- src/core/CLucene/index/SDocumentWriter.cpp | 50 ++- src/core/CLucene/index/SDocumentWriter.h | 1 + src/core/CLucene/index/SegmentReader.cpp | 5 + src/core/CLucene/index/SegmentTermDocs.cpp | 348 +++++++++------------ src/core/CLucene/index/Terms.h | 4 + src/core/CLucene/index/_FieldInfos.h | 8 + src/core/CLucene/index/_MultiSegmentReader.h | 4 + src/core/CLucene/index/_SegmentHeader.h | 69 ++++ src/core/CLucene/search/query/DcoIdSetIterator.h | 16 + src/core/CLucene/search/query/TermIterator.h | 51 +++ src/core/CMakeLists.txt | 2 + 21 files changed, 435 insertions(+), 268 deletions(-) diff --git a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp index ef126c97..c72e2451 100644 --- a/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp +++ b/src/contribs-lib/CLucene/analysis/jieba/ChineseTokenizer.cpp @@ -50,8 +50,9 @@ CL_NS(analysis)::Token *ChineseTokenizer::next(lucene::analysis::Token *token) { dataLen = tokens_text.size(); } if (bufferIndex < dataLen) { - auto token_text = tokens_text[bufferIndex++]; - token->setNoCopy(token_text.data(), 0, token_text.size()); + std::string& token_text = tokens_text[bufferIndex++]; + size_t size = std::min(token_text.size(), static_cast<size_t>(LUCENE_MAX_WORD_LEN)); + token->setNoCopy(token_text.data(), 0, size); return token; } return nullptr; diff --git a/src/core/CLucene/document/Field.h b/src/core/CLucene/document/Field.h index 970d06ee..23c0ad17 100644 --- a/src/core/CLucene/document/Field.h +++ b/src/core/CLucene/document/Field.h @@ -9,6 +9,8 @@ #include "CLucene/util/Array.h" #include "CLucene/util/Equators.h" +#include "CLucene/index/IndexVersion.h" + /* Fieldable reading: https://issues.apache.org/jira/browse/LUCENE-1219?page=com.atlassian.jira.plugin.system.issuetabpanels:comment- tabpanel&focusedCommentId=12578199#action_12578199 @@ -310,6 +312,9 @@ public: virtual const char* getObjectName() const; static const char* getClassName(); + void setIndexVersion(IndexVersion indexVersion) { indexVersion_ = indexVersion; } + IndexVersion getIndexVersion() { return indexVersion_; } + protected: /** * Set configs using XOR. This resets all the settings @@ -327,6 +332,8 @@ protected: const TCHAR* _name; uint32_t config; float_t boost; + + IndexVersion indexVersion_ = IndexVersion::kV1; }; CL_NS_END #endif diff --git a/src/core/CLucene/index/CodeMode.h b/src/core/CLucene/index/CodeMode.h index 5ac3718f..3c39e94e 100644 --- a/src/core/CLucene/index/CodeMode.h +++ b/src/core/CLucene/index/CodeMode.h @@ -4,7 +4,8 @@ CL_NS_DEF(index) enum class CodeMode { kDefault = 0, - kPfor = 1 + kPfor = 1, + kRange = 2 }; CL_NS_END \ No newline at end of file diff --git a/src/core/CLucene/index/DocRange.h b/src/core/CLucene/index/DocRange.h index 81b27047..ef7906a2 100644 --- a/src/core/CLucene/index/DocRange.h +++ b/src/core/CLucene/index/DocRange.h @@ -15,15 +15,16 @@ enum class DocRangeType { class DocRange { public: - DocRange() : doc_many(PFOR_BLOCK_SIZE + 3), freq_many(PFOR_BLOCK_SIZE + 3) {} + DocRange() = default; + ~DocRange() = default; public: DocRangeType type_ = DocRangeType::kNone; uint32_t doc_many_size_ = 0; uint32_t freq_many_size_ = 0; - std::vector<uint32_t> doc_many; - std::vector<uint32_t> freq_many; + std::vector<uint32_t>* doc_many = nullptr; + std::vector<uint32_t>* freq_many = nullptr; std::pair<uint32_t, uint32_t> doc_range; }; \ No newline at end of file diff --git a/src/core/CLucene/index/FieldInfos.cpp b/src/core/CLucene/index/FieldInfos.cpp index b3260e6d..00e0c427 100644 --- a/src/core/CLucene/index/FieldInfos.cpp +++ b/src/core/CLucene/index/FieldInfos.cpp @@ -103,6 +103,17 @@ bool FieldInfos::hasProx() { return false; } +IndexVersion FieldInfos::getIndexVersion() { + int numFields = byNumber.size(); + for (int i = 0; i < numFields; i++) { + FieldInfo* fi = fieldInfo(i); + if (fi->indexVersion_ > IndexVersion::kV0) { + return fi->indexVersion_; + } + } + return IndexVersion::kV0; +} + void FieldInfos::addIndexed(const TCHAR** names, const bool storeTermVectors, const bool storePositionWithTermVector, const bool storeOffsetWithTermVector) { size_t i = 0; @@ -228,8 +239,16 @@ void FieldInfos::write(IndexOutput* output) const{ if (fi->storePayloads) bits |= STORE_PAYLOADS; if (fi->hasProx) bits |= TERM_FREQ_AND_POSITIONS; + if (fi->getIndexVersion() > IndexVersion::kV0) { + bits |= 0x80; + } + output->writeString(fi->name,_tcslen(fi->name)); output->writeByte(bits); + + if (fi->getIndexVersion() > IndexVersion::kV0) { + output->writeVInt(static_cast<int32_t>(fi->getIndexVersion())); + } } } @@ -239,7 +258,7 @@ void FieldInfos::read(IndexInput* input) { bool isIndexed,storeTermVector,storePositionsWithTermVector,storeOffsetWithTermVector,omitNorms,hasProx,storePayloads; for (int32_t i = 0; i < size; ++i){ TCHAR* name = input->readString(); //we could read name into a string buffer, but we can't be sure what the maximum field length will be. - bits = input->readByte(); + bits = input->readByte(); isIndexed = (bits & IS_INDEXED) != 0; storeTermVector = (bits & STORE_TERMVECTOR) != 0; storePositionsWithTermVector = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0; @@ -247,8 +266,14 @@ void FieldInfos::read(IndexInput* input) { omitNorms = (bits & OMIT_NORMS) != 0; storePayloads = (bits & STORE_PAYLOADS) != 0; hasProx = (bits & TERM_FREQ_AND_POSITIONS) != 0; - - addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, hasProx, storePayloads); + + FieldInfo* fi = addInternal(name, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, hasProx, storePayloads); + if ((bits & 0x80) == 0) { + fi->setIndexVersion(IndexVersion::kV0); + } else { + IndexVersion indexVersion = (IndexVersion)input->readVInt(); + fi->setIndexVersion(indexVersion); + } _CLDELETE_CARRAY(name); } } diff --git a/src/core/CLucene/index/IndexReader.cpp b/src/core/CLucene/index/IndexReader.cpp index 91ca18ed..0fb2fe59 100644 --- a/src/core/CLucene/index/IndexReader.cpp +++ b/src/core/CLucene/index/IndexReader.cpp @@ -529,4 +529,8 @@ CL_NS(store)::Directory* IndexReader::getDirectory() { _internal->closeCallbacks.put(callback, parameter); } +IndexVersion IndexReader::getIndexVersion() { + _CLTHROWA(CL_ERR_UnsupportedOperation, "IndexReader This reader does not support this method."); +} + CL_NS_END diff --git a/src/core/CLucene/index/IndexReader.h b/src/core/CLucene/index/IndexReader.h index 9079025a..defb6e75 100644 --- a/src/core/CLucene/index/IndexReader.h +++ b/src/core/CLucene/index/IndexReader.h @@ -11,6 +11,7 @@ #include "CLucene/util/Array.h" #include "CLucene/util/VoidList.h" #include "CLucene/LuceneThreads.h" +#include "CLucene/index/IndexVersion.h" CL_CLASS_DEF(store,Directory) CL_CLASS_DEF(store,LuceneLock) @@ -676,6 +677,8 @@ public: */ void addCloseCallback(CloseCallback callback, void* parameter); + virtual IndexVersion getIndexVersion(); + friend class SegmentReader; friend class MultiReader; friend class IndexWriter; diff --git a/src/core/CLucene/index/IndexVersion.h b/src/core/CLucene/index/IndexVersion.h new file mode 100644 index 00000000..448b1872 --- /dev/null +++ b/src/core/CLucene/index/IndexVersion.h @@ -0,0 +1,8 @@ +#pragma once + +enum class IndexVersion { + kV0 = 0, + kV1 = 1, + + kNone +}; \ No newline at end of file diff --git a/src/core/CLucene/index/IndexWriter.cpp b/src/core/CLucene/index/IndexWriter.cpp index 24ac2b08..6abd2716 100644 --- a/src/core/CLucene/index/IndexWriter.cpp +++ b/src/core/CLucene/index/IndexWriter.cpp @@ -280,10 +280,10 @@ void IndexWriter::init(Directory *d, Analyzer *a, const bool create, const bool if (analyzer->isSDocOpt()) { docWriter = _CLNEW SDocumentsWriter<char>(directory, this); } else { - docWriter = _CLNEW DocumentsWriter(directory, this); + _CLTHROWA(CL_ERR_IllegalArgument, "IndexWriter::init: Only support SDocumentsWriter"); } } else { - docWriter = _CLNEW DocumentsWriter(directory, this); + _CLTHROWA(CL_ERR_IllegalArgument, "IndexWriter::init: Only support SDocumentsWriter"); } // Default deleter (for backwards compatibility) is // KeepOnlyLastCommitDeleter: @@ -1702,14 +1702,13 @@ void IndexWriter::mergeTerms(bool hasProx) { } if ((++df % skipInterval) == 0) { + freqOut->writeByte((char)CodeMode::kPfor); + freqOut->writeVInt(docDeltaBuffer.size()); + encode(freqOut, docDeltaBuffer, true); if (hasProx) { - freqOut->writeByte((char)CodeMode::kPfor); - freqOut->writeVInt(docDeltaBuffer.size()); - // doc - encode(freqOut, docDeltaBuffer, true); - // freq encode(freqOut, freqBuffer, false); } + skipWriter->setSkipData(lastDoc, false, -1); skipWriter->bufferSkip(df); } @@ -1717,8 +1716,8 @@ void IndexWriter::mergeTerms(bool hasProx) { assert(destDocId > lastDoc || df == 1); lastDoc = destDocId; + docDeltaBuffer.push_back(destDocId); if (hasProx) { - // position int32_t lastPosition = 0; for (int32_t i = 0; i < descPositions.size(); i++) { int32_t position = descPositions[i]; @@ -1726,15 +1725,7 @@ void IndexWriter::mergeTerms(bool hasProx) { proxOut->writeVInt(delta); lastPosition = position; } - - docDeltaBuffer.push_back(destDocId); freqBuffer.push_back(destFreq); - } else { - docDeltaBuffer.push_back(destDocId); - if (docDeltaBuffer.size() == PFOR_BLOCK_SIZE) { - freqOut->writeVInt(docDeltaBuffer.size()); - encode(freqOut, docDeltaBuffer, true); - } } smi = match[destDoc->srcIdx]; @@ -1775,7 +1766,7 @@ void IndexWriter::mergeTerms(bool hasProx) { proxPointer = proxPointers[i]; } - if (hasProx) { + { auto& docDeltaBuffer = docDeltaBuffers[i]; auto& freqBuffer = freqBuffers[i]; @@ -1783,26 +1774,24 @@ void IndexWriter::mergeTerms(bool hasProx) { freqOutput->writeVInt(docDeltaBuffer.size()); uint32_t lastDoc = 0; for (int32_t i = 0; i < docDeltaBuffer.size(); i++) { - uint32_t newDocCode = (docDeltaBuffer[i] - lastDoc) << 1; - lastDoc = docDeltaBuffer[i]; - uint32_t freq = freqBuffer[i]; - if (1 == freq) { - freqOutput->writeVInt(newDocCode | 1); + uint32_t curDoc = docDeltaBuffer[i]; + if (hasProx) { + uint32_t newDocCode = (docDeltaBuffer[i] - lastDoc) << 1; + lastDoc = curDoc; + uint32_t freq = freqBuffer[i]; + if (1 == freq) { + freqOutput->writeVInt(newDocCode | 1); + } else { + freqOutput->writeVInt(newDocCode); + freqOutput->writeVInt(freq); + } } else { - freqOutput->writeVInt(newDocCode); - freqOutput->writeVInt(freq); + freqOutput->writeVInt(curDoc - lastDoc); + lastDoc = curDoc; } } docDeltaBuffer.resize(0); freqBuffer.resize(0); - } else { - freqOutput->writeVInt(docDeltaBuffers[i].size()); - uint32_t lDoc = 0; - for (auto &docDelta: docDeltaBuffers[i]) { - freqOutput->writeVInt(docDelta - lDoc); - lDoc = docDelta; - } - docDeltaBuffers[i].resize(0); } int64_t skipPointer = skipListWriter->writeSkip(freqOutput); diff --git a/src/core/CLucene/index/MultiSegmentReader.cpp b/src/core/CLucene/index/MultiSegmentReader.cpp index 01438080..d4e8c8ea 100644 --- a/src/core/CLucene/index/MultiSegmentReader.cpp +++ b/src/core/CLucene/index/MultiSegmentReader.cpp @@ -485,6 +485,12 @@ const char* MultiSegmentReader::getObjectName() const{ return getClassName(); } +IndexVersion MultiSegmentReader::getIndexVersion() { + for (size_t i = 0; i < subReaders->length; i++) { + return (*subReaders)[i]->getIndexVersion(); + } + _CLTHROWA(CL_ERR_IllegalState, "MultiSegmentReader::getIndexVersion index open failed."); +} @@ -540,6 +546,14 @@ TermPositions* MultiTermDocs::__asTermPositions(){ return NULL; } +int32_t MultiTermDocs::docFreq() { + int32_t docFreq = 0; + for (size_t i = 0; i < readerTermDocs->length; i++) { + docFreq += readerTermDocs->values[i]->docFreq(); + } + return docFreq; +} + int32_t MultiTermDocs::doc() const { CND_PRECONDITION(current!=NULL,"current==NULL, check that next() was called"); return base + current->doc(); @@ -577,6 +591,12 @@ void MultiTermDocs::seek( Term* tterm) { base = 0; pointer = 0; current = NULL; + + for (int32_t i = 0; i < readerTermDocs->length; i++) { + termDocs(i); + } + base = starts[pointer]; + current = termDocs(pointer++); } bool MultiTermDocs::next() { @@ -628,8 +648,8 @@ bool MultiTermDocs::readRange(DocRange* docRange) { current = nullptr; } else { if (docRange->type_ == DocRangeType::kMany) { - auto begin = docRange->doc_many.begin(); - auto end = docRange->doc_many.begin() + docRange->doc_many_size_; + auto begin = docRange->doc_many->begin(); + auto end = docRange->doc_many->begin() + docRange->doc_many_size_; std::transform(begin, end, begin, [this](int32_t val) { return val + base; }); } else if (docRange->type_ == DocRangeType::kRange) { docRange->doc_range.first += base; diff --git a/src/core/CLucene/index/SDocumentWriter.cpp b/src/core/CLucene/index/SDocumentWriter.cpp index 5d6de917..c76b31fb 100644 --- a/src/core/CLucene/index/SDocumentWriter.cpp +++ b/src/core/CLucene/index/SDocumentWriter.cpp @@ -166,6 +166,7 @@ void SDocumentsWriter<T>::ThreadState::init(Document *doc, int32_t doc_id) { FieldInfo *fi = _parent->fieldInfos->add(field->name(), field->isIndexed(), field->isTermVectorStored(), field->isStorePositionWithTermVector(), field->isStoreOffsetWithTermVector(), field->getOmitNorms(), !field->getOmitTermFreqAndPositions(), false); + fi->setIndexVersion(field->getIndexVersion()); if (fi->isIndexed && !fi->omitNorms) { // Maybe grow our buffered norms if (_parent->norms.length <= fi->number) { @@ -240,6 +241,7 @@ void SDocumentsWriter<T>::ThreadState::init(Document *doc, int32_t doc_id) { fp->docFields.values[fp->fieldCount++] = field; } _parent->hasProx_ = _parent->fieldInfos->hasProx(); + _parent->indexVersion_ = _parent->fieldInfos->getIndexVersion(); } template<typename T> @@ -1172,12 +1174,10 @@ void SDocumentsWriter<T>::appendPostings(ArrayBase<typename ThreadState::FieldDa while (numToMerge > 0) { if ((++df % skipInterval) == 0) { + freqOut->writeByte((char)CodeMode::kPfor); + freqOut->writeVInt(docDeltaBuffer.size()); + encode(freqOut, docDeltaBuffer, true); if (hasProx_) { - freqOut->writeByte((char)CodeMode::kPfor); - freqOut->writeVInt(docDeltaBuffer.size()); - // doc - encode(freqOut, docDeltaBuffer, true); - // freq encode(freqOut, freqBuffer, false); } @@ -1205,22 +1205,14 @@ void SDocumentsWriter<T>::appendPostings(ArrayBase<typename ThreadState::FieldDa // changing the format to match Lucene's segment // format. + docDeltaBuffer.push_back(doc); if (hasProx_) { - // position for (int32_t j = 0; j < termDocFreq; j++) { const int32_t code = prox.readVInt(); assert(0 == (code & 1)); proxOut->writeVInt(code >> 1); } - - docDeltaBuffer.push_back(doc); freqBuffer.push_back(termDocFreq); - } else { - docDeltaBuffer.push_back(doc); - if (docDeltaBuffer.size() == PFOR_BLOCK_SIZE) { - freqOut->writeVInt(docDeltaBuffer.size()); - encode(freqOut, docDeltaBuffer, true); - } } if (!minState->nextDoc()) { @@ -1253,13 +1245,14 @@ void SDocumentsWriter<T>::appendPostings(ArrayBase<typename ThreadState::FieldDa // Done merging this term { - if (hasProx_) { - freqOut->writeByte((char)CodeMode::kDefault); - freqOut->writeVInt(docDeltaBuffer.size()); - uint32_t lastDoc = 0; - for (int32_t i = 0; i < docDeltaBuffer.size(); i++) { - uint32_t newDocCode = (docDeltaBuffer[i] - lastDoc) << 1; - lastDoc = docDeltaBuffer[i]; + freqOut->writeByte((char)CodeMode::kDefault); + freqOut->writeVInt(docDeltaBuffer.size()); + uint32_t lastDoc = 0; + for (int32_t i = 0; i < docDeltaBuffer.size(); i++) { + uint32_t curDoc = docDeltaBuffer[i]; + if (hasProx_) { + uint32_t newDocCode = (curDoc - lastDoc) << 1; + lastDoc = curDoc; uint32_t freq = freqBuffer[i]; if (1 == freq) { freqOut->writeVInt(newDocCode | 1); @@ -1267,18 +1260,13 @@ void SDocumentsWriter<T>::appendPostings(ArrayBase<typename ThreadState::FieldDa freqOut->writeVInt(newDocCode); freqOut->writeVInt(freq); } + } else { + freqOut->writeVInt(curDoc - lastDoc); + lastDoc = curDoc; } - docDeltaBuffer.resize(0); - freqBuffer.resize(0); - } else { - freqOut->writeVInt(docDeltaBuffer.size()); - uint32_t lDoc = 0; - for (auto& docDelta : docDeltaBuffer) { - freqOut->writeVInt(docDelta - lDoc); - lDoc = docDelta; - } - docDeltaBuffer.resize(0); } + docDeltaBuffer.resize(0); + freqBuffer.resize(0); } int64_t skipPointer = skipListWriter->writeSkip(freqOut); diff --git a/src/core/CLucene/index/SDocumentWriter.h b/src/core/CLucene/index/SDocumentWriter.h index 5163c2a1..7e250be3 100644 --- a/src/core/CLucene/index/SDocumentWriter.h +++ b/src/core/CLucene/index/SDocumentWriter.h @@ -54,6 +54,7 @@ private: std::ostream* infoStream{}; int64_t ramBufferSize; bool hasProx_ = false; + IndexVersion indexVersion_ = IndexVersion::kV1; public: class FieldMergeState; diff --git a/src/core/CLucene/index/SegmentReader.cpp b/src/core/CLucene/index/SegmentReader.cpp index 0b53de82..e0ec0c14 100644 --- a/src/core/CLucene/index/SegmentReader.cpp +++ b/src/core/CLucene/index/SegmentReader.cpp @@ -1123,4 +1123,9 @@ bool SegmentReader::normsClosed() { } return true; } + +IndexVersion SegmentReader::getIndexVersion() { + return _fieldInfos->getIndexVersion(); +} + CL_NS_END diff --git a/src/core/CLucene/index/SegmentTermDocs.cpp b/src/core/CLucene/index/SegmentTermDocs.cpp index 765f6424..9108f1df 100644 --- a/src/core/CLucene/index/SegmentTermDocs.cpp +++ b/src/core/CLucene/index/SegmentTermDocs.cpp @@ -21,7 +21,8 @@ CL_NS_DEF(index) SegmentTermDocs::SegmentTermDocs(const SegmentReader *_parent) : parent(_parent), freqStream(_parent->freqStream->clone()), count(0), df(0), deletedDocs(_parent->deletedDocs), _doc(0), _freq(0), skipInterval(_parent->tis->getSkipInterval()), maxSkipLevels(_parent->tis->getMaxSkipLevels()), skipListReader(NULL), freqBasePointer(0), proxBasePointer(0), - skipPointer(0), haveSkipped(false), pointer(0), pointerMax(0) { + skipPointer(0), haveSkipped(false), pointer(0), pointerMax(0), indexVersion_(_parent->_fieldInfos->getIndexVersion()), + hasProx(_parent->_fieldInfos->hasProx()), buffer_(freqStream, hasProx, indexVersion_) { CND_CONDITION(_parent != NULL, "Parent is NULL"); memset(docs,0,PFOR_BLOCK_SIZE*sizeof(int32_t)); memset(freqs,0,PFOR_BLOCK_SIZE*sizeof(int32_t)); @@ -35,6 +36,10 @@ TermPositions *SegmentTermDocs::__asTermPositions() { return NULL; } +int32_t SegmentTermDocs::docFreq() { + return df; +} + void SegmentTermDocs::seek(Term *term) { TermInfo *ti = parent->tis->get(term); seek(ti, term); @@ -63,7 +68,7 @@ void SegmentTermDocs::seek(const TermInfo *ti, Term *term) { count = 0; FieldInfo *fi = parent->_fieldInfos->fieldInfo(term->field()); currentFieldStoresPayloads = (fi != NULL) ? fi->storePayloads : false; - hasProx = (fi != nullptr) && fi->hasProx; + // hasProx = (fi != nullptr) && fi->hasProx; if (ti == NULL) { df = 0; } else {// punt case @@ -90,141 +95,39 @@ int32_t SegmentTermDocs::freq() const { } bool SegmentTermDocs::next() { - pointer++; - if (pointer >= pointerMax) { - pointerMax = SegmentTermDocs::read(docs, freqs, PFOR_BLOCK_SIZE); // refill buffer - if (pointerMax != 0) { - pointer = 0; - } else { - // NOTE: do not close here, try to close by upper caller or destruct. - //SegmentTermDocs::close(); // close stream - _doc = LUCENE_INT32_MAX_SHOULDBE; // set to sentinel value - return false; - } + if (count == df) { + _doc = LUCENE_INT32_MAX_SHOULDBE; + return false; } - _doc = docs[pointer]; - _freq = freqs[pointer]; - return true; -} -/*bool SegmentTermDocs::next() { - while (true) { - if (count == df) - return false; + _doc = buffer_.getDoc(); + if (hasProx) { + _freq = buffer_.getFreq(); + } - uint32_t docCode = freqStream->readVInt(); - _doc += docCode >> 1; //unsigned shift - if ((docCode & 1) != 0)// if low bit is set - _freq = 1; // _freq is one - else - _freq = freqStream->readVInt();// else read _freq - count++; + count++; - if ((deletedDocs == NULL) || (_doc >= 0 && deletedDocs->get(_doc) == false)) - break; - skippingDoc(); - } return true; -}*/ +} int32_t SegmentTermDocs::read(int32_t *docs, int32_t *freqs, int32_t length) { int32_t i = 0; + + if (count == df) { + return i; + } - if (hasProx) { - if (count == df) { - return i; - } - - char mode = freqStream->readByte(); - - uint32_t arraySize = freqStream->readVInt(); - std::vector<uint32_t> _docs(arraySize); - std::vector<uint32_t> _freqs(arraySize); - - if (mode == (char)CodeMode::kDefault) { - uint32_t docDelta = 0; - for (uint32_t i = 0; i < arraySize; i++) { - uint32_t docCode = freqStream->readVInt(); - docDelta += (docCode >> 1); - _docs[i] = docDelta; - if ((docCode & 1) != 0) { - _freqs[i] = 1; - } else { - _freqs[i] = freqStream->readVInt(); - } - } - } else { - // NOTE: Pad arraySize from 511 to 512 for alignment since the first block size is 511, and add one more extra space to prevent overflow. - auto paddingSize = (arraySize / PFOR_BLOCK_SIZE) * PFOR_BLOCK_SIZE + PFOR_BLOCK_SIZE; - _docs.resize(paddingSize + 1); - _freqs.resize(paddingSize + 1); - { - uint32_t SerializedSize = freqStream->readVInt(); - std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE); - freqStream->readBytes(buf.data(), SerializedSize); - P4DEC(buf.data(), arraySize, _docs.data()); - } - { - uint32_t SerializedSize = freqStream->readVInt(); - std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE); - freqStream->readBytes(buf.data(), SerializedSize); - P4NZDEC(buf.data(), arraySize, _freqs.data()); - } - } + while (i < length && count < df) { + _doc = buffer_.getDoc(); + docs[i] = _doc; - while (i < arraySize && count < df) { - _doc = _docs[i]; - _freq = _freqs[i]; - count++; - if (deletedDocs == NULL || (_doc >= 0 && !deletedDocs->get(_doc))) { - docs[i] = _doc; - freqs[i] = _freq; - i++; - } + if (hasProx) { + _freq = buffer_.getFreq(); + freqs[i] = _freq; } - } else { - //todo: one optimization would be to get the pointer buffer for ram or mmap dirs - //and iterate over them instead of using readByte() intensive functions. - if (i < length && count < df) { - uint32_t arraySize = freqStream->readVInt(); - if (arraySize < PFOR_BLOCK_SIZE) { - int32_t _docDelta{0}; - while (i < length && count < df && i < arraySize) { - // manually inlined call to next() for speed - uint32_t docCode = freqStream->readVInt(); - _docDelta += docCode; - _doc = _docDelta; - _freq = 1; // _freq is one - count++; - - if (deletedDocs == NULL || (_doc >= 0 && !deletedDocs->get(_doc))) { - docs[i] = _doc; - freqs[i] = _freq; - i++; - } - } - } else { - // need one more space for decode, otherwise will buffer overflow when decoding - std::vector<uint32_t> arr(arraySize + 1); - uint32_t serializedSize = freqStream->readVInt(); - // need more space for decode, otherwise will buffer overflow when decoding - std::vector<uint8_t> buf(serializedSize + PFOR_BLOCK_SIZE); - freqStream->readBytes(buf.data(), serializedSize); - P4DEC(buf.data(), arraySize, arr.data()); - - while (i < length && count < df && i < arraySize) { - _doc = arr[i]; - _freq = 1;// _freq is one - if (deletedDocs == NULL || (_doc >= 0 && !deletedDocs->get(_doc))) { - docs[i] = _doc; - freqs[i] = _freq; - i++; - } - count++; - } - } - } + count++; + i++; } return i; @@ -235,75 +138,17 @@ bool SegmentTermDocs::readRange(DocRange* docRange) { return false; } - if (hasProx) { - char mode = freqStream->readByte(); - uint32_t arraySize = freqStream->readVInt(); - if (mode == (char)CodeMode::kDefault) { - uint32_t docDelta = 0; - for (uint32_t i = 0; i < arraySize; i++) { - uint32_t docCode = freqStream->readVInt(); - docDelta += (docCode >> 1); - docRange->doc_many[i] = docDelta; - if ((docCode & 1) != 0) { - docRange->freq_many[i] = 1; - } else { - docRange->freq_many[i] = freqStream->readVInt(); - } - } - docRange->type_ = DocRangeType::kMany; - docRange->doc_many_size_ = arraySize; - docRange->freq_many_size_ = arraySize; - } else { - { - uint32_t SerializedSize = freqStream->readVInt(); - std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE); - freqStream->readBytes(buf.data(), SerializedSize); - P4DEC(buf.data(), arraySize, docRange->doc_many.data()); - } - { - uint32_t SerializedSize = freqStream->readVInt(); - std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE); - freqStream->readBytes(buf.data(), SerializedSize); - P4NZDEC(buf.data(), arraySize, docRange->freq_many.data()); - } - docRange->type_ = DocRangeType::kMany; - docRange->doc_many_size_ = arraySize; - docRange->freq_many_size_ = arraySize; - } - count += arraySize; - } else { - uint32_t arraySize = freqStream->readVInt(); - if (arraySize < PFOR_BLOCK_SIZE) { - uint32_t docDelta = 0; - for (uint32_t i = 0; i < arraySize; i++) { - uint32_t docCode = freqStream->readVInt(); - docDelta += docCode; - docRange->doc_many[i] = docDelta; - } - docRange->type_ = DocRangeType::kMany; - docRange->doc_many_size_ = arraySize; - } else { - { - uint32_t serializedSize = freqStream->readVInt(); - std::vector<uint8_t> buf(serializedSize + PFOR_BLOCK_SIZE); - freqStream->readBytes(buf.data(), serializedSize); - P4DEC(buf.data(), arraySize, docRange->doc_many.data()); - } - docRange->type_ = DocRangeType::kMany; - docRange->doc_many_size_ = arraySize; - } - count += arraySize; - } + buffer_.readRange(docRange); - if (docRange->type_ == DocRangeType::kMany) { - if (docRange->doc_many_size_ > 0) { - uint32_t start = docRange->doc_many[0]; - uint32_t end = docRange->doc_many[docRange->doc_many_size_ - 1]; - if ((end - start) == docRange->doc_many_size_ - 1) { - docRange->doc_range.first = start; - docRange->doc_range.second = start + docRange->doc_many_size_; - docRange->type_ = DocRangeType::kRange; - } + count += docRange->doc_many_size_; + + if (docRange->doc_many_size_ > 0) { + uint32_t start = (*docRange->doc_many)[0]; + uint32_t end = (*docRange->doc_many)[docRange->doc_many_size_ - 1]; + if ((end - start) == docRange->doc_many_size_ - 1) { + docRange->doc_range.first = start; + docRange->doc_range.second = start + docRange->doc_many_size_; + docRange->type_ = DocRangeType::kRange; } } @@ -329,7 +174,7 @@ bool SegmentTermDocs::skipTo(const int32_t target) { _doc = skipListReader->getDoc(); count = newCount; - pointer = pointerMax; + buffer_.refill(); } } @@ -341,5 +186,120 @@ bool SegmentTermDocs::skipTo(const int32_t target) { return true; } +void TermDocsBuffer::refill() { + cur_doc_ = 0; + cur_freq_ = 0; + + if (indexVersion_ == IndexVersion::kV1) { + size_ = refillV1(); + } else { + size_ = refillV0(); + } +} + +void TermDocsBuffer::readRange(DocRange* docRange) { + int32_t size = 0; + if (indexVersion_ == IndexVersion::kV1) { + size = refillV1(); + } else { + size = refillV0(); + } + docRange->type_ = DocRangeType::kMany; + docRange->doc_many = &docs_; + docRange->doc_many_size_ = size; + if (hasProx_) { + docRange->freq_many = &freqs_; + docRange->freq_many_size_ = size; + } +} + +int32_t TermDocsBuffer::refillV0() { + if (hasProx_) { + char mode = freqStream_->readByte(); + uint32_t arraySize = freqStream_->readVInt(); + if (mode == (char)CodeMode::kPfor) { + { + uint32_t SerializedSize = freqStream_->readVInt(); + std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE); + freqStream_->readBytes(buf.data(), SerializedSize); + P4DEC(buf.data(), arraySize, docs_.data()); + } + { + uint32_t SerializedSize = freqStream_->readVInt(); + std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE); + freqStream_->readBytes(buf.data(), SerializedSize); + P4NZDEC(buf.data(), arraySize, freqs_.data()); + } + } else if (mode == (char)CodeMode::kDefault) { + uint32_t docDelta = 0; + for (uint32_t i = 0; i < arraySize; i++) { + uint32_t docCode = freqStream_->readVInt(); + docDelta += (docCode >> 1); + docs_[i] = docDelta; + if ((docCode & 1) != 0) { + freqs_[i] = 1; + } else { + freqs_[i] = freqStream_->readVInt(); + } + } + } + return arraySize; + } else { + uint32_t arraySize = freqStream_->readVInt(); + if (arraySize < PFOR_BLOCK_SIZE) { + uint32_t docDelta = 0; + for (uint32_t i = 0; i < arraySize; i++) { + uint32_t docCode = freqStream_->readVInt(); + docDelta += docCode; + docs_[i] = docDelta; + } + } else { + { + uint32_t serializedSize = freqStream_->readVInt(); + std::vector<uint8_t> buf(serializedSize + PFOR_BLOCK_SIZE); + freqStream_->readBytes(buf.data(), serializedSize); + P4DEC(buf.data(), arraySize, docs_.data()); + } + } + return arraySize; + } +} + +int32_t TermDocsBuffer::refillV1() { + char mode = freqStream_->readByte(); + uint32_t arraySize = freqStream_->readVInt(); + if (mode == (char)CodeMode::kPfor) { + { + uint32_t SerializedSize = freqStream_->readVInt(); + std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE); + freqStream_->readBytes(buf.data(), SerializedSize); + P4DEC(buf.data(), arraySize, docs_.data()); + } + if (hasProx_) { + uint32_t SerializedSize = freqStream_->readVInt(); + std::vector<uint8_t> buf(SerializedSize + PFOR_BLOCK_SIZE); + freqStream_->readBytes(buf.data(), SerializedSize); + P4NZDEC(buf.data(), arraySize, freqs_.data()); + } + } else if (mode == (char)CodeMode::kDefault) { + uint32_t docDelta = 0; + for (uint32_t i = 0; i < arraySize; i++) { + uint32_t docCode = freqStream_->readVInt(); + if (hasProx_) { + docDelta += (docCode >> 1); + docs_[i] = docDelta; + if ((docCode & 1) != 0) { + freqs_[i] = 1; + } else { + freqs_[i] = freqStream_->readVInt(); + } + } else { + docDelta += docCode; + docs_[i] = docDelta; + } + } + } + return arraySize; +} CL_NS_END diff --git a/src/core/CLucene/index/Terms.h b/src/core/CLucene/index/Terms.h index 4d701ca7..620105fd 100644 --- a/src/core/CLucene/index/Terms.h +++ b/src/core/CLucene/index/Terms.h @@ -82,6 +82,10 @@ public: * No dynamic casting is required and no RTTI data is needed to do this */ virtual TermPositions* __asTermPositions()=0; + + virtual int32_t docFreq() { + _CLTHROWA(CL_ERR_UnsupportedOperation, "TermDocs::docFreq does not support this method."); + } }; diff --git a/src/core/CLucene/index/_FieldInfos.h b/src/core/CLucene/index/_FieldInfos.h index 414dcc90..ed142c44 100644 --- a/src/core/CLucene/index/_FieldInfos.h +++ b/src/core/CLucene/index/_FieldInfos.h @@ -8,6 +8,8 @@ #define _lucene_index_FieldInfos_ #include "CLucene/store/Directory.h" +#include "CLucene/index/IndexVersion.h" + CL_CLASS_DEF(document,Document) CL_CLASS_DEF(document,Field) @@ -33,6 +35,7 @@ class FieldInfo :LUCENE_BASE{ bool omitNorms; // omit norms associated with indexed fields bool hasProx = false; + IndexVersion indexVersion_ = IndexVersion::kV1; bool storePayloads; // whether this field stores payloads together with term positions @@ -67,6 +70,10 @@ class FieldInfo :LUCENE_BASE{ * @memory - caller is responsible for deleting the returned object */ FieldInfo* clone(); + +public: + void setIndexVersion(IndexVersion indexVersion) { indexVersion_ = indexVersion; } + IndexVersion getIndexVersion() { return indexVersion_; } }; /** Access to the Field Info file that describes document fields and whether or @@ -126,6 +133,7 @@ public: void addIndexed(const TCHAR** names, const bool storeTermVectors, const bool storePositionWithTermVector, const bool storeOffsetWithTermVector); bool hasProx(); + IndexVersion getIndexVersion(); /** * Assumes the fields are not storing term vectors. diff --git a/src/core/CLucene/index/_MultiSegmentReader.h b/src/core/CLucene/index/_MultiSegmentReader.h index c8460273..46b32caa 100644 --- a/src/core/CLucene/index/_MultiSegmentReader.h +++ b/src/core/CLucene/index/_MultiSegmentReader.h @@ -119,6 +119,8 @@ public: static const char* getClassName(); const char* getObjectName() const; + + IndexVersion getIndexVersion() override; }; @@ -159,6 +161,8 @@ public: void close(); virtual TermPositions* __asTermPositions(); + + int32_t docFreq() override; }; diff --git a/src/core/CLucene/index/_SegmentHeader.h b/src/core/CLucene/index/_SegmentHeader.h index 8cbb8959..e177f576 100644 --- a/src/core/CLucene/index/_SegmentHeader.h +++ b/src/core/CLucene/index/_SegmentHeader.h @@ -25,10 +25,67 @@ #include "DirectoryIndexReader.h" #include "_SkipListReader.h" #include "CLucene/util/_ThreadLocal.h" +#include "CLucene/index/IndexVersion.h" CL_NS_DEF(index) class SegmentReader; +class TermDocsBuffer { +public: + TermDocsBuffer(CL_NS(store)::IndexInput* freqStream, bool hasProx, IndexVersion indexVersion) + : docs_(PFOR_BLOCK_SIZE + 3), + freqs_(PFOR_BLOCK_SIZE + 3), + freqStream_(freqStream), + hasProx_(hasProx), + indexVersion_(indexVersion) { + } + + ~TermDocsBuffer() { + cur_doc_ = 0; + cur_freq_ = 0; + + docs_.clear(); + freqs_.clear(); + + freqStream_ = nullptr; + } + + inline int32_t getDoc() { + if (cur_doc_ >= size_) { + refill(); + } + return docs_[cur_doc_++]; + } + + inline int32_t getFreq() { + if (cur_freq_ >= size_) { + refill(); + } + return freqs_[cur_freq_++]; + } + + void refill(); + void readRange(DocRange* docRange); + +private: + int32_t refillV0(); + int32_t refillV1(); + +private: + uint32_t size_ = 0; + + uint32_t cur_doc_ = 0; + std::vector<uint32_t> docs_; + + uint32_t cur_freq_ = 0; + std::vector<uint32_t> freqs_; + + CL_NS(store)::IndexInput* freqStream_ = nullptr; + + bool hasProx_ = false; + IndexVersion indexVersion_ = IndexVersion::kV0; +}; + class SegmentTermDocs:public virtual TermDocs { protected: const SegmentReader* parent; @@ -57,6 +114,7 @@ private: protected: bool currentFieldStoresPayloads; bool hasProx = false; + IndexVersion indexVersion_ = IndexVersion::kV0; public: ///\param Parent must be a segment reader @@ -82,9 +140,18 @@ public: virtual TermPositions* __asTermPositions(); + int32_t docFreq() override; + protected: virtual void skippingDoc(){} virtual void skipProx(const int64_t /*proxPointer*/, const int32_t /*payloadLength*/){} + +private: + bool readRangeV0(DocRange* docRange); + bool readRangeV1(DocRange* docRange); + +private: + TermDocsBuffer buffer_; }; @@ -439,6 +506,8 @@ private: void startCommit(); void rollbackCommit(); + IndexVersion getIndexVersion() override; + //allow various classes to access the internals of this. this allows us to have //a more tight idea of the package friend class IndexReader; diff --git a/src/core/CLucene/search/query/DcoIdSetIterator.h b/src/core/CLucene/search/query/DcoIdSetIterator.h new file mode 100644 index 00000000..88aa4313 --- /dev/null +++ b/src/core/CLucene/search/query/DcoIdSetIterator.h @@ -0,0 +1,16 @@ +#pragma once + +#include "CLucene/index/DocRange.h" + +class DocIdSetIterator { +public: + DocIdSetIterator() = default; + virtual ~DocIdSetIterator() = default; + + virtual int32_t docID() = 0; + virtual int32_t nextDoc() = 0; + virtual int32_t advance(int32_t target) = 0; + + virtual int32_t docFreq() const = 0; + virtual bool readRange(DocRange* docRange) const = 0; +}; \ No newline at end of file diff --git a/src/core/CLucene/search/query/TermIterator.h b/src/core/CLucene/search/query/TermIterator.h new file mode 100644 index 00000000..e0cf23a4 --- /dev/null +++ b/src/core/CLucene/search/query/TermIterator.h @@ -0,0 +1,51 @@ +#pragma once + +#include "CLucene/search/query/DcoIdSetIterator.h" +#include "CLucene/index/Terms.h" + +#include <limits.h> + +CL_NS_USE(index) + +class TermIterator : public DocIdSetIterator { +public: + TermIterator() = default; + TermIterator(TermDocs* termDocs) : termDocs_(termDocs) { + } + + virtual ~TermIterator() = default; + + bool isEmpty() { + return termDocs_ == nullptr; + } + + int32_t docID() override { + uint32_t docId = termDocs_->doc(); + return docId >= INT_MAX ? INT_MAX : docId; + } + + int32_t nextDoc() override { + if (termDocs_->next()) { + return termDocs_->doc(); + } + return INT_MAX; + } + + int32_t advance(int32_t target) override { + if (termDocs_->skipTo(target)) { + return termDocs_->doc(); + } + return INT_MAX; + } + + int32_t docFreq() const override { + return termDocs_->docFreq(); + } + + bool readRange(DocRange* docRange) const override { + return termDocs_->readRange(docRange); + } + +private: + TermDocs* termDocs_ = nullptr; +}; \ No newline at end of file diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index e2264120..e1c13305 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -199,6 +199,8 @@ SET(clucene_core_Files ./CLucene/search/spans/SpanWeight.cpp ./CLucene/search/spans/SpanWeight.h ./CLucene/search/spans/TermSpans.cpp + ./CLucene/search/query/DcoIdSetIterator.h + ./CLucene/search/query/TermIterator.h ) #if USE_SHARED_OBJECT_FILES then we link directly to the object files (means rebuilding them for the core) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org