This is an automated email from the ASF dual-hosted git repository. jianliangqi pushed a commit to branch clucene in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push: new 3bc310b2855 [opt](inverted index) add performance profiling for remote io access in inverted index (#250) 3bc310b2855 is described below commit 3bc310b285514eed5ca12b6726d2d070a32e2d10 Author: zzzxl <yangs...@selectdb.com> AuthorDate: Tue Nov 12 15:03:02 2024 +0800 [opt](inverted index) add performance profiling for remote io access in inverted index (#250) --- src/core/CLucene/index/IndexReader.cpp | 8 ++++---- src/core/CLucene/index/IndexReader.h | 8 ++++---- src/core/CLucene/index/MultiReader.cpp | 6 ++++-- src/core/CLucene/index/MultiReader.h | 4 ++-- src/core/CLucene/index/MultiSegmentReader.cpp | 14 ++++++++++---- src/core/CLucene/index/SegmentReader.cpp | 12 ++++++++---- src/core/CLucene/index/SegmentTermDocs.cpp | 14 +++++++++++++- src/core/CLucene/index/SegmentTermEnum.cpp | 6 ++++++ src/core/CLucene/index/SegmentTermPositions.cpp | 5 +++++ src/core/CLucene/index/SkipListReader.cpp | 6 ++++++ src/core/CLucene/index/TermInfosReader.cpp | 7 +++++++ src/core/CLucene/index/Terms.h | 2 ++ src/core/CLucene/index/_MultiSegmentReader.h | 9 +++++++-- src/core/CLucene/index/_SegmentHeader.h | 11 ++++++++--- src/core/CLucene/index/_SegmentTermEnum.h | 2 ++ src/core/CLucene/index/_SkipListReader.h | 2 ++ src/core/CLucene/index/_TermInfosReader.h | 5 +++++ src/core/CLucene/store/IndexInput.h | 3 +++ 18 files changed, 98 insertions(+), 26 deletions(-) diff --git a/src/core/CLucene/index/IndexReader.cpp b/src/core/CLucene/index/IndexReader.cpp index 5b9f8ad2624..584957ee3b1 100644 --- a/src/core/CLucene/index/IndexReader.cpp +++ b/src/core/CLucene/index/IndexReader.cpp @@ -251,7 +251,7 @@ CL_NS_DEF(index) return SegmentInfos::getCurrentSegmentGeneration(directory) != -1; } - TermDocs* IndexReader::termDocs(Term* term) { + TermDocs* IndexReader::termDocs(Term* term, const void* io_ctx) { //Func - Returns an enumeration of all the documents which contain // term. For each document, the document number, the frequency of // the term in that document is also provided, for use in search scoring. @@ -268,14 +268,14 @@ CL_NS_DEF(index) ensureOpen(); //Reference an instantiated TermDocs instance - TermDocs* _termDocs = termDocs(); + TermDocs* _termDocs = termDocs(io_ctx); //Seek all documents containing term _termDocs->seek(term); //return the enumaration return _termDocs; } - TermPositions* IndexReader::termPositions(Term* term){ + TermPositions* IndexReader::termPositions(Term* term, const void* io_ctx){ //Func - Returns an enumeration of all the documents which contain term. For each // document, in addition to the document number and frequency of the term in // that document, a list of all of the ordinal positions of the term in the document @@ -294,7 +294,7 @@ CL_NS_DEF(index) ensureOpen(); //Reference an instantiated termPositions instance - TermPositions* _termPositions = termPositions(); + TermPositions* _termPositions = termPositions(io_ctx); //Seek all documents containing term _termPositions->seek(term); //return the enumeration diff --git a/src/core/CLucene/index/IndexReader.h b/src/core/CLucene/index/IndexReader.h index 4307a0d9332..8f78ca53b07 100644 --- a/src/core/CLucene/index/IndexReader.h +++ b/src/core/CLucene/index/IndexReader.h @@ -564,7 +564,7 @@ public: * @throws IOException if there is a low-level IO error * @memory Caller must clean up */ - virtual TermPositions* termPositions() = 0; + virtual TermPositions* termPositions(const void* io_ctx = nullptr) = 0; /** Returns an enumeration of all the documents which contain * <code>term</code>. For each document, in addition to the document number @@ -584,13 +584,13 @@ public: * @throws IOException if there is a low-level IO error * @memory Caller must clean up */ - TermPositions* termPositions(Term* term); + TermPositions* termPositions(Term* term, const void* io_ctx = nullptr); /** Returns an unpositioned {@link TermDocs} enumerator. * @throws IOException if there is a low-level IO error * @memory Caller must clean up */ - virtual TermDocs* termDocs() = 0; + virtual TermDocs* termDocs(const void* io_ctx = nullptr) = 0; /** Returns an enumeration of all the documents which contain * <code>term</code>. For each document, the document number, the frequency of @@ -602,7 +602,7 @@ public: * @throws IOException if there is a low-level IO error * @memory Caller must clean up */ - TermDocs* termDocs(Term* term); + TermDocs* termDocs(Term* term, const void* io_ctx = nullptr); /** Deletes the document numbered <code>docNum</code>. Once a document is * deleted it will not appear in TermDocs or TermPostitions enumerations. diff --git a/src/core/CLucene/index/MultiReader.cpp b/src/core/CLucene/index/MultiReader.cpp index 726b6e3dac5..101e8012fa0 100644 --- a/src/core/CLucene/index/MultiReader.cpp +++ b/src/core/CLucene/index/MultiReader.cpp @@ -271,15 +271,17 @@ int32_t MultiReader::docFreq(const Term* t) { return total; } -TermDocs* MultiReader::termDocs() { +TermDocs* MultiReader::termDocs(const void* io_ctx) { ensureOpen(); TermDocs* ret = _CLNEW MultiTermDocs(subReaders, starts); + ret->setIoContext(io_ctx); return ret; } -TermPositions* MultiReader::termPositions() { +TermPositions* MultiReader::termPositions(const void* io_ctx) { ensureOpen(); TermPositions* ret = (TermPositions*)_CLNEW MultiTermPositions(subReaders, starts); + ret->setIoContext(io_ctx); return ret; } diff --git a/src/core/CLucene/index/MultiReader.h b/src/core/CLucene/index/MultiReader.h index 301d1422e2c..dcfb9e66bd9 100644 --- a/src/core/CLucene/index/MultiReader.h +++ b/src/core/CLucene/index/MultiReader.h @@ -100,8 +100,8 @@ public: //Returns the document frequency of the current term in the set int32_t docFreq(const Term* t=NULL); - TermDocs* termDocs(); - TermPositions* termPositions(); + TermDocs* termDocs(const void* io_ctx = nullptr); + TermPositions* termPositions(const void* io_ctx = nullptr); /** * @see IndexReader#getFieldNames(IndexReader.FieldOption fldOption) diff --git a/src/core/CLucene/index/MultiSegmentReader.cpp b/src/core/CLucene/index/MultiSegmentReader.cpp index b4be5f01298..e5822a688c3 100644 --- a/src/core/CLucene/index/MultiSegmentReader.cpp +++ b/src/core/CLucene/index/MultiSegmentReader.cpp @@ -355,15 +355,17 @@ int32_t MultiSegmentReader::docFreq(const Term* t) { return total; } -TermDocs* MultiSegmentReader::termDocs() { +TermDocs* MultiSegmentReader::termDocs(const void* io_ctx) { ensureOpen(); TermDocs* ret = _CLNEW MultiTermDocs(subReaders, starts); + ret->setIoContext(io_ctx); return ret; } -TermPositions* MultiSegmentReader::termPositions() { +TermPositions* MultiSegmentReader::termPositions(const void* io_ctx) { ensureOpen(); TermPositions* ret = static_cast<TermPositions*>(_CLNEW MultiTermPositions(subReaders, starts)); + ret->setIoContext(io_ctx); return ret; } @@ -559,6 +561,10 @@ int32_t MultiTermDocs::docFreq() { return docFreq; } +void MultiTermDocs::setIoContext(const void* io_ctx) { + io_ctx_ = io_ctx; +} + int32_t MultiTermDocs::doc() const { CND_PRECONDITION(current!=NULL,"current==NULL, check that next() was called"); // if not found term, current will return INT_MAX, we could not add base, otherwise it will overflow. @@ -724,7 +730,7 @@ void MultiTermDocs::close() { } TermDocs* MultiTermDocs::termDocs(IndexReader* reader) { - return reader->termDocs(); + return reader->termDocs(io_ctx_); } TermDocs* MultiTermDocs::termDocs(const int32_t i) { @@ -920,7 +926,7 @@ TermDocs* MultiTermPositions::termDocs(IndexReader* reader) { // rather merely producing a SegmentTermDocs via the reader's termDocs // method. - TermPositions* tp = reader->termPositions(); + TermPositions* tp = reader->termPositions(io_ctx_); TermDocs* ret = tp->__asTermDocs(); CND_CONDITION(ret != NULL, diff --git a/src/core/CLucene/index/SegmentReader.cpp b/src/core/CLucene/index/SegmentReader.cpp index 721263664fa..ec0592370cf 100644 --- a/src/core/CLucene/index/SegmentReader.cpp +++ b/src/core/CLucene/index/SegmentReader.cpp @@ -506,22 +506,26 @@ bool SegmentReader::isDeleted(const int32_t n) { return ret; } -TermDocs *SegmentReader::termDocs() { +TermDocs *SegmentReader::termDocs(const void* io_ctx) { //Func - Returns an unpositioned TermDocs enumerator. //Pre - true //Post - An unpositioned TermDocs enumerator has been returned ensureOpen(); - return _CLNEW SegmentTermDocs(this); + auto* ret = _CLNEW SegmentTermDocs(this); + ret->setIoContext(io_ctx); + return ret; } -TermPositions *SegmentReader::termPositions() { +TermPositions *SegmentReader::termPositions(const void* io_ctx) { //Func - Returns an unpositioned TermPositions enumerator. //Pre - true //Post - An unpositioned TermPositions enumerator has been returned ensureOpen(); - return _CLNEW SegmentTermPositions(this); + auto* ret = _CLNEW SegmentTermPositions(this); + ret->setIoContext(io_ctx); + return ret; } int32_t SegmentReader::docFreq(const Term *t) { diff --git a/src/core/CLucene/index/SegmentTermDocs.cpp b/src/core/CLucene/index/SegmentTermDocs.cpp index a761fec2810..4836aed7129 100644 --- a/src/core/CLucene/index/SegmentTermDocs.cpp +++ b/src/core/CLucene/index/SegmentTermDocs.cpp @@ -36,6 +36,16 @@ TermPositions *SegmentTermDocs::__asTermPositions() { return NULL; } +void SegmentTermDocs::setIoContext(const void* io_ctx) { + if (parent && parent->tis) { + parent->tis->setIoContext(io_ctx); + } + if (freqStream) { + freqStream->setIoContext(io_ctx); + } + io_ctx_ = io_ctx; +} + int32_t SegmentTermDocs::docFreq() { return df; } @@ -159,8 +169,10 @@ bool SegmentTermDocs::skipTo(const int32_t target) { assert(count <= df); if (df >= skipInterval) {// optimized case - if (skipListReader == NULL) + if (skipListReader == NULL) { skipListReader = _CLNEW DefaultSkipListReader(freqStream->clone(), maxSkipLevels, skipInterval);// lazily clone + skipListReader->setIoContext(io_ctx_); + } if (!haveSkipped) {// lazily initialize skip stream skipListReader->init(skipPointer, freqBasePointer, proxBasePointer, df, hasProx, currentFieldStoresPayloads); diff --git a/src/core/CLucene/index/SegmentTermEnum.cpp b/src/core/CLucene/index/SegmentTermEnum.cpp index 8179c7b7806..44ac45e6dc1 100644 --- a/src/core/CLucene/index/SegmentTermEnum.cpp +++ b/src/core/CLucene/index/SegmentTermEnum.cpp @@ -420,4 +420,10 @@ void SegmentTermEnum::growBuffer(const uint32_t length, bool force_copy) { } } +void SegmentTermEnum::setIoContext(const void* io_ctx) { + if (input) { + input->setIoContext(io_ctx); + } +} + CL_NS_END diff --git a/src/core/CLucene/index/SegmentTermPositions.cpp b/src/core/CLucene/index/SegmentTermPositions.cpp index 5de0da20add..5b5343c2e0b 100644 --- a/src/core/CLucene/index/SegmentTermPositions.cpp +++ b/src/core/CLucene/index/SegmentTermPositions.cpp @@ -27,6 +27,10 @@ SegmentTermPositions::~SegmentTermPositions() { close(); } +void SegmentTermPositions::setIoContext(const void* io_ctx) { + SegmentTermDocs::setIoContext(io_ctx); +} + TermDocs* SegmentTermPositions::__asTermDocs(){ return (TermDocs*) this; } @@ -135,6 +139,7 @@ void SegmentTermPositions::lazySkip() { if (proxStream == NULL) { // clone lazily proxStream = parent->proxStream->clone(); + proxStream->setIoContext(io_ctx_); buffer_.reset(proxStream); } diff --git a/src/core/CLucene/index/SkipListReader.cpp b/src/core/CLucene/index/SkipListReader.cpp index 54747564144..c139024b2b1 100644 --- a/src/core/CLucene/index/SkipListReader.cpp +++ b/src/core/CLucene/index/SkipListReader.cpp @@ -93,6 +93,12 @@ int32_t MultiLevelSkipListReader::skipTo(const int32_t target) { return numSkipped[0] - skipInterval[0] - 1; } +void MultiLevelSkipListReader::setIoContext(const void* io_ctx) { + if (skipStream[0]) { + skipStream[0]->setIoContext(io_ctx); + } +} + bool MultiLevelSkipListReader::loadNextSkip(const int32_t level) { // we have to skip, the target document is greater than the current // skip list entry diff --git a/src/core/CLucene/index/TermInfosReader.cpp b/src/core/CLucene/index/TermInfosReader.cpp index 9044d1d36a2..0169b6f18c5 100644 --- a/src/core/CLucene/index/TermInfosReader.cpp +++ b/src/core/CLucene/index/TermInfosReader.cpp @@ -234,6 +234,9 @@ TermInfo* TermInfosReader::get(const Term* term) { // optimize sequential access: first try scanning cached enum w/o seeking SegmentTermEnum* enumerator = getEnum(); + if (enumerator) { + enumerator->setIoContext(io_ctx_); + } // optimize sequential access: first try scanning cached enumerator w/o seeking if ( @@ -265,6 +268,10 @@ TermInfo* TermInfosReader::get(const Term* term) { return scanEnum(term); } +void TermInfosReader::setIoContext(const void* io_ctx) { + io_ctx_ = io_ctx; +} + int64_t TermInfosReader::getPosition(const Term* term) { //Func - Returns the position of a Term in the set //Pre - term holds a valid reference to a Term diff --git a/src/core/CLucene/index/Terms.h b/src/core/CLucene/index/Terms.h index 620105fd617..bf71ad99a2d 100644 --- a/src/core/CLucene/index/Terms.h +++ b/src/core/CLucene/index/Terms.h @@ -83,6 +83,8 @@ public: */ virtual TermPositions* __asTermPositions()=0; + virtual void setIoContext(const void*) {} + virtual int32_t docFreq() { _CLTHROWA(CL_ERR_UnsupportedOperation, "TermDocs::docFreq does not support this method."); } diff --git a/src/core/CLucene/index/_MultiSegmentReader.h b/src/core/CLucene/index/_MultiSegmentReader.h index c5f8deeea23..d004044b0e1 100644 --- a/src/core/CLucene/index/_MultiSegmentReader.h +++ b/src/core/CLucene/index/_MultiSegmentReader.h @@ -104,8 +104,8 @@ public: //Returns the document frequency of the current term in the set int32_t docFreq(const Term* t=NULL); - TermDocs* termDocs(); - TermPositions* termPositions(); + TermDocs* termDocs(const void* io_ctx = nullptr); + TermPositions* termPositions(const void* io_ctx = nullptr); void getFieldNames (FieldOption fldOption, StringArrayWithDeletor& retarray); static void getFieldNames(FieldOption fldOption, StringArrayWithDeletor& retarray, CL_NS(util)::ArrayBase<IndexReader*>* subReaders); @@ -173,6 +173,11 @@ public: virtual TermPositions* __asTermPositions(); int32_t docFreq() override; + + void setIoContext(const void* io_ctx) override; + +protected: + const void* io_ctx_ = nullptr; }; diff --git a/src/core/CLucene/index/_SegmentHeader.h b/src/core/CLucene/index/_SegmentHeader.h index 4fa9b3fc04c..740023200c5 100644 --- a/src/core/CLucene/index/_SegmentHeader.h +++ b/src/core/CLucene/index/_SegmentHeader.h @@ -171,7 +171,8 @@ private: protected: bool currentFieldStoresPayloads; bool hasProx = false; - IndexVersion indexVersion_ = IndexVersion::kV0; + IndexVersion indexVersion_ = IndexVersion::kV0; + const void* io_ctx_ = nullptr; public: ///\param Parent must be a segment reader @@ -197,6 +198,8 @@ public: virtual TermPositions* __asTermPositions(); + void setIoContext(const void* io_ctx) override; + int32_t docFreq() override; protected: @@ -234,6 +237,8 @@ public: SegmentTermPositions(const SegmentReader* Parent); virtual ~SegmentTermPositions(); + void setIoContext(const void* io_ctx) override; + private: void seek(const TermInfo* ti, Term* term); @@ -473,9 +478,9 @@ public: bool isDeleted(const int32_t n); ///Returns an unpositioned TermDocs enumerator. - TermDocs* termDocs(); + TermDocs* termDocs(const void* io_ctx = nullptr); ///Returns an unpositioned TermPositions enumerator. - TermPositions* termPositions(); + TermPositions* termPositions(const void* io_ctx = nullptr); ///Returns the number of documents which contain the term t int32_t docFreq(const Term* t); diff --git a/src/core/CLucene/index/_SegmentTermEnum.h b/src/core/CLucene/index/_SegmentTermEnum.h index 3dd2c8c5b8a..860d466ce49 100644 --- a/src/core/CLucene/index/_SegmentTermEnum.h +++ b/src/core/CLucene/index/_SegmentTermEnum.h @@ -123,6 +123,8 @@ public: int32_t getFormat() { return format; } + void setIoContext(const void* io_ctx); + private: /** * Reads the next term in the enumeration diff --git a/src/core/CLucene/index/_SkipListReader.h b/src/core/CLucene/index/_SkipListReader.h index 5031a9815e4..0ea50614c86 100644 --- a/src/core/CLucene/index/_SkipListReader.h +++ b/src/core/CLucene/index/_SkipListReader.h @@ -69,6 +69,8 @@ public: */ int32_t skipTo(const int32_t target); + void setIoContext(const void* io_ctx); + private: bool loadNextSkip(const int32_t level); diff --git a/src/core/CLucene/index/_TermInfosReader.h b/src/core/CLucene/index/_TermInfosReader.h index a9a993795e2..c2de39bb168 100644 --- a/src/core/CLucene/index/_TermInfosReader.h +++ b/src/core/CLucene/index/_TermInfosReader.h @@ -49,6 +49,8 @@ CL_NS_DEF(index) int64_t numBytesUsed; + const void* io_ctx_ = nullptr; + DEFINE_MUTEX(THIS_LOCK) public: @@ -108,6 +110,9 @@ CL_NS_DEF(index) int64_t getRAMUsed() const { return numBytesUsed; } + + void setIoContext(const void* io_ctx = nullptr); + private: /** Reads the term info index file or .tti file. */ void ensureIndexIsRead(); diff --git a/src/core/CLucene/store/IndexInput.h b/src/core/CLucene/store/IndexInput.h index e17f9eb30c9..a67c7af3c6a 100644 --- a/src/core/CLucene/store/IndexInput.h +++ b/src/core/CLucene/store/IndexInput.h @@ -134,6 +134,9 @@ CL_NS_DEF(store) virtual void setIdxFileCache(bool index) {} + virtual void setIoContext(const void*) {} + virtual const void* getIoContext() {} + }; /** Abstract base class for input from a file in a {@link Directory}. A --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org