This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git


The following commit(s) were added to refs/heads/clucene by this push:
     new 3bc310b2855 [opt](inverted index) add performance profiling for remote io access in inverted index (#250)
3bc310b2855 is described below

commit 3bc310b285514eed5ca12b6726d2d070a32e2d10
Author: zzzxl <yangs...@selectdb.com>
AuthorDate: Tue Nov 12 15:03:02 2024 +0800

    [opt](inverted index) add performance profiling for remote io access in inverted index (#250)
---
 src/core/CLucene/index/IndexReader.cpp          |  8 ++++----
 src/core/CLucene/index/IndexReader.h            |  8 ++++----
 src/core/CLucene/index/MultiReader.cpp          |  6 ++++--
 src/core/CLucene/index/MultiReader.h            |  4 ++--
 src/core/CLucene/index/MultiSegmentReader.cpp   | 14 ++++++++++----
 src/core/CLucene/index/SegmentReader.cpp        | 12 ++++++++----
 src/core/CLucene/index/SegmentTermDocs.cpp      | 14 +++++++++++++-
 src/core/CLucene/index/SegmentTermEnum.cpp      |  6 ++++++
 src/core/CLucene/index/SegmentTermPositions.cpp |  5 +++++
 src/core/CLucene/index/SkipListReader.cpp       |  6 ++++++
 src/core/CLucene/index/TermInfosReader.cpp      |  7 +++++++
 src/core/CLucene/index/Terms.h                  |  2 ++
 src/core/CLucene/index/_MultiSegmentReader.h    |  9 +++++++--
 src/core/CLucene/index/_SegmentHeader.h         | 11 ++++++++---
 src/core/CLucene/index/_SegmentTermEnum.h       |  2 ++
 src/core/CLucene/index/_SkipListReader.h        |  2 ++
 src/core/CLucene/index/_TermInfosReader.h       |  5 +++++
 src/core/CLucene/store/IndexInput.h             |  3 +++
 18 files changed, 98 insertions(+), 26 deletions(-)

diff --git a/src/core/CLucene/index/IndexReader.cpp 
b/src/core/CLucene/index/IndexReader.cpp
index 5b9f8ad2624..584957ee3b1 100644
--- a/src/core/CLucene/index/IndexReader.cpp
+++ b/src/core/CLucene/index/IndexReader.cpp
@@ -251,7 +251,7 @@ CL_NS_DEF(index)
     return SegmentInfos::getCurrentSegmentGeneration(directory) != -1;
   }
 
-  TermDocs* IndexReader::termDocs(Term* term) {
+  TermDocs* IndexReader::termDocs(Term* term, const void* io_ctx) {
   //Func - Returns an enumeration of all the documents which contain
   //       term. For each document, the document number, the frequency of
   //       the term in that document is also provided, for use in search 
scoring.
@@ -268,14 +268,14 @@ CL_NS_DEF(index)
 
       ensureOpen();
       //Reference an instantiated TermDocs instance
-      TermDocs* _termDocs = termDocs();
+      TermDocs* _termDocs = termDocs(io_ctx);
       //Seek all documents containing term
       _termDocs->seek(term);
       //return the enumaration
       return _termDocs;
   }
 
-  TermPositions* IndexReader::termPositions(Term* term){
+  TermPositions* IndexReader::termPositions(Term* term, const void* io_ctx){
   //Func - Returns an enumeration of all the documents which contain  term. 
For each
   //       document, in addition to the document number and frequency of the 
term in
   //       that document, a list of all of the ordinal positions of the term 
in the document
@@ -294,7 +294,7 @@ CL_NS_DEF(index)
 
       ensureOpen();
       //Reference an instantiated termPositions instance
-      TermPositions* _termPositions = termPositions();
+      TermPositions* _termPositions = termPositions(io_ctx);
          //Seek all documents containing term
       _termPositions->seek(term);
          //return the enumeration
diff --git a/src/core/CLucene/index/IndexReader.h 
b/src/core/CLucene/index/IndexReader.h
index 4307a0d9332..8f78ca53b07 100644
--- a/src/core/CLucene/index/IndexReader.h
+++ b/src/core/CLucene/index/IndexReader.h
@@ -564,7 +564,7 @@ public:
    * @throws IOException if there is a low-level IO error
         * @memory Caller must clean up
         */
-       virtual TermPositions* termPositions() = 0;
+       virtual TermPositions* termPositions(const void* io_ctx = nullptr) = 0;
 
     /** Returns an enumeration of all the documents which contain
        * <code>term</code>.  For each document, in addition to the document 
number
@@ -584,13 +584,13 @@ public:
   * @throws IOException if there is a low-level IO error
   * @memory Caller must clean up
        */
-       TermPositions* termPositions(Term* term);
+       TermPositions* termPositions(Term* term, const void* io_ctx = nullptr);
 
        /** Returns an unpositioned {@link TermDocs} enumerator.
    * @throws IOException if there is a low-level IO error
         * @memory Caller must clean up
         */
-       virtual TermDocs* termDocs() = 0;
+       virtual TermDocs* termDocs(const void* io_ctx = nullptr) = 0;
 
        /** Returns an enumeration of all the documents which contain
        * <code>term</code>. For each document, the document number, the 
frequency of
@@ -602,7 +602,7 @@ public:
   * @throws IOException if there is a low-level IO error
   * @memory Caller must clean up
        */
-       TermDocs* termDocs(Term* term);
+       TermDocs* termDocs(Term* term, const void* io_ctx = nullptr);
 
        /** Deletes the document numbered <code>docNum</code>.  Once a document 
is
        * deleted it will not appear in TermDocs or TermPostitions enumerations.
diff --git a/src/core/CLucene/index/MultiReader.cpp 
b/src/core/CLucene/index/MultiReader.cpp
index 726b6e3dac5..101e8012fa0 100644
--- a/src/core/CLucene/index/MultiReader.cpp
+++ b/src/core/CLucene/index/MultiReader.cpp
@@ -271,15 +271,17 @@ int32_t MultiReader::docFreq(const Term* t) {
        return total;
 }
 
-TermDocs* MultiReader::termDocs() {
+TermDocs* MultiReader::termDocs(const void* io_ctx) {
     ensureOpen();
        TermDocs* ret =  _CLNEW MultiTermDocs(subReaders, starts);
+  ret->setIoContext(io_ctx);
        return ret;
 }
 
-TermPositions* MultiReader::termPositions() {
+TermPositions* MultiReader::termPositions(const void* io_ctx) {
     ensureOpen();
        TermPositions* ret = (TermPositions*)_CLNEW 
MultiTermPositions(subReaders, starts);
+  ret->setIoContext(io_ctx);
        return ret;
 }
 
diff --git a/src/core/CLucene/index/MultiReader.h 
b/src/core/CLucene/index/MultiReader.h
index 301d1422e2c..dcfb9e66bd9 100644
--- a/src/core/CLucene/index/MultiReader.h
+++ b/src/core/CLucene/index/MultiReader.h
@@ -100,8 +100,8 @@ public:
 
        //Returns the document frequency of the current term in the set
        int32_t docFreq(const Term* t=NULL);
-       TermDocs* termDocs();
-       TermPositions* termPositions();
+       TermDocs* termDocs(const void* io_ctx = nullptr);
+       TermPositions* termPositions(const void* io_ctx = nullptr);
 
        /**
        * @see IndexReader#getFieldNames(IndexReader.FieldOption fldOption)
diff --git a/src/core/CLucene/index/MultiSegmentReader.cpp 
b/src/core/CLucene/index/MultiSegmentReader.cpp
index b4be5f01298..e5822a688c3 100644
--- a/src/core/CLucene/index/MultiSegmentReader.cpp
+++ b/src/core/CLucene/index/MultiSegmentReader.cpp
@@ -355,15 +355,17 @@ int32_t MultiSegmentReader::docFreq(const Term* t) {
        return total;
 }
 
-TermDocs* MultiSegmentReader::termDocs() {
+TermDocs* MultiSegmentReader::termDocs(const void* io_ctx) {
     ensureOpen();
        TermDocs* ret =  _CLNEW MultiTermDocs(subReaders, starts);
+       ret->setIoContext(io_ctx);
        return ret;
 }
 
-TermPositions* MultiSegmentReader::termPositions() {
+TermPositions* MultiSegmentReader::termPositions(const void* io_ctx) {
     ensureOpen();
        TermPositions* ret = static_cast<TermPositions*>(_CLNEW 
MultiTermPositions(subReaders, starts));
+       ret->setIoContext(io_ctx);
        return ret;
 }
 
@@ -559,6 +561,10 @@ int32_t MultiTermDocs::docFreq() {
        return docFreq;
 }
 
+void MultiTermDocs::setIoContext(const void* io_ctx) { // remember the caller's opaque IO context
+       io_ctx_ = io_ctx; // stored only; passed on later via reader->termDocs(io_ctx_) / reader->termPositions(io_ctx_)
+}
+
 int32_t MultiTermDocs::doc() const {
   CND_PRECONDITION(current!=NULL,"current==NULL, check that next() was 
called");
   // if not found term, current will return INT_MAX, we could not add base, 
otherwise it will overflow.
@@ -724,7 +730,7 @@ void MultiTermDocs::close() {
 }
 
 TermDocs* MultiTermDocs::termDocs(IndexReader* reader) {
-       return reader->termDocs();
+       return reader->termDocs(io_ctx_);
 }
 
 TermDocs* MultiTermDocs::termDocs(const int32_t i) {
@@ -920,7 +926,7 @@ TermDocs* MultiTermPositions::termDocs(IndexReader* reader) 
{
 // rather merely producing a SegmentTermDocs via the reader's termDocs
 // method.
 
-       TermPositions* tp = reader->termPositions();
+       TermPositions* tp = reader->termPositions(io_ctx_);
        TermDocs* ret = tp->__asTermDocs();
 
        CND_CONDITION(ret != NULL,
diff --git a/src/core/CLucene/index/SegmentReader.cpp 
b/src/core/CLucene/index/SegmentReader.cpp
index 721263664fa..ec0592370cf 100644
--- a/src/core/CLucene/index/SegmentReader.cpp
+++ b/src/core/CLucene/index/SegmentReader.cpp
@@ -506,22 +506,26 @@ bool SegmentReader::isDeleted(const int32_t n) {
     return ret;
 }
 
-TermDocs *SegmentReader::termDocs() {
+TermDocs *SegmentReader::termDocs(const void* io_ctx) {
     //Func - Returns an unpositioned TermDocs enumerator.
     //Pre  - true
     //Post - An unpositioned TermDocs enumerator has been returned
 
     ensureOpen();
-    return _CLNEW SegmentTermDocs(this);
+    auto* ret = _CLNEW SegmentTermDocs(this);
+    ret->setIoContext(io_ctx);
+    return ret;
 }
 
-TermPositions *SegmentReader::termPositions() {
+TermPositions *SegmentReader::termPositions(const void* io_ctx) {
     //Func - Returns an unpositioned TermPositions enumerator.
     //Pre  - true
     //Post - An unpositioned TermPositions enumerator has been returned
 
     ensureOpen();
-    return _CLNEW SegmentTermPositions(this);
+    auto* ret = _CLNEW SegmentTermPositions(this);
+    ret->setIoContext(io_ctx);
+    return ret;
 }
 
 int32_t SegmentReader::docFreq(const Term *t) {
diff --git a/src/core/CLucene/index/SegmentTermDocs.cpp 
b/src/core/CLucene/index/SegmentTermDocs.cpp
index a761fec2810..4836aed7129 100644
--- a/src/core/CLucene/index/SegmentTermDocs.cpp
+++ b/src/core/CLucene/index/SegmentTermDocs.cpp
@@ -36,6 +36,16 @@ TermPositions *SegmentTermDocs::__asTermPositions() {
     return NULL;
 }
 
+void SegmentTermDocs::setIoContext(const void* io_ctx) { // attach an opaque IO context to the inputs this enumerator reads through
+    if (parent && parent->tis) {
+        parent->tis->setIoContext(io_ctx); // NOTE(review): tis is reached through parent, so this write is visible to anything else using the same reader - confirm that is intended
+    }
+    if (freqStream) {
+        freqStream->setIoContext(io_ctx);
+    }
+    io_ctx_ = io_ctx; // kept so that streams cloned later (skip list reader, prox stream) can be tagged too
+}
+
 int32_t SegmentTermDocs::docFreq() {
     return df;
 }
@@ -159,8 +169,10 @@ bool SegmentTermDocs::skipTo(const int32_t target) {
     assert(count <= df);
 
     if (df >= skipInterval) {// optimized case
-        if (skipListReader == NULL)
+        if (skipListReader == NULL) {
             skipListReader = _CLNEW DefaultSkipListReader(freqStream->clone(), 
maxSkipLevels, skipInterval);// lazily clone
+            skipListReader->setIoContext(io_ctx_);
+        }
 
         if (!haveSkipped) {// lazily initialize skip stream
             skipListReader->init(skipPointer, freqBasePointer, 
proxBasePointer, df, hasProx, currentFieldStoresPayloads);
diff --git a/src/core/CLucene/index/SegmentTermEnum.cpp 
b/src/core/CLucene/index/SegmentTermEnum.cpp
index 8179c7b7806..44ac45e6dc1 100644
--- a/src/core/CLucene/index/SegmentTermEnum.cpp
+++ b/src/core/CLucene/index/SegmentTermEnum.cpp
@@ -420,4 +420,10 @@ void SegmentTermEnum::growBuffer(const uint32_t length, 
bool force_copy) {
     }
 }
 
+void SegmentTermEnum::setIoContext(const void* io_ctx) { // forward the opaque IO context to the underlying input
+    if (input) { // nothing to tag when there is no input
+        input->setIoContext(io_ctx);
+    }
+}
+
 CL_NS_END
diff --git a/src/core/CLucene/index/SegmentTermPositions.cpp 
b/src/core/CLucene/index/SegmentTermPositions.cpp
index 5de0da20add..5b5343c2e0b 100644
--- a/src/core/CLucene/index/SegmentTermPositions.cpp
+++ b/src/core/CLucene/index/SegmentTermPositions.cpp
@@ -27,6 +27,10 @@ SegmentTermPositions::~SegmentTermPositions() {
     close();
 }
 
+void SegmentTermPositions::setIoContext(const void* io_ctx) { // no positions-specific work is done here
+    SegmentTermDocs::setIoContext(io_ctx); // base class stores io_ctx_; lazySkip() applies it to the cloned proxStream
+}
+
 TermDocs* SegmentTermPositions::__asTermDocs(){
     return (TermDocs*) this;
 }
@@ -135,6 +139,7 @@ void SegmentTermPositions::lazySkip() {
     if (proxStream == NULL) {
       // clone lazily
       proxStream = parent->proxStream->clone();
+      proxStream->setIoContext(io_ctx_);
       buffer_.reset(proxStream);
     }
     
diff --git a/src/core/CLucene/index/SkipListReader.cpp 
b/src/core/CLucene/index/SkipListReader.cpp
index 54747564144..c139024b2b1 100644
--- a/src/core/CLucene/index/SkipListReader.cpp
+++ b/src/core/CLucene/index/SkipListReader.cpp
@@ -93,6 +93,12 @@ int32_t MultiLevelSkipListReader::skipTo(const int32_t 
target) {
        return numSkipped[0] - skipInterval[0] - 1;
 }
 
+void MultiLevelSkipListReader::setIoContext(const void* io_ctx) { // tag the skip stream with the opaque IO context
+       if (skipStream[0]) {
+               skipStream[0]->setIoContext(io_ctx); // NOTE(review): only index 0 is tagged - confirm the other skipStream entries do not need it
+       }
+}
+
 bool MultiLevelSkipListReader::loadNextSkip(const int32_t level) {
        // we have to skip, the target document is greater than the current
        // skip list entry
diff --git a/src/core/CLucene/index/TermInfosReader.cpp 
b/src/core/CLucene/index/TermInfosReader.cpp
index 9044d1d36a2..0169b6f18c5 100644
--- a/src/core/CLucene/index/TermInfosReader.cpp
+++ b/src/core/CLucene/index/TermInfosReader.cpp
@@ -234,6 +234,9 @@ TermInfo* TermInfosReader::get(const Term* term) {
 
     // optimize sequential access: first try scanning cached enum w/o seeking
     SegmentTermEnum* enumerator = getEnum();
+    if (enumerator) {
+        enumerator->setIoContext(io_ctx_);
+    }
 
     // optimize sequential access: first try scanning cached enumerator w/o 
seeking
     if (
@@ -265,6 +268,10 @@ TermInfo* TermInfosReader::get(const Term* term) {
     return scanEnum(term);
 }
 
+void TermInfosReader::setIoContext(const void* io_ctx) { // store only; get() applies it to the enumerator it obtains
+    io_ctx_ = io_ctx;
+}
+
 int64_t TermInfosReader::getPosition(const Term* term) {
     //Func - Returns the position of a Term in the set
     //Pre  - term holds a valid reference to a Term
diff --git a/src/core/CLucene/index/Terms.h b/src/core/CLucene/index/Terms.h
index 620105fd617..bf71ad99a2d 100644
--- a/src/core/CLucene/index/Terms.h
+++ b/src/core/CLucene/index/Terms.h
@@ -83,6 +83,8 @@ public:
     */
        virtual TermPositions* __asTermPositions()=0;
 
+       virtual void setIoContext(const void*) {}
+
        virtual int32_t docFreq() {
                _CLTHROWA(CL_ERR_UnsupportedOperation, "TermDocs::docFreq does 
not support this method.");
        }
diff --git a/src/core/CLucene/index/_MultiSegmentReader.h 
b/src/core/CLucene/index/_MultiSegmentReader.h
index c5f8deeea23..d004044b0e1 100644
--- a/src/core/CLucene/index/_MultiSegmentReader.h
+++ b/src/core/CLucene/index/_MultiSegmentReader.h
@@ -104,8 +104,8 @@ public:
 
        //Returns the document frequency of the current term in the set
        int32_t docFreq(const Term* t=NULL);
-       TermDocs* termDocs();
-       TermPositions* termPositions();
+       TermDocs* termDocs(const void* io_ctx = nullptr);
+       TermPositions* termPositions(const void* io_ctx = nullptr);
 
   void getFieldNames (FieldOption fldOption, StringArrayWithDeletor& retarray);
        static void getFieldNames(FieldOption fldOption, 
StringArrayWithDeletor& retarray, CL_NS(util)::ArrayBase<IndexReader*>* 
subReaders);
@@ -173,6 +173,11 @@ public:
   virtual TermPositions* __asTermPositions();
 
   int32_t docFreq() override;
+
+  void setIoContext(const void* io_ctx) override;
+
+protected:
+  const void* io_ctx_ = nullptr;
 };
 
 
diff --git a/src/core/CLucene/index/_SegmentHeader.h 
b/src/core/CLucene/index/_SegmentHeader.h
index 4fa9b3fc04c..740023200c5 100644
--- a/src/core/CLucene/index/_SegmentHeader.h
+++ b/src/core/CLucene/index/_SegmentHeader.h
@@ -171,7 +171,8 @@ private:
 protected:
   bool currentFieldStoresPayloads;
   bool hasProx = false;
-  IndexVersion indexVersion_ = IndexVersion::kV0; 
+  IndexVersion indexVersion_ = IndexVersion::kV0;
+  const void* io_ctx_ = nullptr;
 
 public:
   ///\param Parent must be a segment reader
@@ -197,6 +198,8 @@ public:
 
   virtual TermPositions* __asTermPositions();
 
+  void setIoContext(const void* io_ctx) override;
+
   int32_t docFreq() override;
 
 protected:
@@ -234,6 +237,8 @@ public:
   SegmentTermPositions(const SegmentReader* Parent);
   virtual ~SegmentTermPositions();
 
+  void setIoContext(const void* io_ctx) override;
+
 private:
   void seek(const TermInfo* ti, Term* term);
 
@@ -473,9 +478,9 @@ public:
   bool isDeleted(const int32_t n);
 
   ///Returns an unpositioned TermDocs enumerator.
-  TermDocs* termDocs();
+  TermDocs* termDocs(const void* io_ctx = nullptr);
   ///Returns an unpositioned TermPositions enumerator.
-  TermPositions* termPositions();
+  TermPositions* termPositions(const void* io_ctx = nullptr);
 
   ///Returns the number of documents which contain the term t
   int32_t docFreq(const Term* t);
diff --git a/src/core/CLucene/index/_SegmentTermEnum.h 
b/src/core/CLucene/index/_SegmentTermEnum.h
index 3dd2c8c5b8a..860d466ce49 100644
--- a/src/core/CLucene/index/_SegmentTermEnum.h
+++ b/src/core/CLucene/index/_SegmentTermEnum.h
@@ -123,6 +123,8 @@ public:
 
        int32_t getFormat() { return format; }
 
+       void setIoContext(const void* io_ctx);
+
 private:
        /**
         * Reads the next term in the enumeration
diff --git a/src/core/CLucene/index/_SkipListReader.h 
b/src/core/CLucene/index/_SkipListReader.h
index 5031a9815e4..0ea50614c86 100644
--- a/src/core/CLucene/index/_SkipListReader.h
+++ b/src/core/CLucene/index/_SkipListReader.h
@@ -69,6 +69,8 @@ public:
        */
        int32_t skipTo(const int32_t target);
 
+       void setIoContext(const void* io_ctx);
+
 private:
        bool loadNextSkip(const int32_t level);
 
diff --git a/src/core/CLucene/index/_TermInfosReader.h 
b/src/core/CLucene/index/_TermInfosReader.h
index a9a993795e2..c2de39bb168 100644
--- a/src/core/CLucene/index/_TermInfosReader.h
+++ b/src/core/CLucene/index/_TermInfosReader.h
@@ -49,6 +49,8 @@ CL_NS_DEF(index)
 
         int64_t numBytesUsed;
 
+               const void* io_ctx_ = nullptr;
+
         DEFINE_MUTEX(THIS_LOCK)
 
        public:
@@ -108,6 +110,9 @@ CL_NS_DEF(index)
         int64_t getRAMUsed() const {
             return numBytesUsed;
         }
+
+               void setIoContext(const void* io_ctx = nullptr);
+
     private:
                /** Reads the term info index file or .tti file. */
                void ensureIndexIsRead();
diff --git a/src/core/CLucene/store/IndexInput.h 
b/src/core/CLucene/store/IndexInput.h
index e17f9eb30c9..a67c7af3c6a 100644
--- a/src/core/CLucene/store/IndexInput.h
+++ b/src/core/CLucene/store/IndexInput.h
@@ -134,6 +134,9 @@ CL_NS_DEF(store)
 
                virtual void setIdxFileCache(bool index) {}
 
+               virtual void setIoContext(const void*) {}
+               virtual const void* getIoContext() { return nullptr; } // base input carries no context; an empty body here would be UB (non-void function without return)
+
     };
 
    /** Abstract base class for input from a file in a {@link Directory}.  A


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to