This is an automated email from the ASF dual-hosted git repository.

airborne pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git


The following commit(s) were added to refs/heads/clucene by this push:
     new fa3b5944e48 [opt](inverted index) Enhance I/O statistics collection 
for the inverted index in file cache scenarios (#295)
fa3b5944e48 is described below

commit fa3b5944e4827faaee11441a927a4985dd8efd7d
Author: zzzxl <yangs...@selectdb.com>
AuthorDate: Tue Mar 18 19:19:24 2025 +0800

    [opt](inverted index) Enhance I/O statistics collection for the inverted 
index in file cache scenarios (#295)
---
 src/core/CLucene/index/IndexReader.h          |   4 +-
 src/core/CLucene/index/MultiReader.cpp        |  12 +-
 src/core/CLucene/index/MultiReader.h          |   4 +-
 src/core/CLucene/index/MultiSegmentReader.cpp |  20 ++-
 src/core/CLucene/index/SegmentReader.cpp      |   8 +-
 src/core/CLucene/index/TermInfosReader.cpp    |   7 +-
 src/core/CLucene/index/Terms.h                |   2 +
 src/core/CLucene/index/_MultiSegmentReader.h  |   9 +-
 src/core/CLucene/index/_SegmentHeader.h       |   4 +-
 src/core/CLucene/index/_TermInfosReader.h     |   2 +-
 src/core/CLucene/util/bkd/bkd_reader.cpp      |   1 +
 src/core/CLucene/util/bkd/bkd_reader.h        |   2 +
 src/test/CMakeLists.txt                       |   1 +
 src/test/index/TestIndexReader2.cpp           | 209 ++++++++++++++++++++++++++
 src/test/test.h                               |   1 +
 src/test/tests.cpp                            |   1 +
 16 files changed, 262 insertions(+), 25 deletions(-)

diff --git a/src/core/CLucene/index/IndexReader.h 
b/src/core/CLucene/index/IndexReader.h
index 8f78ca53b07..ab453384238 100644
--- a/src/core/CLucene/index/IndexReader.h
+++ b/src/core/CLucene/index/IndexReader.h
@@ -543,7 +543,7 @@ public:
   * @throws IOException if there is a low-level IO error
        * @memory Caller must clean up
        */
-       virtual TermEnum* terms() = 0;
+       virtual TermEnum* terms(const void* io_ctx = nullptr) = 0;
 
 /** Returns an enumeration of all terms starting at a given term. If
   * the given term does not exist, the enumeration is positioned at the
@@ -553,7 +553,7 @@ public:
   * @throws IOException if there is a low-level IO error
        * @memory Caller must clean up
        */
-       virtual TermEnum* terms(const Term* t) = 0;
+       virtual TermEnum* terms(const Term* t, const void* io_ctx = nullptr) = 
0;
 
   /** Returns the number of documents containing the term <code>t</code>.
    * @throws IOException if there is a low-level IO error
diff --git a/src/core/CLucene/index/MultiReader.cpp 
b/src/core/CLucene/index/MultiReader.cpp
index 101e8012fa0..963169d3eb2 100644
--- a/src/core/CLucene/index/MultiReader.cpp
+++ b/src/core/CLucene/index/MultiReader.cpp
@@ -253,14 +253,18 @@ void MultiReader::doSetNorm(int32_t n, const TCHAR* 
field, uint8_t value){
        (*subReaders)[i]->setNorm(n-starts[i], field, value); // dispatch
 }
 
-TermEnum* MultiReader::terms() {
+TermEnum* MultiReader::terms(const void* io_ctx) {
   ensureOpen();
-       return _CLNEW MultiTermEnum(subReaders, starts, NULL);
+       auto* ret = _CLNEW MultiTermEnum(subReaders, starts, NULL);
+  ret->setIoContext(io_ctx);
+  return ret;
 }
 
-TermEnum* MultiReader::terms(const Term* term) {
+TermEnum* MultiReader::terms(const Term* term, const void* io_ctx) {
     ensureOpen();
-       return _CLNEW MultiTermEnum(subReaders, starts, term);
+       auto* ret = _CLNEW MultiTermEnum(subReaders, starts, term);
+  ret->setIoContext(io_ctx);
+  return ret;
 }
 
 int32_t MultiReader::docFreq(const Term* t) {
diff --git a/src/core/CLucene/index/MultiReader.h 
b/src/core/CLucene/index/MultiReader.h
index dcfb9e66bd9..aa9c440d72c 100644
--- a/src/core/CLucene/index/MultiReader.h
+++ b/src/core/CLucene/index/MultiReader.h
@@ -95,8 +95,8 @@ public:
        uint8_t* norms(const TCHAR* field);
        void norms(const TCHAR* field, uint8_t* result);
     FieldInfos* getFieldInfos();
-       TermEnum* terms();
-       TermEnum* terms(const Term* term);
+       TermEnum* terms(const void* io_ctx = nullptr);
+       TermEnum* terms(const Term* term, const void* io_ctx = nullptr);
 
        //Returns the document frequency of the current term in the set
        int32_t docFreq(const Term* t=NULL);
diff --git a/src/core/CLucene/index/MultiSegmentReader.cpp 
b/src/core/CLucene/index/MultiSegmentReader.cpp
index e5822a688c3..e5987023c94 100644
--- a/src/core/CLucene/index/MultiSegmentReader.cpp
+++ b/src/core/CLucene/index/MultiSegmentReader.cpp
@@ -205,6 +205,10 @@ MultiSegmentReader::~MultiSegmentReader() {
 const char* MultiTermEnum::getObjectName() const{ return getClassName(); }
 const char* MultiTermEnum::getClassName(){ return "MultiTermEnum"; }
 
+void MultiTermEnum::setIoContext(const void* io_ctx) { // Stores the opaque I/O context pointer in io_ctx_.
+  io_ctx_ = io_ctx; // NOTE(review): the MultiTermEnum constructor passes io_ctx_ to reader->terms(...), but terms() calls setIoContext() only after _CLNEW returns, so the constructor appears to see the nullptr default -- confirm the sub-enumerators actually receive the context.
+}
+
  DirectoryIndexReader* MultiSegmentReader::doReopen(SegmentInfos* infos){
    SCOPED_LOCK_MUTEX(THIS_LOCK)
     if (infos->size() == 1) {
@@ -337,14 +341,18 @@ void MultiSegmentReader::doSetNorm(int32_t n, const 
TCHAR* field, uint8_t value)
        (*subReaders)[i]->setNorm(n-starts[i], field, value); // dispatch
 }
 
-TermEnum* MultiSegmentReader::terms() {
+TermEnum* MultiSegmentReader::terms(const void* io_ctx) {
   ensureOpen();
-       return _CLNEW MultiTermEnum(subReaders, starts, NULL);
+       auto* ret = _CLNEW MultiTermEnum(subReaders, starts, NULL);
+       ret->setIoContext(io_ctx);
+       return ret;
 }
 
-TermEnum* MultiSegmentReader::terms(const Term* term) {
+TermEnum* MultiSegmentReader::terms(const Term* term, const void* io_ctx) {
     ensureOpen();
-       return _CLNEW MultiTermEnum(subReaders, starts, term);
+       auto* ret = _CLNEW MultiTermEnum(subReaders, starts, term);
+       ret->setIoContext(io_ctx);
+       return ret;
 }
 
 int32_t MultiSegmentReader::docFreq(const Term* t) {
@@ -781,10 +789,10 @@ MultiTermEnum::MultiTermEnum(ArrayBase<IndexReader*>* 
subReaders, const int32_t
                //Check if the enumeration must start from term t
                if (t != NULL) {
                        //termEnum is an enumeration of terms starting at or 
after the named term t
-                       termEnum = reader->terms(t);
+                       termEnum = reader->terms(t, io_ctx_);
                }else{
                        //termEnum is an enumeration of all the Terms and 
TermInfos in the set.
-                       termEnum = reader->terms();
+                       termEnum = reader->terms(io_ctx_);
                }
 
                //Instantiate an new SegmentMerginfo
diff --git a/src/core/CLucene/index/SegmentReader.cpp 
b/src/core/CLucene/index/SegmentReader.cpp
index ec0592370cf..2257ff9fc7c 100644
--- a/src/core/CLucene/index/SegmentReader.cpp
+++ b/src/core/CLucene/index/SegmentReader.cpp
@@ -445,7 +445,7 @@ void SegmentReader::files(vector<string> &retarray) {
     retarray.insert(retarray.end(), tmp.begin(), tmp.end());
 }
 
-TermEnum *SegmentReader::terms() {
+TermEnum *SegmentReader::terms(const void* io_ctx) {
     //Func - Returns an enumeration of all the Terms and TermInfos in the set.
     //Pre  - tis != NULL
     //Post - An enumeration of all the Terms and TermInfos in the set has been 
returned
@@ -453,10 +453,10 @@ TermEnum *SegmentReader::terms() {
     CND_PRECONDITION(tis != NULL, "tis is NULL");
 
     ensureOpen();
-    return tis->terms();
+    return tis->terms(nullptr, io_ctx);
 }
 
-TermEnum *SegmentReader::terms(const Term *t) {
+TermEnum *SegmentReader::terms(const Term *t, const void* io_ctx) {
     //Func - Returns an enumeration of terms starting at or after the named 
term t
     //Pre  - t != NULL
     //       tis != NULL
@@ -466,7 +466,7 @@ TermEnum *SegmentReader::terms(const Term *t) {
     CND_PRECONDITION(tis != NULL, "tis is NULL");
 
     ensureOpen();
-    return tis->terms(t);
+    return tis->terms(t, io_ctx);
 }
 
 bool SegmentReader::document(int32_t n, Document &doc, const FieldSelector 
*fieldSelector) {
diff --git a/src/core/CLucene/index/TermInfosReader.cpp 
b/src/core/CLucene/index/TermInfosReader.cpp
index 23b24ff690f..117b29bcc9a 100644
--- a/src/core/CLucene/index/TermInfosReader.cpp
+++ b/src/core/CLucene/index/TermInfosReader.cpp
@@ -294,7 +294,7 @@ int64_t TermInfosReader::getPosition(const Term* term) {
         return -1;
 }
 
-SegmentTermEnum* TermInfosReader::terms(const Term* term) {
+SegmentTermEnum* TermInfosReader::terms(const Term* term, const void* io_ctx) {
     //Func - Returns an enumeration of terms starting at or after the named 
term.
     //       If term is null then enumerator is set to the beginning
     //Pre  - term holds a valid reference to a Term
@@ -304,7 +304,7 @@ SegmentTermEnum* TermInfosReader::terms(const Term* term) {
     SegmentTermEnum* enumerator = NULL;
     if (term != NULL) {
         //Seek enumerator to term; delete the new TermInfo that's returned.
-        TermInfo* ti = get(term);
+        TermInfo* ti = get(term, io_ctx);
         _CLLDELETE(ti);
         enumerator = getEnum();
     } else
@@ -312,6 +312,9 @@ SegmentTermEnum* TermInfosReader::terms(const Term* term) {
 
     //Clone the entire enumeration
     SegmentTermEnum* cln = enumerator->clone();
+    if (cln) {
+        cln->setIoContext(io_ctx);
+    }
 
     //Check if cln points to a valid instance
     CND_CONDITION(cln != NULL, "cln is NULL");
diff --git a/src/core/CLucene/index/Terms.h b/src/core/CLucene/index/Terms.h
index bf71ad99a2d..0af1102874c 100644
--- a/src/core/CLucene/index/Terms.h
+++ b/src/core/CLucene/index/Terms.h
@@ -130,6 +130,8 @@ public:
        * Some implementations are considerably more efficient than that.
        */
        virtual bool skipTo(Term* target);
+
+       virtual void setIoContext(const void*) {}
 };
 
 
diff --git a/src/core/CLucene/index/_MultiSegmentReader.h 
b/src/core/CLucene/index/_MultiSegmentReader.h
index d004044b0e1..830315208c2 100644
--- a/src/core/CLucene/index/_MultiSegmentReader.h
+++ b/src/core/CLucene/index/_MultiSegmentReader.h
@@ -99,8 +99,8 @@ public:
        void norms(const TCHAR* field, uint8_t* result);
     FieldInfos* getFieldInfos();
 
-       TermEnum* terms();
-       TermEnum* terms(const Term* term);
+       TermEnum* terms(const void* io_ctx = nullptr);
+       TermEnum* terms(const Term* term, const void* io_ctx = nullptr);
 
        //Returns the document frequency of the current term in the set
        int32_t docFreq(const Term* t=NULL);
@@ -211,6 +211,11 @@ public:
 
   const char* getObjectName() const;
   static const char* getClassName();
+
+  void setIoContext(const void*) override;
+
+private:
+  const void* io_ctx_ = nullptr;
 };
 
 
diff --git a/src/core/CLucene/index/_SegmentHeader.h 
b/src/core/CLucene/index/_SegmentHeader.h
index 740023200c5..a3aaf1cc95b 100644
--- a/src/core/CLucene/index/_SegmentHeader.h
+++ b/src/core/CLucene/index/_SegmentHeader.h
@@ -467,9 +467,9 @@ public:
   ///Returns all file names managed by this SegmentReader
   void files(std::vector<std::string>& retarray);
   ///Returns an enumeration of all the Terms and TermInfos in the set.
-  TermEnum* terms();
+  TermEnum* terms(const void* io_ctx = nullptr);
   ///Returns an enumeration of terms starting at or after the named term t
-  TermEnum* terms(const Term* t);
+  TermEnum* terms(const Term* t, const void* io_ctx = nullptr);
 
   ///Gets the document identified by n
   bool document(int32_t n, CL_NS(document)::Document& doc, const 
CL_NS(document)::FieldSelector* fieldSelector);
diff --git a/src/core/CLucene/index/_TermInfosReader.h 
b/src/core/CLucene/index/_TermInfosReader.h
index efc826c5420..5ee9e72981f 100644
--- a/src/core/CLucene/index/_TermInfosReader.h
+++ b/src/core/CLucene/index/_TermInfosReader.h
@@ -100,7 +100,7 @@ CL_NS_DEF(index)
                * If no term is specified, an enumeration of all the Terms 
                * and TermInfos in the set is returned.
                */
-               SegmentTermEnum* terms(const Term* term=NULL);
+               SegmentTermEnum* terms(const Term* term=NULL, const void* 
io_ctx = nullptr);
                
                /** Returns the TermInfo for a Term in the set, or null. */
                TermInfo* get(const Term* term, const void* io_ctx = nullptr);
diff --git a/src/core/CLucene/util/bkd/bkd_reader.cpp 
b/src/core/CLucene/util/bkd/bkd_reader.cpp
index 30af9515f35..b4dde7a1bb4 100644
--- a/src/core/CLucene/util/bkd/bkd_reader.cpp
+++ b/src/core/CLucene/util/bkd/bkd_reader.cpp
@@ -152,6 +152,7 @@ 
bkd_reader::intersect_state::intersect_state(store::IndexInput *in,
                                              bkd_reader::intersect_visitor 
*visitor,
                                              index_tree* indexVisitor) {
     in_ = std::unique_ptr<store::IndexInput>(in);
+    in_->setIoContext(visitor->get_io_context());
     visitor_ = visitor;
     common_prefix_lengths_.resize(numDims);
     docid_set_iterator = 
std::make_unique<bkd_docid_set_iterator>(maxPointsInLeafNode);
diff --git a/src/core/CLucene/util/bkd/bkd_reader.h 
b/src/core/CLucene/util/bkd/bkd_reader.h
index 55a76f0853c..2221efccab1 100644
--- a/src/core/CLucene/util/bkd/bkd_reader.h
+++ b/src/core/CLucene/util/bkd/bkd_reader.h
@@ -79,6 +79,8 @@ public:
         virtual void inc_hits(int count) {}
 
         virtual bool only_hits() { return false; }
+
+        virtual const void* get_io_context() { return nullptr; }
     };
     class intersect_state final {
     public:
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
index bcc14a8ea3a..53969b44878 100644
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@@ -94,6 +94,7 @@ SET(test_files ./tests.cpp
         ./index/TestIndexWriter.cpp
         ./index/TestIndexModifier.cpp
         ./index/TestIndexReader.cpp
+        ./index/TestIndexReader2.cpp
         ./index/TestThreading.cpp
         ./index/TestUtf8.cpp
         ./index/TestHighFreqTerms.cpp
diff --git a/src/test/index/TestIndexReader2.cpp 
b/src/test/index/TestIndexReader2.cpp
new file mode 100644
index 00000000000..dcbd915b800
--- /dev/null
+++ b/src/test/index/TestIndexReader2.cpp
@@ -0,0 +1,209 @@
+#include <CLucene.h> // IWYU pragma: keep
+#include <CLucene/index/IndexReader.h>
+#include <CLucene/search/query/TermPositionIterator.h>
+#include <CLucene/util/stringUtil.h>
+
+#include <cstddef>
+#include <ctime>
+#include <exception>
+#include <fstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "CLucene/analysis/Analyzers.h"
+#include "CLucene/index/FieldConfig.h"
+#include "CLucene/index/IndexVersion.h"
+#include "CLucene/index/Term.h"
+#include "CLucene/store/FSDirectory.h"
+#include "CLucene/store/_RAMDirectory.h"
+#include "CLucene/store/store_v2/ByteArrayDataInput.h"
+#include "CLucene/store/store_v2/GrowableByteArrayDataOutput.h"
+#include "CuTest.h"
+#include "test.h"
+
+CL_NS_USE(search)
+CL_NS_USE(store)
+CL_NS_USE(index)
+CL_NS_USE(util)
+
+static constexpr int32_t doc_count = 100; // File-local constant; no use of it is visible in this test file.
+
+#define FINALLY(eptr, finallyBlock)       \
+    {                                     \
+        finallyBlock;                     \
+        if (eptr) {                       \
+            std::rethrow_exception(eptr); \
+        }                                 \
+    } /* FINALLY: runs finallyBlock first, then rethrows eptr via std::rethrow_exception when eptr is non-null. The tests in this file use _CLFINALLY instead. */
+
+static int32_t getDaySeed() { // Returns a seed derived from the current local date: mktime() of local midnight divided by the number of seconds in a day.
+    std::time_t now = std::time(nullptr);
+    std::tm* localTime = std::localtime(&now); // NOTE(review): the returned pointer is dereferenced below without a null check.
+    localTime->tm_sec = 0; // Zero the time-of-day fields so the result does not change within one local day.
+    localTime->tm_min = 0;
+    localTime->tm_hour = 0;
+    return static_cast<int32_t>(std::mktime(localTime) / (60 * 60 * 24)); // Seconds at local midnight -> whole days, narrowed to int32_t.
+}
+
+static std::string generateRandomIP() { // Builds a dotted-quad string from four rand() % 256 values; no caller is visible in this test file.
+    std::string ip_v4;
+    ip_v4.append(std::to_string(rand() % 256)); // Each octet is in [0, 255].
+    ip_v4.append(".");
+    ip_v4.append(std::to_string(rand() % 256));
+    ip_v4.append(".");
+    ip_v4.append(std::to_string(rand() % 256));
+    ip_v4.append(".");
+    ip_v4.append(std::to_string(rand() % 256));
+    return ip_v4;
+}
+
+static void write_index(const std::string& name, RAMDirectory* dir, // Adds one document per entry of `datas` to an IndexWriter opened on `dir`, using a single field named `name`.
+                        const std::vector<std::string>& datas) {
+    auto* analyzer = _CLNEW lucene::analysis::SimpleAnalyzer<char>;
+    analyzer->set_stopwords(nullptr); // No stopword set is supplied.
+    auto* indexwriter = _CLNEW lucene::index::IndexWriter(dir, analyzer, true); // NOTE(review): third argument presumably means "create a new index" -- confirm against IndexWriter.
+    indexwriter->setRAMBufferSizeMB(512);
+    indexwriter->setMaxBufferedDocs(-1);
+    indexwriter->setMaxFieldLength(0x7FFFFFFFL);
+    indexwriter->setMergeFactor(1000000000);
+    indexwriter->setUseCompoundFile(false);
+
+    auto* char_string_reader = _CLNEW lucene::util::SStringReader<char>;
+
+    auto* doc = _CLNEW lucene::document::Document(); // One Document/Field pair is created here and reused for every entry below.
+    int32_t field_config = lucene::document::Field::STORE_NO;
+    field_config |= lucene::document::Field::INDEX_NONORMS;
+    field_config |= lucene::document::Field::INDEX_TOKENIZED;
+    auto field_name = std::wstring(name.begin(), name.end()); // Element-wise char -> wchar_t copy of the field name.
+    auto* field = _CLNEW lucene::document::Field(field_name.c_str(), 
field_config);
+    field->setOmitTermFreqAndPositions(false);
+    doc->add(*field);
+
+    for (const auto& data : datas) { // Per entry: point the reader at the bytes, obtain a token stream, set it as the field value, add the document.
+        char_string_reader->init(data.data(), data.size(), false);
+        auto* stream = analyzer->reusableTokenStream(field->name(), 
char_string_reader);
+        field->setValue(stream);
+        indexwriter->addDocument(doc);
+    }
+
+    indexwriter->close(); // Close the writer before releasing the helper objects.
+
+    _CLLDELETE(indexwriter);
+    _CLLDELETE(doc);
+    _CLLDELETE(analyzer);
+    _CLLDELETE(char_string_reader);
+}
+
+struct MockIOContxt { // Stand-in object whose address the tests pass as the opaque io_ctx pointer.
+    int64_t count = 0; // Not read or written by the tests in this file.
+};
+
+void TestIndexRead(CuTest* tc) { // Indexes nine values into a RAMDirectory, enumerates all terms through reader->terms(&io_ctx), and checks the number of non-null terms.
+    std::srand(getDaySeed()); // Seeds rand(); no rand() call is visible in this test.
+
+    std::string name = "name";
+    std::vector<std::string> datas;
+    datas.push_back("a1");
+    datas.push_back("a2");
+    datas.push_back("a3");
+    datas.push_back("a4");
+    datas.push_back("a5");
+    datas.push_back("a6");
+    datas.push_back("a7");
+    datas.push_back("a8");
+    datas.push_back("a9");
+
+    RAMDirectory dir;
+    write_index(name, &dir, datas);
+
+    {
+        auto* reader = IndexReader::open(&dir);
+
+        MockIOContxt io_ctx;
+        TermEnum* enumerator = reader->terms(&io_ctx); // Passes the mock's address as the opaque io_ctx argument.
+
+        int32_t count = 0;
+        Term* lastTerm = nullptr;
+        try {
+            do { // term() is read before the first next(), so a null term at the initial position is simply not counted.
+                lastTerm = enumerator->term();
+                if (lastTerm != nullptr) {
+                    count++;
+                }
+                _CLDECDELETE(lastTerm); // Presumably releases the reference returned by term() -- confirm against the macro definition.
+            } while (enumerator->next());
+        }
+        _CLFINALLY({
+            enumerator->close();
+            _CLDELETE(enumerator);
+        });
+        assertEquals(count, 10); // NOTE(review): nine values are indexed above but 10 non-null terms are asserted -- confirm where the tenth comes from.
+
+        reader->close();
+        _CLLDELETE(reader);
+    }
+
+    std::cout << "\nTestIndexRead sucess" << std::endl;
+}
+
+void TestIndexReadSeek(CuTest* tc) { // Indexes nine values, opens an enumeration via reader->terms(prefix_term, &io_ctx) for the term "a5", and checks the number of non-null terms.
+    std::srand(getDaySeed()); // Seeds rand(); no rand() call is visible in this test.
+
+    std::string name = "name";
+    std::vector<std::string> datas;
+    datas.push_back("a1");
+    datas.push_back("a2");
+    datas.push_back("a3");
+    datas.push_back("a4");
+    datas.push_back("a5");
+    datas.push_back("a6");
+    datas.push_back("a7");
+    datas.push_back("a8");
+    datas.push_back("a9");
+
+    RAMDirectory dir;
+    write_index(name, &dir, datas);
+
+    {
+        auto* reader = IndexReader::open(&dir);
+
+        std::wstring ws_prefix = StringUtil::string_to_wstring("a5");
+        Term* prefix_term = _CLNEW Term(L"name", ws_prefix.c_str()); // Start term: field "name", text "a5".
+
+        MockIOContxt io_ctx;
+        TermEnum* enumerator = reader->terms(prefix_term, &io_ctx); // Passes the mock's address as the opaque io_ctx argument.
+
+        int32_t count = 0;
+        Term* lastTerm = nullptr;
+        try {
+            do { // term() is read before the first next(); only non-null terms are counted.
+                lastTerm = enumerator->term();
+                if (lastTerm != nullptr) {
+                    count++;
+                }
+                _CLDECDELETE(lastTerm); // Presumably releases the reference returned by term() -- confirm against the macro definition.
+            } while (enumerator->next());
+        }
+        _CLFINALLY({
+            enumerator->close();
+            _CLDELETE(enumerator);
+            _CLDECDELETE(prefix_term);
+        });
+        assertEquals(count, 5); // Five of the indexed values ("a5" through "a9") sort at or after "a5"; presumably those are the terms counted.
+
+        reader->close();
+        _CLLDELETE(reader);
+    }
+
+    std::cout << "\nTestIndexReadSeek sucess" << std::endl;
+}
+
+CuSuite* testIndexReader2() { // Builds and returns the CuSuite containing the two io_ctx enumeration tests defined in this file.
+    CuSuite* suite = CuSuiteNew(_T("CLucene Index Reader Test"));
+
+    SUITE_ADD_TEST(suite, TestIndexRead);
+    SUITE_ADD_TEST(suite, TestIndexReadSeek);
+
+    return suite;
+}
diff --git a/src/test/test.h b/src/test/test.h
index 4e3915a4ce1..40cd4bf9180 100644
--- a/src/test/test.h
+++ b/src/test/test.h
@@ -91,6 +91,7 @@ CuSuite *testByteArrayDataInputSuite(void);
 CuSuite *testGrowableByteArrayDataOutputSuite(void);
 CuSuite *testICU(void);
 CuSuite *testUTF8CharsSuite(void);
+CuSuite *testIndexReader2(void);
 
 #ifdef TEST_CONTRIB_LIBS
 //CuSuite *testGermanAnalyzer(void);
diff --git a/src/test/tests.cpp b/src/test/tests.cpp
index 66760321c9e..d52d05ca108 100644
--- a/src/test/tests.cpp
+++ b/src/test/tests.cpp
@@ -26,4 +26,5 @@ unittest tests[] = {{"analysis", testanalysis},
 #ifdef TEST_CONTRIB_LIBS
                     {"chinese", testchinese},
 #endif
+                    {"TestIndexReader2", testIndexReader2},
                     {"LastTest", NULL}};


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to