This is an automated email from the ASF dual-hosted git repository. airborne pushed a commit to branch clucene-3.0 in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene-3.0 by this push: new 569398a5c96 [opt](inverted index) Enhance I/O statistics collection for the inverted index in file cache scenarios (#295) (#314) 569398a5c96 is described below commit 569398a5c96b4c626251ccbe81257945a3d2aef4 Author: zzzxl <yangs...@selectdb.com> AuthorDate: Fri May 9 18:05:14 2025 +0800 [opt](inverted index) Enhance I/O statistics collection for the inverted index in file cache scenarios (#295) (#314) --- src/core/CLucene/index/IndexReader.h | 4 +- src/core/CLucene/index/MultiReader.cpp | 12 +- src/core/CLucene/index/MultiReader.h | 4 +- src/core/CLucene/index/MultiSegmentReader.cpp | 20 ++- src/core/CLucene/index/SegmentReader.cpp | 8 +- src/core/CLucene/index/TermInfosReader.cpp | 7 +- src/core/CLucene/index/Terms.h | 2 + src/core/CLucene/index/_MultiSegmentReader.h | 9 +- src/core/CLucene/index/_SegmentHeader.h | 4 +- src/core/CLucene/index/_TermInfosReader.h | 2 +- src/core/CLucene/util/bkd/bkd_reader.cpp | 1 + src/core/CLucene/util/bkd/bkd_reader.h | 2 + src/test/CMakeLists.txt | 1 + src/test/index/TestIndexReader2.cpp | 206 ++++++++++++++++++++++++++ src/test/test.h | 3 +- src/test/tests.cpp | 2 +- 16 files changed, 260 insertions(+), 27 deletions(-) diff --git a/src/core/CLucene/index/IndexReader.h b/src/core/CLucene/index/IndexReader.h index 13403963076..29449840c17 100644 --- a/src/core/CLucene/index/IndexReader.h +++ b/src/core/CLucene/index/IndexReader.h @@ -545,7 +545,7 @@ public: * @throws IOException if there is a low-level IO error * @memory Caller must clean up */ - virtual TermEnum* terms() = 0; + virtual TermEnum* terms(const void* io_ctx = nullptr) = 0; /** Returns an enumeration of all terms starting at a given term. If * the given term does not exist, the enumeration is positioned at the @@ -555,7 +555,7 @@ public: * @throws IOException if there is a low-level IO error * @memory Caller must clean up */ - virtual TermEnum* terms(const Term* t) = 0; + virtual TermEnum* terms(const Term* t, const void* io_ctx = nullptr) = 0; /** Returns the number of documents containing the term <code>t</code>. * @throws IOException if there is a low-level IO error diff --git a/src/core/CLucene/index/MultiReader.cpp b/src/core/CLucene/index/MultiReader.cpp index 101e8012fa0..963169d3eb2 100644 --- a/src/core/CLucene/index/MultiReader.cpp +++ b/src/core/CLucene/index/MultiReader.cpp @@ -253,14 +253,18 @@ void MultiReader::doSetNorm(int32_t n, const TCHAR* field, uint8_t value){ (*subReaders)[i]->setNorm(n-starts[i], field, value); // dispatch } -TermEnum* MultiReader::terms() { +TermEnum* MultiReader::terms(const void* io_ctx) { ensureOpen(); - return _CLNEW MultiTermEnum(subReaders, starts, NULL); + auto* ret = _CLNEW MultiTermEnum(subReaders, starts, NULL); + ret->setIoContext(io_ctx); + return ret; } -TermEnum* MultiReader::terms(const Term* term) { +TermEnum* MultiReader::terms(const Term* term, const void* io_ctx) { ensureOpen(); - return _CLNEW MultiTermEnum(subReaders, starts, term); + auto* ret = _CLNEW MultiTermEnum(subReaders, starts, term); + ret->setIoContext(io_ctx); + return ret; } int32_t MultiReader::docFreq(const Term* t) { diff --git a/src/core/CLucene/index/MultiReader.h b/src/core/CLucene/index/MultiReader.h index dcfb9e66bd9..aa9c440d72c 100644 --- a/src/core/CLucene/index/MultiReader.h +++ b/src/core/CLucene/index/MultiReader.h @@ -95,8 +95,8 @@ public: uint8_t* norms(const TCHAR* field); void norms(const TCHAR* field, uint8_t* result); FieldInfos* getFieldInfos(); - TermEnum* terms(); - TermEnum* terms(const Term* term); + TermEnum* terms(const void* io_ctx = nullptr); + TermEnum* terms(const Term* term, const void* io_ctx = nullptr); //Returns the document frequency of the current term in the set int32_t docFreq(const Term* t=NULL); diff --git a/src/core/CLucene/index/MultiSegmentReader.cpp b/src/core/CLucene/index/MultiSegmentReader.cpp index e5822a688c3..e5987023c94 100644 --- a/src/core/CLucene/index/MultiSegmentReader.cpp +++ b/src/core/CLucene/index/MultiSegmentReader.cpp @@ -205,6 +205,10 @@ MultiSegmentReader::~MultiSegmentReader() { const char* MultiTermEnum::getObjectName() const{ return getClassName(); } const char* MultiTermEnum::getClassName(){ return "MultiTermEnum"; } +void MultiTermEnum::setIoContext(const void* io_ctx) { + io_ctx_ = io_ctx; +} + DirectoryIndexReader* MultiSegmentReader::doReopen(SegmentInfos* infos){ SCOPED_LOCK_MUTEX(THIS_LOCK) if (infos->size() == 1) { @@ -337,14 +341,18 @@ void MultiSegmentReader::doSetNorm(int32_t n, const TCHAR* field, uint8_t value) (*subReaders)[i]->setNorm(n-starts[i], field, value); // dispatch } -TermEnum* MultiSegmentReader::terms() { +TermEnum* MultiSegmentReader::terms(const void* io_ctx) { ensureOpen(); - return _CLNEW MultiTermEnum(subReaders, starts, NULL); + auto* ret = _CLNEW MultiTermEnum(subReaders, starts, NULL); + ret->setIoContext(io_ctx); + return ret; } -TermEnum* MultiSegmentReader::terms(const Term* term) { +TermEnum* MultiSegmentReader::terms(const Term* term, const void* io_ctx) { ensureOpen(); - return _CLNEW MultiTermEnum(subReaders, starts, term); + auto* ret = _CLNEW MultiTermEnum(subReaders, starts, term); + ret->setIoContext(io_ctx); + return ret; } int32_t MultiSegmentReader::docFreq(const Term* t) { @@ -781,10 +789,10 @@ MultiTermEnum::MultiTermEnum(ArrayBase<IndexReader*>* subReaders, const int32_t //Check if the enumeration must start from term t if (t != NULL) { //termEnum is an enumeration of terms starting at or after the named term t - termEnum = reader->terms(t); + termEnum = reader->terms(t, io_ctx_); }else{ //termEnum is an enumeration of all the Terms and TermInfos in the set. - termEnum = reader->terms(); + termEnum = reader->terms(io_ctx_); } //Instantiate an new SegmentMerginfo diff --git a/src/core/CLucene/index/SegmentReader.cpp b/src/core/CLucene/index/SegmentReader.cpp index ec0592370cf..2257ff9fc7c 100644 --- a/src/core/CLucene/index/SegmentReader.cpp +++ b/src/core/CLucene/index/SegmentReader.cpp @@ -445,7 +445,7 @@ void SegmentReader::files(vector<string> &retarray) { retarray.insert(retarray.end(), tmp.begin(), tmp.end()); } -TermEnum *SegmentReader::terms() { +TermEnum *SegmentReader::terms(const void* io_ctx) { //Func - Returns an enumeration of all the Terms and TermInfos in the set. //Pre - tis != NULL //Post - An enumeration of all the Terms and TermInfos in the set has been returned @@ -453,10 +453,10 @@ TermEnum *SegmentReader::terms() { CND_PRECONDITION(tis != NULL, "tis is NULL"); ensureOpen(); - return tis->terms(); + return tis->terms(nullptr, io_ctx); } -TermEnum *SegmentReader::terms(const Term *t) { +TermEnum *SegmentReader::terms(const Term *t, const void* io_ctx) { //Func - Returns an enumeration of terms starting at or after the named term t //Pre - t != NULL // tis != NULL @@ -466,7 +466,7 @@ TermEnum *SegmentReader::terms(const Term *t) { CND_PRECONDITION(tis != NULL, "tis is NULL"); ensureOpen(); - return tis->terms(t); + return tis->terms(t, io_ctx); } bool SegmentReader::document(int32_t n, Document &doc, const FieldSelector *fieldSelector) { diff --git a/src/core/CLucene/index/TermInfosReader.cpp b/src/core/CLucene/index/TermInfosReader.cpp index d4582c77b67..424f12bba9e 100644 --- a/src/core/CLucene/index/TermInfosReader.cpp +++ b/src/core/CLucene/index/TermInfosReader.cpp @@ -295,7 +295,7 @@ int64_t TermInfosReader::getPosition(const Term* term) { return -1; } -SegmentTermEnum* TermInfosReader::terms(const Term* term) { +SegmentTermEnum* TermInfosReader::terms(const Term* term, const void* io_ctx) { //Func - Returns an enumeration of terms starting at or after the named term. // If term is null then enumerator is set to the beginning //Pre - term holds a valid reference to a Term @@ -305,7 +305,7 @@ SegmentTermEnum* TermInfosReader::terms(const Term* term) { SegmentTermEnum* enumerator = NULL; if (term != NULL) { //Seek enumerator to term; delete the new TermInfo that's returned. - TermInfo* ti = get(term); + TermInfo* ti = get(term, io_ctx); _CLLDELETE(ti); enumerator = getEnum(); } else @@ -313,6 +313,9 @@ SegmentTermEnum* TermInfosReader::terms(const Term* term) { //Clone the entire enumeration SegmentTermEnum* cln = enumerator->clone(); + if (cln) { + cln->setIoContext(io_ctx); + } //Check if cln points to a valid instance CND_CONDITION(cln != NULL, "cln is NULL"); diff --git a/src/core/CLucene/index/Terms.h b/src/core/CLucene/index/Terms.h index bf71ad99a2d..0af1102874c 100644 --- a/src/core/CLucene/index/Terms.h +++ b/src/core/CLucene/index/Terms.h @@ -130,6 +130,8 @@ public: * Some implementations are considerably more efficient than that. */ virtual bool skipTo(Term* target); + + virtual void setIoContext(const void*) {} }; diff --git a/src/core/CLucene/index/_MultiSegmentReader.h b/src/core/CLucene/index/_MultiSegmentReader.h index d004044b0e1..830315208c2 100644 --- a/src/core/CLucene/index/_MultiSegmentReader.h +++ b/src/core/CLucene/index/_MultiSegmentReader.h @@ -99,8 +99,8 @@ public: void norms(const TCHAR* field, uint8_t* result); FieldInfos* getFieldInfos(); - TermEnum* terms(); - TermEnum* terms(const Term* term); + TermEnum* terms(const void* io_ctx = nullptr); + TermEnum* terms(const Term* term, const void* io_ctx = nullptr); //Returns the document frequency of the current term in the set int32_t docFreq(const Term* t=NULL); @@ -211,6 +211,11 @@ public: const char* getObjectName() const; static const char* getClassName(); + + void setIoContext(const void*) override; + +private: + const void* io_ctx_ = nullptr; }; diff --git a/src/core/CLucene/index/_SegmentHeader.h b/src/core/CLucene/index/_SegmentHeader.h index 739836c361e..54e84ad4ffd 100644 --- a/src/core/CLucene/index/_SegmentHeader.h +++ b/src/core/CLucene/index/_SegmentHeader.h @@ -469,9 +469,9 @@ public: ///Returns all file names managed by this SegmentReader void files(std::vector<std::string>& retarray); ///Returns an enumeration of all the Terms and TermInfos in the set. - TermEnum* terms(); + TermEnum* terms(const void* io_ctx = nullptr); ///Returns an enumeration of terms starting at or after the named term t - TermEnum* terms(const Term* t); + TermEnum* terms(const Term* t, const void* io_ctx = nullptr); ///Gets the document identified by n bool document(int32_t n, CL_NS(document)::Document& doc, const CL_NS(document)::FieldSelector* fieldSelector); diff --git a/src/core/CLucene/index/_TermInfosReader.h b/src/core/CLucene/index/_TermInfosReader.h index efc826c5420..5ee9e72981f 100644 --- a/src/core/CLucene/index/_TermInfosReader.h +++ b/src/core/CLucene/index/_TermInfosReader.h @@ -100,7 +100,7 @@ CL_NS_DEF(index) * If no term is specified, an enumeration of all the Terms * and TermInfos in the set is returned. */ - SegmentTermEnum* terms(const Term* term=NULL); + SegmentTermEnum* terms(const Term* term=NULL, const void* io_ctx = nullptr); /** Returns the TermInfo for a Term in the set, or null. */ TermInfo* get(const Term* term, const void* io_ctx = nullptr); diff --git a/src/core/CLucene/util/bkd/bkd_reader.cpp b/src/core/CLucene/util/bkd/bkd_reader.cpp index 30af9515f35..b4dde7a1bb4 100644 --- a/src/core/CLucene/util/bkd/bkd_reader.cpp +++ b/src/core/CLucene/util/bkd/bkd_reader.cpp @@ -152,6 +152,7 @@ bkd_reader::intersect_state::intersect_state(store::IndexInput *in, bkd_reader::intersect_visitor *visitor, index_tree* indexVisitor) { in_ = std::unique_ptr<store::IndexInput>(in); + in_->setIoContext(visitor->get_io_context()); visitor_ = visitor; common_prefix_lengths_.resize(numDims); docid_set_iterator = std::make_unique<bkd_docid_set_iterator>(maxPointsInLeafNode); diff --git a/src/core/CLucene/util/bkd/bkd_reader.h b/src/core/CLucene/util/bkd/bkd_reader.h index 55a76f0853c..2221efccab1 100644 --- a/src/core/CLucene/util/bkd/bkd_reader.h +++ b/src/core/CLucene/util/bkd/bkd_reader.h @@ -79,6 +79,8 @@ public: virtual void inc_hits(int count) {} virtual bool only_hits() { return false; } + + virtual const void* get_io_context() { return nullptr; } }; class intersect_state final { public: diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index 6b6239dfaf1..edc4baabc10 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -91,6 +91,7 @@ SET(test_files ./tests.cpp ./index/TestIndexWriter.cpp ./index/TestIndexModifier.cpp ./index/TestIndexReader.cpp + ./index/TestIndexReader2.cpp ./index/TestThreading.cpp ./index/TestUtf8.cpp ./index/TestHighFreqTerms.cpp diff --git a/src/test/index/TestIndexReader2.cpp b/src/test/index/TestIndexReader2.cpp new file mode 100644 index 00000000000..5ea05bc1060 --- /dev/null +++ b/src/test/index/TestIndexReader2.cpp @@ -0,0 +1,206 @@ +#include <CLucene.h> // IWYU pragma: keep +#include <CLucene/index/IndexReader.h> +#include <CLucene/search/query/TermPositionIterator.h> +#include <CLucene/util/stringUtil.h> + +#include <cstddef> +#include <ctime> +#include <exception> +#include <fstream> +#include <stdexcept> +#include <string> +#include <vector> + +#include "CLucene/analysis/Analyzers.h" +#include "CLucene/index/IndexVersion.h" +#include "CLucene/index/Term.h" +#include "CLucene/store/FSDirectory.h" +#include "CLucene/store/_RAMDirectory.h" +#include "CuTest.h" +#include "test.h" + +CL_NS_USE(search) +CL_NS_USE(store) +CL_NS_USE(index) +CL_NS_USE(util) + +static constexpr int32_t doc_count = 100; + +#define FINALLY(eptr, finallyBlock) \ + { \ + finallyBlock; \ + if (eptr) { \ + std::rethrow_exception(eptr); \ + } \ + } + +static int32_t getDaySeed() { + std::time_t now = std::time(nullptr); + std::tm* localTime = std::localtime(&now); + localTime->tm_sec = 0; + localTime->tm_min = 0; + localTime->tm_hour = 0; + return static_cast<int32_t>(std::mktime(localTime) / (60 * 60 * 24)); +} + +static std::string generateRandomIP() { + std::string ip_v4; + ip_v4.append(std::to_string(rand() % 256)); + ip_v4.append("."); + ip_v4.append(std::to_string(rand() % 256)); + ip_v4.append("."); + ip_v4.append(std::to_string(rand() % 256)); + ip_v4.append("."); + ip_v4.append(std::to_string(rand() % 256)); + return ip_v4; +} + +static void write_index(const std::string& name, RAMDirectory* dir, + const std::vector<std::string>& datas) { + auto* analyzer = _CLNEW lucene::analysis::SimpleAnalyzer<char>; + analyzer->set_stopwords(nullptr); + auto* indexwriter = _CLNEW lucene::index::IndexWriter(dir, analyzer, true); + indexwriter->setRAMBufferSizeMB(512); + indexwriter->setMaxBufferedDocs(-1); + indexwriter->setMaxFieldLength(0x7FFFFFFFL); + indexwriter->setMergeFactor(1000000000); + indexwriter->setUseCompoundFile(false); + + auto* char_string_reader = _CLNEW lucene::util::SStringReader<char>; + + auto* doc = _CLNEW lucene::document::Document(); + int32_t field_config = lucene::document::Field::STORE_NO; + field_config |= lucene::document::Field::INDEX_NONORMS; + field_config |= lucene::document::Field::INDEX_TOKENIZED; + auto field_name = std::wstring(name.begin(), name.end()); + auto* field = _CLNEW lucene::document::Field(field_name.c_str(), field_config); + field->setOmitTermFreqAndPositions(false); + doc->add(*field); + + for (const auto& data : datas) { + char_string_reader->init(data.data(), data.size(), false); + auto* stream = analyzer->reusableTokenStream(field->name(), char_string_reader); + field->setValue(stream); + indexwriter->addDocument(doc); + } + + indexwriter->close(); + + _CLLDELETE(indexwriter); + _CLLDELETE(doc); + _CLLDELETE(analyzer); + _CLLDELETE(char_string_reader); +} + +struct MockIOContxt { + int64_t count = 0; +}; + +void TestIndexRead(CuTest* tc) { + std::srand(getDaySeed()); + + std::string name = "name"; + std::vector<std::string> datas; + datas.push_back("a1"); + datas.push_back("a2"); + datas.push_back("a3"); + datas.push_back("a4"); + datas.push_back("a5"); + datas.push_back("a6"); + datas.push_back("a7"); + datas.push_back("a8"); + datas.push_back("a9"); + + RAMDirectory dir; + write_index(name, &dir, datas); + + { + auto* reader = IndexReader::open(&dir); + + MockIOContxt io_ctx; + TermEnum* enumerator = reader->terms(&io_ctx); + + int32_t count = 0; + Term* lastTerm = nullptr; + try { + do { + lastTerm = enumerator->term(); + if (lastTerm != nullptr) { + count++; + } + _CLDECDELETE(lastTerm); + } while (enumerator->next()); + } + _CLFINALLY({ + enumerator->close(); + _CLDELETE(enumerator); + }); + assertEquals(count, 10); + + reader->close(); + _CLLDELETE(reader); + } + + std::cout << "\nTestIndexRead sucess" << std::endl; +} + +void TestIndexReadSeek(CuTest* tc) { + std::srand(getDaySeed()); + + std::string name = "name"; + std::vector<std::string> datas; + datas.push_back("a1"); + datas.push_back("a2"); + datas.push_back("a3"); + datas.push_back("a4"); + datas.push_back("a5"); + datas.push_back("a6"); + datas.push_back("a7"); + datas.push_back("a8"); + datas.push_back("a9"); + + RAMDirectory dir; + write_index(name, &dir, datas); + + { + auto* reader = IndexReader::open(&dir); + + std::wstring ws_prefix = StringUtil::string_to_wstring("a5"); + Term* prefix_term = _CLNEW Term(L"name", ws_prefix.c_str()); + + MockIOContxt io_ctx; + TermEnum* enumerator = reader->terms(prefix_term, &io_ctx); + + int32_t count = 0; + Term* lastTerm = nullptr; + try { + do { + lastTerm = enumerator->term(); + if (lastTerm != nullptr) { + count++; + } + _CLDECDELETE(lastTerm); + } while (enumerator->next()); + } + _CLFINALLY({ + enumerator->close(); + _CLDELETE(enumerator); + _CLDECDELETE(prefix_term); + }); + assertEquals(count, 5); + + reader->close(); + _CLLDELETE(reader); + } + + std::cout << "\nTestIndexReadSeek sucess" << std::endl; +} + +CuSuite* testIndexReader2() { + CuSuite* suite = CuSuiteNew(_T("CLucene Index Reader Test")); + + SUITE_ADD_TEST(suite, TestIndexRead); + SUITE_ADD_TEST(suite, TestIndexReadSeek); + + return suite; +} diff --git a/src/test/test.h b/src/test/test.h index 5ac8e98a64b..11f38280ae3 100644 --- a/src/test/test.h +++ b/src/test/test.h @@ -87,7 +87,8 @@ CuSuite *testIndexCompaction(void); CuSuite *testStringReader(void); CuSuite *testIndexCompress(void); CuSuite *testUTF8CharsSuite(void); -CuSuite *testPFORSuite(void); +CuSuite *testIndexReader2(void); + #ifdef TEST_CONTRIB_LIBS //CuSuite *testGermanAnalyzer(void); CuSuite *testchinese(void); diff --git a/src/test/tests.cpp b/src/test/tests.cpp index 4b99b94d396..a5f9e1e83aa 100644 --- a/src/test/tests.cpp +++ b/src/test/tests.cpp @@ -21,8 +21,8 @@ unittest tests[] = { {"testStringReader", testStringReader}, {"IndexCompress", testIndexCompress}, {"TestUTF8Chars", testUTF8CharsSuite}, - {"testPFOR", testPFORSuite}, #ifdef TEST_CONTRIB_LIBS {"chinese", testchinese}, #endif + {"TestIndexReader2", testIndexReader2}, {"LastTest", NULL}}; --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org