This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 67efa946260 branch-3.0: [fix](inverted index) Add missing memory usage calculation for BKD index #47297 (#47374)
67efa946260 is described below

commit 67efa946260c9ac7c4ba4ae77ca4c5a234b1072d
Author: airborne12 <jiang...@selectdb.com>
AuthorDate: Sat Mar 1 10:22:59 2025 +0800

    branch-3.0: [fix](inverted index) Add missing memory usage calculation for BKD index #47297 (#47374)
    
    cherry pick from #47297
---
 be/src/clucene                                     |   2 +-
 .../rowset/segment_v2/inverted_index_reader.cpp    |  17 +-
 .../olap/rowset/segment_v2/inverted_index_reader.h |   3 +-
 .../rowset/segment_v2/inverted_index_searcher.cpp  |   1 +
 .../segment_v2/inverted_index_searcher_test.cpp    | 229 +++++++++++++++++++++
 5 files changed, 240 insertions(+), 12 deletions(-)

diff --git a/be/src/clucene b/be/src/clucene
index 48fa9cc4ec3..748a6643feb 160000
--- a/be/src/clucene
+++ b/be/src/clucene
@@ -1 +1 @@
-Subproject commit 48fa9cc4ec32b40bf3b02338d0a1b2cdbc6408cf
+Subproject commit 748a6643feb4bb0ac273a720c194eeda02c103ca
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index fcd2082f18f..b0d9a29c550 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -210,9 +210,10 @@ Status InvertedIndexReader::handle_searcher_cache(
         // TODO: handle null bitmap procedure in new format.
         InvertedIndexQueryCacheHandle null_bitmap_cache_handle;
         static_cast<void>(read_null_bitmap(io_ctx, stats, 
&null_bitmap_cache_handle, dir.get()));
-        RETURN_IF_ERROR(create_index_searcher(dir.release(), &searcher, 
mem_tracker.get(), type()));
-        auto* cache_value = new InvertedIndexSearcherCache::CacheValue(
-                std::move(searcher), mem_tracker->consumption(), UnixMillis());
+        size_t reader_size = 0;
+        RETURN_IF_ERROR(create_index_searcher(dir.release(), &searcher, 
type(), reader_size));
+        auto* cache_value = new 
InvertedIndexSearcherCache::CacheValue(std::move(searcher),
+                                                                       
reader_size, UnixMillis());
         InvertedIndexSearcherCache::instance()->insert(searcher_cache_key, 
cache_value,
                                                        
inverted_index_cache_handle);
         return Status::OK();
@@ -221,9 +222,8 @@ Status InvertedIndexReader::handle_searcher_cache(
 
 Status InvertedIndexReader::create_index_searcher(lucene::store::Directory* 
dir,
                                                   IndexSearcherPtr* searcher,
-                                                  MemTracker* mem_tracker,
-                                                  InvertedIndexReaderType 
reader_type) {
-    SCOPED_CONSUME_MEM_TRACKER(mem_tracker);
+                                                  InvertedIndexReaderType 
reader_type,
+                                                  size_t& reader_size) {
     auto index_searcher_builder =
             
DORIS_TRY(IndexSearcherBuilder::create_index_searcher_builder(reader_type));
 
@@ -231,12 +231,11 @@ Status InvertedIndexReader::create_index_searcher(lucene::store::Directory* dir,
     *searcher = searcher_result;
 
     // When the meta information has been read, the ioContext needs to be 
reset to prevent it from being used by other queries.
-    auto stream = static_cast<DorisCompoundReader*>(dir)->getDorisIndexInput();
+    auto* stream = 
static_cast<DorisCompoundReader*>(dir)->getDorisIndexInput();
     stream->setIoContext(nullptr);
     stream->setIndexFile(false);
 
-    // NOTE: before mem_tracker hook becomes active, we caculate reader memory 
size by hand.
-    mem_tracker->consume(index_searcher_builder->get_reader_size());
+    reader_size = index_searcher_builder->get_reader_size();
     return Status::OK();
 };
 
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
index a1445603286..390928493f6 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
@@ -227,8 +227,7 @@ public:
                                          const io::IOContext* io_ctx, 
OlapReaderStatistics* stats);
     std::string get_index_file_path();
     static Status create_index_searcher(lucene::store::Directory* dir, 
IndexSearcherPtr* searcher,
-                                        MemTracker* mem_tracker,
-                                        InvertedIndexReaderType reader_type);
+                                        InvertedIndexReaderType reader_type, 
size_t& reader_size);
 
 protected:
     Status match_index_search(const io::IOContext* io_ctx, 
OlapReaderStatistics* stats,
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_searcher.cpp b/be/src/olap/rowset/segment_v2/inverted_index_searcher.cpp
index 5dfbd984813..8d56b913b31 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_searcher.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_searcher.cpp
@@ -67,6 +67,7 @@ Status BKDIndexSearcherBuilder::build(lucene::store::Directory* directory,
         if (!bkd_reader->open()) {
             LOG(INFO) << "bkd index file " << directory->toString() << " is 
empty";
         }
+        reader_size = bkd_reader->ram_bytes_used();
         output_searcher = bkd_reader;
         _CLDECDELETE(directory)
         return Status::OK();
diff --git a/be/test/olap/rowset/segment_v2/inverted_index_searcher_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index_searcher_test.cpp
new file mode 100644
index 00000000000..955d46c4b0a
--- /dev/null
+++ b/be/test/olap/rowset/segment_v2/inverted_index_searcher_test.cpp
@@ -0,0 +1,229 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "olap/rowset/segment_v2/inverted_index_searcher.h"
+
+#include <CLucene.h>
+#include <CLucene/document/Document.h>
+#include <CLucene/document/Field.h>
+#include <CLucene/index/IndexWriter.h>
+#include <CLucene/store/Directory.h>
+#include <CLucene/store/FSDirectory.h>
+#include <CLucene/store/RAMDirectory.h>
+#include <CLucene/util/Misc.h>
+#include <CLucene/util/NumericUtils.h>
+#include <CLucene/util/bkd/bkd_reader.h>
+#include <CLucene/util/bkd/bkd_writer.h>
+
+#include <random>
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wshadow-field"
+#endif
+
+#include <CLucene/analysis/standard95/StandardAnalyzer.h>
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+#include <gtest/gtest-message.h>
+#include <gtest/gtest-test-part.h>
+#include <gtest/gtest_pred_impl.h>
+
+#include <memory>
+#include <string>
+
+#include "common/status.h"
+#include "io/fs/local_file_system.h"
+
+namespace doris::segment_v2 {
+class InvertedIndexSearcherBuilderFlowTest : public testing::Test {
+public:
+    const std::string kTestDir = "./ut_dir/inverted_index_searcher_flow_test";
+
+    void SetUp() override {
+        _fs = io::global_local_filesystem();
+        auto st = _fs->delete_directory(kTestDir);
+        st = _fs->create_directory(kTestDir);
+        ASSERT_TRUE(st.ok()) << st;
+    }
+
+    void TearDown() override {
+        auto st = _fs->delete_directory(kTestDir);
+        ASSERT_TRUE(st.ok()) << st;
+    }
+    std::string generateRandomString(int length) {
+        static const char alphanum[] =
+                "0123456789"
+                "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+                "abcdefghijklmnopqrstuvwxyz";
+        std::string randomString;
+        std::random_device rd;
+        std::mt19937 gen(rd());
+        std::uniform_int_distribution<> dis(0, sizeof(alphanum) - 2);
+        for (int i = 0; i < length; ++i) {
+            randomString += alphanum[dis(gen)];
+        }
+        return randomString;
+    }
+
+    const int32_t MAX_FIELD_LEN = 0x7FFFFFFFL;
+    const int32_t MAX_BUFFER_DOCS = 100000000;
+    const int32_t MERGE_FACTOR = 100000000;
+
+protected:
+    io::FileSystemSPtr _fs;
+};
+
+TEST_F(InvertedIndexSearcherBuilderFlowTest, test_bkd_builder_build_success) {
+    const int N = 1024 * 1024;
+    lucene::store::Directory* tmp_dir =
+            lucene::store::FSDirectory::getDirectory("./ut_dir/TestBKD");
+    std::unique_ptr<lucene::store::Directory> dir =
+            std::unique_ptr<lucene::store::Directory>(_CL_POINTER(tmp_dir));
+    std::shared_ptr<lucene::util::bkd::bkd_writer> w =
+            std::make_shared<lucene::util::bkd::bkd_writer>(N, 1, 1, 4, 512, 
100.0, N, true);
+    w->docs_seen_ = N;
+
+    for (int docID = 0; docID < N; docID++) {
+        std::vector<uint8_t> scratch(4);
+
+        if (docID > 500000) {
+            lucene::util::NumericUtils::intToSortableBytes(200, scratch, 0);
+
+        } else {
+            lucene::util::NumericUtils::intToSortableBytes(100, scratch, 0);
+        }
+        w->add(scratch.data(), scratch.size(), docID);
+    }
+
+    int64_t indexFP;
+    {
+        std::unique_ptr<lucene::store::IndexOutput> 
out(dir->createOutput("bkd"));
+        std::unique_ptr<lucene::store::IndexOutput> 
meta_out(dir->createOutput("bkd_meta"));
+        std::unique_ptr<lucene::store::IndexOutput> 
index_out(dir->createOutput("bkd_index"));
+
+        try {
+            indexFP = w->finish(out.get(), index_out.get());
+            w->meta_finish(meta_out.get(), indexFP, 0);
+        } catch (...) {
+            ASSERT_TRUE(false) << "BKDIndexSearcherBuilder build error";
+        }
+    }
+
+    BKDIndexSearcherBuilder builder;
+    OptionalIndexSearcherPtr output_searcher;
+
+    auto st = builder.build(dir.get(), output_searcher);
+    EXPECT_TRUE(st.ok()) << "BKDIndexSearcherBuilder build error: " << 
st.msg();
+    EXPECT_TRUE(output_searcher.has_value());
+    EXPECT_GT(builder.get_reader_size(), 0);
+    std::cout << "test_bkd_builder size = " << builder.get_reader_size() << 
std::endl;
+}
+
+TEST_F(InvertedIndexSearcherBuilderFlowTest, test_fulltext_builder) {
+    auto* tmp_dir = new lucene::store::RAMDirectory();
+    std::unique_ptr<lucene::store::Directory> dir =
+            std::unique_ptr<lucene::store::Directory>(_CL_POINTER(tmp_dir));
+
+    lucene::analysis::SimpleAnalyzer<char> sanalyzer;
+    lucene::index::IndexWriter w(dir.get(), &sanalyzer, true);
+    w.setUseCompoundFile(false);
+    w.setMaxBufferedDocs(MAX_BUFFER_DOCS);
+    w.setRAMBufferSizeMB(256);
+    w.setMaxFieldLength(MAX_FIELD_LEN);
+    w.setMergeFactor(MERGE_FACTOR);
+    lucene::document::Document doc;
+    std::wstring field_name = L"fulltext";
+    auto* field = _CLNEW lucene::document::Field(
+            field_name.c_str(),
+            int(lucene::document::Field::INDEX_TOKENIZED) | 
int(lucene::document::Field::STORE_NO));
+    doc.add(*field);
+
+    for (int i = 0; i <= 2000; i++) {
+        std::string value1 = "value1";
+        if (i > 0) {
+            value1 = generateRandomString(2000);
+        }
+        auto* stringReader = _CLNEW lucene::util::SStringReader<char>(
+                value1.c_str(), strlen(value1.c_str()), false);
+        auto* stream = sanalyzer.reusableTokenStream(field_name.c_str(), 
stringReader);
+
+        field->setValue(stream);
+        w.addDocument(&doc);
+        _CLDELETE(stringReader);
+    }
+    doc.clear();
+    w.close();
+    FulltextIndexSearcherBuilder builder;
+    OptionalIndexSearcherPtr output_searcher;
+    auto st = builder.build(dir.get(), output_searcher);
+    EXPECT_TRUE(st.ok()) << st.to_string();
+
+    ASSERT_TRUE(output_searcher.has_value());
+    auto searcher_variant = *output_searcher;
+    
EXPECT_TRUE(std::holds_alternative<FulltextIndexSearcherPtr>(searcher_variant));
+    auto searcher_ptr = std::get<FulltextIndexSearcherPtr>(searcher_variant);
+    EXPECT_NE(searcher_ptr, nullptr);
+    EXPECT_GT(builder.get_reader_size(), 0);
+    std::cout << "test_fulltext_builder size = " << builder.get_reader_size() 
<< std::endl;
+}
+
+TEST_F(InvertedIndexSearcherBuilderFlowTest, test_keyword_builder) {
+    auto* tmp_dir = new lucene::store::RAMDirectory();
+    std::unique_ptr<lucene::store::Directory> dir =
+            std::unique_ptr<lucene::store::Directory>(_CL_POINTER(tmp_dir));
+
+    lucene::analysis::SimpleAnalyzer<char> sanalyzer;
+    lucene::index::IndexWriter w(dir.get(), &sanalyzer, true);
+    w.setUseCompoundFile(false);
+    w.setMaxBufferedDocs(MAX_BUFFER_DOCS);
+    w.setRAMBufferSizeMB(256);
+    w.setMaxFieldLength(MAX_FIELD_LEN);
+    w.setMergeFactor(MERGE_FACTOR);
+    lucene::document::Document doc;
+    std::wstring field_name = L"keyword";
+    auto* field = _CLNEW lucene::document::Field(field_name.c_str(),
+                                                 
int(lucene::document::Field::INDEX_UNTOKENIZED) |
+                                                         
int(lucene::document::Field::STORE_NO));
+    doc.add(*field);
+
+    for (int i = 0; i <= 2000; i++) {
+        std::string value1 = "value1";
+        if (i > 0) {
+            value1 = generateRandomString(2000);
+        }
+        field->setValue((char*)value1.c_str(), value1.size());
+        w.addDocument(&doc);
+    }
+    doc.clear();
+    w.close();
+    FulltextIndexSearcherBuilder builder;
+    OptionalIndexSearcherPtr output_searcher;
+    auto st = builder.build(dir.get(), output_searcher);
+    EXPECT_TRUE(st.ok()) << st.to_string();
+
+    ASSERT_TRUE(output_searcher.has_value());
+    auto searcher_variant = *output_searcher;
+    
EXPECT_TRUE(std::holds_alternative<FulltextIndexSearcherPtr>(searcher_variant));
+    auto searcher_ptr = std::get<FulltextIndexSearcherPtr>(searcher_variant);
+    EXPECT_NE(searcher_ptr, nullptr);
+    EXPECT_GT(builder.get_reader_size(), 0);
+    std::cout << "test_keyword_builder size = " << builder.get_reader_size() 
<< std::endl;
+}
+} // namespace doris::segment_v2
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to