This is an automated email from the ASF dual-hosted git repository.

airborne pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new d2c108726dc [opt](bloomfilter index) optimize memory usage for bloom filter index writer #45833 (#46047)
d2c108726dc is described below

commit d2c108726dc466183dbc6943e2a53089ed436c2a
Author: airborne12 <jiang...@selectdb.com>
AuthorDate: Fri Dec 27 12:10:56 2024 +0800

    [opt](bloomfilter index) optimize memory usage for bloom filter index writer #45833 (#46047)
    
    cherry pick from #45833
---
 be/src/olap/rowset/segment_v2/bloom_filter.h       | 10 ++++++++
 .../segment_v2/bloom_filter_index_writer.cpp       | 27 +++++++++++++---------
 .../bloom_filter_index_reader_writer_test.cpp      |  7 +++++-
 3 files changed, 32 insertions(+), 12 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/bloom_filter.h b/be/src/olap/rowset/segment_v2/bloom_filter.h
index 13b1558431e..15644b8d6bf 100644
--- a/be/src/olap/rowset/segment_v2/bloom_filter.h
+++ b/be/src/olap/rowset/segment_v2/bloom_filter.h
@@ -153,6 +153,16 @@ public:
         return hash_code;
     }
 
+    static Result<uint64_t> hash(const char* buf, uint32_t size, HashStrategyPB strategy) {
+        if (strategy == HASH_MURMUR3_X64_64) {
+            uint64_t hash_code;
+            murmur_hash3_x64_64(buf, size, DEFAULT_SEED, &hash_code);
+            return hash_code;
+        } else {
+            return Status::InvalidArgument("invalid strategy:{}", strategy);
+        }
+    }
+
     virtual void add_bytes(const char* buf, uint32_t size) {
         if (buf == nullptr) {
             *_has_null = true;
diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp
index 74974365466..017393d8ffa 100644
--- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp
@@ -84,9 +84,10 @@ public:
         for (int i = 0; i < count; ++i) {
             if (_values.find(*v) == _values.end()) {
                 if constexpr (_is_slice_type()) {
-                    CppType new_value;
-                    RETURN_IF_CATCH_EXCEPTION(_type_info->deep_copy(&new_value, v, &_arena));
-                    _values.insert(new_value);
+                    const auto* s = reinterpret_cast<const Slice*>(v);
+                    auto hash =
+                            DORIS_TRY(BloomFilter::hash(s->data, s->size, _bf_options.strategy));
+                    _hash_values.insert(hash);
                 } else if constexpr (_is_int128()) {
                     int128_t new_value;
                     memcpy(&new_value, v, sizeof(PackedInt128));
@@ -105,25 +106,28 @@ public:
     Status flush() override {
         std::unique_ptr<BloomFilter> bf;
         RETURN_IF_ERROR(BloomFilter::create(BLOCK_BLOOM_FILTER, &bf));
-        RETURN_IF_ERROR(bf->init(_values.size(), _bf_options.fpp, _bf_options.strategy));
-        bf->set_has_null(_has_null);
-        for (auto& v : _values) {
-            if constexpr (_is_slice_type()) {
-                Slice* s = (Slice*)&v;
-                bf->add_bytes(s->data, s->size);
-            } else {
+        if constexpr (_is_slice_type()) {
+            RETURN_IF_ERROR(bf->init(_hash_values.size(), _bf_options.fpp, _bf_options.strategy));
+            for (const auto& h : _hash_values) {
+                bf->add_hash(h);
+            }
+        } else {
+            RETURN_IF_ERROR(bf->init(_values.size(), _bf_options.fpp, _bf_options.strategy));
+            for (auto& v : _values) {
                 bf->add_bytes((char*)&v, sizeof(CppType));
             }
         }
+        bf->set_has_null(_has_null);
         _bf_buffer_size += bf->size();
         _bfs.push_back(std::move(bf));
         _values.clear();
+        _hash_values.clear();
         _has_null = false;
         return Status::OK();
     }
 
     Status finish(io::FileWriter* file_writer, ColumnIndexMetaPB* index_meta) override {
-        if (_values.size() > 0) {
+        if (_values.size() > 0 || !_hash_values.empty()) {
             RETURN_IF_ERROR(flush());
         }
         index_meta->set_type(BLOOM_FILTER_INDEX);
@@ -172,6 +176,7 @@ private:
     // distinct values
     ValueDict _values;
     std::vector<std::unique_ptr<BloomFilter>> _bfs;
+    std::set<uint64_t> _hash_values;
 };
 
 } // namespace
diff --git a/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp b/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp
index 258dd9a5ff8..2b0d3783938 100644
--- a/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp
+++ b/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp
@@ -160,7 +160,12 @@ void test_bloom_filter_index_reader_writer_template(
         }
         // test nullptr
         EXPECT_TRUE(bf->test_bytes(nullptr, 1));
-
+        if (is_slice_type) {
+            Slice* value = (Slice*)(not_exist_value);
+            EXPECT_FALSE(bf->test_bytes(value->data, value->size));
+        } else {
+            EXPECT_FALSE(bf->test_bytes((char*)not_exist_value, sizeof(CppType)));
+        }
         delete reader;
     }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to