This is an automated email from the ASF dual-hosted git repository.

airborne pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git

The following commit(s) were added to refs/heads/branch-2.1 by this push: new d2c108726dc [opt](bloomfilter index) optimize memory usage for bloom filter index writer #45833 (#46047) d2c108726dc is described below commit d2c108726dc466183dbc6943e2a53089ed436c2a Author: airborne12 <jiang...@selectdb.com> AuthorDate: Fri Dec 27 12:10:56 2024 +0800 [opt](bloomfilter index) optimize memory usage for bloom filter index writer #45833 (#46047) cherry pick from #45833 --- be/src/olap/rowset/segment_v2/bloom_filter.h | 10 ++++++++ .../segment_v2/bloom_filter_index_writer.cpp | 27 +++++++++++++--------- .../bloom_filter_index_reader_writer_test.cpp | 7 +++++- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/bloom_filter.h b/be/src/olap/rowset/segment_v2/bloom_filter.h index 13b1558431e..15644b8d6bf 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter.h +++ b/be/src/olap/rowset/segment_v2/bloom_filter.h @@ -153,6 +153,16 @@ public: return hash_code; } + static Result<uint64_t> hash(const char* buf, uint32_t size, HashStrategyPB strategy) { + if (strategy == HASH_MURMUR3_X64_64) { + uint64_t hash_code; + murmur_hash3_x64_64(buf, size, DEFAULT_SEED, &hash_code); + return hash_code; + } else { + return Status::InvalidArgument("invalid strategy:{}", strategy); + } + } + virtual void add_bytes(const char* buf, uint32_t size) { if (buf == nullptr) { *_has_null = true; diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp index 74974365466..017393d8ffa 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp @@ -84,9 +84,10 @@ public: for (int i = 0; i < count; ++i) { if (_values.find(*v) == _values.end()) { if constexpr (_is_slice_type()) { - CppType new_value; - RETURN_IF_CATCH_EXCEPTION(_type_info->deep_copy(&new_value, v, &_arena)); - _values.insert(new_value); + const 
auto* s = reinterpret_cast<const Slice*>(v); + auto hash = + DORIS_TRY(BloomFilter::hash(s->data, s->size, _bf_options.strategy)); + _hash_values.insert(hash); } else if constexpr (_is_int128()) { int128_t new_value; memcpy(&new_value, v, sizeof(PackedInt128)); @@ -105,25 +106,28 @@ public: Status flush() override { std::unique_ptr<BloomFilter> bf; RETURN_IF_ERROR(BloomFilter::create(BLOCK_BLOOM_FILTER, &bf)); - RETURN_IF_ERROR(bf->init(_values.size(), _bf_options.fpp, _bf_options.strategy)); - bf->set_has_null(_has_null); - for (auto& v : _values) { - if constexpr (_is_slice_type()) { - Slice* s = (Slice*)&v; - bf->add_bytes(s->data, s->size); - } else { + if constexpr (_is_slice_type()) { + RETURN_IF_ERROR(bf->init(_hash_values.size(), _bf_options.fpp, _bf_options.strategy)); + for (const auto& h : _hash_values) { + bf->add_hash(h); + } + } else { + RETURN_IF_ERROR(bf->init(_values.size(), _bf_options.fpp, _bf_options.strategy)); + for (auto& v : _values) { bf->add_bytes((char*)&v, sizeof(CppType)); } } + bf->set_has_null(_has_null); _bf_buffer_size += bf->size(); _bfs.push_back(std::move(bf)); _values.clear(); + _hash_values.clear(); _has_null = false; return Status::OK(); } Status finish(io::FileWriter* file_writer, ColumnIndexMetaPB* index_meta) override { - if (_values.size() > 0) { + if (_values.size() > 0 || !_hash_values.empty()) { RETURN_IF_ERROR(flush()); } index_meta->set_type(BLOOM_FILTER_INDEX); @@ -172,6 +176,7 @@ private: // distinct values ValueDict _values; std::vector<std::unique_ptr<BloomFilter>> _bfs; + std::set<uint64_t> _hash_values; }; } // namespace diff --git a/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp b/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp index 258dd9a5ff8..2b0d3783938 100644 --- a/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp +++ b/be/test/olap/rowset/segment_v2/bloom_filter_index_reader_writer_test.cpp @@ -160,7 +160,12 @@ void 
test_bloom_filter_index_reader_writer_template( } // test nullptr EXPECT_TRUE(bf->test_bytes(nullptr, 1)); - + if (is_slice_type) { + Slice* value = (Slice*)(not_exist_value); + EXPECT_FALSE(bf->test_bytes(value->data, value->size)); + } else { + EXPECT_FALSE(bf->test_bytes((char*)not_exist_value, sizeof(CppType))); + } delete reader; } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org