This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new f65103e2a6b [Chore](runtime-filter) unify interfaces of bloom filter and remove some unused code (#27822) f65103e2a6b is described below commit f65103e2a6bdade1d144b58044c66c64bf25927a Author: Pxl <pxl...@qq.com> AuthorDate: Sat Dec 2 07:42:55 2023 +0800 [Chore](runtime-filter) unify interfaces of bloom filter and remove some unused code (#27822) * unify interfaces of bloom filter and remove some unused code --- be/src/exprs/bloom_filter_func.h | 54 +++++------- be/src/exprs/runtime_filter.cpp | 67 --------------- be/src/exprs/runtime_filter.h | 4 - be/src/olap/bloom_filter_predicate.h | 17 +--- be/src/vec/exprs/vbloom_predicate.cpp | 11 +-- be/test/exprs/bloom_filter_predicate_test.cpp | 117 -------------------------- 6 files changed, 24 insertions(+), 246 deletions(-) diff --git a/be/src/exprs/bloom_filter_func.h b/be/src/exprs/bloom_filter_func.h index 0323d44315f..6ea805ee2ee 100644 --- a/be/src/exprs/bloom_filter_func.h +++ b/be/src/exprs/bloom_filter_func.h @@ -20,6 +20,7 @@ #include "exprs/block_bloom_filter.hpp" #include "exprs/runtime_filter.h" #include "olap/rowset/segment_v2/bloom_filter.h" // IWYU pragma: keep +#include "vec/columns/column_dictionary.h" #include "vec/common/string_ref.h" namespace doris { @@ -198,12 +199,6 @@ public: virtual void insert(const void* data) = 0; - virtual bool find(const void* data) const = 0; - - virtual bool find_olap_engine(const void* data) const = 0; - - virtual bool find_uint32_t(uint32_t data) const = 0; - virtual void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) = 0; virtual void find_fixed_len(const vectorized::ColumnPtr& column, uint8_t* results) = 0; @@ -336,16 +331,11 @@ struct CommonFindOp : BaseOp { } void insert(BloomFilterAdaptor& bloom_filter, const void* data) const { - bloom_filter.add_element(((T*)data)[0]); - } - bool find(const BloomFilterAdaptor& bloom_filter, const void* data) const { - return 
bloom_filter.test_element(((T*)data)[0]); + bloom_filter.add_element(*(T*)data); } + bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const override { - return find(bloom_filter, data); - } - bool find(const BloomFilterAdaptor& bloom_filter, uint32_t data) const { - return bloom_filter.test(data); + return bloom_filter.test_element(*(T*)data); } }; @@ -412,21 +402,10 @@ struct StringFindOp : public BaseOp { } } - static bool find(const BloomFilterAdaptor& bloom_filter, const void* data) { + bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const override { const auto* value = reinterpret_cast<const StringRef*>(data); - if (value == nullptr) { - return false; - } return bloom_filter.test(*value); } - - bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const override { - return StringFindOp::find(bloom_filter, data); - } - - static bool find(const BloomFilterAdaptor& bloom_filter, uint32_t data) { - return bloom_filter.test(data); - } }; // We do not need to judge whether data is empty, because null will not appear @@ -487,17 +466,22 @@ public: dummy.find_batch(*_bloom_filter, column, results); } - bool find(const void* data) const override { - DCHECK(_bloom_filter != nullptr); - return dummy.find(*_bloom_filter, data); - } - - bool find_olap_engine(const void* data) const override { - return dummy.find_olap_engine(*_bloom_filter, data); + template <bool is_nullable> + uint16_t find_dict_olap_engine(const vectorized::ColumnDictI32* column, const uint8* nullmap, + uint16_t* offsets, int number) { + uint16_t new_size = 0; + for (uint16_t i = 0; i < number; i++) { + uint16_t idx = offsets[i]; + offsets[new_size] = idx; + if constexpr (is_nullable) { + new_size += !nullmap[idx] && _bloom_filter->test(column->get_hash_value(idx)); + } else { + new_size += _bloom_filter->test(column->get_hash_value(idx)); + } + } + return new_size; } - bool find_uint32_t(uint32_t data) const override 
{ return dummy.find(*_bloom_filter, data); } - uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap, uint16_t* offsets, int number, bool is_parse_column) override { return dummy.find_batch_olap_engine(*_bloom_filter, data, nullmap, offsets, number, diff --git a/be/src/exprs/runtime_filter.cpp b/be/src/exprs/runtime_filter.cpp index 31bf2025988..8398311654f 100644 --- a/be/src/exprs/runtime_filter.cpp +++ b/be/src/exprs/runtime_filter.cpp @@ -388,43 +388,6 @@ public: BloomFilterFuncBase* get_bloomfilter() const { return _context.bloom_filter_func.get(); } - void insert(const void* data) { - switch (_filter_type) { - case RuntimeFilterType::IN_FILTER: { - if (_is_ignored_in_filter) { - break; - } - _context.hybrid_set->insert(data); - break; - } - case RuntimeFilterType::MIN_FILTER: - case RuntimeFilterType::MAX_FILTER: - case RuntimeFilterType::MINMAX_FILTER: { - _context.minmax_func->insert(data); - break; - } - case RuntimeFilterType::BLOOM_FILTER: { - _context.bloom_filter_func->insert(data); - break; - } - case RuntimeFilterType::IN_OR_BLOOM_FILTER: { - if (_is_bloomfilter) { - _context.bloom_filter_func->insert(data); - } else { - _context.hybrid_set->insert(data); - } - break; - } - case RuntimeFilterType::BITMAP_FILTER: { - _context.bitmap_filter_func->insert(data); - break; - } - default: - DCHECK(false); - break; - } - } - void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) { switch (_filter_type) { case RuntimeFilterType::IN_FILTER: { @@ -458,24 +421,6 @@ public: } } - void insert(const StringRef& value) { - switch (_column_return_type) { - case TYPE_CHAR: - case TYPE_VARCHAR: - case TYPE_HLL: - case TYPE_STRING: { - // StringRef->StringRef - StringRef data = StringRef(value.data, value.size); - insert(reinterpret_cast<const void*>(&data)); - break; - } - - default: - insert(reinterpret_cast<const void*>(value.data)); - break; - } - } - void insert_batch(const vectorized::ColumnPtr& column, size_t start) { if 
(get_real_type() == RuntimeFilterType::BITMAP_FILTER) { bitmap_filter_insert_batch(column, start); @@ -1050,18 +995,6 @@ void IRuntimeFilter::copy_from_other(IRuntimeFilter* other) { _wrapper->_context = other->_wrapper->_context; } -void IRuntimeFilter::insert(const void* data) { - DCHECK(is_producer()); - if (!_is_ignored) { - _wrapper->insert(data); - } -} - -void IRuntimeFilter::insert(const StringRef& value) { - DCHECK(is_producer()); - _wrapper->insert(value); -} - void IRuntimeFilter::insert_batch(const vectorized::ColumnPtr column, size_t start) { DCHECK(is_producer()); _wrapper->insert_batch(column, start); diff --git a/be/src/exprs/runtime_filter.h b/be/src/exprs/runtime_filter.h index 4b8c982fee3..187d0d757e9 100644 --- a/be/src/exprs/runtime_filter.h +++ b/be/src/exprs/runtime_filter.h @@ -244,10 +244,6 @@ public: void copy_from_other(IRuntimeFilter* other); - // insert data to build filter - // only used for producer - void insert(const void* data); - void insert(const StringRef& data); void insert_batch(vectorized::ColumnPtr column, size_t start); // publish filter diff --git a/be/src/olap/bloom_filter_predicate.h b/be/src/olap/bloom_filter_predicate.h index 51abd68a4b4..bacf61b8362 100644 --- a/be/src/olap/bloom_filter_predicate.h +++ b/be/src/olap/bloom_filter_predicate.h @@ -69,21 +69,12 @@ private: uint16_t new_size = 0; if (column.is_column_dictionary()) { - const auto* dict_col = reinterpret_cast<const vectorized::ColumnDictI32*>(&column); - for (uint16_t i = 0; i < size; i++) { - uint16_t idx = sel[i]; - sel[new_size] = idx; - if constexpr (is_nullable) { - new_size += !null_map[idx] && - _specific_filter->find_uint32_t(dict_col->get_hash_value(idx)); - } else { - new_size += _specific_filter->find_uint32_t(dict_col->get_hash_value(idx)); - } - } + const auto* dict_col = assert_cast<const vectorized::ColumnDictI32*>(&column); + new_size = _specific_filter->template find_dict_olap_engine<is_nullable>( + dict_col, null_map, sel, size); } else { 
const auto& data = - reinterpret_cast< - const vectorized::PredicateColumnType<PredicateEvaluateType<T>>*>( + assert_cast<const vectorized::PredicateColumnType<PredicateEvaluateType<T>>*>( &column) ->get_data(); new_size = _specific_filter->find_fixed_len_olap_engine((char*)data.data(), null_map, diff --git a/be/src/vec/exprs/vbloom_predicate.cpp b/be/src/vec/exprs/vbloom_predicate.cpp index 176ecb219ce..f72657c528a 100644 --- a/be/src/vec/exprs/vbloom_predicate.cpp +++ b/be/src/vec/exprs/vbloom_predicate.cpp @@ -89,16 +89,7 @@ Status VBloomPredicate::execute(VExprContext* context, Block* block, int* result size_t sz = argument_column->size(); res_data_column->resize(sz); auto* ptr = ((ColumnVector<UInt8>*)res_data_column.get())->get_data().data(); - auto type = WhichDataType(remove_nullable(block->get_by_position(arguments[0]).type)); - if (type.is_string_or_fixed_string()) { - for (size_t i = 0; i < sz; i++) { - auto ele = argument_column->get_data_at(i); - const StringRef v(ele.data, ele.size); - ptr[i] = _filter->find(reinterpret_cast<const void*>(&v)); - } - } else { - _filter->find_fixed_len(argument_column, ptr); - } + _filter->find_fixed_len(argument_column, ptr); if (_data_type->is_nullable()) { auto null_map = ColumnVector<UInt8>::create(block->rows(), 0); diff --git a/be/test/exprs/bloom_filter_predicate_test.cpp b/be/test/exprs/bloom_filter_predicate_test.cpp deleted file mode 100644 index 8c33ed13a6d..00000000000 --- a/be/test/exprs/bloom_filter_predicate_test.cpp +++ /dev/null @@ -1,117 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include <gtest/gtest-message.h> -#include <gtest/gtest-test-part.h> -#include <string.h> - -#include <memory> -#include <string> - -#include "common/object_pool.h" -#include "common/status.h" -#include "exprs/bloom_filter_func.h" -#include "exprs/create_predicate_function.h" -#include "gtest/gtest_pred_impl.h" -#include "runtime/define_primitive_type.h" -#include "vec/common/string_ref.h" - -namespace doris { -class BloomFilterPredicateTest : public testing::Test { -public: - BloomFilterPredicateTest() = default; - virtual void SetUp() {} - virtual void TearDown() {} -}; - -TEST_F(BloomFilterPredicateTest, bloom_filter_func_int_test) { - std::unique_ptr<BloomFilterFuncBase> func(create_bloom_filter(PrimitiveType::TYPE_INT)); - EXPECT_TRUE(func->init(1024, 0.05).ok()); - const int data_size = 1024; - int data[data_size]; - for (int i = 0; i < data_size; i++) { - data[i] = i; - func->insert((const void*)&data[i]); - } - for (int i = 0; i < data_size; i++) { - EXPECT_TRUE(func->find((const void*)&data[i])); - } - // test not exist val - int not_exist_val = 0x3355ff; - EXPECT_FALSE(func->find((const void*)&not_exist_val)); -} - -TEST_F(BloomFilterPredicateTest, bloom_filter_func_stringval_test) { - std::unique_ptr<BloomFilterFuncBase> func(create_bloom_filter(PrimitiveType::TYPE_VARCHAR)); - EXPECT_TRUE(func->init(1024, 0.05).ok()); - ObjectPool obj_pool; - const int data_size = 1024; - StringRef data[data_size]; - for (int i = 0; i < data_size; i++) { - auto str = obj_pool.add(new std::string(std::to_string(i))); - data[i] = StringRef(*str); -
func->insert((const void*)&data[i]); - } - for (int i = 0; i < data_size; i++) { - EXPECT_TRUE(func->find((const void*)&data[i])); - } - // test not exist value - std::string not_exist_str = "0x3355ff"; - StringRef not_exist_val(not_exist_str); - EXPECT_FALSE(func->find((const void*)&not_exist_val)); - - // test fixed char - func.reset(create_bloom_filter(PrimitiveType::TYPE_CHAR)); - EXPECT_TRUE(func->init(1024, 0.05).ok()); - - auto varchar_true_str = obj_pool.add(new std::string("true")); - StringRef varchar_true(*varchar_true_str); - func->insert((const void*)&varchar_true); - - auto varchar_false_str = obj_pool.add(new std::string("false")); - StringRef varchar_false(*varchar_false_str); - func->insert((const void*)&varchar_false); - - StringRef fixed_char_true; - char true_buf[100] = "true"; - memset(true_buf + strlen(true_buf), 0, 100 - strlen(true_buf)); - fixed_char_true.data = true_buf; - fixed_char_true.size = 10; - - StringRef fixed_char_false; - char false_buf[100] = "false"; - memset(false_buf + strlen(false_buf), 0, 100 - strlen(false_buf)); - fixed_char_false.data = false_buf; - fixed_char_false.size = 10; - - EXPECT_TRUE(func->find_olap_engine((const void*)&fixed_char_true)); - EXPECT_TRUE(func->find_olap_engine((const void*)&fixed_char_false)); - - func->find(nullptr); -} - -TEST_F(BloomFilterPredicateTest, bloom_filter_size_test) { - std::unique_ptr<BloomFilterFuncBase> func(create_bloom_filter(PrimitiveType::TYPE_VARCHAR)); - int length = 4096; - static_cast<void>(func->init_with_fixed_length(4096)); - char* data = nullptr; - int len; - static_cast<void>(func->get_data(&data, &len)); - EXPECT_EQ(length, len); -} - -} // namespace doris --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org