This is an automated email from the ASF dual-hosted git repository. lihaopeng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 104a822a2f4 [Refacotr](RuntimeFilter) refactor rf code to improve performance (#28094) 104a822a2f4 is described below commit 104a822a2f4f77bb3777f813d24974a3936d6c81 Author: HappenLee <happen...@hotmail.com> AuthorDate: Thu Dec 7 20:32:30 2023 +0800 [Refacotr](RuntimeFilter) refactor rf code to improve performance (#28094) --- be/src/exprs/bloom_filter_func.h | 156 +++++++++++++++++---------------------- be/src/exprs/runtime_filter.h | 1 + 2 files changed, 67 insertions(+), 90 deletions(-) diff --git a/be/src/exprs/bloom_filter_func.h b/be/src/exprs/bloom_filter_func.h index 6ea805ee2ee..3c60ccc89c7 100644 --- a/be/src/exprs/bloom_filter_func.h +++ b/be/src/exprs/bloom_filter_func.h @@ -55,8 +55,6 @@ public: return _bloom_filter->find(data); } - void add_bytes(const char* data, size_t len) { _bloom_filter->insert(StringRef(data, len)); } - // test_element/find_element only used on vectorized engine template <typename T> bool test_element(T element) const { @@ -217,70 +215,76 @@ protected: bool _build_bf_exactly = false; }; -struct BaseOp { - virtual ~BaseOp() = default; - - virtual bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, - const void* data) const = 0; - - uint16_t find_batch_olap_engine_with_element_size(const BloomFilterAdaptor& bloom_filter, - const char* data, const uint8* nullmap, - uint16_t* offsets, int number, - const bool is_parse_column, - size_t element_size) const { - uint16_t new_size = 0; - if (is_parse_column) { - if (nullmap == nullptr) { - for (int i = 0; i < number; i++) { - uint16_t idx = offsets[i]; - if (!find_olap_engine(bloom_filter, data + element_size * idx)) { - continue; - } - offsets[new_size++] = idx; +template <typename T, bool need_trim = false> +uint16_t find_batch_olap(const BloomFilterAdaptor& bloom_filter, const char* data, + const uint8* nullmap, uint16_t* offsets, int number, + const bool is_parse_column) { + auto get_element = [](const char* input_data, int idx) { + if constexpr (std::is_same_v<T, StringRef> && need_trim) { + const auto value = ((const StringRef*)(input_data))[idx]; + int64_t size = value.size; + const char* data = value.data; + // CHAR type may pad the tail with \0, need to trim + while (size > 0 && data[size - 1] == '\0') { + size--; + } + return StringRef(value.data, size); + } else { + return ((const T*)(input_data))[idx]; + } + }; + + uint16_t new_size = 0; + if (is_parse_column) { + if (nullmap == nullptr) { + for (int i = 0; i < number; i++) { + uint16_t idx = offsets[i]; + if (!bloom_filter.test_element(get_element(data, idx))) { + continue; } - } else { - for (int i = 0; i < number; i++) { - uint16_t idx = offsets[i]; - if (nullmap[idx]) { - continue; - } - if (!find_olap_engine(bloom_filter, data + element_size * idx)) { - continue; - } - offsets[new_size++] = idx; + offsets[new_size++] = idx; + } + } else { + for (int i = 0; i < number; i++) { + uint16_t idx = offsets[i]; + if (nullmap[idx]) { + continue; + } + if (!bloom_filter.test_element(get_element(data, idx))) { + continue; + } + offsets[new_size++] = idx; + } + } + } else { + if (nullmap == nullptr) { + for (int i = 0; i < number; i++) { + if (!bloom_filter.test_element(get_element(data, i))) { + continue; } + offsets[new_size++] = i; } } else { - if (nullmap == nullptr) { - for (int i = 0; i < number; i++) { - if (!find_olap_engine(bloom_filter, data + element_size * i)) { - continue; - } - offsets[new_size++] = i; + for (int i = 0; i < number; i++) { + if (nullmap[i]) { + continue; } - } else { - for (int i = 0; i < number; i++) { - if (nullmap[i]) { - continue; - } - if (!find_olap_engine(bloom_filter, data + element_size * i)) { - continue; - } - offsets[new_size++] = i; + if (!bloom_filter.test_element(get_element(data, i))) { + continue; } + offsets[new_size++] = i; } } - return new_size; } -}; + return new_size; +} template <class T> -struct CommonFindOp : BaseOp { +struct CommonFindOp { uint16_t find_batch_olap_engine(const BloomFilterAdaptor& bloom_filter, const char* data, const uint8* nullmap, uint16_t* offsets, int number, const bool is_parse_column) { - return find_batch_olap_engine_with_element_size(bloom_filter, data, nullmap, offsets, - number, is_parse_column, sizeof(T)); + return find_batch_olap<T>(bloom_filter, data, nullmap, offsets, number, is_parse_column); } void insert_batch(BloomFilterAdaptor& bloom_filter, const vectorized::ColumnPtr& column, @@ -333,22 +337,11 @@ struct CommonFindOp : BaseOp { void insert(BloomFilterAdaptor& bloom_filter, const void* data) const { bloom_filter.add_element(*(T*)data); } - - bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const override { - return bloom_filter.test_element(*(T*)data); - } }; -struct StringFindOp : public BaseOp { - uint16_t find_batch_olap_engine(const BloomFilterAdaptor& bloom_filter, const char* data, - const uint8* nullmap, uint16_t* offsets, int number, - const bool is_parse_column) { - return find_batch_olap_engine_with_element_size(bloom_filter, data, nullmap, offsets, - number, is_parse_column, sizeof(StringRef)); - } - - static void insert_batch(BloomFilterAdaptor& bloom_filter, const vectorized::ColumnPtr& column, - size_t start) { +struct StringFindOp : CommonFindOp<StringRef> { + void insert_batch(BloomFilterAdaptor& bloom_filter, const vectorized::ColumnPtr& column, + size_t start) { if (column->is_nullable()) { const auto* nullable = assert_cast<const vectorized::ColumnNullable*>(column.get()); const auto& col = @@ -370,8 +363,8 @@ struct StringFindOp : public BaseOp { } } - static void find_batch(const BloomFilterAdaptor& bloom_filter, - const vectorized::ColumnPtr& column, uint8_t* results) { + void find_batch(const BloomFilterAdaptor& bloom_filter, const vectorized::ColumnPtr& column, + uint8_t* results) { if (column->is_nullable()) { const auto* nullable = assert_cast<const vectorized::ColumnNullable*>(column.get()); const auto& col = @@ -394,33 +387,16 @@ struct StringFindOp : public BaseOp { } } } - - static void insert(BloomFilterAdaptor& bloom_filter, const void* data) { - const auto* value = reinterpret_cast<const StringRef*>(data); - if (value) { - bloom_filter.add_bytes(value->data, value->size); - } - } - - bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const override { - const auto* value = reinterpret_cast<const StringRef*>(data); - return bloom_filter.test(*value); - } }; // We do not need to judge whether data is empty, because null will not appear // when filer used by the storage engine struct FixedStringFindOp : public StringFindOp { - bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, - const void* input_data) const override { - const auto* value = reinterpret_cast<const StringRef*>(input_data); - int64_t size = value->size; - const char* data = value->data; - // CHAR type may pad the tail with \0, need to trim - while (size > 0 && data[size - 1] == '\0') { - size--; - } - return bloom_filter.test(StringRef(value->data, size)); + uint16_t find_batch_olap_engine(const BloomFilterAdaptor& bloom_filter, const char* data, + const uint8* nullmap, uint16_t* offsets, int number, + const bool is_parse_column) { + return find_batch_olap<StringRef, true>(bloom_filter, data, nullmap, offsets, number, + is_parse_column); } }; diff --git a/be/src/exprs/runtime_filter.h b/be/src/exprs/runtime_filter.h index 187d0d757e9..97078c11757 100644 --- a/be/src/exprs/runtime_filter.h +++ b/be/src/exprs/runtime_filter.h @@ -244,6 +244,7 @@ public: void copy_from_other(IRuntimeFilter* other); + // insert data to build filter void insert_batch(vectorized::ColumnPtr column, size_t start); // publish filter --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org