This is an automated email from the ASF dual-hosted git repository. panxiaolei pushed a commit to branch new_join in repository https://gitbox.apache.org/repos/asf/doris.git
commit d99292d487a59a0760361bf78f5047fae79f63e2 Author: BiteTheDDDDt <pxl...@qq.com> AuthorDate: Fri Nov 3 13:03:55 2023 +0800 opt for rf --- be/src/exprs/block_bloom_filter.hpp | 5 +- be/src/exprs/bloom_filter_func.h | 111 ++++++++++++++++++++++++++++++----- be/src/olap/bloom_filter_predicate.h | 24 +------- 3 files changed, 103 insertions(+), 37 deletions(-) diff --git a/be/src/exprs/block_bloom_filter.hpp b/be/src/exprs/block_bloom_filter.hpp index 18c34bbb312..f31d7f7d4c0 100644 --- a/be/src/exprs/block_bloom_filter.hpp +++ b/be/src/exprs/block_bloom_filter.hpp @@ -20,6 +20,7 @@ #pragma once +#include "vec/common/string_ref.h" #ifdef __AVX2__ #include <immintrin.h> @@ -72,7 +73,7 @@ public: // non-equal values will have the same hash value) is 0. void insert(uint32_t hash) noexcept; // Same as above with convenience of hashing the key. - void insert(const Slice& key) noexcept { + void insert(const StringRef& key) noexcept { if (key.data) { insert(HashUtil::crc_hash(key.data, key.size, _hash_seed)); } @@ -116,7 +117,7 @@ public: #endif } // Same as above with convenience of hashing the key. - bool find(const Slice& key) const noexcept { + bool find(const StringRef& key) const noexcept { if (key.data) { return find(HashUtil::crc_hash(key.data, key.size, _hash_seed)); } diff --git a/be/src/exprs/bloom_filter_func.h b/be/src/exprs/bloom_filter_func.h index 48989473a32..a9330a01169 100644 --- a/be/src/exprs/bloom_filter_func.h +++ b/be/src/exprs/bloom_filter_func.h @@ -20,6 +20,7 @@ #include "exprs/block_bloom_filter.hpp" #include "exprs/runtime_filter.h" #include "olap/rowset/segment_v2/bloom_filter.h" // IWYU pragma: keep +#include "vec/common/string_ref.h" namespace doris { @@ -53,7 +54,7 @@ public: return _bloom_filter->find(data); } - void add_bytes(const char* data, size_t len) { _bloom_filter->insert(Slice(data, len)); } + void add_bytes(const char* data, size_t len) { _bloom_filter->insert(StringRef(data, len)); } // test_element/find_element only used on vectorized engine template <typename T> @@ -206,6 +207,10 @@ public: virtual void find_fixed_len(const vectorized::ColumnPtr& column, uint8_t* results) = 0; + virtual uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap, + uint16_t* offsets, int number, + bool is_parse_column) = 0; + protected: // bloom filter size int32_t _bloom_filter_alloced; @@ -216,8 +221,72 @@ protected: bool _build_bf_exactly = false; }; +struct BaseOp { + virtual ~BaseOp() = default; + + virtual bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, + const void* data) const = 0; + + uint16_t find_batch_olap_engine_with_element_size(const BloomFilterAdaptor& bloom_filter, + const char* data, const uint8* nullmap, + uint16_t* offsets, int number, + const bool is_parse_column, + size_t element_size) const { + uint16_t new_size = 0; + if (is_parse_column) { + if (nullmap == nullptr) { + for (int i = 0; i < number; i++) { + uint16_t idx = offsets[i]; + if (!find_olap_engine(bloom_filter, data + element_size * idx)) { + continue; + } + offsets[new_size++] = idx; + } + } else { + for (int i = 0; i < number; i++) { + uint16_t idx = offsets[i]; + if (nullmap[idx]) { + continue; + } + if (!find_olap_engine(bloom_filter, data + element_size * idx)) { + continue; + } + offsets[new_size++] = idx; + } + } + } else { + if (nullmap == nullptr) { + for (int i = 0; i < number; i++) { + if (!find_olap_engine(bloom_filter, data + element_size * i)) { + continue; + } + offsets[new_size++] = i; + } + } else { + for (int i = 0; i < number; i++) { + if (nullmap[i]) { + continue; + } + if (!find_olap_engine(bloom_filter, data + element_size * i)) { + continue; + } + offsets[new_size++] = i; + } + } + } + return new_size; + } +}; + template <class T> -struct CommonFindOp { +struct CommonFindOp : BaseOp { + uint16_t find_batch_olap_engine(const BloomFilterAdaptor& bloom_filter, const char* data, + const uint8* nullmap, uint16_t* offsets, int number, + const bool is_parse_column) { + return find_batch_olap_engine_with_element_size(bloom_filter, data, nullmap, offsets, + number, is_parse_column, sizeof(T)); + } + void insert_batch(BloomFilterAdaptor& bloom_filter, const vectorized::ColumnPtr& column, size_t start) const { if (column->is_nullable()) { @@ -271,7 +340,7 @@ struct CommonFindOp { bool find(const BloomFilterAdaptor& bloom_filter, const void* data) const { return bloom_filter.test_element(((T*)data)[0]); } - bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const { + bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const override { return find(bloom_filter, data); } bool find(const BloomFilterAdaptor& bloom_filter, uint32_t data) const { @@ -279,7 +348,14 @@ struct CommonFindOp { } }; -struct StringFindOp { +struct StringFindOp : public BaseOp { + uint16_t find_batch_olap_engine(const BloomFilterAdaptor& bloom_filter, const char* data, + const uint8* nullmap, uint16_t* offsets, int number, + const bool is_parse_column) { + return find_batch_olap_engine_with_element_size(bloom_filter, data, nullmap, offsets, + number, is_parse_column, sizeof(StringRef)); + } + static void insert_batch(BloomFilterAdaptor& bloom_filter, const vectorized::ColumnPtr& column, size_t start) { if (column->is_nullable()) { @@ -340,10 +416,10 @@ struct StringFindOp { if (value == nullptr) { return false; } - return bloom_filter.test(Slice(value->data, value->size)); + return bloom_filter.test(*value); } - static bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) { + bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const override { return StringFindOp::find(bloom_filter, data); } @@ -355,7 +431,8 @@ struct StringFindOp { // We do not need to judge whether data is empty, because null will not appear // when filer used by the storage engine struct FixedStringFindOp : public StringFindOp { - static bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* input_data) { + bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, + const void* input_data) const override { const auto* value = reinterpret_cast<const StringRef*>(input_data); int64_t size = value->size; const char* data = value->data; @@ -363,15 +440,15 @@ struct FixedStringFindOp : public StringFindOp { while (size > 0 && data[size - 1] == '\0') { size--; } - return bloom_filter.test(Slice(value->data, size)); + return bloom_filter.test(StringRef(value->data, size)); } }; struct DateTimeFindOp : public CommonFindOp<VecDateTimeValue> { - static bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) { + bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const override { VecDateTimeValue value; value.from_olap_datetime(*reinterpret_cast<const uint64_t*>(data)); - return bloom_filter.test(Slice((char*)&value, sizeof(VecDateTimeValue))); + return bloom_filter.test(StringRef((char*)&value, sizeof(VecDateTimeValue))); } }; @@ -379,19 +456,19 @@ struct DateTimeFindOp : public CommonFindOp<VecDateTimeValue> { // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101684 struct DateFindOp : public CommonFindOp<VecDateTimeValue> { - static bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) { + bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const override { uint24_t date = *static_cast<const uint24_t*>(data); uint64_t value = uint32_t(date); VecDateTimeValue date_value; date_value.from_olap_date(value); - return bloom_filter.test(Slice((char*)&date_value, sizeof(VecDateTimeValue))); + return bloom_filter.test(StringRef((char*)&date_value, sizeof(VecDateTimeValue))); } }; struct DecimalV2FindOp : public CommonFindOp<DecimalV2Value> { - static bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) { + bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const override { auto packed_decimal = *static_cast<const decimal12_t*>(data); DecimalV2Value value; int64_t int_value = packed_decimal.integer; @@ -401,7 +478,7 @@ struct DecimalV2FindOp : public CommonFindOp<DecimalV2Value> { constexpr int decimal_value_sz = sizeof(DecimalV2Value); char data_bytes[decimal_value_sz]; memcpy(&data_bytes, &value, decimal_value_sz); - return bloom_filter.test(Slice(data_bytes, decimal_value_sz)); + return bloom_filter.test(StringRef(data_bytes, decimal_value_sz)); } }; @@ -473,6 +550,12 @@ public: bool find_uint32_t(uint32_t data) const override { return dummy.find(*_bloom_filter, data); } + uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap, uint16_t* offsets, + int number, bool is_parse_column) override { + return dummy.find_batch_olap_engine(*_bloom_filter, data, nullmap, offsets, number, + is_parse_column); + } + private: typename BloomFilterTypeTraits<type>::FindOp dummy; }; diff --git a/be/src/olap/bloom_filter_predicate.h b/be/src/olap/bloom_filter_predicate.h index 87f5ff266c3..156f054a3fa 100644 --- a/be/src/olap/bloom_filter_predicate.h +++ b/be/src/olap/bloom_filter_predicate.h @@ -63,17 +63,6 @@ private: DCHECK(null_map); } - uint24_t tmp_uint24_value; - auto get_cell_value = [&tmp_uint24_value](auto& data) { - if constexpr (std::is_same_v<std::decay_t<decltype(data)>, uint32_t> && - T == PrimitiveType::TYPE_DATE) { - memcpy((char*)(&tmp_uint24_value), (char*)(&data), sizeof(uint24_t)); - return (const char*)&tmp_uint24_value; - } else { - return (const char*)&data; - } - }; - uint16_t new_size = 0; if (column.is_column_dictionary()) { const auto* dict_col = reinterpret_cast<const vectorized::ColumnDictI32*>(&column); @@ -88,20 +77,13 @@ private: } } } else { - auto& pred_col = + const auto& data = reinterpret_cast< const vectorized::PredicateColumnType<PredicateEvaluateType<T>>*>( &column) ->get_data(); - - auto pred_col_data = pred_col.data(); -#define EVALUATE_WITH_NULL_IMPL(IDX) \ - !null_map[IDX] && _specific_filter->find_olap_engine(get_cell_value(pred_col_data[IDX])) -#define EVALUATE_WITHOUT_NULL_IMPL(IDX) \ - _specific_filter->find_olap_engine(get_cell_value(pred_col_data[IDX])) - EVALUATE_BY_SELECTOR(EVALUATE_WITH_NULL_IMPL, EVALUATE_WITHOUT_NULL_IMPL) -#undef EVALUATE_WITH_NULL_IMPL -#undef EVALUATE_WITHOUT_NULL_IMPL + new_size = _specific_filter->find_fixed_len_olap_engine((char*)data.data(), null_map, + sel, size, data.size() != size); } return new_size; } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org