This is an automated email from the ASF dual-hosted git repository.

panxiaolei pushed a commit to branch new_join
in repository https://gitbox.apache.org/repos/asf/doris.git

commit d99292d487a59a0760361bf78f5047fae79f63e2
Author: BiteTheDDDDt <pxl...@qq.com>
AuthorDate: Fri Nov 3 13:03:55 2023 +0800

    opt for rf
---
 be/src/exprs/block_bloom_filter.hpp  |   5 +-
 be/src/exprs/bloom_filter_func.h     | 111 ++++++++++++++++++++++++++++++-----
 be/src/olap/bloom_filter_predicate.h |  24 +-------
 3 files changed, 103 insertions(+), 37 deletions(-)

diff --git a/be/src/exprs/block_bloom_filter.hpp b/be/src/exprs/block_bloom_filter.hpp
index 18c34bbb312..f31d7f7d4c0 100644
--- a/be/src/exprs/block_bloom_filter.hpp
+++ b/be/src/exprs/block_bloom_filter.hpp
@@ -20,6 +20,7 @@
 
 #pragma once
 
+#include "vec/common/string_ref.h"
 #ifdef __AVX2__
 #include <immintrin.h>
 
@@ -72,7 +73,7 @@ public:
     // non-equal values will have the same hash value) is 0.
     void insert(uint32_t hash) noexcept;
     // Same as above with convenience of hashing the key.
-    void insert(const Slice& key) noexcept {
+    void insert(const StringRef& key) noexcept {
         if (key.data) {
             insert(HashUtil::crc_hash(key.data, key.size, _hash_seed));
         }
@@ -116,7 +117,7 @@ public:
 #endif
     }
     // Same as above with convenience of hashing the key.
-    bool find(const Slice& key) const noexcept {
+    bool find(const StringRef& key) const noexcept {
         if (key.data) {
             return find(HashUtil::crc_hash(key.data, key.size, _hash_seed));
         }
diff --git a/be/src/exprs/bloom_filter_func.h b/be/src/exprs/bloom_filter_func.h
index 48989473a32..a9330a01169 100644
--- a/be/src/exprs/bloom_filter_func.h
+++ b/be/src/exprs/bloom_filter_func.h
@@ -20,6 +20,7 @@
 #include "exprs/block_bloom_filter.hpp"
 #include "exprs/runtime_filter.h"
 #include "olap/rowset/segment_v2/bloom_filter.h" // IWYU pragma: keep
+#include "vec/common/string_ref.h"
 
 namespace doris {
 
@@ -53,7 +54,7 @@ public:
         return _bloom_filter->find(data);
     }
 
-    void add_bytes(const char* data, size_t len) { _bloom_filter->insert(Slice(data, len)); }
+    void add_bytes(const char* data, size_t len) { _bloom_filter->insert(StringRef(data, len)); }
 
     // test_element/find_element only used on vectorized engine
     template <typename T>
@@ -206,6 +207,10 @@ public:
 
     virtual void find_fixed_len(const vectorized::ColumnPtr& column, uint8_t* results) = 0;
 
+    virtual uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap,
+                                                uint16_t* offsets, int number,
+                                                bool is_parse_column) = 0;
+
 protected:
     // bloom filter size
     int32_t _bloom_filter_alloced;
@@ -216,8 +221,72 @@ protected:
     bool _build_bf_exactly = false;
 };
 
+struct BaseOp {
+    virtual ~BaseOp() = default;
+
+    virtual bool find_olap_engine(const BloomFilterAdaptor& bloom_filter,
+                                  const void* data) const = 0;
+
+    uint16_t find_batch_olap_engine_with_element_size(const BloomFilterAdaptor& bloom_filter,
+                                                      const char* data, const uint8* nullmap,
+                                                      uint16_t* offsets, int number,
+                                                      const bool is_parse_column,
+                                                      size_t element_size) const {
+        uint16_t new_size = 0;
+        if (is_parse_column) {
+            if (nullmap == nullptr) {
+                for (int i = 0; i < number; i++) {
+                    uint16_t idx = offsets[i];
+                    if (!find_olap_engine(bloom_filter, data + element_size * idx)) {
+                        continue;
+                    }
+                    offsets[new_size++] = idx;
+                }
+            } else {
+                for (int i = 0; i < number; i++) {
+                    uint16_t idx = offsets[i];
+                    if (nullmap[idx]) {
+                        continue;
+                    }
+                    if (!find_olap_engine(bloom_filter, data + element_size * idx)) {
+                        continue;
+                    }
+                    offsets[new_size++] = idx;
+                }
+            }
+        } else {
+            if (nullmap == nullptr) {
+                for (int i = 0; i < number; i++) {
+                    if (!find_olap_engine(bloom_filter, data + element_size * i)) {
+                        continue;
+                    }
+                    offsets[new_size++] = i;
+                }
+            } else {
+                for (int i = 0; i < number; i++) {
+                    if (nullmap[i]) {
+                        continue;
+                    }
+                    if (!find_olap_engine(bloom_filter, data + element_size * i)) {
+                        continue;
+                    }
+                    offsets[new_size++] = i;
+                }
+            }
+        }
+        return new_size;
+    }
+};
+
 template <class T>
-struct CommonFindOp {
+struct CommonFindOp : BaseOp {
+    uint16_t find_batch_olap_engine(const BloomFilterAdaptor& bloom_filter, const char* data,
+                                    const uint8* nullmap, uint16_t* offsets, int number,
+                                    const bool is_parse_column) {
+        return find_batch_olap_engine_with_element_size(bloom_filter, data, nullmap, offsets,
+                                                        number, is_parse_column, sizeof(T));
+    }
+
     void insert_batch(BloomFilterAdaptor& bloom_filter, const vectorized::ColumnPtr& column,
                       size_t start) const {
         if (column->is_nullable()) {
@@ -271,7 +340,7 @@ struct CommonFindOp {
     bool find(const BloomFilterAdaptor& bloom_filter, const void* data) const {
         return bloom_filter.test_element(((T*)data)[0]);
     }
-    bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const {
+    bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const override {
         return find(bloom_filter, data);
     }
     bool find(const BloomFilterAdaptor& bloom_filter, uint32_t data) const {
@@ -279,7 +348,14 @@ struct CommonFindOp {
     }
 };
 
-struct StringFindOp {
+struct StringFindOp : public BaseOp {
+    uint16_t find_batch_olap_engine(const BloomFilterAdaptor& bloom_filter, const char* data,
+                                    const uint8* nullmap, uint16_t* offsets, int number,
+                                    const bool is_parse_column) {
+        return find_batch_olap_engine_with_element_size(bloom_filter, data, nullmap, offsets,
+                                                        number, is_parse_column, sizeof(StringRef));
+    }
+
     static void insert_batch(BloomFilterAdaptor& bloom_filter, const vectorized::ColumnPtr& column,
                              size_t start) {
         if (column->is_nullable()) {
@@ -340,10 +416,10 @@ struct StringFindOp {
         if (value == nullptr) {
             return false;
         }
-        return bloom_filter.test(Slice(value->data, value->size));
+        return bloom_filter.test(*value);
     }
 
-    static bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) {
+    bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const override {
         return StringFindOp::find(bloom_filter, data);
     }
 
@@ -355,7 +431,8 @@ struct StringFindOp {
 // We do not need to judge whether data is empty, because null will not appear
 // when filer used by the storage engine
 struct FixedStringFindOp : public StringFindOp {
-    static bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* input_data) {
+    bool find_olap_engine(const BloomFilterAdaptor& bloom_filter,
+                          const void* input_data) const override {
         const auto* value = reinterpret_cast<const StringRef*>(input_data);
         int64_t size = value->size;
         const char* data = value->data;
@@ -363,15 +440,15 @@ struct FixedStringFindOp : public StringFindOp {
         while (size > 0 && data[size - 1] == '\0') {
             size--;
         }
-        return bloom_filter.test(Slice(value->data, size));
+        return bloom_filter.test(StringRef(value->data, size));
     }
 };
 
 struct DateTimeFindOp : public CommonFindOp<VecDateTimeValue> {
-    static bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) {
+    bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const override {
         VecDateTimeValue value;
         value.from_olap_datetime(*reinterpret_cast<const uint64_t*>(data));
-        return bloom_filter.test(Slice((char*)&value, sizeof(VecDateTimeValue)));
+        return bloom_filter.test(StringRef((char*)&value, sizeof(VecDateTimeValue)));
     }
 };
 
@@ -379,19 +456,19 @@ struct DateTimeFindOp : public CommonFindOp<VecDateTimeValue> {
 // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101684
 
 struct DateFindOp : public CommonFindOp<VecDateTimeValue> {
-    static bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) {
+    bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const override {
         uint24_t date = *static_cast<const uint24_t*>(data);
         uint64_t value = uint32_t(date);
 
         VecDateTimeValue date_value;
         date_value.from_olap_date(value);
 
-        return bloom_filter.test(Slice((char*)&date_value, sizeof(VecDateTimeValue)));
+        return bloom_filter.test(StringRef((char*)&date_value, sizeof(VecDateTimeValue)));
     }
 };
 
 struct DecimalV2FindOp : public CommonFindOp<DecimalV2Value> {
-    static bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) {
+    bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* data) const override {
         auto packed_decimal = *static_cast<const decimal12_t*>(data);
         DecimalV2Value value;
         int64_t int_value = packed_decimal.integer;
@@ -401,7 +478,7 @@ struct DecimalV2FindOp : public CommonFindOp<DecimalV2Value> {
         constexpr int decimal_value_sz = sizeof(DecimalV2Value);
         char data_bytes[decimal_value_sz];
         memcpy(&data_bytes, &value, decimal_value_sz);
-        return bloom_filter.test(Slice(data_bytes, decimal_value_sz));
+        return bloom_filter.test(StringRef(data_bytes, decimal_value_sz));
     }
 };
 
@@ -473,6 +550,12 @@ public:
 
     bool find_uint32_t(uint32_t data) const override { return dummy.find(*_bloom_filter, data); }
 
+    uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap, uint16_t* offsets,
+                                        int number, bool is_parse_column) override {
+        return dummy.find_batch_olap_engine(*_bloom_filter, data, nullmap, offsets, number,
+                                            is_parse_column);
+    }
+
 private:
     typename BloomFilterTypeTraits<type>::FindOp dummy;
 };
diff --git a/be/src/olap/bloom_filter_predicate.h b/be/src/olap/bloom_filter_predicate.h
index 87f5ff266c3..156f054a3fa 100644
--- a/be/src/olap/bloom_filter_predicate.h
+++ b/be/src/olap/bloom_filter_predicate.h
@@ -63,17 +63,6 @@ private:
             DCHECK(null_map);
         }
 
-        uint24_t tmp_uint24_value;
-        auto get_cell_value = [&tmp_uint24_value](auto& data) {
-            if constexpr (std::is_same_v<std::decay_t<decltype(data)>, uint32_t> &&
-                          T == PrimitiveType::TYPE_DATE) {
-                memcpy((char*)(&tmp_uint24_value), (char*)(&data), sizeof(uint24_t));
-                return (const char*)&tmp_uint24_value;
-            } else {
-                return (const char*)&data;
-            }
-        };
-
         uint16_t new_size = 0;
         if (column.is_column_dictionary()) {
             const auto* dict_col = reinterpret_cast<const vectorized::ColumnDictI32*>(&column);
@@ -88,20 +77,13 @@ private:
                 }
             }
         } else {
-            auto& pred_col =
+            const auto& data =
                     reinterpret_cast<
                             const vectorized::PredicateColumnType<PredicateEvaluateType<T>>*>(
                             &column)
                             ->get_data();
-
-            auto pred_col_data = pred_col.data();
-#define EVALUATE_WITH_NULL_IMPL(IDX) \
-    !null_map[IDX] && _specific_filter->find_olap_engine(get_cell_value(pred_col_data[IDX]))
-#define EVALUATE_WITHOUT_NULL_IMPL(IDX) \
-    _specific_filter->find_olap_engine(get_cell_value(pred_col_data[IDX]))
-            EVALUATE_BY_SELECTOR(EVALUATE_WITH_NULL_IMPL, EVALUATE_WITHOUT_NULL_IMPL)
-#undef EVALUATE_WITH_NULL_IMPL
-#undef EVALUATE_WITHOUT_NULL_IMPL
+            new_size = _specific_filter->find_fixed_len_olap_engine((char*)data.data(), null_map,
+                                                                    sel, size, data.size() != size);
         }
         return new_size;
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to