This is an automated email from the ASF dual-hosted git repository.

lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 104a822a2f4 [Refactor](RuntimeFilter) refactor rf code to improve 
performance (#28094)
104a822a2f4 is described below

commit 104a822a2f4f77bb3777f813d24974a3936d6c81
Author: HappenLee <happen...@hotmail.com>
AuthorDate: Thu Dec 7 20:32:30 2023 +0800

    [Refactor](RuntimeFilter) refactor rf code to improve performance (#28094)
---
 be/src/exprs/bloom_filter_func.h | 156 +++++++++++++++++----------------------
 be/src/exprs/runtime_filter.h    |   1 +
 2 files changed, 67 insertions(+), 90 deletions(-)

diff --git a/be/src/exprs/bloom_filter_func.h b/be/src/exprs/bloom_filter_func.h
index 6ea805ee2ee..3c60ccc89c7 100644
--- a/be/src/exprs/bloom_filter_func.h
+++ b/be/src/exprs/bloom_filter_func.h
@@ -55,8 +55,6 @@ public:
         return _bloom_filter->find(data);
     }
 
-    void add_bytes(const char* data, size_t len) { 
_bloom_filter->insert(StringRef(data, len)); }
-
     // test_element/find_element only used on vectorized engine
     template <typename T>
     bool test_element(T element) const {
@@ -217,70 +215,76 @@ protected:
     bool _build_bf_exactly = false;
 };
 
-struct BaseOp {
-    virtual ~BaseOp() = default;
-
-    virtual bool find_olap_engine(const BloomFilterAdaptor& bloom_filter,
-                                  const void* data) const = 0;
-
-    uint16_t find_batch_olap_engine_with_element_size(const 
BloomFilterAdaptor& bloom_filter,
-                                                      const char* data, const 
uint8* nullmap,
-                                                      uint16_t* offsets, int 
number,
-                                                      const bool 
is_parse_column,
-                                                      size_t element_size) 
const {
-        uint16_t new_size = 0;
-        if (is_parse_column) {
-            if (nullmap == nullptr) {
-                for (int i = 0; i < number; i++) {
-                    uint16_t idx = offsets[i];
-                    if (!find_olap_engine(bloom_filter, data + element_size * 
idx)) {
-                        continue;
-                    }
-                    offsets[new_size++] = idx;
+template <typename T, bool need_trim = false>
+uint16_t find_batch_olap(const BloomFilterAdaptor& bloom_filter, const char* 
data,
+                         const uint8* nullmap, uint16_t* offsets, int number,
+                         const bool is_parse_column) {
+    auto get_element = [](const char* input_data, int idx) {
+        if constexpr (std::is_same_v<T, StringRef> && need_trim) {
+            const auto value = ((const StringRef*)(input_data))[idx];
+            int64_t size = value.size;
+            const char* data = value.data;
+            // CHAR type may pad the tail with \0, need to trim
+            while (size > 0 && data[size - 1] == '\0') {
+                size--;
+            }
+            return StringRef(value.data, size);
+        } else {
+            return ((const T*)(input_data))[idx];
+        }
+    };
+
+    uint16_t new_size = 0;
+    if (is_parse_column) {
+        if (nullmap == nullptr) {
+            for (int i = 0; i < number; i++) {
+                uint16_t idx = offsets[i];
+                if (!bloom_filter.test_element(get_element(data, idx))) {
+                    continue;
                 }
-            } else {
-                for (int i = 0; i < number; i++) {
-                    uint16_t idx = offsets[i];
-                    if (nullmap[idx]) {
-                        continue;
-                    }
-                    if (!find_olap_engine(bloom_filter, data + element_size * 
idx)) {
-                        continue;
-                    }
-                    offsets[new_size++] = idx;
+                offsets[new_size++] = idx;
+            }
+        } else {
+            for (int i = 0; i < number; i++) {
+                uint16_t idx = offsets[i];
+                if (nullmap[idx]) {
+                    continue;
+                }
+                if (!bloom_filter.test_element(get_element(data, idx))) {
+                    continue;
+                }
+                offsets[new_size++] = idx;
+            }
+        }
+    } else {
+        if (nullmap == nullptr) {
+            for (int i = 0; i < number; i++) {
+                if (!bloom_filter.test_element(get_element(data, i))) {
+                    continue;
                 }
+                offsets[new_size++] = i;
             }
         } else {
-            if (nullmap == nullptr) {
-                for (int i = 0; i < number; i++) {
-                    if (!find_olap_engine(bloom_filter, data + element_size * 
i)) {
-                        continue;
-                    }
-                    offsets[new_size++] = i;
+            for (int i = 0; i < number; i++) {
+                if (nullmap[i]) {
+                    continue;
                 }
-            } else {
-                for (int i = 0; i < number; i++) {
-                    if (nullmap[i]) {
-                        continue;
-                    }
-                    if (!find_olap_engine(bloom_filter, data + element_size * 
i)) {
-                        continue;
-                    }
-                    offsets[new_size++] = i;
+                if (!bloom_filter.test_element(get_element(data, i))) {
+                    continue;
                 }
+                offsets[new_size++] = i;
             }
         }
-        return new_size;
     }
-};
+    return new_size;
+}
 
 template <class T>
-struct CommonFindOp : BaseOp {
+struct CommonFindOp {
     uint16_t find_batch_olap_engine(const BloomFilterAdaptor& bloom_filter, 
const char* data,
                                     const uint8* nullmap, uint16_t* offsets, 
int number,
                                     const bool is_parse_column) {
-        return find_batch_olap_engine_with_element_size(bloom_filter, data, 
nullmap, offsets,
-                                                        number, 
is_parse_column, sizeof(T));
+        return find_batch_olap<T>(bloom_filter, data, nullmap, offsets, 
number, is_parse_column);
     }
 
     void insert_batch(BloomFilterAdaptor& bloom_filter, const 
vectorized::ColumnPtr& column,
@@ -333,22 +337,11 @@ struct CommonFindOp : BaseOp {
     void insert(BloomFilterAdaptor& bloom_filter, const void* data) const {
         bloom_filter.add_element(*(T*)data);
     }
-
-    bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* 
data) const override {
-        return bloom_filter.test_element(*(T*)data);
-    }
 };
 
-struct StringFindOp : public BaseOp {
-    uint16_t find_batch_olap_engine(const BloomFilterAdaptor& bloom_filter, 
const char* data,
-                                    const uint8* nullmap, uint16_t* offsets, 
int number,
-                                    const bool is_parse_column) {
-        return find_batch_olap_engine_with_element_size(bloom_filter, data, 
nullmap, offsets,
-                                                        number, 
is_parse_column, sizeof(StringRef));
-    }
-
-    static void insert_batch(BloomFilterAdaptor& bloom_filter, const 
vectorized::ColumnPtr& column,
-                             size_t start) {
+struct StringFindOp : CommonFindOp<StringRef> {
+    void insert_batch(BloomFilterAdaptor& bloom_filter, const 
vectorized::ColumnPtr& column,
+                      size_t start) {
         if (column->is_nullable()) {
             const auto* nullable = assert_cast<const 
vectorized::ColumnNullable*>(column.get());
             const auto& col =
@@ -370,8 +363,8 @@ struct StringFindOp : public BaseOp {
         }
     }
 
-    static void find_batch(const BloomFilterAdaptor& bloom_filter,
-                           const vectorized::ColumnPtr& column, uint8_t* 
results) {
+    void find_batch(const BloomFilterAdaptor& bloom_filter, const 
vectorized::ColumnPtr& column,
+                    uint8_t* results) {
         if (column->is_nullable()) {
             const auto* nullable = assert_cast<const 
vectorized::ColumnNullable*>(column.get());
             const auto& col =
@@ -394,33 +387,16 @@ struct StringFindOp : public BaseOp {
             }
         }
     }
-
-    static void insert(BloomFilterAdaptor& bloom_filter, const void* data) {
-        const auto* value = reinterpret_cast<const StringRef*>(data);
-        if (value) {
-            bloom_filter.add_bytes(value->data, value->size);
-        }
-    }
-
-    bool find_olap_engine(const BloomFilterAdaptor& bloom_filter, const void* 
data) const override {
-        const auto* value = reinterpret_cast<const StringRef*>(data);
-        return bloom_filter.test(*value);
-    }
 };
 
 // We do not need to judge whether data is empty, because null will not appear
 // when filer used by the storage engine
 struct FixedStringFindOp : public StringFindOp {
-    bool find_olap_engine(const BloomFilterAdaptor& bloom_filter,
-                          const void* input_data) const override {
-        const auto* value = reinterpret_cast<const StringRef*>(input_data);
-        int64_t size = value->size;
-        const char* data = value->data;
-        // CHAR type may pad the tail with \0, need to trim
-        while (size > 0 && data[size - 1] == '\0') {
-            size--;
-        }
-        return bloom_filter.test(StringRef(value->data, size));
+    uint16_t find_batch_olap_engine(const BloomFilterAdaptor& bloom_filter, 
const char* data,
+                                    const uint8* nullmap, uint16_t* offsets, 
int number,
+                                    const bool is_parse_column) {
+        return find_batch_olap<StringRef, true>(bloom_filter, data, nullmap, 
offsets, number,
+                                                is_parse_column);
     }
 };
 
diff --git a/be/src/exprs/runtime_filter.h b/be/src/exprs/runtime_filter.h
index 187d0d757e9..97078c11757 100644
--- a/be/src/exprs/runtime_filter.h
+++ b/be/src/exprs/runtime_filter.h
@@ -244,6 +244,7 @@ public:
 
     void copy_from_other(IRuntimeFilter* other);
 
+    // insert data to build filter
     void insert_batch(vectorized::ColumnPtr column, size_t start);
 
     // publish filter


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to