This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-1.2-lts
in repository https://gitbox.apache.org/repos/asf/doris.git

commit bb91922f9bf5df006aa14c8c75492ce8d17638bb
Author: amory <wangqian...@selectdb.com>
AuthorDate: Thu Jul 13 11:53:54 2023 +0800

    [Improve](array)improve array hash function (#21779)
    
    cherry-pick from #21699
---
 be/src/util/timezone_utils.cpp         |  2 +
 be/src/vec/columns/column.h            |  8 +++-
 be/src/vec/columns/column_array.cpp    | 73 ++++++++++++++++++++++++++--------
 be/src/vec/columns/column_array.h      |  6 ++-
 be/src/vec/columns/column_const.h      |  8 ++--
 be/src/vec/columns/column_decimal.cpp  | 50 +++++++++++++++++------
 be/src/vec/columns/column_decimal.h    | 14 ++++++-
 be/src/vec/columns/column_nullable.cpp | 34 +++++++++++-----
 be/src/vec/columns/column_nullable.h   |  6 ++-
 be/src/vec/columns/column_string.h     | 42 +++++++++++++++----
 be/src/vec/columns/column_vector.h     | 42 +++++++++++++++----
 11 files changed, 219 insertions(+), 66 deletions(-)

diff --git a/be/src/util/timezone_utils.cpp b/be/src/util/timezone_utils.cpp
index c22c12b560..b746634e03 100644
--- a/be/src/util/timezone_utils.cpp
+++ b/be/src/util/timezone_utils.cpp
@@ -17,7 +17,9 @@
 //
 
 #include "util/timezone_utils.h"
+
 #include <cctz/time_zone.h>
+
 #include <boost/algorithm/string.hpp>
 #include <filesystem>
 #include <string>
diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h
index 89dd4fbef4..c82944b44d 100644
--- a/be/src/vec/columns/column.h
+++ b/be/src/vec/columns/column.h
@@ -360,7 +360,9 @@ public:
         LOG(FATAL) << get_name() << "update_hashes_with_value xxhash not 
supported";
     };
 
-    virtual void update_xxHash_with_value(size_t n, uint64_t& hash) const {
+    // use range for one hash value to avoid virtual function call in loop
+    virtual void update_xxHash_with_value(size_t start, size_t end, uint64_t& 
hash,
+                                          const uint8_t* __restrict null_data) 
const {
         LOG(FATAL) << get_name() << " update_hash_with_value xxhash not 
supported";
     }
 
@@ -372,7 +374,9 @@ public:
         LOG(FATAL) << get_name() << "update_crcs_with_value not supported";
     };
 
-    virtual void update_crc_with_value(size_t n, uint64_t& hash) const {
+    // use range for one hash value to avoid virtual function call in loop
+    virtual void update_crc_with_value(size_t start, size_t end, uint64_t& 
hash,
+                                       const uint8_t* __restrict null_data) 
const {
         LOG(FATAL) << get_name() << " update_crc_with_value not supported";
     }
 
diff --git a/be/src/vec/columns/column_array.cpp 
b/be/src/vec/columns/column_array.cpp
index 999991fde5..6a2f40792b 100644
--- a/be/src/vec/columns/column_array.cpp
+++ b/be/src/vec/columns/column_array.cpp
@@ -229,25 +229,64 @@ void 
ColumnArray::update_hashes_with_value(std::vector<SipHash>& hashes,
 }
 
 // for every array row calculate xxHash
-void ColumnArray::update_xxHash_with_value(size_t n, uint64_t& hash) const {
-    size_t elem_size = size_at(n);
-    size_t offset = offset_at(n);
-    hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&elem_size), sizeof(elem_size),
-                                      hash);
-    for (auto i = 0; i < elem_size; ++i) {
-        get_data().update_xxHash_with_value(offset + i, hash);
+void ColumnArray::update_xxHash_with_value(size_t start, size_t end, uint64_t& 
hash,
+                                           const uint8_t* __restrict 
null_data) const {
+    auto& offsets_column = get_offsets();
+    if (null_data) {
+        for (size_t i = start; i < end; ++i) {
+            if (null_data[i] == 0) {
+                size_t elem_size = offsets_column[i] - offsets_column[i - 1];
+                if (elem_size == 0) {
+                    hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&elem_size),
+                                                      sizeof(elem_size), hash);
+                } else {
+                    get_data().update_crc_with_value(offsets_column[i - 1], 
offsets_column[i], hash,
+                                                     nullptr);
+                }
+            }
+        }
+    } else {
+        for (size_t i = start; i < end; ++i) {
+            size_t elem_size = offsets_column[i] - offsets_column[i - 1];
+            if (elem_size == 0) {
+                hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&elem_size),
+                                                  sizeof(elem_size), hash);
+            } else {
+                get_data().update_crc_with_value(offsets_column[i - 1], 
offsets_column[i], hash,
+                                                 nullptr);
+            }
+        }
     }
 }
 
 // for every array row calculate crcHash
-void ColumnArray::update_crc_with_value(size_t n, uint64_t& crc) const {
-    size_t elem_size = size_at(n);
-    size_t offset = offset_at(n);
-
-    crc = HashUtil::zlib_crc_hash(reinterpret_cast<const char*>(&elem_size), 
sizeof(elem_size),
-                                  crc);
-    for (auto i = 0; i < elem_size; ++i) {
-        get_data().update_crc_with_value(offset + i, crc);
+void ColumnArray::update_crc_with_value(size_t start, size_t end, uint64_t& 
hash,
+                                        const uint8_t* __restrict null_data) 
const {
+    auto& offsets_column = get_offsets();
+    if (null_data) {
+        for (size_t i = start; i < end; ++i) {
+            if (null_data[i] == 0) {
+                size_t elem_size = offsets_column[i] - offsets_column[i - 1];
+                if (elem_size == 0) {
+                    hash = HashUtil::zlib_crc_hash(reinterpret_cast<const 
char*>(&elem_size),
+                                                   sizeof(elem_size), hash);
+                } else {
+                    get_data().update_crc_with_value(offsets_column[i - 1], 
offsets_column[i], hash,
+                                                     nullptr);
+                }
+            }
+        }
+    } else {
+        for (size_t i = start; i < end; ++i) {
+            size_t elem_size = offsets_column[i] - offsets_column[i - 1];
+            if (elem_size == 0) {
+                hash = HashUtil::zlib_crc_hash(reinterpret_cast<const 
char*>(&elem_size),
+                                               sizeof(elem_size), hash);
+            } else {
+                get_data().update_crc_with_value(offsets_column[i - 1], 
offsets_column[i], hash,
+                                                 nullptr);
+            }
+        }
     }
 }
 
@@ -257,12 +296,12 @@ void ColumnArray::update_hashes_with_value(uint64_t* 
__restrict hashes,
     if (null_data) {
         for (size_t i = 0; i < s; ++i) {
             if (null_data[i] == 0) {
-                update_xxHash_with_value(i, hashes[i]);
+                update_xxHash_with_value(i, i + 1, hashes[i], nullptr);
             }
         }
     } else {
         for (size_t i = 0; i < s; ++i) {
-            update_xxHash_with_value(i, hashes[i]);
+            update_xxHash_with_value(i, i + 1, hashes[i], nullptr);
         }
     }
 }
diff --git a/be/src/vec/columns/column_array.h 
b/be/src/vec/columns/column_array.h
index 7239de3347..2725814fad 100644
--- a/be/src/vec/columns/column_array.h
+++ b/be/src/vec/columns/column_array.h
@@ -97,8 +97,10 @@ public:
     void insert_data(const char* pos, size_t length) override;
     StringRef serialize_value_into_arena(size_t n, Arena& arena, char const*& 
begin) const override;
     const char* deserialize_and_insert_from_arena(const char* pos) override;
-    void update_xxHash_with_value(size_t n, uint64_t& hash) const override;
-    void update_crc_with_value(size_t n, uint64_t& crc) const override;
+    void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                  const uint8_t* __restrict null_data) const 
override;
+    void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                               const uint8_t* __restrict null_data) const 
override;
 
     void update_hashes_with_value(std::vector<SipHash>& hashes,
                                   const uint8_t* __restrict null_data) const 
override;
diff --git a/be/src/vec/columns/column_const.h 
b/be/src/vec/columns/column_const.h
index 13640a0a9a..05ffe9ac6f 100644
--- a/be/src/vec/columns/column_const.h
+++ b/be/src/vec/columns/column_const.h
@@ -135,7 +135,8 @@ public:
 
     void update_hashes_with_value(uint64_t* __restrict hashes,
                                   const uint8_t* __restrict null_data) const 
override;
-    void update_xxHash_with_value(size_t n, uint64_t& hash) const override {
+    void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                  const uint8_t* __restrict null_data) const 
override {
         auto real_data = data->get_data_at(0);
         if (real_data.data == nullptr) {
             hash = HashUtil::xxHash64NullWithSeed(hash);
@@ -144,8 +145,9 @@ public:
         }
     }
 
-    void update_crc_with_value(size_t n, uint64_t& crc) const override {
-        get_data_column_ptr()->update_crc_with_value(n, crc);
+    void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                               const uint8_t* __restrict null_data) const 
override {
+        get_data_column_ptr()->update_crc_with_value(start, end, hash, 
nullptr);
     }
 
     ColumnPtr filter(const Filter& filt, ssize_t result_size_hint) const 
override;
diff --git a/be/src/vec/columns/column_decimal.cpp 
b/be/src/vec/columns/column_decimal.cpp
index 78e7d055cb..4faa3fa490 100644
--- a/be/src/vec/columns/column_decimal.cpp
+++ b/be/src/vec/columns/column_decimal.cpp
@@ -128,16 +128,27 @@ void 
ColumnDecimal<T>::update_hashes_with_value(std::vector<SipHash>& hashes,
 }
 
 template <typename T>
-void ColumnDecimal<T>::update_crc_with_value(size_t n, uint64_t& crc) const {
-    if constexpr (!IsDecimalV2<T>) {
-        crc = HashUtil::zlib_crc_hash(&data[n], sizeof(T), crc);
+void ColumnDecimal<T>::update_crc_with_value(size_t start, size_t end, 
uint64_t& hash,
+                                             const uint8_t* __restrict 
null_data) const {
+    if (null_data == nullptr) {
+        for (size_t i = start; i < end; i++) {
+            if constexpr (!IsDecimalV2<T>) {
+                hash = HashUtil::zlib_crc_hash(&data[i], sizeof(T), hash);
+            } else {
+                decimalv2_do_crc(i, hash);
+            }
+        }
     } else {
-        const DecimalV2Value& dec_val = (const DecimalV2Value&)data[n];
-        int64_t int_val = dec_val.int_value();
-        int32_t frac_val = dec_val.frac_value();
-        crc = HashUtil::zlib_crc_hash(&int_val, sizeof(int_val), crc);
-        crc = HashUtil::zlib_crc_hash(&frac_val, sizeof(frac_val), crc);
-    };
+        for (size_t i = start; i < end; i++) {
+            if (null_data[i] == 0) {
+                if constexpr (!IsDecimalV2<T>) {
+                    hash = HashUtil::zlib_crc_hash(&data[i], sizeof(T), hash);
+                } else {
+                    decimalv2_do_crc(i, hash);
+                }
+            }
+        }
+    }
 }
 
 template <typename T>
@@ -150,19 +161,32 @@ void 
ColumnDecimal<T>::update_crcs_with_value(std::vector<uint64_t>& hashes, Pri
     } else {
         if (null_data == nullptr) {
             for (size_t i = 0; i < s; i++) {
-                update_crc_with_value(i, hashes[i]);
+                decimalv2_do_crc(i, hashes[i]);
             }
         } else {
             for (size_t i = 0; i < s; i++) {
-                if (null_data[i] == 0) update_crc_with_value(i, hashes[i]);
+                if (null_data[i] == 0) decimalv2_do_crc(i, hashes[i]);
             }
         }
     }
 }
 
 template <typename T>
-void ColumnDecimal<T>::update_xxHash_with_value(size_t n, uint64_t& hash) 
const {
-    hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&data[n]), 
sizeof(T), hash);
+void ColumnDecimal<T>::update_xxHash_with_value(size_t start, size_t end, 
uint64_t& hash,
+                                                const uint8_t* __restrict 
null_data) const {
+    if (null_data) {
+        for (size_t i = start; i < end; i++) {
+            if (null_data[i] == 0) {
+                hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&data[i]),
+                                                  sizeof(T), hash);
+            }
+        }
+    } else {
+        for (size_t i = start; i < end; i++) {
+            hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&data[i]), sizeof(T),
+                                              hash);
+        }
+    }
 }
 
 template <typename T>
diff --git a/be/src/vec/columns/column_decimal.h 
b/be/src/vec/columns/column_decimal.h
index 8968ff6017..ac3bd7456f 100644
--- a/be/src/vec/columns/column_decimal.h
+++ b/be/src/vec/columns/column_decimal.h
@@ -159,8 +159,10 @@ public:
                                   const uint8_t* __restrict null_data) const 
override;
     void update_crcs_with_value(std::vector<uint64_t>& hashes, PrimitiveType 
type,
                                 const uint8_t* __restrict null_data) const 
override;
-    void update_xxHash_with_value(size_t n, uint64_t& hash) const override;
-    void update_crc_with_value(size_t n, uint64_t& crc) const override;
+    void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                  const uint8_t* __restrict null_data) const 
override;
+    void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                               const uint8_t* __restrict null_data) const 
override;
 
     int compare_at(size_t n, size_t m, const IColumn& rhs_, int 
nan_direction_hint) const override;
     void get_permutation(bool reverse, size_t limit, int nan_direction_hint,
@@ -265,6 +267,14 @@ protected:
             std::partial_sort(res.begin(), sort_end, res.end(),
                               [this](size_t a, size_t b) { return data[a] < 
data[b]; });
     }
+
+    void ALWAYS_INLINE decimalv2_do_crc(size_t i, uint64_t& hash) const {
+        const DecimalV2Value& dec_val = (const DecimalV2Value&)data[i];
+        int64_t int_val = dec_val.int_value();
+        int32_t frac_val = dec_val.frac_value();
+        hash = HashUtil::zlib_crc_hash(&int_val, sizeof(int_val), hash);
+        hash = HashUtil::zlib_crc_hash(&frac_val, sizeof(frac_val), hash);
+    };
 };
 
 template <typename>
diff --git a/be/src/vec/columns/column_nullable.cpp 
b/be/src/vec/columns/column_nullable.cpp
index 0e90251343..2b8632f8ec 100644
--- a/be/src/vec/columns/column_nullable.cpp
+++ b/be/src/vec/columns/column_nullable.cpp
@@ -114,21 +114,35 @@ void ColumnNullable::update_hashes_with_value(uint64_t* 
__restrict hashes,
     }
 }
 
-void ColumnNullable::update_xxHash_with_value(size_t n, uint64_t& hash) const {
-    auto* __restrict real_null_data = assert_cast<const 
ColumnUInt8&>(*null_map).get_data().data();
-    if (real_null_data[n] != 0) {
-        hash = HashUtil::xxHash64NullWithSeed(hash);
+void ColumnNullable::update_xxHash_with_value(size_t start, size_t end, 
uint64_t& hash,
+                                              const uint8_t* __restrict 
null_data) const {
+    if (!has_null()) {
+        nested_column->update_xxHash_with_value(start, end, hash, nullptr);
     } else {
-        nested_column->update_xxHash_with_value(n, hash);
+        auto* __restrict real_null_data =
+                assert_cast<const ColumnUInt8&>(*null_map).get_data().data();
+        for (int i = start; i < end; ++i) {
+            if (real_null_data[i] != 0) {
+                hash = HashUtil::xxHash64NullWithSeed(hash);
+            }
+        }
+        nested_column->update_xxHash_with_value(start, end, hash, 
real_null_data);
     }
 }
 
-void ColumnNullable::update_crc_with_value(size_t n, uint64_t& crc) const {
-    auto* __restrict real_null_data = assert_cast<const 
ColumnUInt8&>(*null_map).get_data().data();
-    if (real_null_data[n] != 0) {
-        crc = HashUtil::zlib_crc_hash_null(crc);
+void ColumnNullable::update_crc_with_value(size_t start, size_t end, uint64_t& 
hash,
+                                           const uint8_t* __restrict 
null_data) const {
+    if (!has_null()) {
+        nested_column->update_crc_with_value(start, end, hash, nullptr);
     } else {
-        nested_column->update_xxHash_with_value(n, crc);
+        auto* __restrict real_null_data =
+                assert_cast<const ColumnUInt8&>(*null_map).get_data().data();
+        for (int i = start; i < end; ++i) {
+            if (real_null_data[i] != 0) {
+                hash = HashUtil::zlib_crc_hash_null(hash);
+            }
+        }
+        nested_column->update_crc_with_value(start, end, hash, real_null_data);
     }
 }
 
diff --git a/be/src/vec/columns/column_nullable.h 
b/be/src/vec/columns/column_nullable.h
index 9f29965a32..1285f0c5e8 100644
--- a/be/src/vec/columns/column_nullable.h
+++ b/be/src/vec/columns/column_nullable.h
@@ -186,8 +186,10 @@ public:
                                 const uint8_t* __restrict null_data) const 
override;
     void update_hashes_with_value(uint64_t* __restrict hashes,
                                   const uint8_t* __restrict null_data) const 
override;
-    void update_xxHash_with_value(size_t n, uint64_t& hash) const override;
-    void update_crc_with_value(size_t n, uint64_t& crc) const override;
+    void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                  const uint8_t* __restrict null_data) const 
override;
+    void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                               const uint8_t* __restrict null_data) const 
override;
     void get_extremes(Field& min, Field& max) const override;
 
     MutableColumns scatter(ColumnIndex num_columns, const Selector& selector) 
const override {
diff --git a/be/src/vec/columns/column_string.h 
b/be/src/vec/columns/column_string.h
index b4caa7b985..10b3c1a36a 100644
--- a/be/src/vec/columns/column_string.h
+++ b/be/src/vec/columns/column_string.h
@@ -342,16 +342,42 @@ public:
         SIP_HASHES_FUNCTION_COLUMN_IMPL();
     }
 
-    void update_xxHash_with_value(size_t n, uint64_t& hash) const override {
-        size_t string_size = size_at(n);
-        size_t offset = offset_at(n);
-        hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&chars[offset]),
-                                          string_size, hash);
+    void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                  const uint8_t* __restrict null_data) const 
override {
+        if (null_data) {
+            for (size_t i = start; i < end; ++i) {
+                if (null_data[i] == 0) {
+                    size_t string_size = size_at(i);
+                    size_t offset = offset_at(i);
+                    hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&chars[offset]),
+                                                      string_size, hash);
+                }
+            }
+        } else {
+            for (size_t i = start; i < end; ++i) {
+                size_t string_size = size_at(i);
+                size_t offset = offset_at(i);
+                hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&chars[offset]),
+                                                  string_size, hash);
+            }
+        }
     }
 
-    void update_crc_with_value(size_t n, uint64_t& crc) const override {
-        auto data_ref = get_data_at(n);
-        crc = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size, crc);
+    void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                               const uint8_t* __restrict null_data) const 
override {
+        if (null_data) {
+            for (size_t i = start; i < end; ++i) {
+                if (null_data[i] == 0) {
+                    auto data_ref = get_data_at(i);
+                    hash = HashUtil::zlib_crc_hash(data_ref.data, 
data_ref.size, hash);
+                }
+            }
+        } else {
+            for (size_t i = start; i < end; ++i) {
+                auto data_ref = get_data_at(i);
+                hash = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size, 
hash);
+            }
+        }
     }
 
     void update_crcs_with_value(std::vector<uint64_t>& hashes, PrimitiveType 
type,
diff --git a/be/src/vec/columns/column_vector.h 
b/be/src/vec/columns/column_vector.h
index 1c18e810b0..899be2b2fc 100644
--- a/be/src/vec/columns/column_vector.h
+++ b/be/src/vec/columns/column_vector.h
@@ -243,21 +243,49 @@ public:
                                      const uint8_t* null_map,
                                      size_t max_row_byte_size) const override;
 
-    void update_xxHash_with_value(size_t n, uint64_t& hash) const override {
-        hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&data[n]), sizeof(T), hash);
+    void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash,
+                                  const uint8_t* __restrict null_data) const 
override {
+        if (null_data) {
+            for (size_t i = start; i < end; i++) {
+                if (null_data[i] == 0) {
+                    hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&data[i]),
+                                                      sizeof(T), hash);
+                }
+            }
+        } else {
+            for (size_t i = start; i < end; i++) {
+                hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const 
char*>(&data[i]),
+                                                  sizeof(T), hash);
+            }
+        }
     }
 
-    void update_crc_with_value(size_t n, uint64_t& crc) const override {
+    void ALWAYS_INLINE update_crc_with_value_without_null(size_t idx, 
uint64_t& hash) const {
         if constexpr (!std::is_same_v<T, Int64>) {
-            crc = HashUtil::zlib_crc_hash(&data[n], sizeof(T), crc);
+            hash = HashUtil::zlib_crc_hash(&data[idx], sizeof(T), hash);
         } else {
             if (this->is_date_type() || this->is_datetime_type()) {
                 char buf[64];
-                const VecDateTimeValue& date_val = (const 
VecDateTimeValue&)data[n];
+                const VecDateTimeValue& date_val = (const 
VecDateTimeValue&)data[idx];
                 auto len = date_val.to_buffer(buf);
-                crc = HashUtil::zlib_crc_hash(buf, len, crc);
+                hash = HashUtil::zlib_crc_hash(buf, len, hash);
             } else {
-                crc = HashUtil::zlib_crc_hash(&data[n], sizeof(T), crc);
+                hash = HashUtil::zlib_crc_hash(&data[idx], sizeof(T), hash);
+            }
+        }
+    }
+
+    void update_crc_with_value(size_t start, size_t end, uint64_t& hash,
+                               const uint8_t* __restrict null_data) const 
override {
+        if (null_data) {
+            for (size_t i = start; i < end; i++) {
+                if (null_data[i] == 0) {
+                    update_crc_with_value_without_null(i, hash);
+                }
+            }
+        } else {
+            for (size_t i = start; i < end; i++) {
+                update_crc_with_value_without_null(i, hash);
             }
         }
     }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to