This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch branch-1.2-lts in repository https://gitbox.apache.org/repos/asf/doris.git
commit bb91922f9bf5df006aa14c8c75492ce8d17638bb Author: amory <wangqian...@selectdb.com> AuthorDate: Thu Jul 13 11:53:54 2023 +0800 [Improve](array)improve array hash function (#21779) cherry-pick from #21699 --- be/src/util/timezone_utils.cpp | 2 + be/src/vec/columns/column.h | 8 +++- be/src/vec/columns/column_array.cpp | 73 ++++++++++++++++++++++++++-------- be/src/vec/columns/column_array.h | 6 ++- be/src/vec/columns/column_const.h | 8 ++-- be/src/vec/columns/column_decimal.cpp | 50 +++++++++++++++++------ be/src/vec/columns/column_decimal.h | 14 ++++++- be/src/vec/columns/column_nullable.cpp | 34 +++++++++++----- be/src/vec/columns/column_nullable.h | 6 ++- be/src/vec/columns/column_string.h | 42 +++++++++++++++---- be/src/vec/columns/column_vector.h | 42 +++++++++++++++---- 11 files changed, 219 insertions(+), 66 deletions(-) diff --git a/be/src/util/timezone_utils.cpp b/be/src/util/timezone_utils.cpp index c22c12b560..b746634e03 100644 --- a/be/src/util/timezone_utils.cpp +++ b/be/src/util/timezone_utils.cpp @@ -17,7 +17,9 @@ // #include "util/timezone_utils.h" + #include <cctz/time_zone.h> + #include <boost/algorithm/string.hpp> #include <filesystem> #include <string> diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h index 89dd4fbef4..c82944b44d 100644 --- a/be/src/vec/columns/column.h +++ b/be/src/vec/columns/column.h @@ -360,7 +360,9 @@ public: LOG(FATAL) << get_name() << "update_hashes_with_value xxhash not supported"; }; - virtual void update_xxHash_with_value(size_t n, uint64_t& hash) const { + // use range for one hash value to avoid virtual function call in loop + virtual void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const { LOG(FATAL) << get_name() << " update_hash_with_value xxhash not supported"; } @@ -372,7 +374,9 @@ public: LOG(FATAL) << get_name() << "update_crcs_with_value not supported"; }; - virtual void update_crc_with_value(size_t n, uint64_t& hash) const { + // use range for one hash value to avoid virtual function call in loop + virtual void update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const { LOG(FATAL) << get_name() << " update_crc_with_value not supported"; } diff --git a/be/src/vec/columns/column_array.cpp b/be/src/vec/columns/column_array.cpp index 999991fde5..6a2f40792b 100644 --- a/be/src/vec/columns/column_array.cpp +++ b/be/src/vec/columns/column_array.cpp @@ -229,25 +229,64 @@ void ColumnArray::update_hashes_with_value(std::vector<SipHash>& hashes, } // for every array row calculate xxHash -void ColumnArray::update_xxHash_with_value(size_t n, uint64_t& hash) const { - size_t elem_size = size_at(n); - size_t offset = offset_at(n); - hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&elem_size), sizeof(elem_size), - hash); - for (auto i = 0; i < elem_size; ++i) { - get_data().update_xxHash_with_value(offset + i, hash); +void ColumnArray::update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const { + auto& offsets_column = get_offsets(); + if (null_data) { + for (size_t i = start; i < end; ++i) { + if (null_data[i] == 0) { + size_t elem_size = offsets_column[i] - offsets_column[i - 1]; + if (elem_size == 0) { + hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&elem_size), + sizeof(elem_size), hash); + } else { + get_data().update_crc_with_value(offsets_column[i - 1], offsets_column[i], hash, + nullptr); + } + } + } + } else { + for (size_t i = start; i < end; ++i) { + size_t elem_size = offsets_column[i] - offsets_column[i - 1]; + if (elem_size == 0) { + hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&elem_size), + sizeof(elem_size), hash); + } else { + get_data().update_crc_with_value(offsets_column[i - 1], offsets_column[i], hash, + nullptr); + } + } } } // for every array row calculate crcHash -void ColumnArray::update_crc_with_value(size_t n, uint64_t& crc) const { - size_t elem_size = size_at(n); - size_t offset = offset_at(n); - - crc = HashUtil::zlib_crc_hash(reinterpret_cast<const char*>(&elem_size), sizeof(elem_size), - crc); - for (auto i = 0; i < elem_size; ++i) { - get_data().update_crc_with_value(offset + i, crc); +void ColumnArray::update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const { + auto& offsets_column = get_offsets(); + if (null_data) { + for (size_t i = start; i < end; ++i) { + if (null_data[i] == 0) { + size_t elem_size = offsets_column[i] - offsets_column[i - 1]; + if (elem_size == 0) { + hash = HashUtil::zlib_crc_hash(reinterpret_cast<const char*>(&elem_size), + sizeof(elem_size), hash); + } else { + get_data().update_crc_with_value(offsets_column[i - 1], offsets_column[i], hash, + nullptr); + } + } + } + } else { + for (size_t i = start; i < end; ++i) { + size_t elem_size = offsets_column[i] - offsets_column[i - 1]; + if (elem_size == 0) { + hash = HashUtil::zlib_crc_hash(reinterpret_cast<const char*>(&elem_size), + sizeof(elem_size), hash); + } else { + get_data().update_crc_with_value(offsets_column[i - 1], offsets_column[i], hash, + nullptr); + } + } } } @@ -257,12 +296,12 @@ void ColumnArray::update_hashes_with_value(uint64_t* __restrict hashes, if (null_data) { for (size_t i = 0; i < s; ++i) { if (null_data[i] == 0) { - update_xxHash_with_value(i, hashes[i]); + update_xxHash_with_value(i, i + 1, hashes[i], nullptr); } } } else { for (size_t i = 0; i < s; ++i) { - update_xxHash_with_value(i, hashes[i]); + update_xxHash_with_value(i, i + 1, hashes[i], nullptr); } } } diff --git a/be/src/vec/columns/column_array.h b/be/src/vec/columns/column_array.h index 7239de3347..2725814fad 100644 --- a/be/src/vec/columns/column_array.h +++ b/be/src/vec/columns/column_array.h @@ -97,8 +97,10 @@ public: void insert_data(const char* pos, size_t length) override; StringRef serialize_value_into_arena(size_t n, Arena& arena, char const*& begin) const override; const char* deserialize_and_insert_from_arena(const char* pos) override; - void update_xxHash_with_value(size_t n, uint64_t& hash) const override; - void update_crc_with_value(size_t n, uint64_t& crc) const override; + void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override; + void update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override; void update_hashes_with_value(std::vector<SipHash>& hashes, const uint8_t* __restrict null_data) const override; diff --git a/be/src/vec/columns/column_const.h b/be/src/vec/columns/column_const.h index 13640a0a9a..05ffe9ac6f 100644 --- a/be/src/vec/columns/column_const.h +++ b/be/src/vec/columns/column_const.h @@ -135,7 +135,8 @@ public: void update_hashes_with_value(uint64_t* __restrict hashes, const uint8_t* __restrict null_data) const override; - void update_xxHash_with_value(size_t n, uint64_t& hash) const override { + void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override { auto real_data = data->get_data_at(0); if (real_data.data == nullptr) { hash = HashUtil::xxHash64NullWithSeed(hash); @@ -144,8 +145,9 @@ public: } } - void update_crc_with_value(size_t n, uint64_t& crc) const override { - get_data_column_ptr()->update_crc_with_value(n, crc); + void update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override { + get_data_column_ptr()->update_crc_with_value(start, end, hash, nullptr); } ColumnPtr filter(const Filter& filt, ssize_t result_size_hint) const override; diff --git a/be/src/vec/columns/column_decimal.cpp b/be/src/vec/columns/column_decimal.cpp index 78e7d055cb..4faa3fa490 100644 --- a/be/src/vec/columns/column_decimal.cpp +++ b/be/src/vec/columns/column_decimal.cpp @@ -128,16 +128,27 @@ void ColumnDecimal<T>::update_hashes_with_value(std::vector<SipHash>& hashes, } template <typename T> -void ColumnDecimal<T>::update_crc_with_value(size_t n, uint64_t& crc) const { - if constexpr (!IsDecimalV2<T>) { - crc = HashUtil::zlib_crc_hash(&data[n], sizeof(T), crc); +void ColumnDecimal<T>::update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const { + if (null_data == nullptr) { + for (size_t i = start; i < end; i++) { + if constexpr (!IsDecimalV2<T>) { + hash = HashUtil::zlib_crc_hash(&data[i], sizeof(T), hash); + } else { + decimalv2_do_crc(i, hash); + } + } } else { - const DecimalV2Value& dec_val = (const DecimalV2Value&)data[n]; - int64_t int_val = dec_val.int_value(); - int32_t frac_val = dec_val.frac_value(); - crc = HashUtil::zlib_crc_hash(&int_val, sizeof(int_val), crc); - crc = HashUtil::zlib_crc_hash(&frac_val, sizeof(frac_val), crc); - }; + for (size_t i = start; i < end; i++) { + if (null_data[i] == 0) { + if constexpr (!IsDecimalV2<T>) { + hash = HashUtil::zlib_crc_hash(&data[i], sizeof(T), hash); + } else { + decimalv2_do_crc(i, hash); + } + } + } + } } template <typename T> @@ -150,19 +161,32 @@ void ColumnDecimal<T>::update_crcs_with_value(std::vector<uint64_t>& hashes, Pri } else { if (null_data == nullptr) { for (size_t i = 0; i < s; i++) { - update_crc_with_value(i, hashes[i]); + decimalv2_do_crc(i, hashes[i]); } } else { for (size_t i = 0; i < s; i++) { - if (null_data[i] == 0) update_crc_with_value(i, hashes[i]); + if (null_data[i] == 0) decimalv2_do_crc(i, hashes[i]); } } } } template <typename T> -void ColumnDecimal<T>::update_xxHash_with_value(size_t n, uint64_t& hash) const { - hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&data[n]), sizeof(T), hash); +void ColumnDecimal<T>::update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const { + if (null_data) { + for (size_t i = start; i < end; i++) { + if (null_data[i] == 0) { + hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&data[i]), + sizeof(T), hash); + } + } + } else { + for (size_t i = start; i < end; i++) { + hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&data[i]), sizeof(T), + hash); + } + } } template <typename T> diff --git a/be/src/vec/columns/column_decimal.h b/be/src/vec/columns/column_decimal.h index 8968ff6017..ac3bd7456f 100644 --- a/be/src/vec/columns/column_decimal.h +++ b/be/src/vec/columns/column_decimal.h @@ -159,8 +159,10 @@ public: const uint8_t* __restrict null_data) const override; void update_crcs_with_value(std::vector<uint64_t>& hashes, PrimitiveType type, const uint8_t* __restrict null_data) const override; - void update_xxHash_with_value(size_t n, uint64_t& hash) const override; - void update_crc_with_value(size_t n, uint64_t& crc) const override; + void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override; + void update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override; int compare_at(size_t n, size_t m, const IColumn& rhs_, int nan_direction_hint) const override; void get_permutation(bool reverse, size_t limit, int nan_direction_hint, @@ -265,6 +267,14 @@ protected: std::partial_sort(res.begin(), sort_end, res.end(), [this](size_t a, size_t b) { return data[a] < data[b]; }); } + + void ALWAYS_INLINE decimalv2_do_crc(size_t i, uint64_t& hash) const { + const DecimalV2Value& dec_val = (const DecimalV2Value&)data[i]; + int64_t int_val = dec_val.int_value(); + int32_t frac_val = dec_val.frac_value(); + hash = HashUtil::zlib_crc_hash(&int_val, sizeof(int_val), hash); + hash = HashUtil::zlib_crc_hash(&frac_val, sizeof(frac_val), hash); + }; }; template <typename> diff --git a/be/src/vec/columns/column_nullable.cpp b/be/src/vec/columns/column_nullable.cpp index 0e90251343..2b8632f8ec 100644 --- a/be/src/vec/columns/column_nullable.cpp +++ b/be/src/vec/columns/column_nullable.cpp @@ -114,21 +114,35 @@ void ColumnNullable::update_hashes_with_value(uint64_t* __restrict hashes, } } -void ColumnNullable::update_xxHash_with_value(size_t n, uint64_t& hash) const { - auto* __restrict real_null_data = assert_cast<const ColumnUInt8&>(*null_map).get_data().data(); - if (real_null_data[n] != 0) { - hash = HashUtil::xxHash64NullWithSeed(hash); +void ColumnNullable::update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const { + if (!has_null()) { + nested_column->update_xxHash_with_value(start, end, hash, nullptr); } else { - nested_column->update_xxHash_with_value(n, hash); + auto* __restrict real_null_data = + assert_cast<const ColumnUInt8&>(*null_map).get_data().data(); + for (int i = start; i < end; ++i) { + if (real_null_data[i] != 0) { + hash = HashUtil::xxHash64NullWithSeed(hash); + } + } + nested_column->update_xxHash_with_value(start, end, hash, real_null_data); } } -void ColumnNullable::update_crc_with_value(size_t n, uint64_t& crc) const { - auto* __restrict real_null_data = assert_cast<const ColumnUInt8&>(*null_map).get_data().data(); - if (real_null_data[n] != 0) { - crc = HashUtil::zlib_crc_hash_null(crc); +void ColumnNullable::update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const { + if (!has_null()) { + nested_column->update_crc_with_value(start, end, hash, nullptr); } else { - nested_column->update_xxHash_with_value(n, crc); + auto* __restrict real_null_data = + assert_cast<const ColumnUInt8&>(*null_map).get_data().data(); + for (int i = start; i < end; ++i) { + if (real_null_data[i] != 0) { + hash = HashUtil::zlib_crc_hash_null(hash); + } + } + nested_column->update_crc_with_value(start, end, hash, real_null_data); } } diff --git a/be/src/vec/columns/column_nullable.h b/be/src/vec/columns/column_nullable.h index 9f29965a32..1285f0c5e8 100644 --- a/be/src/vec/columns/column_nullable.h +++ b/be/src/vec/columns/column_nullable.h @@ -186,8 +186,10 @@ public: const uint8_t* __restrict null_data) const override; void update_hashes_with_value(uint64_t* __restrict hashes, const uint8_t* __restrict null_data) const override; - void update_xxHash_with_value(size_t n, uint64_t& hash) const override; - void update_crc_with_value(size_t n, uint64_t& crc) const override; + void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override; + void update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override; void get_extremes(Field& min, Field& max) const override; MutableColumns scatter(ColumnIndex num_columns, const Selector& selector) const override { diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index b4caa7b985..10b3c1a36a 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -342,16 +342,42 @@ public: SIP_HASHES_FUNCTION_COLUMN_IMPL(); } - void update_xxHash_with_value(size_t n, uint64_t& hash) const override { - size_t string_size = size_at(n); - size_t offset = offset_at(n); - hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&chars[offset]), - string_size, hash); + void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override { + if (null_data) { + for (size_t i = start; i < end; ++i) { + if (null_data[i] == 0) { + size_t string_size = size_at(i); + size_t offset = offset_at(i); + hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&chars[offset]), + string_size, hash); + } + } + } else { + for (size_t i = start; i < end; ++i) { + size_t string_size = size_at(i); + size_t offset = offset_at(i); + hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&chars[offset]), + string_size, hash); + } + } } - void update_crc_with_value(size_t n, uint64_t& crc) const override { - auto data_ref = get_data_at(n); - crc = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size, crc); + void update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override { + if (null_data) { + for (size_t i = start; i < end; ++i) { + if (null_data[i] == 0) { + auto data_ref = get_data_at(i); + hash = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size, hash); + } + } + } else { + for (size_t i = start; i < end; ++i) { + auto data_ref = get_data_at(i); + hash = HashUtil::zlib_crc_hash(data_ref.data, data_ref.size, hash); + } + } } void update_crcs_with_value(std::vector<uint64_t>& hashes, PrimitiveType type, diff --git a/be/src/vec/columns/column_vector.h b/be/src/vec/columns/column_vector.h index 1c18e810b0..899be2b2fc 100644 --- a/be/src/vec/columns/column_vector.h +++ b/be/src/vec/columns/column_vector.h @@ -243,21 +243,49 @@ public: const uint8_t* null_map, size_t max_row_byte_size) const override; - void update_xxHash_with_value(size_t n, uint64_t& hash) const override { - hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&data[n]), sizeof(T), hash); + void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override { + if (null_data) { + for (size_t i = start; i < end; i++) { + if (null_data[i] == 0) { + hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&data[i]), + sizeof(T), hash); + } + } + } else { + for (size_t i = start; i < end; i++) { + hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&data[i]), + sizeof(T), hash); + } + } } - void update_crc_with_value(size_t n, uint64_t& crc) const override { + void ALWAYS_INLINE update_crc_with_value_without_null(size_t idx, uint64_t& hash) const { if constexpr (!std::is_same_v<T, Int64>) { - crc = HashUtil::zlib_crc_hash(&data[n], sizeof(T), crc); + hash = HashUtil::zlib_crc_hash(&data[idx], sizeof(T), hash); } else { if (this->is_date_type() || this->is_datetime_type()) { char buf[64]; - const VecDateTimeValue& date_val = (const VecDateTimeValue&)data[n]; + const VecDateTimeValue& date_val = (const VecDateTimeValue&)data[idx]; auto len = date_val.to_buffer(buf); - crc = HashUtil::zlib_crc_hash(buf, len, crc); + hash = HashUtil::zlib_crc_hash(buf, len, hash); } else { - crc = HashUtil::zlib_crc_hash(&data[n], sizeof(T), crc); + hash = HashUtil::zlib_crc_hash(&data[idx], sizeof(T), hash); + } + } + } + + void update_crc_with_value(size_t start, size_t end, uint64_t& hash, + const uint8_t* __restrict null_data) const override { + if (null_data) { + for (size_t i = start; i < end; i++) { + if (null_data[i] == 0) { + update_crc_with_value_without_null(i, hash); + } + } + } else { + for (size_t i = start; i < end; i++) { + update_crc_with_value_without_null(i, hash); } } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org