This is an automated email from the ASF dual-hosted git repository.
zclll pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new f4fdf54c871 [Improvement](hash) remove nullable when
_serialize_null_into_key is false and add int72 (#58316)
f4fdf54c871 is described below
commit f4fdf54c8716e10c034cc4f7d9bf8ac19f31905e
Author: Pxl <[email protected]>
AuthorDate: Thu Nov 27 18:44:27 2025 +0800
[Improvement](hash) remove nullable when _serialize_null_into_key is false
and add int72 (#58316)
Performance (TPC-DS): q97 8.8s -> 8.3s, q2 9s -> 8.7s.
This pull request adds support for a new fixed-width hash key type,
`UInt72`, across the codebase. This enables more efficient handling of
hash keys that are 72 bits wide in various data processing components,
including aggregation, joins, sets, partitioning, and dictionary hash
maps. The changes involve updating type variants, hash key type
detection, and hash functions to accommodate the new type.
### Hash Key Type Support
* Introduced the new packed `UInt72` struct (defined in `uint128.h`) and
added a matching hash key type, `HashKeyType::fixed72`, to the enumeration
and the type detection logic in `hash_key_type.h`, allowing the system to
recognize and use 72-bit hash keys.
[[1]](diffhunk://#diff-0dea38f1f0f0f99ad74d97d77e100557d743ad599b3f5f75c825baf9c13ecdbfR64-R72)
[[2]](diffhunk://#diff-4f1fb8a89cd0e13a719c3427b1ae7581b42cb7325755a3ceac4c44bdc64bd144R40)
[[3]](diffhunk://#diff-4f1fb8a89cd0e13a719c3427b1ae7581b42cb7325755a3ceac4c44bdc64bd144R63-R64)
### Variant and Method Updates
* Added `UInt72`-based variants to all major hash table, aggregation,
distinct, set, partition, and dictionary hash map method variant
definitions and their corresponding initialization logic, ensuring that
all relevant components can utilize the new key type.
[[1]](diffhunk://#diff-50d8f62236d4e1f81d52e945edee5377b7b22d52e04128eea2c8b7f679b37254R84-R87)
[[2]](diffhunk://#diff-50d8f62236d4e1f81d52e945edee5377b7b22d52e04128eea2c8b7f679b37254R141-R144)
[[3]](diffhunk://#diff-62ad0a1cb1b62de5393935298725cfd2e9766215bdd7653d84cd1fd5e7f59fe3R108-R111)
[[4]](diffhunk://#diff-62ad0a1cb1b62de5393935298725cfd2e9766215bdd7653d84cd1fd5e7f59fe3R160-R163)
[[5]](diffhunk://#diff-66cf4052118abf5abbef2e0d9193df3c35a46f70db35853c5884d56d4118a963L69-R70)
[[6]](diffhunk://#diff-66cf4052118abf5abbef2e0d9193df3c35a46f70db35853c5884d56d4118a963R107-R110)
[[7]](diffhunk://#diff-c557434b23ebbb39ef2851b7926d61af5be4bf8f56b83a92b98f9a574f805a90R143-R146)
[[8]](diffhunk://#diff-c557434b23ebbb39ef2851b7926d61af5be4bf8f56b83a92b98f9a574f805a90R203-R206)
[[9]](diffhunk://#diff-8b095a1e764b3856129d9fd06fb9122a7e9eb16bc5c293d8dcaa4ff841a587edR70)
[[10]](diffhunk://#diff-8b095a1e764b3856129d9fd06fb9122a7e9eb16bc5c293d8dcaa4ff841a587edR109-R112)
[[11]](diffhunk://#diff-60243aa7720001b0983bd282c74f77c8a8542a9a6fed08d80061c4f25847b650L50-R52)
[[12]](diffhunk://#diff-60243aa7720001b0983bd282c74f77c8a8542a9a6fed08d80061c4f25847b650R87-R89)
### Hash Function Implementation
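* Implemented a specialized CRC32 hash function for `UInt72` in
`hash.h`, ensuring proper hashing behavior for the new type. The existing
`UInt136` specialization also drops its `#if defined(__SSE4_2__) ||
defined(__aarch64__)` guard together with the `Hash128to64` fallback branch.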
### Code Generation and Instantiation
* Updated template instantiations and code generation macros to include
`FixedKeyHashTableContext<vectorized::UInt72>`, ensuring that join and
hash table probing logic supports the new key type.
### Minor Logic Adjustment
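* Changed build key type handling in `hashjoin_build_sink.cpp`: when
`_serialize_null_into_key[i]` is true the key type is made nullable;
otherwise nullability is removed from the key type, because null values
are represented by the null map in that case.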
---
be/src/pipeline/common/agg_utils.h | 9 +++++++--
be/src/pipeline/common/distinct_agg_utils.h | 9 +++++++--
be/src/pipeline/common/join_utils.h | 7 ++++++-
be/src/pipeline/common/partition_sort_utils.h | 7 ++++++-
be/src/pipeline/common/set_utils.h | 5 +++++
be/src/pipeline/exec/hashjoin_build_sink.cpp | 6 ++++--
.../exec/join/process_hash_table_probe_impl.h | 1 +
be/src/vec/common/hash_table/hash.h | 21 ++++++++++++++-------
be/src/vec/common/hash_table/hash_key_type.h | 3 +++
be/src/vec/common/uint128.h | 9 +++++++++
be/src/vec/functions/complex_dict_hash_map.h | 8 ++++++--
be/test/pipeline/common/distinct_agg_utils_test.cpp | 7 ++++++-
be/test/pipeline/common/set_utils_test.cpp | 6 +++++-
13 files changed, 79 insertions(+), 19 deletions(-)
diff --git a/be/src/pipeline/common/agg_utils.h
b/be/src/pipeline/common/agg_utils.h
index 146649f96b1..f0cf0a17f2a 100644
--- a/be/src/pipeline/common/agg_utils.h
+++ b/be/src/pipeline/common/agg_utils.h
@@ -81,9 +81,10 @@ using AggregatedMethodVariants = std::variant<
vectorized::MethodSingleNullableColumn<
vectorized::MethodStringNoCache<AggregatedDataWithNullableShortStringKey>>,
vectorized::MethodKeysFixed<AggData<vectorized::UInt64>>,
+ vectorized::MethodKeysFixed<AggData<vectorized::UInt72>>,
vectorized::MethodKeysFixed<AggData<vectorized::UInt128>>,
- vectorized::MethodKeysFixed<AggData<vectorized::UInt256>>,
- vectorized::MethodKeysFixed<AggData<vectorized::UInt136>>>;
+ vectorized::MethodKeysFixed<AggData<vectorized::UInt136>>,
+ vectorized::MethodKeysFixed<AggData<vectorized::UInt256>>>;
struct AggregatedDataVariants
: public DataVariants<AggregatedMethodVariants,
vectorized::MethodSingleNullableColumn,
@@ -137,6 +138,10 @@ struct AggregatedDataVariants
method_variant.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt64>>>(
get_key_sizes(data_types));
break;
+ case HashKeyType::fixed72:
+
method_variant.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt72>>>(
+ get_key_sizes(data_types));
+ break;
case HashKeyType::fixed128:
method_variant.emplace<vectorized::MethodKeysFixed<AggData<vectorized::UInt128>>>(
get_key_sizes(data_types));
diff --git a/be/src/pipeline/common/distinct_agg_utils.h
b/be/src/pipeline/common/distinct_agg_utils.h
index 592132eba6b..3c95a2793fc 100644
--- a/be/src/pipeline/common/distinct_agg_utils.h
+++ b/be/src/pipeline/common/distinct_agg_utils.h
@@ -105,9 +105,10 @@ using DistinctMethodVariants = std::variant<
vectorized::MethodSingleNullableColumn<vectorized::MethodStringNoCache<
vectorized::DataWithNullKey<DistinctDataWithShortStringKey>>>,
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt64>>,
+ vectorized::MethodKeysFixed<DistinctData<vectorized::UInt72>>,
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt128>>,
- vectorized::MethodKeysFixed<DistinctData<vectorized::UInt256>>,
- vectorized::MethodKeysFixed<DistinctData<vectorized::UInt136>>>;
+ vectorized::MethodKeysFixed<DistinctData<vectorized::UInt136>>,
+ vectorized::MethodKeysFixed<DistinctData<vectorized::UInt256>>>;
struct DistinctDataVariants
: public DataVariants<DistinctMethodVariants,
vectorized::MethodSingleNullableColumn,
@@ -156,6 +157,10 @@ struct DistinctDataVariants
method_variant.emplace<vectorized::MethodKeysFixed<DistinctData<vectorized::UInt64>>>(
get_key_sizes(data_types));
break;
+ case HashKeyType::fixed72:
+
method_variant.emplace<vectorized::MethodKeysFixed<DistinctData<vectorized::UInt72>>>(
+ get_key_sizes(data_types));
+ break;
case HashKeyType::fixed128:
method_variant.emplace<vectorized::MethodKeysFixed<DistinctData<vectorized::UInt128>>>(
get_key_sizes(data_types));
diff --git a/be/src/pipeline/common/join_utils.h
b/be/src/pipeline/common/join_utils.h
index c10b748f82f..08708f037ba 100644
--- a/be/src/pipeline/common/join_utils.h
+++ b/be/src/pipeline/common/join_utils.h
@@ -66,7 +66,8 @@ using HashTableVariants = std::variant<
DirectPrimaryTypeHashTableContext<vectorized::UInt32>,
DirectPrimaryTypeHashTableContext<vectorized::UInt64>,
DirectPrimaryTypeHashTableContext<vectorized::UInt128>,
- FixedKeyHashTableContext<vectorized::UInt64>,
FixedKeyHashTableContext<vectorized::UInt128>,
+ FixedKeyHashTableContext<vectorized::UInt64>,
FixedKeyHashTableContext<vectorized::UInt72>,
+ FixedKeyHashTableContext<vectorized::UInt128>,
FixedKeyHashTableContext<vectorized::UInt136>,
FixedKeyHashTableContext<vectorized::UInt256>, MethodOneString>;
@@ -103,6 +104,10 @@ struct JoinDataVariants {
method_variant.emplace<FixedKeyHashTableContext<vectorized::UInt64>>(
get_key_sizes(data_types));
break;
+ case HashKeyType::fixed72:
+
method_variant.emplace<FixedKeyHashTableContext<vectorized::UInt72>>(
+ get_key_sizes(data_types));
+ break;
case HashKeyType::fixed128:
method_variant.emplace<FixedKeyHashTableContext<vectorized::UInt128>>(
get_key_sizes(data_types));
diff --git a/be/src/pipeline/common/partition_sort_utils.h
b/be/src/pipeline/common/partition_sort_utils.h
index 381dd3ec42b..ccd1b6a144d 100644
--- a/be/src/pipeline/common/partition_sort_utils.h
+++ b/be/src/pipeline/common/partition_sort_utils.h
@@ -140,9 +140,10 @@ using PartitionedMethodVariants = std::variant<
PartitionDataSingleNullable<vectorized::UInt128>,
PartitionDataSingleNullable<vectorized::UInt256>,
vectorized::MethodKeysFixed<PartitionData<vectorized::UInt64>>,
+ vectorized::MethodKeysFixed<PartitionData<vectorized::UInt72>>,
vectorized::MethodKeysFixed<PartitionData<vectorized::UInt128>>,
- vectorized::MethodKeysFixed<PartitionData<vectorized::UInt256>>,
vectorized::MethodKeysFixed<PartitionData<vectorized::UInt136>>,
+ vectorized::MethodKeysFixed<PartitionData<vectorized::UInt256>>,
vectorized::MethodStringNoCache<PartitionDataWithShortStringKey>,
vectorized::MethodSingleNullableColumn<vectorized::MethodStringNoCache<
vectorized::DataWithNullKey<PartitionDataWithShortStringKey>>>>;
@@ -199,6 +200,10 @@ struct PartitionedHashMapVariants
method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt64>>>(
get_key_sizes(data_types));
break;
+ case HashKeyType::fixed72:
+
method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt72>>>(
+ get_key_sizes(data_types));
+ break;
case HashKeyType::fixed128:
method_variant.emplace<vectorized::MethodKeysFixed<PartitionData<vectorized::UInt128>>>(
get_key_sizes(data_types));
diff --git a/be/src/pipeline/common/set_utils.h
b/be/src/pipeline/common/set_utils.h
index d9f70b1e457..665a7710fa8 100644
--- a/be/src/pipeline/common/set_utils.h
+++ b/be/src/pipeline/common/set_utils.h
@@ -67,6 +67,7 @@ using SetHashTableVariants =
SetPrimaryTypeHashTableContext<vectorized::UInt128>,
SetPrimaryTypeHashTableContext<vectorized::UInt256>,
SetFixedKeyHashTableContext<vectorized::UInt64>,
+ SetFixedKeyHashTableContext<vectorized::UInt72>,
SetFixedKeyHashTableContext<vectorized::UInt128>,
SetFixedKeyHashTableContext<vectorized::UInt256>,
SetFixedKeyHashTableContext<vectorized::UInt136>>;
@@ -105,6 +106,10 @@ struct SetDataVariants
method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt64>>(
get_key_sizes(data_types));
break;
+ case HashKeyType::fixed72:
+
method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt72>>(
+ get_key_sizes(data_types));
+ break;
case HashKeyType::fixed128:
method_variant.emplace<SetFixedKeyHashTableContext<vectorized::UInt128>>(
get_key_sizes(data_types));
diff --git a/be/src/pipeline/exec/hashjoin_build_sink.cpp
b/be/src/pipeline/exec/hashjoin_build_sink.cpp
index cbc22f7168d..778409087ee 100644
--- a/be/src/pipeline/exec/hashjoin_build_sink.cpp
+++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp
@@ -450,9 +450,11 @@ Status
HashJoinBuildSinkLocalState::_hash_table_init(RuntimeState* state,
/// For 'null safe equal' join,
/// the build key column maybe be converted to nullable from
non-nullable.
if (p._serialize_null_into_key[i]) {
- data_type = vectorized::make_nullable(data_type);
+ data_types.emplace_back(vectorized::make_nullable(data_type));
+ } else {
+ // in this case, we use nullmap to represent null value
+ data_types.emplace_back(vectorized::remove_nullable(data_type));
}
- data_types.emplace_back(std::move(data_type));
}
if (_build_expr_ctxs.size() == 1) {
p._should_keep_hash_key_column = true;
diff --git a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
index 1f1edec4335..6753052f61c 100644
--- a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
+++ b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h
@@ -803,6 +803,7 @@ struct ExtractType<T(U)> {
INSTANTIATION(JoinOpType,
(PrimaryTypeHashTableContext<vectorized::UInt128>)); \
INSTANTIATION(JoinOpType,
(PrimaryTypeHashTableContext<vectorized::UInt256>)); \
INSTANTIATION(JoinOpType, (FixedKeyHashTableContext<vectorized::UInt64>));
\
+ INSTANTIATION(JoinOpType, (FixedKeyHashTableContext<vectorized::UInt72>));
\
INSTANTIATION(JoinOpType,
(FixedKeyHashTableContext<vectorized::UInt128>)); \
INSTANTIATION(JoinOpType,
(FixedKeyHashTableContext<vectorized::UInt136>)); \
INSTANTIATION(JoinOpType,
(FixedKeyHashTableContext<vectorized::UInt256>)); \
diff --git a/be/src/vec/common/hash_table/hash.h
b/be/src/vec/common/hash_table/hash.h
index 4b0e20a01a0..6817d7e091d 100644
--- a/be/src/vec/common/hash_table/hash.h
+++ b/be/src/vec/common/hash_table/hash.h
@@ -189,20 +189,27 @@ struct HashCRC32<wide::Int256> {
}
};
+#include "common/compile_check_avoid_begin.h"
+
+template <>
+struct HashCRC32<doris::vectorized::UInt72> {
+ size_t operator()(const doris::vectorized::UInt72& x) const {
+ doris::vectorized::UInt64 crc = -1ULL;
+ crc = _mm_crc32_u8(crc, x.a);
+ crc = _mm_crc32_u64(crc, x.b);
+ return crc;
+ }
+};
+
template <>
struct HashCRC32<doris::vectorized::UInt136> {
size_t operator()(const doris::vectorized::UInt136& x) const {
-#if defined(__SSE4_2__) || defined(__aarch64__)
doris::vectorized::UInt64 crc = -1ULL;
-#include "common/compile_check_avoid_begin.h"
- //_mm_crc32_u8 does not provide a u64 interface, so there is an
unavoidable conversion from u64 to u32 here.
crc = _mm_crc32_u8(crc, x.a);
-#include "common/compile_check_avoid_end.h"
crc = _mm_crc32_u64(crc, x.b);
crc = _mm_crc32_u64(crc, x.c);
return crc;
-#else
- return Hash128to64({Hash128to64({x.a, x.b}), x.c});
-#endif
}
};
+
+#include "common/compile_check_avoid_end.h"
diff --git a/be/src/vec/common/hash_table/hash_key_type.h
b/be/src/vec/common/hash_table/hash_key_type.h
index 52d264371cb..7a04137324e 100644
--- a/be/src/vec/common/hash_table/hash_key_type.h
+++ b/be/src/vec/common/hash_table/hash_key_type.h
@@ -37,6 +37,7 @@ enum class HashKeyType {
int256_key,
string_key,
fixed64,
+ fixed72,
fixed128,
fixed136,
fixed256
@@ -59,6 +60,8 @@ inline HashKeyType get_hash_key_type_with_fixed(size_t size) {
using namespace vectorized;
if (size <= sizeof(UInt64)) {
return HashKeyType::fixed64;
+ } else if (size <= sizeof(UInt72)) {
+ return HashKeyType::fixed72;
} else if (size <= sizeof(UInt128)) {
return HashKeyType::fixed128;
} else if (size <= sizeof(UInt136)) {
diff --git a/be/src/vec/common/uint128.h b/be/src/vec/common/uint128.h
index 961a4958955..2a6bb70177d 100644
--- a/be/src/vec/common/uint128.h
+++ b/be/src/vec/common/uint128.h
@@ -61,6 +61,15 @@ struct UInt128TrivialHash {
using UInt256 = wide::UInt256;
+#pragma pack(1)
+struct UInt72 {
+ UInt8 a;
+ UInt64 b;
+
+ bool operator==(const UInt72& rhs) const { return a == rhs.a && b ==
rhs.b; }
+};
+#pragma pack()
+
#pragma pack(1)
struct UInt136 {
UInt8 a;
diff --git a/be/src/vec/functions/complex_dict_hash_map.h
b/be/src/vec/functions/complex_dict_hash_map.h
index de06ce3568e..d815cbb0904 100644
--- a/be/src/vec/functions/complex_dict_hash_map.h
+++ b/be/src/vec/functions/complex_dict_hash_map.h
@@ -47,8 +47,9 @@ using DictHashMapVariants = std::variant<
MethodOneNumber<UInt128, DictHashMap<UInt128>>,
MethodOneNumber<UInt256, DictHashMap<UInt256>>,
- MethodKeysFixed<DictHashMap<UInt64>>,
MethodKeysFixed<DictHashMap<UInt128>>,
- MethodKeysFixed<DictHashMap<UInt256>>,
MethodKeysFixed<DictHashMap<UInt136>>>;
+ MethodKeysFixed<DictHashMap<UInt64>>,
MethodKeysFixed<DictHashMap<UInt72>>,
+ MethodKeysFixed<DictHashMap<UInt128>>,
MethodKeysFixed<DictHashMap<UInt136>>,
+ MethodKeysFixed<DictHashMap<UInt256>>>;
struct DictionaryHashMapMethod
: public DataVariants<DictHashMapVariants,
vectorized::MethodSingleNullableColumn,
@@ -83,6 +84,9 @@ struct DictionaryHashMapMethod
case HashKeyType::fixed64:
method_variant.emplace<MethodKeysFixed<DictHashMap<UInt64>>>(get_key_sizes(data_types));
break;
+ case HashKeyType::fixed72:
+
method_variant.emplace<MethodKeysFixed<DictHashMap<UInt72>>>(get_key_sizes(data_types));
+ break;
case HashKeyType::fixed128:
method_variant.emplace<MethodKeysFixed<DictHashMap<UInt128>>>(
get_key_sizes(data_types));
diff --git a/be/test/pipeline/common/distinct_agg_utils_test.cpp
b/be/test/pipeline/common/distinct_agg_utils_test.cpp
index 8d0c5bcb98f..33a572455e2 100644
--- a/be/test/pipeline/common/distinct_agg_utils_test.cpp
+++ b/be/test/pipeline/common/distinct_agg_utils_test.cpp
@@ -159,6 +159,11 @@ TEST_F(DistinctAggUtilsTest,
TestDistinctDataVariantsInitFixedKeys) {
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt64>>>(
variants.method_variant));
break;
+ case HashKeyType::fixed72:
+ ASSERT_TRUE(std::holds_alternative<
+
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt72>>>(
+ variants.method_variant));
+ break;
case HashKeyType::fixed128:
ASSERT_TRUE(std::holds_alternative<
vectorized::MethodKeysFixed<DistinctData<vectorized::UInt128>>>(
@@ -190,7 +195,7 @@ TEST_F(DistinctAggUtilsTest,
TestDistinctDataVariantsInitFixedKeys) {
test_block({std::make_shared<vectorized::DataTypeInt64>(),
std::make_shared<vectorized::DataTypeUInt8>()},
- HashKeyType::fixed128);
+ HashKeyType::fixed72);
test_block({std::make_shared<vectorized::DataTypeInt64>(),
std::make_shared<vectorized::DataTypeInt64>()},
diff --git a/be/test/pipeline/common/set_utils_test.cpp
b/be/test/pipeline/common/set_utils_test.cpp
index 89bd2e175c5..bb12a8edb6f 100644
--- a/be/test/pipeline/common/set_utils_test.cpp
+++ b/be/test/pipeline/common/set_utils_test.cpp
@@ -93,6 +93,10 @@ TEST_F(SetUtilsTest, TestSetDataVariantsInitFixedKeys) {
ASSERT_TRUE(std::holds_alternative<SetFixedKeyHashTableContext<vectorized::UInt64>>(
variants.method_variant));
break;
+ case HashKeyType::fixed72:
+
ASSERT_TRUE(std::holds_alternative<SetFixedKeyHashTableContext<vectorized::UInt72>>(
+ variants.method_variant));
+ break;
case HashKeyType::fixed128:
ASSERT_TRUE(std::holds_alternative<SetFixedKeyHashTableContext<vectorized::UInt128>>(
variants.method_variant));
@@ -121,7 +125,7 @@ TEST_F(SetUtilsTest, TestSetDataVariantsInitFixedKeys) {
test_block({std::make_shared<vectorized::DataTypeInt64>(),
std::make_shared<vectorized::DataTypeUInt8>()},
- HashKeyType::fixed128);
+ HashKeyType::fixed72);
test_block({std::make_shared<vectorized::DataTypeInt64>(),
std::make_shared<vectorized::DataTypeInt64>()},
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]