This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
commit 8f77e6363a242b6153ff38c00f557942a9cfbb07 Author: yangshijie <sjyang2...@zju.edu.cn> AuthorDate: Fri Feb 23 11:07:50 2024 +0800 [Feature](function) Support xxhash function like murmur hash function (#31193) --- be/src/util/hash_util.hpp | 11 ++ be/src/vec/functions/function_hash.cpp | 153 +++++++++++++-------- be/test/vec/function/function_hash_test.cpp | 64 +++++++++ .../hash-functions/murmur-hash3-32.md | 2 + .../hash-functions/murmur-hash3-64.md | 2 + .../{murmur-hash3-32.md => xxhash-32.md} | 52 +++---- .../sql-functions/hash-functions/xxhash-64.md | 85 ++++++++++++ docs/sidebars.json | 4 +- .../hash-functions/murmur-hash3-32.md | 4 +- .../hash-functions/murmur-hash3-64.md | 4 +- .../{murmur-hash3-32.md => xxhash-32.md} | 53 +++---- .../sql-functions/hash-functions/xxhash-64.md | 84 +++++++++++ .../doris/catalog/BuiltinScalarFunctions.java | 4 + .../expressions/functions/scalar/XxHash32.java | 72 ++++++++++ .../expressions/functions/scalar/XxHash64.java | 72 ++++++++++ .../expressions/visitor/ScalarFunctionVisitor.java | 10 ++ gensrc/script/doris_builtins_functions.py | 6 +- .../hash_functions/test_hash_function.out | 17 +++ .../hash_functions/test_hash_function.out | 17 +++ .../hash_functions/test_hash_function.groovy | 8 ++ .../hash_functions/test_hash_function.groovy | 8 ++ 21 files changed, 617 insertions(+), 115 deletions(-) diff --git a/be/src/util/hash_util.hpp b/be/src/util/hash_util.hpp index afa8a145386..402797a8e35 100644 --- a/be/src/util/hash_util.hpp +++ b/be/src/util/hash_util.hpp @@ -335,6 +335,16 @@ public: #endif // xxHash function for a byte array. For convenience, a 64-bit seed is also // hashed into the result. The mapping may change from time to time. 
+ static xxh_u32 xxHash32WithSeed(const char* s, size_t len, xxh_u32 seed) { + return XXH32(s, len, seed); + } + + // same to the up function, just for null value + static xxh_u32 xxHash32NullWithSeed(xxh_u32 seed) { + static const int INT_VALUE = 0; + return XXH32(reinterpret_cast<const char*>(&INT_VALUE), sizeof(int), seed); + } + static xxh_u64 xxHash64WithSeed(const char* s, size_t len, xxh_u64 seed) { return XXH3_64bits_withSeed(s, len, seed); } @@ -344,6 +354,7 @@ public: static const int INT_VALUE = 0; return XXH3_64bits_withSeed(reinterpret_cast<const char*>(&INT_VALUE), sizeof(int), seed); } + #if defined(__clang__) #pragma clang diagnostic pop #endif diff --git a/be/src/vec/functions/function_hash.cpp b/be/src/vec/functions/function_hash.cpp index cb8dfc09434..195dff94836 100644 --- a/be/src/vec/functions/function_hash.cpp +++ b/be/src/vec/functions/function_hash.cpp @@ -41,21 +41,82 @@ namespace doris::vectorized { constexpr uint64_t emtpy_value = 0xe28dbde7fe22e41c; template <typename ReturnType> -struct MurmurHash3ImplName {}; +struct MurmurHash3Impl { + static constexpr auto name = + std::is_same_v<ReturnType, Int32> ? 
"murmur_hash3_32" : "murmur_hash3_64"; -template <> -struct MurmurHash3ImplName<Int32> { - static constexpr auto name = "murmur_hash3_32"; -}; + static Status empty_apply(IColumn& icolumn, size_t input_rows_count) { + ColumnVector<ReturnType>& vec_to = assert_cast<ColumnVector<ReturnType>&>(icolumn); + vec_to.get_data().assign(input_rows_count, static_cast<ReturnType>(emtpy_value)); + return Status::OK(); + } + + static Status first_apply(const IDataType* type, const IColumn* column, size_t input_rows_count, + IColumn& icolumn) { + return execute<true>(type, column, input_rows_count, icolumn); + } + + static Status combine_apply(const IDataType* type, const IColumn* column, + size_t input_rows_count, IColumn& icolumn) { + return execute<false>(type, column, input_rows_count, icolumn); + } -template <> -struct MurmurHash3ImplName<Int64> { - static constexpr auto name = "murmur_hash3_64"; + template <bool first> + static Status execute(const IDataType* type, const IColumn* column, size_t input_rows_count, + IColumn& col_to) { + auto& to_column = assert_cast<ColumnVector<ReturnType>&>(col_to); + if constexpr (first) { + if constexpr (std::is_same_v<ReturnType, Int32>) { + to_column.fill(static_cast<Int32>(HashUtil::MURMUR3_32_SEED), input_rows_count); + } else { + to_column.insert_many_defaults(input_rows_count); + } + } + auto& col_to_data = to_column.get_data(); + if (const auto* col_from = check_and_get_column<ColumnString>(column)) { + const typename ColumnString::Chars& data = col_from->get_chars(); + const typename ColumnString::Offsets& offsets = col_from->get_offsets(); + size_t size = offsets.size(); + ColumnString::Offset current_offset = 0; + for (size_t i = 0; i < size; ++i) { + if constexpr (std::is_same_v<ReturnType, Int32>) { + col_to_data[i] = HashUtil::murmur_hash3_32( + reinterpret_cast<const char*>(&data[current_offset]), + offsets[i] - current_offset, col_to_data[i]); + } else { + murmur_hash3_x64_64(reinterpret_cast<const 
char*>(&data[current_offset]), + offsets[i] - current_offset, col_to_data[i], + col_to_data.data() + i); + } + current_offset = offsets[i]; + } + } else if (const ColumnConst* col_from_const = + check_and_get_column_const_string_or_fixedstring(column)) { + auto value = col_from_const->get_value<String>(); + for (size_t i = 0; i < input_rows_count; ++i) { + if constexpr (std::is_same_v<ReturnType, Int32>) { + col_to_data[i] = + HashUtil::murmur_hash3_32(value.data(), value.size(), col_to_data[i]); + } else { + murmur_hash3_x64_64(value.data(), value.size(), col_to_data[i], + col_to_data.data() + i); + } + } + } else { + DCHECK(false); + return Status::NotSupported("Illegal column {} of argument of function {}", + column->get_name(), name); + } + return Status::OK(); + } }; +using FunctionMurmurHash3_32 = FunctionVariadicArgumentsBase<DataTypeInt32, MurmurHash3Impl<Int32>>; +using FunctionMurmurHash3_64 = FunctionVariadicArgumentsBase<DataTypeInt64, MurmurHash3Impl<Int64>>; + template <typename ReturnType> -struct MurmurHash3Impl { - static constexpr auto name = MurmurHash3ImplName<ReturnType>::name; +struct XxHashImpl { + static constexpr auto name = std::is_same_v<ReturnType, Int32> ? 
"xxhash_32" : "xxhash_64"; static Status empty_apply(IColumn& icolumn, size_t input_rows_count) { ColumnVector<ReturnType>& vec_to = assert_cast<ColumnVector<ReturnType>&>(icolumn); @@ -76,40 +137,25 @@ struct MurmurHash3Impl { template <bool first> static Status execute(const IDataType* type, const IColumn* column, size_t input_rows_count, IColumn& col_to) { - auto* col_to_data = assert_cast<ColumnVector<ReturnType>&>(col_to).get_data().data(); + auto& to_column = assert_cast<ColumnVector<ReturnType>&>(col_to); + if constexpr (first) { + to_column.insert_many_defaults(input_rows_count); + } + auto& col_to_data = to_column.get_data(); if (const auto* col_from = check_and_get_column<ColumnString>(column)) { const typename ColumnString::Chars& data = col_from->get_chars(); const typename ColumnString::Offsets& offsets = col_from->get_offsets(); size_t size = offsets.size(); - ColumnString::Offset current_offset = 0; for (size_t i = 0; i < size; ++i) { - if (first) { - if constexpr (std::is_same_v<ReturnType, Int32>) { - UInt32 val = HashUtil::murmur_hash3_32( - reinterpret_cast<const char*>(&data[current_offset]), - offsets[i] - current_offset, HashUtil::MURMUR3_32_SEED); - col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)), - 0); - } else { - UInt64 val = 0; - murmur_hash3_x64_64(reinterpret_cast<const char*>(&data[current_offset]), - offsets[i] - current_offset, 0, &val); - col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)), - 0); - } + if constexpr (std::is_same_v<ReturnType, Int32>) { + col_to_data[i] = HashUtil::xxHash32WithSeed( + reinterpret_cast<const char*>(&data[current_offset]), + offsets[i] - current_offset, col_to_data[i]); } else { - if constexpr (std::is_same_v<ReturnType, Int32>) { - col_to_data[i] = HashUtil::murmur_hash3_32( - reinterpret_cast<const char*>(&data[current_offset]), - offsets[i] - current_offset, - assert_cast<ColumnInt32&>(col_to).get_data()[i]); - } else { - 
murmur_hash3_x64_64(reinterpret_cast<const char*>(&data[current_offset]), - offsets[i] - current_offset, - assert_cast<ColumnInt64&>(col_to).get_data()[i], - col_to_data + i); - } + col_to_data[i] = HashUtil::xxHash64WithSeed( + reinterpret_cast<const char*>(&data[current_offset]), + offsets[i] - current_offset, col_to_data[i]); } current_offset = offsets[i]; } @@ -117,28 +163,12 @@ struct MurmurHash3Impl { check_and_get_column_const_string_or_fixedstring(column)) { auto value = col_from_const->get_value<String>(); for (size_t i = 0; i < input_rows_count; ++i) { - if (first) { - if constexpr (std::is_same_v<ReturnType, Int32>) { - UInt32 val = HashUtil::murmur_hash3_32(value.data(), value.size(), - HashUtil::MURMUR3_32_SEED); - col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)), - 0); - } else { - UInt64 val = 0; - murmur_hash3_x64_64(value.data(), value.size(), 0, &val); - col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)), - 0); - } + if constexpr (std::is_same_v<ReturnType, Int32>) { + col_to_data[i] = + HashUtil::xxHash32WithSeed(value.data(), value.size(), col_to_data[i]); } else { - if constexpr (std::is_same_v<ReturnType, Int32>) { - col_to_data[i] = HashUtil::murmur_hash3_32( - value.data(), value.size(), - assert_cast<ColumnInt32&>(col_to).get_data()[i]); - } else { - murmur_hash3_x64_64(value.data(), value.size(), - assert_cast<ColumnInt64&>(col_to).get_data()[i], - col_to_data + i); - } + col_to_data[i] = + HashUtil::xxHash64WithSeed(value.data(), value.size(), col_to_data[i]); } } } else { @@ -149,11 +179,14 @@ struct MurmurHash3Impl { return Status::OK(); } }; -using FunctionMurmurHash3_32 = FunctionVariadicArgumentsBase<DataTypeInt32, MurmurHash3Impl<Int32>>; -using FunctionMurmurHash3_64 = FunctionVariadicArgumentsBase<DataTypeInt64, MurmurHash3Impl<Int64>>; + +using FunctionXxHash_32 = FunctionVariadicArgumentsBase<DataTypeInt32, XxHashImpl<Int32>>; +using FunctionXxHash_64 = 
FunctionVariadicArgumentsBase<DataTypeInt64, XxHashImpl<Int64>>; void register_function_hash(SimpleFunctionFactory& factory) { factory.register_function<FunctionMurmurHash3_32>(); factory.register_function<FunctionMurmurHash3_64>(); + factory.register_function<FunctionXxHash_32>(); + factory.register_function<FunctionXxHash_64>(); } } // namespace doris::vectorized \ No newline at end of file diff --git a/be/test/vec/function/function_hash_test.cpp b/be/test/vec/function/function_hash_test.cpp index 10c57d1c31f..4d2cf6be4b4 100644 --- a/be/test/vec/function/function_hash_test.cpp +++ b/be/test/vec/function/function_hash_test.cpp @@ -94,4 +94,68 @@ TEST(HashFunctionTest, murmur_hash_3_64_test) { }; } +TEST(HashFunctionTest, xxhash_32_test) { + std::string func_name = "xxhash_32"; + + { + InputTypeSet input_types = {TypeIndex::String}; + + DataSet data_set = {{{Null()}, Null()}, {{std::string("hello")}, (int32_t)-83855367}}; + + static_cast<void>(check_function<DataTypeInt32, true>(func_name, input_types, data_set)); + }; + + { + InputTypeSet input_types = {TypeIndex::String, TypeIndex::String}; + + DataSet data_set = {{{std::string("hello"), std::string("world")}, (int32_t)-920844969}, + {{std::string("hello"), Null()}, Null()}}; + + static_cast<void>(check_function<DataTypeInt32, true>(func_name, input_types, data_set)); + }; + + { + InputTypeSet input_types = {TypeIndex::String, TypeIndex::String, TypeIndex::String}; + + DataSet data_set = {{{std::string("hello"), std::string("world"), std::string("!")}, + (int32_t)352087701}, + {{std::string("hello"), std::string("world"), Null()}, Null()}}; + + static_cast<void>(check_function<DataTypeInt32, true>(func_name, input_types, data_set)); + }; +} + +TEST(HashFunctionTest, xxhash_64_test) { + std::string func_name = "xxhash_64"; + + { + InputTypeSet input_types = {TypeIndex::String}; + + DataSet data_set = {{{Null()}, Null()}, + {{std::string("hello")}, (int64_t)-7685981735718036227}}; + + 
static_cast<void>(check_function<DataTypeInt64, true>(func_name, input_types, data_set)); + }; + + { + InputTypeSet input_types = {TypeIndex::String, TypeIndex::String}; + + DataSet data_set = { + {{std::string("hello"), std::string("world")}, (int64_t)7001965798170371843}, + {{std::string("hello"), Null()}, Null()}}; + + static_cast<void>(check_function<DataTypeInt64, true>(func_name, input_types, data_set)); + }; + + { + InputTypeSet input_types = {TypeIndex::String, TypeIndex::String, TypeIndex::String}; + + DataSet data_set = {{{std::string("hello"), std::string("world"), std::string("!")}, + (int64_t)6796829678999971400}, + {{std::string("hello"), std::string("world"), Null()}, Null()}}; + + static_cast<void>(check_function<DataTypeInt64, true>(func_name, input_types, data_set)); + }; +} + } // namespace doris::vectorized diff --git a/docs/en/docs/sql-manual/sql-functions/hash-functions/murmur-hash3-32.md b/docs/en/docs/sql-manual/sql-functions/hash-functions/murmur-hash3-32.md index 7610d4ea27d..051a5c262ff 100644 --- a/docs/en/docs/sql-manual/sql-functions/hash-functions/murmur-hash3-32.md +++ b/docs/en/docs/sql-manual/sql-functions/hash-functions/murmur-hash3-32.md @@ -31,6 +31,8 @@ under the License. Return the 32 bits murmur3 hash of input string. +Note: When calculating hash values, it is more recommended to use `xxhash_32` instead of `murmur_hash3_32`. + ### example ``` diff --git a/docs/en/docs/sql-manual/sql-functions/hash-functions/murmur-hash3-64.md b/docs/en/docs/sql-manual/sql-functions/hash-functions/murmur-hash3-64.md index d1965f3ed01..fb9d1dd6217 100644 --- a/docs/en/docs/sql-manual/sql-functions/hash-functions/murmur-hash3-64.md +++ b/docs/en/docs/sql-manual/sql-functions/hash-functions/murmur-hash3-64.md @@ -31,6 +31,8 @@ under the License. Return the 64 bits murmur3 hash of input string. +Note: When calculating hash values, it is more recommended to use `xxhash_64` instead of `murmur_hash3_64`. 
+ ### example ``` diff --git a/docs/en/docs/sql-manual/sql-functions/hash-functions/murmur-hash3-32.md b/docs/en/docs/sql-manual/sql-functions/hash-functions/xxhash-32.md similarity index 53% copy from docs/en/docs/sql-manual/sql-functions/hash-functions/murmur-hash3-32.md copy to docs/en/docs/sql-manual/sql-functions/hash-functions/xxhash-32.md index 7610d4ea27d..3707d7a70c3 100644 --- a/docs/en/docs/sql-manual/sql-functions/hash-functions/murmur-hash3-32.md +++ b/docs/en/docs/sql-manual/sql-functions/hash-functions/xxhash-32.md @@ -1,6 +1,6 @@ --- { - "title": "MURMUR_HASH3_32", + "title": "XXHASH_32", "language": "en" } --- @@ -22,40 +22,42 @@ specific language governing permissions and limitations under the License. --> -## murmur_hash3_32 +## xxhash_32 ### description #### Syntax -`INT MURMUR_HASH3_32(VARCHAR input, ...)` +`INT XXHASH_32(VARCHAR input, ...)` -Return the 32 bits murmur3 hash of input string. +Returns the 32-bit xxHash value of the input string. + +Note: When calculating hash values, `xxhash_32` is recommended over `murmur_hash3_32`. 
### example ``` -mysql> select murmur_hash3_32(null); -+-----------------------+ -| murmur_hash3_32(NULL) | -+-----------------------+ -| NULL | -+-----------------------+ - -mysql> select murmur_hash3_32("hello"); -+--------------------------+ -| murmur_hash3_32('hello') | -+--------------------------+ -| 1321743225 | -+--------------------------+ - -mysql> select murmur_hash3_32("hello", "world"); -+-----------------------------------+ -| murmur_hash3_32('hello', 'world') | -+-----------------------------------+ -| 984713481 | -+-----------------------------------+ +mysql> select xxhash_32(NULL); ++-----------------+ +| xxhash_32(NULL) | ++-----------------+ +| NULL | ++-----------------+ + +mysql> select xxhash_32("hello"); ++--------------------+ +| xxhash_32('hello') | ++--------------------+ +| -83855367 | ++--------------------+ + +mysql> select xxhash_32("hello", "world"); ++-----------------------------+ +| xxhash_32('hello', 'world') | ++-----------------------------+ +| -920844969 | ++-----------------------------+ ``` ### keywords - MURMUR_HASH3_32,HASH +XXHASH_32,HASH diff --git a/docs/en/docs/sql-manual/sql-functions/hash-functions/xxhash-64.md b/docs/en/docs/sql-manual/sql-functions/hash-functions/xxhash-64.md new file mode 100644 index 00000000000..506613177e9 --- /dev/null +++ b/docs/en/docs/sql-manual/sql-functions/hash-functions/xxhash-64.md @@ -0,0 +1,85 @@ +--- +{ + "title": "XXHASH_64", + "language": "en" +} +--- + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. 
You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> + +## xxhash_64 + +### description +#### Syntax + +`BIGINT XXHASH_64(VARCHAR input, ...)` + +Returns the 64-bit xxHash value of the input string. + +Note: When calculating hash values, `xxhash_64` is recommended over `murmur_hash3_64`. + +### example + +``` +mysql> select xxhash_64(NULL); ++-----------------+ +| xxhash_64(NULL) | ++-----------------+ +| NULL | ++-----------------+ + +mysql> select xxhash_64("hello"); ++----------------------+ +| xxhash_64('hello') | ++----------------------+ +| -7685981735718036227 | ++----------------------+ + +mysql> select xxhash_64("hello", "world"); ++-----------------------------+ +| xxhash_64('hello', 'world') | ++-----------------------------+ +| 7001965798170371843 | ++-----------------------------+ +``` + +### benchmark + +In TPC-H benchmark testing, `xxhash_64` was significantly faster than `murmur_hash3_64` (see the timings below). When a hash value needs to be computed, `xxhash_64` is therefore the recommended choice. 
+ +``` +mysql> select count(murmur_hash3_64(l_comment)) from lineitem; ++-----------------------------------+ +| count(murmur_hash3_64(l_comment)) | ++-----------------------------------+ +| 600037902 | ++-----------------------------------+ +1 row in set (17.18 sec) + +mysql> select count(xxhash_64(l_comment)) from lineitem; ++-----------------------------+ +| count(xxhash_64(l_comment)) | ++-----------------------------+ +| 600037902 | ++-----------------------------+ +1 row in set (8.41 sec) +``` + +### keywords + +XXHASH_64,HASH diff --git a/docs/sidebars.json b/docs/sidebars.json index 48b3c3eea5f..0d07a890eac 100644 --- a/docs/sidebars.json +++ b/docs/sidebars.json @@ -708,7 +708,9 @@ "label": "Hash Functions", "items": [ "sql-manual/sql-functions/hash-functions/murmur-hash3-32", - "sql-manual/sql-functions/hash-functions/murmur-hash3-64" + "sql-manual/sql-functions/hash-functions/murmur-hash3-64", + "sql-manual/sql-functions/hash-functions/xxhash-32", + "sql-manual/sql-functions/hash-functions/xxhash-64" ] }, { diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/murmur-hash3-32.md b/docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/murmur-hash3-32.md index 93100700c7d..57c840293d7 100644 --- a/docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/murmur-hash3-32.md +++ b/docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/murmur-hash3-32.md @@ -29,7 +29,9 @@ under the License. `INT MURMUR_HASH3_32(VARCHAR input, ...)` -返回输入字符串的32位murmur3 hash值 +返回输入字符串的32位murmur3 hash值。 + +注:在计算hash值时,更推荐使用`xxhash_32`,而不是`murmur_hash3_32`。 ### example diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/murmur-hash3-64.md b/docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/murmur-hash3-64.md index 2a7f04d8f6c..e113d675898 100644 --- a/docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/murmur-hash3-64.md +++ b/docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/murmur-hash3-64.md @@ -29,7 +29,9 @@ under the License. 
`BIGINT MURMUR_HASH3_64(VARCHAR input, ...)` -返回输入字符串的64位murmur3 hash值 +返回输入字符串的64位murmur3 hash值。 + +注:在计算hash值时,更推荐使用`xxhash_64`,而不是`murmur_hash3_64`。 ### example diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/murmur-hash3-32.md b/docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/xxhash-32.md similarity index 53% copy from docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/murmur-hash3-32.md copy to docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/xxhash-32.md index 93100700c7d..9c839f90d8c 100644 --- a/docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/murmur-hash3-32.md +++ b/docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/xxhash-32.md @@ -1,6 +1,6 @@ --- { - "title": "MURMUR_HASH3_32", + "title": "XXHASH_32", "language": "zh-CN" } --- @@ -22,40 +22,41 @@ specific language governing permissions and limitations under the License. --> -## murmur_hash3_32 +## xxhash_32 ### description #### Syntax -`INT MURMUR_HASH3_32(VARCHAR input, ...)` +`INT XXHASH_32(VARCHAR input, ...)` -返回输入字符串的32位murmur3 hash值 +返回输入字符串的32位xxhash值。 + +注:在计算hash值时,更推荐使用`xxhash_32`,而不是`murmur_hash3_32`。 ### example ``` -mysql> select murmur_hash3_32(null); -+-----------------------+ -| murmur_hash3_32(NULL) | -+-----------------------+ -| NULL | -+-----------------------+ - -mysql> select murmur_hash3_32("hello"); -+--------------------------+ -| murmur_hash3_32('hello') | -+--------------------------+ -| 1321743225 | -+--------------------------+ - -mysql> select murmur_hash3_32("hello", "world"); -+-----------------------------------+ -| murmur_hash3_32('hello', 'world') | -+-----------------------------------+ -| 984713481 | -+-----------------------------------+ +mysql> select xxhash_32(NULL); ++-----------------+ +| xxhash_32(NULL) | ++-----------------+ +| NULL | ++-----------------+ + +mysql> select xxhash_32("hello"); ++--------------------+ +| xxhash_32('hello') | ++--------------------+ +| -83855367 | ++--------------------+ + 
+mysql> select xxhash_32("hello", "world"); ++-----------------------------+ +| xxhash_32('hello', 'world') | ++-----------------------------+ +| -920844969 | ++-----------------------------+ ``` ### keywords - - MURMUR_HASH3_32,HASH +XXHASH_32,HASH diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/xxhash-64.md b/docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/xxhash-64.md new file mode 100644 index 00000000000..065e9242334 --- /dev/null +++ b/docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/xxhash-64.md @@ -0,0 +1,84 @@ +--- +{ + "title": "XXHASH_64", + "language": "zh-CN" +} +--- + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. 
+--> + +## xxhash_64 + +### description +#### Syntax + +`BIGINT XXHASH_64(VARCHAR input, ...)` + +返回输入字符串的64位xxhash值。 + +注:在计算hash值时,更推荐使用`xxhash_64`,而不是`murmur_hash3_64`。 + +### example + +``` +mysql> select xxhash_64(NULL); ++-----------------+ +| xxhash_64(NULL) | ++-----------------+ +| NULL | ++-----------------+ + +mysql> select xxhash_64("hello"); ++----------------------+ +| xxhash_64('hello') | ++----------------------+ +| -7685981735718036227 | ++----------------------+ + +mysql> select xxhash_64("hello", "world"); ++-----------------------------+ +| xxhash_64('hello', 'world') | ++-----------------------------+ +| 7001965798170371843 | ++-----------------------------+ +``` +### benchmark + +通过TPCH Benchmark测试发现,`xxhash_64`相比`murmur_hash3_64`来说性能大幅提升,因此在需要计算hash值的场景下,更推荐使用`xxhash_64`。 + +``` +mysql> select count(murmur_hash3_64(l_comment)) from lineitem; ++-----------------------------------+ +| count(murmur_hash3_64(l_comment)) | ++-----------------------------------+ +| 600037902 | ++-----------------------------------+ +1 row in set (17.18 sec) + +mysql> select count(xxhash_64(l_comment)) from lineitem; ++-----------------------------+ +| count(xxhash_64(l_comment)) | ++-----------------------------+ +| 600037902 | ++-----------------------------+ +1 row in set (8.41 sec) +``` + +### keywords + +XXHASH_64,HASH diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java index 1ace763675f..f5928650efb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java @@ -426,6 +426,8 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.WeeksAdd; import org.apache.doris.nereids.trees.expressions.functions.scalar.WeeksDiff; import org.apache.doris.nereids.trees.expressions.functions.scalar.WeeksSub; import 
org.apache.doris.nereids.trees.expressions.functions.scalar.WidthBucket; +import org.apache.doris.nereids.trees.expressions.functions.scalar.XxHash32; +import org.apache.doris.nereids.trees.expressions.functions.scalar.XxHash64; import org.apache.doris.nereids.trees.expressions.functions.scalar.Year; import org.apache.doris.nereids.trees.expressions.functions.scalar.YearCeil; import org.apache.doris.nereids.trees.expressions.functions.scalar.YearFloor; @@ -876,6 +878,8 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(WeeksDiff.class, "weeks_diff"), scalar(WeeksSub.class, "weeks_sub"), scalar(WidthBucket.class, "width_bucket"), + scalar(XxHash32.class, "xxhash_32"), + scalar(XxHash64.class, "xxhash_64"), scalar(Year.class, "year"), scalar(YearCeil.class, "year_ceil"), scalar(YearFloor.class, "year_floor"), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/XxHash32.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/XxHash32.java new file mode 100644 index 00000000000..149c2cbc766 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/XxHash32.java @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.expressions.functions.scalar; + +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; +import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.IntegerType; +import org.apache.doris.nereids.types.StringType; +import org.apache.doris.nereids.types.VarcharType; +import org.apache.doris.nereids.util.ExpressionUtils; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * ScalarFunction 'xxhash_32'. + */ +public class XxHash32 extends ScalarFunction + implements ExplicitlyCastableSignature, PropagateNullable { + + public static final List<FunctionSignature> SIGNATURES = ImmutableList.of( + FunctionSignature.ret(IntegerType.INSTANCE).varArgs(VarcharType.SYSTEM_DEFAULT), + FunctionSignature.ret(IntegerType.INSTANCE).varArgs(StringType.INSTANCE) + ); + + /** + * constructor with 1 or more arguments. + */ + public XxHash32(Expression arg, Expression... varArgs) { + super("xxhash_32", ExpressionUtils.mergeArguments(arg, varArgs)); + } + + /** + * withChildren. 
+ */ + @Override + public XxHash32 withChildren(List<Expression> children) { + Preconditions.checkArgument(children.size() >= 1); + return new XxHash32(children.get(0), + children.subList(1, children.size()).toArray(new Expression[0])); + } + + @Override + public List<FunctionSignature> getSignatures() { + return SIGNATURES; + } + + @Override + public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) { + return visitor.visitXxHash32(this, context); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/XxHash64.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/XxHash64.java new file mode 100644 index 00000000000..bc23d8c2a5b --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/XxHash64.java @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.trees.expressions.functions.scalar; + +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; +import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.BigIntType; +import org.apache.doris.nereids.types.StringType; +import org.apache.doris.nereids.types.VarcharType; +import org.apache.doris.nereids.util.ExpressionUtils; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * ScalarFunction 'xxhash_64'. + */ +public class XxHash64 extends ScalarFunction + implements ExplicitlyCastableSignature, PropagateNullable { + + public static final List<FunctionSignature> SIGNATURES = ImmutableList.of( + FunctionSignature.ret(BigIntType.INSTANCE).varArgs(VarcharType.SYSTEM_DEFAULT), + FunctionSignature.ret(BigIntType.INSTANCE).varArgs(StringType.INSTANCE) + ); + + /** + * constructor with 1 or more arguments. + */ + public XxHash64(Expression arg, Expression... varArgs) { + super("xxhash_64", ExpressionUtils.mergeArguments(arg, varArgs)); + } + + /** + * withChildren. 
+ */ + @Override + public XxHash64 withChildren(List<Expression> children) { + Preconditions.checkArgument(children.size() >= 1); + return new XxHash64(children.get(0), + children.subList(1, children.size()).toArray(new Expression[0])); + } + + @Override + public List<FunctionSignature> getSignatures() { + return SIGNATURES; + } + + @Override + public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) { + return visitor.visitXxHash64(this, context); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java index 9a1ed840482..183b4a73dac 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java @@ -416,6 +416,8 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.WeeksAdd; import org.apache.doris.nereids.trees.expressions.functions.scalar.WeeksDiff; import org.apache.doris.nereids.trees.expressions.functions.scalar.WeeksSub; import org.apache.doris.nereids.trees.expressions.functions.scalar.WidthBucket; +import org.apache.doris.nereids.trees.expressions.functions.scalar.XxHash32; +import org.apache.doris.nereids.trees.expressions.functions.scalar.XxHash64; import org.apache.doris.nereids.trees.expressions.functions.scalar.Year; import org.apache.doris.nereids.trees.expressions.functions.scalar.YearCeil; import org.apache.doris.nereids.trees.expressions.functions.scalar.YearFloor; @@ -1515,6 +1517,14 @@ public interface ScalarFunctionVisitor<R, C> { return visitScalarFunction(murmurHash364, context); } + default R visitXxHash32(XxHash32 xxHash32, C context) { + return visitScalarFunction(xxHash32, context); + } + + default R visitXxHash64(XxHash64 xxHash64, C context) { + return visitScalarFunction(xxHash64, 
context); + } + default R visitNegative(Negative negative, C context) { return visitScalarFunction(negative, context); } diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py index 1277f72db60..0ecb05612ac 100644 --- a/gensrc/script/doris_builtins_functions.py +++ b/gensrc/script/doris_builtins_functions.py @@ -1885,7 +1885,11 @@ visible_functions = { [['murmur_hash3_32'], 'INT', ['VARCHAR', '...'], ''], [['murmur_hash3_32'], 'INT', ['STRING', '...'], ''], [['murmur_hash3_64'], 'BIGINT', ['VARCHAR', '...'], ''], - [['murmur_hash3_64'], 'BIGINT', ['STRING', '...'], ''] + [['murmur_hash3_64'], 'BIGINT', ['STRING', '...'], ''], + [['xxhash_32'], 'INT', ['VARCHAR', '...'], ''], + [['xxhash_32'], 'INT', ['STRING', '...'], ''], + [['xxhash_64'], 'BIGINT', ['VARCHAR', '...'], ''], + [['xxhash_64'], 'BIGINT', ['STRING', '...'], ''] ], # aes and base64 function diff --git a/regression-test/data/nereids_p0/sql_functions/hash_functions/test_hash_function.out b/regression-test/data/nereids_p0/sql_functions/hash_functions/test_hash_function.out index 221936613d3..984075ddeff 100644 --- a/regression-test/data/nereids_p0/sql_functions/hash_functions/test_hash_function.out +++ b/regression-test/data/nereids_p0/sql_functions/hash_functions/test_hash_function.out @@ -17,3 +17,20 @@ -- !sql -- 3583109472027628045 +-- !sql -- +\N + +-- !sql -- +-83855367 + +-- !sql -- +-920844969 + +-- !sql -- +\N + +-- !sql -- +-7685981735718036227 + +-- !sql -- +7001965798170371843 diff --git a/regression-test/data/query_p0/sql_functions/hash_functions/test_hash_function.out b/regression-test/data/query_p0/sql_functions/hash_functions/test_hash_function.out index 221936613d3..984075ddeff 100644 --- a/regression-test/data/query_p0/sql_functions/hash_functions/test_hash_function.out +++ b/regression-test/data/query_p0/sql_functions/hash_functions/test_hash_function.out @@ -17,3 +17,20 @@ -- !sql -- 3583109472027628045 +-- !sql -- +\N + +-- !sql -- 
+-83855367 + +-- !sql -- +-920844969 + +-- !sql -- +\N + +-- !sql -- +-7685981735718036227 + +-- !sql -- +7001965798170371843 diff --git a/regression-test/suites/nereids_p0/sql_functions/hash_functions/test_hash_function.groovy b/regression-test/suites/nereids_p0/sql_functions/hash_functions/test_hash_function.groovy index ae805f904c3..8cae71a2793 100644 --- a/regression-test/suites/nereids_p0/sql_functions/hash_functions/test_hash_function.groovy +++ b/regression-test/suites/nereids_p0/sql_functions/hash_functions/test_hash_function.groovy @@ -26,4 +26,12 @@ suite("test_hash_function") { qt_sql "SELECT murmur_hash3_64(null);" qt_sql "SELECT murmur_hash3_64(\"hello\");" qt_sql "SELECT murmur_hash3_64(\"hello\", \"world\");" + + qt_sql "SELECT xxhash_32(null);" + qt_sql "SELECT xxhash_32(\"hello\");" + qt_sql "SELECT xxhash_32(\"hello\", \"world\");" + + qt_sql "SELECT xxhash_64(null);" + qt_sql "SELECT xxhash_64(\"hello\");" + qt_sql "SELECT xxhash_64(\"hello\", \"world\");" } diff --git a/regression-test/suites/query_p0/sql_functions/hash_functions/test_hash_function.groovy b/regression-test/suites/query_p0/sql_functions/hash_functions/test_hash_function.groovy index d44518509da..d547e9fb287 100644 --- a/regression-test/suites/query_p0/sql_functions/hash_functions/test_hash_function.groovy +++ b/regression-test/suites/query_p0/sql_functions/hash_functions/test_hash_function.groovy @@ -25,4 +25,12 @@ suite("test_hash_function", "arrow_flight_sql") { qt_sql "SELECT murmur_hash3_64(null);" qt_sql "SELECT murmur_hash3_64(\"hello\");" qt_sql "SELECT murmur_hash3_64(\"hello\", \"world\");" + + qt_sql "SELECT xxhash_32(null);" + qt_sql "SELECT xxhash_32(\"hello\");" + qt_sql "SELECT xxhash_32(\"hello\", \"world\");" + + qt_sql "SELECT xxhash_64(null);" + qt_sql "SELECT xxhash_64(\"hello\");" + qt_sql "SELECT xxhash_64(\"hello\", \"world\");" } --------------------------------------------------------------------- To unsubscribe, e-mail: 
commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org