This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch branch-1.1-lts in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-1.1-lts by this push: new 1ec59f7d63 [function](hash) add support of murmur_hash3_64 (#12923) (#14636) 1ec59f7d63 is described below commit 1ec59f7d63d6d57377335fc3cd3e949206b41965 Author: TengJianPing <18241664+jackte...@users.noreply.github.com> AuthorDate: Mon Nov 28 15:53:54 2022 +0800 [function](hash) add support of murmur_hash3_64 (#12923) (#14636) --- be/src/exprs/hash_functions.cpp | 14 ++++ be/src/exprs/hash_functions.h | 3 + be/src/util/hash_util.hpp | 55 +-------------- be/src/util/murmur_hash3.cpp | 4 +- be/src/vec/functions/function_hash.cpp | 79 ++++++++++++++++------ be/test/CMakeLists.txt | 1 + .../exprs/hash_function_test.cpp} | 47 +++++++------ be/test/vec/function/function_hash_test.cpp | 33 +++++++++ .../hash-functions/murmur_hash3_64.md | 61 +++++++++++++++++ .../hash-functions/murmur_hash3_64.md | 61 +++++++++++++++++ gensrc/script/doris_builtins_functions.py | 6 ++ .../hash_functions/test_hash_function.out | 18 +++++ .../hash_functions/test_hash_function.groovy | 27 +++----- 13 files changed, 300 insertions(+), 109 deletions(-) diff --git a/be/src/exprs/hash_functions.cpp b/be/src/exprs/hash_functions.cpp index 0407e62381..9b9689f5f0 100644 --- a/be/src/exprs/hash_functions.cpp +++ b/be/src/exprs/hash_functions.cpp @@ -40,4 +40,18 @@ IntVal HashFunctions::murmur_hash3_32(FunctionContext* ctx, int num_children, return seed; } +BigIntVal HashFunctions::murmur_hash3_64(FunctionContext* ctx, int num_children, + const StringVal* inputs) { + uint64_t seed = 0; + uint64_t hash = 0; + for (int i = 0; i < num_children; ++i) { + if (inputs[i].is_null) { + return BigIntVal::null(); + } + murmur_hash3_x64_64(inputs[i].ptr, inputs[i].len, seed, &hash); + seed = hash; + } + return hash; +} + } // namespace doris diff --git a/be/src/exprs/hash_functions.h b/be/src/exprs/hash_functions.h index 9fcfb9a7aa..288dfbc7fd 100644 --- a/be/src/exprs/hash_functions.h +++ b/be/src/exprs/hash_functions.h @@ -20,6 +20,7 @@ namespace doris_udf { class FunctionContext; struct IntVal; +struct BigIntVal; struct StringVal; } // namespace doris_udf @@ -30,6 +31,8 @@ public: static void init(); static doris_udf::IntVal murmur_hash3_32(doris_udf::FunctionContext* ctx, int num_children, const doris_udf::StringVal* inputs); + static doris_udf::BigIntVal murmur_hash3_64(doris_udf::FunctionContext* ctx, int num_children, + const doris_udf::StringVal* inputs); }; } // namespace doris diff --git a/be/src/util/hash_util.hpp b/be/src/util/hash_util.hpp index d03f466aff..f8e1076d58 100644 --- a/be/src/util/hash_util.hpp +++ b/be/src/util/hash_util.hpp @@ -119,60 +119,11 @@ public: // refer to https://github.com/apache/commons-codec/blob/master/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java static const uint32_t MURMUR3_32_SEED = 104729; - ALWAYS_INLINE static uint32_t rotl32(uint32_t x, int8_t r) { - return (x << r) | (x >> (32 - r)); - } - - ALWAYS_INLINE static uint32_t fmix32(uint32_t h) { - h ^= h >> 16; - h *= 0x85ebca6b; - h ^= h >> 13; - h *= 0xc2b2ae35; - h ^= h >> 16; - return h; - } - // modify from https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp static uint32_t murmur_hash3_32(const void* key, int32_t len, uint32_t seed) { - const uint8_t* data = (const uint8_t*)key; - const int nblocks = len / 4; - - uint32_t h1 = seed; - - const uint32_t c1 = 0xcc9e2d51; - const uint32_t c2 = 0x1b873593; - const uint32_t* blocks = (const uint32_t*)(data + nblocks * 4); - - for (int i = -nblocks; i; i++) { - uint32_t k1 = blocks[i]; - - k1 *= c1; - k1 = rotl32(k1, 15); - k1 *= c2; - - h1 ^= k1; - h1 = rotl32(h1, 13); - h1 = h1 * 5 + 0xe6546b64; - } - - const uint8_t* tail = (const uint8_t*)(data + nblocks * 4); - uint32_t k1 = 0; - switch (len & 3) { - case 3: - k1 ^= tail[2] << 16; - case 2: - k1 ^= tail[1] << 8; - case 1: - k1 ^= tail[0]; - k1 *= c1; - k1 = rotl32(k1, 15); - k1 *= c2; - h1 ^= k1; - }; - - h1 ^= len; - h1 = fmix32(h1); - return h1; + uint32_t out = 0; + murmur_hash3_x86_32(key, len, seed, &out); + return out; } static const int MURMUR_R = 47; diff --git a/be/src/util/murmur_hash3.cpp b/be/src/util/murmur_hash3.cpp index d2fadc5e17..5181558a81 100644 --- a/be/src/util/murmur_hash3.cpp +++ b/be/src/util/murmur_hash3.cpp @@ -31,11 +31,11 @@ #define FORCE_INLINE inline __attribute__((always_inline)) -inline uint32_t rotl32(uint32_t x, int8_t r) { +FORCE_INLINE uint32_t rotl32(uint32_t x, int8_t r) { return (x << r) | (x >> (32 - r)); } -inline uint64_t rotl64(uint64_t x, int8_t r) { +FORCE_INLINE uint64_t rotl64(uint64_t x, int8_t r) { return (x << r) | (x >> (64 - r)); } diff --git a/be/src/vec/functions/function_hash.cpp b/be/src/vec/functions/function_hash.cpp index 92e2a55827..7e9ccc9492 100644 --- a/be/src/vec/functions/function_hash.cpp +++ b/be/src/vec/functions/function_hash.cpp @@ -155,9 +155,22 @@ struct MurmurHash2Impl64 { }; using FunctionMurmurHash2_64 = FunctionVariadicArgumentsBase<DataTypeUInt64, MurmurHash2Impl64>; -struct MurmurHash3Impl32 { +template <typename ReturnType> +struct MurmurHash3ImplName {}; + +template <> +struct MurmurHash3ImplName<Int32> { static constexpr auto name = "murmur_hash3_32"; - using ReturnType = Int32; +}; + +template <> +struct MurmurHash3ImplName<Int64> { + static constexpr auto name = "murmur_hash3_64"; +}; + +template <typename ReturnType> +struct MurmurHash3Impl { + static constexpr auto name = MurmurHash3ImplName<ReturnType>::name; static Status empty_apply(IColumn& icolumn, size_t input_rows_count) { ColumnVector<ReturnType>& vec_to = assert_cast<ColumnVector<ReturnType>&>(icolumn); @@ -178,6 +191,7 @@ struct MurmurHash3Impl32 { template <bool first> static Status execute(const IDataType* type, const IColumn* column, size_t input_rows_count, IColumn& col_to) { + auto* col_to_data = assert_cast<ColumnVector<ReturnType>&>(col_to).get_data().data(); if (const ColumnString* col_from = check_and_get_column<ColumnString>(column)) { const typename ColumnString::Chars& data = col_from->get_chars(); const typename ColumnString::Offsets& offsets = col_from->get_offsets(); @@ -186,16 +200,29 @@ struct MurmurHash3Impl32 { ColumnString::Offset current_offset = 0; for (size_t i = 0; i < size; ++i) { if (first) { - UInt32 val = HashUtil::murmur_hash3_32( - reinterpret_cast<const char*>(&data[current_offset]), - offsets[i] - current_offset - 1, HashUtil::MURMUR3_32_SEED); - col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)), 0); + if constexpr (std::is_same_v<ReturnType, Int32>) { + UInt32 val = HashUtil::murmur_hash3_32( + reinterpret_cast<const char*>(&data[current_offset]), + offsets[i] - current_offset - 1, HashUtil::MURMUR3_32_SEED); + col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)), + 0); + } else { + UInt64 val = 0; + murmur_hash3_x64_64(reinterpret_cast<const char*>(&data[current_offset]), + offsets[i] - current_offset - 1, 0, &val); + col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)), + 0); + } } else { - assert_cast<ColumnVector<ReturnType>&>(col_to).get_data()[i] = - HashUtil::murmur_hash3_32( - reinterpret_cast<const char*>(&data[current_offset]), - offsets[i] - current_offset - 1, - ext::bit_cast<UInt32>(col_to[i])); + if constexpr (std::is_same_v<ReturnType, Int32>) { + col_to_data[i] = HashUtil::murmur_hash3_32( + reinterpret_cast<const char*>(&data[current_offset]), + offsets[i] - current_offset - 1, ext::bit_cast<UInt32>(col_to[i])); + } else { + murmur_hash3_x64_64(reinterpret_cast<const char*>(&data[current_offset]), + offsets[i] - current_offset - 1, + ext::bit_cast<UInt64>(col_to[i]), col_to_data + i); + } } current_offset = offsets[i]; } @@ -204,13 +231,25 @@ struct MurmurHash3Impl32 { String value = col_from_const->get_value<String>().data(); for (size_t i = 0; i < input_rows_count; ++i) { if (first) { - UInt32 val = HashUtil::murmur_hash3_32(value.data(), value.size(), - HashUtil::MURMUR3_32_SEED); - col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)), 0); + if constexpr (std::is_same_v<ReturnType, Int32>) { + UInt32 val = HashUtil::murmur_hash3_32(value.data(), value.size(), + HashUtil::MURMUR3_32_SEED); + col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)), + 0); + } else { + UInt64 val = 0; + murmur_hash3_x64_64(value.data(), value.size(), 0, &val); + col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)), + 0); + } } else { - assert_cast<ColumnVector<ReturnType>&>(col_to).get_data()[i] = - HashUtil::murmur_hash3_32(value.data(), value.size(), - ext::bit_cast<UInt32>(col_to[i])); + if constexpr (std::is_same_v<ReturnType, Int32>) { + col_to_data[i] = HashUtil::murmur_hash3_32( + value.data(), value.size(), ext::bit_cast<UInt32>(col_to[i])); + } else { + murmur_hash3_x64_64(value.data(), value.size(), + ext::bit_cast<UInt64>(col_to[i]), col_to_data + i); + } } } } else { @@ -221,10 +260,12 @@ struct MurmurHash3Impl32 { return Status::OK(); } }; -using FunctionMurmurHash3_32 = FunctionVariadicArgumentsBase<DataTypeInt32, MurmurHash3Impl32>; +using FunctionMurmurHash3_32 = FunctionVariadicArgumentsBase<DataTypeInt32, MurmurHash3Impl<Int32>>; +using FunctionMurmurHash3_64 = FunctionVariadicArgumentsBase<DataTypeInt64, MurmurHash3Impl<Int64>>; void register_function_function_hash(SimpleFunctionFactory& factory) { factory.register_function<FunctionMurmurHash2_64>(); factory.register_function<FunctionMurmurHash3_32>(); + factory.register_function<FunctionMurmurHash3_64>(); } -} // namespace doris::vectorized \ No newline at end of file +} // namespace doris::vectorized diff --git a/be/test/CMakeLists.txt b/be/test/CMakeLists.txt index 93966d2a20..437dd4e354 100644 --- a/be/test/CMakeLists.txt +++ b/be/test/CMakeLists.txt @@ -113,6 +113,7 @@ set(EXPRS_TEST_FILES exprs/bloom_filter_predicate_test.cpp exprs/array_functions_test.cpp exprs/window_funnel_test.cpp + exprs/hash_function_test.cpp ) set(GEO_TEST_FILES geo/wkt_parse_test.cpp diff --git a/be/src/exprs/hash_functions.cpp b/be/test/exprs/hash_function_test.cpp similarity index 53% copy from be/src/exprs/hash_functions.cpp copy to be/test/exprs/hash_function_test.cpp index 0407e62381..5fcccbb657 100644 --- a/be/src/exprs/hash_functions.cpp +++ b/be/test/exprs/hash_function_test.cpp @@ -15,29 +15,38 @@ // specific language governing permissions and limitations // under the License. -#include "exprs/hash_functions.h" +#include <gtest/gtest.h> -#include "udf/udf.h" -#include "util/hash_util.hpp" +#include <iostream> +#include <string> -namespace doris { +#include "exprs/anyval_util.h" +#include "exprs/hash_functions.h" +#include "testutil/function_utils.h" +#include "testutil/test_util.h" -using doris_udf::FunctionContext; -using doris_udf::IntVal; -using doris_udf::StringVal; +namespace doris { -void HashFunctions::init() {} +class HashFunctionsTest : public testing::Test { +public: + HashFunctionsTest() = default; -IntVal HashFunctions::murmur_hash3_32(FunctionContext* ctx, int num_children, - const StringVal* inputs) { - uint32_t seed = HashUtil::MURMUR3_32_SEED; - for (int i = 0; i < num_children; ++i) { - if (inputs[i].is_null) { - return IntVal::null(); - } - seed = HashUtil::murmur_hash3_32(inputs[i].ptr, inputs[i].len, seed); + void SetUp() { + utils = new FunctionUtils(); + ctx = utils->get_fn_ctx(); } - return seed; -} + void TearDown() { delete utils; } + +private: + FunctionUtils* utils; + FunctionContext* ctx; +}; -} // namespace doris +TEST_F(HashFunctionsTest, murmur_hash3_64) { + StringVal input = AnyValUtil::from_string_temp(ctx, std::string("hello")); + BigIntVal result = HashFunctions::murmur_hash3_64(ctx, 1, &input); + BigIntVal expected((int64_t)-3215607508166160593); + + EXPECT_EQ(expected, result); +} +} // namespace doris \ No newline at end of file diff --git a/be/test/vec/function/function_hash_test.cpp b/be/test/vec/function/function_hash_test.cpp index be22cea4bd..4578181181 100644 --- a/be/test/vec/function/function_hash_test.cpp +++ b/be/test/vec/function/function_hash_test.cpp @@ -55,6 +55,39 @@ TEST(HashFunctionTest, murmur_hash_3_test) { }; } +TEST(HashFunctionTest, murmur_hash_3_64_test) { + std::string func_name = "murmur_hash3_64"; + + { + InputTypeSet input_types = {TypeIndex::String}; + + DataSet data_set = {{{Null()}, Null()}, + {{std::string("hello")}, (int64_t)-3215607508166160593}}; + + check_function<DataTypeInt64, true>(func_name, input_types, data_set); + }; + + { + InputTypeSet input_types = {TypeIndex::String, TypeIndex::String}; + + DataSet data_set = { + {{std::string("hello"), std::string("world")}, (int64_t)3583109472027628045}, + {{std::string("hello"), Null()}, Null()}}; + + check_function<DataTypeInt64, true>(func_name, input_types, data_set); + }; + + { + InputTypeSet input_types = {TypeIndex::String, TypeIndex::String, TypeIndex::String}; + + DataSet data_set = {{{std::string("hello"), std::string("world"), std::string("!")}, + (int64_t)1887828212617890932}, + {{std::string("hello"), std::string("world"), Null()}, Null()}}; + + check_function<DataTypeInt64, true>(func_name, input_types, data_set); + }; +} + TEST(HashFunctionTest, murmur_hash_2_test) { std::string func_name = "murmurHash2_64"; diff --git a/docs/en/docs/sql-manual/sql-functions/hash-functions/murmur_hash3_64.md b/docs/en/docs/sql-manual/sql-functions/hash-functions/murmur_hash3_64.md new file mode 100644 index 0000000000..cd05f72b05 --- /dev/null +++ b/docs/en/docs/sql-manual/sql-functions/hash-functions/murmur_hash3_64.md @@ -0,0 +1,61 @@ +--- +{ + "title": "murmur_hash3_64", + "language": "en" +} +--- + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> + +## murmur_hash3_64 + +### description +#### Syntax + +`BIGINT MURMUR_HASH3_64(VARCHAR input, ...)` + +Return the 64 bits murmur3 hash of input string. + +### example + +``` +mysql> select murmur_hash3_64(null); ++-----------------------+ +| murmur_hash3_64(NULL) | ++-----------------------+ +| NULL | ++-----------------------+ + +mysql> select murmur_hash3_64("hello"); ++--------------------------+ +| murmur_hash3_64('hello') | ++--------------------------+ +| -3215607508166160593 | ++--------------------------+ + +mysql> select murmur_hash3_64("hello", "world"); ++-----------------------------------+ +| murmur_hash3_64('hello', 'world') | ++-----------------------------------+ +| 3583109472027628045 | ++-----------------------------------+ +``` + +### keywords + + MURMUR_HASH3_64,HASH diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/murmur_hash3_64.md b/docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/murmur_hash3_64.md new file mode 100644 index 0000000000..c25861444c --- /dev/null +++ b/docs/zh-CN/docs/sql-manual/sql-functions/hash-functions/murmur_hash3_64.md @@ -0,0 +1,61 @@ +--- +{ + "title": "murmur_hash3_64", + "language": "zh-CN" +} +--- + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> + +## murmur_hash3_64 + +### description +#### Syntax + +`BIGINT MURMUR_HASH3_64(VARCHAR input, ...)` + +返回输入字符串的64位murmur3 hash值 + +### example + +``` +mysql> select murmur_hash3_64(null); ++-----------------------+ +| murmur_hash3_64(NULL) | ++-----------------------+ +| NULL | ++-----------------------+ + +mysql> select murmur_hash3_64("hello"); ++--------------------------+ +| murmur_hash3_64('hello') | ++--------------------------+ +| -3215607508166160593 | ++--------------------------+ + +mysql> select murmur_hash3_64("hello", "world"); ++-----------------------------------+ +| murmur_hash3_64('hello', 'world') | ++-----------------------------------+ +| 3583109472027628045 | ++-----------------------------------+ +``` + +### keywords + + MURMUR_HASH3_64,HASH diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py index ab0353f826..8cce6f2ade 100755 --- a/gensrc/script/doris_builtins_functions.py +++ b/gensrc/script/doris_builtins_functions.py @@ -1245,6 +1245,12 @@ visible_functions = [ [['murmur_hash3_32'], 'INT', ['STRING', '...'], '_ZN5doris13HashFunctions15murmur_hash3_32EPN9doris_udf15FunctionContextEiPKNS1_9StringValE', '', '', 'vec', ''], + [['murmur_hash3_64'], 'BIGINT', ['VARCHAR', '...'], + '_ZN5doris13HashFunctions15murmur_hash3_64EPN9doris_udf15FunctionContextEiPKNS1_9StringValE', + '', '', 'vec', ''], + [['murmur_hash3_64'], 'BIGINT', ['STRING', '...'], + '_ZN5doris13HashFunctions15murmur_hash3_64EPN9doris_udf15FunctionContextEiPKNS1_9StringValE', + '', '', 'vec', ''], # aes and base64 function [['aes_encrypt'], 'VARCHAR', ['VARCHAR', 'VARCHAR'], diff --git a/regression-test/data/query_p0/sql_functions/hash_functions/test_hash_function.out b/regression-test/data/query_p0/sql_functions/hash_functions/test_hash_function.out new file mode 100644 index 0000000000..c7b9485d45 --- /dev/null +++ b/regression-test/data/query_p0/sql_functions/hash_functions/test_hash_function.out @@ -0,0 +1,18 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +\N + +-- !sql -- +1321743225 + +-- !sql -- +984713481 + +-- !sql -- +\N + +-- !sql -- +-3215607508166160593 + +-- !sql -- +3583109472027628045 diff --git a/be/src/exprs/hash_functions.h b/regression-test/suites/query_p0/sql_functions/hash_functions/test_hash_function.groovy similarity index 66% copy from be/src/exprs/hash_functions.h copy to regression-test/suites/query_p0/sql_functions/hash_functions/test_hash_function.groovy index 9fcfb9a7aa..3f2bccaee5 100644 --- a/be/src/exprs/hash_functions.h +++ b/regression-test/suites/query_p0/sql_functions/hash_functions/test_hash_function.groovy @@ -14,22 +14,15 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. +suite("test_hash_function") { + sql "set enable_vectorized_engine = true;" + sql "set batch_size = 4096;" -#pragma once + qt_sql "SELECT murmur_hash3_32(null);" + qt_sql "SELECT murmur_hash3_32(\"hello\");" + qt_sql "SELECT murmur_hash3_32(\"hello\", \"world\");" -namespace doris_udf { -class FunctionContext; -struct IntVal; -struct StringVal; -} // namespace doris_udf - -namespace doris { - -class HashFunctions { -public: - static void init(); - static doris_udf::IntVal murmur_hash3_32(doris_udf::FunctionContext* ctx, int num_children, - const doris_udf::StringVal* inputs); -}; - -} // namespace doris + qt_sql "SELECT murmur_hash3_64(null);" + qt_sql "SELECT murmur_hash3_64(\"hello\");" + qt_sql "SELECT murmur_hash3_64(\"hello\", \"world\");" +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org