This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch 2.1-tmp in repository https://gitbox.apache.org/repos/asf/doris.git
commit c61d6ad1e2824aa6c8e8d9ea100042efa0c2e1ff Author: zclllyybb <zhaochan...@selectdb.com> AuthorDate: Tue Apr 2 14:55:07 2024 +0800 [Feature] support function uuid_to_int and int_to_uuid #33005 --- be/src/vec/functions/function_uuid.cpp | 213 +++++++++++++++++++++ be/src/vec/functions/simple_function_factory.h | 4 +- be/test/vec/function/function_string_test.cpp | 30 +++ .../doris/catalog/BuiltinScalarFunctions.java | 4 + .../expressions/functions/scalar/InttoUuid.java | 68 +++++++ .../expressions/functions/scalar/UuidtoInt.java | 70 +++++++ .../expressions/visitor/ScalarFunctionVisitor.java | 10 + gensrc/script/doris_builtins_functions.py | 5 +- 8 files changed, 401 insertions(+), 3 deletions(-) diff --git a/be/src/vec/functions/function_uuid.cpp b/be/src/vec/functions/function_uuid.cpp new file mode 100644 index 00000000000..cee5fd7a363 --- /dev/null +++ b/be/src/vec/functions/function_uuid.cpp @@ -0,0 +1,213 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include <cctype> +#include <cstddef> +#include <cstring> +#include <memory> +#include <utility> + +#include "common/status.h" +#include "vec/aggregate_functions/aggregate_function.h" +#include "vec/columns/column.h" +#include "vec/columns/column_nullable.h" +#include "vec/columns/column_string.h" +#include "vec/columns/column_vector.h" +#include "vec/columns/columns_number.h" +#include "vec/common/assert_cast.h" +#include "vec/core/block.h" +#include "vec/core/column_numbers.h" +#include "vec/core/column_with_type_and_name.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_nullable.h" +#include "vec/data_types/data_type_number.h" +#include "vec/data_types/data_type_string.h" +#include "vec/functions/function.h" +#include "vec/functions/simple_function_factory.h" + +namespace doris { +class FunctionContext; +} // namespace doris + +namespace doris::vectorized { +constexpr static std::array<int, 5> SPLIT_POS = {8, 13, 18, 23, 36}; // 8-4-4-4-12 +constexpr static char DELIMITER = '-'; + +class FunctionUuidtoInt : public IFunction { +public: + static constexpr auto name = "uuid_to_int"; + + static FunctionPtr create() { return std::make_shared<FunctionUuidtoInt>(); } + + String get_name() const override { return name; } + + size_t get_number_of_arguments() const override { return 1; } + + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { + return make_nullable(std::make_shared<DataTypeInt128>()); + } + + Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result, size_t input_rows_count) const override { + const auto& arg_column = + assert_cast<const ColumnString&>(*block.get_by_position(arguments[0]).column); + + auto result_column = ColumnInt128::create(input_rows_count); + auto& result_data = result_column->get_data(); + auto null_column = ColumnUInt8::create(input_rows_count); + auto& null_map = null_column->get_data(); + + for 
(int row = 0; row < input_rows_count; row++) { + auto str = arg_column.get_data_at(row); + const auto* data = str.data; + Int128* result_cell = &result_data[row]; + *result_cell = 0; + null_map[row] = false; + + if (str.size == 36) { + if (data[SPLIT_POS[0]] != DELIMITER || data[SPLIT_POS[1]] != DELIMITER || + data[SPLIT_POS[2]] != DELIMITER || data[SPLIT_POS[3]] != DELIMITER) { + null_map[row] = true; + continue; + } + char new_data[32]; + memset(new_data, 0, sizeof(new_data)); + // ignore '-' + memcpy(new_data, data, 8); + memcpy(new_data + 8, data + SPLIT_POS[0] + 1, 4); + memcpy(new_data + 12, data + SPLIT_POS[1] + 1, 4); + memcpy(new_data + 16, data + SPLIT_POS[2] + 1, 4); + memcpy(new_data + 20, data + SPLIT_POS[3] + 1, 12); + + if (!serialize(new_data, (char*)result_cell, 32)) { + null_map[row] = true; + continue; + } + } else if (str.size == 32) { + if (!serialize(data, (char*)result_cell, 32)) { + null_map[row] = true; + continue; + } + } else { + null_map[row] = true; + continue; + } + } + + block.replace_by_position( + result, ColumnNullable::create(std::move(result_column), std::move(null_column))); + return Status::OK(); + } + + // use char* to write dst is the only legal way by 'restrict aliasing rule' + static bool serialize(const char* __restrict src, char* __restrict dst, size_t length) { + char target; // 8bit, contains 2 char input + auto translate = [&target](const char ch) { + if (isdigit(ch)) { + target += ch - '0'; + } else if (ch >= 'a' && ch <= 'f') { + target += ch - 'a' + 10; + } else if (ch >= 'A' && ch <= 'F') { + target += ch - 'A' + 10; + } else { + return false; + } + return true; + }; + + bool ok = true; + for (size_t i = 0; i < length; i += 2, src++, dst++) { + target = 0; + if (!translate(*src)) { + ok = false; // dont break for auto-simd + } + + src++; + target <<= 4; + if (!translate(*src)) { + ok = false; + } + *dst = target; + } + + return ok; + } +}; + +class FunctionInttoUuid : public IFunction { +public: + static constexpr 
auto name = "int_to_uuid"; + + static FunctionPtr create() { return std::make_shared<FunctionInttoUuid>(); } + + String get_name() const override { return name; } + + size_t get_number_of_arguments() const override { return 1; } + + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { + return std::make_shared<DataTypeString>(); + } + + Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result, size_t input_rows_count) const override { + const auto& arg_column = + assert_cast<const ColumnInt128&>(*block.get_by_position(arguments[0]).column); + auto result_column = ColumnString::create(); + constexpr int str_length = 36; + auto& col_data = result_column->get_chars(); + auto& col_offset = result_column->get_offsets(); + col_data.resize(str_length * input_rows_count + + 1); // for branchless deserialize, we occupy one more byte for the last '-' + col_offset.resize(input_rows_count); + + for (int row = 0; row < input_rows_count; row++) { + const Int128* arg = &arg_column.get_data()[row]; + col_offset[row] = col_offset[row - 1] + str_length; + deserialize((char*)arg, col_data.data() + str_length * row); + } + block.replace_by_position(result, std::move(result_column)); + return Status::OK(); + } + + // use char* to read src is the only legal way by 'restrict aliasing rule' + static void deserialize(const char* __restrict src, unsigned char* __restrict dst) { + auto transform = [](char ch) -> unsigned char { + if (ch < 10) { + return ch + '0'; + } else { + return ch - 10 + 'a'; + } + }; + + int j = 0; + for (int i : SPLIT_POS) { + for (; j < i; src++, j += 2) { // input 16 chars, 2 data per char + dst[j] = transform(((*src) >> 4) & 0x0F); + dst[j + 1] = transform(*src & 0x0F); + } + dst[j++] = DELIMITER; // we resized one more byte. 
+ } + } +}; + +void register_function_uuid_transforms(SimpleFunctionFactory& factory) { + factory.register_function<FunctionUuidtoInt>(); + factory.register_function<FunctionInttoUuid>(); +} + +} // namespace doris::vectorized diff --git a/be/src/vec/functions/simple_function_factory.h b/be/src/vec/functions/simple_function_factory.h index a18b0beb8db..649db732093 100644 --- a/be/src/vec/functions/simple_function_factory.h +++ b/be/src/vec/functions/simple_function_factory.h @@ -24,8 +24,6 @@ #include <string> #include "agent/be_exec_version_manager.h" -#include "udf/udf.h" -#include "vec/exprs/table_function/table_function.h" #include "vec/functions/function.h" namespace doris::vectorized { @@ -81,6 +79,7 @@ void register_function_regexp(SimpleFunctionFactory& factory); void register_function_random(SimpleFunctionFactory& factory); void register_function_uuid(SimpleFunctionFactory& factory); void register_function_uuid_numeric(SimpleFunctionFactory& factory); +void register_function_uuid_transforms(SimpleFunctionFactory& factory); void register_function_coalesce(SimpleFunctionFactory& factory); void register_function_grouping(SimpleFunctionFactory& factory); void register_function_datetime_floor_ceil(SimpleFunctionFactory& factory); @@ -265,6 +264,7 @@ public: register_function_random(instance); register_function_uuid(instance); register_function_uuid_numeric(instance); + register_function_uuid_transforms(instance); register_function_coalesce(instance); register_function_grouping(instance); register_function_datetime_floor_ceil(instance); diff --git a/be/test/vec/function/function_string_test.cpp b/be/test/vec/function/function_string_test.cpp index 612a6fff0cc..d8d1a57b8eb 100644 --- a/be/test/vec/function/function_string_test.cpp +++ b/be/test/vec/function/function_string_test.cpp @@ -17,6 +17,7 @@ #include <stdint.h> +#include <cstdint> #include <cstring> #include <memory> #include <string> @@ -1157,4 +1158,33 @@ TEST(function_string_test, 
function_bit_length_test) {
     static_cast<void>(check_function<DataTypeInt32, true>(func_name, input_types, data_set));
 }
 
+TEST(function_string_test, function_uuid_test) { // table-driven cases for uuid_to_int, then int_to_uuid
+    {
+        std::string func_name = "uuid_to_int";
+        InputTypeSet input_types = {TypeIndex::String};
+        uint64_t high = 9572195551486940809ULL;
+        uint64_t low = 1759290071393952876ULL;
+        __int128 result = (__int128)high * (__int128)10000000000000000000ULL + (__int128)low; // 128-bit expected value built from two decimal halves
+        DataSet data_set = {{{Null()}, Null()},
+                            {{std::string("6ce4766f-6783-4b30-b357-bba1c7600348")}, result},
+                            {{std::string("6ce4766f67834b30b357bba1c7600348")}, result}, // same digits without '-': same expected value
+                            {{std::string("ffffffff-ffff-ffff-ffff-ffffffffffff")}, (__int128)-1},
+                            {{std::string("00000000-0000-0000-0000-000000000000")}, (__int128)0},
+                            {{std::string("123")}, Null()}}; // a 3-char input is expected to yield NULL
+        static_cast<void>(check_function<DataTypeInt128, true>(func_name, input_types, data_set));
+    }
+    {
+        std::string func_name = "int_to_uuid";
+        InputTypeSet input_types = {TypeIndex::Int128};
+        uint64_t high = 9572195551486940809ULL;
+        uint64_t low = 1759290071393952876ULL;
+        __int128 value = (__int128)high * (__int128)10000000000000000000ULL + (__int128)low; // same 128-bit value as in the uuid_to_int case
+        DataSet data_set = {{{Null()}, Null()},
+                            {{value}, std::string("6ce4766f-6783-4b30-b357-bba1c7600348")},
+                            {{(__int128)-1}, std::string("ffffffff-ffff-ffff-ffff-ffffffffffff")},
+                            {{(__int128)0}, std::string("00000000-0000-0000-0000-000000000000")}};
+        static_cast<void>(check_function<DataTypeString, true>(func_name, input_types, data_set));
+    }
+}
+
 } // namespace doris::vectorized
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
index 9d19ea9e2e9..d28cb751eaa 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
@@ -198,6 +198,7 @@ import
org.apache.doris.nereids.trees.expressions.functions.scalar.Ignore; import org.apache.doris.nereids.trees.expressions.functions.scalar.Initcap; import org.apache.doris.nereids.trees.expressions.functions.scalar.InnerProduct; import org.apache.doris.nereids.trees.expressions.functions.scalar.Instr; +import org.apache.doris.nereids.trees.expressions.functions.scalar.InttoUuid; import org.apache.doris.nereids.trees.expressions.functions.scalar.Ipv4CIDRToRange; import org.apache.doris.nereids.trees.expressions.functions.scalar.Ipv4NumToString; import org.apache.doris.nereids.trees.expressions.functions.scalar.Ipv4StringToNum; @@ -417,6 +418,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.User; import org.apache.doris.nereids.trees.expressions.functions.scalar.UtcTimestamp; import org.apache.doris.nereids.trees.expressions.functions.scalar.Uuid; import org.apache.doris.nereids.trees.expressions.functions.scalar.UuidNumeric; +import org.apache.doris.nereids.trees.expressions.functions.scalar.UuidtoInt; import org.apache.doris.nereids.trees.expressions.functions.scalar.Version; import org.apache.doris.nereids.trees.expressions.functions.scalar.Week; import org.apache.doris.nereids.trees.expressions.functions.scalar.WeekCeil; @@ -625,6 +627,7 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(Initcap.class, "initcap"), scalar(InnerProduct.class, "inner_product"), scalar(Instr.class, "instr"), + scalar(InttoUuid.class, "int_to_uuid"), scalar(Ipv4NumToString.class, "ipv4_num_to_string", "inet_ntoa"), scalar(Ipv4StringToNum.class, "ipv4_string_to_num"), scalar(Ipv4StringToNumOrDefault.class, "ipv4_string_to_num_or_default"), @@ -870,6 +873,7 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(UtcTimestamp.class, "utc_timestamp"), scalar(Uuid.class, "uuid"), scalar(UuidNumeric.class, "uuid_numeric"), + scalar(UuidtoInt.class, "uuid_to_int"), scalar(Version.class, "version"), scalar(Week.class, "week"), 
scalar(WeekCeil.class, "week_ceil"), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/InttoUuid.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/InttoUuid.java new file mode 100644 index 00000000000..d3434eff35b --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/InttoUuid.java @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
+import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.LargeIntType;
+import org.apache.doris.nereids.types.VarcharType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * ScalarFunction 'int_to_uuid'. Its single signature takes one LARGEINT argument and returns VARCHAR.
+ */
+public class InttoUuid extends ScalarFunction
+        implements UnaryExpression, ExplicitlyCastableSignature, PropagateNullable {
+
+    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
+            FunctionSignature.ret(VarcharType.SYSTEM_DEFAULT).args(LargeIntType.INSTANCE));
+
+    /**
+     * Constructor with 1 argument. Passes the function name "int_to_uuid" and the argument to ScalarFunction.
+     */
+    public InttoUuid(Expression arg) {
+        super("int_to_uuid", arg);
+    }
+
+    /**
+     * withChildren. Builds a new InttoUuid over the given children, which must contain exactly one expression.
+     */
+    @Override
+    public InttoUuid withChildren(List<Expression> children) {
+        Preconditions.checkArgument(children.size() == 1);
+        return new InttoUuid(children.get(0));
+    }
+
+    @Override
+    public List<FunctionSignature> getSignatures() {
+        return SIGNATURES;
+    }
+
+    @Override
+    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+        return visitor.visitInttoUuid(this, context);
+    }
+}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/UuidtoInt.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/UuidtoInt.java
new file mode 100644
index 00000000000..987a8b1d06e
--- /dev/null
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/UuidtoInt.java
@@ -0,0 +1,70 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import org.apache.doris.nereids.trees.expressions.functions.AlwaysNullable;
+import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.LargeIntType;
+import org.apache.doris.nereids.types.StringType;
+import org.apache.doris.nereids.types.VarcharType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * ScalarFunction 'uuid_to_int'. Its signatures take one VARCHAR or STRING argument and return LARGEINT.
+ */
+public class UuidtoInt extends ScalarFunction
+        implements UnaryExpression, ExplicitlyCastableSignature, AlwaysNullable {
+
+    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
+            FunctionSignature.ret(LargeIntType.INSTANCE).args(VarcharType.SYSTEM_DEFAULT),
+            FunctionSignature.ret(LargeIntType.INSTANCE).args(StringType.INSTANCE));
+
+    /**
+     * Constructor with 1 argument. Passes the function name "uuid_to_int" and the argument to ScalarFunction.
+     */
+    public UuidtoInt(Expression arg) {
+        super("uuid_to_int", arg);
+    }
+
+    /**
+     * withChildren. Builds a new UuidtoInt over the given children, which must contain exactly one expression.
+     */
+    @Override
+    public UuidtoInt withChildren(List<Expression> children) {
+        Preconditions.checkArgument(children.size() == 1);
+        return new UuidtoInt(children.get(0));
+    }
+
+    @Override
+    public List<FunctionSignature> getSignatures() {
+        return SIGNATURES;
+    }
+
+    @Override
+    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+        return visitor.visitUuidtoInt(this, context);
+    }
+}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
index 7cef47557cc..83a4a2aa027 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
@@ -201,6 +201,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.Ignore;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Initcap;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.InnerProduct;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Instr;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.InttoUuid;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Ipv4CIDRToRange;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Ipv4NumToString;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Ipv4StringToNum;
@@ -414,6 +415,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.User;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.UtcTimestamp;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Uuid;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.UuidNumeric;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.UuidtoInt;
 import
org.apache.doris.nereids.trees.expressions.functions.scalar.Version;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Week;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.WeekCeil;
@@ -2001,6 +2003,14 @@ public interface ScalarFunctionVisitor<R, C> {
         return visitScalarFunction(uuidNumeric, context);
     }
 
+    default R visitUuidtoInt(UuidtoInt uuidtoInt, C context) { // default: delegate to visitScalarFunction
+        return visitScalarFunction(uuidtoInt, context);
+    }
+
+    default R visitInttoUuid(InttoUuid inttoUuid, C context) { // default: delegate to visitScalarFunction
+        return visitScalarFunction(inttoUuid, context);
+    }
+
     default R visitVersion(Version version, C context) {
         return visitScalarFunction(version, context);
     }
diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py
index d97c499f2da..3d87ab86fd2 100644
--- a/gensrc/script/doris_builtins_functions.py
+++ b/gensrc/script/doris_builtins_functions.py
@@ -2012,7 +2012,10 @@ visible_functions = {
 
     "UUID": [
         [['uuid'], 'VARCHAR', [], 'ALWAYS_NOT_NULLABLE'],
-        [['uuid_numeric'], 'LARGEINT', [], 'ALWAYS_NOT_NULLABLE']
+        [['uuid_numeric'], 'LARGEINT', [], 'ALWAYS_NOT_NULLABLE'],
+        [['uuid_to_int'], 'LARGEINT', ['VARCHAR'], 'ALWAYS_NULLABLE'],
+        [['uuid_to_int'], 'LARGEINT', ['STRING'], 'ALWAYS_NULLABLE'],
+        [['int_to_uuid'], 'VARCHAR', ['LARGEINT'], 'DEPEND_ON_ARGUMENT']
     ],
 
     #ip functions
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org