This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch branch-c108335-hive-sql in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-c108335-hive-sql by this push: new 3d8a79efc7b Impl substr from zero (#49196) 3d8a79efc7b is described below commit 3d8a79efc7b9e5ef2074db5d03b5594384811a66 Author: Socrates <suyit...@selectdb.com> AuthorDate: Tue Mar 18 15:55:58 2025 +0800 Impl substr from zero (#49196) --- be/src/vec/exec/scan/file_scanner.cpp | 2 +- be/src/vec/functions/function_string.cpp | 9 +- be/src/vec/functions/function_string.h | 35 +++++-- .../sink/writer/iceberg/partition_transformers.h | 2 +- .../doris/catalog/BuiltinScalarFunctions.java | 8 +- .../functions/scalar/SubstringForZero.java | 113 +++++++++++++++++++++ .../expressions/visitor/ScalarFunctionVisitor.java | 5 + 7 files changed, 158 insertions(+), 16 deletions(-) diff --git a/be/src/vec/exec/scan/file_scanner.cpp b/be/src/vec/exec/scan/file_scanner.cpp index 62e03e3dc5d..21e0ea2f195 100644 --- a/be/src/vec/exec/scan/file_scanner.cpp +++ b/be/src/vec/exec/scan/file_scanner.cpp @@ -845,7 +845,7 @@ void FileScanner::_truncate_char_or_varchar_column(Block* block, int idx, int le temp_arguments[2] = num_columns_without_result + 1; // len size_t result_column_id = num_columns_without_result + 2; - SubstringUtil::substring_execute(*block, temp_arguments, result_column_id, block->rows()); + SubstringUtil<>::substring_execute(*block, temp_arguments, result_column_id, block->rows()); auto res = ColumnNullable::create(block->get_by_position(result_column_id).column, null_map_column_ptr); block->replace_by_position(idx, std::move(res)); diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp index d87eaaa3dde..be166baa5f1 100644 --- a/be/src/vec/functions/function_string.cpp +++ b/be/src/vec/functions/function_string.cpp @@ -1195,8 +1195,10 @@ void register_function_string(SimpleFunctionFactory& factory) { factory.register_function<FunctionTrim<Trim2Impl<true, false, NameLTrimIn>>>(); factory.register_function<FunctionTrim<Trim2Impl<false, true, NameRTrimIn>>>(); factory.register_function<FunctionConvertTo>(); - factory.register_function<FunctionSubstring<Substr3Impl>>(); - factory.register_function<FunctionSubstring<Substr2Impl>>(); + factory.register_function<FunctionSubstring<Substr3Impl<false>>>(); + factory.register_function<FunctionSubstring<Substr2Impl<false>>>(); + factory.register_function<FunctionSubstring<Substr3Impl<true>>>(); + factory.register_function<FunctionSubstring<Substr2Impl<true>>>(); factory.register_function<FunctionLeft>(); factory.register_function<FunctionRight>(); factory.register_function<FunctionNullOrEmpty>(); @@ -1242,7 +1244,8 @@ void register_function_string(SimpleFunctionFactory& factory) { factory.register_alias(FunctionLeft::name, "strleft"); factory.register_alias(FunctionRight::name, "strright"); - factory.register_alias(SubstringUtil::name, "substr"); + factory.register_alias(SubstringUtil<>::name, "substr"); + factory.register_alias(SubstringUtil<true>::name, "substr_for_zero"); factory.register_alias(FunctionToLower::name, "lcase"); factory.register_alias(FunctionToUpper::name, "ucase"); factory.register_alias(FunctionStringDigestOneArg<MD5Sum>::name, "md5"); diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index 3b909f4a8d5..b38ca7c50d9 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -162,8 +162,9 @@ struct StringOP { } }; +template <bool is_for_zero = false> struct SubstringUtil { - static constexpr auto name = "substring"; + static constexpr auto name = is_for_zero ? "substring_for_zero" : "substring"; static void substring_execute(Block& block, const ColumnNumbers& arguments, uint32_t result, size_t input_rows_count) { @@ -220,7 +221,7 @@ private: PMR::vector<size_t> index {&pool}; if constexpr (is_const) { - if (start[0] == 0 || len[0] <= 0) { + if ((!is_for_zero && start[0] == 0) || len[0] <= 0) { for (size_t i = 0; i < size; ++i) { StringOP::push_empty_string(i, res_chars, res_offsets); } @@ -237,11 +238,17 @@ private: int char_len = simd::VStringFunctions::get_char_len(str_data, str_size); // return empty string if start > src.length // Here, start_value is compared against the length of the character. - if (start_value > char_len || str_size == 0 || start_value == 0 || len_value <= 0) { + if (start_value > char_len || str_size == 0 || (!is_for_zero && start_value == 0) || + len_value <= 0) { StringOP::push_empty_string(i, res_chars, res_offsets); continue; } + // Handle Hive compatibility mode - treat start=0 as start=1 + if (is_for_zero && start_value == 0) { + start_value = 1; + } + size_t byte_pos = 0; index.clear(); for (size_t j = 0, char_size = 0; j < str_size; j += char_size) { @@ -287,7 +294,7 @@ private: res_offsets.resize(size); if constexpr (is_const) { - if (start[0] == 0 || len[0] <= 0) { + if ((!is_for_zero && start[0] == 0) || len[0] <= 0) { for (size_t i = 0; i < size; ++i) { StringOP::push_empty_string(i, res_chars, res_offsets); } @@ -305,6 +312,11 @@ private: int start_value = is_const ? start[0] : start[i]; int len_value = is_const ? len[0] : len[i]; + // Handle Hive compatibility mode - treat start=0 as start=1 + if (is_for_zero && start_value == 0) { + start_value = 1; + } + if (start_value > str_size || start_value < -str_size || str_size == 0 || len_value <= 0) { StringOP::push_empty_string(i, res_chars, res_offsets); @@ -616,7 +628,7 @@ private: template <typename Impl> class FunctionSubstring : public IFunction { public: - static constexpr auto name = SubstringUtil::name; + static constexpr auto name = Impl::name; String get_name() const override { return name; } static FunctionPtr create() { return std::make_shared<FunctionSubstring<Impl>>(); } @@ -636,7 +648,9 @@ public: } }; +template <bool is_for_zero = false> struct Substr3Impl { + static constexpr auto name = SubstringUtil<is_for_zero>::name; static DataTypes get_variadic_argument_types() { return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeInt32>(), std::make_shared<DataTypeInt32>()}; @@ -645,12 +659,14 @@ struct Substr3Impl { static Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, uint32_t result, size_t input_rows_count) { - SubstringUtil::substring_execute(block, arguments, result, input_rows_count); + SubstringUtil<is_for_zero>::substring_execute(block, arguments, result, input_rows_count); return Status::OK(); } }; +template <bool is_for_zero = false> struct Substr2Impl { + static constexpr auto name = SubstringUtil<is_for_zero>::name; static DataTypes get_variadic_argument_types() { return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeInt32>()}; } @@ -679,7 +695,8 @@ struct Substr2Impl { block.insert({std::move(col_len), std::make_shared<DataTypeInt32>(), "strlen"}); ColumnNumbers temp_arguments = {arguments[0], arguments[1], block.columns() - 1}; - SubstringUtil::substring_execute(block, temp_arguments, result, input_rows_count); + SubstringUtil<is_for_zero>::substring_execute(block, temp_arguments, result, + input_rows_count); return Status::OK(); } }; @@ -891,7 +908,7 @@ public: temp_arguments[1] = num_columns_without_result; temp_arguments[2] = arguments[1]; - SubstringUtil::substring_execute(block, temp_arguments, result, input_rows_count); + SubstringUtil<false>::substring_execute(block, temp_arguments, result, input_rows_count); return Status::OK(); } }; @@ -940,7 +957,7 @@ public: temp_arguments[0] = arguments[0]; temp_arguments[1] = num_columns_without_result; temp_arguments[2] = num_columns_without_result + 1; - SubstringUtil::substring_execute(block, temp_arguments, result, input_rows_count); + SubstringUtil<>::substring_execute(block, temp_arguments, result, input_rows_count); return Status::OK(); } }; diff --git a/be/src/vec/sink/writer/iceberg/partition_transformers.h b/be/src/vec/sink/writer/iceberg/partition_transformers.h index 0b18ce24952..cd1cfa6f94c 100644 --- a/be/src/vec/sink/writer/iceberg/partition_transformers.h +++ b/be/src/vec/sink/writer/iceberg/partition_transformers.h @@ -177,7 +177,7 @@ public: temp_arguments[2] = 2; // width uint32_t result_column_id = 3; - SubstringUtil::substring_execute(temp_block, temp_arguments, result_column_id, + SubstringUtil<>::substring_execute(temp_block, temp_arguments, result_column_id, temp_block.rows()); if (is_nullable) { auto res_column = ColumnNullable::create( diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java index c0409ae3489..0445cc3155e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java @@ -426,6 +426,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.StructElement import org.apache.doris.nereids.trees.expressions.functions.scalar.SubBitmap; import org.apache.doris.nereids.trees.expressions.functions.scalar.SubReplace; import org.apache.doris.nereids.trees.expressions.functions.scalar.Substring; +import org.apache.doris.nereids.trees.expressions.functions.scalar.SubstringForZero; import org.apache.doris.nereids.trees.expressions.functions.scalar.SubstringIndex; import org.apache.doris.nereids.trees.expressions.functions.scalar.Tan; import org.apache.doris.nereids.trees.expressions.functions.scalar.Tanh; @@ -493,7 +494,8 @@ import java.util.List; /** * Builtin scalar functions. * <p> - * Note: Please ensure that this class only has some lists and no procedural code. + * Note: Please ensure that this class only has some lists and no procedural + * code. * It helps to be clear and concise. */ public class BuiltinScalarFunctions implements FunctionHelper { @@ -923,6 +925,7 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(SubBitmap.class, "sub_bitmap"), scalar(SubReplace.class, "sub_replace"), scalar(Substring.class, "substr", "substring"), + scalar(SubstringForZero.class, "substr_for_zero", "substring_for_zero"), scalar(SubstringIndex.class, "substring_index"), scalar(Tan.class, "tan"), scalar(Tanh.class, "tanh"), @@ -991,5 +994,6 @@ public class BuiltinScalarFunctions implements FunctionHelper { public static final BuiltinScalarFunctions INSTANCE = new BuiltinScalarFunctions(); // Note: Do not add any code here! - private BuiltinScalarFunctions() {} + private BuiltinScalarFunctions() { + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SubstringForZero.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SubstringForZero.java new file mode 100644 index 00000000000..6c4f2994837 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SubstringForZero.java @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.expressions.functions.scalar; + +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; +import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable; +import org.apache.doris.nereids.trees.expressions.literal.IntegerLiteral; +import org.apache.doris.nereids.trees.expressions.literal.Literal; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.DataType; +import org.apache.doris.nereids.types.IntegerType; +import org.apache.doris.nereids.types.StringType; +import org.apache.doris.nereids.types.VarcharType; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.List; +import java.util.Optional; + +/** + * ScalarFunction 'substring_for_zero'. For compatibility with Hive. + */ +public class SubstringForZero extends ScalarFunction + implements ExplicitlyCastableSignature, PropagateNullable { + + public static final List<FunctionSignature> SIGNATURES = ImmutableList.of( + FunctionSignature.ret(VarcharType.SYSTEM_DEFAULT).args(VarcharType.SYSTEM_DEFAULT, IntegerType.INSTANCE), + FunctionSignature.ret(StringType.INSTANCE).args(StringType.INSTANCE, IntegerType.INSTANCE), + FunctionSignature.ret(VarcharType.SYSTEM_DEFAULT) + .args(VarcharType.SYSTEM_DEFAULT, IntegerType.INSTANCE, IntegerType.INSTANCE), + FunctionSignature.ret(StringType.INSTANCE) + .args(StringType.INSTANCE, IntegerType.INSTANCE, IntegerType.INSTANCE)); + + /** + * constructor with 2 arguments. + */ + public SubstringForZero(Expression arg0, Expression arg1) { + super("substring_for_zero", arg0, arg1, Literal.of(Integer.MAX_VALUE)); + } + + /** + * constructor with 3 arguments. + */ + public SubstringForZero(Expression arg0, Expression arg1, Expression arg2) { + super("substring_for_zero", arg0, arg1, arg2); + } + + @Override + public FunctionSignature computeSignature(FunctionSignature signature) { + Optional<Expression> length = arity() == 3 + ? Optional.of(getArgument(2)) + : Optional.empty(); + DataType returnType = VarcharType.SYSTEM_DEFAULT; + if (length.isPresent() && length.get() instanceof IntegerLiteral) { + returnType = VarcharType.createVarcharType(((IntegerLiteral) length.get()).getValue()); + } + return signature.withReturnType(returnType); + } + + public Expression getSource() { + return child(0); + } + + public Expression getPosition() { + return child(1); + } + + public Optional<Expression> getLength() { + return arity() == 3 ? Optional.of(child(2)) : Optional.empty(); + } + + /** + * withChildren. + */ + @Override + public SubstringForZero withChildren(List<Expression> children) { + Preconditions.checkArgument(children.size() == 2 + || children.size() == 3); + if (children.size() == 2) { + return new SubstringForZero(children.get(0), children.get(1)); + } else { + return new SubstringForZero(children.get(0), children.get(1), children.get(2)); + } + } + + @Override + public List<FunctionSignature> getSignatures() { + return SIGNATURES; + } + + @Override + public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) { + return visitor.visitSubstringForZero(this, context); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java index af8740047e6..02aa54e17b2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java @@ -424,6 +424,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.StructElement import org.apache.doris.nereids.trees.expressions.functions.scalar.SubBitmap; import org.apache.doris.nereids.trees.expressions.functions.scalar.SubReplace; import org.apache.doris.nereids.trees.expressions.functions.scalar.Substring; +import org.apache.doris.nereids.trees.expressions.functions.scalar.SubstringForZero; import org.apache.doris.nereids.trees.expressions.functions.scalar.SubstringIndex; import org.apache.doris.nereids.trees.expressions.functions.scalar.Tan; import org.apache.doris.nereids.trees.expressions.functions.scalar.Tanh; @@ -2071,6 +2072,10 @@ public interface ScalarFunctionVisitor<R, C> { return visitScalarFunction(substring, context); } + default R visitSubstringForZero(SubstringForZero substringForZero, C context) { + return visitScalarFunction(substringForZero, context); + } + default R visitSubstringIndex(SubstringIndex substringIndex, C context) { return visitScalarFunction(substringIndex, context); } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org