This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new dedf15c6df5 [fix])(function) add function regexp_extract_or_null (#38296) dedf15c6df5 is described below commit dedf15c6df57a20b86b2772054449074ae4b1a64 Author: 苏小刚 <suxiaogang...@icloud.com> AuthorDate: Thu Aug 15 10:24:12 2024 +0800 [fix])(function) add function regexp_extract_or_null (#38296) ## Proposed changes Add function regexp_extract_or_null to be compatible with Presto. The function is the same as regexp_extract, except that it returns null when no match is found. --- be/src/vec/functions/function_regexp.cpp | 18 ++++-- be/test/vec/function/function_like_test.cpp | 39 ++++++++++++ .../doris/catalog/BuiltinScalarFunctions.java | 2 + .../functions/scalar/RegexpExtractOrNull.java | 73 ++++++++++++++++++++++ .../expressions/visitor/ScalarFunctionVisitor.java | 5 ++ .../test_string_function_regexp.out | 12 ++++ .../test_string_function_regexp.groovy | 5 ++ 7 files changed, 148 insertions(+), 6 deletions(-) diff --git a/be/src/vec/functions/function_regexp.cpp b/be/src/vec/functions/function_regexp.cpp index 4da133a6f51..525d99b6cc7 100644 --- a/be/src/vec/functions/function_regexp.cpp +++ b/be/src/vec/functions/function_regexp.cpp @@ -184,8 +184,9 @@ struct RegexpReplaceOneImpl { } }; +template <bool ReturnNull> struct RegexpExtractImpl { - static constexpr auto name = "regexp_extract"; + static constexpr auto name = ReturnNull ? "regexp_extract_or_null" : "regexp_extract"; // 3 args static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[], size_t input_rows_count, ColumnString::Chars& result_data, @@ -201,7 +202,8 @@ struct RegexpExtractImpl { } const auto& index_data = index_col->get_int(i); if (index_data < 0) { - StringOP::push_empty_string(i, result_data, result_offset); + ReturnNull ? 
StringOP::push_null_string(i, result_data, result_offset, null_map) + : StringOP::push_empty_string(i, result_data, result_offset); continue; } _execute_inner_loop<false>(context, str_col, pattern_col, index_data, result_data, @@ -220,7 +222,8 @@ struct RegexpExtractImpl { const auto& index_data = index_col->get_int(0); if (index_data < 0) { for (size_t i = 0; i < input_rows_count; ++i) { - StringOP::push_empty_string(i, result_data, result_offset); + ReturnNull ? StringOP::push_null_string(i, result_data, result_offset, null_map) + : StringOP::push_empty_string(i, result_data, result_offset); } return; } @@ -260,7 +263,8 @@ struct RegexpExtractImpl { int max_matches = 1 + re->NumberOfCapturingGroups(); if (index_data >= max_matches) { - StringOP::push_empty_string(index_now, result_data, result_offset); + ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map) + : StringOP::push_empty_string(index_now, result_data, result_offset); return; } @@ -268,7 +272,8 @@ struct RegexpExtractImpl { bool success = re->Match(str_sp, 0, str.size, re2::RE2::UNANCHORED, &matches[0], max_matches); if (!success) { - StringOP::push_empty_string(index_now, result_data, result_offset); + ReturnNull ? 
StringOP::push_null_string(index_now, result_data, result_offset, null_map) + : StringOP::push_empty_string(index_now, result_data, result_offset); return; } const re2::StringPiece& match = matches[index_data]; @@ -486,7 +491,8 @@ public: void register_function_regexp_extract(SimpleFunctionFactory& factory) { factory.register_function<FunctionRegexp<RegexpReplaceImpl>>(); - factory.register_function<FunctionRegexp<RegexpExtractImpl>>(); + factory.register_function<FunctionRegexp<RegexpExtractImpl<true>>>(); + factory.register_function<FunctionRegexp<RegexpExtractImpl<false>>>(); factory.register_function<FunctionRegexp<RegexpReplaceOneImpl>>(); factory.register_function<FunctionRegexp<RegexpExtractAllImpl>>(); } diff --git a/be/test/vec/function/function_like_test.cpp b/be/test/vec/function/function_like_test.cpp index 794bc8c5eae..e39b2cf43b0 100644 --- a/be/test/vec/function/function_like_test.cpp +++ b/be/test/vec/function/function_like_test.cpp @@ -155,6 +155,45 @@ TEST(FunctionLikeTest, regexp_extract) { } } +TEST(FunctionLikeTest, regexp_extract_or_null) { + std::string func_name = "regexp_extract_or_null"; + + DataSet data_set = {{{std::string("x=a3&x=18abc&x=2&y=3&x=4"), + std::string("x=([0-9]+)([a-z]+)"), (int64_t)0}, + std::string("x=18abc")}, + {{std::string("x=a3&x=18abc&x=2&y=3&x=4"), + std::string("^x=([a-z]+)([0-9]+)"), (int64_t)0}, + std::string("x=a3")}, + {{std::string("x=a3&x=18abc&x=2&y=3&x=4"), + std::string("^x=([a-z]+)([0-9]+)"), (int64_t)1}, + std::string("a")}, + {{std::string("http://a.m.baidu.com/i41915173660.htm"), + std::string("i([0-9]+)"), (int64_t)0}, + std::string("i41915173660")}, + {{std::string("http://a.m.baidu.com/i41915173660.htm"), + std::string("i([0-9]+)"), (int64_t)1}, + std::string("41915173660")}, + + {{std::string("hitdecisiondlist"), std::string("(i)(.*?)(e)"), (int64_t)0}, + std::string("itde")}, + {{std::string("hitdecisiondlist"), std::string("(i)(.*?)(e)"), (int64_t)1}, + std::string("i")}, + 
{{std::string("hitdecisiondlist"), std::string("(i)(.*?)(e)"), (int64_t)2}, + std::string("td")}, + // null + {{std::string("abc"), Null(), (int64_t)0}, Null()}, + {{Null(), std::string("i([0-9]+)"), (int64_t)0}, Null()}}; + + // pattern is constant value + InputTypeSet const_pattern_input_types = {TypeIndex::String, Consted {TypeIndex::String}, + TypeIndex::Int64}; + for (const auto& line : data_set) { + DataSet const_pattern_dataset = {line}; + static_cast<void>(check_function<DataTypeString, true>(func_name, const_pattern_input_types, + const_pattern_dataset)); + } +} + TEST(FunctionLikeTest, regexp_extract_all) { std::string func_name = "regexp_extract_all"; diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java index 76c6f066301..cddacdefc53 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java @@ -341,6 +341,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.Random; import org.apache.doris.nereids.trees.expressions.functions.scalar.RandomBytes; import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtract; import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractAll; +import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractOrNull; import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpReplace; import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpReplaceOne; import org.apache.doris.nereids.trees.expressions.functions.scalar.Repeat; @@ -811,6 +812,7 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(Regexp.class, "regexp"), scalar(RegexpExtract.class, "regexp_extract"), scalar(RegexpExtractAll.class, "regexp_extract_all"), + scalar(RegexpExtractOrNull.class, 
"regexp_extract_or_null"), scalar(RegexpReplace.class, "regexp_replace"), scalar(RegexpReplaceOne.class, "regexp_replace_one"), scalar(Repeat.class, "repeat"), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/RegexpExtractOrNull.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/RegexpExtractOrNull.java new file mode 100644 index 00000000000..78f94db338d --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/RegexpExtractOrNull.java @@ -0,0 +1,73 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.trees.expressions.functions.scalar; + +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.AlwaysNullable; +import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; +import org.apache.doris.nereids.trees.expressions.shape.TernaryExpression; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.BigIntType; +import org.apache.doris.nereids.types.StringType; +import org.apache.doris.nereids.types.VarcharType; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * ScalarFunction 'regexp_extract_or_null'. This class is generated by GenerateFunction. + */ +public class RegexpExtractOrNull extends ScalarFunction + implements TernaryExpression, ExplicitlyCastableSignature, AlwaysNullable { + + public static final List<FunctionSignature> SIGNATURES = ImmutableList.of( + FunctionSignature.ret(VarcharType.SYSTEM_DEFAULT) + .args(VarcharType.SYSTEM_DEFAULT, VarcharType.SYSTEM_DEFAULT, BigIntType.INSTANCE), + FunctionSignature.ret(StringType.INSTANCE) + .args(StringType.INSTANCE, StringType.INSTANCE, BigIntType.INSTANCE) + ); + + /** + * constructor with 3 arguments. + */ + public RegexpExtractOrNull(Expression arg0, Expression arg1, Expression arg2) { + super("regexp_extract_or_null", arg0, arg1, arg2); + } + + /** + * withChildren. 
+ */ + @Override + public RegexpExtractOrNull withChildren(List<Expression> children) { + Preconditions.checkArgument(children.size() == 3); + return new RegexpExtractOrNull(children.get(0), children.get(1), children.get(2)); + } + + @Override + public List<FunctionSignature> getSignatures() { + return SIGNATURES; + } + + @Override + public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) { + return visitor.visitRegexpExtractOrNull(this, context); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java index 78d8ca0f701..093bf9f1acf 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java @@ -340,6 +340,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.Random; import org.apache.doris.nereids.trees.expressions.functions.scalar.RandomBytes; import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtract; import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractAll; +import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractOrNull; import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpReplace; import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpReplaceOne; import org.apache.doris.nereids.trees.expressions.functions.scalar.Repeat; @@ -1726,6 +1727,10 @@ public interface ScalarFunctionVisitor<R, C> { return visitScalarFunction(regexpExtractAll, context); } + default R visitRegexpExtractOrNull(RegexpExtractOrNull regexpExtractOrNull, C context) { + return visitScalarFunction(regexpExtractOrNull, context); + } + default R visitRegexpReplace(RegexpReplace regexpReplace, C context) { return 
visitScalarFunction(regexpReplace, context); } diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function_regexp.out b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function_regexp.out index cfe2fd3eaf7..60719fded1a 100644 --- a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function_regexp.out +++ b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function_regexp.out @@ -49,6 +49,18 @@ b -- !sql -- d +-- !sql -- + + +-- !sql -- +b + +-- !sql -- +d + +-- !sql -- +\N + -- !sql -- ['18','17'] diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function_regexp.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function_regexp.groovy index 5926492ac4d..2066452b0d6 100644 --- a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function_regexp.groovy +++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function_regexp.groovy @@ -51,6 +51,11 @@ suite("test_string_function_regexp") { qt_sql "SELECT regexp_extract('AbCdE', '([[:lower:]]+)C([[:lower:]]+)', 1);" qt_sql "SELECT regexp_extract('AbCdE', '([[:lower:]]+)C([[:lower:]]+)', 2);" + qt_sql "SELECT regexp_extract('AbCdE', '([[:lower:]]+)C([[:lower:]]+)', 3);" + + qt_sql "SELECT regexp_extract_or_null('AbCdE', '([[:lower:]]+)C([[:lower:]]+)', 1);" + qt_sql "SELECT regexp_extract_or_null('AbCdE', '([[:lower:]]+)C([[:lower:]]+)', 2);" + qt_sql "SELECT regexp_extract_or_null('AbCdE', '([[:lower:]]+)C([[:lower:]]+)', 3);" qt_sql "SELECT regexp_extract_all('x=a3&x=18abc&x=2&y=3&x=4&x=17bcd', 'x=([0-9]+)([a-z]+)');" qt_sql "SELECT regexp_extract_all('http://a.m.baidu.com/i41915i73660.htm', 'i([0-9]+)');" --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org