This is an automated email from the ASF dual-hosted git repository.
morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new 50603a11596 branch-3.1: [feature](function)support count_substrings
functions #42055 (#55847)
50603a11596 is described below
commit 50603a11596864f9cbee961af8104211044ae9e2
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Wed Sep 10 16:54:42 2025 +0800
branch-3.1: [feature](function)support count_substrings functions #42055
(#55847)
Cherry-picked from #42055
Co-authored-by: zhangstar333
<[email protected]>
---
be/src/vec/functions/function_string.cpp | 1 +
be/src/vec/functions/function_string.h | 116 +++++++++++++++++++++
.../doris/catalog/BuiltinScalarFunctions.java | 2 +
.../functions/scalar/CountSubstring.java | 70 +++++++++++++
.../expressions/visitor/ScalarFunctionVisitor.java | 5 +
gensrc/script/doris_builtins_functions.py | 1 +
.../string_functions/test_count_substrings.out | Bin 0 -> 1550 bytes
.../string_functions/test_count_substrings.groovy | 76 ++++++++++++++
8 files changed, 271 insertions(+)
diff --git a/be/src/vec/functions/function_string.cpp
b/be/src/vec/functions/function_string.cpp
index d1041b7b0ad..351c694d688 100644
--- a/be/src/vec/functions/function_string.cpp
+++ b/be/src/vec/functions/function_string.cpp
@@ -1351,6 +1351,7 @@ void register_function_string(SimpleFunctionFactory&
factory) {
factory.register_function<FunctionFromBase64>();
factory.register_function<FunctionSplitPart>();
factory.register_function<FunctionSplitByString>();
+ factory.register_function<FunctionCountSubString>();
factory.register_function<FunctionSubstringIndex>();
factory.register_function<FunctionExtractURLParameter>();
factory.register_function<FunctionStringParseUrl>();
diff --git a/be/src/vec/functions/function_string.h
b/be/src/vec/functions/function_string.h
index 2e59dce22ef..094fe6a7fbb 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -2453,6 +2453,122 @@ private:
}
};
+/// Scalar function `count_substrings(str, pattern)`.
+///
+/// For each row, counts the non-overlapping occurrences of `pattern` inside `str`, scanning
+/// from left to right (e.g. 'ccc' contains 'cc' once). Matching is byte-wise. An empty `str`
+/// or an empty `pattern` yields 0. The result column is Int32.
+class FunctionCountSubString : public IFunction {
+public:
+    static constexpr auto name = "count_substrings";
+
+    static FunctionPtr create() { return std::make_shared<FunctionCountSubString>(); }
+    using NullMapType = PaddedPODArray<UInt8>;
+
+    String get_name() const override { return name; }
+
+    size_t get_number_of_arguments() const override { return 2; }
+
+    /// Both arguments are asserted (DCHECK) to be string types; the return type is always Int32.
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
+        DCHECK(is_string(arguments[0]))
+                << "first argument for function: " << name << " should be string"
+                << " and arguments[0] is " << arguments[0]->get_name();
+        DCHECK(is_string(arguments[1]))
+                << "second argument for function: " << name << " should be string"
+                << " and arguments[1] is " << arguments[1]->get_name();
+        return std::make_shared<DataTypeInt32>();
+    }
+
+    /// Evaluates the function for `input_rows_count` rows and stores an Int32 column at `result`.
+    ///
+    /// Returns InternalError when either argument column is not a ColumnString after
+    /// unwrapping a possible constant column.
+    Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments,
+                        size_t result, size_t input_rows_count) const override {
+        DCHECK_EQ(arguments.size(), 2);
+        const auto& [src_column, left_const] =
+                unpack_if_const(block.get_by_position(arguments[0]).column);
+        const auto& [right_column, right_const] =
+                unpack_if_const(block.get_by_position(arguments[1]).column);
+
+        const auto* col_left = check_and_get_column<ColumnString>(src_column.get());
+        if (!col_left) {
+            return Status::InternalError("Left operator of function {} can not be {}", get_name(),
+                                         block.get_by_position(arguments[0]).type->get_name());
+        }
+
+        const auto* col_right = check_and_get_column<ColumnString>(right_column.get());
+        if (!col_right) {
+            return Status::InternalError("Right operator of function {} can not be {}", get_name(),
+                                         block.get_by_position(arguments[1]).type->get_name());
+        }
+
+        // Every row starts at 0; the helpers below overwrite each row with its count.
+        auto dest_column_ptr = ColumnInt32::create(input_rows_count, 0);
+        // NOTE(review): when both arguments are constant the first branch indexes the unpacked
+        // source column by row; this presumably relies on the all-constant case being evaluated
+        // before execute_impl is reached -- confirm.
+        if (right_const) {
+            // count_substrings(ColumnString, "xxx")
+            _execute_constant_pattern(*col_left, col_right->get_data_at(0),
+                                      dest_column_ptr->get_data(), input_rows_count);
+        } else if (left_const) {
+            // count_substrings("xxx", ColumnString)
+            _execute_constant_src_string(col_left->get_data_at(0), *col_right,
+                                         dest_column_ptr->get_data(), input_rows_count);
+        } else {
+            // count_substrings(ColumnString, ColumnString)
+            _execute_vector(*col_left, *col_right, dest_column_ptr->get_data(), input_rows_count);
+        }
+
+        block.replace_by_position(result, std::move(dest_column_ptr));
+        return Status::OK();
+    }
+
+private:
+    /// Per-row source string, one shared pattern.
+    void _execute_constant_pattern(const ColumnString& src_column_string,
+                                   const StringRef& pattern_ref,
+                                   ColumnInt32::Container& dest_column_data,
+                                   size_t input_rows_count) const {
+        for (size_t i = 0; i < input_rows_count; ++i) {
+            const StringRef str_ref = src_column_string.get_data_at(i);
+            dest_column_data[i] = find_str_count(str_ref, pattern_ref);
+        }
+    }
+
+    /// Per-row source string and per-row pattern.
+    void _execute_vector(const ColumnString& src_column_string, const ColumnString& pattern_column,
+                         ColumnInt32::Container& dest_column_data, size_t input_rows_count) const {
+        for (size_t i = 0; i < input_rows_count; ++i) {
+            const StringRef pattern_ref = pattern_column.get_data_at(i);
+            const StringRef str_ref = src_column_string.get_data_at(i);
+            dest_column_data[i] = find_str_count(str_ref, pattern_ref);
+        }
+    }
+
+    /// One shared source string, per-row pattern.
+    void _execute_constant_src_string(const StringRef& str_ref, const ColumnString& pattern_col,
+                                      ColumnInt32::Container& dest_column_data,
+                                      size_t input_rows_count) const {
+        for (size_t i = 0; i < input_rows_count; ++i) {
+            const StringRef pattern_ref = pattern_col.get_data_at(i);
+            dest_column_data[i] = find_str_count(str_ref, pattern_ref);
+        }
+    }
+
+    /// Searches `str_ref` for `pattern_ref`, starting at byte offset `pos`.
+    ///
+    /// Returns the distance from `pos` to the first match, or `str_ref.size - pos` when there
+    /// is no match (0 when `pos` is already at or past the end).
+    size_t find_pos(size_t pos, const StringRef str_ref, const StringRef pattern_ref) const {
+        const size_t start = pos;
+        const size_t str_size = str_ref.size;
+        const size_t pattern_size = pattern_ref.size;
+        if (start >= str_size) {
+            return 0;
+        }
+        // Only probe offsets where the whole pattern still fits inside the string. Comparing at
+        // later offsets would read bytes that lie beyond this value and could report a match
+        // that is not part of the string.
+        while (pos + pattern_size <= str_size) {
+            if (memcmp_small_allow_overflow15(str_ref.data + pos, pattern_ref.data,
+                                              pattern_size) == 0) {
+                return pos - start;
+            }
+            ++pos;
+        }
+        return str_size - start; // not found
+    }
+
+    /// Counts non-overlapping occurrences of `pattern_ref` in `str_ref`; 0 if either is empty.
+    int find_str_count(const StringRef str_ref, StringRef pattern_ref) const {
+        if (str_ref.size == 0 || pattern_ref.size == 0) {
+            return 0;
+        }
+        int count = 0;
+        size_t str_pos = 0;
+        while (str_pos < str_ref.size) {
+            const size_t res_pos = find_pos(str_pos, str_ref, pattern_ref);
+            if (res_pos == str_ref.size - str_pos) {
+                break; // not found
+            }
+            ++count;
+            // Resume right after the match so occurrences never overlap.
+            str_pos += res_pos + pattern_ref.size;
+        }
+        return count;
+    }
+};
+
struct SM3Sum {
static constexpr auto name = "sm3sum";
using ObjectData = SM3Digest;
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
index 579f3148dd5..fa72b4e832c 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
@@ -128,6 +128,7 @@ import
org.apache.doris.nereids.trees.expressions.functions.scalar.Cosh;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.CosineDistance;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Cot;
import org.apache.doris.nereids.trees.expressions.functions.scalar.CountEqual;
+import
org.apache.doris.nereids.trees.expressions.functions.scalar.CountSubstring;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Crc32;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.Crc32Internal;
import org.apache.doris.nereids.trees.expressions.functions.scalar.CreateMap;
@@ -609,6 +610,7 @@ public class BuiltinScalarFunctions implements
FunctionHelper {
scalar(Cot.class, "cot"),
scalar(CosineDistance.class, "cosine_distance"),
scalar(CountEqual.class, "countequal"),
+ scalar(CountSubstring.class, "count_substrings"),
scalar(CreateMap.class, "map"),
scalar(CreateStruct.class, "struct"),
scalar(CreateNamedStruct.class, "named_struct"),
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/CountSubstring.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/CountSubstring.java
new file mode 100644
index 00000000000..ce7a43cf94b
--- /dev/null
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/CountSubstring.java
@@ -0,0 +1,70 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import
org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
+import org.apache.doris.nereids.trees.expressions.shape.BinaryExpression;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.IntegerType;
+import org.apache.doris.nereids.types.StringType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * ScalarFunction 'count_substrings'.
+ *
+ * <p>Expression node for the two-argument function {@code count_substrings}. Its only
+ * signature takes two {@code StringType} arguments and returns {@code IntegerType}.
+ * Nullability follows the {@code PropagateNullable} contract (presumably: the result is
+ * nullable when any argument is nullable -- confirm against that interface).
+ */
+public class CountSubstring extends ScalarFunction
+ implements BinaryExpression, ExplicitlyCastableSignature,
PropagateNullable {
+
+ // The single supported signature: (String, String) -> Integer.
+ public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
+ FunctionSignature.ret(IntegerType.INSTANCE)
+ .args(StringType.INSTANCE, StringType.INSTANCE)
+ );
+
+ /**
+ * constructor with 2 arguments.
+ *
+ * @param arg0 first child expression
+ * @param arg1 second child expression
+ */
+ public CountSubstring(Expression arg0, Expression arg1) {
+ super("count_substrings", arg0, arg1);
+ }
+
+ /**
+ * withChildren.
+ *
+ * <p>Builds a new CountSubstring from exactly two children; any other child count fails
+ * the precondition check.
+ */
+ @Override
+ public CountSubstring withChildren(List<Expression> children) {
+ Preconditions.checkArgument(children.size() == 2);
+ return new CountSubstring(children.get(0), children.get(1));
+ }
+
+ // Visitor dispatch: routes this node to visitCountSubstring.
+ @Override
+ public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+ return visitor.visitCountSubstring(this, context);
+ }
+
+ @Override
+ public List<FunctionSignature> getSignatures() {
+ return SIGNATURES;
+ }
+}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
index c1725cd1658..dd364b8be75 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
@@ -136,6 +136,7 @@ import
org.apache.doris.nereids.trees.expressions.functions.scalar.Cosh;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.CosineDistance;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Cot;
import org.apache.doris.nereids.trees.expressions.functions.scalar.CountEqual;
+import
org.apache.doris.nereids.trees.expressions.functions.scalar.CountSubstring;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Crc32;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.Crc32Internal;
import org.apache.doris.nereids.trees.expressions.functions.scalar.CreateMap;
@@ -967,6 +968,10 @@ public interface ScalarFunctionVisitor<R, C> {
return visitScalarFunction(countequal, context);
}
+ // Default visit hook for CountSubstring: delegates to the generic visitScalarFunction.
+ default R visitCountSubstring(CountSubstring countSubstring, C context) {
+ return visitScalarFunction(countSubstring, context);
+ }
+
default R visitCurrentCatalog(CurrentCatalog currentCatalog, C context) {
return visitScalarFunction(currentCatalog, context);
}
diff --git a/gensrc/script/doris_builtins_functions.py
b/gensrc/script/doris_builtins_functions.py
index 3fdaacbfe29..1c83b2d7c66 100644
--- a/gensrc/script/doris_builtins_functions.py
+++ b/gensrc/script/doris_builtins_functions.py
@@ -1675,6 +1675,7 @@ visible_functions = {
[['overlay'], 'VARCHAR', ['VARCHAR', 'INT', 'INT', 'VARCHAR'], ''],
+ [['count_substrings'], 'INT', ['STRING', 'STRING'],
'DEPEND_ON_ARGUMENT'],
[['substr', 'substring'], 'STRING', ['STRING', 'INT'],
'DEPEND_ON_ARGUMENT'],
[['substr', 'substring'], 'STRING', ['STRING', 'INT', 'INT'],
'DEPEND_ON_ARGUMENT'],
[['strleft', 'left'], 'STRING', ['STRING', 'INT'],
'DEPEND_ON_ARGUMENT'],
diff --git
a/regression-test/data/query_p0/sql_functions/string_functions/test_count_substrings.out
b/regression-test/data/query_p0/sql_functions/string_functions/test_count_substrings.out
new file mode 100644
index 00000000000..9bee1363c66
Binary files /dev/null and
b/regression-test/data/query_p0/sql_functions/string_functions/test_count_substrings.out
differ
diff --git
a/regression-test/suites/query_p0/sql_functions/string_functions/test_count_substrings.groovy
b/regression-test/suites/query_p0/sql_functions/string_functions/test_count_substrings.groovy
new file mode 100644
index 00000000000..64051ec7afc
--- /dev/null
+++
b/regression-test/suites/query_p0/sql_functions/string_functions/test_count_substrings.groovy
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Regression suite for count_substrings: constant arguments, an empty table, and
+// nullable / non-nullable column combinations.
+suite("test_count_substrings") {
+ // constant arguments: NULL on either or both sides, ASCII text, multi-byte text,
+ // and a pattern whose candidate matches overlap ('ccc' / 'cc')
+ qt_select1 "select count_substrings(NULL,NULL);"
+ qt_select2 "select count_substrings('a12bc23de345f',NULL);"
+ qt_select3 "select count_substrings(NULL, 'a12bc23de345f');"
+ qt_select4 "select count_substrings('a12bc23de345f','2');"
+ qt_select5 "select count_substrings('a1你你c我你3我d你3你5你','你');"
+ qt_select6 "select count_substrings('ccc','cc');"
+
+ // s1/p1 are nullable columns, s2/p2 are NOT NULL columns
+ sql """DROP TABLE IF EXISTS test_count_substrings"""
+ sql """
+ CREATE TABLE IF NOT EXISTS test_count_substrings (
+ `k1` int(11) NULL COMMENT "",
+ `s1` varchar(30) NULL COMMENT "",
+ `s2` varchar(30) NOT NULL COMMENT "",
+ `p1` varchar(30) NULL COMMENT "",
+ `p2` varchar(30) NOT NULL COMMENT ""
+ ) ENGINE=OLAP
+ DUPLICATE KEY(`k1`)
+ DISTRIBUTED BY HASH(`k1`) BUCKETS 1
+ PROPERTIES (
+ "replication_allocation" = "tag.location.default: 1",
+ "storage_format" = "V2"
+ )
+ """
+ // the table has no rows yet: these queries run before any INSERT
+ qt_select4_empty "select count_substrings(s1,p1) from
test_count_substrings;"
+ qt_select5_empty "select count_substrings(s2,p2) from
test_count_substrings;"
+ qt_select6_empty "select count_substrings(s1,p2) from
test_count_substrings;"
+ qt_select7_empty "select count_substrings(s2,p1) from
test_count_substrings;"
+
+ // rows with normal, empty-string, NULL and multi-byte values
+ sql """ INSERT INTO test_count_substrings VALUES(1, 'abcde', 'abcde', '',
'') """
+ sql """ INSERT INTO test_count_substrings VALUES(2, '', '', '', '') """
+ sql """ INSERT INTO test_count_substrings VALUES(3, '', '','a','a') """
+ sql """ INSERT INTO test_count_substrings VALUES(4, NULL, '', NULL,'') """
+ sql """ INSERT INTO test_count_substrings VALUES(5, 'asdasd',
'asdasd','a','a') """
+ sql """ INSERT INTO test_count_substrings VALUES(6, 'a1b1c1d',
'a1b1c1d','1','1') """
+ sql """ INSERT INTO test_count_substrings VALUES(7, ',,,', ',,,','#','#')
"""
+ sql """ INSERT INTO test_count_substrings VALUES(8, 'a,b,c',
'a,b,c','v','v') """
+ sql """ INSERT INTO test_count_substrings VALUES(9, 'a,b,c,',
'a,b,c',NULL,'') """
+ sql """ INSERT INTO test_count_substrings VALUES(10, NULL, '','asd','asd')
"""
+ sql """ INSERT INTO test_count_substrings VALUES(11, 'a,b,c,12345',
'a,b,c,12345','5','5') """
+ sql """ INSERT INTO test_count_substrings VALUES(12, 'a,b,c,12345',
'a,b,c,12345','a','a') """
+ sql """ INSERT INTO test_count_substrings VALUES(13, 'a,你,你,1我2你4我5',
'a你,你,1我2你4我5','你','我') """
+
+ // column/column: every pairing of a nullable and a NOT NULL source and pattern column
+ qt_select5_null_null "select s1,p1,count_substrings(s1, p1) from
test_count_substrings order by k1;"
+ qt_select6_null_not "select s1, p2,count_substrings(s1, p2) from
test_count_substrings order by k1;"
+ qt_select7_not_null "select s2, p1,count_substrings(s2, p1) from
test_count_substrings order by k1;"
+ qt_select8_not_not "select s2, p2,count_substrings(s2, p2) from
test_count_substrings order by k1;"
+
+ // column/constant and constant/column, with nullable and NOT NULL columns
+ qt_select9_null_const "select s1, 'a',count_substrings(s1, 'a') from
test_count_substrings order by k1;"
+ qt_select10_not_null_const "select s2, 'a',count_substrings(s2, 'a') from
test_count_substrings order by k1;"
+ qt_select11_const_null "select 'a',p1,count_substrings('a', p1) from
test_count_substrings order by k1;"
+ qt_select12_const_not_null "select 'a',p2,count_substrings('a', p2) from
test_count_substrings order by k1;"
+}
+
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]