This is an automated email from the ASF dual-hosted git repository.

morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.1 by this push:
     new 50603a11596 branch-3.1: [feature](function)support count_substrings 
functions #42055 (#55847)
50603a11596 is described below

commit 50603a11596864f9cbee961af8104211044ae9e2
Author: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Wed Sep 10 16:54:42 2025 +0800

    branch-3.1: [feature](function)support count_substrings functions #42055 
(#55847)
    
    Cherry-picked from #42055
    
    Co-authored-by: zhangstar333 
<[email protected]>
---
 be/src/vec/functions/function_string.cpp           |   1 +
 be/src/vec/functions/function_string.h             | 116 +++++++++++++++++++++
 .../doris/catalog/BuiltinScalarFunctions.java      |   2 +
 .../functions/scalar/CountSubstring.java           |  70 +++++++++++++
 .../expressions/visitor/ScalarFunctionVisitor.java |   5 +
 gensrc/script/doris_builtins_functions.py          |   1 +
 .../string_functions/test_count_substrings.out     | Bin 0 -> 1550 bytes
 .../string_functions/test_count_substrings.groovy  |  76 ++++++++++++++
 8 files changed, 271 insertions(+)

diff --git a/be/src/vec/functions/function_string.cpp 
b/be/src/vec/functions/function_string.cpp
index d1041b7b0ad..351c694d688 100644
--- a/be/src/vec/functions/function_string.cpp
+++ b/be/src/vec/functions/function_string.cpp
@@ -1351,6 +1351,7 @@ void register_function_string(SimpleFunctionFactory& 
factory) {
     factory.register_function<FunctionFromBase64>();
     factory.register_function<FunctionSplitPart>();
     factory.register_function<FunctionSplitByString>();
+    factory.register_function<FunctionCountSubString>();
     factory.register_function<FunctionSubstringIndex>();
     factory.register_function<FunctionExtractURLParameter>();
     factory.register_function<FunctionStringParseUrl>();
diff --git a/be/src/vec/functions/function_string.h 
b/be/src/vec/functions/function_string.h
index 2e59dce22ef..094fe6a7fbb 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -2453,6 +2453,122 @@ private:
     }
 };
 
+/// Scalar function `count_substrings(str, pattern)`.
+///
+/// For every row it counts the non-overlapping occurrences of `pattern` inside `str`,
+/// scanning from left to right. An empty `str` or an empty `pattern` yields 0.
+/// The result column is Int32.
+class FunctionCountSubString : public IFunction {
+public:
+    static constexpr auto name = "count_substrings";
+
+    static FunctionPtr create() { return std::make_shared<FunctionCountSubString>(); }
+    using NullMapType = PaddedPODArray<UInt8>;
+
+    String get_name() const override { return name; }
+
+    size_t get_number_of_arguments() const override { return 2; }
+
+    /// Both arguments are DCHECK-ed to be string types; the return type is always Int32.
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
+        DCHECK(is_string(arguments[0]))
+                << "first argument for function: " << name << " should be string"
+                << " and arguments[0] is " << arguments[0]->get_name();
+        DCHECK(is_string(arguments[1]))
+                << "second argument for function: " << name << " should be string"
+                << " and arguments[1] is " << arguments[1]->get_name();
+        return std::make_shared<DataTypeInt32>();
+    }
+
+    /// Unpacks both argument columns, dispatches on which side is constant and stores the
+    /// freshly built Int32 column at position `result` of `block`.
+    /// Returns InternalError when either unpacked argument is not a ColumnString.
+    Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments,
+                        size_t result, size_t input_rows_count) const override {
+        DCHECK_EQ(arguments.size(), 2);
+        const auto& [src_column, left_const] =
+                unpack_if_const(block.get_by_position(arguments[0]).column);
+        const auto& [right_column, right_const] =
+                unpack_if_const(block.get_by_position(arguments[1]).column);
+
+        const auto* col_left = check_and_get_column<ColumnString>(src_column.get());
+        if (!col_left) {
+            return Status::InternalError("Left operator of function {} can not be {}", get_name(),
+                                         block.get_by_position(arguments[0]).type->get_name());
+        }
+
+        const auto* col_right = check_and_get_column<ColumnString>(right_column.get());
+        if (!col_right) {
+            return Status::InternalError("Right operator of function {} can not be {}", get_name(),
+                                         block.get_by_position(arguments[1]).type->get_name());
+        }
+
+        // Every row starts at 0; the helpers below overwrite each entry.
+        auto dest_column_ptr = ColumnInt32::create(input_rows_count, 0);
+        if (right_const) {
+            // count_substrings(ColumnString, "xxx")
+            _execute_constant_pattern(*col_left, col_right->get_data_at(0),
+                                      dest_column_ptr->get_data(), input_rows_count);
+        } else if (left_const) {
+            // count_substrings("xxx", ColumnString)
+            _execute_constant_src_string(col_left->get_data_at(0), *col_right,
+                                         dest_column_ptr->get_data(), input_rows_count);
+        } else {
+            // count_substrings(ColumnString, ColumnString)
+            _execute_vector(*col_left, *col_right, dest_column_ptr->get_data(), input_rows_count);
+        }
+
+        block.replace_by_position(result, std::move(dest_column_ptr));
+        return Status::OK();
+    }
+
+private:
+    /// Per-row source string, one shared pattern.
+    void _execute_constant_pattern(const ColumnString& src_column_string,
+                                   const StringRef& pattern_ref,
+                                   ColumnInt32::Container& dest_column_data,
+                                   size_t input_rows_count) const {
+        for (size_t i = 0; i < input_rows_count; ++i) {
+            const StringRef str_ref = src_column_string.get_data_at(i);
+            dest_column_data[i] = find_str_count(str_ref, pattern_ref);
+        }
+    }
+
+    /// Per-row source string and per-row pattern.
+    void _execute_vector(const ColumnString& src_column_string, const ColumnString& pattern_column,
+                         ColumnInt32::Container& dest_column_data, size_t input_rows_count) const {
+        for (size_t i = 0; i < input_rows_count; ++i) {
+            const StringRef pattern_ref = pattern_column.get_data_at(i);
+            const StringRef str_ref = src_column_string.get_data_at(i);
+            dest_column_data[i] = find_str_count(str_ref, pattern_ref);
+        }
+    }
+
+    /// One shared source string, per-row pattern.
+    void _execute_constant_src_string(const StringRef& str_ref, const ColumnString& pattern_col,
+                                      ColumnInt32::Container& dest_column_data,
+                                      size_t input_rows_count) const {
+        for (size_t i = 0; i < input_rows_count; ++i) {
+            const StringRef pattern_ref = pattern_col.get_data_at(i);
+            dest_column_data[i] = find_str_count(str_ref, pattern_ref);
+        }
+    }
+
+    /// Returns the distance from `pos` to the first offset >= `pos` at which `pattern_ref`
+    /// occurs in `str_ref`. When there is no such offset the distance to the end of the
+    /// string (`str_ref.size - pos`) is returned, which callers treat as "not found".
+    /// A `pos` at or beyond the end of the string returns 0.
+    size_t find_pos(size_t pos, const StringRef str_ref, const StringRef pattern_ref) const {
+        const size_t str_size = str_ref.size;
+        const size_t pattern_size = pattern_ref.size;
+        if (pos >= str_size) {
+            return 0;
+        }
+        // Only windows that lie entirely inside the string are candidates. Comparing a window
+        // that sticks out past the end would read bytes that do not belong to this value
+        // (for instance the next row stored in the same column) and could report a bogus
+        // match, or read far out of bounds for a long pattern.
+        // `cur` never exceeds `str_size`, so the subtraction cannot wrap around.
+        for (size_t cur = pos; pattern_size <= str_size - cur; ++cur) {
+            // A zero result means the compared bytes are equal.
+            if (memcmp_small_allow_overflow15(str_ref.data + cur, pattern_ref.data,
+                                              pattern_size) == 0) {
+                return cur - pos;
+            }
+        }
+        return str_size - pos; // not found
+    }
+
+    /// Counts non-overlapping occurrences of `pattern_ref` in `str_ref`: after each match the
+    /// scan resumes right behind the matched bytes. Returns 0 if either argument is empty.
+    int find_str_count(const StringRef str_ref, StringRef pattern_ref) const {
+        if (str_ref.size == 0 || pattern_ref.size == 0) {
+            return 0;
+        }
+        int count = 0;
+        size_t str_pos = 0;
+        while (str_pos < str_ref.size) {
+            const size_t remaining = str_ref.size - str_pos;
+            const size_t offset = find_pos(str_pos, str_ref, pattern_ref);
+            if (offset == remaining) {
+                break; // not found in the rest of the string
+            }
+            ++count;
+            str_pos += offset + pattern_ref.size;
+        }
+        return count;
+    }
+};
+
 struct SM3Sum {
     static constexpr auto name = "sm3sum";
     using ObjectData = SM3Digest;
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
index 579f3148dd5..fa72b4e832c 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
@@ -128,6 +128,7 @@ import 
org.apache.doris.nereids.trees.expressions.functions.scalar.Cosh;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.CosineDistance;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Cot;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.CountEqual;
+import 
org.apache.doris.nereids.trees.expressions.functions.scalar.CountSubstring;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Crc32;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.Crc32Internal;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.CreateMap;
@@ -609,6 +610,7 @@ public class BuiltinScalarFunctions implements 
FunctionHelper {
             scalar(Cot.class, "cot"),
             scalar(CosineDistance.class, "cosine_distance"),
             scalar(CountEqual.class, "countequal"),
+            scalar(CountSubstring.class, "count_substrings"),
             scalar(CreateMap.class, "map"),
             scalar(CreateStruct.class, "struct"),
             scalar(CreateNamedStruct.class, "named_struct"),
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/CountSubstring.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/CountSubstring.java
new file mode 100644
index 00000000000..ce7a43cf94b
--- /dev/null
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/CountSubstring.java
@@ -0,0 +1,70 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import 
org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
+import org.apache.doris.nereids.trees.expressions.shape.BinaryExpression;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.IntegerType;
+import org.apache.doris.nereids.types.StringType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * Nereids scalar function 'count_substrings'.
+ * Its single signature takes two string arguments and returns an integer.
+ */
+public class CountSubstring extends ScalarFunction
+        implements BinaryExpression, ExplicitlyCastableSignature, PropagateNullable {
+
+    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
+            FunctionSignature.ret(IntegerType.INSTANCE).args(StringType.INSTANCE, StringType.INSTANCE)
+    );
+
+    /**
+     * Creates the function call from its two argument expressions.
+     */
+    public CountSubstring(Expression arg0, Expression arg1) {
+        super("count_substrings", arg0, arg1);
+    }
+
+    @Override
+    public List<FunctionSignature> getSignatures() {
+        return SIGNATURES;
+    }
+
+    @Override
+    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+        return visitor.visitCountSubstring(this, context);
+    }
+
+    /**
+     * Rebuilds this function over a replacement list of exactly two children.
+     */
+    @Override
+    public CountSubstring withChildren(List<Expression> children) {
+        final boolean hasTwoChildren = children.size() == 2;
+        Preconditions.checkArgument(hasTwoChildren);
+        final Expression first = children.get(0);
+        final Expression second = children.get(1);
+        return new CountSubstring(first, second);
+    }
+}
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
index c1725cd1658..dd364b8be75 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
@@ -136,6 +136,7 @@ import 
org.apache.doris.nereids.trees.expressions.functions.scalar.Cosh;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.CosineDistance;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Cot;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.CountEqual;
+import 
org.apache.doris.nereids.trees.expressions.functions.scalar.CountSubstring;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Crc32;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.Crc32Internal;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.CreateMap;
@@ -967,6 +968,10 @@ public interface ScalarFunctionVisitor<R, C> {
         return visitScalarFunction(countequal, context);
     }
 
+    // Visitor hook for CountSubstring; the default simply delegates to visitScalarFunction.
+    default R visitCountSubstring(CountSubstring countSubstring, C context) {
+        return visitScalarFunction(countSubstring, context);
+    }
+
     default R visitCurrentCatalog(CurrentCatalog currentCatalog, C context) {
         return visitScalarFunction(currentCatalog, context);
     }
diff --git a/gensrc/script/doris_builtins_functions.py 
b/gensrc/script/doris_builtins_functions.py
index 3fdaacbfe29..1c83b2d7c66 100644
--- a/gensrc/script/doris_builtins_functions.py
+++ b/gensrc/script/doris_builtins_functions.py
@@ -1675,6 +1675,7 @@ visible_functions = {
 
         [['overlay'], 'VARCHAR', ['VARCHAR', 'INT', 'INT', 'VARCHAR'], ''],
 
+        [['count_substrings'], 'INT', ['STRING', 'STRING'], 
'DEPEND_ON_ARGUMENT'],
         [['substr', 'substring'], 'STRING', ['STRING', 'INT'], 
'DEPEND_ON_ARGUMENT'],
         [['substr', 'substring'], 'STRING', ['STRING', 'INT', 'INT'], 
'DEPEND_ON_ARGUMENT'],
         [['strleft', 'left'], 'STRING', ['STRING', 'INT'], 
'DEPEND_ON_ARGUMENT'],
diff --git 
a/regression-test/data/query_p0/sql_functions/string_functions/test_count_substrings.out
 
b/regression-test/data/query_p0/sql_functions/string_functions/test_count_substrings.out
new file mode 100644
index 00000000000..9bee1363c66
Binary files /dev/null and 
b/regression-test/data/query_p0/sql_functions/string_functions/test_count_substrings.out
 differ
diff --git 
a/regression-test/suites/query_p0/sql_functions/string_functions/test_count_substrings.groovy
 
b/regression-test/suites/query_p0/sql_functions/string_functions/test_count_substrings.groovy
new file mode 100644
index 00000000000..64051ec7afc
--- /dev/null
+++ 
b/regression-test/suites/query_p0/sql_functions/string_functions/test_count_substrings.groovy
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Regression suite for count_substrings(str, pattern): constant arguments first, then
+// column arguments read from a scratch table that is queried both empty and populated.
+suite("test_count_substrings") {
+    // constant arguments, including NULL on either side, multi-byte characters
+    // (select5) and a pattern that could match at overlapping positions (select6)
+    qt_select1 "select count_substrings(NULL,NULL);"
+    qt_select2 "select count_substrings('a12bc23de345f',NULL);"
+    qt_select3 "select count_substrings(NULL, 'a12bc23de345f');"
+    qt_select4 "select count_substrings('a12bc23de345f','2');"
+    qt_select5 "select count_substrings('a1你你c我你3我d你3你5你','你');"
+    qt_select6 "select count_substrings('ccc','cc');"
+
+    // scratch table: s1/p1 are nullable, s2/p2 are NOT NULL, so the four (s, p)
+    // pairings below cover every nullable / non-nullable argument combination
+    sql """DROP TABLE IF EXISTS test_count_substrings"""
+    sql """ 
+            CREATE TABLE IF NOT EXISTS test_count_substrings (
+              `k1` int(11) NULL COMMENT "",
+              `s1` varchar(30) NULL COMMENT "",
+              `s2` varchar(30) NOT NULL COMMENT "",
+              `p1` varchar(30) NULL COMMENT "",
+              `p2` varchar(30) NOT NULL COMMENT ""
+            ) ENGINE=OLAP
+            DUPLICATE KEY(`k1`)
+            DISTRIBUTED BY HASH(`k1`) BUCKETS 1
+            PROPERTIES (
+            "replication_allocation" = "tag.location.default: 1",
+            "storage_format" = "V2"
+            )
+        """
+    // queries against the still-empty table (no rows have been inserted yet)
+    qt_select4_empty "select count_substrings(s1,p1) from 
+test_count_substrings;"
+    qt_select5_empty "select count_substrings(s2,p2) from 
+test_count_substrings;"
+    qt_select6_empty "select count_substrings(s1,p2) from 
+test_count_substrings;"
+    qt_select7_empty "select count_substrings(s2,p1) from 
+test_count_substrings;"
+
+    // rows with ordinary values, empty strings, NULLs and multi-byte characters;
+    // the columns are (k1, s1, s2, p1, p2)
+    sql """ INSERT INTO test_count_substrings VALUES(1, 'abcde', 'abcde', '', 
+'') """
+    sql """ INSERT INTO test_count_substrings VALUES(2, '', '', '', '') """
+    sql """ INSERT INTO test_count_substrings VALUES(3, '', '','a','a') """
+    sql """ INSERT INTO test_count_substrings VALUES(4, NULL, '', NULL,'') """
+    sql """ INSERT INTO test_count_substrings VALUES(5, 'asdasd', 
+'asdasd','a','a') """
+    sql """ INSERT INTO test_count_substrings VALUES(6, 'a1b1c1d', 
+'a1b1c1d','1','1') """
+    sql """ INSERT INTO test_count_substrings VALUES(7, ',,,', ',,,','#','#') 
+"""
+    sql """ INSERT INTO test_count_substrings VALUES(8, 'a,b,c', 
+'a,b,c','v','v') """
+    // row 9: s1 has a trailing comma that s2 does not
+    sql """ INSERT INTO test_count_substrings VALUES(9, 'a,b,c,', 
+'a,b,c',NULL,'') """
+    sql """ INSERT INTO test_count_substrings VALUES(10, NULL, '','asd','asd') 
+"""
+    sql """ INSERT INTO test_count_substrings VALUES(11, 'a,b,c,12345', 
+'a,b,c,12345','5','5') """
+    sql """ INSERT INTO test_count_substrings VALUES(12, 'a,b,c,12345', 
+'a,b,c,12345','a','a') """
+    // row 13: s1 and s2 differ, and p1 and p2 are different multi-byte patterns
+    sql """ INSERT INTO test_count_substrings VALUES(13, 'a,你,你,1我2你4我5', 
+'a你,你,1我2你4我5','你','我') """
+
+    // column x column: every nullable / NOT NULL pairing, ordered by k1
+    qt_select5_null_null "select s1,p1,count_substrings(s1, p1) from 
+test_count_substrings order by k1;"
+    qt_select6_null_not "select s1, p2,count_substrings(s1, p2) from 
+test_count_substrings order by k1;"
+    qt_select7_not_null "select s2, p1,count_substrings(s2, p1) from 
+test_count_substrings order by k1;"
+    qt_select8_not_not "select s2, p2,count_substrings(s2, p2) from 
+test_count_substrings order by k1;"
+
+    // column x constant and constant x column, with the literal 'a' on one side
+    qt_select9_null_const "select s1, 'a',count_substrings(s1, 'a') from 
+test_count_substrings order by k1;"
+    qt_select10_not_null_const "select s2, 'a',count_substrings(s2, 'a') from 
+test_count_substrings order by k1;"
+    qt_select11_const_null "select 'a',p1,count_substrings('a', p1) from 
+test_count_substrings order by k1;"
+    qt_select12_const_not_null "select 'a',p2,count_substrings('a', p2) from 
+test_count_substrings order by k1;"
+}
+


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to