This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-1.2-lts
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-1.2-lts by this push:
     new e4ba61ec12 [improvment](function) add new function substring_index 
(#15373)
e4ba61ec12 is described below

commit e4ba61ec12a6719a255fbe56471f36045fa4666b
Author: Yulei-Yang <yulei.yang0...@gmail.com>
AuthorDate: Tue Dec 27 11:39:13 2022 +0800

    [improvment](function) add new function substring_index (#15373)
    
    cherry-pick #15024 for branch-1.2-lts
---
 be/src/vec/functions/function_string.cpp           |   1 +
 be/src/vec/functions/function_string.h             | 173 +++++++++++++++++++++
 .../string-functions/substring_index.md            |  92 +++++++++++
 docs/sidebars.json                                 |   1 +
 .../string-functions/substring_index.md            |  91 +++++++++++
 gensrc/script/doris_builtins_functions.py          |   6 +
 .../string_functions/test_string_function.out      |  38 +++++
 .../string_functions/test_string_function.groovy   |  14 ++
 8 files changed, 416 insertions(+)

diff --git a/be/src/vec/functions/function_string.cpp 
b/be/src/vec/functions/function_string.cpp
index 5d5d45409b..3898979a32 100644
--- a/be/src/vec/functions/function_string.cpp
+++ b/be/src/vec/functions/function_string.cpp
@@ -684,6 +684,7 @@ void register_function_string(SimpleFunctionFactory& 
factory) {
     factory.register_function<FunctionFromBase64>();
     factory.register_function<FunctionSplitPart>();
     factory.register_function<FunctionStringMd5AndSM3<MD5Sum>>();
+    factory.register_function<FunctionSubstringIndex>();
     factory.register_function<FunctionExtractURLParameter>();
     factory.register_function<FunctionStringParseUrl>();
     factory.register_function<FunctionMoneyFormat<MoneyFormatDoubleImpl>>();
diff --git a/be/src/vec/functions/function_string.h 
b/be/src/vec/functions/function_string.h
index 4d95b07a49..01760840b7 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -1351,6 +1351,179 @@ public:
     }
 };
 
+class FunctionSubstringIndex : public IFunction {
+public:
+    static constexpr auto name = "substring_index";
+    static FunctionPtr create() { return 
std::make_shared<FunctionSubstringIndex>(); }
+    String get_name() const override { return name; }
+    size_t get_number_of_arguments() const override { return 3; }
+
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const 
override {
+        return make_nullable(std::make_shared<DataTypeString>());
+    }
+
+    bool use_default_implementation_for_nulls() const override { return true; }
+    bool use_default_implementation_for_constants() const override { return 
false; }
+
+    Status execute_impl(FunctionContext* context, Block& block, const 
ColumnNumbers& arguments,
+                        size_t result, size_t input_rows_count) override {
+        DCHECK_EQ(arguments.size(), 3);
+
+        auto null_map = ColumnUInt8::create(input_rows_count, 0);
+        // Create a zero column to simply implement
+        auto const_null_map = ColumnUInt8::create(input_rows_count, 0);
+        auto res = ColumnString::create();
+
+        auto& res_offsets = res->get_offsets();
+        auto& res_chars = res->get_chars();
+        res_offsets.resize(input_rows_count);
+
+        ColumnPtr content_column =
+                
block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
+
+        if (auto* nullable = check_and_get_column<const 
ColumnNullable>(*content_column)) {
+            // Danger: Here must dispose the null map data first! Because
+            // argument_columns[0]=nullable->get_nested_column_ptr(); will 
release the mem
+            // of column nullable mem of null map
+            VectorizedUtils::update_null_map(null_map->get_data(), 
nullable->get_null_map_data());
+            content_column = nullable->get_nested_column_ptr();
+        }
+
+        for (size_t i = 1; i <= 2; i++) {
+            ColumnPtr columnPtr = 
remove_nullable(block.get_by_position(arguments[i]).column);
+
+            if (!is_column_const(*columnPtr)) {
+                return Status::RuntimeError("Argument at index {} for function 
{} must be constant",
+                                            i + 1, get_name());
+            }
+        }
+
+        auto str_col = assert_cast<const ColumnString*>(content_column.get());
+
+        const IColumn& delimiter_col = 
*block.get_by_position(arguments[1]).column;
+        const auto* delimiter_const = typeid_cast<const 
ColumnConst*>(&delimiter_col);
+        auto delimiter = delimiter_const->get_field().get<String>();
+        int32_t delimiter_size = delimiter.size();
+
+        const IColumn& part_num_col = 
*block.get_by_position(arguments[2]).column;
+        const auto* part_num_col_const = typeid_cast<const 
ColumnConst*>(&part_num_col);
+        auto part_number = part_num_col_const->get_field().get<Int32>();
+
+        if (part_number == 0 || delimiter_size == 0) {
+            for (size_t i = 0; i < input_rows_count; ++i) {
+                StringOP::push_empty_string(i, res_chars, res_offsets);
+            }
+        } else if (part_number > 0) {
+            if (delimiter_size == 1) {
+                // If delimiter is a char, use memchr to split
+                for (size_t i = 0; i < input_rows_count; ++i) {
+                    auto str = str_col->get_data_at(i);
+                    int32_t offset = -1;
+                    int32_t num = 0;
+                    while (num < part_number) {
+                        size_t n = str.size - offset - 1;
+                        const char* pos = reinterpret_cast<const char*>(
+                                memchr(str.data + offset + 1, delimiter[0], 
n));
+                        if (pos != nullptr) {
+                            offset = pos - str.data;
+                            num++;
+                        } else {
+                            offset = str.size;
+                            num = (num == 0) ? 0 : num + 1;
+                            break;
+                        }
+                    }
+
+                    if (num == part_number) {
+                        StringOP::push_value_string(
+                                std::string_view {reinterpret_cast<const 
char*>(str.data),
+                                                  (size_t)offset},
+                                i, res_chars, res_offsets);
+                    } else {
+                        StringOP::push_value_string(std::string_view(str.data, 
str.size), i,
+                                                    res_chars, res_offsets);
+                    }
+                }
+            } else {
+                // If delimiter is a string, use memmem to split
+                for (size_t i = 0; i < input_rows_count; ++i) {
+                    auto str = str_col->get_data_at(i);
+                    int32_t offset = -delimiter_size;
+                    int32_t num = 0;
+                    while (num < part_number) {
+                        size_t n = str.size - offset - delimiter_size;
+                        char* pos = reinterpret_cast<char*>(
+                                memmem(str.data + offset + delimiter_size, n, 
delimiter.c_str(),
+                                       delimiter_size));
+                        if (pos != nullptr) {
+                            offset = pos - str.data;
+                            num++;
+                        } else {
+                            offset = str.size;
+                            num = (num == 0) ? 0 : num + 1;
+                            break;
+                        }
+                    }
+
+                    if (num == part_number) {
+                        StringOP::push_value_string(
+                                std::string_view {reinterpret_cast<const 
char*>(str.data),
+                                                  (size_t)offset},
+                                i, res_chars, res_offsets);
+                    } else {
+                        StringOP::push_value_string(std::string_view(str.data, 
str.size), i,
+                                                    res_chars, res_offsets);
+                    }
+                }
+            }
+        } else {
+            // if part_number is negative
+            part_number = -part_number;
+            for (size_t i = 0; i < input_rows_count; ++i) {
+                auto str = str_col->get_data_at(i);
+                auto str_str = str.to_string();
+                int32_t offset = str.size;
+                int32_t pre_offset = offset;
+                int32_t num = 0;
+                auto substr = str_str;
+                while (num <= part_number && offset >= 0) {
+                    offset = (int)substr.rfind(delimiter, offset);
+                    if (offset != -1) {
+                        if (++num == part_number) {
+                            break;
+                        }
+                        pre_offset = offset;
+                        offset = offset - 1;
+                        substr = str_str.substr(0, pre_offset);
+                    } else {
+                        break;
+                    }
+                }
+                num = (offset == -1 && num != 0) ? num + 1 : num;
+
+                if (num == part_number) {
+                    if (offset == -1) {
+                        StringOP::push_value_string(std::string_view(str.data, 
str.size), i,
+                                                    res_chars, res_offsets);
+                    } else {
+                        StringOP::push_value_string(
+                                std::string_view {str.data + offset + 
delimiter_size,
+                                                  str.size - offset - 
delimiter_size},
+                                i, res_chars, res_offsets);
+                    }
+                } else {
+                    StringOP::push_value_string(std::string_view(str.data, 
str.size), i, res_chars,
+                                                res_offsets);
+                }
+            }
+        }
+
+        block.get_by_position(result).column =
+                ColumnNullable::create(std::move(res), std::move(null_map));
+        return Status::OK();
+    }
+};
+
 struct SM3Sum {
     static constexpr auto name = "sm3sum";
     using ObjectData = SM3Digest;
diff --git 
a/docs/en/docs/sql-manual/sql-functions/string-functions/substring_index.md 
b/docs/en/docs/sql-manual/sql-functions/string-functions/substring_index.md
new file mode 100644
index 0000000000..129e6afcd0
--- /dev/null
+++ b/docs/en/docs/sql-manual/sql-functions/string-functions/substring_index.md
@@ -0,0 +1,92 @@
+---
+{
+"title": "substring_index",
+"language": "en"
+}
+---
+
+<!-- 
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+## substring_index
+
+### Name
+
+<version since="1.2">
+
+SUBSTRING_INDEX
+
+</version>
+
+### description
+
+#### Syntax
+
+`VARCHAR substring_index(VARCHAR content, VARCHAR delimiter, INT field)`
+
+Split `content` to two parts at position where the `field`s of `delimiter` 
stays, return one of them according to below rules:
+if `field` is positive, return the left part;
+else if `field` is negative, return the right part;
+if `field` is zero, return an empty string when `content` is not null, else 
will return null.
+
+- `delimiter` is case sensitive and multi-byte safe.
+- `delimiter` and `field` parameter should be constant.
+
+
+### example
+
+```
+mysql> select substring_index("hello world", " ", 1);
++----------------------------------------+
+| substring_index("hello world", " ", 1) |
++----------------------------------------+
+| hello                                  |
++----------------------------------------+
+mysql> select substring_index("hello world", " ", 2);
++----------------------------------------+
+| substring_index("hello world", " ", 2) |
++----------------------------------------+
+| hello world                            |
++----------------------------------------+
+mysql> select substring_index("hello world", " ", -1);
++-----------------------------------------+
+| substring_index("hello world", " ", -1) |
++-----------------------------------------+
+| world                                   |
++-----------------------------------------+
+mysql> select substring_index("hello world", " ", -2);
++-----------------------------------------+
+| substring_index("hello world", " ", -2) |
++-----------------------------------------+
+| hello world                             |
++-----------------------------------------+
+mysql> select substring_index("hello world", " ", -3);
++-----------------------------------------+
+| substring_index("hello world", " ", -3) |
++-----------------------------------------+
+| hello world                             |
++-----------------------------------------+
+mysql> select substring_index("hello world", " ", 0);
++----------------------------------------+
+| substring_index("hello world", " ", 0) |
++----------------------------------------+
+|                                        |
++----------------------------------------+
+```
+### keywords
+
+    SUBSTRING_INDEX, SUBSTRING
\ No newline at end of file
diff --git a/docs/sidebars.json b/docs/sidebars.json
index 1a1cefa5ec..71dd56628d 100644
--- a/docs/sidebars.json
+++ b/docs/sidebars.json
@@ -410,6 +410,7 @@
                                 
"sql-manual/sql-functions/string-functions/strleft",
                                 
"sql-manual/sql-functions/string-functions/strright",
                                 
"sql-manual/sql-functions/string-functions/split_part",
+                                
"sql-manual/sql-functions/string-functions/substring_index",
                                 
"sql-manual/sql-functions/string-functions/money_format",
                                 
"sql-manual/sql-functions/string-functions/parse_url",
                                 
"sql-manual/sql-functions/string-functions/convert_to",
diff --git 
a/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/substring_index.md 
b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/substring_index.md
new file mode 100644
index 0000000000..f328c08003
--- /dev/null
+++ 
b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/substring_index.md
@@ -0,0 +1,91 @@
+---
+{
+"title": "substring_index",
+"language": "zh-CN"
+}
+---
+
+<!-- 
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+## substring_index
+
+### Name
+
+<version since="1.2">
+
+SUBSTRING_INDEX
+
+</version>
+
+### description
+
+#### Syntax
+
+`VARCHAR substring_index(VARCHAR content, VARCHAR delimiter, INT field)`
+
+返回 content 的子字符串,在 delimiter 出现 field 次的位置按如下规则截取:  
+如果 field > 0,则从左边算起,返回截取位置前的子串;  
+如果 field < 0,则从右边算起,返回截取位置后的子串;
+如果 field = 0,返回一个空串(`content` 不为null), 或者Null (`content` = null)。
+
+- delimiter 大小写敏感,且是多字节安全的。
+- `delimiter` 和 `field` 参数需要是常量, 不支持变量。
+
+### example
+
+```
+mysql> select substring_index("hello world", " ", 1);
++----------------------------------------+
+| substring_index("hello world", " ", 1) |
++----------------------------------------+
+| hello                                  |
++----------------------------------------+
+mysql> select substring_index("hello world", " ", 2);
++----------------------------------------+
+| substring_index("hello world", " ", 2) |
++----------------------------------------+
+| hello world                            |
++----------------------------------------+
+mysql> select substring_index("hello world", " ", -1);
++-----------------------------------------+
+| substring_index("hello world", " ", -1) |
++-----------------------------------------+
+| world                                   |
++-----------------------------------------+
+mysql> select substring_index("hello world", " ", -2);
++-----------------------------------------+
+| substring_index("hello world", " ", -2) |
++-----------------------------------------+
+| hello world                             |
++-----------------------------------------+
+mysql> select substring_index("hello world", " ", -3);
++-----------------------------------------+
+| substring_index("hello world", " ", -3) |
++-----------------------------------------+
+| hello world                             |
++-----------------------------------------+
+mysql> select substring_index("hello world", " ", 0);
++----------------------------------------+
+| substring_index("hello world", " ", 0) |
++----------------------------------------+
+|                                        |
++----------------------------------------+
+```
+### keywords
+
+    SUBSTRING_INDEX, SUBSTRING
\ No newline at end of file
diff --git a/gensrc/script/doris_builtins_functions.py 
b/gensrc/script/doris_builtins_functions.py
index 78a6d2803e..2f7fd04e71 100755
--- a/gensrc/script/doris_builtins_functions.py
+++ b/gensrc/script/doris_builtins_functions.py
@@ -2378,6 +2378,9 @@ visible_functions = [
     [['split_part'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'],
         
'_ZN5doris15StringFunctions10split_partEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE',
         '', '', 'vec', 'ALWAYS_NULLABLE'],
+    [['substring_index'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'],
+        
'_ZN5doris15StringFunctions15substring_indexEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE',
+        '', '', 'vec', 'ALWAYS_NULLABLE'],
     [['extract_url_parameter'], 'VARCHAR', ['VARCHAR', 'VARCHAR'],'','', '', 
'vec', ''],
 
     [['sub_replace'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'],'','', '', 
'vec', 'ALWAYS_NULLABLE'],
@@ -2535,6 +2538,9 @@ visible_functions = [
     [['split_part'], 'STRING', ['STRING', 'STRING', 'INT'],
         
'_ZN5doris15StringFunctions10split_partEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE',
         '', '', 'vec', 'ALWAYS_NULLABLE'],
+    [['substring_index'], 'STRING', ['STRING', 'STRING', 'INT'],
+        
'_ZN5doris15StringFunctions15substring_indexEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE',
+        '', '', 'vec', 'ALWAYS_NULLABLE'],
 
     # Utility functions
     [['convert_to'], 'VARCHAR', ['VARCHAR','VARCHAR'], '','', '', 'vec', ''],
diff --git 
a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
 
b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
index 492999e714..8b2a4c8bb0 100644
--- 
a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
+++ 
b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
@@ -278,3 +278,41 @@ tNEW-STRorigin str
 -- !sql --
 d***is
 
+-- !sql --
+hello
+
+-- !sql --
+hello world
+
+-- !sql --
+hello world
+
+-- !sql --
+world
+
+-- !sql --
+hello world
+
+-- !sql --
+hello world
+
+-- !sql --
+prefix__string2
+
+-- !sql --
+prefix_
+
+-- !sql --
+prefix_string2
+
+-- !sql --
+\N
+
+-- !sql --
+\N
+
+-- !sql --
+\N
+
+-- !sql --
+prefix_string
diff --git 
a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
 
b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
index 8894a4f5b0..fb0df8f38f 100644
--- 
a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
+++ 
b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
@@ -142,4 +142,18 @@ suite("test_string_function") {
 
     qt_sql "select sub_replace(\"this is origin str\",\"NEW-STR\",1);"
     qt_sql "select sub_replace(\"doris\",\"***\",1,2);"
+
+    qt_sql "select substring_index(\"hello world\", \" \", 1);"
+    qt_sql "select substring_index(\"hello world\", \" \", 2);"
+    qt_sql "select substring_index(\"hello world\", \" \", 3);"
+    qt_sql "select substring_index(\"hello world\", \" \", -1);"
+    qt_sql "select substring_index(\"hello world\", \" \", -2);"
+    qt_sql "select substring_index(\"hello world\", \" \", -3);"
+    qt_sql "select substring_index(\"prefix__string2\", \"__\", 2);"
+    qt_sql "select substring_index(\"prefix__string2\", \"_\", 2);"
+    qt_sql "select substring_index(\"prefix_string2\", \"__\", 1);"
+    qt_sql "select substring_index(null, \"__\", 1);"
+    qt_sql "select substring_index(\"prefix_string\", null, 1);"
+    qt_sql "select substring_index(\"prefix_string\", \"_\", null);"
+    qt_sql "select substring_index(\"prefix_string\", \"__\", -1);"
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to