This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch branch-1.2-lts in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-1.2-lts by this push: new e4ba61ec12 [improvment](function) add new function substring_index (#15373) e4ba61ec12 is described below commit e4ba61ec12a6719a255fbe56471f36045fa4666b Author: Yulei-Yang <yulei.yang0...@gmail.com> AuthorDate: Tue Dec 27 11:39:13 2022 +0800 [improvment](function) add new function substring_index (#15373) cherry-pick #15024 for branch-1.2-lts --- be/src/vec/functions/function_string.cpp | 1 + be/src/vec/functions/function_string.h | 173 +++++++++++++++++++++ .../string-functions/substring_index.md | 92 +++++++++++ docs/sidebars.json | 1 + .../string-functions/substring_index.md | 91 +++++++++++ gensrc/script/doris_builtins_functions.py | 6 + .../string_functions/test_string_function.out | 38 +++++ .../string_functions/test_string_function.groovy | 14 ++ 8 files changed, 416 insertions(+) diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp index 5d5d45409b..3898979a32 100644 --- a/be/src/vec/functions/function_string.cpp +++ b/be/src/vec/functions/function_string.cpp @@ -684,6 +684,7 @@ void register_function_string(SimpleFunctionFactory& factory) { factory.register_function<FunctionFromBase64>(); factory.register_function<FunctionSplitPart>(); factory.register_function<FunctionStringMd5AndSM3<MD5Sum>>(); + factory.register_function<FunctionSubstringIndex>(); factory.register_function<FunctionExtractURLParameter>(); factory.register_function<FunctionStringParseUrl>(); factory.register_function<FunctionMoneyFormat<MoneyFormatDoubleImpl>>(); diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index 4d95b07a49..01760840b7 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -1351,6 +1351,179 @@ public: } }; +class FunctionSubstringIndex : public IFunction { +public: + static constexpr auto name = "substring_index"; + static FunctionPtr create() { return std::make_shared<FunctionSubstringIndex>(); } + String get_name() const override { return name; } + size_t get_number_of_arguments() const override { return 3; } + + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { + return make_nullable(std::make_shared<DataTypeString>()); + } + + bool use_default_implementation_for_nulls() const override { return true; } + bool use_default_implementation_for_constants() const override { return false; } + + Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result, size_t input_rows_count) override { + DCHECK_EQ(arguments.size(), 3); + + auto null_map = ColumnUInt8::create(input_rows_count, 0); + // Create a zero column to simply implement + auto const_null_map = ColumnUInt8::create(input_rows_count, 0); + auto res = ColumnString::create(); + + auto& res_offsets = res->get_offsets(); + auto& res_chars = res->get_chars(); + res_offsets.resize(input_rows_count); + + ColumnPtr content_column = + block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); + + if (auto* nullable = check_and_get_column<const ColumnNullable>(*content_column)) { + // Danger: Here must dispose the null map data first! Because + // argument_columns[0]=nullable->get_nested_column_ptr(); will release the mem + // of column nullable mem of null map + VectorizedUtils::update_null_map(null_map->get_data(), nullable->get_null_map_data()); + content_column = nullable->get_nested_column_ptr(); + } + + for (size_t i = 1; i <= 2; i++) { + ColumnPtr columnPtr = remove_nullable(block.get_by_position(arguments[i]).column); + + if (!is_column_const(*columnPtr)) { + return Status::RuntimeError("Argument at index {} for function {} must be constant", + i + 1, get_name()); + } + } + + auto str_col = assert_cast<const ColumnString*>(content_column.get()); + + const IColumn& delimiter_col = *block.get_by_position(arguments[1]).column; + const auto* delimiter_const = typeid_cast<const ColumnConst*>(&delimiter_col); + auto delimiter = delimiter_const->get_field().get<String>(); + int32_t delimiter_size = delimiter.size(); + + const IColumn& part_num_col = *block.get_by_position(arguments[2]).column; + const auto* part_num_col_const = typeid_cast<const ColumnConst*>(&part_num_col); + auto part_number = part_num_col_const->get_field().get<Int32>(); + + if (part_number == 0 || delimiter_size == 0) { + for (size_t i = 0; i < input_rows_count; ++i) { + StringOP::push_empty_string(i, res_chars, res_offsets); + } + } else if (part_number > 0) { + if (delimiter_size == 1) { + // If delimiter is a char, use memchr to split + for (size_t i = 0; i < input_rows_count; ++i) { + auto str = str_col->get_data_at(i); + int32_t offset = -1; + int32_t num = 0; + while (num < part_number) { + size_t n = str.size - offset - 1; + const char* pos = reinterpret_cast<const char*>( + memchr(str.data + offset + 1, delimiter[0], n)); + if (pos != nullptr) { + offset = pos - str.data; + num++; + } else { + offset = str.size; + num = (num == 0) ? 0 : num + 1; + break; + } + } + + if (num == part_number) { + StringOP::push_value_string( + std::string_view {reinterpret_cast<const char*>(str.data), + (size_t)offset}, + i, res_chars, res_offsets); + } else { + StringOP::push_value_string(std::string_view(str.data, str.size), i, + res_chars, res_offsets); + } + } + } else { + // If delimiter is a string, use memmem to split + for (size_t i = 0; i < input_rows_count; ++i) { + auto str = str_col->get_data_at(i); + int32_t offset = -delimiter_size; + int32_t num = 0; + while (num < part_number) { + size_t n = str.size - offset - delimiter_size; + char* pos = reinterpret_cast<char*>( + memmem(str.data + offset + delimiter_size, n, delimiter.c_str(), + delimiter_size)); + if (pos != nullptr) { + offset = pos - str.data; + num++; + } else { + offset = str.size; + num = (num == 0) ? 0 : num + 1; + break; + } + } + + if (num == part_number) { + StringOP::push_value_string( + std::string_view {reinterpret_cast<const char*>(str.data), + (size_t)offset}, + i, res_chars, res_offsets); + } else { + StringOP::push_value_string(std::string_view(str.data, str.size), i, + res_chars, res_offsets); + } + } + } + } else { + // if part_number is negative + part_number = -part_number; + for (size_t i = 0; i < input_rows_count; ++i) { + auto str = str_col->get_data_at(i); + auto str_str = str.to_string(); + int32_t offset = str.size; + int32_t pre_offset = offset; + int32_t num = 0; + auto substr = str_str; + while (num <= part_number && offset >= 0) { + offset = (int)substr.rfind(delimiter, offset); + if (offset != -1) { + if (++num == part_number) { + break; + } + pre_offset = offset; + offset = offset - 1; + substr = str_str.substr(0, pre_offset); + } else { + break; + } + } + num = (offset == -1 && num != 0) ? num + 1 : num; + + if (num == part_number) { + if (offset == -1) { + StringOP::push_value_string(std::string_view(str.data, str.size), i, + res_chars, res_offsets); + } else { + StringOP::push_value_string( + std::string_view {str.data + offset + delimiter_size, + str.size - offset - delimiter_size}, + i, res_chars, res_offsets); + } + } else { + StringOP::push_value_string(std::string_view(str.data, str.size), i, res_chars, + res_offsets); + } + } + } + + block.get_by_position(result).column = + ColumnNullable::create(std::move(res), std::move(null_map)); + return Status::OK(); + } +}; + struct SM3Sum { static constexpr auto name = "sm3sum"; using ObjectData = SM3Digest; diff --git a/docs/en/docs/sql-manual/sql-functions/string-functions/substring_index.md b/docs/en/docs/sql-manual/sql-functions/string-functions/substring_index.md new file mode 100644 index 0000000000..129e6afcd0 --- /dev/null +++ b/docs/en/docs/sql-manual/sql-functions/string-functions/substring_index.md @@ -0,0 +1,92 @@ +--- +{ +"title": "substring_index", +"language": "en" +} +--- + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> + +## substring_index + +### Name + +<version since="1.2"> + +SUBSTRING_INDEX + +</version> + +### description + +#### Syntax + +`VARCHAR substring_index(VARCHAR content, VARCHAR delimiter, INT field)` + +Split `content` to two parts at position where the `field`s of `delimiter` stays, return one of them according to below rules: +if `field` is positive, return the left part; +else if `field` is negative, return the right part; +if `field` is zero, return an empty string when `content` is not null, else will return null. + +- `delimiter` is case sensitive and multi-byte safe. +- `delimiter` and `field` parameter should be constant. + + +### example + +``` +mysql> select substring_index("hello world", " ", 1); ++----------------------------------------+ +| substring_index("hello world", " ", 1) | ++----------------------------------------+ +| hello | ++----------------------------------------+ +mysql> select substring_index("hello world", " ", 2); ++----------------------------------------+ +| substring_index("hello world", " ", 2) | ++----------------------------------------+ +| hello world | ++----------------------------------------+ +mysql> select substring_index("hello world", " ", -1); ++-----------------------------------------+ +| substring_index("hello world", " ", -1) | ++-----------------------------------------+ +| world | ++-----------------------------------------+ +mysql> select substring_index("hello world", " ", -2); ++-----------------------------------------+ +| substring_index("hello world", " ", -2) | ++-----------------------------------------+ +| hello world | ++-----------------------------------------+ +mysql> select substring_index("hello world", " ", -3); ++-----------------------------------------+ +| substring_index("hello world", " ", -3) | ++-----------------------------------------+ +| hello world | ++-----------------------------------------+ +mysql> select substring_index("hello world", " ", 0); ++----------------------------------------+ +| substring_index("hello world", " ", 0) | ++----------------------------------------+ +| | ++----------------------------------------+ +``` +### keywords + + SUBSTRING_INDEX, SUBSTRING \ No newline at end of file diff --git a/docs/sidebars.json b/docs/sidebars.json index 1a1cefa5ec..71dd56628d 100644 --- a/docs/sidebars.json +++ b/docs/sidebars.json @@ -410,6 +410,7 @@ "sql-manual/sql-functions/string-functions/strleft", "sql-manual/sql-functions/string-functions/strright", "sql-manual/sql-functions/string-functions/split_part", + "sql-manual/sql-functions/string-functions/substring_index", "sql-manual/sql-functions/string-functions/money_format", "sql-manual/sql-functions/string-functions/parse_url", "sql-manual/sql-functions/string-functions/convert_to", diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/substring_index.md b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/substring_index.md new file mode 100644 index 0000000000..f328c08003 --- /dev/null +++ b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/substring_index.md @@ -0,0 +1,91 @@ +--- +{ +"title": "substring_index", +"language": "zh-CN" +} +--- + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> + +## substring_index + +### Name + +<version since="1.2"> + +SUBSTRING_INDEX + +</version> + +### description + +#### Syntax + +`VARCHAR substring_index(VARCHAR content, VARCHAR delimiter, INT field)` + +返回 content 的子字符串,在 delimiter 出现 field 次的位置按如下规则截取: +如果 field > 0,则从左边算起,返回截取位置前的子串; +如果 field < 0,则从右边算起,返回截取位置后的子串; +如果 field = 0,返回一个空串(`content` 不为null), 或者Null (`content` = null)。 + +- delimiter 大小写敏感,且是多字节安全的。 +- `delimiter` 和 `field` 参数需要是常量, 不支持变量。 + +### example + +``` +mysql> select substring_index("hello world", " ", 1); ++----------------------------------------+ +| substring_index("hello world", " ", 1) | ++----------------------------------------+ +| hello | ++----------------------------------------+ +mysql> select substring_index("hello world", " ", 2); ++----------------------------------------+ +| substring_index("hello world", " ", 2) | ++----------------------------------------+ +| hello world | ++----------------------------------------+ +mysql> select substring_index("hello world", " ", -1); ++-----------------------------------------+ +| substring_index("hello world", " ", -1) | ++-----------------------------------------+ +| world | ++-----------------------------------------+ +mysql> select substring_index("hello world", " ", -2); ++-----------------------------------------+ +| substring_index("hello world", " ", -2) | ++-----------------------------------------+ +| hello world | ++-----------------------------------------+ +mysql> select substring_index("hello world", " ", -3); ++-----------------------------------------+ +| substring_index("hello world", " ", -3) | ++-----------------------------------------+ +| hello world | ++-----------------------------------------+ +mysql> select substring_index("hello world", " ", 0); ++----------------------------------------+ +| substring_index("hello world", " ", 0) | ++----------------------------------------+ +| | ++----------------------------------------+ +``` +### keywords + + SUBSTRING_INDEX, SUBSTRING \ No newline at end of file diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py index 78a6d2803e..2f7fd04e71 100755 --- a/gensrc/script/doris_builtins_functions.py +++ b/gensrc/script/doris_builtins_functions.py @@ -2378,6 +2378,9 @@ visible_functions = [ [['split_part'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'], '_ZN5doris15StringFunctions10split_partEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE', '', '', 'vec', 'ALWAYS_NULLABLE'], + [['substring_index'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'], + '_ZN5doris15StringFunctions15substring_indexEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE', + '', '', 'vec', 'ALWAYS_NULLABLE'], [['extract_url_parameter'], 'VARCHAR', ['VARCHAR', 'VARCHAR'],'','', '', 'vec', ''], [['sub_replace'], 'VARCHAR', ['VARCHAR', 'VARCHAR', 'INT'],'','', '', 'vec', 'ALWAYS_NULLABLE'], @@ -2535,6 +2538,9 @@ visible_functions = [ [['split_part'], 'STRING', ['STRING', 'STRING', 'INT'], '_ZN5doris15StringFunctions10split_partEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE', '', '', 'vec', 'ALWAYS_NULLABLE'], + [['substring_index'], 'STRING', ['STRING', 'STRING', 'INT'], + '_ZN5doris15StringFunctions15substring_indexEPN9doris_udf15FunctionContextERKNS1_9StringValES6_RKNS1_6IntValE', + '', '', 'vec', 'ALWAYS_NULLABLE'], # Utility functions [['convert_to'], 'VARCHAR', ['VARCHAR','VARCHAR'], '','', '', 'vec', ''], diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out index 492999e714..8b2a4c8bb0 100644 --- a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out +++ b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out @@ -278,3 +278,41 @@ tNEW-STRorigin str -- !sql -- d***is +-- !sql -- +hello + +-- !sql -- +hello world + +-- !sql -- +hello world + +-- !sql -- +world + +-- !sql -- +hello world + +-- !sql -- +hello world + +-- !sql -- +prefix__string2 + +-- !sql -- +prefix_ + +-- !sql -- +prefix_string2 + +-- !sql -- +\N + +-- !sql -- +\N + +-- !sql -- +\N + +-- !sql -- +prefix_string diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy index 8894a4f5b0..fb0df8f38f 100644 --- a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy +++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy @@ -142,4 +142,18 @@ suite("test_string_function") { qt_sql "select sub_replace(\"this is origin str\",\"NEW-STR\",1);" qt_sql "select sub_replace(\"doris\",\"***\",1,2);" + + qt_sql "select substring_index(\"hello world\", \" \", 1);" + qt_sql "select substring_index(\"hello world\", \" \", 2);" + qt_sql "select substring_index(\"hello world\", \" \", 3);" + qt_sql "select substring_index(\"hello world\", \" \", -1);" + qt_sql "select substring_index(\"hello world\", \" \", -2);" + qt_sql "select substring_index(\"hello world\", \" \", -3);" + qt_sql "select substring_index(\"prefix__string2\", \"__\", 2);" + qt_sql "select substring_index(\"prefix__string2\", \"_\", 2);" + qt_sql "select substring_index(\"prefix_string2\", \"__\", 1);" + qt_sql "select substring_index(null, \"__\", 1);" + qt_sql "select substring_index(\"prefix_string\", null, 1);" + qt_sql "select substring_index(\"prefix_string\", \"_\", null);" + qt_sql "select substring_index(\"prefix_string\", \"__\", -1);" } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org