This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new 98f5d95fba8 branch-2.1: [feat](function) SUBSTRING_INDEX function delimiter supports dynamic #50149 (#50303) 98f5d95fba8 is described below commit 98f5d95fba8ba5fdd702e11931436ea12cfd5f7a Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> AuthorDate: Wed Apr 23 15:00:40 2025 +0800 branch-2.1: [feat](function) SUBSTRING_INDEX function delimiter supports dynamic #50149 (#50303) Cherry-picked from #50149 Co-authored-by: lw112 <131352377+felixw...@users.noreply.github.com> --- be/src/vec/functions/function_string.h | 85 ++++++++----- .../functions/scalar/SubstringIndex.java | 11 -- .../data/function_p0/test_substring_index.out | Bin 0 -> 2304 bytes .../function_p0/test_substring_index_columns.out | Bin 0 -> 1124 bytes .../function_p0/test_substring_index_simple.out | Bin 0 -> 725 bytes .../suites/function_p0/test_substring_index.groovy | 137 +++++++++++++++++++++ .../test_substring_index_columns.groovy | 114 +++++++++++++++++ .../function_p0/test_substring_index_simple.groovy | 89 +++++++++++++ 8 files changed, 396 insertions(+), 40 deletions(-) diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index 79c5823f9f9..a17f4b847d6 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -2205,24 +2205,42 @@ public: const auto* str_col = assert_cast<const ColumnString*>(content_column.get()); - [[maybe_unused]] const auto& [delimiter_col, delimiter_const] = + // Handle both constant and non-constant delimiter parameters + ColumnPtr delimiter_column_ptr; + bool delimiter_const = false; + std::tie(delimiter_column_ptr, delimiter_const) = unpack_if_const(block.get_by_position(arguments[1]).column); - auto delimiter = delimiter_col->get_data_at(0); - int32_t delimiter_size = delimiter.size; + const auto* delimiter_col = assert_cast<const ColumnString*>(delimiter_column_ptr.get()); - [[maybe_unused]] const auto& [part_num_col, part_const] = + ColumnPtr part_num_column_ptr; + bool part_num_const = false; + std::tie(part_num_column_ptr, part_num_const) = unpack_if_const(block.get_by_position(arguments[2]).column); - auto part_number = *((int*)part_num_col->get_data_at(0).data); + const ColumnVector<Int32>* part_num_col = + assert_cast<const ColumnVector<Int32>*>(part_num_column_ptr.get()); - if (part_number == 0 || delimiter_size == 0) { - for (size_t i = 0; i < input_rows_count; ++i) { + // For constant multi-character delimiters, create StringRef and StringSearch only once + std::optional<StringRef> const_delimiter_ref; + std::optional<StringSearch> const_search; + if (delimiter_const && delimiter_col->get_data_at(0).size > 1) { + const_delimiter_ref.emplace(delimiter_col->get_data_at(0)); + const_search.emplace(&const_delimiter_ref.value()); + } + + for (size_t i = 0; i < input_rows_count; ++i) { + auto str = str_col->get_data_at(i); + auto delimiter = delimiter_col->get_data_at(delimiter_const ? 0 : i); + int32_t delimiter_size = delimiter.size; + + auto part_number = part_num_col->get_element(part_num_const ? 0 : i); + + if (part_number == 0 || delimiter_size == 0) { StringOP::push_empty_string(i, res_chars, res_offsets); + continue; } - } else if (part_number > 0) { - if (delimiter_size == 1) { - // If delimiter is a char, use memchr to split - for (size_t i = 0; i < input_rows_count; ++i) { - auto str = str_col->get_data_at(i); + + if (part_number > 0) { + if (delimiter_size == 1) { int32_t offset = -1; int32_t num = 0; while (num < part_number) { @@ -2248,18 +2266,23 @@ public: StringOP::push_value_string(std::string_view(str.data, str.size), i, res_chars, res_offsets); } - } - } else { - StringRef delimiter_ref(delimiter); - StringSearch search(&delimiter_ref); - for (size_t i = 0; i < input_rows_count; ++i) { - auto str = str_col->get_data_at(i); + } else { + // For multi-character delimiters + // Use pre-created StringRef and StringSearch for constant delimiters + StringRef delimiter_ref = const_delimiter_ref ? const_delimiter_ref.value() + : StringRef(delimiter); + const StringSearch* search_ptr = const_search ? &const_search.value() : nullptr; + StringSearch local_search(&delimiter_ref); + if (!search_ptr) { + search_ptr = &local_search; + } + int32_t offset = -delimiter_size; int32_t num = 0; while (num < part_number) { size_t n = str.size - offset - delimiter_size; // search first match delimter_ref index from src string among str_offset to end - const char* pos = search.search(str.data + offset + delimiter_size, n); + const char* pos = search_ptr->search(str.data + offset + delimiter_size, n); if (pos < str.data + str.size) { offset = pos - str.data; num++; @@ -2280,21 +2303,25 @@ public: res_chars, res_offsets); } } - } - } else { - // if part_number is negative - part_number = -part_number; - for (size_t i = 0; i < input_rows_count; ++i) { - auto str = str_col->get_data_at(i); + } else { + int neg_part_number = -part_number; auto str_str = str.to_string(); int32_t offset = str.size; int32_t pre_offset = offset; int32_t num = 0; auto substr = str_str; - while (num <= part_number && offset >= 0) { - offset = (int)substr.rfind(delimiter, offset); + + // Use pre-created StringRef for constant delimiters + StringRef delimiter_str = + const_delimiter_ref + ? const_delimiter_ref.value() + : StringRef(reinterpret_cast<const char*>(delimiter.data), + delimiter.size); + + while (num <= neg_part_number && offset >= 0) { + offset = (int)substr.rfind(delimiter_str, offset); if (offset != -1) { - if (++num == part_number) { + if (++num == neg_part_number) { break; } pre_offset = offset; @@ -2306,7 +2333,7 @@ public: } num = (offset == -1 && num != 0) ? num + 1 : num; - if (num == part_number) { + if (num == neg_part_number) { if (offset == -1) { StringOP::push_value_string(std::string_view(str.data, str.size), i, res_chars, res_offsets); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SubstringIndex.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SubstringIndex.java index bb9e2b749c4..7751578b2a4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SubstringIndex.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/SubstringIndex.java @@ -18,7 +18,6 @@ package org.apache.doris.nereids.trees.expressions.functions.scalar; import org.apache.doris.catalog.FunctionSignature; -import org.apache.doris.nereids.exceptions.AnalysisException; import org.apache.doris.nereids.trees.expressions.Expression; import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable; @@ -53,16 +52,6 @@ public class SubstringIndex extends ScalarFunction super("substring_index", arg0, arg1, arg2); } - @Override - public void checkLegalityBeforeTypeCoercion() { - for (int i = 1; i < children.size(); ++i) { - if (!getArgument(i).isConstant()) { - throw new AnalysisException(getName() - + " function except for the first argument, other parameter must be a constant."); - } - } - } - /** * withChildren. */ diff --git a/regression-test/data/function_p0/test_substring_index.out b/regression-test/data/function_p0/test_substring_index.out new file mode 100644 index 00000000000..480f2c2b482 Binary files /dev/null and b/regression-test/data/function_p0/test_substring_index.out differ diff --git a/regression-test/data/function_p0/test_substring_index_columns.out b/regression-test/data/function_p0/test_substring_index_columns.out new file mode 100644 index 00000000000..ef023cd37a2 Binary files /dev/null and b/regression-test/data/function_p0/test_substring_index_columns.out differ diff --git a/regression-test/data/function_p0/test_substring_index_simple.out b/regression-test/data/function_p0/test_substring_index_simple.out new file mode 100644 index 00000000000..ca236ccc9bd Binary files /dev/null and b/regression-test/data/function_p0/test_substring_index_simple.out differ diff --git a/regression-test/suites/function_p0/test_substring_index.groovy b/regression-test/suites/function_p0/test_substring_index.groovy new file mode 100644 index 00000000000..47df1208a00 --- /dev/null +++ b/regression-test/suites/function_p0/test_substring_index.groovy @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_substring_index") { + sql "set enable_nereids_planner=true" + sql "set enable_fallback_to_original_planner=false" + + sql "DROP TABLE IF EXISTS test_substring_index" + sql """ + CREATE TABLE test_substring_index ( + id INT, + str VARCHAR(100), + delimiter VARCHAR(10), + count INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ) + """ + + sql """ + INSERT INTO test_substring_index VALUES + (1, 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', 'BBB', -1), + (2, 'zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06', 'ccc', -1), + (3, 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', 'DDD', -1), + (4, 'sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06', 'DDD', -1), + (5, 'cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06', 'eee', -1), + (6, 'AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06', 'A_01', -1), + (7, 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', 'BBB', 1), + (8, 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', '|', 2), + (9, 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06', '|', -2), + (10, 'ABC', '|', 1), + (11, 'ABC|DEF', '|', 0), + (12, '', 'ABC', 1), + (13, 'ABC|DEF|GHI', '', 1) + """ + + sql """ + INSERT INTO test_substring_index VALUES + (101, '北京市|上海市|广州市|深圳市|成都市', '|', 2), + (102, '北京市|上海市|广州市|深圳市|成都市', '|', -2), + (103, '北京市|上海市|广州市|深圳市|成都市', '上海', -1), + (104, '中国人民共和国', '人民', 1), + (105, '中国人民共和国', '人民', -1), + (106, '你好,世界!你好,朋友!', '你好', 1), + (107, '你好,世界!你好,朋友!', '你好', -1), + (108, '你好,世界!你好,朋友!', '世界', -1), + (109, '中文|测试|数据', '测试', 1), + (110, '中文|测试|数据', '测试', -1) + """ + + sql """ + INSERT INTO test_substring_index VALUES + (201, 'hello😀world😀example', '😀', 1), + (202, 'hello😀world😀example', '😀', 2), + (203, 'hello😀world😀example', '😀', -1), + (204, '👋👋hello👋world👋', '👋', 2), + (205, '👋👋hello👋world👋', '👋', -2) + """ + + qt_sql """ + SELECT + id, + str, + delimiter, + count, + substring_index(str, delimiter, count) as result + FROM test_substring_index + WHERE id BETWEEN 1 AND 13 + ORDER BY id + """ + + qt_sql """ + SELECT + id, + str, + delimiter, + count, + substring_index(str, delimiter, count) as result + FROM test_substring_index + WHERE id BETWEEN 101 AND 110 + ORDER BY id + """ + + qt_sql """ + SELECT + id, + str, + delimiter, + count, + substring_index(str, delimiter, count) as result + FROM test_substring_index + WHERE id BETWEEN 201 AND 205 + ORDER BY id + """ + + qt_sql """ + SELECT + a.id, + a.str, + a.delimiter, + b.count, + substring_index(a.str, a.delimiter, b.count) as result + FROM test_substring_index a + JOIN test_substring_index b ON a.id = b.id + WHERE a.id IN (1, 3, 7, 8, 101, 103, 201, 203) + ORDER BY a.id + """ + + qt_sql """ + SELECT + substring_index('', '', 1) as empty_all, + substring_index('test', '', 1) as empty_delimiter, + substring_index('', 'test', 1) as empty_string, + substring_index('test', 'test', 0) as zero_count, + substring_index('test|test', '|', 999) as large_count, + substring_index('test|test', '|', -999) as large_negative_count + """ + + sql "DROP TABLE IF EXISTS test_substring_index" +} \ No newline at end of file diff --git a/regression-test/suites/function_p0/test_substring_index_columns.groovy b/regression-test/suites/function_p0/test_substring_index_columns.groovy new file mode 100644 index 00000000000..80fb3b5318d --- /dev/null +++ b/regression-test/suites/function_p0/test_substring_index_columns.groovy @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_substring_index_columns") { + sql "set enable_nereids_planner=true" + sql "set enable_fallback_to_original_planner=false" + + sql "DROP TABLE IF EXISTS test_substring_index_compat" + sql """ + CREATE TABLE test_substring_index_compat ( + no INT, + sub_str VARCHAR(50), + str VARCHAR(100) + ) ENGINE=OLAP + DUPLICATE KEY(no) + DISTRIBUTED BY HASH(no) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ) + """ + + sql """ + INSERT INTO test_substring_index_compat VALUES + (1, 'BBB', 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06'), + (2, 'ccc', 'zyz_01|zyz_02|CCC_03|qwe_04|qwe_05|qwe_06'), + (3, 'DDD', 'AAA_01|BBB_02|CCC_03|DDD_04|EEE_05|FFF_06'), + (4, 'DDD', 'sgr_01|wsc_02|CCC_03|DDD_04|rfv_05|rgb_06'), + (5, 'eee', 'cdr_01|vfr_02|dfc_03|DDD_04|EEE_05|FFF_06'), + (6, 'A_01', 'AAA_01|dsd_02|ert_03|bgt_04|fgh_05|hyb_06') + """ + + qt_sql """ + SELECT + no, + sub_str AS '分隔符字符串', + str AS '需要截取的字符串', + substring_index(str, sub_str, -1) AS '动态分隔符结果' + FROM test_substring_index_compat + ORDER BY no + """ + + sql """ + INSERT INTO test_substring_index_compat VALUES + (7, '市', '北京市|上海市|广州市|深圳市'), + (8, '人民', '中华人民共和国'), + (9, '分隔符', '中文分隔符测试分隔符数据'), + (10, '你好', '你好,世界!你好,朋友!') + """ + + qt_sql """ + SELECT + no, + sub_str AS '分隔符字符串', + str AS '需要截取的字符串', + substring_index(str, sub_str, 1) AS '正向截取', + substring_index(str, sub_str, -1) AS '反向截取' + FROM test_substring_index_compat + WHERE no > 6 + ORDER BY no + """ + + sql "DROP TABLE IF EXISTS test_dynamic_params" + sql """ + CREATE TABLE test_dynamic_params ( + id INT, + source_str VARCHAR(100), + delimiter VARCHAR(20), + count_val INT + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ) + """ + + sql """ + INSERT INTO test_dynamic_params VALUES + (1, 'field1,field2,field3,field4', ',', 2), + (2, 'field1,field2,field3,field4', ',', -1), + (3, 'AAA_01|BBB_02|CCC_03', '|', 2), + (4, 'AAA_01|BBB_02|CCC_03', '|', -2), + (5, '中文分隔符测试分隔符数据', '分隔符', 1), + (6, '中文分隔符测试分隔符数据', '分隔符', -1) + """ + + qt_sql """ + SELECT + id, + source_str, + delimiter, + count_val, + substring_index(source_str, delimiter, count_val) AS result + FROM test_dynamic_params + ORDER BY id + """ + + sql "DROP TABLE IF EXISTS test_substring_index_compat" + sql "DROP TABLE IF EXISTS test_dynamic_params" +} \ No newline at end of file diff --git a/regression-test/suites/function_p0/test_substring_index_simple.groovy b/regression-test/suites/function_p0/test_substring_index_simple.groovy new file mode 100644 index 00000000000..82d97e23782 --- /dev/null +++ b/regression-test/suites/function_p0/test_substring_index_simple.groovy @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_substring_index_simple") { + sql "set enable_nereids_planner=true" + sql "set enable_fallback_to_original_planner=false" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', 1) as result""" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', -1) as result""" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', 2) as result""" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', -2) as result""" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', 'XYZ', 1) as result""" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', 'XYZ', -1) as result""" + + qt_sql """SELECT substring_index('', '|', 1) as result""" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '', 1) as result""" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', 0) as result""" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', 10) as result""" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', -10) as result""" + + qt_sql """SELECT substring_index('AAA_01||BBB_02||CCC_03', '||', 1) as result""" + + qt_sql """SELECT substring_index('AAA_01||BBB_02||CCC_03', '||', -1) as result""" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03', 'BBB', -1) as result""" + + qt_sql """SELECT substring_index('|AAA_01|BBB_02|CCC_03', '|', 1) as result""" + + qt_sql """SELECT substring_index('AAA_01|BBB_02|CCC_03|', '|', -1) as result""" + + qt_sql """SELECT substring_index('北京市|上海市|广州市', '|', 2) as result""" + + qt_sql """SELECT substring_index('北京市分隔符上海市分隔符广州市', '分隔符', 1) as result""" + + qt_sql """SELECT substring_index('北京市分隔符上海市分隔符广州市', '分隔符', -1) as result""" + + qt_sql """SELECT substring_index('hello😀world😀example', '😀', 1) as result""" + + qt_sql """SELECT substring_index('hello😀world😀example', '😀', -1) as result""" + + qt_sql """ + SELECT substring_index('AAA_01|BBB_02|CCC_03', (SELECT '|'), 2) as result + """ + + qt_sql """ + SELECT substring_index('AAA_01|BBB_02|CCC_03', '|', (SELECT 2)) as result + """ + + qt_sql """ + SELECT substring_index('AAA_01|BBB_02|CCC_03', (SELECT '|'), (SELECT 2)) as result + """ + + qt_sql """ + SELECT substring_index('AAA_01|BBB_02|CCC_03', concat('|'), 2) as result + """ + + qt_sql """ + SELECT substring_index('中文_分隔符_测试_分隔符_数据', concat('分', '隔', '符'), 1) as result + """ + + qt_sql """ + SELECT + substring_index('AAA_01|BBB_02|CCC_03', 'BBB', -1) as result1, + substring_index('AAA_01|BBB_02|CCC_03', 'bbb', -1) as result2 + """ +} \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org