This is an automated email from the ASF dual-hosted git repository. xuyang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 5793cb11d0 [feature-wip] (array-type) function concat_ws support array (#10749) 5793cb11d0 is described below commit 5793cb11d07e8d46fd4675820a661d36b623d9ac Author: zxealous <xealous0...@gmail.com> AuthorDate: Sun Jul 17 17:50:39 2022 +0800 [feature-wip] (array-type) function concat_ws support array (#10749) Issue #10052 function concat_ws support array --- be/src/vec/functions/function_string.h | 147 ++++++++++++++++++--- be/test/vec/function/function_string_test.cpp | 17 +++ .../sql-functions/string-functions/concat_ws.md | 30 ++++- .../sql-functions/string-functions/concat_ws.md | 28 +++- gensrc/script/doris_builtins_functions.py | 3 + .../string_functions/test_string_function.out | 12 ++ .../string_functions/test_string_function.groovy | 4 + 7 files changed, 215 insertions(+), 26 deletions(-) diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h index c545223a23..a7df35b1ed 100644 --- a/be/src/vec/functions/function_string.h +++ b/be/src/vec/functions/function_string.h @@ -30,12 +30,14 @@ #include "util/md5.h" #include "util/sm3.h" #include "util/url_parser.h" +#include "vec/columns/column_array.h" #include "vec/columns/column_decimal.h" #include "vec/columns/column_nullable.h" #include "vec/columns/column_string.h" #include "vec/columns/columns_number.h" #include "vec/common/assert_cast.h" #include "vec/common/string_ref.h" +#include "vec/data_types/data_type_array.h" #include "vec/data_types/data_type_decimal.h" #include "vec/data_types/data_type_nullable.h" #include "vec/data_types/data_type_number.h" @@ -491,10 +493,13 @@ public: } }; -// concat_ws (string,string....) +// concat_ws (string,string....) 
or (string, Array) // TODO: avoid use fmtlib class FunctionStringConcatWs : public IFunction { public: + using Chars = ColumnString::Chars; + using Offsets = ColumnString::Offsets; + static constexpr auto name = "concat_ws"; static FunctionPtr create() { return std::make_shared<FunctionStringConcatWs>(); } String get_name() const override { return name; } @@ -520,8 +525,8 @@ public: auto res = ColumnString::create(); bool is_null_type = block.get_by_position(arguments[0]).type.get()->is_nullable(); size_t argument_size = arguments.size(); - std::vector<const ColumnString::Offsets*> offsets_list(argument_size); - std::vector<const ColumnString::Chars*> chars_list(argument_size); + std::vector<const Offsets*> offsets_list(argument_size); + std::vector<const Chars*> chars_list(argument_size); std::vector<const ColumnUInt8::Container*> null_list(argument_size); ColumnPtr argument_columns[argument_size]; @@ -540,6 +545,11 @@ public: } else { null_list[i] = &const_null_map->get_data(); } + + if (check_column<ColumnArray>(argument_columns[i].get())) { + continue; + } + auto col_str = assert_cast<const ColumnString*>(argument_columns[i].get()); offsets_list[i] = &col_str->get_offsets(); chars_list[i] = &col_str->get_chars(); @@ -553,20 +563,126 @@ public: fmt::memory_buffer buffer; std::vector<std::string_view> views; + if (check_column<ColumnArray>(argument_columns[1].get())) { + // Determine if the nested type of the array is String + const ColumnArray& array_column = + reinterpret_cast<const ColumnArray&>(*argument_columns[1]); + if (!array_column.get_data().is_column_string()) { + return Status::NotSupported( + fmt::format("unsupported nested array of type {} for function {}", + is_column_nullable(array_column.get_data()) + ? 
array_column.get_data().get_name() + : array_column.get_data().get_family_name(), + get_name())); + } + // Concat string in array + _execute_array(input_rows_count, array_column, buffer, views, offsets_list, chars_list, + null_list, res_data, res_offset); + + } else { + // Concat string + _execute_string(input_rows_count, argument_size, buffer, views, offsets_list, + chars_list, null_list, res_data, res_offset); + } + if (is_null_type) { + block.get_by_position(result).column = + ColumnNullable::create(std::move(res), std::move(null_map)); + } else { + block.get_by_position(result).column = std::move(res); + } + return Status::OK(); + } + +private: + void _execute_array(const size_t& input_rows_count, const ColumnArray& array_column, + fmt::memory_buffer& buffer, std::vector<std::string_view>& views, + const std::vector<const Offsets*>& offsets_list, + const std::vector<const Chars*>& chars_list, + const std::vector<const ColumnUInt8::Container*>& null_list, + Chars& res_data, Offsets& res_offset) { + // Get array nested column + const UInt8* array_nested_null_map = nullptr; + ColumnPtr array_nested_column = nullptr; + + if (is_column_nullable(array_column.get_data())) { + const auto& array_nested_null_column = + reinterpret_cast<const ColumnNullable&>(array_column.get_data()); + // String's null map in array + array_nested_null_map = + array_nested_null_column.get_null_map_column().get_data().data(); + array_nested_column = array_nested_null_column.get_nested_column_ptr(); + } else { + array_nested_column = array_column.get_data_ptr(); + } + + const auto& string_column = reinterpret_cast<const ColumnString&>(*array_nested_column); + const Chars& string_src_chars = string_column.get_chars(); + const Offsets& src_string_offsets = string_column.get_offsets(); + const Offsets& src_array_offsets = array_column.get_offsets(); + ColumnArray::Offset current_src_array_offset = 0; + + // Concat string in array for (size_t i = 0; i < input_rows_count; ++i) { - auto& 
seq_offsets = *offsets_list[0]; - auto& seq_chars = *chars_list[0]; - auto& seq_nullmap = *null_list[0]; - if (seq_nullmap[i]) { - res_data.push_back('\0'); + auto& sep_offsets = *offsets_list[0]; + auto& sep_chars = *chars_list[0]; + auto& sep_nullmap = *null_list[0]; + + if (sep_nullmap[i]) { res_offset[i] = res_data.size(); + current_src_array_offset += src_array_offsets[i] - src_array_offsets[i - 1]; continue; } - int seq_size = seq_offsets[i] - seq_offsets[i - 1] - 1; - const char* seq_data = reinterpret_cast<const char*>(&seq_chars[seq_offsets[i - 1]]); + int sep_size = sep_offsets[i] - sep_offsets[i - 1] - 1; + const char* sep_data = reinterpret_cast<const char*>(&sep_chars[sep_offsets[i - 1]]); - std::string_view seq(seq_data, seq_size); + std::string_view sep(sep_data, sep_size); + buffer.clear(); + views.clear(); + + for (auto next_src_array_offset = src_array_offsets[i]; + current_src_array_offset < next_src_array_offset; ++current_src_array_offset) { + const auto current_src_string_offset = + current_src_array_offset ? 
src_string_offsets[current_src_array_offset - 1] + : 0; + size_t bytes_to_copy = src_string_offsets[current_src_array_offset] - + current_src_string_offset - 1; + const char* ptr = + reinterpret_cast<const char*>(&string_src_chars[current_src_string_offset]); + + if (array_nested_null_map == nullptr || + !array_nested_null_map[current_src_array_offset]) { + views.emplace_back(ptr, bytes_to_copy); + } + } + + fmt::format_to(buffer, "{}", fmt::join(views, sep)); + + StringOP::push_value_string(std::string_view(buffer.data(), buffer.size()), i, res_data, + res_offset); + } + } + + void _execute_string(const size_t& input_rows_count, const size_t& argument_size, + fmt::memory_buffer& buffer, std::vector<std::string_view>& views, + const std::vector<const Offsets*>& offsets_list, + const std::vector<const Chars*>& chars_list, + const std::vector<const ColumnUInt8::Container*>& null_list, + Chars& res_data, Offsets& res_offset) { + // Concat string + for (size_t i = 0; i < input_rows_count; ++i) { + auto& sep_offsets = *offsets_list[0]; + auto& sep_chars = *chars_list[0]; + auto& sep_nullmap = *null_list[0]; + if (sep_nullmap[i]) { + res_offset[i] = res_data.size(); + continue; + } + + int sep_size = sep_offsets[i] - sep_offsets[i - 1] - 1; + const char* sep_data = reinterpret_cast<const char*>(&sep_chars[sep_offsets[i - 1]]); + + std::string_view sep(sep_data, sep_size); buffer.clear(); views.clear(); for (size_t j = 1; j < argument_size; ++j) { @@ -580,17 +696,10 @@ public: views.emplace_back(ptr, size); } } - fmt::format_to(buffer, "{}", fmt::join(views, seq)); + fmt::format_to(buffer, "{}", fmt::join(views, sep)); StringOP::push_value_string(std::string_view(buffer.data(), buffer.size()), i, res_data, res_offset); } - if (is_null_type) { - block.get_by_position(result).column = - ColumnNullable::create(std::move(res), std::move(null_map)); - } else { - block.get_by_position(result).column = std::move(res); - } - return Status::OK(); } }; diff --git 
a/be/test/vec/function/function_string_test.cpp b/be/test/vec/function/function_string_test.cpp index 60cb797aae..2d259014be 100644 --- a/be/test/vec/function/function_string_test.cpp +++ b/be/test/vec/function/function_string_test.cpp @@ -387,6 +387,23 @@ TEST(function_string_test, function_concat_ws_test) { check_function<DataTypeString, true>(func_name, input_types, data_set); }; + + { + InputTypeSet input_types = {TypeIndex::String, TypeIndex::Array, TypeIndex::String}; + + Array vec1 = {Field("", 0), Field("", 0), Field("", 0)}; + Array vec2 = {Field("123", 3), Field("456", 3), Field("789", 3)}; + Array vec3 = {Field("", 0), Field("?", 1), Field("", 0)}; + Array vec4 = {Field("abc", 3), Field("", 0), Field("def", 3)}; + Array vec5 = {Field("abc", 3), Field("def", 3), Field("ghi", 3)}; + DataSet data_set = {{{std::string("-"), vec1}, std::string("--")}, + {{std::string(""), vec2}, std::string("123456789")}, + {{std::string("-"), vec3}, std::string("-?-")}, + {{Null(), vec4}, Null()}, + {{std::string("-"), vec5}, std::string("abc-def-ghi")}}; + + check_function<DataTypeString, true>(func_name, input_types, data_set); + }; } TEST(function_string_test, function_null_or_empty_test) { diff --git a/docs/en/docs/sql-manual/sql-functions/string-functions/concat_ws.md b/docs/en/docs/sql-manual/sql-functions/string-functions/concat_ws.md index 80a0c11dee..a8a4f57d4b 100644 --- a/docs/en/docs/sql-manual/sql-functions/string-functions/concat_ws.md +++ b/docs/en/docs/sql-manual/sql-functions/string-functions/concat_ws.md @@ -28,12 +28,13 @@ under the License. ### Description #### Syntax -`VARCHAR concat ws (VARCHAR sep, VARCHAR str,...)` +`VARCHAR concat_ws (VARCHAR sep, VARCHAR str,...)` +`VARCHAR concat_ws(VARCHAR sep, ARRAY array)` -Using the first parameter SEP as a connector, the second parameter and all subsequent parameters are spliced into a string. 
+Using the first parameter SEP as a connector, the second parameter and all subsequent parameters (or all strings in an ARRAY) are spliced into a string. If the separator is NULL, return NULL. -` The concat_ws` function does not skip empty strings, but NULL values. +The `concat_ws` function does not skip empty strings, but it does skip NULL values. ### example @@ -58,6 +59,27 @@ mysql> select concat_ws("or", "d", NULL,"is"); +---------------------------------+ | doris | +---------------------------------+ + +mysql> select concat_ws("or", ["d", "is"]); ++-----------------------------------+ +| concat_ws('or', ARRAY('d', 'is')) | ++-----------------------------------+ +| doris | ++-----------------------------------+ + +mysql> select concat_ws(NULL, ["d", "is"]); ++-----------------------------------+ +| concat_ws(NULL, ARRAY('d', 'is')) | ++-----------------------------------+ +| NULL | ++-----------------------------------+ + +mysql> select concat_ws("or", ["d", NULL,"is"]); ++-----------------------------------------+ +| concat_ws('or', ARRAY('d', NULL, 'is')) | ++-----------------------------------------+ +| doris | ++-----------------------------------------+ ``` ### keywords -CONCAT_WS,CONCAT,WS +CONCAT_WS,CONCAT,WS,ARRAY diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/concat_ws.md b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/concat_ws.md index bcebf9f4a6..a21d1b3ba5 100644 --- a/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/concat_ws.md +++ b/docs/zh-CN/docs/sql-manual/sql-functions/string-functions/concat_ws.md @@ -29,11 +29,12 @@ under the License. #### Syntax `VARCHAR concat_ws(VARCHAR sep, VARCHAR str,...)` +`VARCHAR concat_ws(VARCHAR sep, ARRAY array)` -使用第一个参数 sep 作为连接符,将第二个参数以及后续所有参数拼接成一个字符串. 
+使用第一个参数 sep 作为连接符,将第二个参数以及后续所有参数(或ARRAY中的所有字符串)拼接成一个字符串。 如果分隔符是 NULL,返回 NULL。 -`concat_ws`函数不会跳过空字符串,会跳过 NULL 值 +`concat_ws`函数不会跳过空字符串,会跳过 NULL 值。 ### example @@ -58,6 +59,27 @@ mysql> select concat_ws("or", "d", NULL,"is"); +---------------------------------+ | doris | +---------------------------------+ + +mysql> select concat_ws("or", ["d", "is"]); ++-----------------------------------+ +| concat_ws('or', ARRAY('d', 'is')) | ++-----------------------------------+ +| doris | ++-----------------------------------+ + +mysql> select concat_ws(NULL, ["d", "is"]); ++-----------------------------------+ +| concat_ws(NULL, ARRAY('d', 'is')) | ++-----------------------------------+ +| NULL | ++-----------------------------------+ + +mysql> select concat_ws("or", ["d", NULL,"is"]); ++-----------------------------------------+ +| concat_ws('or', ARRAY('d', NULL, 'is')) | ++-----------------------------------------+ +| doris | ++-----------------------------------------+ ``` ### keywords -CONCAT_WS,CONCAT,WS +CONCAT_WS,CONCAT,WS,ARRAY diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py index 3883e7b1ac..249638831b 100755 --- a/gensrc/script/doris_builtins_functions.py +++ b/gensrc/script/doris_builtins_functions.py @@ -1018,6 +1018,9 @@ visible_functions = [ [['concat_ws'], 'VARCHAR', ['VARCHAR', 'VARCHAR', '...'], '_ZN5doris15StringFunctions9concat_wsEPN9doris_udf' '15FunctionContextERKNS1_9StringValEiPS5_', '', '', 'vec', 'CUSTOM'], + [['concat_ws'], 'VARCHAR', ['VARCHAR', 'ARRAY_VARCHAR'], + '_ZN5doris15StringFunctions9concat_wsEPN9doris_udf' + '15FunctionContextERKNS1_9StringValEiPS5_', '', '', 'vec', 'CUSTOM'], [['find_in_set'], 'INT', ['VARCHAR', 'VARCHAR'], '_ZN5doris15StringFunctions11find_in_setEPN9doris_udf' '15FunctionContextERKNS1_9StringValES6_', '', '', 'vec', ''], diff --git a/regression-test/data/query/sql_functions/string_functions/test_string_function.out 
b/regression-test/data/query/sql_functions/string_functions/test_string_function.out index 70c7d33202..61e0295054 100644 --- a/regression-test/data/query/sql_functions/string_functions/test_string_function.out +++ b/regression-test/data/query/sql_functions/string_functions/test_string_function.out @@ -41,6 +41,18 @@ doris -- !sql -- doris +-- !sql -- +doris + +-- !sql -- +\N + +-- !sql -- +doris + +-- !sql -- +dororis + -- !sql -- true diff --git a/regression-test/suites/query/sql_functions/string_functions/test_string_function.groovy b/regression-test/suites/query/sql_functions/string_functions/test_string_function.groovy index 00351890ef..8ecbc0e3d5 100644 --- a/regression-test/suites/query/sql_functions/string_functions/test_string_function.groovy +++ b/regression-test/suites/query/sql_functions/string_functions/test_string_function.groovy @@ -38,6 +38,10 @@ suite("test_string_function", "query") { qt_sql "select concat_ws(\"or\", \"d\", \"is\");" qt_sql "select concat_ws(NULL, \"d\", \"is\");" qt_sql "select concat_ws(\"or\", \"d\", NULL,\"is\");" + qt_sql "select concat_ws(\"or\", [\"d\", \"is\"]);" + qt_sql "select concat_ws(NULL, [\"d\", \"is\"]);" + qt_sql "select concat_ws(\"or\", [\"d\", NULL,\"is\"]);" + qt_sql "select concat_ws(\"or\", [\"d\", \"\",\"is\"]);" qt_sql "select ends_with(\"Hello doris\", \"doris\");" qt_sql "select ends_with(\"Hello doris\", \"Hello\");" --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org