Yukang-Lian commented on code in PR #13741: URL: https://github.com/apache/doris/pull/13741#discussion_r1025333709
########## be/src/vec/functions/function_string.h: ########## @@ -1159,6 +1160,141 @@ class FunctionSplitPart : public IFunction { } }; +class FunctionSplitByString : public IFunction { +private: + void get_offsets_and_len(const std::string& s, const std::string& c, std::vector<size_t>& v_offset, + std::vector<size_t>& v_charlen) { + v_offset.clear(); + v_charlen.clear(); + if (c.size() == 0) { + for (int i = 0; i < s.size(); i++) { + v_offset.push_back(i); + v_charlen.push_back(1); + } + } else if (c.size() < s.size()) { + string::size_type start = 0, end = s.size() - c.size(); + size_t end_delimiter_num = 0; + while (start < s.size() && start == s.find(c, start)) { + v_charlen.push_back(0); + v_offset.push_back(start); + start += c.size(); + } + if (start > s.size() - 1) { + return; + } + while (start < end && end == s.find(c, end)) { + end_delimiter_num++; + end -= c.size(); + } + string::size_type pos1 = start, pos2 = s.find(c, start); + while (pos2 < end + c.size()) { + v_offset.push_back(pos1); + v_charlen.push_back(pos2 - pos1); + pos1 = pos2 + c.size(); + pos2 = s.find(c, pos1); + } + v_offset.push_back(pos1); + v_charlen.push_back(s.size() - end_delimiter_num * c.size() - pos1); + + while (end_delimiter_num > 0) { + v_charlen.push_back(0); + v_offset.push_back(s.size() - end_delimiter_num * c.size()); + end_delimiter_num--; + } + } else { + v_offset.push_back(0); + v_charlen.push_back(s.size()); + } + } + +public: + static constexpr auto name = "split_by_string"; + + static FunctionPtr create() { return std::make_shared<FunctionSplitByString>(); } + using NullMapType = PaddedPODArray<UInt8>; + + String get_name() const override { return name; } + + bool is_variadic() const override { return false; } + + size_t get_number_of_arguments() const override { return 2; } + + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { + return std::make_shared<DataTypeArray>(make_nullable(arguments[0])); + } + + Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result, size_t input_rows_count) override { + DCHECK_EQ(arguments.size(), 2); + + ColumnPtr src_column = + block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); + ColumnPtr delimiter_column = + block.get_by_position(arguments[1]).column->convert_to_full_column_if_const(); + + DataTypePtr src_column_type = block.get_by_position(arguments[0]).type; + auto dest_column_ptr = ColumnArray::create(make_nullable(src_column_type)->create_column(), + ColumnArray::ColumnOffsets::create()); + + IColumn* dest_nested_column = &dest_column_ptr->get_data(); + auto& dest_offsets = dest_column_ptr->get_offsets(); + DCHECK(dest_nested_column != nullptr); + dest_nested_column->reserve(0); + dest_offsets.reserve(0); + + NullMapType* dest_nested_null_map = nullptr; + if (dest_nested_column->is_nullable()) { + ColumnNullable* dest_nullable_col = + reinterpret_cast<ColumnNullable*>(dest_nested_column); + dest_nested_column = dest_nullable_col->get_nested_column_ptr(); + dest_nested_null_map = &dest_nullable_col->get_null_map_column().get_data(); + } + + _execute(*src_column, *delimiter_column, *dest_nested_column, dest_offsets, + dest_nested_null_map); + block.replace_by_position(result, std::move(dest_column_ptr)); + return Status::OK(); + } + + void _execute(const IColumn& src_column, const IColumn& delimiter_column, + IColumn& dest_nested_column, ColumnArray::Offsets64& dest_offsets, + NullMapType* dest_nested_null_map) { + ColumnString& dest_column_string = reinterpret_cast<ColumnString&>(dest_nested_column); + ColumnString::Chars& column_string_chars = dest_column_string.get_chars(); + ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets(); + column_string_chars.reserve(0); + + ColumnArray::Offset64 string_pos = 0; + ColumnArray::Offset64 dest_pos = 0; + const ColumnString* src_column_string = reinterpret_cast<const ColumnString*>(&src_column); + ColumnArray::Offset64 src_offsets_size = src_column_string->get_offsets().size(); + + for (size_t i = 0; i < src_offsets_size; i++) { + const auto delimiter = delimiter_column.get_data_at(i).to_string(); + const auto str = src_column_string->get_data_at(i).to_string(); + StringRef str_ref = src_column_string->get_data_at(i); + if (str.size() == 0) { + dest_offsets.push_back(dest_pos); + continue; + } + vector<size_t> v_len; + vector<size_t> v_offset; + getOffsetsAndLen(str, delimiter, v_offset, v_len); Review Comment: Do you mean `get_offsets_and_len`? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org