github-actions[bot] commented on code in PR #15024: URL: https://github.com/apache/doris/pull/15024#discussion_r1045812914
########## be/src/vec/functions/function_string.h: ########## @@ -1407,6 +1407,179 @@ class FunctionSplitPart : public IFunction { } }; +class FunctionSubstringIndex : public IFunction { +public: + static constexpr auto name = "substring_index"; + static FunctionPtr create() { return std::make_shared<FunctionSubstringIndex>(); } + String get_name() const override { return name; } + size_t get_number_of_arguments() const override { return 3; } + + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { + return make_nullable(std::make_shared<DataTypeString>()); + } + + bool use_default_implementation_for_nulls() const override { return true; } + bool use_default_implementation_for_constants() const override { return false; } + + Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result, size_t input_rows_count) override { + DCHECK_EQ(arguments.size(), 3); + + auto null_map = ColumnUInt8::create(input_rows_count, 0); + // Create a zero column to simply implement + auto const_null_map = ColumnUInt8::create(input_rows_count, 0); + auto res = ColumnString::create(); + + auto& res_offsets = res->get_offsets(); + auto& res_chars = res->get_chars(); + res_offsets.resize(input_rows_count); + + ColumnPtr content_column = + block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); + + if (auto* nullable = check_and_get_column<const ColumnNullable>(*content_column)) { + // Danger: Here must dispose the null map data first! Because + // argument_columns[0]=nullable->get_nested_column_ptr(); will release the mem + // of column nullable mem of null map + VectorizedUtils::update_null_map(null_map->get_data(), nullable->get_null_map_data()); + content_column = nullable->get_nested_column_ptr(); + } + + for (size_t i = 1; i <= 2; i++) { + ColumnPtr columnPtr = remove_nullable(block.get_by_position(arguments[i]).column); + + if (!is_column_const(*columnPtr)) { + return Status::RuntimeError("Argument at index {} for function {} must be constant", + i + 1, get_name()); + } + } + + auto str_col = assert_cast<const ColumnString*>(content_column.get()); + + const IColumn& delimiter_col = *block.get_by_position(arguments[1]).column; + const auto* delimiter_const = typeid_cast<const ColumnConst*>(&delimiter_col); + auto delimiter = delimiter_const->get_field().get<String>(); + int32_t delimiter_size = delimiter.size(); + + const IColumn& part_num_col = *block.get_by_position(arguments[2]).column; + const auto* part_num_col_const = typeid_cast<const ColumnConst*>(&part_num_col); + auto part_number = part_num_col_const->get_field().get<Int32>(); + + if (part_number == 0 || delimiter_size == 0) { + for (size_t i = 0; i < input_rows_count; ++i) { + StringOP::push_empty_string(i, res_chars, res_offsets); + } + } else if (part_number > 0) { + if (delimiter_size == 1) { + // If delimiter is a char, use memchr to split + for (size_t i = 0; i < input_rows_count; ++i) { + auto str = str_col->get_data_at(i); + int32_t offset = -1; + int32_t num = 0; + while (num < part_number) { + size_t n = str.size - offset - 1; + const char* pos = reinterpret_cast<const char*>( + memchr(str.data + offset + 1, delimiter[0], n)); + if (pos != nullptr) { + offset = pos - str.data; + num++; + } else { + offset = str.size; + num = (num == 0) ? 0 : num + 1; + break; + } + } + + if (num == part_number) { + StringOP::push_value_string( + std::string_view {reinterpret_cast<const char*>(str.data), + (size_t)offset}, + i, res_chars, res_offsets); + } else { + StringOP::push_value_string(std::string_view(str.data, str.size), i, + res_chars, res_offsets); + } + } + } else { + // If delimiter is a string, use memmem to split + for (size_t i = 0; i < input_rows_count; ++i) { + auto str = str_col->get_data_at(i); + int32_t offset = -delimiter_size; + int32_t num = 0; + while (num < part_number) { + size_t n = str.size - offset - delimiter_size; + char* pos = reinterpret_cast<char*>( + memmem(str.data + offset + delimiter_size, n, delimiter.c_str(), + delimiter_size)); + if (pos != nullptr) { + offset = pos - str.data; + num++; + } else { + offset = str.size; + num = (num == 0) ? 0 : num + 1; + break; + } + } + + if (num == part_number) { + StringOP::push_value_string( + std::string_view {reinterpret_cast<const char*>(str.data), + (size_t)offset}, + i, res_chars, res_offsets); + } else { + StringOP::push_value_string(std::string_view(str.data, str.size), i, + res_chars, res_offsets); + } + } + } + } else { + // if part_number is negative + part_number = -part_number; + for (size_t i = 0; i < input_rows_count; ++i) { + auto str = str_col->get_data_at(i); + auto str_str = str.to_string(); + int32_t offset = str.size; + int32_t pre_offset = offset; + int32_t num = 0; + auto substr = str_str; + while (num <= part_number && offset >= 0) { + offset = (int)substr.rfind(delimiter, offset); + if (offset != -1) { + if (++num == part_number) { + break; + } + pre_offset = offset; + offset = offset - 1; + substr = str_str.substr(0, pre_offset); + } else { + break; + } + } + num = (offset == -1 && num != 0) ? num + 1 : num; + + if (num == part_number) { + if (offset == -1) { + StringOP::push_value_string(std::string_view(str.data, str.size), i, + res_chars, res_offsets); + } else { + StringOP::push_value_string( + std::string_view {str.data + offset + delimiter_size, + str.size - offset - delimiter_size}, + i, res_chars, res_offsets); + } + } else { + StringOP::push_value_string(std::string_view(str.data, str.size), i, res_chars, + res_offsets); + } + } + } + + block.get_by_position(result).column = + ColumnNullable::create(std::move(res), std::move(null_map)); + return Status::OK(); + } +}; + class FunctionSplitByString : public IFunction { Review Comment: warning: static data member 'name' not allowed in local class 'FunctionSplitByString' [clang-diagnostic-error] ```cpp static constexpr auto name = "split_by_string"; ^ ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org