Copilot commented on code in PR #50367: URL: https://github.com/apache/doris/pull/50367#discussion_r2057299387
########## be/src/vec/functions/function_regexp.cpp: ########## @@ -286,100 +289,106 @@ struct RegexpExtractImpl { struct RegexpExtractAllImpl { static constexpr auto name = "regexp_extract_all"; - size_t get_number_of_arguments() const { return 2; } - - static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[], - size_t input_rows_count, ColumnString::Chars& result_data, - ColumnString::Offsets& result_offset, NullMap& null_map) { + template <bool first_const, bool second_const, bool third_const> + static void execute_impl(FunctionContext* context, const ColumnPtr* argument_columns, + size_t input_rows_count, ColumnArray::MutablePtr& result_column) { const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); - for (int i = 0; i < input_rows_count; ++i) { - if (null_map[i]) { - StringOP::push_null_string(i, result_data, result_offset, null_map); - continue; + const auto* group_idx_col = check_and_get_column<ColumnInt32>(argument_columns[2].get()); + + auto& result_array_col = assert_cast<ColumnArray&>(*result_column); + if constexpr (second_const && third_const) { + auto* re = reinterpret_cast<re2::RE2*>( + context->get_function_state(FunctionContext::THREAD_LOCAL)); + if (re != nullptr) { + auto group_idx = group_idx_col->get_int(0); + + if (re->NumberOfCapturingGroups() < group_idx) { + result_array_col.insert_many_defaults(input_rows_count); + return; + } } - _execute_inner_loop<false>(context, str_col, pattern_col, result_data, result_offset, - null_map, i); } - } - static void execute_impl_const_args(FunctionContext* context, ColumnPtr argument_columns[], - size_t input_rows_count, ColumnString::Chars& result_data, - ColumnString::Offsets& result_offset, NullMap& null_map) { - const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); - const auto* pattern_col = 
check_and_get_column<ColumnString>(argument_columns[1].get()); + auto& column_nullable = assert_cast<ColumnNullable&>(result_array_col.get_data()); + auto& null_map = column_nullable.get_null_map_data(); + auto& column_string = assert_cast<ColumnString&>(column_nullable.get_nested_column()); + auto& offsets = result_array_col.get_offsets(); + for (int i = 0; i < input_rows_count; ++i) { - if (null_map[i]) { - StringOP::push_null_string(i, result_data, result_offset, null_map); - continue; - } - _execute_inner_loop<true>(context, str_col, pattern_col, result_data, result_offset, - null_map, i); + _execute_inner_loop<first_const, second_const, third_const>( + context, str_col, pattern_col, group_idx_col, i, column_string, null_map, + offsets); } } - template <bool Const> + + template <bool first_const, bool second_const, bool third_const> static void _execute_inner_loop(FunctionContext* context, const ColumnString* str_col, const ColumnString* pattern_col, - ColumnString::Chars& result_data, - ColumnString::Offsets& result_offset, NullMap& null_map, - const size_t index_now) { - re2::RE2* re = reinterpret_cast<re2::RE2*>( + const ColumnInt32* group_idx_col, const size_t index_now, + ColumnString& result_string_column, NullMap& null_map, + ColumnArray::Offsets64& result_offsets) { + auto* re = reinterpret_cast<re2::RE2*>( context->get_function_state(FunctionContext::THREAD_LOCAL)); std::unique_ptr<re2::RE2> scoped_re; + if (re == nullptr) { std::string error_str; - const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); + const auto& pattern = + pattern_col->get_data_at(index_check_const(index_now, second_const)); bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), scoped_re); if (!st) { context->add_warning(error_str.c_str()); - StringOP::push_null_string(index_now, result_data, result_offset, null_map); + null_map.push_back(1); + result_string_column.insert_default(); + 
result_offsets.emplace_back(result_offsets.back() + 1); return; } re = scoped_re.get(); } - if (re->NumberOfCapturingGroups() == 0) { - StringOP::push_empty_string(index_now, result_data, result_offset); + + auto group_idx = group_idx_col->get_element(index_check_const(index_now, third_const)); + + if (re->NumberOfCapturingGroups() < group_idx || group_idx < 0) { + result_offsets.emplace_back(result_offsets.back()); return; } - const auto& str = str_col->get_data_at(index_now); - int max_matches = 1 + re->NumberOfCapturingGroups(); + + const auto& str = str_col->get_data_at(index_check_const(index_now, first_const)); + int max_matches = 1 + group_idx; std::vector<re2::StringPiece> res_matches; size_t pos = 0; while (pos < str.size) { - auto str_pos = str.data + pos; + const auto* str_pos = str.data + pos; auto str_size = str.size - pos; re2::StringPiece str_sp = re2::StringPiece(str_pos, str_size); std::vector<re2::StringPiece> matches(max_matches); - bool success = - re->Match(str_sp, 0, str_size, re2::RE2::UNANCHORED, &matches[0], max_matches); + bool success = re->Match(str_sp, 0, str_size, re2::RE2::UNANCHORED, matches.data(), + max_matches); if (!success) { - StringOP::push_empty_string(index_now, result_data, result_offset); break; } + if (matches[0].empty()) { - StringOP::push_empty_string(index_now, result_data, result_offset); pos += 1; continue; } - res_matches.push_back(matches[1]); + + res_matches.push_back(matches[group_idx]); auto offset = std::string(str_pos, str_size).find(std::string(matches[0].as_string())); pos += offset + matches[0].size(); } if (res_matches.empty()) { - StringOP::push_empty_string(index_now, result_data, result_offset); + result_offsets.emplace_back(result_offsets.back()); return; } - std::string res = "["; - for (int j = 0; j < res_matches.size(); ++j) { - res += "'" + res_matches[j].as_string() + "'"; - if (j < res_matches.size() - 1) { - res += ","; - } + for (auto res_matche : res_matches) { + 
 result_string_column.insert_data(res_matche.data(), res_matche.size());

Review Comment:
[nitpick] Possible typo: the loop variable 'res_matche' was probably meant to be 'res_match'; renaming it improves clarity.

```suggestion
        for (auto res_match : res_matches) {
            result_string_column.insert_data(res_match.data(), res_match.size());
```

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For queries about this service, please contact Infrastructure at: us...@infra.apache.org

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org