This is an automated email from the ASF dual-hosted git repository.
zclll pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new d1e4294041d [refine](function) Optimize the execution of the left
function for UTF-8 characters. (#55002)
d1e4294041d is described below
commit d1e4294041de47823a31dba14ad4d07879a2e1c7
Author: Mryange <[email protected]>
AuthorDate: Mon Aug 25 15:00:30 2025 +0800
[refine](function) Optimize the execution of the left function for UTF-8
characters. (#55002)
### What problem does this PR solve?
Previously, the left function was implemented by delegating to the
substring function with a constant start position of 1:
left(str, len) ---> substring(str, 1, len)
However, because substring also accepts negative indices (which count
from the end of the string), it has to traverse the entire string.
The left function, by contrast, only needs to traverse at most len
characters.
---
be/src/vec/functions/function_string.h | 70 +++++++++++++++++++++++++++++-----
1 file changed, 61 insertions(+), 9 deletions(-)
diff --git a/be/src/vec/functions/function_string.h
b/be/src/vec/functions/function_string.h
index 2e64a4ed3e4..a6ed907b4e9 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -672,18 +672,70 @@ public:
Status execute_impl(FunctionContext* context, Block& block, const
ColumnNumbers& arguments,
uint32_t result, size_t input_rows_count) const
override {
- auto int_type = std::make_shared<DataTypeInt32>();
- size_t num_columns_without_result = block.columns();
- block.insert({int_type->create_column_const(input_rows_count,
to_field<TYPE_INT>(1)),
- int_type, "const 1"});
- ColumnNumbers temp_arguments(3);
- temp_arguments[0] = arguments[0];
- temp_arguments[1] = num_columns_without_result;
- temp_arguments[2] = arguments[1];
+ DCHECK_EQ(arguments.size(), 2);
+ auto res = ColumnString::create();
+ bool col_const[2];
+ ColumnPtr argument_columns[2];
+ for (int i = 0; i < 2; ++i) {
+ std::tie(argument_columns[i], col_const[i]) =
+
unpack_if_const(block.get_by_position(arguments[i]).column);
+ }
- SubstringUtil::substring_execute(block, temp_arguments, result,
input_rows_count);
+ const auto& str_col = assert_cast<const
ColumnString&>(*argument_columns[0]);
+ const auto& len_col = assert_cast<const
ColumnInt32&>(*argument_columns[1]);
+ const auto is_ascii = str_col.is_ascii();
+
+ std::visit(
+ [&](auto is_ascii, auto str_const, auto len_const) {
+ _execute<is_ascii, str_const, len_const>(str_col, len_col,
*res,
+ input_rows_count);
+ },
+ vectorized::make_bool_variant(is_ascii),
+ vectorized::make_bool_variant(col_const[0]),
+ vectorized::make_bool_variant(col_const[1]));
+
+ block.get_by_position(result).column = std::move(res);
return Status::OK();
}
+
+ template <bool is_ascii, bool str_const, bool len_const>
+ static void _execute(const ColumnString& str_col, const ColumnInt32&
len_col, ColumnString& res,
+ size_t size) {
+ auto& res_chars = res.get_chars();
+ auto& res_offsets = res.get_offsets();
+ res_offsets.resize(size);
+ const auto& len_data = len_col.get_data();
+
+ if constexpr (str_const) {
+ res_chars.reserve(size * (str_col.get_chars().size()));
+ } else {
+ res_chars.reserve(str_col.get_chars().size());
+ }
+
+ for (int i = 0; i < size; ++i) {
+ auto str = str_col.get_data_at(index_check_const<str_const>(i));
+ int len = len_data[index_check_const<len_const>(i)];
+ if (len <= 0 || str.empty()) {
+ StringOP::push_empty_string(i, res_chars, res_offsets);
+ continue;
+ }
+
+ const char* begin = str.begin();
+ const char* p = begin;
+
+ if constexpr (is_ascii) {
+ p = begin + std::min(len, static_cast<int>(str.size));
+ } else {
+ const char* end = str.end();
+ for (size_t i = 0, char_size = 0; i < len && p < end; ++i, p
+= char_size) {
+ char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)];
+ }
+ }
+
+ StringOP::push_value_string_reserved_and_allow_overflow({begin,
p}, i, res_chars,
+
res_offsets);
+ }
+ }
};
class FunctionRight : public IFunction {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]