This is an automated email from the ASF dual-hosted git repository.

zclll pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new d1e4294041d [refine](function) Optimize the execution of the left function for UTF-8 characters. (#55002)
d1e4294041d is described below

commit d1e4294041de47823a31dba14ad4d07879a2e1c7
Author: Mryange <[email protected]>
AuthorDate: Mon Aug 25 15:00:30 2025 +0800

    [refine](function) Optimize the execution of the left function for UTF-8 characters. (#55002)
    
    ### What problem does this PR solve?
    
    Previously, the left function was rewritten into a call to the
    substring function:
    left(str, len) ---> substring(str, 1, len)
    
    However, because substring also accepts negative indices, it has to
    traverse the entire string.
    The left function only needs to traverse at most len characters from
    the start of the string, so it now has its own implementation.
---
 be/src/vec/functions/function_string.h | 70 +++++++++++++++++++++++++++++-----
 1 file changed, 61 insertions(+), 9 deletions(-)

diff --git a/be/src/vec/functions/function_string.h 
b/be/src/vec/functions/function_string.h
index 2e64a4ed3e4..a6ed907b4e9 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -672,18 +672,70 @@ public:
 
     Status execute_impl(FunctionContext* context, Block& block, const 
ColumnNumbers& arguments,
                         uint32_t result, size_t input_rows_count) const 
override {
-        auto int_type = std::make_shared<DataTypeInt32>();
-        size_t num_columns_without_result = block.columns();
-        block.insert({int_type->create_column_const(input_rows_count, 
to_field<TYPE_INT>(1)),
-                      int_type, "const 1"});
-        ColumnNumbers temp_arguments(3);
-        temp_arguments[0] = arguments[0];
-        temp_arguments[1] = num_columns_without_result;
-        temp_arguments[2] = arguments[1];
+        DCHECK_EQ(arguments.size(), 2);
+        auto res = ColumnString::create();
+        bool col_const[2];
+        ColumnPtr argument_columns[2];
+        for (int i = 0; i < 2; ++i) {
+            std::tie(argument_columns[i], col_const[i]) =
+                    
unpack_if_const(block.get_by_position(arguments[i]).column);
+        }
 
-        SubstringUtil::substring_execute(block, temp_arguments, result, 
input_rows_count);
+        const auto& str_col = assert_cast<const 
ColumnString&>(*argument_columns[0]);
+        const auto& len_col = assert_cast<const 
ColumnInt32&>(*argument_columns[1]);
+        const auto is_ascii = str_col.is_ascii();
+
+        std::visit(
+                [&](auto is_ascii, auto str_const, auto len_const) {
+                    _execute<is_ascii, str_const, len_const>(str_col, len_col, 
*res,
+                                                             input_rows_count);
+                },
+                vectorized::make_bool_variant(is_ascii),
+                vectorized::make_bool_variant(col_const[0]),
+                vectorized::make_bool_variant(col_const[1]));
+
+        block.get_by_position(result).column = std::move(res);
         return Status::OK();
     }
+
+    template <bool is_ascii, bool str_const, bool len_const>
+    static void _execute(const ColumnString& str_col, const ColumnInt32& 
len_col, ColumnString& res,
+                         size_t size) {
+        auto& res_chars = res.get_chars();
+        auto& res_offsets = res.get_offsets();
+        res_offsets.resize(size);
+        const auto& len_data = len_col.get_data();
+
+        if constexpr (str_const) {
+            res_chars.reserve(size * (str_col.get_chars().size()));
+        } else {
+            res_chars.reserve(str_col.get_chars().size());
+        }
+
+        for (int i = 0; i < size; ++i) {
+            auto str = str_col.get_data_at(index_check_const<str_const>(i));
+            int len = len_data[index_check_const<len_const>(i)];
+            if (len <= 0 || str.empty()) {
+                StringOP::push_empty_string(i, res_chars, res_offsets);
+                continue;
+            }
+
+            const char* begin = str.begin();
+            const char* p = begin;
+
+            if constexpr (is_ascii) {
+                p = begin + std::min(len, static_cast<int>(str.size));
+            } else {
+                const char* end = str.end();
+                for (size_t i = 0, char_size = 0; i < len && p < end; ++i, p 
+= char_size) {
+                    char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)];
+                }
+            }
+
+            StringOP::push_value_string_reserved_and_allow_overflow({begin, 
p}, i, res_chars,
+                                                                    
res_offsets);
+        }
+    }
 };
 
 class FunctionRight : public IFunction {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to