HappenLee commented on code in PR #49231:
URL: https://github.com/apache/doris/pull/49231#discussion_r2004682099


##########
be/src/vec/functions/function_string.cpp:
##########
@@ -448,20 +451,61 @@ struct TransferImpl {
             return Status::OK();
         }
 
+        const bool is_ascii = simd::VStringFunctions::is_ascii({data.data(), 
data.size()});
         res_offsets.resize(offset_size);
-        memcpy_small_allow_read_write_overflow15(
-                res_offsets.data(), offsets.data(),
-                offset_size * sizeof(ColumnString::Offsets::value_type));
-
-        size_t data_length = data.size();
-        res_data.resize(data_length);
-        if constexpr (std::is_same_v<OpName, NameToUpper>) {
-            simd::VStringFunctions::to_upper(data.data(), data_length, 
res_data.data());
-        } else if constexpr (std::is_same_v<OpName, NameToLower>) {
-            simd::VStringFunctions::to_lower(data.data(), data_length, 
res_data.data());
+        if (is_ascii) {
+            memcpy_small_allow_read_write_overflow15(
+                    res_offsets.data(), offsets.data(),
+                    offset_size * sizeof(ColumnString::Offsets::value_type));
+
+            size_t data_length = data.size();
+            res_data.resize(data_length);
+            if constexpr (std::is_same_v<OpName, NameToUpper>) {
+                simd::VStringFunctions::to_upper(data.data(), data_length, 
res_data.data());
+            } else if constexpr (std::is_same_v<OpName, NameToLower>) {
+                simd::VStringFunctions::to_lower(data.data(), data_length, 
res_data.data());
+            }
+        } else {
+            execute_utf8(data, offsets, res_data, res_offsets);
         }
+
         return Status::OK();
     }
+
+    static void execute_utf8(const ColumnString::Chars& data, const 
ColumnString::Offsets& offsets,
+                             ColumnString::Chars& res_data, 
ColumnString::Offsets& res_offsets) {
+        for (int64_t i = 0; i < offsets.size(); ++i) {
+            const char* begin = reinterpret_cast<const char*>(&data[offsets[i 
- 1]]);
+            uint32_t size = offsets[i] - offsets[i - 1];
+            std::string res;
+            if constexpr (std::is_same_v<OpName, NameToUpper>) {
+                res = to_upper_utf8(begin, size);
+            } else if constexpr (std::is_same_v<OpName, NameToLower>) {
+                res = to_lower_utf8(begin, size);
+            }
+            StringOP::push_value_string(res, i, res_data, res_offsets);
+        }
+    }
+
+    static std::string to_upper_utf8(const char* data, uint32_t size) {
+        icu::StringPiece sp;
+        sp.set(data, size);
+        icu::UnicodeString unicode_str = icu::UnicodeString::fromUTF8(sp);
+        unicode_str.toUpper();
+        std::string output;

Review Comment:
   Why not pass `output` in as a parameter, so that the string's memory can be reused?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to