zhiqiang-hhhh commented on code in PR #41681:
URL: https://github.com/apache/doris/pull/41681#discussion_r1800577717


##########
be/src/vec/functions/function_string.cpp:
##########
@@ -535,6 +545,135 @@ struct TrimUtil {
         return Status::OK();
     }
 };
+template <bool is_ltrim_in, bool is_rtrim_in, bool trim_single>
+struct TrimInUtil {
+    static Status vector(const ColumnString::Chars& str_data,
+                         const ColumnString::Offsets& str_offsets, const 
StringRef& remove_str,
+                         ColumnString::Chars& res_data, ColumnString::Offsets& 
res_offsets) {
+        const size_t offset_size = str_offsets.size();
+        res_offsets.resize(offset_size);
+        res_data.reserve(str_data.size());
+        bool all_ascii = simd::VStringFunctions::is_ascii(remove_str) &&
+                         simd::VStringFunctions::is_ascii(StringRef(
+                                 reinterpret_cast<const 
char*>(str_data.data()), str_data.size()));
+
+        if (all_ascii) {
+            return impl_vectors_ascii(str_data, str_offsets, remove_str, 
res_data, res_offsets);
+        } else {
+            return impl_vectors_utf8(str_data, str_offsets, remove_str, 
res_data, res_offsets);
+        }
+    }
+
+private:
+    static Status impl_vectors_ascii(const ColumnString::Chars& str_data,
+                                     const ColumnString::Offsets& str_offsets,
+                                     const StringRef& remove_str, 
ColumnString::Chars& res_data,
+                                     ColumnString::Offsets& res_offsets) {
+        const size_t offset_size = str_offsets.size();
+        std::bitset<128> char_lookup;
+        const char* remove_begin = remove_str.data;
+        const char* remove_end = remove_str.data + remove_str.size;
+
+        while (remove_begin < remove_end) {
+            char_lookup.set(static_cast<unsigned char>(*remove_begin));
+            remove_begin += 1;
+        }
+
+        for (size_t i = 0; i < offset_size; ++i) {
+            const char* str_begin = reinterpret_cast<const char*>(
+                    str_data.data() + (i == 0 ? 0 : str_offsets[i - 1]));

Review Comment:
   Using `str_data.data() + str_offsets[i - 1]` directly is better, since 
PODArray guarantees that accessing index -1 is safe.



##########
be/src/vec/functions/function_string.cpp:
##########
@@ -535,6 +545,135 @@ struct TrimUtil {
         return Status::OK();
     }
 };
+template <bool is_ltrim_in, bool is_rtrim_in, bool trim_single>
+struct TrimInUtil {
+    static Status vector(const ColumnString::Chars& str_data,
+                         const ColumnString::Offsets& str_offsets, const 
StringRef& remove_str,
+                         ColumnString::Chars& res_data, ColumnString::Offsets& 
res_offsets) {
+        const size_t offset_size = str_offsets.size();
+        res_offsets.resize(offset_size);
+        res_data.reserve(str_data.size());
+        bool all_ascii = simd::VStringFunctions::is_ascii(remove_str) &&
+                         simd::VStringFunctions::is_ascii(StringRef(
+                                 reinterpret_cast<const 
char*>(str_data.data()), str_data.size()));
+
+        if (all_ascii) {
+            return impl_vectors_ascii(str_data, str_offsets, remove_str, 
res_data, res_offsets);
+        } else {
+            return impl_vectors_utf8(str_data, str_offsets, remove_str, 
res_data, res_offsets);
+        }
+    }
+
+private:
+    static Status impl_vectors_ascii(const ColumnString::Chars& str_data,
+                                     const ColumnString::Offsets& str_offsets,
+                                     const StringRef& remove_str, 
ColumnString::Chars& res_data,
+                                     ColumnString::Offsets& res_offsets) {
+        const size_t offset_size = str_offsets.size();
+        std::bitset<128> char_lookup;
+        const char* remove_begin = remove_str.data;
+        const char* remove_end = remove_str.data + remove_str.size;
+
+        while (remove_begin < remove_end) {
+            char_lookup.set(static_cast<unsigned char>(*remove_begin));
+            remove_begin += 1;
+        }
+
+        for (size_t i = 0; i < offset_size; ++i) {
+            const char* str_begin = reinterpret_cast<const char*>(
+                    str_data.data() + (i == 0 ? 0 : str_offsets[i - 1]));
+            const char* str_end = reinterpret_cast<const 
char*>(str_data.data() + str_offsets[i]);
+            const char* left_trim_pos = str_begin;
+            const char* right_trim_pos = str_end;
+
+            if constexpr (is_ltrim_in) {
+                while (left_trim_pos < str_end) {
+                    if (!char_lookup.test(static_cast<unsigned 
char>(*left_trim_pos))) {
+                        break;
+                    }
+                    ++left_trim_pos;
+                }
+            }
+
+            if constexpr (is_rtrim_in) {
+                while (right_trim_pos > left_trim_pos) {
+                    --right_trim_pos;
+                    if (!char_lookup.test(static_cast<unsigned 
char>(*right_trim_pos))) {
+                        ++right_trim_pos;
+                        break;
+                    }
+                }
+            }
+
+            res_data.insert_assume_reserved(left_trim_pos, right_trim_pos);
+            res_offsets[i] = res_data.size();
+        }
+
+        return Status::OK();
+    }
+
+    static Status impl_vectors_utf8(const ColumnString::Chars& str_data,
+                                    const ColumnString::Offsets& str_offsets,
+                                    const StringRef& remove_str, 
ColumnString::Chars& res_data,
+                                    ColumnString::Offsets& res_offsets) {
+        const size_t offset_size = str_offsets.size();
+        res_offsets.resize(offset_size);
+        res_data.reserve(str_data.size());
+
+        std::unordered_set<std::string> char_lookup;

Review Comment:
   Potential large memory consumption.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to