liujiwen-up commented on code in PR #41681: URL: https://github.com/apache/doris/pull/41681#discussion_r1801174551
########## be/src/vec/functions/function_string.cpp: ########## @@ -535,6 +545,135 @@ struct TrimUtil { return Status::OK(); } }; +template <bool is_ltrim_in, bool is_rtrim_in, bool trim_single> +struct TrimInUtil { + static Status vector(const ColumnString::Chars& str_data, + const ColumnString::Offsets& str_offsets, const StringRef& remove_str, + ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) { + const size_t offset_size = str_offsets.size(); + res_offsets.resize(offset_size); + res_data.reserve(str_data.size()); + bool all_ascii = simd::VStringFunctions::is_ascii(remove_str) && + simd::VStringFunctions::is_ascii(StringRef( + reinterpret_cast<const char*>(str_data.data()), str_data.size())); + + if (all_ascii) { + return impl_vectors_ascii(str_data, str_offsets, remove_str, res_data, res_offsets); + } else { + return impl_vectors_utf8(str_data, str_offsets, remove_str, res_data, res_offsets); + } + } + +private: + static Status impl_vectors_ascii(const ColumnString::Chars& str_data, + const ColumnString::Offsets& str_offsets, + const StringRef& remove_str, ColumnString::Chars& res_data, + ColumnString::Offsets& res_offsets) { + const size_t offset_size = str_offsets.size(); + std::bitset<128> char_lookup; + const char* remove_begin = remove_str.data; + const char* remove_end = remove_str.data + remove_str.size; + + while (remove_begin < remove_end) { + char_lookup.set(static_cast<unsigned char>(*remove_begin)); + remove_begin += 1; + } + + for (size_t i = 0; i < offset_size; ++i) { + const char* str_begin = reinterpret_cast<const char*>( + str_data.data() + (i == 0 ? 
0 : str_offsets[i - 1])); + const char* str_end = reinterpret_cast<const char*>(str_data.data() + str_offsets[i]); + const char* left_trim_pos = str_begin; + const char* right_trim_pos = str_end; + + if constexpr (is_ltrim_in) { + while (left_trim_pos < str_end) { + if (!char_lookup.test(static_cast<unsigned char>(*left_trim_pos))) { + break; + } + ++left_trim_pos; + } + } + + if constexpr (is_rtrim_in) { + while (right_trim_pos > left_trim_pos) { + --right_trim_pos; + if (!char_lookup.test(static_cast<unsigned char>(*right_trim_pos))) { + ++right_trim_pos; + break; + } + } + } + + res_data.insert_assume_reserved(left_trim_pos, right_trim_pos); + res_offsets[i] = res_data.size(); + } + + return Status::OK(); + } + + static Status impl_vectors_utf8(const ColumnString::Chars& str_data, + const ColumnString::Offsets& str_offsets, + const StringRef& remove_str, ColumnString::Chars& res_data, + ColumnString::Offsets& res_offsets) { + const size_t offset_size = str_offsets.size(); + res_offsets.resize(offset_size); + res_data.reserve(str_data.size()); + + std::unordered_set<std::string> char_lookup; Review Comment: Change the element type of `char_lookup` from `std::string` to `std::string_view`, i.e. declare it as `std::unordered_set<std::string_view>`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org