This is an automated email from the ASF dual-hosted git repository.

zclll pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 5a9cb5aa255 [optimize](function) modify to_base64, from_base64, unhex 
to avoid a extra copy (#57182)
5a9cb5aa255 is described below

commit 5a9cb5aa255c6fb8f0ea39296200488188ae18c7
Author: admiring_xm <[email protected]>
AuthorDate: Wed Oct 22 16:11:43 2025 +0800

    [optimize](function) modify to_base64, from_base64, unhex to avoid a extra 
copy (#57182)
    
    Benchmark names have the form method/a/b, where a is the number of rows
    and b is the length of the string in each row.
    
    ```text
    Run on (24 X 2395.45 MHz CPU s)
    CPU Caches:
      L1 Data 32 KiB (x12)
      L1 Instruction 32 KiB (x12)
      L2 Unified 1024 KiB (x12)
      L3 Unified 32768 KiB (x1)
    Load Average: 3.32, 2.73, 2.74
    --------------------------------------------------------------------------
    Benchmark                                Time             CPU   Iterations
    --------------------------------------------------------------------------
    BM_ToBase64Impl_Old/1000/256        151122 ns       151123 ns         4623
    BM_ToBase64Impl_Old/100/65536       801007 ns       800983 ns          785
    BM_ToBase64Impl_Old/10/100000       115405 ns       115405 ns         6024
    BM_ToBase64Impl_New/1000/256        126745 ns       126732 ns         5512
    BM_ToBase64Impl_New/100/65536       449138 ns       449122 ns         1304
    BM_ToBase64Impl_New/10/100000        54417 ns        54373 ns        12773
    BM_FromBase64Impl_Old/1000/256       95666 ns        95666 ns         7315
    BM_FromBase64Impl_Old/100/65536     750223 ns       750193 ns          867
    BM_FromBase64Impl_Old/10/100000     113150 ns       113146 ns         6115
    BM_FromBase64Impl_New/1000/256       79121 ns        79121 ns         8847
    BM_FromBase64Impl_New/100/65536     522309 ns       521026 ns         1214
    BM_FromBase64Impl_New/10/100000      78207 ns        78205 ns         8929
    BM_UnhexImpl_Old/1000/256            13058 ns        13058 ns        53759
    BM_UnhexImpl_Old/100/65536            1328 ns         1327 ns       484917
    BM_UnhexImpl_Old/100/100000           1319 ns         1319 ns       529219
    BM_UnhexImpl_New/1000/256             6045 ns         6023 ns       116043
    BM_UnhexImpl_New/100/65536             656 ns          656 ns      1056606
    BM_UnhexImpl_New/100/100000            710 ns          710 ns       984401
    BM_UnhexNullImpl_Old/1000/256         6413 ns         6413 ns       109417
    BM_UnhexNullImpl_Old/100/65536         686 ns          681 ns      1054879
    BM_UnhexNullImpl_Old/100/100000        725 ns          725 ns       957391
    BM_UnhexNullImpl_New/1000/256         6144 ns         6144 ns       114054
    BM_UnhexNullImpl_New/100/65536         664 ns          664 ns       981229
    BM_UnhexNullImpl_New/100/100000        710 ns          708 ns       982551
    ```
---
 be/benchmark/benchmark_main.cpp               |   1 +
 be/benchmark/benchmark_string.hpp             | 394 ++++++++++++++++++++++++++
 be/src/vec/functions/function_string.cpp      | 142 +++++-----
 be/test/vec/function/function_string_test.cpp |  19 +-
 4 files changed, 489 insertions(+), 67 deletions(-)

diff --git a/be/benchmark/benchmark_main.cpp b/be/benchmark/benchmark_main.cpp
index 950c55b5883..6e7213a89ec 100644
--- a/be/benchmark/benchmark_main.cpp
+++ b/be/benchmark/benchmark_main.cpp
@@ -20,6 +20,7 @@
 #include "benchmark_bit_pack.hpp"
 #include "benchmark_fastunion.hpp"
 #include "benchmark_hll_merge.hpp"
+#include "benchmark_string.hpp"
 #include "binary_cast_benchmark.hpp"
 #include "vec/columns/column_string.h"
 #include "vec/core/block.h"
diff --git a/be/benchmark/benchmark_string.hpp 
b/be/benchmark/benchmark_string.hpp
new file mode 100644
index 00000000000..3bd2a2d442a
--- /dev/null
+++ b/be/benchmark/benchmark_string.hpp
@@ -0,0 +1,394 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <benchmark/benchmark.h>
+
+#include <random>
+#include <vector>
+
+#include "vec/functions/function_string.cpp"
+#include "vec/functions/string_hex_util.h"
+
+namespace doris::vectorized {
+
+// old logic for to_base64: encodes each row into a scratch buffer and then hands the result to StringOP::push_value_string. Kept only as the baseline for BM_ToBase64Impl_Old.
+struct OldToBase64Impl {
+    static Status vector(const ColumnString::Chars& data, const 
ColumnString::Offsets& offsets,
+                         ColumnString::Chars& dst_data, ColumnString::Offsets& 
dst_offsets) {
+        auto rows_count = offsets.size(); // one offsets entry (row end position) per input row
+        dst_offsets.resize(rows_count);
+        std::array<char, string_hex::MAX_STACK_CIPHER_LEN> stack_buf; // scratch for outputs that fit on the stack
+        std::vector<char> heap_buf; // scratch for larger outputs; declared outside the loop so it is reused across rows
+        for (int i = 0; i < rows_count; ++i) {
+            const auto* source = reinterpret_cast<const char*>(&data[offsets[i 
- 1]]);
+            size_t srclen = offsets[i] - offsets[i - 1]; // row 0 reads offsets[-1]; presumably valid (and zero) for this container - TODO confirm
+
+            if (srclen == 0) {
+                StringOP::push_empty_string(i, dst_data, dst_offsets);
+                continue;
+            }
+
+            auto cipher_len = 4 * ((srclen + 2) / 3); // base64 size bound: 4 output chars per 3 input bytes, rounded up
+            char* dst = nullptr;
+            if (cipher_len <= stack_buf.size()) {
+                dst = stack_buf.data();
+            } else {
+                heap_buf.resize(cipher_len);
+                dst = heap_buf.data();
+            }
+
+            auto outlen = base64_encode((const unsigned char*)source, srclen, 
(unsigned char*)dst);
+
+            StringOP::push_value_string(std::string_view(dst, outlen), i, 
dst_data, dst_offsets);
+        }
+        return Status::OK();
+    }
+};
+
+// old logic for from_base64: decodes each row into a scratch buffer and then hands the result to StringOP::push_value_string; rows that fail to decode become NULL. Kept only as the baseline for BM_FromBase64Impl_Old.
+struct OldFromBase64Impl {
+    static Status vector(const ColumnString::Chars& data, const 
ColumnString::Offsets& offsets,
+                         ColumnString::Chars& dst_data, ColumnString::Offsets& 
dst_offsets,
+                         NullMap& null_map) {
+        auto rows_count = offsets.size(); // one offsets entry (row end position) per input row
+        dst_offsets.resize(rows_count);
+        std::array<char, string_hex::MAX_STACK_CIPHER_LEN> stack_buf; // scratch for outputs that fit on the stack
+        std::vector<char> heap_buf; // scratch for larger outputs; declared outside the loop so it is reused across rows
+        for (int i = 0; i < rows_count; ++i) {
+            if (null_map[i]) { // row is already NULL on input: emit a NULL row without decoding
+                StringOP::push_null_string(i, dst_data, dst_offsets, null_map);
+                continue;
+            }
+
+            const auto* source = reinterpret_cast<const char*>(&data[offsets[i 
- 1]]);
+            ColumnString::Offset srclen = offsets[i] - offsets[i - 1]; // row 0 reads offsets[-1]; presumably valid (and zero) for this container - TODO confirm
+
+            if (srclen == 0) {
+                StringOP::push_empty_string(i, dst_data, dst_offsets);
+                continue;
+            }
+
+            auto cipher_len = srclen / 4 * 3; // 3 output bytes per 4 input chars, using integer division
+            char* dst = nullptr;
+            if (cipher_len <= stack_buf.size()) {
+                dst = stack_buf.data();
+            } else {
+                heap_buf.resize(cipher_len);
+                dst = heap_buf.data();
+            }
+            auto outlen = base64_decode(source, srclen, dst);
+
+            if (outlen < 0) { // a negative length is treated as a decode failure
+                StringOP::push_null_string(i, dst_data, dst_offsets, null_map);
+            } else {
+                StringOP::push_value_string(std::string_view(dst, outlen), i, 
dst_data,
+                                            dst_offsets);
+            }
+        }
+
+        return Status::OK();
+    }
+};
+
+// old logic for unhex: decodes each row into a scratch buffer and then hands the result to StringOP::push_value_string. Two overloads: without a null map, and with one where empty or undecodable rows become NULL. Kept only as the baseline for BM_UnhexImpl_Old / BM_UnhexNullImpl_Old.
+struct OldUnHexImpl {
+    static Status vector(const ColumnString::Chars& data, const 
ColumnString::Offsets& offsets,
+                         ColumnString::Chars& dst_data, ColumnString::Offsets& 
dst_offsets) {
+        auto rows_count = offsets.size(); // one offsets entry (row end position) per input row
+        dst_offsets.resize(rows_count);
+        std::array<char, string_hex::MAX_STACK_CIPHER_LEN> stack_buf; // scratch for outputs that fit on the stack
+        std::vector<char> heap_buf; // scratch for larger outputs; declared outside the loop so it is reused across rows
+        for (int i = 0; i < rows_count; ++i) {
+            const auto* source = reinterpret_cast<const char*>(&data[offsets[i 
- 1]]);
+            ColumnString::Offset srclen = offsets[i] - offsets[i - 1]; // row 0 reads offsets[-1]; presumably valid (and zero) for this container - TODO confirm
+
+            if (srclen == 0) {
+                StringOP::push_empty_string(i, dst_data, dst_offsets);
+                continue;
+            }
+
+            auto cipher_len = srclen / 2; // two hex digits per output byte
+            char* dst = nullptr;
+            if (cipher_len <= stack_buf.size()) {
+                dst = stack_buf.data();
+            } else {
+                heap_buf.resize(cipher_len);
+                dst = heap_buf.data();
+            }
+
+            int outlen = string_hex::hex_decode(source, srclen, dst);
+            StringOP::push_value_string(std::string_view(dst, outlen), i, 
dst_data, dst_offsets);
+        }
+
+        return Status::OK();
+    }
+
+    static Status vector(const ColumnString::Chars& data, const 
ColumnString::Offsets& offsets,
+                         ColumnString::Chars& dst_data, ColumnString::Offsets& 
dst_offsets,
+                         ColumnUInt8::Container* null_map_data) {
+        auto rows_count = offsets.size(); // one offsets entry (row end position) per input row
+        dst_offsets.resize(rows_count);
+        std::array<char, string_hex::MAX_STACK_CIPHER_LEN> stack_buf; // scratch for outputs that fit on the stack
+        std::vector<char> heap_buf; // scratch for larger outputs; declared outside the loop so it is reused across rows
+        for (int i = 0; i < rows_count; ++i) {
+            const auto* source = reinterpret_cast<const char*>(&data[offsets[i 
- 1]]);
+            ColumnString::Offset srclen = offsets[i] - offsets[i - 1]; // row 0 reads offsets[-1]; presumably valid (and zero) for this container - TODO confirm
+
+            if (srclen == 0) { // unlike the overload above, an empty input row becomes NULL here
+                StringOP::push_null_string(i, dst_data, dst_offsets, 
*null_map_data);
+                continue;
+            }
+
+            auto cipher_len = srclen / 2; // two hex digits per output byte
+            char* dst = nullptr;
+            if (cipher_len <= stack_buf.size()) {
+                dst = stack_buf.data();
+            } else {
+                heap_buf.resize(cipher_len);
+                dst = heap_buf.data();
+            }
+
+            int outlen = string_hex::hex_decode(source, srclen, dst);
+            if (outlen == 0) { // zero decoded bytes from a non-empty input is treated as a decode failure
+                StringOP::push_null_string(i, dst_data, dst_offsets, 
*null_map_data);
+                continue;
+            }
+
+            StringOP::push_value_string(std::string_view(dst, outlen), i, 
dst_data, dst_offsets);
+        }
+
+        return Status::OK();
+    }
+};
+
+// Fills `data` / `offsets` with `num_rows` pseudo-random strings of exactly
+// `str_len` characters each, laid out back to back (offsets[i] is the end
+// position of row i).
+//
+// Characters are drawn uniformly from the first `max_char + 1` entries of the
+// base64 alphabet below ("A-Z", "a-z", "0-9", "+", "/"). `max_char` is clamped
+// to the last valid index so an oversized argument cannot index past the
+// alphabet. The generator uses a fixed seed, so every call produces the same
+// data and the old/new benchmarks operate on identical input.
+static void generate_test_data(ColumnString::Chars& data, ColumnString::Offsets& offsets,
+                               size_t num_rows, size_t str_len, unsigned char max_char) {
+    const std::string base64_chars =
+            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+            "abcdefghijklmnopqrstuvwxyz"
+            "0123456789+/";
+    const size_t last_index = base64_chars.size() - 1;
+    const size_t max_index = max_char < last_index ? max_char : last_index;
+
+    std::mt19937 rng(12345);
+    // std::uniform_int_distribution is only defined for short/int/long/long long
+    // and their unsigned counterparts; instantiating it with a character type
+    // (as `unsigned char`) is undefined behaviour and is rejected by some
+    // standard libraries, so draw a size_t index instead.
+    std::uniform_int_distribution<size_t> dist(0, max_index);
+
+    offsets.resize(num_rows);
+    data.clear();
+    data.reserve(num_rows * str_len);
+
+    size_t offset = 0;
+    for (size_t i = 0; i < num_rows; ++i) {
+        for (size_t j = 0; j < str_len; ++j) {
+            data.push_back(static_cast<char>(base64_chars[dist(rng)]));
+        }
+        offset += str_len;
+        offsets[i] = cast_set<uint32_t>(offset);
+    }
+}
+
+// Times OldToBase64Impl::vector on generated input.
+// state.range(0) is the number of rows, state.range(1) the length of each row.
+static void BM_ToBase64Impl_Old(benchmark::State& state) {
+    const size_t row_count = state.range(0);
+    const size_t row_len = state.range(1);
+
+    ColumnString::Chars src_chars;
+    ColumnString::Offsets src_offsets;
+    // 63 is the last index of the generator's alphabet: use all of it.
+    generate_test_data(src_chars, src_offsets, row_count, row_len, 63);
+
+    ColumnString::Chars out_chars;
+    ColumnString::Offsets out_offsets;
+    for (auto _ : state) {
+        // Start every iteration from empty output columns.
+        out_chars.clear();
+        out_offsets.clear();
+        benchmark::DoNotOptimize(
+                OldToBase64Impl::vector(src_chars, src_offsets, out_chars, out_offsets));
+    }
+}
+
+// Times ToBase64Impl::vector (the current implementation) on generated input.
+// state.range(0) is the number of rows, state.range(1) the length of each row.
+static void BM_ToBase64Impl_New(benchmark::State& state) {
+    const size_t row_count = state.range(0);
+    const size_t row_len = state.range(1);
+
+    ColumnString::Chars src_chars;
+    ColumnString::Offsets src_offsets;
+    // 63 is the last index of the generator's alphabet: use all of it.
+    generate_test_data(src_chars, src_offsets, row_count, row_len, 63);
+
+    ColumnString::Chars out_chars;
+    ColumnString::Offsets out_offsets;
+    for (auto _ : state) {
+        // Start every iteration from empty output columns.
+        out_chars.clear();
+        out_offsets.clear();
+        benchmark::DoNotOptimize(
+                ToBase64Impl::vector(src_chars, src_offsets, out_chars, out_offsets));
+    }
+}
+
+// Args are {number of rows, string length per row}; {10, 100000} is the big-data case for testing memory allocation on the heap.
+BENCHMARK(BM_ToBase64Impl_Old)
+        ->Args({1000, 256})
+        ->Args({100, 65536})
+        ->Args({10, 100000})
+        ->Unit(benchmark::kNanosecond);
+BENCHMARK(BM_ToBase64Impl_New)
+        ->Args({1000, 256})
+        ->Args({100, 65536})
+        ->Args({10, 100000})
+        ->Unit(benchmark::kNanosecond);
+
+// Times OldFromBase64Impl::vector on generated input with an all-zero null map.
+// state.range(0) is the number of rows, state.range(1) the length of each row.
+static void BM_FromBase64Impl_Old(benchmark::State& state) {
+    const size_t row_count = state.range(0);
+    const size_t row_len = state.range(1);
+
+    ColumnString::Chars src_chars;
+    ColumnString::Offsets src_offsets;
+    // One null flag per row, all initialised to 0.
+    auto nulls = ColumnUInt8::create(row_count, 0);
+    // 63 is the last index of the generator's alphabet: use all of it.
+    generate_test_data(src_chars, src_offsets, row_count, row_len, 63);
+
+    ColumnString::Chars out_chars;
+    ColumnString::Offsets out_offsets;
+    for (auto _ : state) {
+        // Start every iteration from empty output columns.
+        out_chars.clear();
+        out_offsets.clear();
+        benchmark::DoNotOptimize(OldFromBase64Impl::vector(src_chars, src_offsets, out_chars,
+                                                           out_offsets, nulls->get_data()));
+    }
+}
+
+// Times FromBase64Impl::vector (the current implementation) on generated input
+// with an all-zero null map.
+// state.range(0) is the number of rows, state.range(1) the length of each row.
+static void BM_FromBase64Impl_New(benchmark::State& state) {
+    const size_t row_count = state.range(0);
+    const size_t row_len = state.range(1);
+
+    ColumnString::Chars src_chars;
+    ColumnString::Offsets src_offsets;
+    // One null flag per row, all initialised to 0.
+    auto nulls = ColumnUInt8::create(row_count, 0);
+    // 63 is the last index of the generator's alphabet: use all of it.
+    generate_test_data(src_chars, src_offsets, row_count, row_len, 63);
+
+    ColumnString::Chars out_chars;
+    ColumnString::Offsets out_offsets;
+    for (auto _ : state) {
+        // Start every iteration from empty output columns.
+        out_chars.clear();
+        out_offsets.clear();
+        benchmark::DoNotOptimize(FromBase64Impl::vector(src_chars, src_offsets, out_chars,
+                                                        out_offsets, nulls->get_data()));
+    }
+}
+
+// Args are {number of rows, string length per row}; {10, 100000} is the big-data case for testing memory allocation on the heap.
+BENCHMARK(BM_FromBase64Impl_Old)
+        ->Args({1000, 256})
+        ->Args({100, 65536})
+        ->Args({10, 100000})
+        ->Unit(benchmark::kNanosecond);
+BENCHMARK(BM_FromBase64Impl_New)
+        ->Args({1000, 256})
+        ->Args({100, 65536})
+        ->Args({10, 100000})
+        ->Unit(benchmark::kNanosecond);
+
+// Times OldUnHexImpl::vector (overload without a null map) on generated input.
+// state.range(0) is the number of rows, state.range(1) the length of each row.
+static void BM_UnhexImpl_Old(benchmark::State& state) {
+    // Indices 0..5 of generate_test_data's alphabet are 'A'..'F', so every row
+    // is a valid hexadecimal string. The earlier bound of 16 also produced
+    // 'G'..'Q', which are not hex digits, so most rows were invalid input and
+    // the benchmark presumably measured the reject path rather than decoding.
+    constexpr unsigned char kMaxHexCharIndex = 5;
+    size_t rows = state.range(0);
+    size_t len = state.range(1);
+    ColumnString::Chars data;
+    ColumnString::Offsets offsets;
+    generate_test_data(data, offsets, rows, len, kMaxHexCharIndex);
+
+    ColumnString::Chars dst_data;
+    ColumnString::Offsets dst_offsets;
+
+    for (auto _ : state) {
+        // Start every iteration from empty output columns.
+        dst_data.clear();
+        dst_offsets.clear();
+        benchmark::DoNotOptimize(OldUnHexImpl::vector(data, offsets, dst_data, dst_offsets));
+    }
+}
+
+// Times UnHexImpl<UnHexImplEmpty>::vector (the current implementation) on the
+// same generated input as BM_UnhexImpl_Old.
+// state.range(0) is the number of rows, state.range(1) the length of each row.
+static void BM_UnhexImpl_New(benchmark::State& state) {
+    // Must match BM_UnhexImpl_Old so both variants decode identical, valid hex
+    // input ('A'..'F' only).
+    constexpr unsigned char kMaxHexCharIndex = 5;
+    size_t rows = state.range(0);
+    size_t len = state.range(1);
+    ColumnString::Chars data;
+    ColumnString::Offsets offsets;
+    generate_test_data(data, offsets, rows, len, kMaxHexCharIndex);
+
+    ColumnString::Chars dst_data;
+    ColumnString::Offsets dst_offsets;
+
+    for (auto _ : state) {
+        // Start every iteration from empty output columns.
+        dst_data.clear();
+        dst_offsets.clear();
+        benchmark::DoNotOptimize(
+                UnHexImpl<UnHexImplEmpty>::vector(data, offsets, dst_data, dst_offsets));
+    }
+}
+
+// Args are {number of rows, string length per row}.
+// 100, 100000 is a big data test case for testing memory allocation on the heap
+BENCHMARK(BM_UnhexImpl_Old)
+        ->Args({1000, 256})
+        ->Args({100, 65536})
+        ->Args({100, 100000})
+        ->Unit(benchmark::kNanosecond);
+BENCHMARK(BM_UnhexImpl_New)
+        ->Args({1000, 256})
+        ->Args({100, 65536})
+        ->Args({100, 100000})
+        ->Unit(benchmark::kNanosecond);
+
+// Times OldUnHexImpl::vector (overload with a null map) on generated input.
+// state.range(0) is the number of rows, state.range(1) the length of each row.
+static void BM_UnhexNullImpl_Old(benchmark::State& state) {
+    // Indices 0..5 of generate_test_data's alphabet are 'A'..'F', so every row
+    // is a valid hexadecimal string. The earlier bound of 16 also produced
+    // 'G'..'Q', which are not hex digits; this overload turns rows that decode
+    // to nothing into NULL, so the benchmark presumably measured the reject
+    // path rather than decoding.
+    constexpr unsigned char kMaxHexCharIndex = 5;
+    size_t rows = state.range(0);
+    size_t len = state.range(1);
+    ColumnString::Chars data;
+    ColumnString::Offsets offsets;
+    // One null flag per row, all initialised to 0.
+    auto null_map = ColumnUInt8::create(rows, 0);
+    generate_test_data(data, offsets, rows, len, kMaxHexCharIndex);
+
+    ColumnString::Chars dst_data;
+    ColumnString::Offsets dst_offsets;
+
+    for (auto _ : state) {
+        // Start every iteration from empty output columns.
+        dst_data.clear();
+        dst_offsets.clear();
+        benchmark::DoNotOptimize(
+                OldUnHexImpl::vector(data, offsets, dst_data, dst_offsets, &null_map->get_data()));
+    }
+}
+
+// Times UnHexImpl<UnHexImplNull>::vector (the current implementation) on the
+// same generated input as BM_UnhexNullImpl_Old.
+// state.range(0) is the number of rows, state.range(1) the length of each row.
+static void BM_UnhexNullImpl_New(benchmark::State& state) {
+    // Must match BM_UnhexNullImpl_Old so both variants decode identical, valid
+    // hex input ('A'..'F' only).
+    constexpr unsigned char kMaxHexCharIndex = 5;
+    size_t rows = state.range(0);
+    size_t len = state.range(1);
+    ColumnString::Chars data;
+    ColumnString::Offsets offsets;
+    // One null flag per row, all initialised to 0.
+    auto null_map = ColumnUInt8::create(rows, 0);
+    generate_test_data(data, offsets, rows, len, kMaxHexCharIndex);
+
+    ColumnString::Chars dst_data;
+    ColumnString::Offsets dst_offsets;
+
+    for (auto _ : state) {
+        // Start every iteration from empty output columns.
+        dst_data.clear();
+        dst_offsets.clear();
+        benchmark::DoNotOptimize(UnHexImpl<UnHexImplNull>::vector(
+                data, offsets, dst_data, dst_offsets, &null_map->get_data()));
+    }
+}
+
+// Args are {number of rows, string length per row}.
+// 100, 100000 is a big data test case for testing memory allocation on the heap
+BENCHMARK(BM_UnhexNullImpl_Old)
+        ->Args({1000, 256})
+        ->Args({100, 65536})
+        ->Args({100, 100000})
+        ->Unit(benchmark::kNanosecond);
+BENCHMARK(BM_UnhexNullImpl_New)
+        ->Args({1000, 256})
+        ->Args({100, 65536})
+        ->Args({100, 100000})
+        ->Unit(benchmark::kNanosecond);
+
+} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/functions/function_string.cpp 
b/be/src/vec/functions/function_string.cpp
index 7a6c71eadbe..4bdcc88428e 100644
--- a/be/src/vec/functions/function_string.cpp
+++ b/be/src/vec/functions/function_string.cpp
@@ -978,30 +978,32 @@ struct UnHexImpl {
                          ColumnString::Chars& dst_data, ColumnString::Offsets& 
dst_offsets) {
         auto rows_count = offsets.size();
         dst_offsets.resize(rows_count);
-        std::array<char, string_hex::MAX_STACK_CIPHER_LEN> stack_buf;
-        std::vector<char> heap_buf;
+
+        int64_t total_size = 0;
+        for (size_t i = 0; i < rows_count; i++) {
+            size_t len = offsets[i] - offsets[i - 1];
+            total_size += len / 2;
+        }
+        ColumnString::check_chars_length(total_size, rows_count);
+        dst_data.resize(total_size);
+        char* dst_data_ptr = reinterpret_cast<char*>(dst_data.data());
+        size_t offset = 0;
+
         for (int i = 0; i < rows_count; ++i) {
             const auto* source = reinterpret_cast<const char*>(&data[offsets[i 
- 1]]);
             ColumnString::Offset srclen = offsets[i] - offsets[i - 1];
 
-            if (srclen == 0) {
-                StringOP::push_empty_string(i, dst_data, dst_offsets);
+            if (UNLIKELY(srclen == 0)) {
+                dst_offsets[i] = cast_set<uint32_t>(offset);
                 continue;
             }
 
-            auto cipher_len = srclen / 2;
-            char* dst = nullptr;
-            if (cipher_len <= stack_buf.size()) {
-                dst = stack_buf.data();
-            } else {
-                heap_buf.resize(cipher_len);
-                dst = heap_buf.data();
-            }
+            int outlen = string_hex::hex_decode(source, srclen, dst_data_ptr + 
offset);
 
-            int outlen = string_hex::hex_decode(source, srclen, dst);
-            StringOP::push_value_string(std::string_view(dst, outlen), i, 
dst_data, dst_offsets);
+            offset += outlen;
+            dst_offsets[i] = cast_set<uint32_t>(offset);
         }
-
+        dst_data.pop_back(total_size - offset);
         return Status::OK();
     }
 
@@ -1010,35 +1012,39 @@ struct UnHexImpl {
                          ColumnUInt8::Container* null_map_data) {
         auto rows_count = offsets.size();
         dst_offsets.resize(rows_count);
-        std::array<char, string_hex::MAX_STACK_CIPHER_LEN> stack_buf;
-        std::vector<char> heap_buf;
+
+        int64_t total_size = 0;
+        for (size_t i = 0; i < rows_count; i++) {
+            size_t len = offsets[i] - offsets[i - 1];
+            total_size += len / 2;
+        }
+        ColumnString::check_chars_length(total_size, rows_count);
+        dst_data.resize(total_size);
+        char* dst_data_ptr = reinterpret_cast<char*>(dst_data.data());
+        size_t offset = 0;
+
         for (int i = 0; i < rows_count; ++i) {
             const auto* source = reinterpret_cast<const char*>(&data[offsets[i 
- 1]]);
             ColumnString::Offset srclen = offsets[i] - offsets[i - 1];
 
-            if (srclen == 0) {
-                StringOP::push_null_string(i, dst_data, dst_offsets, 
*null_map_data);
+            if (UNLIKELY(srclen == 0)) {
+                (*null_map_data)[i] = 1;
+                dst_offsets[i] = cast_set<uint32_t>(offset);
                 continue;
             }
 
-            auto cipher_len = srclen / 2;
-            char* dst = nullptr;
-            if (cipher_len <= stack_buf.size()) {
-                dst = stack_buf.data();
-            } else {
-                heap_buf.resize(cipher_len);
-                dst = heap_buf.data();
-            }
+            int outlen = string_hex::hex_decode(source, srclen, dst_data_ptr + 
offset);
 
-            int outlen = string_hex::hex_decode(source, srclen, dst);
             if (outlen == 0) {
-                StringOP::push_null_string(i, dst_data, dst_offsets, 
*null_map_data);
+                (*null_map_data)[i] = 1;
+                dst_offsets[i] = cast_set<uint32_t>(offset);
                 continue;
             }
 
-            StringOP::push_value_string(std::string_view(dst, outlen), i, 
dst_data, dst_offsets);
+            offset += outlen;
+            dst_offsets[i] = cast_set<uint32_t>(offset);
         }
-
+        dst_data.pop_back(total_size - offset);
         return Status::OK();
     }
 };
@@ -1088,30 +1094,33 @@ struct ToBase64Impl {
                          ColumnString::Chars& dst_data, ColumnString::Offsets& 
dst_offsets) {
         auto rows_count = offsets.size();
         dst_offsets.resize(rows_count);
-        std::array<char, string_hex::MAX_STACK_CIPHER_LEN> stack_buf;
-        std::vector<char> heap_buf;
+
+        size_t total_size = 0;
+        for (size_t i = 0; i < rows_count; i++) {
+            size_t len = offsets[i] - offsets[i - 1];
+            total_size += 4 * ((len + 2) / 3);
+        }
+        ColumnString::check_chars_length(total_size, rows_count);
+        dst_data.resize(total_size);
+        auto* dst_data_ptr = dst_data.data();
+        size_t offset = 0;
+
         for (int i = 0; i < rows_count; ++i) {
             const auto* source = reinterpret_cast<const char*>(&data[offsets[i 
- 1]]);
             size_t srclen = offsets[i] - offsets[i - 1];
 
-            if (srclen == 0) {
-                StringOP::push_empty_string(i, dst_data, dst_offsets);
+            if (UNLIKELY(srclen == 0)) {
+                dst_offsets[i] = cast_set<uint32_t>(offset);
                 continue;
             }
 
-            auto cipher_len = srclen / 2;
-            char* dst = nullptr;
-            if (cipher_len <= stack_buf.size()) {
-                dst = stack_buf.data();
-            } else {
-                heap_buf.resize(cipher_len);
-                dst = heap_buf.data();
-            }
-
-            auto outlen = base64_encode((const unsigned char*)source, srclen, 
(unsigned char*)dst);
+            auto outlen = doris::base64_encode((const unsigned char*)source, 
srclen,
+                                               (unsigned char*)(dst_data_ptr + 
offset));
 
-            StringOP::push_value_string(std::string_view(dst, outlen), i, 
dst_data, dst_offsets);
+            offset += outlen;
+            dst_offsets[i] = cast_set<uint32_t>(offset);
         }
+        dst_data.pop_back(total_size - offset);
         return Status::OK();
     }
 };
@@ -1126,40 +1135,43 @@ struct FromBase64Impl {
                          NullMap& null_map) {
         auto rows_count = offsets.size();
         dst_offsets.resize(rows_count);
-        std::array<char, string_hex::MAX_STACK_CIPHER_LEN> stack_buf;
-        std::vector<char> heap_buf;
+
+        size_t total_size = 0;
+        for (size_t i = 0; i < rows_count; i++) {
+            auto len = offsets[i] - offsets[i - 1];
+            total_size += len / 4 * 3;
+        }
+        ColumnString::check_chars_length(total_size, rows_count);
+        dst_data.resize(total_size);
+        char* dst_data_ptr = reinterpret_cast<char*>(dst_data.data());
+        size_t offset = 0;
+
         for (int i = 0; i < rows_count; ++i) {
-            if (null_map[i]) {
-                StringOP::push_null_string(i, dst_data, dst_offsets, null_map);
+            if (UNLIKELY(null_map[i])) {
+                null_map[i] = 1;
+                dst_offsets[i] = cast_set<uint32_t>(offset);
                 continue;
             }
 
             const auto* source = reinterpret_cast<const char*>(&data[offsets[i 
- 1]]);
             ColumnString::Offset srclen = offsets[i] - offsets[i - 1];
 
-            if (srclen == 0) {
-                StringOP::push_empty_string(i, dst_data, dst_offsets);
+            if (UNLIKELY(srclen == 0)) {
+                dst_offsets[i] = cast_set<uint32_t>(offset);
                 continue;
             }
 
-            auto cipher_len = srclen / 2;
-            char* dst = nullptr;
-            if (cipher_len <= stack_buf.size()) {
-                dst = stack_buf.data();
-            } else {
-                heap_buf.resize(cipher_len);
-                dst = heap_buf.data();
-            }
-            auto outlen = base64_decode(source, srclen, dst);
+            auto outlen = base64_decode(source, srclen, dst_data_ptr + offset);
 
             if (outlen < 0) {
-                StringOP::push_null_string(i, dst_data, dst_offsets, null_map);
+                null_map[i] = 1;
+                dst_offsets[i] = cast_set<uint32_t>(offset);
             } else {
-                StringOP::push_value_string(std::string_view(dst, outlen), i, 
dst_data,
-                                            dst_offsets);
+                offset += outlen;
+                dst_offsets[i] = cast_set<uint32_t>(offset);
             }
         }
-
+        dst_data.pop_back(total_size - offset);
         return Status::OK();
     }
 };
diff --git a/be/test/vec/function/function_string_test.cpp 
b/be/test/vec/function/function_string_test.cpp
index 84272d4475c..6ce498cf630 100644
--- a/be/test/vec/function/function_string_test.cpp
+++ b/be/test/vec/function/function_string_test.cpp
@@ -2515,6 +2515,12 @@ TEST(function_string_test, function_hex_test) {
             {{std::string("23 12 --!__!_!__!")}, 
std::string("3233203132202D2D215F5F215F215F5F21")},
             {{std::string("112+ + +")}, std::string("3131322B202B202B")},
             {{std::string("     +       23 ")}, 
std::string("20202020202B20202020202020323320")},
+            {{std::string("😀🍕")}, std::string("F09F9880F09F8D95")},
+            {{std::string("测试")}, std::string("E6B58BE8AF95")},
+            {{std::string("こんにちは")}, 
std::string("E38193E38293E381ABE381A1E381AF")},
+            {{std::string("안녕하세요")}, 
std::string("EC9588EB8595ED9598EC84B8EC9A94")},
+            {{std::string("테스트")}, std::string("ED858CEC8AA4ED8AB8")},
+            {{std::string("🎉🍔")}, std::string("F09F8E89F09F8D94")},
     };
     check_function_all_arg_comb<DataTypeString, true>(func_name, input_types, 
data_set);
 }
@@ -2525,16 +2531,19 @@ TEST(function_string_test, function_unhex_test) {
     DataSet data_set = {
             {{std::string("41624364456667")}, std::string("AbCdEfg")},
             {{std::string("E4BDA0E5A5BD48454C4C4F")}, std::string("你好HELLO")},
+            {{std::string("F09F9880F09F8D95")}, std::string("😀🍕")},
+            {{std::string("E6B58BE8AF95")}, std::string("测试")},
             {{std::string("")}, std::string("")},
             {{Null()}, Null()},
             {{std::string("21402324402A2028212623")}, std::string("!@#$@* 
(!&#")},
             {{std::string("4A534B41422851405F5F21")}, 
std::string("JSKAB(Q@__!")},
-            // {{std::string("M4D59207465737420537472E4BDA0E5A5BD2020")}, 
Null()},
             {{std::string("2020202020202020202020202020202020")}, 
std::string("                 ")},
             {{std::string("3233203132202D2D215F5F215F215F5F21")}, 
std::string("23 12 --!__!_!__!")},
             {{std::string("3131322B202B202B")}, std::string("112+ + +")},
             {{std::string("20202020202B20202020202020323320")}, std::string("  
   +       23 ")},
-            // {{std::string("!")}, Null()},
+            {{std::string("E38193E38293E381ABE381A1E381AF")}, 
std::string("こんにちは")},
+            {{std::string("EC9588EB8595ED9598EC84B8EC9A94")}, 
std::string("안녕하세요")},
+            {{std::string("ED858CEC8AA4ED8AB8")}, std::string("테스트")},
     };
     check_function_all_arg_comb<DataTypeString, true>(unhex_func_name, 
input_types, data_set);
 
@@ -2542,6 +2551,8 @@ TEST(function_string_test, function_unhex_test) {
     data_set = {
             {{std::string("41624364456667")}, std::string("AbCdEfg")},
             {{std::string("E4BDA0E5A5BD48454C4C4F")}, std::string("你好HELLO")},
+            {{std::string("F09F9880F09F8D95")}, std::string("😀🍕")},
+            {{std::string("E6B58BE8AF95")}, std::string("测试")},
             {{std::string("")}, Null()},
             {{Null()}, Null()},
             {{std::string("21402324402A2028212623")}, std::string("!@#$@* 
(!&#")},
@@ -2553,6 +2564,10 @@ TEST(function_string_test, function_unhex_test) {
             {{std::string("20202020202B20202020202020323320")}, std::string("  
   +       23 ")},
             {{std::string("41G42")}, Null()},
             {{std::string("!")}, Null()},
+            {{std::string("F09F8E89F09F8D94")}, std::string("🎉🍔")},
+            {{std::string("E38193E38293E381ABE381A1E381AF")}, 
std::string("こんにちは")},
+            {{std::string("EC9588EB8595ED9598EC84B8EC9A94")}, 
std::string("안녕하세요")},
+            {{std::string("ED858CEC8AA4ED8AB8")}, std::string("테스트")},
     };
     check_function_all_arg_comb<DataTypeString, true>(unhex_null_func_name, 
input_types, data_set);
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to