This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
commit b031c95324e815c25496156597db0a3b871d57fd Author: HappenLee <happen...@hotmail.com> AuthorDate: Wed Mar 13 10:23:39 2024 +0800 [Opt](exec) use libbase64 to replace base64 code in doris (#32078) * [Opt](exec) use libbase64 to replace base64 code in doris --- be/cmake/thirdparty.cmake | 1 + be/src/exec/olap_utils.h | 36 ----- be/src/util/url_coding.cpp | 175 +++------------------ be/src/vec/functions/function_bitmap.cpp | 6 +- be/src/vec/functions/function_string.cpp | 6 +- .../load_p0/stream_load/test_stream_load.groovy | 2 +- .../test_stream_load_move_memtable.groovy | 8 +- 7 files changed, 35 insertions(+), 199 deletions(-) diff --git a/be/cmake/thirdparty.cmake b/be/cmake/thirdparty.cmake index 0d485b2466e..e9fbdabee8b 100644 --- a/be/cmake/thirdparty.cmake +++ b/be/cmake/thirdparty.cmake @@ -117,6 +117,7 @@ add_thirdparty(bitshuffle) add_thirdparty(roaring) add_thirdparty(fmt) add_thirdparty(cctz) +add_thirdparty(base64) add_thirdparty(aws-cpp-sdk-core LIB64) add_thirdparty(aws-cpp-sdk-s3 LIB64) diff --git a/be/src/exec/olap_utils.h b/be/src/exec/olap_utils.h index 4024337c462..2e101b1270f 100644 --- a/be/src/exec/olap_utils.h +++ b/be/src/exec/olap_utils.h @@ -61,42 +61,6 @@ public: } }; -static char encoding_table[] = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', - 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', - 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', - 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', - '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'}; - -static int mod_table[] = {0, 2, 1}; -static const char base64_pad = '='; - -inline size_t base64_encode(const char* data, size_t length, char* encoded_data) { - size_t output_length = (size_t)(4.0 * ceil((double)length / 3.0)); - - if (encoded_data == nullptr) { - return 0; - } - - for (uint32_t i = 0, j = 0; i < length;) { - uint32_t octet_a = i < length ? (unsigned char)data[i++] : 0; - uint32_t octet_b = i < length ? (unsigned char)data[i++] : 0; - uint32_t octet_c = i < length ? (unsigned char)data[i++] : 0; - - uint32_t triple = (octet_a << 0x10) + (octet_b << 0x08) + octet_c; - - encoded_data[j++] = encoding_table[(triple >> 3 * 6) & 0x3F]; - encoded_data[j++] = encoding_table[(triple >> 2 * 6) & 0x3F]; - encoded_data[j++] = encoding_table[(triple >> 1 * 6) & 0x3F]; - encoded_data[j++] = encoding_table[(triple >> 0 * 6) & 0x3F]; - } - - for (int i = 0; i < mod_table[length % 3]; i++) { - encoded_data[output_length - 1 - i] = base64_pad; - } - - return output_length; -} - enum SQLFilterOp { FILTER_LARGER = 0, FILTER_LARGER_OR_EQUAL = 1, diff --git a/be/src/util/url_coding.cpp b/be/src/util/url_coding.cpp index 7d6e264d5e9..6ddd4c05401 100644 --- a/be/src/util/url_coding.cpp +++ b/be/src/util/url_coding.cpp @@ -17,6 +17,7 @@ #include "util/url_coding.h" +#include <libbase64.h> #include <math.h> #include <memory> @@ -86,165 +87,35 @@ bool url_decode(const std::string& in, std::string* out) { return true; } -static void encode_base64_internal(const std::string& in, std::string* out, - const unsigned char* basis, bool padding) { - size_t len = in.size(); - // Every 3 source bytes will be encoded into 4 bytes. - out->resize((len + 2) / 3 * 4); - unsigned char* d = (unsigned char*)out->data(); - const auto* s = reinterpret_cast<const unsigned char*>(in.data()); - while (len > 2) { - *d++ = basis[(s[0] >> 2) & 0x3f]; - *d++ = basis[((s[0] & 3) << 4) | (s[1] >> 4)]; - *d++ = basis[((s[1] & 0x0f) << 2) | (s[2] >> 6)]; - *d++ = basis[s[2] & 0x3f]; - - s += 3; - len -= 3; - } - if (len) { - *d++ = basis[(s[0] >> 2) & 0x3f]; - if (len == 1) { - *d++ = basis[(s[0] & 3) << 4]; - if (padding) { - *d++ = '='; - } - } else { - *d++ = basis[((s[0] & 3) << 4) | (s[1] >> 4)]; - *d++ = basis[(s[1] & 0x0f) << 2]; - } - if (padding) { - *d++ = '='; - } - } - out->resize((char*)d - out->data()); -} - void base64_encode(const std::string& in, std::string* out) { - static unsigned char basis64[] = - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; - encode_base64_internal(in, out, basis64, true); + out->resize(in.length() * (4.0 / 3) + 1); + auto len = base64_encode(reinterpret_cast<const unsigned char*>(in.c_str()), in.length(), + (unsigned char*)out->c_str()); + out->resize(len); } -static char encoding_table[] = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', - 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', - 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', - 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', - '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'}; - -static const char base64_pad = '='; - -static short decoding_table[256] = { - -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -2, -2, -1, -2, -2, -2, -2, -2, -2, -2, -2, - -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, 62, - -2, -2, -2, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -2, -2, -2, -2, -2, -2, -2, 0, - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, - 23, 24, 25, -2, -2, -2, -2, -2, -2, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, - 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -2, -2, -2, -2, -2, -2, -2, -2, -2, - -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, - -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, - -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, - -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, - -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, - -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}; - -static int mod_table[] = {0, 2, 1}; - size_t base64_encode(const unsigned char* data, size_t length, unsigned char* encoded_data) { - auto output_length = (size_t)(4.0 * ceil((double)length / 3.0)); - - if (encoded_data == nullptr) { - return 0; - } - - for (uint32_t i = 0, j = 0; i < length;) { - uint32_t octet_a = i < length ? data[i++] : 0; - uint32_t octet_b = i < length ? data[i++] : 0; - uint32_t octet_c = i < length ? data[i++] : 0; - uint32_t triple = (octet_a << 0x10) + (octet_b << 0x08) + octet_c; - - encoded_data[j++] = encoding_table[(triple >> 3 * 6) & 0x3F]; - encoded_data[j++] = encoding_table[(triple >> 2 * 6) & 0x3F]; - encoded_data[j++] = encoding_table[(triple >> 1 * 6) & 0x3F]; - encoded_data[j++] = encoding_table[(triple >> 0 * 6) & 0x3F]; - } - - for (int i = 0; i < mod_table[length % 3]; i++) { - encoded_data[output_length - 1 - i] = '='; - } - - return output_length; + size_t encode_len = 0; +#if defined(__aarch64__) || defined(_M_ARM64) + do_base64_encode(reinterpret_cast<const char*>(data), length, + reinterpret_cast<char*>(encoded_data), &encode_len, BASE64_FORCE_NEON64); +#else + do_base64_encode(reinterpret_cast<const char*>(data), length, + reinterpret_cast<char*>(encoded_data), &encode_len, 0); +#endif + return encode_len; } int64_t base64_decode(const char* data, size_t length, char* decoded_data) { - const char* current = data; - int ch = 0; - int i = 0; - int j = 0; - int k = 0; - - // run through the whole string, converting as we go - while ((ch = *current++) != '\0' && length-- > 0) { - if (ch >= 256 || ch < 0) { - return -1; - } - - if (ch == base64_pad) { - if (*current != '=' && (i % 4) == 1) { - return -1; - } - continue; - } - - ch = decoding_table[ch]; - // a space or some other separator character, we simply skip over - if (ch == -1) { - continue; - } else if (ch == -2) { - return -1; - } - - switch (i % 4) { - case 0: - decoded_data[j] = ch << 2; - break; - case 1: - decoded_data[j++] |= ch >> 4; - decoded_data[j] = (ch & 0x0f) << 4; - break; - case 2: - decoded_data[j++] |= ch >> 2; - decoded_data[j] = (ch & 0x03) << 6; - break; - case 3: - decoded_data[j++] |= ch; - break; - default: - break; - } - - i++; - } - - k = j; - /* mop things up if we ended on a boundary */ - if (ch == base64_pad) { - switch (i % 4) { - case 1: - return 0; - case 2: - k++; - [[fallthrough]]; - case 3: - decoded_data[k] = 0; - default: - break; - } - } - - decoded_data[j] = '\0'; - - return j; + size_t decode_len = 0; +#if defined(__aarch64__) || defined(_M_ARM64) + auto ret = do_base64_decode(reinterpret_cast<const char*>(data), length, decoded_data, + &decode_len, BASE64_FORCE_NEON64); +#else + auto ret = do_base64_decode(reinterpret_cast<const char*>(data), length, decoded_data, + &decode_len, 0); +#endif + return ret > 0 ? decode_len : -1; } bool base64_decode(const std::string& in, std::string* out) { diff --git a/be/src/vec/functions/function_bitmap.cpp b/be/src/vec/functions/function_bitmap.cpp index d1f6cf432ee..4d77b85259f 100644 --- a/be/src/vec/functions/function_bitmap.cpp +++ b/be/src/vec/functions/function_bitmap.cpp @@ -282,7 +282,7 @@ struct BitmapFromBase64 { decode_buff.resize(curr_decode_buff_len); last_decode_buff_len = curr_decode_buff_len; } - int outlen = base64_decode(src_str, src_size, decode_buff.data()); + auto outlen = base64_decode(src_str, src_size, decode_buff.data()); if (outlen < 0) { res.emplace_back(); null_map[i] = 1; @@ -1012,8 +1012,8 @@ struct BitmapToBase64 { } bitmap_val.write_to(ser_buff.data()); - int outlen = base64_encode((const unsigned char*)ser_buff.data(), cur_ser_size, - chars_data + encoded_offset); + auto outlen = base64_encode((const unsigned char*)ser_buff.data(), cur_ser_size, + chars_data + encoded_offset); DCHECK(outlen > 0); encoded_offset += (int)(4.0 * ceil((double)cur_ser_size / 3.0)); diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp index ce2c94b937b..4da6d11a5e1 100644 --- a/be/src/vec/functions/function_string.cpp +++ b/be/src/vec/functions/function_string.cpp @@ -817,7 +817,7 @@ struct ToBase64Impl { dst = dst_uptr.get(); } - int outlen = base64_encode((const unsigned char*)source, srclen, (unsigned char*)dst); + auto outlen = base64_encode((const unsigned char*)source, srclen, (unsigned char*)dst); StringOP::push_value_string(std::string_view(dst, outlen), i, dst_data, dst_offsets); } @@ -860,7 +860,7 @@ struct ToBase64OldImpl { dst = dst_uptr.get(); } - int outlen = base64_encode((const unsigned char*)source, srclen, (unsigned char*)dst); + auto outlen = base64_encode((const unsigned char*)source, srclen, (unsigned char*)dst); StringOP::push_value_string(std::string_view(dst, outlen), i, dst_data, dst_offsets); } @@ -902,7 +902,7 @@ struct FromBase64Impl { dst_uptr.reset(new char[cipher_len]); dst = dst_uptr.get(); } - int outlen = base64_decode(source, srclen, dst); + auto outlen = base64_decode(source, srclen, dst); if (outlen < 0) { StringOP::push_null_string(i, dst_data, dst_offsets, null_map); diff --git a/regression-test/suites/load_p0/stream_load/test_stream_load.groovy b/regression-test/suites/load_p0/stream_load/test_stream_load.groovy index ee69cc47779..6c002b2d29b 100644 --- a/regression-test/suites/load_p0/stream_load/test_stream_load.groovy +++ b/regression-test/suites/load_p0/stream_load/test_stream_load.groovy @@ -1030,7 +1030,7 @@ suite("test_stream_load", "p0") { set 'column_separator', '|' set 'columns', 'k1, k2, v1, v2, v3' set 'strict_mode', 'true' - set 'Authorization', 'Basic Y29tbW9uX3VzZXI6MTIzNDU2dGVzdCE=' + set 'Authorization', 'Basic Y29tbW9uX3VzZXJAJyUnOjEyMzQ1NnRlc3Qh' file 'test_auth.csv' time 10000 // limit inflight 10s diff --git a/regression-test/suites/load_p0/stream_load/test_stream_load_move_memtable.groovy b/regression-test/suites/load_p0/stream_load/test_stream_load_move_memtable.groovy index e0d00120552..09d9e57bf6e 100644 --- a/regression-test/suites/load_p0/stream_load/test_stream_load_move_memtable.groovy +++ b/regression-test/suites/load_p0/stream_load/test_stream_load_move_memtable.groovy @@ -877,8 +877,8 @@ suite("test_stream_load_move_memtable", "p0") { PROPERTIES ("replication_allocation" = "tag.location.default: 1"); """ - sql """create USER common_user1@'%' IDENTIFIED BY '123456test!'""" - sql """GRANT LOAD_PRIV ON *.* TO 'common_user1'@'%';""" + sql """create USER ddd IDENTIFIED BY '123456test!'""" + sql """GRANT LOAD_PRIV ON *.* TO 'ddd';""" streamLoad { table "${tableName13}" @@ -886,7 +886,7 @@ suite("test_stream_load_move_memtable", "p0") { set 'column_separator', '|' set 'columns', 'k1, k2, v1, v2, v3' set 'strict_mode', 'true' - set 'Authorization', 'Basic Y29tbW9uX3VzZXIxOjEyMzQ1NnRlc3Qh' + set 'Authorization', 'Basic ZGRkOjEyMzQ1NnRlc3Qh' set 'memtable_on_sink_node', 'true' file 'test_auth.csv' @@ -906,7 +906,7 @@ suite("test_stream_load_move_memtable", "p0") { } sql "sync" - sql """DROP USER 'common_user1'@'%'""" + sql """DROP USER 'ddd'""" // test default value def tableName14 = "test_default_value_mm" --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org