This is an automated email from the ASF dual-hosted git repository. lihaopeng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new a2d0d103631 [env](patch) use patch to opt bitshuffle (#38378) a2d0d103631 is described below commit a2d0d103631e5ebbd13ffe776cbd9fb72dba6dc8 Author: Mryange <59914473+mrya...@users.noreply.github.com> AuthorDate: Wed Jul 31 11:19:20 2024 +0800 [env](patch) use patch to opt bitshuffle (#38378) --- .../olap/rowset/segment_v2/bitshuffle_wrapper.cpp | 13 ++ thirdparty/build-thirdparty.sh | 7 +- thirdparty/download-thirdparty.sh | 13 ++ thirdparty/patches/bitshuffle-0.5.1.patch | 225 +++++++++++++++++++++ 4 files changed, 256 insertions(+), 2 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp b/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp index 7ad20f210c2..2dca068ac6f 100644 --- a/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp +++ b/be/src/olap/rowset/segment_v2/bitshuffle_wrapper.cpp @@ -34,6 +34,15 @@ #undef bshuf_compress_lz4 #undef bshuf_decompress_lz4 +#undef BITSHUFFLE_H +#define bshuf_compress_lz4_bound bshuf_compress_lz4_bound_neon +#define bshuf_compress_lz4 bshuf_compress_lz4_neon +#define bshuf_decompress_lz4 bshuf_decompress_lz4_neon +#include <bitshuffle/bitshuffle.h> // NOLINT(*) +#undef bshuf_compress_lz4_bound +#undef bshuf_compress_lz4 +#undef bshuf_decompress_lz4 + using base::CPU; namespace doris { @@ -63,6 +72,10 @@ __attribute__((constructor)) void SelectBitshuffleFunctions() { g_bshuf_compress_lz4 = bshuf_compress_lz4; g_bshuf_decompress_lz4 = bshuf_decompress_lz4; } +#elif defined(__ARM_NEON) && defined(__aarch64__) + g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound_neon; + g_bshuf_compress_lz4 = bshuf_compress_lz4_neon; + g_bshuf_decompress_lz4 = bshuf_decompress_lz4_neon; #else g_bshuf_compress_lz4_bound = bshuf_compress_lz4_bound; g_bshuf_compress_lz4 = bshuf_compress_lz4; diff --git a/thirdparty/build-thirdparty.sh b/thirdparty/build-thirdparty.sh index cf0d7576d0d..07acd21e44d 100755 --- a/thirdparty/build-thirdparty.sh +++ b/thirdparty/build-thirdparty.sh @@ -1141,7 +1141,7 @@ build_bitshuffle() { MACHINE_TYPE="$(uname -m)" # Becuase aarch64 don't support avx2, disable it. if [[ "${MACHINE_TYPE}" == "aarch64" || "${MACHINE_TYPE}" == 'arm64' ]]; then - arches=('default') + arches=('default' 'neon') fi to_link="" @@ -1153,6 +1153,9 @@ build_bitshuffle() { if [[ "${arch}" == "avx512" ]]; then arch_flag="-mavx512bw -mavx512f" fi + if [[ "${arch}" == "neon" ]]; then + arch_flag="-march=armv8-a+crc" + fi tmp_obj="bitshuffle_${arch}_tmp.o" dst_obj="bitshuffle_${arch}.o" "${CC}" ${EXTRA_CFLAGS:+${EXTRA_CFLAGS}} ${arch_flag:+${arch_flag}} -std=c99 "-I${PREFIX}/include/lz4" -O3 -DNDEBUG -c \ @@ -1162,7 +1165,7 @@ build_bitshuffle() { # Merge the object files together to produce a combined .o file. "${ld}" -r -o "${tmp_obj}" bitshuffle_core.o bitshuffle.o iochain.o # For the AVX2 symbols, suffix them. - if [[ "${arch}" == "avx2" ]] || [[ "${arch}" == "avx512" ]]; then + if [[ "${arch}" == "avx2" ]] || [[ "${arch}" == "avx512" ]] || [[ "${arch}" == "neon" ]]; then local nm="${DORIS_BIN_UTILS}/nm" local objcopy="${DORIS_BIN_UTILS}/objcopy" diff --git a/thirdparty/download-thirdparty.sh b/thirdparty/download-thirdparty.sh index ca26f448970..2f2f35734c5 100755 --- a/thirdparty/download-thirdparty.sh +++ b/thirdparty/download-thirdparty.sh @@ -502,4 +502,17 @@ if [[ " ${TP_ARCHIVES[*]} " =~ " KRB5 " ]]; then echo "Finished patching ${KRB5_SOURCE}" fi +# patch bitshuffle +if [[ " ${TP_ARCHIVES[*]} " =~ " BITSHUFFLE " ]]; then + if [[ "${BITSHUFFLE_SOURCE}" = "bitshuffle-0.5.1" ]]; then + cd "${TP_SOURCE_DIR}/${BITSHUFFLE_SOURCE}" + if [[ ! -f "${PATCHED_MARK}" ]]; then + patch -p1 <"${TP_PATCH_DIR}/bitshuffle-0.5.1.patch" + touch "${PATCHED_MARK}" + fi + cd - + fi + echo "Finished patching ${BITSHUFFLE_SOURCE}" +fi + # vim: ts=4 sw=4 ts=4 tw=100: diff --git a/thirdparty/patches/bitshuffle-0.5.1.patch b/thirdparty/patches/bitshuffle-0.5.1.patch new file mode 100644 index 00000000000..b7d7fcd68a9 --- /dev/null +++ b/thirdparty/patches/bitshuffle-0.5.1.patch @@ -0,0 +1,225 @@ +From 8463fb7b14f797de715d11cca2df4b9534f98d17 Mon Sep 17 00:00:00 2001 +From: Sebastian Pop <s...@amazon.com> +Date: Fri, 7 Apr 2023 21:26:54 +0000 +Subject: [PATCH 1/2] [arm64] use a better translation for move_mask + +No changes | With the patch | Speedup +$ python3 ./tests/test_ext.py | | +.bitshuffle 64 : 4.94 s/GB, 0.20 GB/s | 1.53 s/GB, 0.65 GB/s | 3.25x +.bitunshuffle 64 : 5.09 s/GB, 0.20 GB/s | 1.53 s/GB, 0.65 GB/s | 3.25x +.compress 64 : 5.26 s/GB, 0.19 GB/s | 1.80 s/GB, 0.55 GB/s | 2.89x +.compress zstd 64 : 8.02 s/GB, 0.12 GB/s | 4.80 s/GB, 0.21 GB/s | 1.75x +.decompress 64 : 5.72 s/GB, 0.17 GB/s | 2.21 s/GB, 0.45 GB/s | 2.64x +.decompress zstd 64 : 5.71 s/GB, 0.18 GB/s | 2.18 s/GB, 0.46 GB/s | 2.55x +--- + src/bitshuffle_core.c | 89 +++++++++++++++++++++++++++++-------------- + 1 file changed, 60 insertions(+), 29 deletions(-) + +diff --git a/src/bitshuffle_core.c b/src/bitshuffle_core.c +index ba41473..22203db 100644 +--- a/src/bitshuffle_core.c ++++ b/src/bitshuffle_core.c +@@ -605,44 +605,59 @@ int64_t bshuf_trans_byte_elem_NEON(const void* in, void* out, const size_t size, + } + } + +- +-/* Creates a mask made up of the most significant +- * bit of each byte of 'input' +- */ +-int32_t move_byte_mask_neon(uint8x16_t input) { +- +- return ( ((input[0] & 0x80) >> 7) | (((input[1] & 0x80) >> 7) << 1) | (((input[2] & 0x80) >> 7) << 2) | (((input[3] & 0x80) >> 7) << 3) +- | (((input[4] & 0x80) >> 7) << 4) | (((input[5] & 0x80) >> 7) << 5) | (((input[6] & 0x80) >> 7) << 6) | (((input[7] & 0x80) >> 7) << 7) +- | (((input[8] & 0x80) >> 7) << 8) | (((input[9] & 0x80) >> 7) << 9) | (((input[10] & 0x80) >> 7) << 10) | (((input[11] & 0x80) >> 7) << 11) +- | (((input[12] & 0x80) >> 7) << 12) | (((input[13] & 0x80) >> 7) << 13) | (((input[14] & 0x80) >> 7) << 14) | (((input[15] & 0x80) >> 7) << 15) +- ); ++uint64_t neonmovemask_bulk(uint8x16_t p0, uint8x16_t p1, uint8x16_t p2, uint8x16_t p3) { ++ const uint8x16_t bitmask = { 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, ++ 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; ++ uint8x16_t t0 = vandq_u8(p0, bitmask); ++ uint8x16_t t1 = vandq_u8(p1, bitmask); ++ uint8x16_t t2 = vandq_u8(p2, bitmask); ++ uint8x16_t t3 = vandq_u8(p3, bitmask); ++ uint8x16_t sum0 = vpaddq_u8(t0, t1); ++ uint8x16_t sum1 = vpaddq_u8(t2, t3); ++ sum0 = vpaddq_u8(sum0, sum1); ++ sum0 = vpaddq_u8(sum0, sum0); ++ return vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); + } + + /* Transpose bits within bytes. */ + int64_t bshuf_trans_bit_byte_NEON(const void* in, void* out, const size_t size, + const size_t elem_size) { + +- size_t ii, kk; ++ size_t ii; + const char* in_b = (const char*) in; + char* out_b = (char*) out; +- uint16_t* out_ui16; +- + int64_t count; +- + size_t nbyte = elem_size * size; + + CHECK_MULT_EIGHT(nbyte); + +- int16x8_t xmm; +- int32_t bt; ++ const uint8x16_t a0 = vdupq_n_u8(0x80); ++ const uint8x16_t a1 = vdupq_n_u8(0x40); ++ const uint8x16_t a2 = vdupq_n_u8(0x20); ++ const uint8x16_t a3 = vdupq_n_u8(0x10); ++ const uint8x16_t a4 = vdupq_n_u8(0x8); ++ const uint8x16_t a5 = vdupq_n_u8(0x4); ++ const uint8x16_t a6 = vdupq_n_u8(0x2); ++ const uint8x16_t a7 = vdupq_n_u8(0x1); + + for (ii = 0; ii + 15 < nbyte; ii += 16) { +- xmm = vld1q_s16((int16_t *) (in_b + ii)); ++ uint8x16_t x = vld1q_u8((uint8_t *) (in_b + ii)); ++ uint8x16_t x0 = vceqq_u8(a0, vandq_u8(x, a0)); ++ uint8x16_t x1 = vceqq_u8(a1, vandq_u8(x, a1)); ++ uint8x16_t x2 = vceqq_u8(a2, vandq_u8(x, a2)); ++ uint8x16_t x3 = vceqq_u8(a3, vandq_u8(x, a3)); ++ uint8x16_t x4 = vceqq_u8(a4, vandq_u8(x, a4)); ++ uint8x16_t x5 = vceqq_u8(a5, vandq_u8(x, a5)); ++ uint8x16_t x6 = vceqq_u8(a6, vandq_u8(x, a6)); ++ uint8x16_t x7 = vceqq_u8(a7, vandq_u8(x, a7)); ++ ++ uint64_t out[2]; ++ out[0] = neonmovemask_bulk(x0, x1, x2, x3); ++ out[1] = neonmovemask_bulk(x4, x5, x6, x7); ++ int kk; + for (kk = 0; kk < 8; kk++) { +- bt = move_byte_mask_neon((uint8x16_t) xmm); +- xmm = vshlq_n_s16(xmm, 1); +- out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; +- *out_ui16 = bt; ++ uint16_t *out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; ++ *out_ui16 = ((uint16_t*)out)[kk]; + } + } + count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size, +@@ -785,21 +800,37 @@ int64_t bshuf_shuffle_bit_eightelem_NEON(const void* in, void* out, const size_t + size_t ii, jj, kk; + size_t nbyte = elem_size * size; + +- int16x8_t xmm; +- int32_t bt; +- + if (elem_size % 2) { + bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size); + } else { ++ const uint8x16_t a0 = vdupq_n_u8(0x80); ++ const uint8x16_t a1 = vdupq_n_u8(0x40); ++ const uint8x16_t a2 = vdupq_n_u8(0x20); ++ const uint8x16_t a3 = vdupq_n_u8(0x10); ++ const uint8x16_t a4 = vdupq_n_u8(0x8); ++ const uint8x16_t a5 = vdupq_n_u8(0x4); ++ const uint8x16_t a6 = vdupq_n_u8(0x2); ++ const uint8x16_t a7 = vdupq_n_u8(0x1); + for (ii = 0; ii + 8 * elem_size - 1 < nbyte; + ii += 8 * elem_size) { + for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) { +- xmm = vld1q_s16((int16_t *) &in_b[ii + jj]); ++ uint8x16_t x = vld1q_u8((uint8_t *) &in_b[ii + jj]); ++ uint8x16_t x0 = vceqq_u8(a0, vandq_u8(x, a0)); ++ uint8x16_t x1 = vceqq_u8(a1, vandq_u8(x, a1)); ++ uint8x16_t x2 = vceqq_u8(a2, vandq_u8(x, a2)); ++ uint8x16_t x3 = vceqq_u8(a3, vandq_u8(x, a3)); ++ uint8x16_t x4 = vceqq_u8(a4, vandq_u8(x, a4)); ++ uint8x16_t x5 = vceqq_u8(a5, vandq_u8(x, a5)); ++ uint8x16_t x6 = vceqq_u8(a6, vandq_u8(x, a6)); ++ uint8x16_t x7 = vceqq_u8(a7, vandq_u8(x, a7)); ++ ++ uint64_t out[2]; ++ out[0] = neonmovemask_bulk(x0, x1, x2, x3); ++ out[1] = neonmovemask_bulk(x4, x5, x6, x7); ++ + for (kk = 0; kk < 8; kk++) { +- bt = move_byte_mask_neon((uint8x16_t) xmm); +- xmm = vshlq_n_s16(xmm, 1); + size_t ind = (ii + jj / 8 + (7 - kk) * elem_size); +- out_ui16[ind / 2] = bt; ++ out_ui16[ind / 2] = ((uint16_t *)out)[kk]; + } + } + } +-- +2.31.1 + + +From db32ce8d0f0ab3f245008da58f0fe983243f6823 Mon Sep 17 00:00:00 2001 +From: Sebastian Pop <s...@amazon.com> +Date: Sat, 15 Apr 2023 13:42:06 +0000 +Subject: [PATCH 2/2] fix aliasing bug + +Patch from Andrew Pinski <pins...@gcc.gnu.org>. +--- + src/bitshuffle_core.c | 14 ++++++++------ + 1 file changed, 8 insertions(+), 6 deletions(-) + +diff --git a/src/bitshuffle_core.c b/src/bitshuffle_core.c +index 22203db..f3b6ca0 100644 +--- a/src/bitshuffle_core.c ++++ b/src/bitshuffle_core.c +@@ -49,6 +49,8 @@ typedef int64_t omp_size_t; + typedef size_t omp_size_t; + #endif + ++typedef uint16_t alias_uint16_t __attribute__((may_alias)); ++ + // Macros. + #define CHECK_MULT_EIGHT(n) if (n % 8) return -80; + #define MAX(X,Y) ((X) > (Y) ? (X) : (Y)) +@@ -656,8 +658,8 @@ int64_t bshuf_trans_bit_byte_NEON(const void* in, void* out, const size_t size, + out[1] = neonmovemask_bulk(x4, x5, x6, x7); + int kk; + for (kk = 0; kk < 8; kk++) { +- uint16_t *out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; +- *out_ui16 = ((uint16_t*)out)[kk]; ++ alias_uint16_t *out_ui16 = (alias_uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; ++ *out_ui16 = ((alias_uint16_t*)out)[kk]; + } + } + count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size, +@@ -795,7 +797,7 @@ int64_t bshuf_shuffle_bit_eightelem_NEON(const void* in, void* out, const size_t + // With a bit of care, this could be written such that such that it is + // in_buf = out_buf safe. + const char* in_b = (const char*) in; +- uint16_t* out_ui16 = (uint16_t*) out; ++ alias_uint16_t* out_ui16 = (alias_uint16_t*) out; + + size_t ii, jj, kk; + size_t nbyte = elem_size * size; +@@ -830,7 +832,7 @@ int64_t bshuf_shuffle_bit_eightelem_NEON(const void* in, void* out, const size_t + + for (kk = 0; kk < 8; kk++) { + size_t ind = (ii + jj / 8 + (7 - kk) * elem_size); +- out_ui16[ind / 2] = ((uint16_t *)out)[kk]; ++ out_ui16[ind / 2] = ((alias_uint16_t *)out)[kk]; + } + } + } +@@ -1145,7 +1147,7 @@ int64_t bshuf_trans_bit_byte_SSE(const void* in, void* out, const size_t size, + size_t ii, kk; + const char* in_b = (const char*) in; + char* out_b = (char*) out; +- uint16_t* out_ui16; ++ alias_uint16_t* out_ui16; + + int64_t count; + +@@ -1161,7 +1163,7 @@ int64_t bshuf_trans_bit_byte_SSE(const void* in, void* out, const size_t size, + for (kk = 0; kk < 8; kk++) { + bt = _mm_movemask_epi8(xmm); + xmm = _mm_slli_epi16(xmm, 1); +- out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; ++ out_ui16 = (alias_uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8]; + *out_ui16 = bt; + } + } +-- +2.31.1 + --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org