This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 3f95bc1882 GH-46788: [C++][Parquet] Enable SIMD for byte stream split with 2 streams (#46789)
3f95bc1882 is described below
commit 3f95bc18828d3649fe0c4a437bd70150819952cb
Author: Antoine Prouvost <[email protected]>
AuthorDate: Wed Jun 25 11:09:52 2025 +0200
GH-46788: [C++][Parquet] Enable SIMD for byte stream split with 2 streams (#46789)
### Rationale for this change
Performance improvements for byte stream split encoding and decoding with two streams. `f16` values, which are often used in machine learning, are one example of such a two-byte type.
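For context, byte stream split with 2 streams stores the first byte of every value contiguously, followed by the second byte of every value. A minimal scalar sketch of the encoding (illustrative only; the function name is hypothetical and this is not the SIMD code from this PR):

```cpp
#include <cstdint>

// Scalar reference for byte stream split encoding with 2 streams:
// byte 0 of each value goes to stream 0, byte 1 to stream 1.
void ByteStreamSplitEncodeScalar2(const uint8_t* raw_values, int64_t num_values,
                                  uint8_t* out) {
  for (int64_t i = 0; i < num_values; ++i) {
    out[i] = raw_values[2 * i];                   // stream 0
    out[num_values + i] = raw_values[2 * i + 1];  // stream 1
  }
}
```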
### What changes are included in this PR?
- Extending `ByteStreamSplitDecodeSimd128` to 2 streams was a straightforward, beneficial change.
- `ByteStreamSplitEncodeSimd128` was significantly refactored to make it more generic (see the compile-time sketch after this list). With the new implementation, we can investigate merging it with the `avx2` version.
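As a rough illustration of the new genericity, the refactored encoder derives its shuffle stage counts from the stream count at compile time. A sketch of that arithmetic for 2 streams on a 16-byte batch, following the formulas in the diff below (constant names are copied from the patch purely for illustration; this is not code from the PR):

```cpp
// Worked compile-time example of the stage computation for kNumStreams == 2.
constexpr int kBatchSize = 16;                               // 128-bit SIMD batch
constexpr int kNumStreams = 2;                               // e.g. f16 values
constexpr int kNumValuesInBatch = kBatchSize / kNumStreams;  // 8 values per batch
constexpr int kNumBytes = 2 * kNumValuesInBatch;             // 16 bytes zipped together
constexpr int kNumStepsByte = 3 + 1;                         // ReversePow2(8) + 1
constexpr int kNumStepsLarge = 0;                            // ReversePow2(16 / 16)
static_assert(kNumStepsByte + kNumStepsLarge == 4);          // == ReversePow2(kBatchSize)
```

With `kNumBytes == kBatchSize`, `kNumStepsLarge` is 0, so for 2 streams the second, wide-type zipping phase never runs and all four stages are byte-level zips.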
### Are these changes tested?
Yes with existing tests.
### Are there any user-facing changes?
No.
* GitHub Issue: #46788
Authored-by: AntoinePrv <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/util/byte_stream_split_internal.h | 235 ++++++++++++++----------
cpp/src/arrow/util/byte_stream_split_test.cc | 12 +-
cpp/src/arrow/util/type_traits.h | 27 +++
cpp/src/parquet/encoding_benchmark.cc | 31 +++-
4 files changed, 199 insertions(+), 106 deletions(-)
diff --git a/cpp/src/arrow/util/byte_stream_split_internal.h b/cpp/src/arrow/util/byte_stream_split_internal.h
index d3214239ff..2eb678cbfb 100644
--- a/cpp/src/arrow/util/byte_stream_split_internal.h
+++ b/cpp/src/arrow/util/byte_stream_split_internal.h
@@ -20,6 +20,7 @@
#include "arrow/util/endian.h"
#include "arrow/util/simd.h"
#include "arrow/util/small_vector.h"
+#include "arrow/util/type_traits.h"
#include "arrow/util/ubsan.h"
#include <algorithm>
@@ -39,16 +40,34 @@ namespace arrow::util::internal {
// SIMD implementations
//
+template <typename T>
+constexpr T ReversePow2(T x) {
+ for (T n = 0, y = 1; n <= (8 * static_cast<T>(sizeof(T))); ++n, y = y * 2) {
+ if (y == x) {
+ return n;
+ }
+ }
+ return 0;
+}
+
+static_assert(ReversePow2(8) == 3);
+static_assert(ReversePow2(4) == 2);
+static_assert(ReversePow2(2) == 1);
+
#if defined(ARROW_HAVE_NEON) || defined(ARROW_HAVE_SSE4_2)
template <int kNumStreams>
void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_values,
                                  int64_t stride, uint8_t* out) {
using simd_batch = xsimd::make_sized_batch_t<int8_t, 16>;
+  static_assert(kNumStreams <= simd_batch::size,
+                "The algorithm requires the number of streams to be at most the "
+                "SIMD batch size.");
assert(width == kNumStreams);
-  static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams.");
- constexpr int kNumStreamsLog2 = (kNumStreams == 8 ? 3 : 2);
- constexpr int64_t kBlockSize = sizeof(simd_batch) * kNumStreams;
+ constexpr int kNumStreamsLog2 = ReversePow2(kNumStreams);
+  static_assert(kNumStreamsLog2 != 0,
+                "The algorithm requires the number of streams to be a power of two.");
+ constexpr int64_t kBlockSize = simd_batch::size * kNumStreams;
const int64_t size = num_values * kNumStreams;
const int64_t num_blocks = size / kBlockSize;
@@ -71,13 +90,12 @@ void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va
// Stage 1: AAAA BBBB CCCC DDDD
// Stage 2: ACAC ACAC BDBD BDBD
// Stage 3: ABCD ABCD ABCD ABCD
- simd_batch stage[kNumStreamsLog2 + 1][kNumStreams];
constexpr int kNumStreamsHalf = kNumStreams / 2U;
for (int64_t i = 0; i < num_blocks; ++i) {
+ simd_batch stage[kNumStreamsLog2 + 1][kNumStreams];
for (int j = 0; j < kNumStreams; ++j) {
-      stage[0][j] =
-          simd_batch::load_unaligned(&data[i * sizeof(simd_batch) + j * stride]);
+      stage[0][j] = simd_batch::load_unaligned(&data[i * simd_batch::size + j * stride]);
}
for (int step = 0; step < kNumStreamsLog2; ++step) {
for (int j = 0; j < kNumStreamsHalf; ++j) {
@@ -89,23 +107,54 @@ void ByteStreamSplitDecodeSimd128(const uint8_t* data, int width, int64_t num_va
}
for (int j = 0; j < kNumStreams; ++j) {
xsimd::store_unaligned(
-          reinterpret_cast<int8_t*>(out + (i * kNumStreams + j) * sizeof(simd_batch)),
+          reinterpret_cast<int8_t*>(out + (i * kNumStreams + j) * simd_batch::size),
stage[kNumStreamsLog2][j]);
}
}
}
+// Like xsimd::zip_lo, but zip groups of kNumBytes at once.
+template <int kNumBytes, int kBatchSize = 16,
+ typename Batch = xsimd::make_sized_batch_t<int8_t, kBatchSize>>
+auto zip_lo_n(Batch const& a, Batch const& b) -> Batch {
+ using arrow::internal::SizedInt;
+
+ if constexpr (kNumBytes == kBatchSize) {
+ return a;
+ } else {
+ return xsimd::bitwise_cast<int8_t>(
+ xsimd::zip_lo(xsimd::bitwise_cast<SizedInt<kNumBytes>>(a),
+ xsimd::bitwise_cast<SizedInt<kNumBytes>>(b)));
+ }
+}
+
+// Like xsimd::zip_hi, but zip groups of kNumBytes at once.
+template <int kNumBytes, int kBatchSize = 16,
+ typename Batch = xsimd::make_sized_batch_t<int8_t, kBatchSize>>
+auto zip_hi_n(Batch const& a, Batch const& b) -> Batch {
+ using arrow::internal::SizedInt;
+
+ if constexpr (kNumBytes == kBatchSize) {
+ return b;
+ } else {
+ return xsimd::bitwise_cast<int8_t>(
+ xsimd::zip_hi(xsimd::bitwise_cast<SizedInt<kNumBytes>>(a),
+ xsimd::bitwise_cast<SizedInt<kNumBytes>>(b)));
+ }
+}
+
template <int kNumStreams>
void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width,
                                  const int64_t num_values, uint8_t* output_buffer_raw) {
using simd_batch = xsimd::make_sized_batch_t<int8_t, 16>;
assert(width == kNumStreams);
-  static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams.");
- constexpr int kBlockSize = sizeof(simd_batch) * kNumStreams;
-
- simd_batch stage[3][kNumStreams];
- simd_batch final_result[kNumStreams];
+  static_assert(kNumStreams <= simd_batch::size,
+                "The algorithm requires the number of streams to be at most the "
+                "SIMD batch size.");
+ constexpr int kBlockSize = simd_batch::size * kNumStreams;
+  static_assert(ReversePow2(kNumStreams) != 0,
+                "The algorithm requires the number of streams to be a power of two.");
const int64_t size = num_values * kNumStreams;
const int64_t num_blocks = size / kBlockSize;
@@ -123,95 +172,80 @@ void ByteStreamSplitEncodeSimd128(const uint8_t* raw_values, int width,
output_buffer_raw[j * num_values + i] = byte_in_value;
}
}
-  // The current shuffling algorithm diverges for float and double types but the compiler
-  // should be able to remove the branch since only one path is taken for each template
-  // instantiation.
- // Example run for 32-bit variables:
- // Step 0: copy from unaligned input bytes:
- // 0: ABCD ABCD ABCD ABCD 1: ABCD ABCD ABCD ABCD ...
- // Step 1: simd_batch<int8_t, 8>::zip_lo and simd_batch<int8_t, 8>::zip_hi:
- // 0: AABB CCDD AABB CCDD 1: AABB CCDD AABB CCDD ...
-  // Step 2: apply simd_batch<int8_t, 8>::zip_lo and simd_batch<int8_t, 8>::zip_hi again:
- // 0: AAAA BBBB CCCC DDDD 1: AAAA BBBB CCCC DDDD ...
- // Step 3: simd_batch<int8_t, 8>::zip_lo and simd_batch<int8_t, 8>::zip_hi:
- // 0: AAAA AAAA BBBB BBBB 1: CCCC CCCC DDDD DDDD ...
- // Step 4: simd_batch<int64_t, 2>::zip_lo and simd_batch<int64_t, 2>::zip_hi:
- // 0: AAAA AAAA AAAA AAAA 1: BBBB BBBB BBBB BBBB ...
+
+ // Number of input values we can fit in a simd register
+ constexpr int kNumValuesInBatch = simd_batch::size / kNumStreams;
+ static_assert(kNumValuesInBatch > 0);
+  // Number of bytes we'll bring together in the first byte-level part of the algorithm.
+  // Since we zip with the next batch, the number of values in a batch determines how many
+  // bytes end up together before we can use a larger type.
+  constexpr int kNumBytes = 2 * kNumValuesInBatch;
+ // Number of steps in the first part of the algorithm with byte-level zipping
+ constexpr int kNumStepsByte = ReversePow2(kNumValuesInBatch) + 1;
+  // Number of steps in the second part of the algorithm with large data type zipping
+  constexpr int kNumStepsLarge =
+      ReversePow2(static_cast<int>(simd_batch::size) / kNumBytes);
+ // Total number of steps
+ constexpr int kNumSteps = kNumStepsByte + kNumStepsLarge;
+ static_assert(kNumSteps == ReversePow2(simd_batch::size));
+
+  // Two-step shuffling algorithm that starts with bytes and ends with a larger
+  // data type. An algorithm similar to the decoding one with
+  // log2(simd_batch::size) + 1 stages is also valid, but not as performant.
for (int64_t block_index = 0; block_index < num_blocks; ++block_index) {
+ simd_batch stage[kNumSteps + 1][kNumStreams];
+
// First copy the data to stage 0.
for (int i = 0; i < kNumStreams; ++i) {
stage[0][i] = simd_batch::load_unaligned(
- reinterpret_cast<const int8_t*>(raw_values) +
- (block_index * kNumStreams + i) * sizeof(simd_batch));
+ &raw_values[(block_index * kNumStreams + i) * simd_batch::size]);
}
+    // We first do byte-level shuffling, until we have gathered enough bytes together
+    // and in the correct order to use a bigger data type.
+ //
+ // Example with 32bit data on 128 bit register:
+ //
+    // 0: A0B0C0D0 A1B1C1D1 A2B2C2D2 A3B3C3D3 | A4B4C4D4 A5B5C5D5 A6B6C6D6 A7B7C7D7 | ...
+    // 1: A0A4B0B4 C0C4D0D4 A1A5B1B5 C1C5D1D5 | A2A6B2B6 C2C6D2D6 A3A7B3B7 C3C7D3D7 | ...
+    // 2: A0A2A4A6 B0B2B4B6 C0C2C4C6 D0D2D4D6 | A1A3A5A7 B1B3B5B7 C1C3C5C7 D1D3D5D7 | ...
+    // 3: A0A1A2A3 A4A5A6A7 B0B1B2B3 B4B5B6B7 | C0C1C2C3 C4C5C6C7 D0D1D2D3 D4D5D6D7 | ...
+ //
// The shuffling of bytes is performed through the unpack intrinsics.
// In my measurements this gives better performance than an implementation
// which uses the shuffle intrinsics.
- for (int stage_lvl = 0; stage_lvl < 2; ++stage_lvl) {
- for (int i = 0; i < kNumStreams / 2; ++i) {
-        stage[stage_lvl + 1][i * 2] =
-            xsimd::zip_lo(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
-        stage[stage_lvl + 1][i * 2 + 1] =
-            xsimd::zip_hi(stage[stage_lvl][i * 2], stage[stage_lvl][i * 2 + 1]);
+ //
+    // Loop order does not matter, so we prefer higher locality
+ constexpr int kNumStreamsHalf = kNumStreams / 2;
+ for (int i = 0; i < kNumStreamsHalf; ++i) {
+ for (int step = 0; step < kNumStepsByte; ++step) {
+ stage[step + 1][i * 2] =
+ xsimd::zip_lo(stage[step][i * 2], stage[step][i * 2 + 1]);
+ stage[step + 1][i * 2 + 1] =
+ xsimd::zip_hi(stage[step][i * 2], stage[step][i * 2 + 1]);
}
}
- if constexpr (kNumStreams == 8) {
- // This is the path for 64bits data.
- simd_batch tmp[8];
- using int32_batch = xsimd::make_sized_batch_t<int32_t, 4>;
-      // This is a workaround, see: https://github.com/xtensor-stack/xsimd/issues/735
- auto from_int32_batch = [](int32_batch from) -> simd_batch {
- simd_batch dest;
- memcpy(&dest, &from, sizeof(simd_batch));
- return dest;
- };
- auto to_int32_batch = [](simd_batch from) -> int32_batch {
- int32_batch dest;
- memcpy(&dest, &from, sizeof(simd_batch));
- return dest;
- };
- for (int i = 0; i < 4; ++i) {
-        tmp[i * 2] = from_int32_batch(
-            xsimd::zip_lo(to_int32_batch(stage[2][i]), to_int32_batch(stage[2][i + 4])));
-        tmp[i * 2 + 1] = from_int32_batch(
-            xsimd::zip_hi(to_int32_batch(stage[2][i]), to_int32_batch(stage[2][i + 4])));
- }
- for (int i = 0; i < 4; ++i) {
- final_result[i * 2] = from_int32_batch(
- xsimd::zip_lo(to_int32_batch(tmp[i]), to_int32_batch(tmp[i + 4])));
- final_result[i * 2 + 1] = from_int32_batch(
- xsimd::zip_hi(to_int32_batch(tmp[i]), to_int32_batch(tmp[i + 4])));
- }
- } else {
- // This is the path for 32bits data.
- using int64_batch = xsimd::make_sized_batch_t<int64_t, 2>;
-      // This is a workaround, see: https://github.com/xtensor-stack/xsimd/issues/735
- auto from_int64_batch = [](int64_batch from) -> simd_batch {
- simd_batch dest;
- memcpy(&dest, &from, sizeof(simd_batch));
- return dest;
- };
- auto to_int64_batch = [](simd_batch from) -> int64_batch {
- int64_batch dest;
- memcpy(&dest, &from, sizeof(simd_batch));
- return dest;
- };
- simd_batch tmp[4];
- for (int i = 0; i < 2; ++i) {
- tmp[i * 2] = xsimd::zip_lo(stage[2][i * 2], stage[2][i * 2 + 1]);
- tmp[i * 2 + 1] = xsimd::zip_hi(stage[2][i * 2], stage[2][i * 2 + 1]);
- }
- for (int i = 0; i < 2; ++i) {
- final_result[i * 2] = from_int64_batch(
- xsimd::zip_lo(to_int64_batch(tmp[i]), to_int64_batch(tmp[i + 2])));
- final_result[i * 2 + 1] = from_int64_batch(
- xsimd::zip_hi(to_int64_batch(tmp[i]), to_int64_batch(tmp[i + 2])));
+
+    // We now have the bytes packed in a larger data type and in the correct order to
+    // start using a bigger data type.
+    //
+    // Example with 32bit data on 128 bit register.
+    // The large data type is int64_t with kNumBytes=8 bytes:
+    //
+    // 4: A0A1A2A3 A4A5A6A7 A8A9AAAB ACADAEAF | B0B1B2B3 B4B5B6B7 B8B9BABB BCBDBEBF | ...
+ for (int step = kNumStepsByte; step < kNumSteps; ++step) {
+ for (int i = 0; i < kNumStreamsHalf; ++i) {
+        stage[step + 1][i * 2] =
+            zip_lo_n<kNumBytes>(stage[step][i], stage[step][i + kNumStreamsHalf]);
+        stage[step + 1][i * 2 + 1] =
+            zip_hi_n<kNumBytes>(stage[step][i], stage[step][i + kNumStreamsHalf]);
}
}
+
+ // Save the encoded data to the output buffer
for (int i = 0; i < kNumStreams; ++i) {
-      xsimd::store_unaligned(&output_buffer_streams[i][block_index * sizeof(simd_batch)],
-                             final_result[i]);
+      xsimd::store_unaligned(&output_buffer_streams[i][block_index * simd_batch::size],
+                             stage[kNumSteps][i]);
}
}
}
@@ -309,13 +343,9 @@ template <int kNumStreams>
void ByteStreamSplitEncodeAvx2(const uint8_t* raw_values, int width,
                               const int64_t num_values, uint8_t* output_buffer_raw) {
assert(width == kNumStreams);
-  static_assert(kNumStreams == 4 || kNumStreams == 8, "Invalid number of streams.");
+ static_assert(kNumStreams == 4, "Invalid number of streams.");
constexpr int kBlockSize = sizeof(__m256i) * kNumStreams;
-  if constexpr (kNumStreams == 8)  // Back to SSE, currently no path for double.
-    return ByteStreamSplitEncodeSimd128<kNumStreams>(raw_values, width, num_values,
-                                                     output_buffer_raw);
-
const int64_t size = num_values * kNumStreams;
if (size < kBlockSize) // Back to SSE for small size
    return ByteStreamSplitEncodeSimd128<kNumStreams>(raw_values, width, num_values,
@@ -384,7 +414,12 @@ template <int kNumStreams>
void inline ByteStreamSplitDecodeSimd(const uint8_t* data, int width, int64_t num_values,
                                      int64_t stride, uint8_t* out) {
# if defined(ARROW_HAVE_AVX2)
-  return ByteStreamSplitDecodeAvx2<kNumStreams>(data, width, num_values, stride, out);
+  // The AVX2 version is not implemented for 2 streams; fall back to SIMD128
+  if constexpr (kNumStreams == 2) {
+    return ByteStreamSplitDecodeSimd128<2>(data, width, num_values, stride, out);
+  } else {
+    return ByteStreamSplitDecodeAvx2<kNumStreams>(data, width, num_values, stride, out);
+  }
# elif defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_NEON)
  return ByteStreamSplitDecodeSimd128<kNumStreams>(data, width, num_values, stride, out);
# else
@@ -397,8 +432,14 @@ void inline ByteStreamSplitEncodeSimd(const uint8_t* raw_values, int width,
const int64_t num_values,
uint8_t* output_buffer_raw) {
# if defined(ARROW_HAVE_AVX2)
- return ByteStreamSplitEncodeAvx2<kNumStreams>(raw_values, width, num_values,
- output_buffer_raw);
+  // The AVX2 version is not implemented for 2 or 8 streams; fall back to SIMD128
+  if constexpr (kNumStreams == 2 || kNumStreams == 8) {
+    return ByteStreamSplitEncodeSimd128<kNumStreams>(raw_values, width, num_values,
+                                                     output_buffer_raw);
+  } else {
+    return ByteStreamSplitEncodeAvx2<kNumStreams>(raw_values, width, num_values,
+                                                  output_buffer_raw);
+  }
# elif defined(ARROW_HAVE_SSE4_2) || defined(ARROW_HAVE_NEON)
  return ByteStreamSplitEncodeSimd128<kNumStreams>(raw_values, width, num_values,
output_buffer_raw);
@@ -555,7 +596,7 @@ inline void ByteStreamSplitEncode(const uint8_t* raw_values, int width,
memcpy(out, raw_values, num_values);
return;
case 2:
-      return ByteStreamSplitEncodeScalar<2>(raw_values, width, num_values, out);
+      return ByteStreamSplitEncodePerhapsSimd<2>(raw_values, width, num_values, out);
case 4:
      return ByteStreamSplitEncodePerhapsSimd<4>(raw_values, width, num_values, out);
case 8:
@@ -579,7 +620,7 @@ inline void ByteStreamSplitDecode(const uint8_t* data, int width, int64_t num_va
memcpy(out, data, num_values);
return;
case 2:
-      return ByteStreamSplitDecodeScalar<2>(data, width, num_values, stride, out);
+      return ByteStreamSplitDecodePerhapsSimd<2>(data, width, num_values, stride, out);
case 4:
      return ByteStreamSplitDecodePerhapsSimd<4>(data, width, num_values, stride, out);
case 8:
diff --git a/cpp/src/arrow/util/byte_stream_split_test.cc b/cpp/src/arrow/util/byte_stream_split_test.cc
index 9755cd8b8d..ec995a0a9a 100644
--- a/cpp/src/arrow/util/byte_stream_split_test.cc
+++ b/cpp/src/arrow/util/byte_stream_split_test.cc
@@ -136,7 +136,7 @@ class TestByteStreamSplitSpecialized : public ::testing::Test {
return input;
}
- template <bool kSimdImplemented = (kWidth == 4 || kWidth == 8)>
+  template <bool kSimdImplemented = (kWidth == 2 || kWidth == 4 || kWidth == 8)>
static std::vector<DecodeFunc> MakeDecodeFuncs() {
std::vector<DecodeFunc> funcs;
funcs.push_back({"scalar_dynamic", &ByteStreamSplitDecodeScalarDynamic});
@@ -146,7 +146,10 @@ class TestByteStreamSplitSpecialized : public ::testing::Test {
funcs.push_back({"simd", &ByteStreamSplitDecodeSimd<kWidth>});
funcs.push_back({"simd128", &ByteStreamSplitDecodeSimd128<kWidth>});
# if defined(ARROW_HAVE_AVX2)
- funcs.push_back({"avx2", &ByteStreamSplitDecodeAvx2<kWidth>});
+    // AVX2 decoding is only implemented for these widths
+ if constexpr (kWidth == 4 || kWidth == 8) {
+ funcs.push_back({"avx2", &ByteStreamSplitDecodeAvx2<kWidth>});
+ }
# endif
}
#endif // defined(ARROW_HAVE_SIMD_SPLIT)
@@ -164,7 +167,10 @@ class TestByteStreamSplitSpecialized : public ::testing::Test {
funcs.push_back({"simd", &ByteStreamSplitEncodeSimd<kWidth>});
funcs.push_back({"simd128", &ByteStreamSplitEncodeSimd128<kWidth>});
# if defined(ARROW_HAVE_AVX2)
- funcs.push_back({"avx2", &ByteStreamSplitEncodeAvx2<kWidth>});
+    // AVX2 encoding is only implemented for this width
+ if constexpr (kWidth == 4) {
+ funcs.push_back({"avx2", &ByteStreamSplitEncodeAvx2<kWidth>});
+ }
# endif
}
#endif // defined(ARROW_HAVE_SIMD_SPLIT)
diff --git a/cpp/src/arrow/util/type_traits.h b/cpp/src/arrow/util/type_traits.h
index c190615242..9c3b388dab 100644
--- a/cpp/src/arrow/util/type_traits.h
+++ b/cpp/src/arrow/util/type_traits.h
@@ -42,5 +42,32 @@ template <typename T>
struct is_null_pointer : std::is_same<std::nullptr_t, typename std::remove_cv<T>::type> {
};
+template <int kNumBytes>
+struct SizedIntImpl;
+
+template <>
+struct SizedIntImpl<1> {
+ using type = int8_t;
+};
+
+template <>
+struct SizedIntImpl<2> {
+ using type = int16_t;
+};
+
+template <>
+struct SizedIntImpl<4> {
+ using type = int32_t;
+};
+
+template <>
+struct SizedIntImpl<8> {
+ using type = int64_t;
+};
+
+// Map a number of bytes to a type
+template <int kNumBytes>
+using SizedInt = typename SizedIntImpl<kNumBytes>::type;
+
} // namespace internal
} // namespace arrow
diff --git a/cpp/src/parquet/encoding_benchmark.cc b/cpp/src/parquet/encoding_benchmark.cc
index 28cfcd98a3..aee30bf762 100644
--- a/cpp/src/parquet/encoding_benchmark.cc
+++ b/cpp/src/parquet/encoding_benchmark.cc
@@ -19,6 +19,7 @@
#include <array>
#include <cmath>
+#include <cstdint>
#include <limits>
#include <random>
@@ -508,6 +509,11 @@ BENCHMARK(BM_ByteStreamSplitEncode_Float_Scalar)->Apply(ByteStreamSplitApply);
BENCHMARK(BM_ByteStreamSplitEncode_Double_Scalar)->Apply(ByteStreamSplitApply);
#if defined(ARROW_HAVE_SSE4_2)
+static void BM_ByteStreamSplitDecode_Int16_Sse2(benchmark::State& state) {
+ BM_ByteStreamSplitDecode<int16_t>(
+      state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128<sizeof(int16_t)>);
+}
+
static void BM_ByteStreamSplitDecode_Float_Sse2(benchmark::State& state) {
BM_ByteStreamSplitDecode<float>(
      state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128<sizeof(float)>);
@@ -518,6 +524,11 @@ static void BM_ByteStreamSplitDecode_Double_Sse2(benchmark::State& state) {
      state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128<sizeof(double)>);
}
+static void BM_ByteStreamSplitEncode_Int16_Sse2(benchmark::State& state) {
+ BM_ByteStreamSplitEncode<int16_t>(
+      state, ::arrow::util::internal::ByteStreamSplitEncodeSimd128<sizeof(int16_t)>);
+}
+
static void BM_ByteStreamSplitEncode_Float_Sse2(benchmark::State& state) {
BM_ByteStreamSplitEncode<float>(
      state, ::arrow::util::internal::ByteStreamSplitEncodeSimd128<sizeof(float)>);
@@ -528,8 +539,10 @@ static void BM_ByteStreamSplitEncode_Double_Sse2(benchmark::State& state) {
      state, ::arrow::util::internal::ByteStreamSplitEncodeSimd128<sizeof(double)>);
}
+BENCHMARK(BM_ByteStreamSplitDecode_Int16_Sse2)->Apply(ByteStreamSplitApply);
BENCHMARK(BM_ByteStreamSplitDecode_Float_Sse2)->Apply(ByteStreamSplitApply);
BENCHMARK(BM_ByteStreamSplitDecode_Double_Sse2)->Apply(ByteStreamSplitApply);
+BENCHMARK(BM_ByteStreamSplitEncode_Int16_Sse2)->Apply(ByteStreamSplitApply);
BENCHMARK(BM_ByteStreamSplitEncode_Float_Sse2)->Apply(ByteStreamSplitApply);
BENCHMARK(BM_ByteStreamSplitEncode_Double_Sse2)->Apply(ByteStreamSplitApply);
#endif
@@ -550,18 +563,17 @@ static void BM_ByteStreamSplitEncode_Float_Avx2(benchmark::State& state) {
      state, ::arrow::util::internal::ByteStreamSplitEncodeAvx2<sizeof(float)>);
}
-static void BM_ByteStreamSplitEncode_Double_Avx2(benchmark::State& state) {
- BM_ByteStreamSplitEncode<double>(
-      state, ::arrow::util::internal::ByteStreamSplitEncodeAvx2<sizeof(double)>);
-}
-
BENCHMARK(BM_ByteStreamSplitDecode_Float_Avx2)->Apply(ByteStreamSplitApply);
BENCHMARK(BM_ByteStreamSplitDecode_Double_Avx2)->Apply(ByteStreamSplitApply);
BENCHMARK(BM_ByteStreamSplitEncode_Float_Avx2)->Apply(ByteStreamSplitApply);
-BENCHMARK(BM_ByteStreamSplitEncode_Double_Avx2)->Apply(ByteStreamSplitApply);
#endif
#if defined(ARROW_HAVE_NEON)
+static void BM_ByteStreamSplitDecode_Int16_Neon(benchmark::State& state) {
+ BM_ByteStreamSplitDecode<int16_t>(
+      state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128<sizeof(int16_t)>);
+}
+
static void BM_ByteStreamSplitDecode_Float_Neon(benchmark::State& state) {
BM_ByteStreamSplitDecode<float>(
      state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128<sizeof(float)>);
@@ -572,6 +584,11 @@ static void BM_ByteStreamSplitDecode_Double_Neon(benchmark::State& state) {
      state, ::arrow::util::internal::ByteStreamSplitDecodeSimd128<sizeof(double)>);
}
+static void BM_ByteStreamSplitEncode_Int16_Neon(benchmark::State& state) {
+ BM_ByteStreamSplitEncode<int16_t>(
+      state, ::arrow::util::internal::ByteStreamSplitEncodeSimd128<sizeof(int16_t)>);
+}
+
static void BM_ByteStreamSplitEncode_Float_Neon(benchmark::State& state) {
BM_ByteStreamSplitEncode<float>(
      state, ::arrow::util::internal::ByteStreamSplitEncodeSimd128<sizeof(float)>);
@@ -582,8 +599,10 @@ static void BM_ByteStreamSplitEncode_Double_Neon(benchmark::State& state) {
      state, ::arrow::util::internal::ByteStreamSplitEncodeSimd128<sizeof(double)>);
}
+BENCHMARK(BM_ByteStreamSplitDecode_Int16_Neon)->Range(MIN_RANGE, MAX_RANGE);
BENCHMARK(BM_ByteStreamSplitDecode_Float_Neon)->Range(MIN_RANGE, MAX_RANGE);
BENCHMARK(BM_ByteStreamSplitDecode_Double_Neon)->Range(MIN_RANGE, MAX_RANGE);
+BENCHMARK(BM_ByteStreamSplitEncode_Int16_Neon)->Range(MIN_RANGE, MAX_RANGE);
BENCHMARK(BM_ByteStreamSplitEncode_Float_Neon)->Range(MIN_RANGE, MAX_RANGE);
BENCHMARK(BM_ByteStreamSplitEncode_Double_Neon)->Range(MIN_RANGE, MAX_RANGE);
#endif