This is an automated email from the ASF dual-hosted git repository.
dheres pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new b80d457e6e Allocate buffers before work in `boolean_kernels` benchmark
(#9035)
b80d457e6e is described below
commit b80d457e6e1689e077a635f8b03afdc4be4fa06a
Author: Andrew Lamb <[email protected]>
AuthorDate: Thu Dec 25 09:31:11 2025 -0500
Allocate buffers before work in `boolean_kernels` benchmark (#9035)
# Which issue does this PR close?
<!--
We generally require a GitHub issue to be filed for all bug fixes and
enhancements and this helps us generate change logs for our releases.
You can link an issue to this PR using the GitHub syntax.
-->
- Related to https://github.com/apache/arrow-rs/issues/8806
- Related to https://github.com/apache/arrow-rs/pull/8996
# Rationale for this change
When working on improving the boolean kernels, I have seen significant
and unexplained noise from run to run. For example, just adding a fast
path for `u64`-aligned data resulted in a reported 30% regression in the
speed of slice24 (code that is not affected by the change at all).
See, for example, these results from https://github.com/apache/arrow-rs/pull/9022:
```
and 1.00 208.0±5.91ns ? ?/sec 1.34 278.8±10.07ns ? ?/sec
and_sliced_1 1.00 1100.2±6.53ns ? ?/sec 1.12 1226.9±6.11ns ? ?/sec
and_sliced_24 1.40 340.9±2.49ns ? ?/sec 1.00 243.7±2.13ns ? ?/sec
```
I also can't reproduce this effect locally or when I run the benchmarks
individually.
Given the above, and the tiny amount of time spent in the benchmark
(hundreds of nanoseconds), I believe what is happening is that, because
the allocation pattern changes during the benchmark runs (each kernel
allocates its output), data for subsequent iterations is allocated subtly
differently (e.g. the exact alignment or some other factor is
different).
This results in different performance characteristics even when the code
has not changed.
# What changes are included in this PR?
To reduce this noise, this PR changes the benchmark to pre-allocate all
of the input arrays (including the sliced ones) before any benchmark runs.
# Are these changes tested?
I ran them manually
# Are there any user-facing changes?
No, this is just a benchmark change
---
arrow/benches/boolean_kernels.rs | 37 +++++++++++++++++++++----------------
1 file changed, 21 insertions(+), 16 deletions(-)
diff --git a/arrow/benches/boolean_kernels.rs b/arrow/benches/boolean_kernels.rs
index 6ec507c958..a7231c031a 100644
--- a/arrow/benches/boolean_kernels.rs
+++ b/arrow/benches/boolean_kernels.rs
@@ -40,38 +40,43 @@ fn bench_not(array: &BooleanArray) {
}
fn add_benchmark(c: &mut Criterion) {
+ // allocate arrays of 32K elements
let size = 2usize.pow(15);
+
+ // Note we allocate all arrays before the benchmark to ensure the allocation of the arrays
+ // is not affected by allocations that happen during the benchmarked operation.
let array1 = create_boolean_array(size, 0.0, 0.5);
let array2 = create_boolean_array(size, 0.0, 0.5);
- c.bench_function("and", |b| b.iter(|| bench_and(&array1, &array2)));
- c.bench_function("or", |b| b.iter(|| bench_or(&array1, &array2)));
- c.bench_function("not", |b| b.iter(|| bench_not(&array1)));
// Slice by 1 (not aligned to byte (8 bit) or word (64 bit) boundaries)
let offset = 1;
- let array1_slice = array1.slice(offset, size - offset);
- let array2_slice = array2.slice(offset, size - offset);
+ let array1_sliced_1 = array1.slice(offset, size - offset);
+ let array2_sliced_1 = array2.slice(offset, size - offset);
+
+ // Slice by 24 (aligned on byte (8 bit) but not word (64 bit) boundaries)
+ let offset = 24;
+ let array1_sliced_24 = array1.slice(offset, size - offset);
+ let array2_sliced_24 = array2.slice(offset, size - offset);
+
+ c.bench_function("and", |b| b.iter(|| bench_and(&array1, &array2)));
+ c.bench_function("or", |b| b.iter(|| bench_or(&array1, &array2)));
+ c.bench_function("not", |b| b.iter(|| bench_not(&array1)));
c.bench_function("and_sliced_1", |b| {
- b.iter(|| bench_and(&array1_slice, &array2_slice))
+ b.iter(|| bench_and(&array1_sliced_1, &array2_sliced_1))
});
c.bench_function("or_sliced_1", |b| {
- b.iter(|| bench_or(&array1_slice, &array2_slice))
+ b.iter(|| bench_or(&array1_sliced_1, &array2_sliced_1))
});
- c.bench_function("not_sliced_1", |b| b.iter(|| bench_not(&array1_slice)));
-
- // Slice by 24 (aligned on byte (8 bit) but not word (64 bit) boundaries)
- let offset = 24;
- let array1_slice = array1.slice(offset, size - offset);
- let array2_slice = array2.slice(offset, size - offset);
+ c.bench_function("not_sliced_1", |b| b.iter(|| bench_not(&array1_sliced_1)));
c.bench_function("and_sliced_24", |b| {
- b.iter(|| bench_and(&array1_slice, &array2_slice))
+ b.iter(|| bench_and(&array1_sliced_24, &array2_sliced_24))
});
c.bench_function("or_sliced_24", |b| {
- b.iter(|| bench_or(&array1_slice, &array2_slice))
+ b.iter(|| bench_or(&array1_sliced_24, &array2_sliced_24))
});
- c.bench_function("not_slice_24", |b| b.iter(|| bench_not(&array1_slice)));
+ c.bench_function("not_slice_24", |b| b.iter(|| bench_not(&array1_sliced_24)));
}
criterion_group!(benches, add_benchmark);