This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 5a1e48260b bench: added to row_format benchmark conversion of 53
non-nested columns (#9081)
5a1e48260b is described below
commit 5a1e48260b0727ed52d9c070382fa21d69560672
Author: Raz Luvaton <[email protected]>
AuthorDate: Sat Jan 10 14:33:00 2026 +0200
bench: added to row_format benchmark conversion of 53 non-nested columns
(#9081)
# Which issue does this PR close?
N/A
# Rationale for this change
I noticed that when converting around 50 columns the conversion becomes very
slow, so I am adding a benchmark while I optimize those parts
# What changes are included in this PR?
Added a new benchmark for `row_format` that converts 53 non-nested column arrays
# Are these changes tested?
N/A
# Are there any user-facing changes?
Nope
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
arrow/benches/row_format.rs | 107 +++++++++++++++++++++++++++++++++++++++++--
arrow/src/util/bench_util.rs | 38 +++++++++++++++
2 files changed, 142 insertions(+), 3 deletions(-)
diff --git a/arrow/benches/row_format.rs b/arrow/benches/row_format.rs
index d67095ac2c..1c120bb2f2 100644
--- a/arrow/benches/row_format.rs
+++ b/arrow/benches/row_format.rs
@@ -23,13 +23,15 @@ use arrow::array::ArrayRef;
use arrow::datatypes::{Int64Type, UInt64Type};
use arrow::row::{RowConverter, SortField};
use arrow::util::bench_util::{
- create_boolean_array, create_dict_from_values, create_primitive_array,
- create_string_array_with_len, create_string_dict_array,
create_string_view_array_with_len,
+ create_boolean_array, create_boolean_array_with_seed,
create_dict_from_values,
+ create_f64_array_with_seed, create_primitive_array,
create_primitive_array_with_seed,
+ create_string_array_with_len,
create_string_array_with_len_range_and_prefix_and_seed,
+ create_string_dict_array, create_string_view_array_with_len,
create_string_view_array_with_max_len,
};
use arrow::util::data_gen::create_random_array;
use arrow_array::Array;
-use arrow_array::types::Int32Type;
+use arrow_array::types::{Int8Type, Int32Type};
use arrow_schema::{DataType, Field};
use criterion::Criterion;
use std::{hint, sync::Arc};
@@ -85,6 +87,102 @@ fn bench_iter(c: &mut Criterion) {
});
}
+/// A single benchmark with a medium number of columns (around 50) without
nested columns, for real-world use cases.
+/// This also makes sure there is a large gap between each value in the column
and how it is laid out in the row format,
+/// and that it is on the edge of not fitting in the L3 cache on some machines.
+fn run_benchmark_on_medium_amount_and_types_of_columns_without_nesting(
+ batch_size: usize,
+ c: &mut Criterion,
+) {
+ let mut seed = 0;
+
+ let mut cols: Vec<ArrayRef> = vec![];
+
+ for nulls in [0.0, 0.1, 0.2, 0.5] {
+ seed += 1;
+ cols.push(Arc::new(create_primitive_array_with_seed::<Int8Type>(
+ batch_size, nulls, seed,
+ )) as ArrayRef);
+ }
+
+ for nulls in [0.0, 0.1, 0.2, 0.5] {
+ seed += 1;
+ cols.push(Arc::new(create_primitive_array_with_seed::<Int32Type>(
+ batch_size, nulls, seed,
+ )) as ArrayRef);
+ }
+
+ for nulls in [0.0, 0.1, 0.2, 0.5] {
+ seed += 1;
+ cols.push(Arc::new(create_primitive_array_with_seed::<Int64Type>(
+ batch_size, nulls, seed,
+ )) as ArrayRef);
+ }
+
+ for _ in 0..10 {
+ seed += 1;
+ cols.push(Arc::new(create_primitive_array_with_seed::<Int64Type>(
+ batch_size, 0.0, seed,
+ )) as ArrayRef);
+ }
+
+ for nulls in [0.0, 0.1, 0.2, 0.5] {
+ seed += 1;
+ cols.push(Arc::new(
+ create_string_array_with_len_range_and_prefix_and_seed::<i32>(
+ batch_size, nulls, 0, 50, "", seed,
+ ),
+ ));
+ }
+
+ for _ in 0..3 {
+ seed += 1;
+ cols.push(Arc::new(
+ create_string_array_with_len_range_and_prefix_and_seed::<i32>(
+ batch_size, 0.0, 0, 10, "", seed,
+ ),
+ ));
+ }
+ for _ in 0..3 {
+ seed += 1;
+ cols.push(Arc::new(
+ create_string_array_with_len_range_and_prefix_and_seed::<i32>(
+ batch_size, 0.0, 10, 20, "", seed,
+ ),
+ ));
+ }
+ for _ in 0..3 {
+ seed += 1;
+ cols.push(Arc::new(
+ create_string_array_with_len_range_and_prefix_and_seed::<i32>(
+ batch_size, 0.0, 20, 30, "", seed,
+ ),
+ ));
+ }
+
+ for nulls in [0.0, 0.1, 0.2, 0.5] {
+ seed += 1;
+ cols.push(Arc::new(create_boolean_array_with_seed(
+ batch_size, nulls, 0.5, seed,
+ )));
+ }
+
+ for _ in 0..10 {
+ seed += 1;
+ cols.push(Arc::new(create_primitive_array_with_seed::<Int64Type>(
+ batch_size, 0.0, seed,
+ )) as ArrayRef);
+ }
+
+ for nulls in [0.0, 0.1, 0.2, 0.5] {
+ seed += 1;
+ cols.push(Arc::new(create_f64_array_with_seed(batch_size, nulls,
seed)) as ArrayRef);
+ }
+
+ assert_eq!(cols.len(), 53);
+ do_bench(c, format!("{batch_size} 53 columns").as_str(), cols);
+}
+
fn row_bench(c: &mut Criterion) {
let cols = vec![Arc::new(create_primitive_array::<UInt64Type>(4096, 0.))
as ArrayRef];
do_bench(c, "4096 u64(0)", cols);
@@ -279,6 +377,9 @@ fn row_bench(c: &mut Criterion) {
];
do_bench(c, "4096 large_list(0) sliced to 10 of u64(0)", cols);
+ run_benchmark_on_medium_amount_and_types_of_columns_without_nesting(4096,
c);
+ run_benchmark_on_medium_amount_and_types_of_columns_without_nesting(8192,
c);
+
bench_iter(c);
}
diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs
index 1f1dcff9b6..bcf7a559e9 100644
--- a/arrow/src/util/bench_util.rs
+++ b/arrow/src/util/bench_util.rs
@@ -117,6 +117,29 @@ where
.collect()
}
+/// Creates a random boolean array of a given size, null density and true density based on the
provided seed
+pub fn create_boolean_array_with_seed(
+ size: usize,
+ null_density: f32,
+ true_density: f32,
+ seed: u64,
+) -> BooleanArray
+where
+ StandardUniform: Distribution<bool>,
+{
+ let mut rng = StdRng::seed_from_u64(seed);
+ (0..size)
+ .map(|_| {
+ if rng.random::<f32>() < null_density {
+ None
+ } else {
+ let value = rng.random::<f32>() < true_density;
+ Some(value)
+ }
+ })
+ .collect()
+}
+
/// Creates a random (but fixed-seeded) string array of a given size and null
density.
///
/// Strings have a random length
@@ -734,3 +757,18 @@ pub fn create_f64_array(size: usize, nan_density: f32) ->
Float64Array {
})
.collect()
}
+
+/// Creates a random f64 array of a given size and nan-value density based on
a given seed
+pub fn create_f64_array_with_seed(size: usize, nan_density: f32, seed: u64) ->
Float64Array {
+ let mut rng = StdRng::seed_from_u64(seed);
+
+ (0..size)
+ .map(|_| {
+ if rng.random::<f32>() < nan_density {
+ Some(f64::NAN)
+ } else {
+ Some(rng.random())
+ }
+ })
+ .collect()
+}