This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 5a1e48260b bench: added to row_format benchmark conversion of 53 
non-nested columns (#9081)
5a1e48260b is described below

commit 5a1e48260b0727ed52d9c070382fa21d69560672
Author: Raz Luvaton <[email protected]>
AuthorDate: Sat Jan 10 14:33:00 2026 +0200

    bench: added to row_format benchmark conversion of 53 non-nested columns 
(#9081)
    
    # Which issue does this PR close?
    
    N/A
    
    # Rationale for this change
    
    I noticed that when converting around 50 columns the conversion becomes
    very slow, so I am adding a benchmark while I optimize those parts
    
    # What changes are included in this PR?
    
    Added a new benchmark for `row_format` that converts 53 non-nested column arrays
    
    # Are these changes tested?
    
    N/A
    
    # Are there any user-facing changes?
    
    Nope
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 arrow/benches/row_format.rs  | 107 +++++++++++++++++++++++++++++++++++++++++--
 arrow/src/util/bench_util.rs |  38 +++++++++++++++
 2 files changed, 142 insertions(+), 3 deletions(-)

diff --git a/arrow/benches/row_format.rs b/arrow/benches/row_format.rs
index d67095ac2c..1c120bb2f2 100644
--- a/arrow/benches/row_format.rs
+++ b/arrow/benches/row_format.rs
@@ -23,13 +23,15 @@ use arrow::array::ArrayRef;
 use arrow::datatypes::{Int64Type, UInt64Type};
 use arrow::row::{RowConverter, SortField};
 use arrow::util::bench_util::{
-    create_boolean_array, create_dict_from_values, create_primitive_array,
-    create_string_array_with_len, create_string_dict_array, 
create_string_view_array_with_len,
+    create_boolean_array, create_boolean_array_with_seed, 
create_dict_from_values,
+    create_f64_array_with_seed, create_primitive_array, 
create_primitive_array_with_seed,
+    create_string_array_with_len, 
create_string_array_with_len_range_and_prefix_and_seed,
+    create_string_dict_array, create_string_view_array_with_len,
     create_string_view_array_with_max_len,
 };
 use arrow::util::data_gen::create_random_array;
 use arrow_array::Array;
-use arrow_array::types::Int32Type;
+use arrow_array::types::{Int8Type, Int32Type};
 use arrow_schema::{DataType, Field};
 use criterion::Criterion;
 use std::{hint, sync::Arc};
@@ -85,6 +87,102 @@ fn bench_iter(c: &mut Criterion) {
     });
 }
 
+/// A single benchmark with a medium number of columns (around 50) without 
nested columns for real-world use cases
+/// This also makes sure there is a large gap between each value in the column 
and how it is laid out in the row format,
+/// and that it is on the edge of not fitting in the L3 cache on some machines.
+fn run_benchmark_on_medium_amount_and_types_of_columns_without_nesting(
+    batch_size: usize,
+    c: &mut Criterion,
+) {
+    let mut seed = 0;
+
+    let mut cols: Vec<ArrayRef> = vec![];
+
+    for nulls in [0.0, 0.1, 0.2, 0.5] {
+        seed += 1;
+        cols.push(Arc::new(create_primitive_array_with_seed::<Int8Type>(
+            batch_size, nulls, seed,
+        )) as ArrayRef);
+    }
+
+    for nulls in [0.0, 0.1, 0.2, 0.5] {
+        seed += 1;
+        cols.push(Arc::new(create_primitive_array_with_seed::<Int32Type>(
+            batch_size, nulls, seed,
+        )) as ArrayRef);
+    }
+
+    for nulls in [0.0, 0.1, 0.2, 0.5] {
+        seed += 1;
+        cols.push(Arc::new(create_primitive_array_with_seed::<Int64Type>(
+            batch_size, nulls, seed,
+        )) as ArrayRef);
+    }
+
+    for _ in 0..10 {
+        seed += 1;
+        cols.push(Arc::new(create_primitive_array_with_seed::<Int64Type>(
+            batch_size, 0.0, seed,
+        )) as ArrayRef);
+    }
+
+    for nulls in [0.0, 0.1, 0.2, 0.5] {
+        seed += 1;
+        cols.push(Arc::new(
+            create_string_array_with_len_range_and_prefix_and_seed::<i32>(
+                batch_size, nulls, 0, 50, "", seed,
+            ),
+        ));
+    }
+
+    for _ in 0..3 {
+        seed += 1;
+        cols.push(Arc::new(
+            create_string_array_with_len_range_and_prefix_and_seed::<i32>(
+                batch_size, 0.0, 0, 10, "", seed,
+            ),
+        ));
+    }
+    for _ in 0..3 {
+        seed += 1;
+        cols.push(Arc::new(
+            create_string_array_with_len_range_and_prefix_and_seed::<i32>(
+                batch_size, 0.0, 10, 20, "", seed,
+            ),
+        ));
+    }
+    for _ in 0..3 {
+        seed += 1;
+        cols.push(Arc::new(
+            create_string_array_with_len_range_and_prefix_and_seed::<i32>(
+                batch_size, 0.0, 20, 30, "", seed,
+            ),
+        ));
+    }
+
+    for nulls in [0.0, 0.1, 0.2, 0.5] {
+        seed += 1;
+        cols.push(Arc::new(create_boolean_array_with_seed(
+            batch_size, nulls, 0.5, seed,
+        )));
+    }
+
+    for _ in 0..10 {
+        seed += 1;
+        cols.push(Arc::new(create_primitive_array_with_seed::<Int64Type>(
+            batch_size, 0.0, seed,
+        )) as ArrayRef);
+    }
+
+    for nulls in [0.0, 0.1, 0.2, 0.5] {
+        seed += 1;
+        cols.push(Arc::new(create_f64_array_with_seed(batch_size, nulls, 
seed)) as ArrayRef);
+    }
+
+    assert_eq!(cols.len(), 53);
+    do_bench(c, format!("{batch_size} 53 columns").as_str(), cols);
+}
+
 fn row_bench(c: &mut Criterion) {
     let cols = vec![Arc::new(create_primitive_array::<UInt64Type>(4096, 0.)) 
as ArrayRef];
     do_bench(c, "4096 u64(0)", cols);
@@ -279,6 +377,9 @@ fn row_bench(c: &mut Criterion) {
     ];
     do_bench(c, "4096 large_list(0) sliced to 10 of u64(0)", cols);
 
+    run_benchmark_on_medium_amount_and_types_of_columns_without_nesting(4096, 
c);
+    run_benchmark_on_medium_amount_and_types_of_columns_without_nesting(8192, 
c);
+
     bench_iter(c);
 }
 
diff --git a/arrow/src/util/bench_util.rs b/arrow/src/util/bench_util.rs
index 1f1dcff9b6..bcf7a559e9 100644
--- a/arrow/src/util/bench_util.rs
+++ b/arrow/src/util/bench_util.rs
@@ -117,6 +117,29 @@ where
         .collect()
 }
 
+/// Creates a random boolean array of a given size, null density and true 
density based on the provided seed
+pub fn create_boolean_array_with_seed(
+    size: usize,
+    null_density: f32,
+    true_density: f32,
+    seed: u64,
+) -> BooleanArray
+where
+    StandardUniform: Distribution<bool>,
+{
+    let mut rng = StdRng::seed_from_u64(seed);
+    (0..size)
+        .map(|_| {
+            if rng.random::<f32>() < null_density {
+                None
+            } else {
+                let value = rng.random::<f32>() < true_density;
+                Some(value)
+            }
+        })
+        .collect()
+}
+
 /// Creates a random (but fixed-seeded) string array of a given size and null 
density.
 ///
 /// Strings have a random length
@@ -734,3 +757,18 @@ pub fn create_f64_array(size: usize, nan_density: f32) -> 
Float64Array {
         })
         .collect()
 }
+
+/// Creates a random f64 array of a given size and nan-value density based on 
a given seed
+pub fn create_f64_array_with_seed(size: usize, nan_density: f32, seed: u64) -> 
Float64Array {
+    let mut rng = StdRng::seed_from_u64(seed);
+
+    (0..size)
+        .map(|_| {
+            if rng.random::<f32>() < nan_density {
+                Some(f64::NAN)
+            } else {
+                Some(rng.random())
+            }
+        })
+        .collect()
+}

Reply via email to