Re: [PR] refactor `array_remove` benchmarks & add nested benches [datafusion]

via GitHub Fri, 24 Apr 2026 10:48:59 -0700


Jefffrey commented on code in PR #21834:
URL: https://github.com/apache/datafusion/pull/21834#discussion_r3139349585



##########
datafusion/functions-nested/benches/array_remove.rs:
##########
@@ -16,74 +16,61 @@
 // under the License.
 
 use arrow::array::{
-    Array, ArrayRef, BinaryArray, BooleanArray, Decimal128Array, 
FixedSizeBinaryArray,
-    Float64Array, Int64Array, ListArray, StringArray,
+    Array, ArrayBuilder, ArrayRef, BooleanBuilder, FixedSizeBinaryArray, 
Int64Builder,
+    ListArray, ListBuilder, StringBuilder,
 };
-use arrow::buffer::OffsetBuffer;
+use arrow::buffer::{NullBuffer, OffsetBuffer};
 use arrow::datatypes::{DataType, Field};
 use criterion::{
     criterion_group, criterion_main, {BenchmarkId, Criterion},
 };
 use datafusion_common::ScalarValue;
 use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
-use datafusion_functions_nested::remove::ArrayRemove;
+use datafusion_functions_nested::remove::{ArrayRemove, ArrayRemoveAll, 
ArrayRemoveN};
 use rand::Rng;
 use rand::SeedableRng;
 use rand::rngs::StdRng;
+use rand::seq::IndexedRandom;
 use std::hint::black_box;
 use std::sync::Arc;
 
 const NUM_ROWS: usize = 10000;
-const ARRAY_SIZES: &[usize] = &[10, 100, 500];
+const LIST_SIZES: &[usize] = &[10, 100, 500];
 const SEED: u64 = 42;
-const NULL_DENSITY: f64 = 0.1;
+const HAYSTACK_NULL_DENSITY: f64 = 0.1;
+const NEEDLE_DENSITY: f64 = 0.1;

Review Comment:
   `NULL_DENSITY` previously determined the nulls within the lists inside list 
arrays; the list array itself always had no null buffer so each element was 
valid. Changing the way we generate test data (see next comments)



##########
datafusion/functions-nested/src/remove.rs:
##########
@@ -163,11 +163,17 @@ make_udf_expr_and_func!(
     argument(name = "max", description = "Number of first occurrences to 
remove.")
 )]
 #[derive(Debug, PartialEq, Eq, Hash)]
-pub(super) struct ArrayRemoveN {
+pub struct ArrayRemoveN {

Review Comment:
   To make them accessible in benchmarks



##########
datafusion/functions-nested/benches/array_remove.rs:
##########
@@ -341,231 +350,145 @@ fn bench_array_remove_fixed_size_binary(c: &mut 
Criterion) {
     group.finish();
 }
 
-fn create_args(list_array: ArrayRef, element: ScalarValue) -> 
Vec<ColumnarValue> {
-    vec![
-        ColumnarValue::Array(list_array),
-        ColumnarValue::Scalar(element),
-    ]
+#[inline]
+fn create_args(haystack: ArrayRef, needle: ScalarValue) -> ScalarFunctionArgs {
+    let haystack_type = haystack.data_type().clone();
+    let needle_type = needle.data_type().clone();
+    ScalarFunctionArgs {
+        args: vec![
+            ColumnarValue::Array(haystack),
+            ColumnarValue::Scalar(needle),
+        ],
+        arg_fields: vec![
+            Field::new("haystack", haystack_type.clone(), true).into(),
+            Field::new("needle", needle_type, true).into(),
+        ],
+        number_rows: NUM_ROWS,
+        return_field: Field::new("result", haystack_type, true).into(),
+        config_options: Arc::new(ConfigOptions::default()),
+    }
 }
 
-fn create_int64_list_array(
-    num_rows: usize,
-    array_size: usize,
-    null_density: f64,
-) -> ArrayRef {
-    let mut rng = StdRng::seed_from_u64(SEED);
-    let values = (0..num_rows * array_size)
-        .map(|_| {
-            if rng.random::<f64>() < null_density {
-                None
-            } else {
-                Some(rng.random_range(0..array_size as i64))
-            }
-        })
-        .collect::<Int64Array>();
-    let offsets = (0..=num_rows)
-        .map(|i| (i * array_size) as i32)
-        .collect::<Vec<i32>>();
-
-    Arc::new(
-        ListArray::try_new(
-            Arc::new(Field::new("item", DataType::Int64, true)),
-            OffsetBuffer::new(offsets.into()),
-            Arc::new(values),
-            None,
-        )
-        .unwrap(),
-    )
+#[inline]
+fn create_args_n(
+    haystack: ArrayRef,
+    needle: ScalarValue,
+    n: ScalarValue,
+) -> ScalarFunctionArgs {
+    let haystack_type = haystack.data_type().clone();
+    let needle_type = needle.data_type().clone();
+    let n_type = n.data_type().clone();
+    ScalarFunctionArgs {
+        args: vec![
+            ColumnarValue::Array(haystack),
+            ColumnarValue::Scalar(needle),
+            ColumnarValue::Scalar(n),
+        ],
+        arg_fields: vec![
+            Field::new("haystack", haystack_type.clone(), true).into(),
+            Field::new("needle", needle_type, true).into(),
+            Field::new("n", n_type, true).into(),
+        ],
+        number_rows: NUM_ROWS,
+        return_field: Field::new("result", haystack_type, true).into(),
+        config_options: Arc::new(ConfigOptions::default()),
+    }
 }
 
-fn create_f64_list_array(
-    num_rows: usize,
-    array_size: usize,
-    null_density: f64,
-) -> ArrayRef {
+fn create_list_array<Builder, Item>(

Review Comment:
   This is a generic data generator for types which work with 
`ListArray::from_nested_iter`
   
   - Unfortunately `FixedSizeBinaryBuilder` doesn't satisfy all the required 
bounds so fixed size binary array needs to be a separate function (same with 
nested list)
   
   Previously we generated quite randomly:
   
   1. For each row in the haystack array:
   2. Generate the values of the fixed sized list element, where values may be 
null, and values in range `0..list_size`
   3. Collect these list elements into final haystack array which has no nulls
   
   Some problems with this approach is we don't control the presence of the 
element to remove (the needle).
   
   - For example in fixed size binary generation, it generated random 16 bytes 
for each value, and the needle was a fixed `[1_u8; 16]`; that is quite a search 
area for the needle
   
   This new approach uses `NEEDLE_DENSITY` to determine within each row of the 
haystack array how many of these to place, otherwise filling with some other 
filler elements (which aren't really important; we don't need a widespread of 
these values, just values that *aren't* the needle). We then also add nulls to 
the final haystack array.



##########
datafusion/functions-nested/benches/array_remove.rs:
##########
@@ -16,74 +16,61 @@
 // under the License.
 
 use arrow::array::{
-    Array, ArrayRef, BinaryArray, BooleanArray, Decimal128Array, 
FixedSizeBinaryArray,
-    Float64Array, Int64Array, ListArray, StringArray,
+    Array, ArrayBuilder, ArrayRef, BooleanBuilder, FixedSizeBinaryArray, 
Int64Builder,
+    ListArray, ListBuilder, StringBuilder,
 };
-use arrow::buffer::OffsetBuffer;
+use arrow::buffer::{NullBuffer, OffsetBuffer};
 use arrow::datatypes::{DataType, Field};
 use criterion::{
     criterion_group, criterion_main, {BenchmarkId, Criterion},
 };
 use datafusion_common::ScalarValue;
 use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
-use datafusion_functions_nested::remove::ArrayRemove;
+use datafusion_functions_nested::remove::{ArrayRemove, ArrayRemoveAll, 
ArrayRemoveN};
 use rand::Rng;
 use rand::SeedableRng;
 use rand::rngs::StdRng;
+use rand::seq::IndexedRandom;
 use std::hint::black_box;
 use std::sync::Arc;
 
 const NUM_ROWS: usize = 10000;
-const ARRAY_SIZES: &[usize] = &[10, 100, 500];
+const LIST_SIZES: &[usize] = &[10, 100, 500];
 const SEED: u64 = 42;
-const NULL_DENSITY: f64 = 0.1;
+const HAYSTACK_NULL_DENSITY: f64 = 0.1;
+const NEEDLE_DENSITY: f64 = 0.1;
 
 fn criterion_benchmark(c: &mut Criterion) {
-    // Test array_remove with different data types and array sizes
-    // TODO: Add performance tests for nested datatypes
     bench_array_remove_int64(c);
-    bench_array_remove_f64(c);
+    bench_array_remove_n_int64(c);
+    bench_array_remove_all_int64(c);
+
+    bench_array_remove_int64_nested(c);
+    bench_array_remove_n_int64_nested(c);
+    bench_array_remove_all_int64_nested(c);
+
     bench_array_remove_strings(c);
-    bench_array_remove_binary(c);
     bench_array_remove_boolean(c);
-    bench_array_remove_decimal64(c);

Review Comment:
   Decided to remove decimal & f64 as it seems redundant to have them as i64 
should be enough to test primitive types; same reason for removing binary since 
we can just use string benchmark



##########
datafusion/functions-nested/benches/array_remove.rs:
##########
@@ -92,39 +79,29 @@ fn bench_array_remove_int64(c: &mut Criterion) {
     group.finish();
 }
 
-fn bench_array_remove_f64(c: &mut Criterion) {
-    let mut group = c.benchmark_group("array_remove_f64");
+fn bench_array_remove_n_int64(c: &mut Criterion) {
+    let mut group = c.benchmark_group("array_remove_n_int64");
 
-    for &array_size in ARRAY_SIZES {
-        let list_array = create_f64_list_array(NUM_ROWS, array_size, 
NULL_DENSITY);
-        let element_to_remove = ScalarValue::Float64(Some(1.0));
-        let args = create_args(list_array.clone(), element_to_remove.clone());
+    let filler_values = [None, Some(1), Some(2), Some(3), Some(4), Some(5)];
+    let needle = 0;
+    for &list_size in LIST_SIZES {
+        let list_array =
+            create_list_array::<Int64Builder, _>(list_size, needle, 
&filler_values);
+        let n = (NEEDLE_DENSITY / 2.0 * list_size as f64) as i64;
+        let n = 2.max(n);

Review Comment:
   For `array_remove_n`, roughly try to remove half of the average amount of 
needles in each list in the haystack (capping it at 2 minimum to differ from 
`array_remove`)



##########
datafusion/functions-nested/benches/array_remove.rs:
##########
@@ -16,74 +16,61 @@
 // under the License.
 
 use arrow::array::{
-    Array, ArrayRef, BinaryArray, BooleanArray, Decimal128Array, 
FixedSizeBinaryArray,
-    Float64Array, Int64Array, ListArray, StringArray,
+    Array, ArrayBuilder, ArrayRef, BooleanBuilder, FixedSizeBinaryArray, 
Int64Builder,
+    ListArray, ListBuilder, StringBuilder,
 };
-use arrow::buffer::OffsetBuffer;
+use arrow::buffer::{NullBuffer, OffsetBuffer};
 use arrow::datatypes::{DataType, Field};
 use criterion::{
     criterion_group, criterion_main, {BenchmarkId, Criterion},
 };
 use datafusion_common::ScalarValue;
 use datafusion_common::config::ConfigOptions;
 use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};
-use datafusion_functions_nested::remove::ArrayRemove;
+use datafusion_functions_nested::remove::{ArrayRemove, ArrayRemoveAll, 
ArrayRemoveN};
 use rand::Rng;
 use rand::SeedableRng;
 use rand::rngs::StdRng;
+use rand::seq::IndexedRandom;
 use std::hint::black_box;
 use std::sync::Arc;
 
 const NUM_ROWS: usize = 10000;
-const ARRAY_SIZES: &[usize] = &[10, 100, 500];
+const LIST_SIZES: &[usize] = &[10, 100, 500];
 const SEED: u64 = 42;
-const NULL_DENSITY: f64 = 0.1;
+const HAYSTACK_NULL_DENSITY: f64 = 0.1;
+const NEEDLE_DENSITY: f64 = 0.1;
 
 fn criterion_benchmark(c: &mut Criterion) {
-    // Test array_remove with different data types and array sizes
-    // TODO: Add performance tests for nested datatypes
     bench_array_remove_int64(c);
-    bench_array_remove_f64(c);
+    bench_array_remove_n_int64(c);
+    bench_array_remove_all_int64(c);
+
+    bench_array_remove_int64_nested(c);
+    bench_array_remove_n_int64_nested(c);
+    bench_array_remove_all_int64_nested(c);

Review Comment:
   We didn't benchmark `array_remove_n` and `array_remove_all` before so adding 
them here; same for nested types



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] refactor `array_remove` benchmarks & add nested benches [datafusion]

Reply via email to