This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 4166a6d60f perf: Optimize comparison on nested types (#20716)
4166a6d60f is described below

commit 4166a6d60f531b47826c2ed554e08a6f5afd7ca1
Author: Neil Conway <[email protected]>
AuthorDate: Mon Mar 16 13:43:48 2026 -0700

    perf: Optimize comparison on nested types (#20716)
    
    ## Which issue does this PR close?
    
    N/A
    
    ## Rationale for this change
    
    `BooleanBuffer::collect_bool()` is faster than `map().collect()`. Per
    discussion on #20694; originally suggested by @Dandandan.
    
    ## What changes are included in this PR?
    
    - Implement optimization
    - Add benchmark for nested struct comparison
    
    ## Are these changes tested?
    
    Yes, covered by existing tests.
    
    ## Are there any user-facing changes?
    
    No.
    
    ## AI usage
    
    Multiple AI tools were used to iterate on this PR. I have reviewed and
    understand the resulting code.
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 Cargo.lock                                         |  2 +
 datafusion/physical-expr-common/Cargo.toml         |  8 +++
 .../physical-expr-common/benches/compare_nested.rs | 74 ++++++++++++++++++++++
 datafusion/physical-expr-common/src/datum.rs       |  8 +--
 4 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 35660359ce..632c82be5a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2425,12 +2425,14 @@ dependencies = [
  "ahash",
  "arrow",
  "chrono",
+ "criterion",
  "datafusion-common",
  "datafusion-expr-common",
  "hashbrown 0.16.1",
  "indexmap 2.13.0",
  "itertools 0.14.0",
  "parking_lot",
+ "rand 0.9.2",
 ]
 
 [[package]]
diff --git a/datafusion/physical-expr-common/Cargo.toml b/datafusion/physical-expr-common/Cargo.toml
index a81eafe196..453c8a0cb4 100644
--- a/datafusion/physical-expr-common/Cargo.toml
+++ b/datafusion/physical-expr-common/Cargo.toml
@@ -50,3 +50,11 @@ hashbrown = { workspace = true }
 indexmap = { workspace = true }
 itertools = { workspace = true }
 parking_lot = { workspace = true }
+
+[dev-dependencies]
+criterion = { workspace = true }
+rand = { workspace = true }
+
+[[bench]]
+harness = false
+name = "compare_nested"
diff --git a/datafusion/physical-expr-common/benches/compare_nested.rs b/datafusion/physical-expr-common/benches/compare_nested.rs
new file mode 100644
index 0000000000..56c122fef9
--- /dev/null
+++ b/datafusion/physical-expr-common/benches/compare_nested.rs
@@ -0,0 +1,74 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, Int32Array, Scalar, StringArray, StructArray};
+use arrow::datatypes::{DataType, Field, Fields};
+use criterion::{Criterion, criterion_group, criterion_main};
+use datafusion_expr_common::operator::Operator;
+use datafusion_physical_expr_common::datum::compare_op_for_nested;
+use rand::rngs::StdRng;
+use rand::{Rng, SeedableRng};
+use std::hint::black_box;
+use std::sync::Arc;
+
+/// Build a StructArray with fields {x: Int32, y: Utf8}.
+fn make_struct_array(num_rows: usize, rng: &mut StdRng) -> ArrayRef {
+    let ints: Int32Array = (0..num_rows).map(|_| Some(rng.random::<i32>())).collect();
+
+    let strings: StringArray = (0..num_rows)
+        .map(|_| {
+            let s: String = (0..12)
+                .map(|_| rng.random_range(b'a'..=b'z') as char)
+                .collect();
+            Some(s)
+        })
+        .collect();
+
+    let fields = Fields::from(vec![
+        Field::new("x", DataType::Int32, false),
+        Field::new("y", DataType::Utf8, false),
+    ]);
+
+    Arc::new(
+        StructArray::try_new(fields, vec![Arc::new(ints), Arc::new(strings)], None)
+            .unwrap(),
+    )
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let num_rows = 8192;
+    let mut rng = StdRng::seed_from_u64(42);
+
+    let lhs = make_struct_array(num_rows, &mut rng);
+    let rhs_array = make_struct_array(num_rows, &mut rng);
+    let rhs_scalar = Scalar::new(make_struct_array(1, &mut rng));
+
+    c.bench_function("compare_nested array_array", |b| {
+        b.iter(|| {
+            black_box(compare_op_for_nested(Operator::Eq, &lhs, &rhs_array).unwrap())
+        })
+    });
+
+    c.bench_function("compare_nested array_scalar", |b| {
+        b.iter(|| {
+            black_box(compare_op_for_nested(Operator::Eq, &lhs, &rhs_scalar).unwrap())
+        })
+    });
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/physical-expr-common/src/datum.rs b/datafusion/physical-expr-common/src/datum.rs
index 9efaca0f6b..bd5790507f 100644
--- a/datafusion/physical-expr-common/src/datum.rs
+++ b/datafusion/physical-expr-common/src/datum.rs
@@ -17,7 +17,7 @@
 
 use arrow::array::BooleanArray;
 use arrow::array::{ArrayRef, Datum, make_comparator};
-use arrow::buffer::NullBuffer;
+use arrow::buffer::{BooleanBuffer, NullBuffer};
 use arrow::compute::kernels::cmp::{
     distinct, eq, gt, gt_eq, lt, lt_eq, neq, not_distinct,
 };
@@ -171,9 +171,9 @@ pub fn compare_op_for_nested(
     };
 
     let values = match (is_l_scalar, is_r_scalar) {
-        (false, false) => (0..len).map(|i| cmp_with_op(i, i)).collect(),
-        (true, false) => (0..len).map(|i| cmp_with_op(0, i)).collect(),
-        (false, true) => (0..len).map(|i| cmp_with_op(i, 0)).collect(),
+        (false, false) => BooleanBuffer::collect_bool(len, |i| cmp_with_op(i, i)),
+        (true, false) => BooleanBuffer::collect_bool(len, |i| cmp_with_op(0, i)),
+        (false, true) => BooleanBuffer::collect_bool(len, |i| cmp_with_op(i, 0)),
         (true, true) => std::iter::once(cmp_with_op(0, 0)).collect(),
     };
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to