This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 1ba902ef0f Fix `nullif` kernel (#9087)
1ba902ef0f is described below

commit 1ba902ef0fb88d06dbb1e90c59d5641bd24d15bd
Author: Andrew Lamb <[email protected]>
AuthorDate: Mon Jan 5 07:02:11 2026 -0500

    Fix `nullif` kernel (#9087)
    
    # Which issue does this PR close?
    
    - Closes https://github.com/apache/arrow-rs/issues/9085
    
    # Rationale for this change
    
    Fix a regression introduced in
    https://github.com/apache/arrow-rs/pull/8996
    
    # What changes are included in this PR?
    
    1. Add test coverage for nullif kernel
    1. Undeprecate `bitwise_unary_op_helper`
    2. Document subtle differences
    3. Restore nullif kernel from
    https://github.com/apache/arrow-rs/pull/8996
    
    # Are these changes tested
    Yes
    
    # Are there any user-facing changes?
    
    Fix (not yet released) bug
---
 arrow-buffer/src/buffer/boolean.rs |   1 +
 arrow-buffer/src/buffer/ops.rs     |  49 +++++++++++++---
 arrow-select/src/nullif.rs         | 114 ++++++++++++++++++++++++++++++-------
 3 files changed, 136 insertions(+), 28 deletions(-)

diff --git a/arrow-buffer/src/buffer/boolean.rs 
b/arrow-buffer/src/buffer/boolean.rs
index 548401ed42..ecd7de38c0 100644
--- a/arrow-buffer/src/buffer/boolean.rs
+++ b/arrow-buffer/src/buffer/boolean.rs
@@ -165,6 +165,7 @@ impl BooleanBuffer {
     /// * `op` must only apply bitwise operations
     ///   on the relevant bits; the input `u64` may contain irrelevant bits
     ///   and may be processed differently on different endian architectures.
+    /// * `op` may be called with input bits outside the requested range
     /// * The output always has zero offset
     ///
     /// # See Also
diff --git a/arrow-buffer/src/buffer/ops.rs b/arrow-buffer/src/buffer/ops.rs
index 05593504b1..cb0925bb2c 100644
--- a/arrow-buffer/src/buffer/ops.rs
+++ b/arrow-buffer/src/buffer/ops.rs
@@ -20,7 +20,12 @@ use crate::BooleanBuffer;
 use crate::util::bit_util::ceil;
 
 /// Apply a bitwise operation `op` to four inputs and return the result as a 
Buffer.
-/// The inputs are treated as bitmaps, meaning that offsets and length are 
specified in number of bits.
+///
+/// The inputs are treated as bitmaps, meaning that offsets and length are
+/// specified in number of bits.
+///
+/// NOTE: The operation `op` is applied to chunks of 64 bits (u64) and any bits
+/// outside the offsets and len are set to zero out before calling `op`.
 pub fn bitwise_quaternary_op_helper<F>(
     buffers: [&Buffer; 4],
     offsets: [usize; 4],
@@ -60,7 +65,12 @@ where
 }
 
 /// Apply a bitwise operation `op` to two inputs and return the result as a 
Buffer.
-/// The inputs are treated as bitmaps, meaning that offsets and length are 
specified in number of bits.
+///
+/// The inputs are treated as bitmaps, meaning that offsets and length are
+/// specified in number of bits.
+///
+/// NOTE: The operation `op` is applied to chunks of 64 bits (u64) and any bits
+/// outside the offsets and len are set to zero out before calling `op`.
 pub fn bitwise_bin_op_helper<F>(
     left: &Buffer,
     left_offset_in_bits: usize,
@@ -93,21 +103,42 @@ where
 }
 
 /// Apply a bitwise operation `op` to one input and return the result as a 
Buffer.
-/// The input is treated as a bitmap, meaning that offset and length are 
specified in number of bits.
-#[deprecated(
-    since = "57.2.0",
-    note = "use BooleanBuffer::from_bitwise_unary_op instead"
-)]
+///
+/// The input is treated as a bitmap, meaning that offset and length are
+/// specified in number of bits.
+///
+/// NOTE: The operation `op` is applied to chunks of 64 bits (u64) and any bits
+/// outside the offsets and len are set to zero out before calling `op`.
 pub fn bitwise_unary_op_helper<F>(
     left: &Buffer,
     offset_in_bits: usize,
     len_in_bits: usize,
-    op: F,
+    mut op: F,
 ) -> Buffer
 where
     F: FnMut(u64) -> u64,
 {
-    BooleanBuffer::from_bitwise_unary_op(left, offset_in_bits, len_in_bits, 
op).into_inner()
+    // reserve capacity and set length so we can get a typed view of u64 chunks
+    let mut result =
+        MutableBuffer::new(ceil(len_in_bits, 8)).with_bitset(len_in_bits / 64 
* 8, false);
+
+    let left_chunks = left.bit_chunks(offset_in_bits, len_in_bits);
+
+    let result_chunks = result.typed_data_mut::<u64>().iter_mut();
+
+    result_chunks
+        .zip(left_chunks.iter())
+        .for_each(|(res, left)| {
+            *res = op(left);
+        });
+
+    let remainder_bytes = ceil(left_chunks.remainder_len(), 8);
+    let rem = op(left_chunks.remainder_bits());
+    // we are counting its starting from the least significant bit, to 
to_le_bytes should be correct
+    let rem = &rem.to_le_bytes()[0..remainder_bytes];
+    result.extend_from_slice(rem);
+
+    result.into()
 }
 
 /// Apply a bitwise and to two inputs and return the result as a Buffer.
diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs
index 211cabf7af..e51016f9ba 100644
--- a/arrow-select/src/nullif.rs
+++ b/arrow-select/src/nullif.rs
@@ -19,7 +19,7 @@
 
 use arrow_array::{Array, ArrayRef, BooleanArray, make_array};
 use arrow_buffer::buffer::bitwise_bin_op_helper;
-use arrow_buffer::{BooleanBuffer, NullBuffer};
+use arrow_buffer::{BooleanBuffer, NullBuffer, bitwise_unary_op_helper};
 use arrow_schema::{ArrowError, DataType};
 
 /// Returns a new array with the same values and the validity bit to false 
where
@@ -91,13 +91,11 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) -> 
Result<ArrayRef, ArrowE
         }
         None => {
             let mut null_count = 0;
-            let buffer =
-                BooleanBuffer::from_bitwise_unary_op(right.inner(), 
right.offset(), len, |b| {
-                    let t = !b;
-                    null_count += t.count_zeros() as usize;
-                    t
-                })
-                .into_inner();
+            let buffer = bitwise_unary_op_helper(right.inner(), 
right.offset(), len, |b| {
+                let t = !b;
+                null_count += t.count_zeros() as usize;
+                t
+            });
             (buffer, null_count)
         }
     };
@@ -122,7 +120,8 @@ mod tests {
     use arrow_array::{Int32Array, NullArray, StringArray, StructArray};
     use arrow_data::ArrayData;
     use arrow_schema::{Field, Fields};
-    use rand::{Rng, rng};
+    use rand::prelude::StdRng;
+    use rand::{Rng, SeedableRng};
 
     #[test]
     fn test_nullif_int_array() {
@@ -494,23 +493,60 @@ mod tests {
         let r_data = r.to_data();
         r_data.validate().unwrap();
 
-        assert_eq!(r.as_ref(), &expected);
+        assert_eq!(
+            r.as_ref(),
+            &expected,
+            "expected nulls: {:#?}\n\n\
+        result nulls:   {:#?}\n\n\\
+        expected values: {:#?}\n\n\
+        result values:   {:#?}",
+            expected.nulls(),
+            r.nulls(),
+            expected.values(),
+            r.as_primitive::<Int32Type>().values()
+        );
+        validate_nulls(expected.nulls());
+        validate_nulls(r.nulls());
+    }
+
+    /// Ensures that the null count matches the actual number of nulls.
+    fn validate_nulls(nulls: Option<&NullBuffer>) {
+        let Some(nulls) = nulls else {
+            return;
+        };
+        let mut actual_null_count = 0;
+        for i in 0..nulls.len() {
+            if nulls.is_null(i) {
+                actual_null_count += 1;
+            }
+        }
+        assert_eq!(actual_null_count, nulls.null_count());
     }
 
     #[test]
     fn nullif_fuzz() {
-        let mut rng = rng();
+        let mut rng = StdRng::seed_from_u64(7337);
 
         let arrays = [
-            Int32Array::from(vec![0; 128]),
-            (0..128)
-                .map(|_| rng.random_bool(0.5).then_some(0))
+            Int32Array::from(vec![0; 1024]), // no nulls
+            (0..1024) // 50% nulls
+                .map(|_| rng.random_bool(0.5).then_some(1))
                 .collect(),
         ];
 
         for a in arrays {
-            let a_slices = [(0, 128), (64, 64), (0, 64), (32, 32), (0, 0), 
(32, 0)];
-
+            let a_slices = [
+                (0, 128),
+                (0, 129),
+                (64, 64),
+                (0, 64),
+                (32, 32),
+                (0, 0),
+                (32, 0),
+                (5, 800),
+                (33, 53),
+                (77, 101),
+            ];
             for (a_offset, a_length) in a_slices {
                 let a = a.slice(a_offset, a_length);
 
@@ -518,14 +554,54 @@ mod tests {
                     let b_start_offset = rng.random_range(0..i);
                     let b_end_offset = rng.random_range(0..i);
 
+                    // b with 50% nulls
                     let b: BooleanArray = (0..a_length + b_start_offset + 
b_end_offset)
                         .map(|_| rng.random_bool(0.5).then(|| 
rng.random_bool(0.5)))
                         .collect();
-                    let b = b.slice(b_start_offset, a_length);
-
-                    test_nullif(&a, &b);
+                    let b_sliced = b.slice(b_start_offset, a_length);
+                    test_nullif(&a, &b_sliced);
+
+                    // b with no nulls (and no null buffer)
+                    let b = remove_null_buffer(&b);
+                    let b_sliced = b.slice(b_start_offset, a_length);
+                    test_nullif(&a, &b_sliced);
+
+                    // b with no nulls (but with a null buffer)
+                    let b = remove_null_values(&b);
+                    let b_sliced = b.slice(b_start_offset, a_length);
+                    test_nullif(&a, &b_sliced);
                 }
             }
         }
     }
+
+    /// Returns a new BooleanArray with no null buffer
+    fn remove_null_buffer(array: &BooleanArray) -> BooleanArray {
+        make_array(
+            array
+                .into_data()
+                .into_builder()
+                .nulls(None)
+                .build()
+                .unwrap(),
+        )
+        .as_boolean()
+        .clone()
+    }
+
+    /// Returns a new BooleanArray with a null buffer where all values are 
valid
+    fn remove_null_values(array: &BooleanArray) -> BooleanArray {
+        let len = array.len();
+        let new_nulls = NullBuffer::from_iter(std::iter::repeat_n(true, len));
+        make_array(
+            array
+                .into_data()
+                .into_builder()
+                .nulls(Some(new_nulls))
+                .build()
+                .unwrap(),
+        )
+        .as_boolean()
+        .clone()
+    }
 }

Reply via email to