This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 1ba902ef0f Fix `nullif` kernel (#9087)
1ba902ef0f is described below
commit 1ba902ef0fb88d06dbb1e90c59d5641bd24d15bd
Author: Andrew Lamb <[email protected]>
AuthorDate: Mon Jan 5 07:02:11 2026 -0500
Fix `nullif` kernel (#9087)
# Which issue does this PR close?
- Closes https://github.com/apache/arrow-rs/issues/9085
# Rationale for this change
Fix a regression introduced in
https://github.com/apache/arrow-rs/pull/8996
# What changes are included in this PR?
1. Add test coverage for nullif kernel
1. Undeprecate `bitwise_unary_op_helper`
2. Document subtle differences
3. Restore nullif kernel from
https://github.com/apache/arrow-rs/pull/8996
# Are these changes tested
Yes
# Are there any user-facing changes?
Fix (not yet released) bug
---
arrow-buffer/src/buffer/boolean.rs | 1 +
arrow-buffer/src/buffer/ops.rs | 49 +++++++++++++---
arrow-select/src/nullif.rs | 114 ++++++++++++++++++++++++++++++-------
3 files changed, 136 insertions(+), 28 deletions(-)
diff --git a/arrow-buffer/src/buffer/boolean.rs
b/arrow-buffer/src/buffer/boolean.rs
index 548401ed42..ecd7de38c0 100644
--- a/arrow-buffer/src/buffer/boolean.rs
+++ b/arrow-buffer/src/buffer/boolean.rs
@@ -165,6 +165,7 @@ impl BooleanBuffer {
/// * `op` must only apply bitwise operations
/// on the relevant bits; the input `u64` may contain irrelevant bits
/// and may be processed differently on different endian architectures.
+ /// * `op` may be called with input bits outside the requested range
/// * The output always has zero offset
///
/// # See Also
diff --git a/arrow-buffer/src/buffer/ops.rs b/arrow-buffer/src/buffer/ops.rs
index 05593504b1..cb0925bb2c 100644
--- a/arrow-buffer/src/buffer/ops.rs
+++ b/arrow-buffer/src/buffer/ops.rs
@@ -20,7 +20,12 @@ use crate::BooleanBuffer;
use crate::util::bit_util::ceil;
/// Apply a bitwise operation `op` to four inputs and return the result as a
Buffer.
-/// The inputs are treated as bitmaps, meaning that offsets and length are
specified in number of bits.
+///
+/// The inputs are treated as bitmaps, meaning that offsets and length are
+/// specified in number of bits.
+///
+/// NOTE: The operation `op` is applied to chunks of 64 bits (u64) and any bits
+/// outside the offsets and len are set to zero out before calling `op`.
pub fn bitwise_quaternary_op_helper<F>(
buffers: [&Buffer; 4],
offsets: [usize; 4],
@@ -60,7 +65,12 @@ where
}
/// Apply a bitwise operation `op` to two inputs and return the result as a
Buffer.
-/// The inputs are treated as bitmaps, meaning that offsets and length are
specified in number of bits.
+///
+/// The inputs are treated as bitmaps, meaning that offsets and length are
+/// specified in number of bits.
+///
+/// NOTE: The operation `op` is applied to chunks of 64 bits (u64) and any bits
+/// outside the offsets and len are set to zero out before calling `op`.
pub fn bitwise_bin_op_helper<F>(
left: &Buffer,
left_offset_in_bits: usize,
@@ -93,21 +103,42 @@ where
}
/// Apply a bitwise operation `op` to one input and return the result as a
Buffer.
-/// The input is treated as a bitmap, meaning that offset and length are
specified in number of bits.
-#[deprecated(
- since = "57.2.0",
- note = "use BooleanBuffer::from_bitwise_unary_op instead"
-)]
+///
+/// The input is treated as a bitmap, meaning that offset and length are
+/// specified in number of bits.
+///
+/// NOTE: The operation `op` is applied to chunks of 64 bits (u64) and any bits
+/// outside the offsets and len are set to zero out before calling `op`.
pub fn bitwise_unary_op_helper<F>(
left: &Buffer,
offset_in_bits: usize,
len_in_bits: usize,
- op: F,
+ mut op: F,
) -> Buffer
where
F: FnMut(u64) -> u64,
{
- BooleanBuffer::from_bitwise_unary_op(left, offset_in_bits, len_in_bits,
op).into_inner()
+ // reserve capacity and set length so we can get a typed view of u64 chunks
+ let mut result =
+ MutableBuffer::new(ceil(len_in_bits, 8)).with_bitset(len_in_bits / 64
* 8, false);
+
+ let left_chunks = left.bit_chunks(offset_in_bits, len_in_bits);
+
+ let result_chunks = result.typed_data_mut::<u64>().iter_mut();
+
+ result_chunks
+ .zip(left_chunks.iter())
+ .for_each(|(res, left)| {
+ *res = op(left);
+ });
+
+ let remainder_bytes = ceil(left_chunks.remainder_len(), 8);
+ let rem = op(left_chunks.remainder_bits());
+ // we are counting its starting from the least significant bit, to
to_le_bytes should be correct
+ let rem = &rem.to_le_bytes()[0..remainder_bytes];
+ result.extend_from_slice(rem);
+
+ result.into()
}
/// Apply a bitwise and to two inputs and return the result as a Buffer.
diff --git a/arrow-select/src/nullif.rs b/arrow-select/src/nullif.rs
index 211cabf7af..e51016f9ba 100644
--- a/arrow-select/src/nullif.rs
+++ b/arrow-select/src/nullif.rs
@@ -19,7 +19,7 @@
use arrow_array::{Array, ArrayRef, BooleanArray, make_array};
use arrow_buffer::buffer::bitwise_bin_op_helper;
-use arrow_buffer::{BooleanBuffer, NullBuffer};
+use arrow_buffer::{BooleanBuffer, NullBuffer, bitwise_unary_op_helper};
use arrow_schema::{ArrowError, DataType};
/// Returns a new array with the same values and the validity bit to false
where
@@ -91,13 +91,11 @@ pub fn nullif(left: &dyn Array, right: &BooleanArray) ->
Result<ArrayRef, ArrowE
}
None => {
let mut null_count = 0;
- let buffer =
- BooleanBuffer::from_bitwise_unary_op(right.inner(),
right.offset(), len, |b| {
- let t = !b;
- null_count += t.count_zeros() as usize;
- t
- })
- .into_inner();
+ let buffer = bitwise_unary_op_helper(right.inner(),
right.offset(), len, |b| {
+ let t = !b;
+ null_count += t.count_zeros() as usize;
+ t
+ });
(buffer, null_count)
}
};
@@ -122,7 +120,8 @@ mod tests {
use arrow_array::{Int32Array, NullArray, StringArray, StructArray};
use arrow_data::ArrayData;
use arrow_schema::{Field, Fields};
- use rand::{Rng, rng};
+ use rand::prelude::StdRng;
+ use rand::{Rng, SeedableRng};
#[test]
fn test_nullif_int_array() {
@@ -494,23 +493,60 @@ mod tests {
let r_data = r.to_data();
r_data.validate().unwrap();
- assert_eq!(r.as_ref(), &expected);
+ assert_eq!(
+ r.as_ref(),
+ &expected,
+ "expected nulls: {:#?}\n\n\
+ result nulls: {:#?}\n\n\\
+ expected values: {:#?}\n\n\
+ result values: {:#?}",
+ expected.nulls(),
+ r.nulls(),
+ expected.values(),
+ r.as_primitive::<Int32Type>().values()
+ );
+ validate_nulls(expected.nulls());
+ validate_nulls(r.nulls());
+ }
+
+ /// Ensures that the null count matches the actual number of nulls.
+ fn validate_nulls(nulls: Option<&NullBuffer>) {
+ let Some(nulls) = nulls else {
+ return;
+ };
+ let mut actual_null_count = 0;
+ for i in 0..nulls.len() {
+ if nulls.is_null(i) {
+ actual_null_count += 1;
+ }
+ }
+ assert_eq!(actual_null_count, nulls.null_count());
}
#[test]
fn nullif_fuzz() {
- let mut rng = rng();
+ let mut rng = StdRng::seed_from_u64(7337);
let arrays = [
- Int32Array::from(vec![0; 128]),
- (0..128)
- .map(|_| rng.random_bool(0.5).then_some(0))
+ Int32Array::from(vec![0; 1024]), // no nulls
+ (0..1024) // 50% nulls
+ .map(|_| rng.random_bool(0.5).then_some(1))
.collect(),
];
for a in arrays {
- let a_slices = [(0, 128), (64, 64), (0, 64), (32, 32), (0, 0),
(32, 0)];
-
+ let a_slices = [
+ (0, 128),
+ (0, 129),
+ (64, 64),
+ (0, 64),
+ (32, 32),
+ (0, 0),
+ (32, 0),
+ (5, 800),
+ (33, 53),
+ (77, 101),
+ ];
for (a_offset, a_length) in a_slices {
let a = a.slice(a_offset, a_length);
@@ -518,14 +554,54 @@ mod tests {
let b_start_offset = rng.random_range(0..i);
let b_end_offset = rng.random_range(0..i);
+ // b with 50% nulls
let b: BooleanArray = (0..a_length + b_start_offset +
b_end_offset)
.map(|_| rng.random_bool(0.5).then(||
rng.random_bool(0.5)))
.collect();
- let b = b.slice(b_start_offset, a_length);
-
- test_nullif(&a, &b);
+ let b_sliced = b.slice(b_start_offset, a_length);
+ test_nullif(&a, &b_sliced);
+
+ // b with no nulls (and no null buffer)
+ let b = remove_null_buffer(&b);
+ let b_sliced = b.slice(b_start_offset, a_length);
+ test_nullif(&a, &b_sliced);
+
+ // b with no nulls (but with a null buffer)
+ let b = remove_null_values(&b);
+ let b_sliced = b.slice(b_start_offset, a_length);
+ test_nullif(&a, &b_sliced);
}
}
}
}
+
+ /// Returns a new BooleanArray with no null buffer
+ fn remove_null_buffer(array: &BooleanArray) -> BooleanArray {
+ make_array(
+ array
+ .into_data()
+ .into_builder()
+ .nulls(None)
+ .build()
+ .unwrap(),
+ )
+ .as_boolean()
+ .clone()
+ }
+
+ /// Returns a new BooleanArray with a null buffer where all values are
valid
+ fn remove_null_values(array: &BooleanArray) -> BooleanArray {
+ let len = array.len();
+ let new_nulls = NullBuffer::from_iter(std::iter::repeat_n(true, len));
+ make_array(
+ array
+ .into_data()
+ .into_builder()
+ .nulls(Some(new_nulls))
+ .build()
+ .unwrap(),
+ )
+ .as_boolean()
+ .clone()
+ }
}