This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 28c7c5239c builder: Error when concatenating binary arrays would
exceed offset size (#8252)
28c7c5239c is described below
commit 28c7c5239caecb4b50f717b9262592701ab4eada
Author: mwish <[email protected]>
AuthorDate: Fri Sep 26 04:54:51 2025 +0800
builder: Error when concatenating binary arrays would exceed offset size
(#8252)
# Which issue does this PR close?
- Closes #8247 .
# Rationale for this change
When concat array, the final value might:
1. unwrap when adding
2. unlucky, some even not unwrap, leaving a negative offset at ending of
offsets, causing coredump
# What changes are included in this PR?
Prevent from offset here
# Are these changes tested?
* [x] To add ( I don't know memory size would be too large for this?)
# Are there any user-facing changes?
**Breaking changes**:
1. append_array now return `Result<()>`
2. OffsetSize trait change to have CheckedAdd
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
arrow-array/src/array/list_array.rs | 4 +-
arrow-array/src/builder/generic_bytes_builder.rs | 54 +++++++++++++++++-------
arrow-select/src/concat.rs | 2 +-
3 files changed, 43 insertions(+), 17 deletions(-)
diff --git a/arrow-array/src/array/list_array.rs
b/arrow-array/src/array/list_array.rs
index 8836b5b0f7..0ddccb9681 100644
--- a/arrow-array/src/array/list_array.rs
+++ b/arrow-array/src/array/list_array.rs
@@ -37,7 +37,9 @@ use std::sync::Arc;
/// [`LargeBinaryArray`]: crate::array::LargeBinaryArray
/// [`StringArray`]: crate::array::StringArray
/// [`LargeStringArray`]: crate::array::LargeStringArray
-pub trait OffsetSizeTrait: ArrowNativeType + std::ops::AddAssign + Integer {
+pub trait OffsetSizeTrait:
+ ArrowNativeType + std::ops::AddAssign + Integer + num::CheckedAdd
+{
/// True for 64 bit offset size and false for 32 bit offset size
const IS_LARGE: bool;
/// Prefix for the offset size
diff --git a/arrow-array/src/builder/generic_bytes_builder.rs
b/arrow-array/src/builder/generic_bytes_builder.rs
index ffaf9ff351..1480f8f328 100644
--- a/arrow-array/src/builder/generic_bytes_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_builder.rs
@@ -20,6 +20,7 @@ use crate::types::{ByteArrayType, GenericBinaryType,
GenericStringType};
use crate::{Array, ArrayRef, GenericByteArray, OffsetSizeTrait};
use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer, NullBufferBuilder,
ScalarBuffer};
use arrow_data::ArrayDataBuilder;
+use arrow_schema::ArrowError;
use std::any::Any;
use std::sync::Arc;
@@ -142,9 +143,10 @@ impl<T: ByteArrayType> GenericByteBuilder<T> {
/// Appends array values and null to this builder as is
/// (this means that underlying null values are copied as is).
#[inline]
- pub fn append_array(&mut self, array: &GenericByteArray<T>) {
+ pub fn append_array(&mut self, array: &GenericByteArray<T>) -> Result<(),
ArrowError> {
+ use num::CheckedAdd;
if array.len() == 0 {
- return;
+ return Ok(());
}
let offsets = array.offsets();
@@ -157,6 +159,12 @@ impl<T: ByteArrayType> GenericByteBuilder<T> {
// Shifting all the offsets
let shift: T::Offset = self.next_offset() - offsets[0];
+ if shift.checked_add(&offsets[offsets.len() - 1]).is_none() {
+ return Err(ArrowError::OffsetOverflowError(
+ shift.as_usize() + offsets[offsets.len() - 1].as_usize(),
+ ));
+ }
+
self.offsets_builder
.extend(offsets[1..].iter().map(|&offset| offset + shift));
}
@@ -171,6 +179,7 @@ impl<T: ByteArrayType> GenericByteBuilder<T> {
} else {
self.null_buffer_builder.append_n_non_nulls(array.len());
}
+ Ok(())
}
/// Builds the [`GenericByteArray`] and reset this builder.
@@ -665,9 +674,9 @@ mod tests {
let arr3 = GenericStringArray::<i32>::from(input[7..].to_vec());
let mut builder = GenericStringBuilder::<i32>::new();
- builder.append_array(&arr1);
- builder.append_array(&arr2);
- builder.append_array(&arr3);
+ builder.append_array(&arr1).unwrap();
+ builder.append_array(&arr2).unwrap();
+ builder.append_array(&arr3).unwrap();
let actual = builder.finish();
let expected = GenericStringArray::<i32>::from(input);
@@ -695,9 +704,9 @@ mod tests {
let arr3 = GenericStringArray::<i32>::from(input[7..].to_vec());
let mut builder = GenericStringBuilder::<i32>::new();
- builder.append_array(&arr1);
- builder.append_array(&arr2);
- builder.append_array(&arr3);
+ builder.append_array(&arr1).unwrap();
+ builder.append_array(&arr2).unwrap();
+ builder.append_array(&arr3).unwrap();
let actual = builder.finish();
let expected = GenericStringArray::<i32>::from(input);
@@ -709,7 +718,7 @@ mod tests {
fn test_append_empty_array() {
let arr = GenericStringArray::<i32>::from(Vec::<&str>::new());
let mut builder = GenericStringBuilder::<i32>::new();
- builder.append_array(&arr);
+ builder.append_array(&arr).unwrap();
let result = builder.finish();
assert_eq!(result.len(), 0);
}
@@ -736,7 +745,7 @@ mod tests {
assert_ne!(sliced.offsets().last(), full_array.offsets().last());
let mut builder = GenericStringBuilder::<i32>::new();
- builder.append_array(&sliced);
+ builder.append_array(&sliced).unwrap();
let actual = builder.finish();
let expected = GenericStringArray::<i32>::from(vec![None, Some("how"),
None, None]);
@@ -772,8 +781,8 @@ mod tests {
};
let mut builder = GenericStringBuilder::<i32>::new();
- builder.append_array(&input_1_array_with_nulls);
- builder.append_array(&input_2_array_with_nulls);
+ builder.append_array(&input_1_array_with_nulls).unwrap();
+ builder.append_array(&input_2_array_with_nulls).unwrap();
let actual = builder.finish();
let expected = GenericStringArray::<i32>::from(vec![
@@ -819,12 +828,27 @@ mod tests {
let slice3 = full_array.slice(7, full_array.len() - 7);
let mut builder = GenericStringBuilder::<i32>::new();
- builder.append_array(&slice1);
- builder.append_array(&slice2);
- builder.append_array(&slice3);
+ builder.append_array(&slice1).unwrap();
+ builder.append_array(&slice2).unwrap();
+ builder.append_array(&slice3).unwrap();
let actual = builder.finish();
assert_eq!(actual, full_array);
}
+
+ #[test]
+ fn test_append_array_offset_overflow_precise() {
+ let mut builder = GenericStringBuilder::<i32>::new();
+
+ let initial_string = "x".repeat(i32::MAX as usize - 100);
+ builder.append_value(&initial_string);
+
+ let overflow_string = "y".repeat(200);
+ let overflow_array =
GenericStringArray::<i32>::from(vec![overflow_string.as_str()]);
+
+ let result = builder.append_array(&overflow_array);
+
+ assert!(matches!(result, Err(ArrowError::OffsetOverflowError(_))));
+ }
}
diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs
index d300644792..fab47a588d 100644
--- a/arrow-select/src/concat.rs
+++ b/arrow-select/src/concat.rs
@@ -236,7 +236,7 @@ fn concat_bytes<T: ByteArrayType>(arrays: &[&dyn Array]) ->
Result<ArrayRef, Arr
let mut builder = GenericByteBuilder::<T>::with_capacity(item_capacity,
bytes_capacity);
for array in arrays {
- builder.append_array(array.as_bytes::<T>());
+ builder.append_array(array.as_bytes::<T>())?;
}
Ok(Arc::new(builder.finish()))