This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 28c7c5239c builder: Error when concatenating binary arrays would 
exceed offset size (#8252)
28c7c5239c is described below

commit 28c7c5239caecb4b50f717b9262592701ab4eada
Author: mwish <[email protected]>
AuthorDate: Fri Sep 26 04:54:51 2025 +0800

    builder: Error when concatenating binary arrays would exceed offset size 
(#8252)
    
    # Which issue does this PR close?
    
    - Closes #8247 .
    
    # Rationale for this change
    
    When concat array, the final value might:
    1. unwrap when adding
    2. unlucky, some even not unwrap, leaving a negative offset at ending of
    offsets, causing coredump
    
    # What changes are included in this PR?
    
    Prevent from offset here
    
    # Are these changes tested?
    
    * [x] To add ( I don't know memory size would be too large for this?)
    
    
    # Are there any user-facing changes?
    
    **Breaking changes**:
    
    1. append_array now return `Result<()>`
    2. OffsetSize trait change to have CheckedAdd
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 arrow-array/src/array/list_array.rs              |  4 +-
 arrow-array/src/builder/generic_bytes_builder.rs | 54 +++++++++++++++++-------
 arrow-select/src/concat.rs                       |  2 +-
 3 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/arrow-array/src/array/list_array.rs 
b/arrow-array/src/array/list_array.rs
index 8836b5b0f7..0ddccb9681 100644
--- a/arrow-array/src/array/list_array.rs
+++ b/arrow-array/src/array/list_array.rs
@@ -37,7 +37,9 @@ use std::sync::Arc;
 /// [`LargeBinaryArray`]: crate::array::LargeBinaryArray
 /// [`StringArray`]: crate::array::StringArray
 /// [`LargeStringArray`]: crate::array::LargeStringArray
-pub trait OffsetSizeTrait: ArrowNativeType + std::ops::AddAssign + Integer {
+pub trait OffsetSizeTrait:
+    ArrowNativeType + std::ops::AddAssign + Integer + num::CheckedAdd
+{
     /// True for 64 bit offset size and false for 32 bit offset size
     const IS_LARGE: bool;
     /// Prefix for the offset size
diff --git a/arrow-array/src/builder/generic_bytes_builder.rs 
b/arrow-array/src/builder/generic_bytes_builder.rs
index ffaf9ff351..1480f8f328 100644
--- a/arrow-array/src/builder/generic_bytes_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_builder.rs
@@ -20,6 +20,7 @@ use crate::types::{ByteArrayType, GenericBinaryType, 
GenericStringType};
 use crate::{Array, ArrayRef, GenericByteArray, OffsetSizeTrait};
 use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer, NullBufferBuilder, 
ScalarBuffer};
 use arrow_data::ArrayDataBuilder;
+use arrow_schema::ArrowError;
 use std::any::Any;
 use std::sync::Arc;
 
@@ -142,9 +143,10 @@ impl<T: ByteArrayType> GenericByteBuilder<T> {
     /// Appends array values and null to this builder as is
     /// (this means that underlying null values are copied as is).
     #[inline]
-    pub fn append_array(&mut self, array: &GenericByteArray<T>) {
+    pub fn append_array(&mut self, array: &GenericByteArray<T>) -> Result<(), 
ArrowError> {
+        use num::CheckedAdd;
         if array.len() == 0 {
-            return;
+            return Ok(());
         }
 
         let offsets = array.offsets();
@@ -157,6 +159,12 @@ impl<T: ByteArrayType> GenericByteBuilder<T> {
             // Shifting all the offsets
             let shift: T::Offset = self.next_offset() - offsets[0];
 
+            if shift.checked_add(&offsets[offsets.len() - 1]).is_none() {
+                return Err(ArrowError::OffsetOverflowError(
+                    shift.as_usize() + offsets[offsets.len() - 1].as_usize(),
+                ));
+            }
+
             self.offsets_builder
                 .extend(offsets[1..].iter().map(|&offset| offset + shift));
         }
@@ -171,6 +179,7 @@ impl<T: ByteArrayType> GenericByteBuilder<T> {
         } else {
             self.null_buffer_builder.append_n_non_nulls(array.len());
         }
+        Ok(())
     }
 
     /// Builds the [`GenericByteArray`] and reset this builder.
@@ -665,9 +674,9 @@ mod tests {
         let arr3 = GenericStringArray::<i32>::from(input[7..].to_vec());
 
         let mut builder = GenericStringBuilder::<i32>::new();
-        builder.append_array(&arr1);
-        builder.append_array(&arr2);
-        builder.append_array(&arr3);
+        builder.append_array(&arr1).unwrap();
+        builder.append_array(&arr2).unwrap();
+        builder.append_array(&arr3).unwrap();
 
         let actual = builder.finish();
         let expected = GenericStringArray::<i32>::from(input);
@@ -695,9 +704,9 @@ mod tests {
         let arr3 = GenericStringArray::<i32>::from(input[7..].to_vec());
 
         let mut builder = GenericStringBuilder::<i32>::new();
-        builder.append_array(&arr1);
-        builder.append_array(&arr2);
-        builder.append_array(&arr3);
+        builder.append_array(&arr1).unwrap();
+        builder.append_array(&arr2).unwrap();
+        builder.append_array(&arr3).unwrap();
 
         let actual = builder.finish();
         let expected = GenericStringArray::<i32>::from(input);
@@ -709,7 +718,7 @@ mod tests {
     fn test_append_empty_array() {
         let arr = GenericStringArray::<i32>::from(Vec::<&str>::new());
         let mut builder = GenericStringBuilder::<i32>::new();
-        builder.append_array(&arr);
+        builder.append_array(&arr).unwrap();
         let result = builder.finish();
         assert_eq!(result.len(), 0);
     }
@@ -736,7 +745,7 @@ mod tests {
         assert_ne!(sliced.offsets().last(), full_array.offsets().last());
 
         let mut builder = GenericStringBuilder::<i32>::new();
-        builder.append_array(&sliced);
+        builder.append_array(&sliced).unwrap();
         let actual = builder.finish();
 
         let expected = GenericStringArray::<i32>::from(vec![None, Some("how"), 
None, None]);
@@ -772,8 +781,8 @@ mod tests {
         };
 
         let mut builder = GenericStringBuilder::<i32>::new();
-        builder.append_array(&input_1_array_with_nulls);
-        builder.append_array(&input_2_array_with_nulls);
+        builder.append_array(&input_1_array_with_nulls).unwrap();
+        builder.append_array(&input_2_array_with_nulls).unwrap();
 
         let actual = builder.finish();
         let expected = GenericStringArray::<i32>::from(vec![
@@ -819,12 +828,27 @@ mod tests {
         let slice3 = full_array.slice(7, full_array.len() - 7);
 
         let mut builder = GenericStringBuilder::<i32>::new();
-        builder.append_array(&slice1);
-        builder.append_array(&slice2);
-        builder.append_array(&slice3);
+        builder.append_array(&slice1).unwrap();
+        builder.append_array(&slice2).unwrap();
+        builder.append_array(&slice3).unwrap();
 
         let actual = builder.finish();
 
         assert_eq!(actual, full_array);
     }
+
+    #[test]
+    fn test_append_array_offset_overflow_precise() {
+        let mut builder = GenericStringBuilder::<i32>::new();
+
+        let initial_string = "x".repeat(i32::MAX as usize - 100);
+        builder.append_value(&initial_string);
+
+        let overflow_string = "y".repeat(200);
+        let overflow_array = 
GenericStringArray::<i32>::from(vec![overflow_string.as_str()]);
+
+        let result = builder.append_array(&overflow_array);
+
+        assert!(matches!(result, Err(ArrowError::OffsetOverflowError(_))));
+    }
 }
diff --git a/arrow-select/src/concat.rs b/arrow-select/src/concat.rs
index d300644792..fab47a588d 100644
--- a/arrow-select/src/concat.rs
+++ b/arrow-select/src/concat.rs
@@ -236,7 +236,7 @@ fn concat_bytes<T: ByteArrayType>(arrays: &[&dyn Array]) -> 
Result<ArrayRef, Arr
     let mut builder = GenericByteBuilder::<T>::with_capacity(item_capacity, 
bytes_capacity);
 
     for array in arrays {
-        builder.append_array(array.as_bytes::<T>());
+        builder.append_array(array.as_bytes::<T>())?;
     }
 
     Ok(Arc::new(builder.finish()))

Reply via email to