This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 5db072f1c7 arrow-select: implement specialized interleave_list (#8953)
5db072f1c7 is described below

commit 5db072f1c7fab3825748d693d7cbb3138d471162
Author: Alfonso Subiotto Marqués <[email protected]>
AuthorDate: Sat Dec 13 15:26:40 2025 +0100

    arrow-select: implement specialized interleave_list (#8953)
    
    Previously, List and LargeList would fall through to the
    interleave_fallback match arm, which is inefficient. This commit
    implements interleave_list, which interleaves a list's child arrays and
    rebuilds the offsets buffer. Running it on production tests reduced
    memory by 80%.
    
    # Which issue does this PR close?
    
    - Closes #8952
    
    # Rationale for this change
    
    Performance and memory usage when interleaving List/LargeList
    
    # Are these changes tested?
    
    This PR does not include tests because interleave tests for Lists
    already exist
    
    # Are there any user-facing changes?
    
    No, purely performance
    
    Signed-off-by: Alfonso Subiotto Marques <[email protected]>
---
 arrow-select/src/interleave.rs | 74 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 66 insertions(+), 8 deletions(-)

diff --git a/arrow-select/src/interleave.rs b/arrow-select/src/interleave.rs
index cb3ca655dc..f870bc8c2f 100644
--- a/arrow-select/src/interleave.rs
+++ b/arrow-select/src/interleave.rs
@@ -26,7 +26,7 @@ use arrow_array::*;
 use arrow_buffer::{ArrowNativeType, BooleanBuffer, MutableBuffer, NullBuffer, 
OffsetBuffer};
 use arrow_data::ByteView;
 use arrow_data::transform::MutableArrayData;
-use arrow_schema::{ArrowError, DataType, Fields};
+use arrow_schema::{ArrowError, DataType, FieldRef, Fields};
 use std::sync::Arc;
 
 macro_rules! primitive_helper {
@@ -106,6 +106,8 @@ pub fn interleave(
             _ => unreachable!("illegal dictionary key type {k}")
         },
         DataType::Struct(fields) => interleave_struct(fields, values, indices),
+        DataType::List(field) => interleave_list::<i32>(values, indices, 
field),
+        DataType::LargeList(field) => interleave_list::<i64>(values, indices, 
field),
         _ => interleave_fallback(values, indices)
     }
 }
@@ -319,6 +321,50 @@ fn interleave_struct(
     Ok(Arc::new(struct_array))
 }
 
+fn interleave_list<O: OffsetSizeTrait>(
+    values: &[&dyn Array],
+    indices: &[(usize, usize)],
+    field: &FieldRef,
+) -> Result<ArrayRef, ArrowError> {
+    let interleaved = Interleave::<'_, GenericListArray<O>>::new(values, 
indices);
+
+    let mut capacity = 0usize;
+    let mut offsets = Vec::with_capacity(indices.len() + 1);
+    offsets.push(O::from_usize(0).unwrap());
+    offsets.extend(indices.iter().map(|(array, row)| {
+        let o = interleaved.arrays[*array].value_offsets();
+        let element_len = o[*row + 1].as_usize() - o[*row].as_usize();
+        capacity += element_len;
+        O::from_usize(capacity).expect("offset overflow")
+    }));
+
+    let mut child_indices = Vec::with_capacity(capacity);
+    for (array, row) in indices {
+        let list = interleaved.arrays[*array];
+        let start = list.value_offsets()[*row].as_usize();
+        let end = list.value_offsets()[*row + 1].as_usize();
+        child_indices.extend((start..end).map(|i| (*array, i)));
+    }
+
+    let child_arrays: Vec<&dyn Array> = interleaved
+        .arrays
+        .iter()
+        .map(|list| list.values().as_ref())
+        .collect();
+
+    let interleaved_values = interleave(&child_arrays, &child_indices)?;
+
+    let offsets = OffsetBuffer::new(offsets.into());
+    let list_array = GenericListArray::<O>::new(
+        field.clone(),
+        offsets,
+        interleaved_values,
+        interleaved.nulls,
+    );
+
+    Ok(Arc::new(list_array))
+}
+
 /// Fallback implementation of interleave using [`MutableArrayData`]
 fn interleave_fallback(
     values: &[&dyn Array],
@@ -488,7 +534,7 @@ pub fn interleave_record_batch(
 mod tests {
     use super::*;
     use arrow_array::Int32RunArray;
-    use arrow_array::builder::{Int32Builder, ListBuilder, PrimitiveRunBuilder};
+    use arrow_array::builder::{GenericListBuilder, Int32Builder, 
PrimitiveRunBuilder};
     use arrow_array::types::Int8Type;
     use arrow_schema::Field;
 
@@ -622,10 +668,9 @@ mod tests {
         assert_eq!(string_result, vec!["v0", "v0", "v49"]);
     }
 
-    #[test]
-    fn test_lists() {
+    fn test_interleave_lists<O: OffsetSizeTrait>() {
         // [[1, 2], null, [3]]
-        let mut a = ListBuilder::new(Int32Builder::new());
+        let mut a = GenericListBuilder::<O, _>::new(Int32Builder::new());
         a.values().append_value(1);
         a.values().append_value(2);
         a.append(true);
@@ -635,7 +680,7 @@ mod tests {
         let a = a.finish();
 
         // [[4], null, [5, 6, null]]
-        let mut b = ListBuilder::new(Int32Builder::new());
+        let mut b = GenericListBuilder::<O, _>::new(Int32Builder::new());
         b.values().append_value(4);
         b.append(true);
         b.append(false);
@@ -646,10 +691,13 @@ mod tests {
         let b = b.finish();
 
         let values = interleave(&[&a, &b], &[(0, 2), (0, 1), (1, 0), (1, 2), 
(1, 1)]).unwrap();
-        let v = values.as_any().downcast_ref::<ListArray>().unwrap();
+        let v = values
+            .as_any()
+            .downcast_ref::<GenericListArray<O>>()
+            .unwrap();
 
         // [[3], null, [4], [5, 6, null], null]
-        let mut expected = ListBuilder::new(Int32Builder::new());
+        let mut expected = GenericListBuilder::<O, 
_>::new(Int32Builder::new());
         expected.values().append_value(3);
         expected.append(true);
         expected.append(false);
@@ -665,6 +713,16 @@ mod tests {
         assert_eq!(v, &expected);
     }
 
+    #[test]
+    fn test_lists() {
+        test_interleave_lists::<i32>();
+    }
+
+    #[test]
+    fn test_large_lists() {
+        test_interleave_lists::<i64>();
+    }
+
     #[test]
     fn test_struct_without_nulls() {
         let fields = Fields::from(vec![

Reply via email to