This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 57b64b4978 Fix IPC roundtripping dicts nested in ListViews (#9126)
57b64b4978 is described below

commit 57b64b49788da4ba36e63415f8130b01ab8dbb86
Author: Frederic Branczyk <[email protected]>
AuthorDate: Sat Jan 10 13:03:01 2026 +0100

    Fix IPC roundtripping dicts nested in ListViews (#9126)
    
    # Which issue does this PR close?
    
    Closes https://github.com/apache/arrow-rs/issues/9125
    
    # Rationale for this change
    
    Fixes correct handling of dict building for IPC when ListView is used.
    
    # What changes are included in this PR?
    
    Creating schemas correctly and reading dict IDs correctly when ListView
    occurs in IPC.
    
    # Are these changes tested?
    
    Yes, see new unit tests.
    
    # Are there any user-facing changes?
    
    No, just a bug fix.
    
    @alamb
---
 arrow-ipc/src/writer.rs   | 136 ++++++++++++++++++++++++++++++++++++++++++++++
 arrow-schema/src/field.rs |   2 +
 2 files changed, 138 insertions(+)

diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs
index c868641ab6..8afc3c8ed2 100644
--- a/arrow-ipc/src/writer.rs
+++ b/arrow-ipc/src/writer.rs
@@ -296,6 +296,30 @@ impl IpcDataGenerator {
                     compression_context,
                 )?;
             }
+            DataType::ListView(field) => {
+                let list = column.as_list_view::<i32>();
+                self.encode_dictionaries(
+                    field,
+                    list.values(),
+                    encoded_dictionaries,
+                    dictionary_tracker,
+                    write_options,
+                    dict_id,
+                    compression_context,
+                )?;
+            }
+            DataType::LargeListView(field) => {
+                let list = column.as_list_view::<i64>();
+                self.encode_dictionaries(
+                    field,
+                    list.values(),
+                    encoded_dictionaries,
+                    dictionary_tracker,
+                    write_options,
+                    dict_id,
+                    compression_context,
+                )?;
+            }
             DataType::FixedSizeList(field, _) => {
                 let list = column
                     .as_any()
@@ -3402,6 +3426,118 @@ mod tests {
         assert_eq!(in_batch, out_batch);
     }
 
+    fn test_roundtrip_list_view_of_dict_impl<OffsetSize: OffsetSizeTrait, U: 
ArrowNativeType>(
+        list_data_type: DataType,
+        offsets: &[U; 5],
+        sizes: &[U; 4],
+    ) {
+        let values = StringArray::from(vec![Some("alpha"), None, Some("beta"), 
Some("gamma")]);
+        let keys = Int32Array::from_iter_values([0, 0, 1, 2, 3, 0, 2]);
+        let dict_array = DictionaryArray::new(keys, Arc::new(values));
+        let dict_data = dict_array.to_data();
+
+        let value_offsets = Buffer::from_slice_ref(offsets);
+        let value_sizes = Buffer::from_slice_ref(sizes);
+
+        let list_data = ArrayData::builder(list_data_type)
+            .len(4)
+            .add_buffer(value_offsets)
+            .add_buffer(value_sizes)
+            .add_child_data(dict_data)
+            .build()
+            .unwrap();
+        let list_view_array = 
GenericListViewArray::<OffsetSize>::from(list_data);
+
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "f1",
+            list_view_array.data_type().clone(),
+            false,
+        )]));
+        let input_batch = RecordBatch::try_new(schema, 
vec![Arc::new(list_view_array)]).unwrap();
+
+        let output_batch = deserialize_file(serialize_file(&input_batch));
+        assert_eq!(input_batch, output_batch);
+
+        let output_batch = deserialize_stream(serialize_stream(&input_batch));
+        assert_eq!(input_batch, output_batch);
+    }
+
+    #[test]
+    fn test_roundtrip_list_view_of_dict() {
+        #[allow(deprecated)]
+        let list_data_type = DataType::ListView(Arc::new(Field::new_dict(
+            "item",
+            DataType::Dictionary(Box::new(DataType::Int32), 
Box::new(DataType::Utf8)),
+            true,
+            1,
+            false,
+        )));
+        let offsets: &[i32; 5] = &[0, 2, 4, 4, 7];
+        let sizes: &[i32; 4] = &[2, 2, 0, 3];
+        test_roundtrip_list_view_of_dict_impl::<i32, i32>(list_data_type, 
offsets, sizes);
+    }
+
+    #[test]
+    fn test_roundtrip_large_list_view_of_dict() {
+        #[allow(deprecated)]
+        let list_data_type = DataType::LargeListView(Arc::new(Field::new_dict(
+            "item",
+            DataType::Dictionary(Box::new(DataType::Int32), 
Box::new(DataType::Utf8)),
+            true,
+            2,
+            false,
+        )));
+        let offsets: &[i64; 5] = &[0, 2, 4, 4, 7];
+        let sizes: &[i64; 4] = &[2, 2, 0, 3];
+        test_roundtrip_list_view_of_dict_impl::<i64, i64>(list_data_type, 
offsets, sizes);
+    }
+
+    #[test]
+    fn test_roundtrip_sliced_list_view_of_dict() {
+        #[allow(deprecated)]
+        let list_data_type = DataType::ListView(Arc::new(Field::new_dict(
+            "item",
+            DataType::Dictionary(Box::new(DataType::Int32), 
Box::new(DataType::Utf8)),
+            true,
+            3,
+            false,
+        )));
+
+        let values = StringArray::from(vec![Some("alpha"), None, Some("beta"), 
Some("gamma")]);
+        let keys = Int32Array::from_iter_values([0, 0, 1, 2, 3, 0, 2, 1, 0, 3, 
2, 1]);
+        let dict_array = DictionaryArray::new(keys, Arc::new(values));
+        let dict_data = dict_array.to_data();
+
+        let offsets: &[i32; 7] = &[0, 2, 4, 4, 7, 9, 12];
+        let sizes: &[i32; 6] = &[2, 2, 0, 3, 2, 3];
+        let value_offsets = Buffer::from_slice_ref(offsets);
+        let value_sizes = Buffer::from_slice_ref(sizes);
+
+        let list_data = ArrayData::builder(list_data_type)
+            .len(6)
+            .add_buffer(value_offsets)
+            .add_buffer(value_sizes)
+            .add_child_data(dict_data)
+            .build()
+            .unwrap();
+        let list_view_array = GenericListViewArray::<i32>::from(list_data);
+
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "f1",
+            list_view_array.data_type().clone(),
+            false,
+        )]));
+        let input_batch = RecordBatch::try_new(schema, 
vec![Arc::new(list_view_array)]).unwrap();
+
+        let sliced_batch = input_batch.slice(1, 4);
+
+        let output_batch = deserialize_file(serialize_file(&sliced_batch));
+        assert_eq!(sliced_batch, output_batch);
+
+        let output_batch = deserialize_stream(serialize_stream(&sliced_batch));
+        assert_eq!(sliced_batch, output_batch);
+    }
+
     #[test]
     fn test_decimal128_alignment16_is_sufficient() {
         const IPC_ALIGNMENT: usize = 16;
diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs
index 1b9a298e59..c4566e41bf 100644
--- a/arrow-schema/src/field.rs
+++ b/arrow-schema/src/field.rs
@@ -718,6 +718,8 @@ impl Field {
             DataType::Union(fields, _) => fields.iter().flat_map(|(_, f)| 
f.fields()).collect(),
             DataType::List(field)
             | DataType::LargeList(field)
+            | DataType::ListView(field)
+            | DataType::LargeListView(field)
             | DataType::FixedSizeList(field, _)
             | DataType::Map(field, _) => field.fields(),
             DataType::Dictionary(_, value_field) => 
Field::_fields(value_field.as_ref()),

Reply via email to