This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new cd6053649c Add support for using ListView arrays and types through FFI
(#8822)
cd6053649c is described below
commit cd6053649c00200c2a8b826d1f94b62c9ab3cd06
Author: Adam Gutglick <[email protected]>
AuthorDate: Thu Dec 4 10:48:20 2025 -0500
Add support for using ListView arrays and types through FFI (#8822)
# Which issue does this PR close?
- Closes #8819
# Rationale for this change
Adds support for using list view arrays through FFI
# What changes are included in this PR?
Support for the ListView types and arrays
# Are these changes tested?
I've added additional tests that verify round trips through FFI, and a
pyarrow integration test.
# Are there any user-facing changes?
Extended support for ListView; doesn't affect existing functionality.
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
arrow-array/src/ffi.rs | 146 ++++++++++++++++++++-
arrow-data/src/data.rs | 2 +-
.../tests/test_sql.py | 37 ++++--
arrow-schema/src/ffi.rs | 22 ++++
4 files changed, 194 insertions(+), 13 deletions(-)
diff --git a/arrow-array/src/ffi.rs b/arrow-array/src/ffi.rs
index 542b61d892..f50dd3420b 100644
--- a/arrow-array/src/ffi.rs
+++ b/arrow-array/src/ffi.rs
@@ -140,8 +140,8 @@ pub unsafe fn export_array_into_raw(
Ok(())
}
-// returns the number of bits that buffer `i` (in the C data interface) is
expected to have.
-// This is set by the Arrow specification
+/// returns the number of bits that buffer `i` (in the C data interface) is
expected to have.
+/// This is set by the Arrow specification
fn bit_width(data_type: &DataType, i: usize) -> Result<usize> {
if let Some(primitive) = data_type.primitive_width() {
return match i {
@@ -180,6 +180,10 @@ fn bit_width(data_type: &DataType, i: usize) ->
Result<usize> {
| (DataType::List(_), 1)
| (DataType::Map(_, _), 1) => i32::BITS as _,
(DataType::Utf8, 2) | (DataType::Binary, 2) => u8::BITS as _,
+ // List views have two i32 buffers, offsets and sizes
+ (DataType::ListView(_), 1) | (DataType::ListView(_), 2) => i32::BITS
as _,
+ // Large list views have two i64 buffers, offsets and sizes
+ (DataType::LargeListView(_), 1) | (DataType::LargeListView(_), 2) =>
i64::BITS as _,
(DataType::List(_), _) | (DataType::Map(_, _), _) => {
return Err(ArrowError::CDataInterface(format!(
"The datatype \"{data_type}\" expects 2 buffers, but requested
{i}. Please verify that the C data interface is correctly implemented."
@@ -351,6 +355,8 @@ impl ImportedArrowArray<'_> {
DataType::List(field)
| DataType::FixedSizeList(field, _)
| DataType::LargeList(field)
+ | DataType::ListView(field)
+ | DataType::LargeListView(field)
| DataType::Map(field, _) => Ok([self.consume_child(0,
field.data_type())?].to_vec()),
DataType::Struct(fields) => {
assert!(fields.len() == self.array.num_children());
@@ -471,6 +477,14 @@ impl ImportedArrowArray<'_> {
debug_assert_eq!(bits % 8, 0);
(length + 1) * (bits / 8)
}
+ (DataType::ListView(_), 1)
+ | (DataType::ListView(_), 2)
+ | (DataType::LargeListView(_), 1)
+ | (DataType::LargeListView(_), 2) => {
+ let bits = bit_width(data_type, i)?;
+ debug_assert_eq!(bits % 8, 0);
+ length * (bits / 8)
+ }
(DataType::Utf8, 2) | (DataType::Binary, 2) => {
if self.array.is_empty() {
return Ok(0);
@@ -553,7 +567,7 @@ mod tests_to_then_from_ffi {
use std::collections::HashMap;
use std::mem::ManuallyDrop;
- use arrow_buffer::NullBuffer;
+ use arrow_buffer::{ArrowNativeType, NullBuffer};
use arrow_schema::Field;
use crate::builder::UnionBuilder;
@@ -783,6 +797,71 @@ mod tests_to_then_from_ffi {
test_generic_list::<i64>()
}
+ fn test_generic_list_view<Offset: OffsetSizeTrait + ArrowNativeType>() ->
Result<()> {
+ // Construct a value array
+ let value_data = ArrayData::builder(DataType::Int16)
+ .len(8)
+ .add_buffer(Buffer::from_slice_ref([0_i16, 1, 2, 3, 4, 5, 6, 7]))
+ .build()
+ .unwrap();
+
+ // Construct a buffer for value offsets, for the nested array:
+ // [[0, 1, 2], [3, 4, 5], [6, 7]]
+ let value_offsets = [0_usize, 3, 6]
+ .iter()
+ .map(|i| Offset::from_usize(*i).unwrap())
+ .collect::<Buffer>();
+
+ let sizes_buffer = [3_usize, 3, 2]
+ .iter()
+ .map(|i| Offset::from_usize(*i).unwrap())
+ .collect::<Buffer>();
+
+ // Construct a list array from the above two
+ let list_view_dt =
GenericListViewArray::<Offset>::DATA_TYPE_CONSTRUCTOR(Arc::new(
+ Field::new_list_field(DataType::Int16, false),
+ ));
+
+ let list_data = ArrayData::builder(list_view_dt)
+ .len(3)
+ .add_buffer(value_offsets)
+ .add_buffer(sizes_buffer)
+ .add_child_data(value_data)
+ .build()
+ .unwrap();
+
+ let original = GenericListViewArray::<Offset>::from(list_data.clone());
+
+ // export it
+ let (array, schema) = to_ffi(&original.to_data())?;
+
+ // (simulate consumer) import it
+ let data = unsafe { from_ffi(array, &schema) }?;
+ let array = make_array(data);
+
+ // downcast
+ let array = array
+ .as_any()
+ .downcast_ref::<GenericListViewArray<Offset>>()
+ .unwrap();
+
+ assert_eq!(&array.value(0), &original.value(0));
+ assert_eq!(&array.value(1), &original.value(1));
+ assert_eq!(&array.value(2), &original.value(2));
+
+ Ok(())
+ }
+
+ #[test]
+ fn test_list_view() -> Result<()> {
+ test_generic_list_view::<i32>()
+ }
+
+ #[test]
+ fn test_large_list_view() -> Result<()> {
+ test_generic_list_view::<i64>()
+ }
+
fn test_generic_binary<Offset: OffsetSizeTrait>() -> Result<()> {
// create an array natively
let array: Vec<Option<&[u8]>> = vec![Some(b"a"), None, Some(b"aaa")];
@@ -1315,6 +1394,7 @@ mod tests_from_ffi {
use std::ptr::NonNull;
use std::sync::Arc;
+ use arrow_buffer::NullBuffer;
#[cfg(not(feature = "force_validate"))]
use arrow_buffer::{ScalarBuffer, bit_util, buffer::Buffer};
#[cfg(feature = "force_validate")]
@@ -1325,6 +1405,7 @@ mod tests_from_ffi {
use arrow_schema::{DataType, Field};
use super::Result;
+
use crate::builder::GenericByteViewBuilder;
use crate::types::{BinaryViewType, ByteViewType, Int32Type,
StringViewType};
use crate::{
@@ -1528,6 +1609,65 @@ mod tests_from_ffi {
test_round_trip(&data)
}
+ #[test]
+ fn test_list_view() -> Result<()> {
+ // Construct a value array
+ let value_data = ArrayData::builder(DataType::Int16)
+ .len(8)
+ .add_buffer(Buffer::from_slice_ref([0_i16, 1, 2, 3, 4, 5, 6, 7]))
+ .build()
+ .unwrap();
+
+ // Construct a buffer for value offsets, for the nested array:
+ // [[0, 1, 2], [3, 4, 5], [6, 7]]
+ let value_offsets = Buffer::from(vec![0_i32, 3, 6]);
+ let sizes_buffer = Buffer::from(vec![3_i32, 3, 2]);
+
+ // Construct a list array from the above two
+ let list_view_dt =
+ DataType::ListView(Arc::new(Field::new_list_field(DataType::Int16,
false)));
+
+ let list_view_data = ArrayData::builder(list_view_dt)
+ .len(3)
+ .add_buffer(value_offsets)
+ .add_buffer(sizes_buffer)
+ .add_child_data(value_data)
+ .build()
+ .unwrap();
+
+ test_round_trip(&list_view_data)
+ }
+
+ #[test]
+ fn test_list_view_with_nulls() -> Result<()> {
+ // Construct a value array
+ let value_data = ArrayData::builder(DataType::Int16)
+ .len(8)
+ .add_buffer(Buffer::from_slice_ref([0_i16, 1, 2, 3, 4, 5, 6, 7]))
+ .build()
+ .unwrap();
+
+ // Construct a buffer for value offsets, for the nested array:
+ // [[0, 1, 2], [3, 4, 5], [6, 7], null]
+ let value_offsets = Buffer::from(vec![0_i32, 3, 6, 8]);
+ let sizes_buffer = Buffer::from(vec![3_i32, 3, 2, 0]);
+
+ // Construct a list array from the above two
+ let list_view_dt =
+ DataType::ListView(Arc::new(Field::new_list_field(DataType::Int16,
true)));
+
+ let list_view_data = ArrayData::builder(list_view_dt)
+ .len(4)
+ .add_buffer(value_offsets)
+ .add_buffer(sizes_buffer)
+ .add_child_data(value_data)
+ .nulls(Some(NullBuffer::from(vec![true, true, true, false])))
+ .build()
+ .unwrap();
+
+ test_round_trip(&list_view_data)
+ }
+
#[test]
#[cfg(not(feature = "force_validate"))]
fn test_empty_string_with_non_zero_offset() -> Result<()> {
diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs
index 329c12bbf4..df3b2c578f 100644
--- a/arrow-data/src/data.rs
+++ b/arrow-data/src/data.rs
@@ -1793,7 +1793,7 @@ impl DataTypeLayout {
},
],
can_contain_null_mask: true,
- variadic: true,
+ variadic: false,
}
}
}
diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py
b/arrow-pyarrow-integration-testing/tests/test_sql.py
index 3b46d5729a..f5d53155fe 100644
--- a/arrow-pyarrow-integration-testing/tests/test_sql.py
+++ b/arrow-pyarrow-integration-testing/tests/test_sql.py
@@ -27,7 +27,9 @@ import pytz
import arrow_pyarrow_integration_testing as rust
-PYARROW_PRE_14 = int(pa.__version__.split('.')[0]) < 14
+PYARROW_MAJOR_VER = int(pa.__version__.split(".")[0])
+PYARROW_PRE_14 = PYARROW_MAJOR_VER < 14
+PYARROW_PRE_16 = PYARROW_MAJOR_VER < 16
@contextlib.contextmanager
@@ -112,8 +114,16 @@ _supported_pyarrow_types = [
),
]
-_unsupported_pyarrow_types = [
-]
+if PYARROW_MAJOR_VER >= 16:
+ _supported_pyarrow_types.extend(
+ [
+ pa.list_view(pa.uint64()),
+ pa.large_list_view(pa.uint64()),
+ pa.list_view(pa.string()),
+ pa.large_list_view(pa.string()),
+ ]
+ )
+
# As of pyarrow 14, pyarrow implements the Arrow PyCapsule interface
#
(https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
@@ -158,12 +168,6 @@ def test_type_roundtrip_pycapsule(pyarrow_type):
assert restored == pyarrow_type
assert restored is not pyarrow_type
-
-@pytest.mark.parametrize("pyarrow_type", _unsupported_pyarrow_types, ids=str)
-def test_type_roundtrip_raises(pyarrow_type):
- with pytest.raises(pa.ArrowException):
- rust.round_trip_type(pyarrow_type)
-
@pytest.mark.parametrize('pyarrow_type', _supported_pyarrow_types, ids=str)
def test_field_roundtrip(pyarrow_type):
pyarrow_field = pa.field("test", pyarrow_type, nullable=True)
@@ -337,6 +341,21 @@ def test_list_array():
del a
del b
+
+@pytest.mark.skipif(PYARROW_PRE_16, reason="requires pyarrow 16")
+def test_list_view_array():
+ """
+ Python -> Rust -> Python
+ """
+ a = pa.array([[], None, [1, 2], [4, 5, 6]], pa.list_view(pa.int64()))
+ b = rust.round_trip_array(a)
+ b.validate(full=True)
+ assert a.to_pylist() == b.to_pylist()
+ assert a.type == b.type
+ del a
+ del b
+
+
def test_map_array():
"""
Python -> Rust -> Python
diff --git a/arrow-schema/src/ffi.rs b/arrow-schema/src/ffi.rs
index 2f890508e0..a1a32224a4 100644
--- a/arrow-schema/src/ffi.rs
+++ b/arrow-schema/src/ffi.rs
@@ -456,6 +456,14 @@ impl TryFrom<&FFI_ArrowSchema> for DataType {
let c_child = c_schema.child(0);
DataType::LargeList(Arc::new(Field::try_from(c_child)?))
}
+ "+vl" => {
+ let c_child = c_schema.child(0);
+ DataType::ListView(Arc::new(Field::try_from(c_child)?))
+ }
+ "+vL" => {
+ let c_child = c_schema.child(0);
+ DataType::LargeListView(Arc::new(Field::try_from(c_child)?))
+ }
"+s" => {
let fields = c_schema.children().map(Field::try_from);
DataType::Struct(fields.collect::<Result<_, ArrowError>>()?)
@@ -657,6 +665,8 @@ impl TryFrom<&DataType> for FFI_ArrowSchema {
let children = match dtype {
DataType::List(child)
| DataType::LargeList(child)
+ | DataType::ListView(child)
+ | DataType::LargeListView(child)
| DataType::FixedSizeList(child, _)
| DataType::Map(child, _) => {
vec![FFI_ArrowSchema::try_from(child.as_ref())?]
@@ -746,6 +756,8 @@ fn get_format_string(dtype: &DataType) ->
Result<Cow<'static, str>, ArrowError>
DataType::Interval(IntervalUnit::MonthDayNano) => Ok("tin".into()),
DataType::List(_) => Ok("+l".into()),
DataType::LargeList(_) => Ok("+L".into()),
+ DataType::ListView(_) => Ok("+vl".into()),
+ DataType::LargeListView(_) => Ok("+vL".into()),
DataType::Struct(_) => Ok("+s".into()),
DataType::Map(_, _) => Ok("+m".into()),
DataType::RunEndEncoded(_, _) => Ok("+r".into()),
@@ -874,6 +886,16 @@ mod tests {
DataType::Int16,
false,
))));
+ round_trip_type(DataType::ListView(Arc::new(Field::new(
+ "a",
+ DataType::Int16,
+ false,
+ ))));
+ round_trip_type(DataType::LargeListView(Arc::new(Field::new(
+ "a",
+ DataType::Int16,
+ false,
+ ))));
round_trip_type(DataType::Struct(Fields::from(vec![Field::new(
"a",
DataType::Utf8,