alamb commented on code in PR #21934:
URL: https://github.com/apache/datafusion/pull/21934#discussion_r3283304180
##########
datafusion/common/src/scalar/mod.rs:
##########
Review Comment:
Is there any reason not to recurse into Dictionary, Union, and and REE
arrays too? They are all recursive as well
##########
datafusion/common/src/scalar/mod.rs:
##########
@@ -10708,4 +10792,200 @@ mod tests {
]
);
}
+
+ // ── compact / compact_view_buffers
───────────────────────────────────────
+
+ /// Builds a `StringViewArray` with `n` strings that are all longer than
+ /// 12 bytes so they are stored in backing buffers rather than inline.
+ fn make_long_strings(n: usize) -> StringViewArray {
+ let mut b = StringViewBuilder::new();
+ for i in 0..n {
+ b.append_value(format!("long_string_value_pad_{i:04}"));
+ }
+ b.finish()
+ }
+
+ /// Total bytes across all backing buffers of a `StringViewArray`.
+ fn utf8view_buffer_bytes(a: &StringViewArray) -> usize {
+ a.data_buffers().iter().map(|b| b.len()).sum()
+ }
+
+ #[test]
+ fn test_compact_list_utf8view() {
+ const N: usize = 50;
Review Comment:
there is an awful lot of repetition in these test
```rust
const N: usize = 50;
let strings = make_long_strings(N);
let one_len = strings.value(0).len();
assert!(utf8view_buffer_bytes(&strings) >= N * one_len);
let single_row_list_array =
SingleRowListArrayBuilder::new(Arc::new(strings.slice(0, 1)) as
ArrayRef)
.build_large_list_array();
...
assert_eq!(
utf8view_buffer_bytes(arr.values().as_string_view()),
one_len
);
assert_eq!(arr.values().as_string_view().value(0), strings.value(0));
```
is just copy/pasted in each one.
Maybe we could move it into a fixture to reduce duplication and make it
clearer what is different between different tests
##########
datafusion/common/src/scalar/mod.rs:
##########
@@ -10708,4 +10792,200 @@ mod tests {
]
);
}
+
+ // ── compact / compact_view_buffers
───────────────────────────────────────
Review Comment:
I tested that these tests fail when I back out the code changes
<details>
```diff
diff --git a/datafusion/common/src/scalar/mod.rs
b/datafusion/common/src/scalar/mod.rs
index 63cbce10ae..73f4f0b76c 100644
--- a/datafusion/common/src/scalar/mod.rs
+++ b/datafusion/common/src/scalar/mod.rs
@@ -4719,14 +4719,6 @@ impl ScalarValue {
/// This can be relevant when `self` is a list or contains a list as a
nested value, as
/// a single list holds an Arc to its entire original array buffer.
pub fn compact(&mut self) {
- // copy_array_data + compact_view_buffers + downcast back, all in
one step.
- macro_rules! compact_array {
- ($arr:expr, $from_type:ty, $($as_method:tt)+) => {
- *Arc::make_mut($arr) = ScalarValue::compact_view_buffers(
-
Arc::new(<$from_type>::from(copy_array_data(&$arr.to_data()))) as ArrayRef,
- ).$($as_method)+.clone()
- };
- }
match self {
ScalarValue::Null
| ScalarValue::Boolean(_)
@@ -4770,20 +4762,33 @@ impl ScalarValue {
| ScalarValue::LargeBinary(_)
| ScalarValue::BinaryView(_) => (),
ScalarValue::FixedSizeList(arr) => {
- compact_array!(arr, FixedSizeListArray,
as_fixed_size_list())
+ let array = copy_array_data(&arr.to_data());
+ *Arc::make_mut(arr) = FixedSizeListArray::from(array);
+ }
+ ScalarValue::List(arr) => {
+ let array = copy_array_data(&arr.to_data());
+ *Arc::make_mut(arr) = ListArray::from(array);
}
- ScalarValue::List(arr) => compact_array!(arr, ListArray,
as_list::<i32>()),
ScalarValue::LargeList(arr) => {
- compact_array!(arr, LargeListArray, as_list::<i64>())
+ let array = copy_array_data(&arr.to_data());
+ *Arc::make_mut(arr) = LargeListArray::from(array)
}
ScalarValue::ListView(arr) => {
- compact_array!(arr, ListViewArray, as_list_view::<i32>())
+ let array = copy_array_data(&arr.to_data());
+ *Arc::make_mut(arr) = ListViewArray::from(array);
}
ScalarValue::LargeListView(arr) => {
- compact_array!(arr, LargeListViewArray,
as_list_view::<i64>())
+ let array = copy_array_data(&arr.to_data());
+ *Arc::make_mut(arr) = LargeListViewArray::from(array)
+ }
+ ScalarValue::Struct(arr) => {
+ let array = copy_array_data(&arr.to_data());
+ *Arc::make_mut(arr) = StructArray::from(array);
+ }
+ ScalarValue::Map(arr) => {
+ let array = copy_array_data(&arr.to_data());
+ *Arc::make_mut(arr) = MapArray::from(array);
}
- ScalarValue::Struct(arr) => compact_array!(arr, StructArray,
as_struct()),
- ScalarValue::Map(arr) => compact_array!(arr, MapArray,
as_map()),
- ScalarValue::Union(val, _, _) => {
if let Some((_, value)) = val.as_mut() {
value.compact();
@@ -4804,95 +4809,6 @@ impl ScalarValue {
self
}
- /// Recursively compacts the backing buffers of any [`StringViewArray`]
or
- /// [`BinaryViewArray`] nested within `array`.
- ///
- /// View-typed arrays keep an `Arc` reference to their original backing
- /// buffers, so a single scalar extracted from a large batch still
retains
- /// the entire buffer. Calling [`.gc()`][StringViewArray::gc] copies
only
- /// the bytes that are actually referenced by the surviving views,
releasing
- /// the rest.
- ///
- /// Container types (`List`, `LargeList`, `FixedSizeList`, `ListView`,
- /// `LargeListView`, `Struct`, `Map`) are handled by recursing into
their
- /// child / values arrays and reconstructing the parent with the
compacted
- /// children. All other types are returned unchanged.
- fn compact_view_buffers(array: ArrayRef) -> ArrayRef {
- // Macro for the i32/i64-offset list pair (List / LargeList).
- macro_rules! gc_list {
- ($field:expr, $offset_type:ty, $array_type:ty) => {{
- let list = array.as_list::<$offset_type>();
- Arc::new(<$array_type>::new(
- Arc::clone($field),
- list.offsets().clone(),
-
ScalarValue::compact_view_buffers(Arc::clone(list.values())),
- list.nulls().cloned(),
- )) as ArrayRef
- }};
- }
- // Macro for the i32/i64-offset list-view pair (ListView /
LargeListView).
- macro_rules! gc_list_view {
- ($field:expr, $offset_type:ty, $array_type:ty) => {{
- let list = array.as_list_view::<$offset_type>();
- Arc::new(<$array_type>::new(
- Arc::clone($field),
- list.offsets().clone(),
- list.sizes().clone(),
-
ScalarValue::compact_view_buffers(Arc::clone(list.values())),
- list.nulls().cloned(),
- )) as ArrayRef
- }};
- }
-
- match array.data_type() {
- DataType::Utf8View => Arc::new(array.as_string_view().gc()),
- DataType::BinaryView => Arc::new(array.as_binary_view().gc()),
- DataType::Struct(_) => {
- let s = array.as_struct();
- let columns = s
- .columns()
- .iter()
- .map(|c|
ScalarValue::compact_view_buffers(Arc::clone(c)))
- .collect();
- Arc::new(StructArray::new(
- s.fields().clone(),
- columns,
- s.nulls().cloned(),
- ))
- }
-
- DataType::List(field) => gc_list!(field, i32, ListArray),
- DataType::LargeList(field) => gc_list!(field, i64,
LargeListArray),
- DataType::FixedSizeList(field, size) => {
- let list = array.as_fixed_size_list();
- Arc::new(FixedSizeListArray::new(
- Arc::clone(field),
- *size,
-
ScalarValue::compact_view_buffers(Arc::clone(list.values())),
- list.nulls().cloned(),
- ))
- }
- DataType::ListView(field) => gc_list_view!(field, i32,
ListViewArray),
- DataType::LargeListView(field) => {
- gc_list_view!(field, i64, LargeListViewArray)
- }
- DataType::Map(field, ordered) => {
- let map = array.as_map();
- let entries = ScalarValue::compact_view_buffers(Arc::new(
- map.entries().clone(),
- )
- as ArrayRef);
- Arc::new(MapArray::new(
- Arc::clone(field),
- map.offsets().clone(),
- entries.as_struct().clone(),
- map.nulls().cloned(),
- *ordered,
- ))
- }
- _ => array,
- }
- }
-
```
</details>
```shell
$ cargo test -p datafusion-common --lib scalar::tests::test_compact
...
failures:
---- scalar::tests::test_compact_struct_utf8view stdout ----
thread 'scalar::tests::test_compact_struct_utf8view' (76301068) panicked at
datafusion/common/src/scalar/mod.rs:10864:9:
assertion `left == right` failed
left: 1300
right: 26
---- scalar::tests::test_compact_large_list_view_utf8view stdout ----
thread 'scalar::tests::test_compact_large_list_view_utf8view' (76301064)
panicked at datafusion/common/src/scalar/mod.rs:10837:9:
assertion `left == right` failed
left: 1300
right: 26
note: run with `RUST_BACKTRACE=1` environment variable to display a backtrace
---- scalar::tests::test_compact_list_view_utf8view stdout ----
thread 'scalar::tests::test_compact_list_view_utf8view' (76301066) panicked
at datafusion/common/src/scalar/mod.rs:10814:9:
assertion `left == right` failed
left: 1300
right: 26
---- scalar::tests::test_compact_list_utf8view stdout ----
thread 'scalar::tests::test_compact_list_utf8view' (76301065) panicked at
datafusion/common/src/scalar/mod.rs:10745:9:
assertion `left == right` failed
left: 1300
right: 26
---- scalar::tests::test_compact_large_list_utf8view stdout ----
thread 'scalar::tests::test_compact_large_list_utf8view' (76301063) panicked
at datafusion/common/src/scalar/mod.rs:10768:9:
assertion `left == right` failed
left: 1300
right: 26
---- scalar::tests::test_compact_map_utf8view stdout ----
thread 'scalar::tests::test_compact_map_utf8view' (76301067) panicked at
datafusion/common/src/scalar/mod.rs:10904:9:
assertion `left == right` failed
left: 1300
right: 26
---- scalar::tests::test_compact_fixed_size_list_utf8view stdout ----
thread 'scalar::tests::test_compact_fixed_size_list_utf8view' (76301062)
panicked at datafusion/common/src/scalar/mod.rs:10791:9:
assertion `left == right` failed
left: 1300
right: 26
failures:
scalar::tests::test_compact_fixed_size_list_utf8view
scalar::tests::test_compact_large_list_utf8view
scalar::tests::test_compact_large_list_view_utf8view
scalar::tests::test_compact_list_utf8view
scalar::tests::test_compact_list_view_utf8view
scalar::tests::test_compact_map_utf8view
scalar::tests::test_compact_struct_utf8view
test result: FAILED. 0 passed; 7 failed; 0 ignored; 0 measured; 450 filtered
out; finished in 0.00s
error: test failed, to rerun pass `-p datafusion-common --lib`
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]