This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 6de3881152 Fix RowConverter when FixedSizeList is not the last (#7789)
6de3881152 is described below
commit 6de38811526b960f79ce580cfee817c7a4bbfbc6
Author: Piotr Findeisen <[email protected]>
AuthorDate: Sat Jul 5 13:32:55 2025 +0200
Fix RowConverter when FixedSizeList is not the last (#7789)
# Which issue does this PR close?
none
# Rationale for this change
`RowConverter` decoding fails when there is a `FixedSizeList` element
and it's not the last.
# What changes are included in this PR?
Fix `RowConverter` row decoding when there is a `FixedSizeList` element
and it's not the last.
# Are these changes tested?
yes
---
arrow-row/src/lib.rs | 200 +++++++++++++++++++++++++++++++++++++++++++++++++-
arrow-row/src/list.rs | 10 +--
2 files changed, 202 insertions(+), 8 deletions(-)
diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs
index ee1c117859..d76c51578c 100644
--- a/arrow-row/src/lib.rs
+++ b/arrow-row/src/lib.rs
@@ -690,6 +690,15 @@ impl RowConverter {
columns.len()
)));
}
+ for colum in columns.iter().skip(1) {
+ if colum.len() != columns[0].len() {
+ return Err(ArrowError::InvalidArgumentError(format!(
"RowConverter columns must all have the same length, expected {} got {}",
+ columns[0].len(),
+ colum.len()
+ )));
+ }
+ }
let encoders = columns
.iter()
@@ -758,7 +767,20 @@ impl RowConverter {
// SAFETY
// We have validated that the rows came from this [`RowConverter`]
// and therefore must be valid
- unsafe { self.convert_raw(&mut rows, validate_utf8) }
+ let result = unsafe { self.convert_raw(&mut rows, validate_utf8) }?;
+
+ if cfg!(test) {
+ for (i, row) in rows.iter().enumerate() {
+ if !row.is_empty() {
+ return Err(ArrowError::InvalidArgumentError(format!(
"Codecs {codecs:?} did not consume all bytes for row {i}, remaining bytes: {row:?}",
+ codecs = &self.codecs
+ )));
+ }
+ }
+ }
+
+ Ok(result)
}
/// Returns an empty [`Rows`] with capacity for `row_capacity` rows with
@@ -2549,6 +2571,182 @@ mod tests {
assert_eq!(&back[0], &list);
}
+ #[test]
+ fn test_two_fixed_size_lists() {
+ let mut first = FixedSizeListBuilder::new(UInt8Builder::new(), 1);
+ // 0: [100]
+ first.values().append_value(100);
+ first.append(true);
+ // 1: [101]
+ first.values().append_value(101);
+ first.append(true);
+ // 2: [102]
+ first.values().append_value(102);
+ first.append(true);
+ // 3: [null]
+ first.values().append_null();
+ first.append(true);
+ // 4: null
+ first.values().append_null(); // MASKED
+ first.append(false);
+ let first = Arc::new(first.finish()) as ArrayRef;
+ let first_type = first.data_type().clone();
+
+ let mut second = FixedSizeListBuilder::new(UInt8Builder::new(), 1);
+ // 0: [200]
+ second.values().append_value(200);
+ second.append(true);
+ // 1: [201]
+ second.values().append_value(201);
+ second.append(true);
+ // 2: [202]
+ second.values().append_value(202);
+ second.append(true);
+ // 3: [null]
+ second.values().append_null();
+ second.append(true);
+ // 4: null
+ second.values().append_null(); // MASKED
+ second.append(false);
+ let second = Arc::new(second.finish()) as ArrayRef;
+ let second_type = second.data_type().clone();
+
+ let converter = RowConverter::new(vec![
+ SortField::new(first_type.clone()),
+ SortField::new(second_type.clone()),
+ ])
+ .unwrap();
+
+ let rows = converter
+ .convert_columns(&[Arc::clone(&first), Arc::clone(&second)])
+ .unwrap();
+
+ let back = converter.convert_rows(&rows).unwrap();
+ assert_eq!(back.len(), 2);
+ back[0].to_data().validate_full().unwrap();
+ assert_eq!(&back[0], &first);
+ back[1].to_data().validate_full().unwrap();
+ assert_eq!(&back[1], &second);
+ }
+
+ #[test]
+ fn test_fixed_size_list_with_variable_width_content() {
+ let mut first = FixedSizeListBuilder::new(
+ StructBuilder::from_fields(
+ vec![
+ Field::new(
+ "timestamp",
DataType::Timestamp(TimeUnit::Microsecond, Some(Arc::from("UTC"))),
+ false,
+ ),
+ Field::new("offset_minutes", DataType::Int16, false),
+ Field::new("time_zone", DataType::Utf8, false),
+ ],
+ 1,
+ ),
+ 1,
+ );
+ // 0: null
+ first
+ .values()
+ .field_builder::<TimestampMicrosecondBuilder>(0)
+ .unwrap()
+ .append_null();
+ first
+ .values()
+ .field_builder::<Int16Builder>(1)
+ .unwrap()
+ .append_null();
+ first
+ .values()
+ .field_builder::<StringBuilder>(2)
+ .unwrap()
+ .append_null();
+ first.values().append(false);
+ first.append(false);
+ // 1: [null]
+ first
+ .values()
+ .field_builder::<TimestampMicrosecondBuilder>(0)
+ .unwrap()
+ .append_null();
+ first
+ .values()
+ .field_builder::<Int16Builder>(1)
+ .unwrap()
+ .append_null();
+ first
+ .values()
+ .field_builder::<StringBuilder>(2)
+ .unwrap()
+ .append_null();
+ first.values().append(false);
+ first.append(true);
+ // 2: [1970-01-01 00:00:00.000000 UTC]
+ first
+ .values()
+ .field_builder::<TimestampMicrosecondBuilder>(0)
+ .unwrap()
+ .append_value(0);
+ first
+ .values()
+ .field_builder::<Int16Builder>(1)
+ .unwrap()
+ .append_value(0);
+ first
+ .values()
+ .field_builder::<StringBuilder>(2)
+ .unwrap()
+ .append_value("UTC");
+ first.values().append(true);
+ first.append(true);
+ // 3: [2005-09-10 13:30:00.123456 Europe/Warsaw]
+ first
+ .values()
+ .field_builder::<TimestampMicrosecondBuilder>(0)
+ .unwrap()
+ .append_value(1126351800123456);
+ first
+ .values()
+ .field_builder::<Int16Builder>(1)
+ .unwrap()
+ .append_value(120);
+ first
+ .values()
+ .field_builder::<StringBuilder>(2)
+ .unwrap()
+ .append_value("Europe/Warsaw");
+ first.values().append(true);
+ first.append(true);
+ let first = Arc::new(first.finish()) as ArrayRef;
+ let first_type = first.data_type().clone();
+
+ let mut second = StringBuilder::new();
+ second.append_value("somewhere near");
+ second.append_null();
+ second.append_value("Greenwich");
+ second.append_value("Warsaw");
+ let second = Arc::new(second.finish()) as ArrayRef;
+ let second_type = second.data_type().clone();
+
+ let converter = RowConverter::new(vec![
+ SortField::new(first_type.clone()),
+ SortField::new(second_type.clone()),
+ ])
+ .unwrap();
+
+ let rows = converter
+ .convert_columns(&[Arc::clone(&first), Arc::clone(&second)])
+ .unwrap();
+
+ let back = converter.convert_rows(&rows).unwrap();
+ assert_eq!(back.len(), 2);
+ back[0].to_data().validate_full().unwrap();
+ assert_eq!(&back[0], &first);
+ back[1].to_data().validate_full().unwrap();
+ assert_eq!(&back[1], &second);
+ }
+
fn generate_primitive_array<K>(len: usize, valid_percent: f64) -> PrimitiveArray<K>
where
K: ArrowPrimitiveType,
diff --git a/arrow-row/src/list.rs b/arrow-row/src/list.rs
index 58fbc71caa..e9dc38e0fb 100644
--- a/arrow-row/src/list.rs
+++ b/arrow-row/src/list.rs
@@ -225,7 +225,6 @@ pub fn encode_fixed_size_list(
data[*offset] = 0x01;
*offset += 1;
for child_idx in (idx * value_length)..(idx + 1) * value_length {
- //dbg!(child_idx);
let row = rows.row(child_idx);
let end_offset = *offset + row.as_ref().len();
data[*offset..end_offset].copy_from_slice(row.as_ref());
@@ -233,12 +232,8 @@ pub fn encode_fixed_size_list(
}
}
false => {
- let null_sentinels = 1;
- //+ value_length; // 1 for self + for values too
- for i in 0..null_sentinels {
- data[*offset + i] = null_sentinel;
- }
- *offset += null_sentinels;
+ data[*offset] = null_sentinel;
+ *offset += 1;
}
};
})
@@ -291,6 +286,7 @@ pub unsafe fn decode_fixed_size_list(
row_offset = next_offset;
}
}
+ *row = &row[row_offset..]; // Update row for the next decoder
}
let children = converter.convert_raw(&mut child_rows, validate_utf8)?;