This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 2b179b805f feat(parquet): relax type compatibility check in parquet ArrowWriter (#9099)
2b179b805f is described below
commit 2b179b805f62e927771f764f8ae92151249e9edd
Author: Marko Grujic <[email protected]>
AuthorDate: Wed Jan 7 16:33:04 2026 +0100
feat(parquet): relax type compatibility check in parquet ArrowWriter (#9099)
# Which issue does this PR close?
- Closes #9098.
# Rationale for this change
Don't require strict equality for nested fields (including the inner field name/metadata); instead, require only that the nested data types are logically equivalent.
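For illustration only (not part of this patch), here is a minimal sketch of the distinction, assuming the `arrow_schema` crate: `==` on `DataType` also compares nested field names and metadata, while `equals_datatype` compares only the logical types.

```rust
// Sketch: strict equality vs. logical equivalence for nested types.
use std::collections::HashMap;
use std::sync::Arc;
use arrow_schema::{DataType, Field};

fn main() {
    // Two list types whose inner fields differ only in name and metadata.
    let a = DataType::List(Arc::new(Field::new("item", DataType::Int32, false)));
    let b = DataType::List(Arc::new(
        Field::new("element", DataType::Int32, false).with_metadata(HashMap::from([(
            "PARQUET:field_id".to_string(),
            "1".to_string(),
        )])),
    ));

    assert_ne!(a, b); // strict `==` sees the differing field name/metadata
    assert!(a.equals_datatype(&b)); // the logical types still match
}
```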
# What changes are included in this PR?
Use `a.equals_datatype(b)` instead of `a == b` at the start of
`LevelInfoBuilder::types_compatible`.
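As a hedged end-to-end sketch of what this enables (the in-memory sink and helper closures below are my own, not from the patch): the same `ArrowWriter` should now accept batches whose inner list fields differ only in metadata.

```rust
// Sketch: batches differing only in inner-field metadata share one writer.
use std::collections::HashMap;
use std::sync::Arc;
use arrow_array::{ArrayRef, Int32Array, ListArray, RecordBatch};
use arrow_buffer::OffsetBuffer;
use arrow_schema::{DataType, Field, Schema};
use parquet::arrow::ArrowWriter;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let plain = Field::new_list_field(DataType::Int32, false);
    let tagged = plain.clone().with_metadata(HashMap::from([(
        "PARQUET:field_id".to_string(), // value of PARQUET_FIELD_ID_META_KEY
        "1".to_string(),
    )]));

    // Build a one-column schema and batch around a given inner list field.
    let schema_for = |item: Field| {
        Arc::new(Schema::new(vec![Field::new(
            "l",
            DataType::List(Arc::new(item)),
            false,
        )]))
    };
    let batch_for = |item: Field| -> Result<RecordBatch, Box<dyn std::error::Error>> {
        let list = ListArray::try_new(
            Arc::new(item.clone()),
            OffsetBuffer::new(vec![0, 2].into()),
            Arc::new(Int32Array::from(vec![1, 2])),
            None,
        )?;
        Ok(RecordBatch::try_new(
            schema_for(item),
            vec![Arc::new(list) as ArrayRef],
        )?)
    };

    // Writer schema carries the field-id metadata on the inner list field.
    let mut writer = ArrowWriter::try_new(Vec::<u8>::new(), schema_for(tagged.clone()), None)?;
    writer.write(&batch_for(tagged)?)?;
    // Previously rejected as incompatible; accepted now that only the
    // logical types are compared.
    writer.write(&batch_for(plain)?)?;
    writer.close()?;
    Ok(())
}
```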
# Are these changes tested?
Yes.
# Are there any user-facing changes?
---
parquet/src/arrow/arrow_writer/levels.rs | 4 +--
parquet/src/arrow/arrow_writer/mod.rs | 51 ++++++++++++++++++++++++++++++--
2 files changed, 51 insertions(+), 4 deletions(-)
diff --git a/parquet/src/arrow/arrow_writer/levels.rs b/parquet/src/arrow/arrow_writer/levels.rs
index 3c283bcbe3..59bf6c6024 100644
--- a/parquet/src/arrow/arrow_writer/levels.rs
+++ b/parquet/src/arrow/arrow_writer/levels.rs
@@ -550,8 +550,8 @@ impl LevelInfoBuilder {
/// and the other is a native array, the dictionary values must have the same type as the
/// native array
fn types_compatible(a: &DataType, b: &DataType) -> bool {
- // if the Arrow data types are the same, the types are clearly compatible
- if a == b {
+ // if the Arrow data types are equal, the types are deemed compatible
+ if a.equals_datatype(b) {
return true;
}
diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs
index 3e3c9108d5..6b1566a681 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -1522,11 +1522,12 @@ fn get_fsb_array_slice(
#[cfg(test)]
mod tests {
use super::*;
+ use std::collections::HashMap;
use std::fs::File;
- use crate::arrow::ARROW_SCHEMA_META_KEY;
use crate::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder};
+ use crate::arrow::{ARROW_SCHEMA_META_KEY, PARQUET_FIELD_ID_META_KEY};
use crate::column::page::{Page, PageReader};
use crate::file::metadata::thrift::PageHeader;
use crate::file::page_index::column_index::ColumnIndexMetaData;
@@ -1539,7 +1540,7 @@ mod tests {
use arrow::util::data_gen::create_random_array;
use arrow::util::pretty::pretty_format_batches;
use arrow::{array::*, buffer::Buffer};
- use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano, NullBuffer, i256};
+ use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano, NullBuffer, OffsetBuffer, i256};
use arrow_schema::Fields;
use half::f16;
use num_traits::{FromPrimitive, ToPrimitive};
@@ -3323,6 +3324,52 @@ mod tests {
BinaryViewArray::from_iter_values(vec![b"barquet"]),
LargeBinaryArray::from_iter_values(vec![b"parquet", b"barquet"]),
);
+
+ // check compatibility for list types
+
+ let list_field_metadata = HashMap::from_iter(vec![(
+ PARQUET_FIELD_ID_META_KEY.to_string(),
+ "1".to_string(),
+ )]);
+ let list_field = Field::new_list_field(DataType::Int32, false);
+
+ let values1 = Arc::new(Int32Array::from(vec![0, 1, 2, 3, 4]));
+ let offsets1 = OffsetBuffer::new(vec![0, 2, 5].into());
+
+ let values2 = Arc::new(Int32Array::from(vec![5, 6, 7, 8, 9]));
+ let offsets2 = OffsetBuffer::new(vec![0, 3, 5].into());
+
+ let values_expected = Arc::new(Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]));
+ let offsets_expected = OffsetBuffer::new(vec![0, 2, 5, 8, 10].into());
+
+ ensure_compatible_write(
+ // when the initial schema has the metadata ...
+ ListArray::try_new(
+ Arc::new(
+ list_field
+ .clone()
+ .with_metadata(list_field_metadata.clone()),
+ ),
+ offsets1,
+ values1,
+ None,
+ )
+ .unwrap(),
+ // ... and some intermediate schema doesn't have the metadata
+ ListArray::try_new(Arc::new(list_field.clone()), offsets2, values2, None).unwrap(),
+ // ... the write will still go through, and the resulting schema will inherit the initial metadata
+ ListArray::try_new(
+ Arc::new(
+ list_field
+ .clone()
+ .with_metadata(list_field_metadata.clone()),
+ ),
+ offsets_expected,
+ values_expected,
+ None,
+ )
+ .unwrap(),
+ );
}
#[test]