codephage2020 commented on code in PR #9598:
URL: https://github.com/apache/arrow-rs/pull/9598#discussion_r3025630995
##########
parquet-variant-compute/src/variant_get.rs:
##########
@@ -223,26 +224,61 @@ fn shredded_get_path(
// For shredded/partially-shredded targets (`typed_value` present),
recurse into each field
// separately to take advantage of deeper shredding in child fields.
if let DataType::Struct(fields) = as_field.data_type() {
- if target.typed_value_field().is_none() {
+ let has_variant_fields = fields
+ .iter()
+ .any(|f| f.try_extension_type::<VariantType>().is_ok());
+ if target.typed_value_field().is_none() && !has_variant_fields {
return shred_basic_variant(target, VariantPath::default(),
Some(as_field));
}
- let children = fields
- .iter()
- .map(|field| {
- shredded_get_path(
- &target,
- &[VariantPathElement::from(field.name().as_str())],
- Some(field),
- cast_options,
- )
- })
- .collect::<Result<Vec<_>>>()?;
+ let mut updated_fields = Vec::with_capacity(fields.len());
+ let mut children = Vec::with_capacity(fields.len());
+ for field in fields.iter() {
+ // If the field has VariantType extension metadata, extract it as a
+ // VariantArray instead of casting to the declared data type. This
allows
+ // callers to request structs where some fields remain as variants.
+ // See test_struct_extraction_with_variant_fields for usage
example.
+ let is_variant_field =
field.try_extension_type::<VariantType>().is_ok();
+ let field_as_type = (!is_variant_field).then(|| field.as_ref());
+ let child = shredded_get_path(
+ &target,
+ &[VariantPathElement::from(field.name().as_str())],
+ field_as_type,
+ cast_options,
+ )?;
+
+ // When the field is entirely absent in the data, shredded_get_path
+ // returns a NullArray. For variant fields, construct an all-null
+ // VariantArray so the extension metadata is preserved.
+ let child = if is_variant_field && child.data_type() ==
&DataType::Null {
+ let mut builder = VariantArrayBuilder::new(child.len());
+ for _ in 0..child.len() {
+ builder.append_null();
+ }
+ let null_variant = builder.build();
+ Arc::new(null_variant.into_inner()) as ArrayRef
+ } else {
+ child
+ };
+
+ // Update field data type to match the actual child array.
+ // Preserve VariantType extension metadata for variant fields so
+ // downstream consumers can recognize them as Variant columns.
+ let mut new_field = field
+ .as_ref()
+ .clone()
+ .with_data_type(child.data_type().clone());
Review Comment:
Good question. VariantType does NOT leak to intermediate fields. Here's why:
The `is_variant_field` check is driven by the **user's requested schema**,
not by the
data's shredding structure. When a user marks field "metadata" with
VariantType, we pass
`as_type = None` to `shredded_get_path`, which returns a VariantArray
directly without
recursing into the struct branch. So the recursion stops at that level —
inner fields
of the VariantArray's internal structure never go through the VariantType
annotation path.
For case 1 (strongly typed struct), I've also addressed your observation
that `with_data_type`
is redundant — strongly typed fields now use the original field directly,
since the child
data type already matches the requested type.
To make this clearer, I split the handling into two explicit branches:
if is_variant_field {
// Update data type + attach VariantType extension
// (or build all-null VariantArray when field is absent)
} else {
// Strongly typed: use original field as-is
}
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]