This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 56cdfa75a4 [Variant][Shredding] Support typed_access for
timestamp_micro/timestamp_nano (#8401)
56cdfa75a4 is described below
commit 56cdfa75a4db59637f9f4431f026642af05fc938
Author: Congxian Qiu <[email protected]>
AuthorDate: Fri Sep 26 18:54:58 2025 +0800
[Variant][Shredding] Support typed_access for
timestamp_micro/timestamp_nano (#8401)
# Which issue does this PR close?
- Closes #8331.
# Rationale for this change
- Add typed_access for `Timestamp(Micro, _)` and `Timestamp(Nano, _)`
# What changes are included in this PR?
- Extract some data generation logic in tests to simplify the test logic
(commit 93090d56717a6804e4862c23a0f85030b9f6406d). That commit was based on
older code (before #8392), so the last commit rebases onto master
- Add typed_access for `Timestamp(Micro, _)` and `Timestamp(Nano, _)`
- Add tests for typed_access for `Timestamp(Micro, _)` and
`Timestamp(Nano, _)`
# Are these changes tested?
Covered by existing and added tests
# Are there any user-facing changes?
No
---
parquet-variant-compute/src/variant_array.rs | 40 ++-
parquet-variant-compute/src/variant_get.rs | 417 +++++++++++++++------------
parquet/tests/variant_integration.rs | 18 +-
3 files changed, 284 insertions(+), 191 deletions(-)
diff --git a/parquet-variant-compute/src/variant_array.rs
b/parquet-variant-compute/src/variant_array.rs
index 16dbff4c34..bf24eb6266 100644
--- a/parquet-variant-compute/src/variant_array.rs
+++ b/parquet-variant-compute/src/variant_array.rs
@@ -17,15 +17,17 @@
//! [`VariantArray`] implementation
-use crate::type_conversion::primitive_conversion_single_value;
+use crate::type_conversion::{generic_conversion_single_value,
primitive_conversion_single_value};
use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray};
use arrow::buffer::NullBuffer;
use arrow::compute::cast;
use arrow::datatypes::{
Date32Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type,
Int64Type, Int8Type,
+ TimestampMicrosecondType, TimestampNanosecondType,
};
use arrow_schema::extension::ExtensionType;
use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields, TimeUnit};
+use chrono::DateTime;
use parquet_variant::Uuid;
use parquet_variant::Variant;
@@ -837,6 +839,42 @@ fn typed_value_to_variant<'a>(
DataType::Float64 => {
primitive_conversion_single_value!(Float64Type, typed_value, index)
}
+ DataType::Timestamp(TimeUnit::Microsecond, Some(_)) => {
+ generic_conversion_single_value!(
+ TimestampMicrosecondType,
+ as_primitive,
+ |v| DateTime::from_timestamp_micros(v).unwrap(),
+ typed_value,
+ index
+ )
+ }
+ DataType::Timestamp(TimeUnit::Microsecond, None) => {
+ generic_conversion_single_value!(
+ TimestampMicrosecondType,
+ as_primitive,
+ |v| DateTime::from_timestamp_micros(v).unwrap().naive_utc(),
+ typed_value,
+ index
+ )
+ }
+ DataType::Timestamp(TimeUnit::Nanosecond, Some(_)) => {
+ generic_conversion_single_value!(
+ TimestampNanosecondType,
+ as_primitive,
+ DateTime::from_timestamp_nanos,
+ typed_value,
+ index
+ )
+ }
+ DataType::Timestamp(TimeUnit::Nanosecond, None) => {
+ generic_conversion_single_value!(
+ TimestampNanosecondType,
+ as_primitive,
+ |v| DateTime::from_timestamp_nanos(v).naive_utc(),
+ typed_value,
+ index
+ )
+ }
// todo other types here (note this is very similar to
cast_to_variant.rs)
// so it would be great to figure out how to share this code
_ => {
diff --git a/parquet-variant-compute/src/variant_get.rs
b/parquet-variant-compute/src/variant_get.rs
index 49f56af573..a923732ca4 100644
--- a/parquet-variant-compute/src/variant_get.rs
+++ b/parquet-variant-compute/src/variant_get.rs
@@ -34,7 +34,7 @@ pub(crate) enum ShreddedPathStep {
/// Path step succeeded, return the new shredding state
Success(ShreddingState),
/// The path element is not present in the `typed_value` column and there
is no `value` column,
- /// so we we know it does not exist. It, and all paths under it, are
all-NULL.
+ /// so we know it does not exist. It, and all paths under it, are all-NULL.
Missing,
/// The path element is not present in the `typed_value` column and must
be retrieved from the `value`
/// column instead. The caller should be prepared to handle any value,
including the requested
@@ -296,22 +296,21 @@ impl<'a> GetOptions<'a> {
mod test {
use std::sync::Arc;
+ use super::{variant_get, GetOptions};
+ use crate::json_to_variant;
+ use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder};
+ use crate::VariantArray;
use arrow::array::{
- Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array,
Float32Array,
- Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
StringArray, StructArray,
+ Array, ArrayRef, AsArray, BinaryViewArray, Date32Array, Float32Array,
Float64Array,
+ Int16Array, Int32Array, Int64Array, Int8Array, StringArray,
StructArray,
};
use arrow::buffer::NullBuffer;
use arrow::compute::CastOptions;
use arrow::datatypes::DataType::{Int16, Int32, Int64};
use arrow_schema::{DataType, Field, FieldRef, Fields};
+ use chrono::DateTime;
use parquet_variant::{Variant, VariantPath, EMPTY_VARIANT_METADATA_BYTES};
- use crate::json_to_variant;
- use crate::variant_array::{ShreddedVariantFieldArray, StructArrayBuilder};
- use crate::VariantArray;
-
- use super::{variant_get, GetOptions};
-
fn single_variant_get_test(input_json: &str, path: VariantPath,
expected_json: &str) {
// Create input array from JSON string
let input_array_ref: ArrayRef =
Arc::new(StringArray::from(vec![Some(input_json)]));
@@ -417,6 +416,49 @@ mod test {
};
}
+ macro_rules! partially_shredded_variant_array_gen {
+ ($func_name:ident, $typed_value_array_gen: expr) => {
+ fn $func_name() -> ArrayRef {
+ let (metadata, string_value) = {
+ let mut builder = parquet_variant::VariantBuilder::new();
+ builder.append_value("n/a");
+ builder.finish()
+ };
+
+ let nulls = NullBuffer::from(vec![
+ true, // row 0 non null
+ false, // row 1 is null
+ true, // row 2 non null
+ true, // row 3 non null
+ ]);
+
+ // metadata is the same for all rows
+ let metadata =
BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4));
+
+ // See
https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY
+ // about why row1 is an empty but non null, value.
+ let values = BinaryViewArray::from(vec![
+ None, // row 0 is shredded, so no value
+ Some(b"" as &[u8]), // row 1 is null, so empty value
(why?)
+ Some(&string_value), // copy the string value "N/A"
+ None, // row 3 is shredded, so no value
+ ]);
+
+ let typed_value = $typed_value_array_gen();
+
+ let struct_array = StructArrayBuilder::new()
+ .with_field("metadata", Arc::new(metadata), false)
+ .with_field("typed_value", Arc::new(typed_value), true)
+ .with_field("value", Arc::new(values), true)
+ .with_nulls(nulls)
+ .build();
+ ArrayRef::from(
+ VariantArray::try_new(&struct_array).expect("should create
variant array"),
+ )
+ }
+ };
+ }
+
#[test]
fn get_variant_partially_shredded_int8_as_variant() {
numeric_partially_shredded_test!(i8,
partially_shredded_int8_variant_array);
@@ -481,6 +523,15 @@ mod test {
assert_eq!(result.value(3), Variant::from("world"));
}
+
partially_shredded_variant_array_gen!(partially_shredded_binary_view_variant_array,
|| {
+ BinaryViewArray::from(vec![
+ Some(&[1u8, 2u8, 3u8][..]), // row 0 is shredded
+ None, // row 1 is null
+ None, // row 2 is a string
+ Some(&[4u8, 5u8, 6u8][..]), // row 3 is shredded
+ ])
+ });
+
#[test]
fn get_variant_partially_shredded_date32_as_variant() {
let array = partially_shredded_date32_variant_array();
@@ -750,6 +801,156 @@ mod test {
f64
);
+ macro_rules! assert_variant_get_as_variant_array_with_default_option {
+ ($variant_array: expr, $array_expected: expr) => {{
+ let options = GetOptions::new();
+ let array = $variant_array;
+ let result = variant_get(&array, options).unwrap();
+
+ // expect the result is a VariantArray
+ let result = VariantArray::try_new(&result).unwrap();
+
+ assert_eq!(result.len(), $array_expected.len());
+
+ for (idx, item) in $array_expected.into_iter().enumerate() {
+ match item {
+ Some(item) => assert_eq!(result.value(idx), item),
+ None => assert!(result.is_null(idx)),
+ }
+ }
+ }};
+ }
+
+ partially_shredded_variant_array_gen!(
+ partially_shredded_timestamp_micro_ntz_variant_array,
+ || {
+ arrow::array::TimestampMicrosecondArray::from(vec![
+ Some(-456000),
+ None,
+ None,
+ Some(1758602096000000),
+ ])
+ }
+ );
+
+ #[test]
+ fn get_variant_partial_shredded_timestamp_micro_ntz_as_variant() {
+ let array = partially_shredded_timestamp_micro_ntz_variant_array();
+ assert_variant_get_as_variant_array_with_default_option!(
+ array,
+ vec![
+ Some(Variant::from(
+ DateTime::from_timestamp_micros(-456000i64)
+ .unwrap()
+ .naive_utc(),
+ )),
+ None,
+ Some(Variant::from("n/a")),
+ Some(Variant::from(
+ DateTime::parse_from_rfc3339("2025-09-23T12:34:56+08:00")
+ .unwrap()
+ .naive_utc(),
+ )),
+ ]
+ )
+ }
+
+
partially_shredded_variant_array_gen!(partially_shredded_timestamp_micro_variant_array,
|| {
+ arrow::array::TimestampMicrosecondArray::from(vec![
+ Some(-456000),
+ None,
+ None,
+ Some(1758602096000000),
+ ])
+ .with_timezone("+00:00")
+ });
+
+ #[test]
+ fn get_variant_partial_shredded_timestamp_micro_as_variant() {
+ let array = partially_shredded_timestamp_micro_variant_array();
+ assert_variant_get_as_variant_array_with_default_option!(
+ array,
+ vec![
+ Some(Variant::from(
+ DateTime::from_timestamp_micros(-456000i64)
+ .unwrap()
+ .to_utc(),
+ )),
+ None,
+ Some(Variant::from("n/a")),
+ Some(Variant::from(
+ DateTime::parse_from_rfc3339("2025-09-23T12:34:56+08:00")
+ .unwrap()
+ .to_utc(),
+ )),
+ ]
+ )
+ }
+
+ partially_shredded_variant_array_gen!(
+ partially_shredded_timestamp_nano_ntz_variant_array,
+ || {
+ arrow::array::TimestampNanosecondArray::from(vec![
+ Some(-4999999561),
+ None,
+ None,
+ Some(1758602096000000000),
+ ])
+ }
+ );
+
+ #[test]
+ fn get_variant_partial_shredded_timestamp_nano_ntz_as_variant() {
+ let array = partially_shredded_timestamp_nano_ntz_variant_array();
+
+ assert_variant_get_as_variant_array_with_default_option!(
+ array,
+ vec![
+ Some(Variant::from(
+ DateTime::from_timestamp(-5, 439).unwrap().naive_utc()
+ )),
+ None,
+ Some(Variant::from("n/a")),
+ Some(Variant::from(
+ DateTime::parse_from_rfc3339("2025-09-23T12:34:56+08:00")
+ .unwrap()
+ .naive_utc()
+ )),
+ ]
+ )
+ }
+
+
partially_shredded_variant_array_gen!(partially_shredded_timestamp_nano_variant_array,
|| {
+ arrow::array::TimestampNanosecondArray::from(vec![
+ Some(-4999999561),
+ None,
+ None,
+ Some(1758602096000000000),
+ ])
+ .with_timezone("+00:00")
+ });
+
+ #[test]
+ fn get_variant_partial_shredded_timestamp_nano_as_variant() {
+ let array = partially_shredded_timestamp_nano_variant_array();
+
+ assert_variant_get_as_variant_array_with_default_option!(
+ array,
+ vec![
+ Some(Variant::from(
+ DateTime::from_timestamp(-5, 439).unwrap().to_utc()
+ )),
+ None,
+ Some(Variant::from("n/a")),
+ Some(Variant::from(
+ DateTime::parse_from_rfc3339("2025-09-23T12:34:56+08:00")
+ .unwrap()
+ .to_utc()
+ )),
+ ]
+ )
+ }
+
/// Return a VariantArray that represents a normal "shredded" variant
/// for the following example
///
@@ -775,6 +976,17 @@ mod test {
/// ```
macro_rules! numeric_partially_shredded_variant_array_fn {
($func:ident, $array_type:ident, $primitive_type:ty) => {
+ partially_shredded_variant_array_gen!($func, ||
$array_type::from(vec![
+ Some(<$primitive_type>::try_from(34u8).unwrap()), // row 0 is
shredded, so it has a value
+ None, // row 1 is
null, so no value
+ None, // row 2 is a string, so no typed value
+ Some(<$primitive_type>::try_from(100u8).unwrap()), // row 3 is
shredded, so it has a value
+ ]));
+ };
+ }
+
+ macro_rules! partially_shredded_variant_array_gen {
+ ($func:ident, $typed_array_gen: expr) => {
fn $func() -> ArrayRef {
// At the time of writing, the `VariantArrayBuilder` does not
support shredding.
// so we must construct the array manually. see
https://github.com/apache/arrow-rs/issues/7895
@@ -803,12 +1015,7 @@ mod test {
None, // row 3 is shredded, so no value
]);
- let typed_value = $array_type::from(vec![
- Some(<$primitive_type>::try_from(34u8).unwrap()), // row 0
is shredded, so it has a value
- None, // row 1
is null, so no value
- None, // row 2 is a string, so no typed value
- Some(<$primitive_type>::try_from(100u8).unwrap()), // row
3 is shredded, so it has a value
- ]);
+ let typed_value = $typed_array_gen();
let struct_array = StructArrayBuilder::new()
.with_field("metadata", Arc::new(metadata), false)
@@ -817,7 +1024,9 @@ mod test {
.with_nulls(nulls)
.build();
- Arc::new(struct_array)
+ ArrayRef::from(
+ VariantArray::try_new(&struct_array).expect("should create
variant array"),
+ )
}
};
}
@@ -853,184 +1062,32 @@ mod test {
f64
);
- /// Return a VariantArray that represents a partially "shredded" variant
for bool
- fn partially_shredded_bool_variant_array() -> ArrayRef {
- let (metadata, string_value) = {
- let mut builder = parquet_variant::VariantBuilder::new();
- builder.append_value("n/a");
- builder.finish()
- };
-
- let nulls = NullBuffer::from(vec![
- true, // row 0 non null
- false, // row 1 is null
- true, // row 2 non null
- true, // row 3 non null
- ]);
-
- // metadata is the same for all rows
- let metadata =
BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4));
-
- // See
https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY
- // about why row1 is an empty but non null, value.
- let values = BinaryViewArray::from(vec![
- None, // row 0 is shredded, so no value
- Some(b"" as &[u8]), // row 1 is null, so empty value (why?)
- Some(&string_value), // copy the string value "N/A"
- None, // row 3 is shredded, so no value
- ]);
-
- let typed_value = BooleanArray::from(vec![
+
partially_shredded_variant_array_gen!(partially_shredded_bool_variant_array, ||
{
+ arrow::array::BooleanArray::from(vec![
Some(true), // row 0 is shredded, so it has a value
None, // row 1 is null, so no value
None, // row 2 is a string, so no typed value
Some(false), // row 3 is shredded, so it has a value
- ]);
-
- let struct_array = StructArrayBuilder::new()
- .with_field("metadata", Arc::new(metadata), false)
- .with_field("typed_value", Arc::new(typed_value), true)
- .with_field("value", Arc::new(values), true)
- .with_nulls(nulls)
- .build();
+ ])
+ });
- Arc::new(struct_array)
- }
-
- /// Return a VariantArray that represents a partially "shredded" variant
for UTF8
- fn partially_shredded_utf8_variant_array() -> ArrayRef {
- let (metadata, string_value) = {
- let mut builder = parquet_variant::VariantBuilder::new();
- builder.append_value("n/a");
- builder.finish()
- };
-
- // Create the null buffer for the overall array
- let nulls = NullBuffer::from(vec![
- true, // row 0 non null
- false, // row 1 is null
- true, // row 2 non null
- true, // row 3 non null
- ]);
-
- // metadata is the same for all rows
- let metadata =
BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4));
-
- // See
https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY
- // about why row1 is an empty but non null, value.
- let values = BinaryViewArray::from(vec![
- None, // row 0 is shredded, so no value
- Some(b"" as &[u8]), // row 1 is null, so empty value
- Some(&string_value), // copy the string value "N/A"
- None, // row 3 is shredded, so no value
- ]);
-
- let typed_value = StringArray::from(vec![
+
partially_shredded_variant_array_gen!(partially_shredded_utf8_variant_array, ||
{
+ StringArray::from(vec![
Some("hello"), // row 0 is shredded
None, // row 1 is null
None, // row 2 is a string
Some("world"), // row 3 is shredded
- ]);
-
- let struct_array = StructArrayBuilder::new()
- .with_field("metadata", Arc::new(metadata), false)
- .with_field("typed_value", Arc::new(typed_value), true)
- .with_field("value", Arc::new(values), true)
- .with_nulls(nulls)
- .build();
-
- Arc::new(struct_array)
- }
+ ])
+ });
- /// Return a VariantArray that represents a partially "shredded" variant
for Date32
- fn partially_shredded_date32_variant_array() -> ArrayRef {
- let (metadata, string_value) = {
- let mut builder = parquet_variant::VariantBuilder::new();
- builder.append_value("n/a");
- builder.finish()
- };
-
- // Create the null buffer for the overall array
- let nulls = NullBuffer::from(vec![
- true, // row 0 non null
- false, // row 1 is null
- true, // row 2 non null
- true, // row 3 non null
- ]);
-
- // metadata is the same for all rows
- let metadata =
BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4));
-
- // See
https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY
- // about why row1 is an empty but non null, value.
- let values = BinaryViewArray::from(vec![
- None, // row 0 is shredded, so no value
- Some(b"" as &[u8]), // row 1 is null, so empty value
- Some(&string_value), // copy the string value "N/A"
- None, // row 3 is shredded, so no value
- ]);
-
- let typed_value = Date32Array::from(vec![
+
partially_shredded_variant_array_gen!(partially_shredded_date32_variant_array,
|| {
+ Date32Array::from(vec![
Some(20348), // row 0 is shredded, 2025-09-17
None, // row 1 is null
None, // row 2 is a string, not a date
Some(20340), // row 3 is shredded, 2025-09-09
- ]);
-
- let struct_array = StructArrayBuilder::new()
- .with_field("metadata", Arc::new(metadata), false)
- .with_field("typed_value", Arc::new(typed_value), true)
- .with_field("value", Arc::new(values), true)
- .with_nulls(nulls)
- .build();
-
- Arc::new(struct_array)
- }
-
- /// Return a VariantArray that represents a partially "shredded" variant
for BinaryView
- fn partially_shredded_binary_view_variant_array() -> ArrayRef {
- let (metadata, string_value) = {
- let mut builder = parquet_variant::VariantBuilder::new();
- builder.append_value("n/a");
- builder.finish()
- };
-
- // Create the null buffer for the overall array
- let nulls = NullBuffer::from(vec![
- true, // row 0 non null
- false, // row 1 is null
- true, // row 2 non null
- true, // row 3 non null
- ]);
-
- // metadata is the same for all rows
- let metadata =
BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4));
-
- // See
https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?disco=AAABml8WQrY
- // about why row1 is an empty but non null, value.
- let values = BinaryViewArray::from(vec![
- None, // row 0 is shredded, so no value
- Some(b"" as &[u8]), // row 1 is null, so empty value
- Some(&string_value), // copy the string value "N/A"
- None, // row 3 is shredded, so no value
- ]);
-
- let typed_value = BinaryViewArray::from(vec![
- Some(&[1u8, 2u8, 3u8][..]), // row 0 is shredded
- None, // row 1 is null
- None, // row 2 is a string
- Some(&[4u8, 5u8, 6u8][..]), // row 3 is shredded
- ]);
-
- let struct_array = StructArrayBuilder::new()
- .with_field("metadata", Arc::new(metadata), false)
- .with_field("typed_value", Arc::new(typed_value), true)
- .with_field("value", Arc::new(values), true)
- .with_nulls(nulls)
- .build();
-
- Arc::new(struct_array)
- }
+ ])
+ });
/// Return a VariantArray that represents an "all null" variant
/// for the following example (3 null values):
diff --git a/parquet/tests/variant_integration.rs
b/parquet/tests/variant_integration.rs
index 9f202f4db8..a933a3faa1 100644
--- a/parquet/tests/variant_integration.rs
+++ b/parquet/tests/variant_integration.rs
@@ -91,11 +91,10 @@ variant_test_case!(16);
variant_test_case!(17);
variant_test_case!(18);
variant_test_case!(19);
-// https://github.com/apache/arrow-rs/issues/8331
-variant_test_case!(20, "Unsupported typed_value type: Timestamp(µs, \"UTC\")");
-variant_test_case!(21, "Unsupported typed_value type: Timestamp(µs, \"UTC\")");
-variant_test_case!(22, "Unsupported typed_value type: Timestamp(µs)");
-variant_test_case!(23, "Unsupported typed_value type: Timestamp(µs)");
+variant_test_case!(20);
+variant_test_case!(21);
+variant_test_case!(22);
+variant_test_case!(23);
// https://github.com/apache/arrow-rs/issues/8332
variant_test_case!(24, "Unsupported typed_value type: Decimal128(9, 4)");
variant_test_case!(25, "Unsupported typed_value type: Decimal128(9, 4)");
@@ -107,11 +106,10 @@ variant_test_case!(30);
variant_test_case!(31);
// https://github.com/apache/arrow-rs/issues/8334
variant_test_case!(32, "Unsupported typed_value type: Time64(µs)");
-// https://github.com/apache/arrow-rs/issues/8331
-variant_test_case!(33, "Unsupported typed_value type: Timestamp(ns, \"UTC\")");
-variant_test_case!(34, "Unsupported typed_value type: Timestamp(ns, \"UTC\")");
-variant_test_case!(35, "Unsupported typed_value type: Timestamp(ns)");
-variant_test_case!(36, "Unsupported typed_value type: Timestamp(ns)");
+variant_test_case!(33);
+variant_test_case!(34);
+variant_test_case!(35);
+variant_test_case!(36);
variant_test_case!(37);
// https://github.com/apache/arrow-rs/issues/8336
variant_test_case!(38, "Unsupported typed_value type: Struct(");