This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 89d6b49716 Add `TimestampWithOffset` canonical extension type (#8743)
89d6b49716 is described below
commit 89d6b497169ec9b4828deeec9a543e168aee76d6
Author: Lucas Valente <[email protected]>
AuthorDate: Wed Jan 14 22:59:07 2026 +0100
Add `TimestampWithOffset` canonical extension type (#8743)
# Which issue does this PR close?
Implement `arrow.timestamp_with_offset` canonical extension type.
# Rationale for this change
Be compliant with Arrow spec: https://github.com/apache/arrow/pull/48002
# What changes are included in this PR?
This commit adds a new `TimestampWithOffset` extension type. This type
represents a timestamp column that stores potentially different timezone
offsets per value. The timestamp is stored in UTC alongside the original
timezone offset in minutes.
# Are these changes tested?
Yes.
# Are there any user-facing changes?
Yes, this is a new canonical extension type.
---
arrow-schema/src/extension/canonical/mod.rs | 16 +
.../extension/canonical/timestamp_with_offset.rs | 532 +++++++++++++++++++++
2 files changed, 548 insertions(+)
diff --git a/arrow-schema/src/extension/canonical/mod.rs
b/arrow-schema/src/extension/canonical/mod.rs
index ab775d4bd2..a68169c701 100644
--- a/arrow-schema/src/extension/canonical/mod.rs
+++ b/arrow-schema/src/extension/canonical/mod.rs
@@ -33,6 +33,8 @@ mod json;
pub use json::{Json, JsonMetadata};
mod opaque;
pub use opaque::{Opaque, OpaqueMetadata};
+mod timestamp_with_offset;
+pub use timestamp_with_offset::TimestampWithOffset;
mod uuid;
pub use uuid::Uuid;
mod variable_shape_tensor;
@@ -77,6 +79,11 @@ pub enum CanonicalExtensionType {
///
///
<https://arrow.apache.org/docs/format/CanonicalExtensions.html#bit-boolean>
Bool8(Bool8),
+
+ /// The extension type for `TimestampWithOffset`.
+ ///
+ ///
<https://arrow.apache.org/docs/format/CanonicalExtensions.html#timestamp-with-offset>
+ TimestampWithOffset(TimestampWithOffset),
}
impl TryFrom<&Field> for CanonicalExtensionType {
@@ -97,6 +104,9 @@ impl TryFrom<&Field> for CanonicalExtensionType {
Uuid::NAME =>
value.try_extension_type::<Uuid>().map(Into::into),
Opaque::NAME =>
value.try_extension_type::<Opaque>().map(Into::into),
Bool8::NAME =>
value.try_extension_type::<Bool8>().map(Into::into),
+ TimestampWithOffset::NAME => value
+ .try_extension_type::<TimestampWithOffset>()
+ .map(Into::into),
_ => Err(ArrowError::InvalidArgumentError(format!(
"Unsupported canonical extension type: {name}"
))),
@@ -148,3 +158,9 @@ impl From<Bool8> for CanonicalExtensionType {
CanonicalExtensionType::Bool8(value)
}
}
+
+impl From<TimestampWithOffset> for CanonicalExtensionType {
+ fn from(value: TimestampWithOffset) -> Self {
+ CanonicalExtensionType::TimestampWithOffset(value)
+ }
+}
diff --git a/arrow-schema/src/extension/canonical/timestamp_with_offset.rs
b/arrow-schema/src/extension/canonical/timestamp_with_offset.rs
new file mode 100644
index 0000000000..643025919d
--- /dev/null
+++ b/arrow-schema/src/extension/canonical/timestamp_with_offset.rs
@@ -0,0 +1,532 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Timestamp with an offset in minutes
+//!
+//!
<https://arrow.apache.org/docs/format/CanonicalExtensions.html#timestamp-with-offset>
+
+use crate::{ArrowError, DataType, extension::ExtensionType};
+
+/// The extension type for `TimestampWithOffset`.
+///
+/// Extension name: `arrow.timestamp_with_offset`.
+///
+/// This type represents a timestamp column that stores potentially different
timezone offsets per
+/// value. The timestamp is stored in UTC alongside the original timezone
offset in minutes. This
+/// extension type is intended to be compatible with ANSI SQL's `TIMESTAMP
WITH TIME ZONE`, which
+/// is supported by multiple database engines.
+///
+/// The storage type of the extension is a `Struct` with 2 fields, in order: -
`timestamp`: a
+/// non-nullable `Timestamp(time_unit, "UTC")`, where `time_unit` is any Arrow
`TimeUnit` (s, ms,
+/// us or ns). - `offset_minutes`: a non-nullable signed 16-bit integer
(`Int16`) representing the
+/// offset in minutes from the UTC timezone. Negative offsets represent time
zones west of UTC,
+/// while positive offsets represent east. Offsets normally range from -779
(-12:59) to +780
+/// (+13:00).
+///
+/// This type has no type parameters.
+///
+/// Metadata is either empty or an empty string.
+///
+/// It is also *permissible* for the `offset_minutes` field to be
dictionary-encoded with a
+/// preferred (*but not required*) index type of `int8`, or run-end-encoded
with a preferred (*but
+/// not required*) runs type of `int8`.
+///
+/// It's worth noting that the data source needs to resolve timezone strings
such as `UTC` or
+/// `Americas/Los_Angeles` into an offset in minutes in order to construct a
`TimestampWithOffset`.
+/// This makes `TimestampWithOffset` type "lossy" in the sense that any
original "unresolved"
+/// timezone string gets lost in this conversion. It's a tradeoff for
optimizing the row
+/// representation and simplifying the client code, which does not need to
know how to convert from
+/// timezone string to its corresponding offset in minutes.
+///
+///
<https://arrow.apache.org/docs/format/CanonicalExtensions.html#timestamp-with-offset>
+#[derive(Debug, Default, Clone, Copy, PartialEq)]
+pub struct TimestampWithOffset;
+
+const TIMESTAMP_FIELD_NAME: &str = "timestamp";
+const OFFSET_FIELD_NAME: &str = "offset_minutes";
+
+impl ExtensionType for TimestampWithOffset {
+ const NAME: &'static str = "arrow.timestamp_with_offset";
+
+ type Metadata = ();
+
+ fn metadata(&self) -> &Self::Metadata {
+ &()
+ }
+
+ fn serialize_metadata(&self) -> Option<String> {
+ None
+ }
+
+ fn deserialize_metadata(metadata: Option<&str>) -> Result<Self::Metadata,
ArrowError> {
+ metadata.map_or_else(
+ || Ok(()),
+ |v| {
+ if !v.is_empty() {
+ Err(ArrowError::InvalidArgumentError(
+ "TimestampWithOffset extension type expects no
metadata".to_owned(),
+ ))
+ } else {
+ Ok(())
+ }
+ },
+ )
+ }
+
+ fn supports_data_type(&self, data_type: &DataType) -> Result<(),
ArrowError> {
+ let ok = match data_type {
+ DataType::Struct(fields) => match fields.len() {
+ 2 => {
+ let maybe_timestamp = fields.first().unwrap();
+ let maybe_offset = fields.get(1).unwrap();
+
+ let timestamp_type_ok =
matches!(maybe_timestamp.data_type(), DataType::Timestamp(_, tz) if {
+ match tz {
+ Some(tz) => {
+ tz.as_ref() == "UTC"
+ },
+ None => false
+ }
+ });
+
+ let offset_type_ok = match maybe_offset.data_type() {
+ DataType::Int16 => true,
+ DataType::Dictionary(key_type, value_type) => {
+ key_type.is_dictionary_key_type()
+ && matches!(value_type.as_ref(),
DataType::Int16)
+ }
+ DataType::RunEndEncoded(run_ends, values) => {
+ run_ends.data_type().is_run_ends_type()
+ && matches!(values.data_type(),
DataType::Int16)
+ }
+ _ => false,
+ };
+
+ maybe_timestamp.name() == TIMESTAMP_FIELD_NAME
+ && timestamp_type_ok
+ && !maybe_timestamp.is_nullable()
+ && maybe_offset.name() == OFFSET_FIELD_NAME
+ && offset_type_ok
+ && !maybe_offset.is_nullable()
+ }
+ _ => false,
+ },
+ _ => false,
+ };
+
+ match ok {
+ true => Ok(()),
+ false => Err(ArrowError::InvalidArgumentError(format!(
+ "TimestampWithOffset data type mismatch, expected
Struct(\"timestamp\": Timestamp(_, Some(\"UTC\")), \"offset_minutes\": Int16),
found {data_type}"
+ ))),
+ }
+ }
+
+ fn try_new(data_type: &DataType, _metadata: Self::Metadata) ->
Result<Self, ArrowError> {
+ Self.supports_data_type(data_type).map(|_| Self)
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use std::sync::Arc;
+
+ #[cfg(feature = "canonical_extension_types")]
+ use crate::extension::CanonicalExtensionType;
+ use crate::{
+ Field, Fields, TimeUnit,
+ extension::{EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY},
+ };
+
+ use super::*;
+
+ fn make_valid_field_primitive(time_unit: TimeUnit) -> Field {
+ Field::new(
+ "",
+ DataType::Struct(Fields::from_iter([
+ Field::new(
+ TIMESTAMP_FIELD_NAME,
+ DataType::Timestamp(time_unit, Some("UTC".into())),
+ false,
+ ),
+ Field::new(OFFSET_FIELD_NAME, DataType::Int16, false),
+ ])),
+ false,
+ )
+ }
+
+ fn make_valid_field_dict_encoded(time_unit: TimeUnit, key_type: DataType)
-> Field {
+ assert!(key_type.is_dictionary_key_type());
+
+ Field::new(
+ "",
+ DataType::Struct(Fields::from_iter([
+ Field::new(
+ TIMESTAMP_FIELD_NAME,
+ DataType::Timestamp(time_unit, Some("UTC".into())),
+ false,
+ ),
+ Field::new(
+ OFFSET_FIELD_NAME,
+ DataType::Dictionary(Box::new(key_type),
Box::new(DataType::Int16)),
+ false,
+ ),
+ ])),
+ false,
+ )
+ }
+
+ fn make_valid_field_run_end_encoded(time_unit: TimeUnit, run_ends_type:
DataType) -> Field {
+ assert!(run_ends_type.is_run_ends_type());
+ Field::new(
+ "",
+ DataType::Struct(Fields::from_iter([
+ Field::new(
+ TIMESTAMP_FIELD_NAME,
+ DataType::Timestamp(time_unit, Some("UTC".into())),
+ false,
+ ),
+ Field::new(
+ OFFSET_FIELD_NAME,
+ DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", run_ends_type, false)),
+ Arc::new(Field::new("values", DataType::Int16, false)),
+ ),
+ false,
+ ),
+ ])),
+ false,
+ )
+ }
+
+ #[test]
+ fn valid_primitive_offsets() -> Result<(), ArrowError> {
+ let time_units = [
+ TimeUnit::Second,
+ TimeUnit::Millisecond,
+ TimeUnit::Microsecond,
+ TimeUnit::Nanosecond,
+ ];
+
+ for time_unit in time_units {
+ let mut field = make_valid_field_primitive(time_unit);
+ field.try_with_extension_type(TimestampWithOffset)?;
+ field.try_extension_type::<TimestampWithOffset>()?;
+ #[cfg(feature = "canonical_extension_types")]
+ assert_eq!(
+ field.try_canonical_extension_type()?,
+
CanonicalExtensionType::TimestampWithOffset(TimestampWithOffset)
+ );
+ }
+
+ Ok(())
+ }
+
+ #[test]
+ fn valid_dict_encoded_offsets() -> Result<(), ArrowError> {
+ let time_units = [
+ TimeUnit::Second,
+ TimeUnit::Millisecond,
+ TimeUnit::Microsecond,
+ TimeUnit::Nanosecond,
+ ];
+
+ let key_types = [
+ DataType::UInt8,
+ DataType::UInt16,
+ DataType::UInt32,
+ DataType::UInt64,
+ DataType::Int8,
+ DataType::Int16,
+ DataType::Int32,
+ DataType::Int64,
+ ];
+
+ for time_unit in time_units {
+ for key_type in &key_types {
+ let mut field = make_valid_field_dict_encoded(time_unit,
key_type.clone());
+ field.try_with_extension_type(TimestampWithOffset)?;
+ field.try_extension_type::<TimestampWithOffset>()?;
+ #[cfg(feature = "canonical_extension_types")]
+ assert_eq!(
+ field.try_canonical_extension_type()?,
+
CanonicalExtensionType::TimestampWithOffset(TimestampWithOffset)
+ );
+ }
+ }
+
+ Ok(())
+ }
+
+ #[test]
+ fn valid_run_end_encoded_offsets() -> Result<(), ArrowError> {
+ let time_units = [
+ TimeUnit::Second,
+ TimeUnit::Millisecond,
+ TimeUnit::Microsecond,
+ TimeUnit::Nanosecond,
+ ];
+
+ let run_ends_types = [DataType::Int16, DataType::Int32,
DataType::Int64];
+
+ for time_unit in time_units {
+ for run_ends_type in &run_ends_types {
+ let mut field = make_valid_field_run_end_encoded(time_unit,
run_ends_type.clone());
+ field.try_with_extension_type(TimestampWithOffset)?;
+ field.try_extension_type::<TimestampWithOffset>()?;
+ #[cfg(feature = "canonical_extension_types")]
+ assert_eq!(
+ field.try_canonical_extension_type()?,
+
CanonicalExtensionType::TimestampWithOffset(TimestampWithOffset)
+ );
+ }
+ }
+
+ Ok(())
+ }
+
+ #[test]
+ #[should_panic(expected = "Field extension type name missing")]
+ fn missing_name() {
+ let field = make_valid_field_primitive(TimeUnit::Second)
+ .with_metadata([(EXTENSION_TYPE_METADATA_KEY.to_owned(),
"".to_owned())].into());
+ field.extension_type::<TimestampWithOffset>();
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "expected Struct(\"timestamp\": Timestamp(_,
Some(\"UTC\")), \"offset_minutes\": Int16), found Boolean"
+ )]
+ fn invalid_type_top_level() {
+ Field::new("", DataType::Boolean,
false).with_extension_type(TimestampWithOffset);
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "expected Struct(\"timestamp\": Timestamp(_,
Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
+ )]
+ fn invalid_type_struct_field_count() {
+ let data_type =
+ DataType::Struct(Fields::from_iter([Field::new("",
DataType::Int16, false)]));
+ Field::new("", data_type,
false).with_extension_type(TimestampWithOffset);
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "expected Struct(\"timestamp\": Timestamp(_,
Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
+ )]
+ fn invalid_type_wrong_timestamp_type() {
+ let data_type = DataType::Struct(Fields::from_iter([
+ Field::new(TIMESTAMP_FIELD_NAME, DataType::Int16, false),
+ Field::new(OFFSET_FIELD_NAME, DataType::Int16, false),
+ ]));
+ Field::new("", data_type,
false).with_extension_type(TimestampWithOffset);
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "expected Struct(\"timestamp\": Timestamp(_,
Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
+ )]
+ fn invalid_type_wrong_offset_type() {
+ let data_type = DataType::Struct(Fields::from_iter([
+ Field::new(
+ TIMESTAMP_FIELD_NAME,
+ DataType::Timestamp(TimeUnit::Second, Some("UTC".into())),
+ false,
+ ),
+ Field::new(OFFSET_FIELD_NAME, DataType::UInt64, false),
+ ]));
+ Field::new("", data_type,
false).with_extension_type(TimestampWithOffset);
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "expected Struct(\"timestamp\": Timestamp(_,
Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
+ )]
+ fn invalid_type_wrong_offset_key_dict_encoded() {
+ let data_type = DataType::Struct(Fields::from_iter([
+ Field::new(
+ TIMESTAMP_FIELD_NAME,
+ DataType::Timestamp(TimeUnit::Second, Some("UTC".into())),
+ false,
+ ),
+ Field::new(
+ OFFSET_FIELD_NAME,
+ DataType::Dictionary(Box::new(DataType::Boolean),
Box::new(DataType::Int16)),
+ false,
+ ),
+ ]));
+ Field::new("", data_type,
false).with_extension_type(TimestampWithOffset);
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "expected Struct(\"timestamp\": Timestamp(_,
Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
+ )]
+ fn invalid_type_wrong_offset_value_dict_encoded() {
+ let data_type = DataType::Struct(Fields::from_iter([
+ Field::new(
+ TIMESTAMP_FIELD_NAME,
+ DataType::Timestamp(TimeUnit::Second, Some("UTC".into())),
+ false,
+ ),
+ Field::new(
+ OFFSET_FIELD_NAME,
+ DataType::Dictionary(Box::new(DataType::UInt8),
Box::new(DataType::Int32)),
+ false,
+ ),
+ ]));
+ Field::new("", data_type,
false).with_extension_type(TimestampWithOffset);
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "expected Struct(\"timestamp\": Timestamp(_,
Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
+ )]
+ fn invalid_type_wrong_run_ends_run_end_encoded() {
+ let data_type = DataType::Struct(Fields::from_iter([
+ Field::new(
+ TIMESTAMP_FIELD_NAME,
+ DataType::Timestamp(TimeUnit::Second, Some("UTC".into())),
+ false,
+ ),
+ Field::new(
+ OFFSET_FIELD_NAME,
+ DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::Boolean, false)),
+ Arc::new(Field::new("values", DataType::Int16, false)),
+ ),
+ false,
+ ),
+ ]));
+ Field::new("", data_type,
false).with_extension_type(TimestampWithOffset);
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "expected Struct(\"timestamp\": Timestamp(_,
Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
+ )]
+ fn invalid_type_wrong_values_run_end_encoded() {
+ let data_type = DataType::Struct(Fields::from_iter([
+ Field::new(
+ TIMESTAMP_FIELD_NAME,
+ DataType::Timestamp(TimeUnit::Second, Some("UTC".into())),
+ false,
+ ),
+ Field::new(
+ OFFSET_FIELD_NAME,
+ DataType::RunEndEncoded(
+ Arc::new(Field::new("run_ends", DataType::UInt16, false)),
+ Arc::new(Field::new("values", DataType::Int32, false)),
+ ),
+ false,
+ ),
+ ]));
+ Field::new("", data_type,
false).with_extension_type(TimestampWithOffset);
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "expected Struct(\"timestamp\": Timestamp(_,
Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
+ )]
+ fn invalid_type_nullable_timestamp() {
+ let data_type = DataType::Struct(Fields::from_iter([
+ Field::new(
+ TIMESTAMP_FIELD_NAME,
+ DataType::Timestamp(TimeUnit::Second, Some("UTC".into())),
+ true,
+ ),
+ Field::new(OFFSET_FIELD_NAME, DataType::Int16, false),
+ ]));
+ Field::new("", data_type,
false).with_extension_type(TimestampWithOffset);
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "expected Struct(\"timestamp\": Timestamp(_,
Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
+ )]
+ fn invalid_type_nullable_offset() {
+ let data_type = DataType::Struct(Fields::from_iter([
+ Field::new(
+ TIMESTAMP_FIELD_NAME,
+ DataType::Timestamp(TimeUnit::Second, Some("UTC".into())),
+ false,
+ ),
+ Field::new(OFFSET_FIELD_NAME, DataType::Int16, true),
+ ]));
+ Field::new("", data_type,
false).with_extension_type(TimestampWithOffset);
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "expected Struct(\"timestamp\": Timestamp(_,
Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
+ )]
+ fn invalid_type_no_timezone() {
+ let data_type = DataType::Struct(Fields::from_iter([
+ Field::new(
+ TIMESTAMP_FIELD_NAME,
+ DataType::Timestamp(TimeUnit::Second, None),
+ false,
+ ),
+ Field::new(OFFSET_FIELD_NAME, DataType::Int16, false),
+ ]));
+ Field::new("", data_type,
false).with_extension_type(TimestampWithOffset);
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "expected Struct(\"timestamp\": Timestamp(_,
Some(\"UTC\")), \"offset_minutes\": Int16), found Struct"
+ )]
+ fn invalid_type_wrong_timezone() {
+ let data_type = DataType::Struct(Fields::from_iter([
+ Field::new(
+ TIMESTAMP_FIELD_NAME,
+ DataType::Timestamp(TimeUnit::Second,
Some("Americas/Sao_Paulo".into())),
+ false,
+ ),
+ Field::new(OFFSET_FIELD_NAME, DataType::Int16, false),
+ ]));
+ Field::new("", data_type,
false).with_extension_type(TimestampWithOffset);
+ }
+
+ #[test]
+ fn no_metadata() {
+ let field = make_valid_field_primitive(TimeUnit::Second).with_metadata(
+ [(
+ EXTENSION_TYPE_NAME_KEY.to_owned(),
+ TimestampWithOffset::NAME.to_owned(),
+ )]
+ .into(),
+ );
+ field.extension_type::<TimestampWithOffset>();
+ }
+
+ #[test]
+ fn empty_metadata() {
+ let field = make_valid_field_primitive(TimeUnit::Second).with_metadata(
+ [
+ (
+ EXTENSION_TYPE_NAME_KEY.to_owned(),
+ TimestampWithOffset::NAME.to_owned(),
+ ),
+ (EXTENSION_TYPE_METADATA_KEY.to_owned(), String::new()),
+ ]
+ .into(),
+ );
+ field.extension_type::<TimestampWithOffset>();
+ }
+}