ZENOTME commented on code in PR #42: URL: https://github.com/apache/iceberg-rust/pull/42#discussion_r1308159095
########## crates/iceberg/src/transform/temporal.rs: ########## @@ -0,0 +1,669 @@ +use super::TransformFunction; +use crate::{Error, Result}; +use arrow::array::{ + Array, Date64Array, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampNanosecondArray, TimestampSecondArray, +}; +use arrow::compute::binary; +use arrow::datatypes; +use arrow::datatypes::DataType; +use arrow::{ + array::{ArrayRef, Date32Array, Int32Array}, + compute::{month_dyn, year_dyn}, +}; +use chrono::Datelike; +use std::sync::Arc; + +/// 719163 is the number of days from 0000-01-01 to 1970-01-01 +const EPOCH_DAY_FROM_CE: i32 = 719163; +const DAY_PER_SECOND: f64 = 0.0000115741; +const HOUR_PER_SECOND: f64 = 1_f64 / 3600.0; + +/// Extract a date or timestamp year, as years from 1970 +pub struct Year; + +impl TransformFunction for Year { + fn transform(&self, input: ArrayRef) -> Result<ArrayRef> { + let array = year_dyn(&input).map_err(|err| { + Error::new( + crate::ErrorKind::ArrowCompute, + format!("error in transformfunction: {}", err), + ) + })?; + Ok(Arc::<Int32Array>::new( + array + .as_any() + .downcast_ref::<Int32Array>() + .unwrap() + .unary(|v| v - 1970), + )) + } +} + +/// Extract a date or timestamp month, as months from 1970-01-01 +pub struct Month; + +impl TransformFunction for Month { + fn transform(&self, input: ArrayRef) -> Result<ArrayRef> { + let year_array = year_dyn(&input) + .map_err(|err| Error::new(crate::ErrorKind::ArrowCompute, format!("{err}")))?; + let year_array: Int32Array = year_array + .as_any() + .downcast_ref::<Int32Array>() + .unwrap() + .unary(|v| 12 * (v - 1970)); + let month_array = month_dyn(&input) + .map_err(|err| Error::new(crate::ErrorKind::ArrowCompute, format!("{err}")))?; + Ok(Arc::<Int32Array>::new( + binary( + month_array.as_any().downcast_ref::<Int32Array>().unwrap(), + year_array.as_any().downcast_ref::<Int32Array>().unwrap(), + // Compute month from 1970-01-01, so minus 1 here. 
+ |a, b| a + b - 1, + ) + .unwrap(), + )) + } +} + +/// Extract a date or timestamp day, as days from 1970-01-01 +pub struct Day; + +impl TransformFunction for Day { + fn transform(&self, input: ArrayRef) -> Result<ArrayRef> { + let res: Int32Array = match input.data_type() { + DataType::Timestamp(unit, _) => match unit { + datatypes::TimeUnit::Second => input + .as_any() + .downcast_ref::<TimestampSecondArray>() + .unwrap() + .unary(|v| -> i32 { (v as f64 * DAY_PER_SECOND) as i32 }), + datatypes::TimeUnit::Millisecond => input + .as_any() + .downcast_ref::<TimestampMillisecondArray>() + .unwrap() + .unary(|v| -> i32 { (v as f64 / 1000.0 * DAY_PER_SECOND) as i32 }), + datatypes::TimeUnit::Microsecond => input + .as_any() + .downcast_ref::<TimestampMicrosecondArray>() + .unwrap() + .unary(|v| -> i32 { (v as f64 / 1000.0 / 1000.0 * DAY_PER_SECOND) as i32 }), + datatypes::TimeUnit::Nanosecond => input + .as_any() + .downcast_ref::<TimestampNanosecondArray>() + .unwrap() + .unary(|v| -> i32 { + (v as f64 / 1000.0 / 1000.0 / 1000.0 * DAY_PER_SECOND) as i32 + }), + }, + DataType::Date32 => { + input + .as_any() + .downcast_ref::<Date32Array>() + .unwrap() + .unary(|v| -> i32 { + datatypes::Date32Type::to_naive_date(v).num_days_from_ce() + - EPOCH_DAY_FROM_CE + }) + } + DataType::Date64 => { + input + .as_any() + .downcast_ref::<Date64Array>() + .unwrap() + .unary(|v| -> i32 { + datatypes::Date64Type::to_naive_date(v).num_days_from_ce() + - EPOCH_DAY_FROM_CE + }) + } + _ => unreachable!( + "Should not call transform in Day with type {:?}", + input.data_type() + ), + }; + Ok(Arc::new(res)) Review Comment: > If a Timestamp column has a non-empty timezone value, its epoch is 1970-01-01 00:00:00 (January 1st 1970, midnight) in the UTC timezone (the Unix epoch), regardless of the Timestamp’s own timezone. The timezone will not affect the epoch, so we can use it directly. The result is always regarded as being in the UTC timezone. 
https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html#timestamps-with-a-non-empty-timezone ########## crates/iceberg/src/transform/temporal.rs: ########## @@ -0,0 +1,669 @@ +use super::TransformFunction; +use crate::{Error, Result}; +use arrow::array::{ + Array, Date64Array, TimestampMicrosecondArray, TimestampMillisecondArray, + TimestampNanosecondArray, TimestampSecondArray, +}; +use arrow::compute::binary; +use arrow::datatypes; +use arrow::datatypes::DataType; +use arrow::{ + array::{ArrayRef, Date32Array, Int32Array}, + compute::{month_dyn, year_dyn}, +}; +use chrono::Datelike; +use std::sync::Arc; + +/// 719163 is the number of days from 0000-01-01 to 1970-01-01 +const EPOCH_DAY_FROM_CE: i32 = 719163; +const DAY_PER_SECOND: f64 = 0.0000115741; +const HOUR_PER_SECOND: f64 = 1_f64 / 3600.0; + +/// Extract a date or timestamp year, as years from 1970 +pub struct Year; + +impl TransformFunction for Year { + fn transform(&self, input: ArrayRef) -> Result<ArrayRef> { + let array = year_dyn(&input).map_err(|err| { + Error::new( + crate::ErrorKind::ArrowCompute, + format!("error in transformfunction: {}", err), + ) + })?; + Ok(Arc::<Int32Array>::new( + array + .as_any() + .downcast_ref::<Int32Array>() + .unwrap() + .unary(|v| v - 1970), + )) + } +} + +/// Extract a date or timestamp month, as months from 1970-01-01 +pub struct Month; + +impl TransformFunction for Month { + fn transform(&self, input: ArrayRef) -> Result<ArrayRef> { + let year_array = year_dyn(&input) + .map_err(|err| Error::new(crate::ErrorKind::ArrowCompute, format!("{err}")))?; + let year_array: Int32Array = year_array + .as_any() + .downcast_ref::<Int32Array>() + .unwrap() + .unary(|v| 12 * (v - 1970)); + let month_array = month_dyn(&input) + .map_err(|err| Error::new(crate::ErrorKind::ArrowCompute, format!("{err}")))?; + Ok(Arc::<Int32Array>::new( + binary( + month_array.as_any().downcast_ref::<Int32Array>().unwrap(), + year_array.as_any().downcast_ref::<Int32Array>().unwrap(), + // 
Compute month from 1970-01-01, so minus 1 here. + |a, b| a + b - 1, + ) + .unwrap(), + )) + } +} + +/// Extract a date or timestamp day, as days from 1970-01-01 +pub struct Day; + +impl TransformFunction for Day { + fn transform(&self, input: ArrayRef) -> Result<ArrayRef> { + let res: Int32Array = match input.data_type() { + DataType::Timestamp(unit, _) => match unit { + datatypes::TimeUnit::Second => input + .as_any() + .downcast_ref::<TimestampSecondArray>() + .unwrap() + .unary(|v| -> i32 { (v as f64 * DAY_PER_SECOND) as i32 }), + datatypes::TimeUnit::Millisecond => input + .as_any() + .downcast_ref::<TimestampMillisecondArray>() + .unwrap() + .unary(|v| -> i32 { (v as f64 / 1000.0 * DAY_PER_SECOND) as i32 }), + datatypes::TimeUnit::Microsecond => input + .as_any() + .downcast_ref::<TimestampMicrosecondArray>() + .unwrap() + .unary(|v| -> i32 { (v as f64 / 1000.0 / 1000.0 * DAY_PER_SECOND) as i32 }), + datatypes::TimeUnit::Nanosecond => input + .as_any() + .downcast_ref::<TimestampNanosecondArray>() + .unwrap() + .unary(|v| -> i32 { + (v as f64 / 1000.0 / 1000.0 / 1000.0 * DAY_PER_SECOND) as i32 + }), + }, + DataType::Date32 => { + input + .as_any() + .downcast_ref::<Date32Array>() + .unwrap() + .unary(|v| -> i32 { + datatypes::Date32Type::to_naive_date(v).num_days_from_ce() + - EPOCH_DAY_FROM_CE + }) + } + DataType::Date64 => { + input + .as_any() + .downcast_ref::<Date64Array>() + .unwrap() + .unary(|v| -> i32 { + datatypes::Date64Type::to_naive_date(v).num_days_from_ce() + - EPOCH_DAY_FROM_CE + }) + } + _ => unreachable!( + "Should not call transform in Day with type {:?}", + input.data_type() + ), + }; + Ok(Arc::new(res)) Review Comment: > If a Timestamp column has a non-empty timezone value, its epoch is 1970-01-01 00:00:00 (January 1st 1970, midnight) in the UTC timezone (the Unix epoch), regardless of the Timestamp’s own timezone. The timezone will not affect the epoch, so we can use it directly. The result is always regarded as being in the UTC timezone. 
https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html#timestamps-with-a-non-empty-timezone -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org