liurenjie1024 commented on code in PR #731: URL: https://github.com/apache/iceberg-rust/pull/731#discussion_r1964807198
########## crates/iceberg/src/arrow/value.rs: ########## @@ -0,0 +1,1222 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::{ + Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Decimal128Array, FixedSizeBinaryArray, + FixedSizeListArray, Float32Array, Float64Array, Int32Array, Int64Array, LargeBinaryArray, + LargeListArray, LargeStringArray, ListArray, MapArray, StringArray, StructArray, + Time64MicrosecondArray, TimestampMicrosecondArray, TimestampNanosecondArray, +}; +use arrow_schema::DataType; +use uuid::Uuid; + +use super::get_field_id; +use crate::spec::{ + visit_struct_with_partner, Literal, Map, PartnerAccessor, PrimitiveType, + SchemaWithPartnerVisitor, Struct, StructType, +}; +use crate::{Error, ErrorKind, Result}; + +struct ArrowArrayConverter; + +impl SchemaWithPartnerVisitor<ArrayRef> for ArrowArrayConverter { + type T = Vec<Option<Literal>>; + + fn schema( + &mut self, + _schema: &crate::spec::Schema, + _partner: &ArrayRef, + value: Vec<Option<Literal>>, + ) -> Result<Vec<Option<Literal>>> { + Ok(value) + } + + fn field( + &mut self, + field: &crate::spec::NestedFieldRef, + _partner: &ArrayRef, + value: Vec<Option<Literal>>, + ) -> Result<Vec<Option<Literal>>> { + // Make there is no null value if the field is required + if field.required && value.iter().any(Option::is_none) { + return Err(Error::new( + ErrorKind::DataInvalid, + "The field is required but has null value", Review Comment: We should include field id and field name here for easier debugging. ########## crates/iceberg/src/arrow/value.rs: ########## @@ -0,0 +1,1222 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::{ + Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Decimal128Array, FixedSizeBinaryArray, + FixedSizeListArray, Float32Array, Float64Array, Int32Array, Int64Array, LargeBinaryArray, + LargeListArray, LargeStringArray, ListArray, MapArray, StringArray, StructArray, + Time64MicrosecondArray, TimestampMicrosecondArray, TimestampNanosecondArray, +}; +use arrow_schema::DataType; +use uuid::Uuid; + +use super::get_field_id; +use crate::spec::{ + visit_struct_with_partner, Literal, Map, PartnerAccessor, PrimitiveType, + SchemaWithPartnerVisitor, Struct, StructType, +}; +use crate::{Error, ErrorKind, Result}; + +struct ArrowArrayConverter; + +impl SchemaWithPartnerVisitor<ArrayRef> for ArrowArrayConverter { + type T = Vec<Option<Literal>>; + + fn schema( + &mut self, + _schema: &crate::spec::Schema, + _partner: &ArrayRef, + value: Vec<Option<Literal>>, + ) -> Result<Vec<Option<Literal>>> { + Ok(value) + } + + fn field( + &mut self, + field: &crate::spec::NestedFieldRef, + _partner: &ArrayRef, + value: Vec<Option<Literal>>, + ) -> Result<Vec<Option<Literal>>> { + // Make there is no null value if the field is required + if field.required && value.iter().any(Option::is_none) { + return Err(Error::new( + ErrorKind::DataInvalid, + "The field is required but has null value", + )); + } + Ok(value) + } + + fn r#struct( + &mut self, + _struct: &StructType, + array: &ArrayRef, + results: Vec<Vec<Option<Literal>>>, + ) -> Result<Vec<Option<Literal>>> { + let row_len = results.first().map(|column| column.len()).unwrap_or(0); + if results.iter().any(|column| column.len() != row_len) { + return Err(Error::new( + ErrorKind::DataInvalid, + "The struct columns have different row length", Review Comment: nit: Would be better to add actual length. ########## crates/iceberg/src/arrow/value.rs: ########## @@ -0,0 +1,1222 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::{ + Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Decimal128Array, FixedSizeBinaryArray, + FixedSizeListArray, Float32Array, Float64Array, Int32Array, Int64Array, LargeBinaryArray, + LargeListArray, LargeStringArray, ListArray, MapArray, StringArray, StructArray, + Time64MicrosecondArray, TimestampMicrosecondArray, TimestampNanosecondArray, +}; +use arrow_schema::DataType; +use uuid::Uuid; + +use super::get_field_id; +use crate::spec::{ + visit_struct_with_partner, Literal, Map, PartnerAccessor, PrimitiveType, + SchemaWithPartnerVisitor, Struct, StructType, +}; +use crate::{Error, ErrorKind, Result}; + +struct ArrowArrayConverter; + +impl SchemaWithPartnerVisitor<ArrayRef> for ArrowArrayConverter { + type T = Vec<Option<Literal>>; + + fn schema( + &mut self, + _schema: &crate::spec::Schema, + _partner: &ArrayRef, + value: Vec<Option<Literal>>, + ) -> Result<Vec<Option<Literal>>> { + Ok(value) + } + + fn field( + &mut self, + field: &crate::spec::NestedFieldRef, + _partner: &ArrayRef, + value: Vec<Option<Literal>>, + ) -> Result<Vec<Option<Literal>>> { + // Make there is no null value if the field is required + if field.required && value.iter().any(Option::is_none) { + return Err(Error::new( + ErrorKind::DataInvalid, + "The field is required but has null value", + )); + } + Ok(value) + } + + fn r#struct( + &mut self, + _struct: &StructType, + array: &ArrayRef, + results: Vec<Vec<Option<Literal>>>, + ) -> Result<Vec<Option<Literal>>> { + let row_len = results.first().map(|column| column.len()).unwrap_or(0); + if results.iter().any(|column| column.len() != row_len) { + return Err(Error::new( + ErrorKind::DataInvalid, + "The struct columns have different row length", + )); + } + + let mut struct_literals = Vec::with_capacity(row_len); + let mut columns_iters = results + .into_iter() + .map(|column| column.into_iter()) + .collect::<Vec<_>>(); + + for i in 0..row_len { + let mut literals = Vec::with_capacity(columns_iters.len()); + for column_iter in columns_iters.iter_mut() { + literals.push(column_iter.next().unwrap()); + } + if array.is_null(i) { + struct_literals.push(None); + } else { + struct_literals.push(Some(Literal::Struct(Struct::from_iter(literals)))); + } + } + + Ok(struct_literals) + } + + fn list( + &mut self, + list: &crate::spec::ListType, + array: &ArrayRef, + elements: Vec<Option<Literal>>, + ) -> Result<Vec<Option<Literal>>> { + if list.element_field.required && elements.iter().any(Option::is_none) { + return Err(Error::new( + ErrorKind::DataInvalid, + "The list should not have null value", + )); + } + match array.data_type() { + DataType::List(_) => { + let offset = array + .as_any() + .downcast_ref::<ListArray>() + .ok_or_else(|| { + Error::new(ErrorKind::DataInvalid, "The partner is not a list array") + })? + .offsets(); + // combine the result according to the offset + let mut result = Vec::with_capacity(offset.len() - 1); + for i in 0..offset.len() - 1 { + let start = offset[i] as usize; + let end = offset[i + 1] as usize; + result.push(Some(Literal::List(elements[start..end].to_vec()))); + } + Ok(result) + } + DataType::LargeList(_) => { + let offset = array + .as_any() + .downcast_ref::<LargeListArray>() + .ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "The partner is not a large list array", + ) + })? + .offsets(); + // combine the result according to the offset + let mut result = Vec::with_capacity(offset.len() - 1); + for i in 0..offset.len() - 1 { + let start = offset[i] as usize; + let end = offset[i + 1] as usize; + result.push(Some(Literal::List(elements[start..end].to_vec()))); + } + Ok(result) + } + DataType::FixedSizeList(_, len) => { + let mut result = Vec::with_capacity(elements.len() / *len as usize); + for i in 0..elements.len() / *len as usize { + let start = i * *len as usize; + let end = (i + 1) * *len as usize; + result.push(Some(Literal::List(elements[start..end].to_vec()))); + } + Ok(result) + } + _ => Err(Error::new( + ErrorKind::DataInvalid, + "The partner is not a list type", + )), + } + } + + fn map( + &mut self, + _map: &crate::spec::MapType, Review Comment: ```suggestion _map: &MapType, ``` ########## crates/iceberg/src/spec/schema.rs: ########## @@ -1146,6 +1146,170 @@ impl ReassignFieldIds { } } +/// A post order schema visitor with partner. +/// +/// For order of methods called, please refer to [`visit_schema_with_partner`]. +pub trait SchemaWithPartnerVisitor<P> { + /// Return type of this visitor. + type T; + + /// Called before struct field. + fn before_struct_field(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called after struct field. + fn after_struct_field(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called before list field. + fn before_list_element(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called after list field. + fn after_list_element(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called before map key field. + fn before_map_key(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called after map key field. + fn after_map_key(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called before map value field. + fn before_map_value(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called after map value field. + fn after_map_value(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + + /// Called after schema's type visited. + fn schema(&mut self, schema: &Schema, partner: &P, value: Self::T) -> Result<Self::T>; + /// Called after struct's field type visited. + fn field(&mut self, field: &NestedFieldRef, partner: &P, value: Self::T) -> Result<Self::T>; + /// Called after struct's fields visited. + fn r#struct( + &mut self, + r#struct: &StructType, + partner: &P, + results: Vec<Self::T>, + ) -> Result<Self::T>; + /// Called after list fields visited. + fn list(&mut self, list: &ListType, partner: &P, value: Self::T) -> Result<Self::T>; + /// Called after map's key and value fields visited. + fn map( + &mut self, + map: &MapType, + partner: &P, + key_value: Self::T, + value: Self::T, + ) -> Result<Self::T>; + /// Called when see a primitive type. + fn primitive(&mut self, p: &PrimitiveType, partner: &P) -> Result<Self::T>; +} + +/// Accessor used to get child partner from parent partner. +pub trait PartnerAccessor<P> { + /// Get the struct partner from schema partner. + fn struct_parner<'a>(&self, schema_partner: &'a P) -> Result<&'a P>; + /// Get the field partner from struct partner. + fn field_partner<'a>( + &self, + struct_partner: &'a P, + field_id: i32, + field_name: &str, Review Comment: How about just passing the ref to `NestedField`? ########## crates/iceberg/src/arrow/value.rs: ########## @@ -0,0 +1,1222 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::{ + Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Decimal128Array, FixedSizeBinaryArray, + FixedSizeListArray, Float32Array, Float64Array, Int32Array, Int64Array, LargeBinaryArray, + LargeListArray, LargeStringArray, ListArray, MapArray, StringArray, StructArray, + Time64MicrosecondArray, TimestampMicrosecondArray, TimestampNanosecondArray, +}; +use arrow_schema::DataType; +use uuid::Uuid; + +use super::get_field_id; +use crate::spec::{ + visit_struct_with_partner, Literal, Map, PartnerAccessor, PrimitiveType, + SchemaWithPartnerVisitor, Struct, StructType, +}; +use crate::{Error, ErrorKind, Result}; + +struct ArrowArrayConverter; + +impl SchemaWithPartnerVisitor<ArrayRef> for ArrowArrayConverter { + type T = Vec<Option<Literal>>; + + fn schema( + &mut self, + _schema: &crate::spec::Schema, + _partner: &ArrayRef, + value: Vec<Option<Literal>>, + ) -> Result<Vec<Option<Literal>>> { + Ok(value) + } + + fn field( + &mut self, + field: &crate::spec::NestedFieldRef, + _partner: &ArrayRef, + value: Vec<Option<Literal>>, + ) -> Result<Vec<Option<Literal>>> { + // Make there is no null value if the field is required + if field.required && value.iter().any(Option::is_none) { + return Err(Error::new( + ErrorKind::DataInvalid, + "The field is required but has null value", + )); + } + Ok(value) + } + + fn r#struct( + &mut self, + _struct: &StructType, + array: &ArrayRef, + results: Vec<Vec<Option<Literal>>>, + ) -> Result<Vec<Option<Literal>>> { + let row_len = results.first().map(|column| column.len()).unwrap_or(0); + if results.iter().any(|column| column.len() != row_len) { + return Err(Error::new( + ErrorKind::DataInvalid, + "The struct columns have different row length", + )); + } + + let mut struct_literals = Vec::with_capacity(row_len); + let mut columns_iters = results + .into_iter() + .map(|column| column.into_iter()) + .collect::<Vec<_>>(); + + for i in 0..row_len { + let mut literals = Vec::with_capacity(columns_iters.len()); + for column_iter in columns_iters.iter_mut() { + literals.push(column_iter.next().unwrap()); + } + if array.is_null(i) { + struct_literals.push(None); + } else { + struct_literals.push(Some(Literal::Struct(Struct::from_iter(literals)))); + } + } + + Ok(struct_literals) + } + + fn list( + &mut self, + list: &crate::spec::ListType, Review Comment: ```suggestion list: &ListType, ``` ########## crates/iceberg/src/arrow/value.rs: ########## @@ -0,0 +1,1222 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow_array::{ + Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Decimal128Array, FixedSizeBinaryArray, + FixedSizeListArray, Float32Array, Float64Array, Int32Array, Int64Array, LargeBinaryArray, + LargeListArray, LargeStringArray, ListArray, MapArray, StringArray, StructArray, + Time64MicrosecondArray, TimestampMicrosecondArray, TimestampNanosecondArray, +}; +use arrow_schema::DataType; +use uuid::Uuid; + +use super::get_field_id; +use crate::spec::{ + visit_struct_with_partner, Literal, Map, PartnerAccessor, PrimitiveType, + SchemaWithPartnerVisitor, Struct, StructType, +}; +use crate::{Error, ErrorKind, Result}; + +struct ArrowArrayConverter; Review Comment: ```suggestion struct ArrowArrayToIcebergStructConverter; ``` ########## crates/iceberg/src/spec/values.rs: ########## @@ -1564,6 +1564,16 @@ impl Literal { Self::Primitive(PrimitiveLiteral::Long(value)) } + /// Creates a timestamp from unix epoch in nanoseconds. + pub fn timestamp_nano(value: i64) -> Self { + Self::Primitive(PrimitiveLiteral::Long(value)) + } + + /// Creates a timestamp with timezone from unix epoch in nanoseconds. + pub fn timestamptz_nano(value: i64) -> Self { Review Comment: We should no longer constructing directly from `Literal` but using `Datum` ########## crates/iceberg/src/spec/schema.rs: ########## @@ -1146,6 +1146,170 @@ impl ReassignFieldIds { } } +/// A post order schema visitor with partner. +/// +/// For order of methods called, please refer to [`visit_schema_with_partner`]. +pub trait SchemaWithPartnerVisitor<P> { Review Comment: nit: Maybe it's time to split schema module to multi file module, but we can do it later. ########## crates/iceberg/src/spec/schema.rs: ########## @@ -1146,6 +1146,170 @@ impl ReassignFieldIds { } } +/// A post order schema visitor with partner. +/// +/// For order of methods called, please refer to [`visit_schema_with_partner`]. +pub trait SchemaWithPartnerVisitor<P> { + /// Return type of this visitor. + type T; + + /// Called before struct field. + fn before_struct_field(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called after struct field. + fn after_struct_field(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called before list field. + fn before_list_element(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called after list field. + fn after_list_element(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called before map key field. + fn before_map_key(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called after map key field. + fn after_map_key(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called before map value field. + fn before_map_value(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + /// Called after map value field. + fn after_map_value(&mut self, _field: &NestedFieldRef, _partner: &P) -> Result<()> { + Ok(()) + } + + /// Called after schema's type visited. + fn schema(&mut self, schema: &Schema, partner: &P, value: Self::T) -> Result<Self::T>; + /// Called after struct's field type visited. + fn field(&mut self, field: &NestedFieldRef, partner: &P, value: Self::T) -> Result<Self::T>; + /// Called after struct's fields visited. + fn r#struct( + &mut self, + r#struct: &StructType, + partner: &P, + results: Vec<Self::T>, + ) -> Result<Self::T>; + /// Called after list fields visited. + fn list(&mut self, list: &ListType, partner: &P, value: Self::T) -> Result<Self::T>; + /// Called after map's key and value fields visited. + fn map( + &mut self, + map: &MapType, + partner: &P, + key_value: Self::T, + value: Self::T, + ) -> Result<Self::T>; + /// Called when see a primitive type. + fn primitive(&mut self, p: &PrimitiveType, partner: &P) -> Result<Self::T>; +} + +/// Accessor used to get child partner from parent partner. +pub trait PartnerAccessor<P> { + /// Get the struct partner from schema partner. + fn struct_parner<'a>(&self, schema_partner: &'a P) -> Result<&'a P>; + /// Get the field partner from struct partner. + fn field_partner<'a>( + &self, + struct_partner: &'a P, + field_id: i32, + field_name: &str, + ) -> Result<&'a P>; + /// Get the list element partner from list partner. + fn list_element_partner<'a>(&self, list_partner: &'a P) -> Result<&'a P>; + /// Get the map key partner from map partner. + fn map_key_partner<'a>(&self, map_partner: &'a P) -> Result<&'a P>; + /// Get the map value partner from map partner. + fn map_value_partner<'a>(&self, map_partner: &'a P) -> Result<&'a P>; +} + +/// Visiting a type in post order. +pub fn visit_type_with_partner<P, V: SchemaWithPartnerVisitor<P>, A: PartnerAccessor<P>>( Review Comment: Should we mark this method as private? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org