Fokko commented on code in PR #731:
URL: https://github.com/apache/iceberg-rust/pull/731#discussion_r1872818669


##########
crates/iceberg/src/arrow/value.rs:
##########
@@ -0,0 +1,874 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_array::{
+    Array, BinaryArray, BooleanArray, Date32Array, Decimal128Array, 
Float32Array, Float64Array,
+    Int16Array, Int32Array, Int64Array, LargeBinaryArray, LargeStringArray, 
NullArray, StringArray,
+    StructArray, Time64MicrosecondArray, TimestampMicrosecondArray, 
TimestampNanosecondArray,
+};
+use arrow_schema::{DataType, TimeUnit};
+use itertools::Itertools;
+
+use crate::spec::{Literal, PrimitiveType, Struct, StructType, Type};
+use crate::{Error, ErrorKind, Result};
+
+trait ArrowArrayVistor {
+    type T;
+    fn null(
+        &self,
+        array: &NullArray,
+        arrow_type: &DataType,
+        iceberg_type: &PrimitiveType,
+    ) -> Result<Vec<Self::T>>;
+    fn boolean(
+        &self,
+        array: &BooleanArray,
+        arrow_type: &DataType,
+        iceberg_type: &PrimitiveType,
+    ) -> Result<Vec<Self::T>>;
+    fn int16(
+        &self,
+        array: &Int16Array,
+        arrow_type: &DataType,
+        iceberg_type: &PrimitiveType,
+    ) -> Result<Vec<Self::T>>;
+    fn int32(
+        &self,
+        array: &Int32Array,
+        arrow_type: &DataType,
+        iceberg_type: &PrimitiveType,
+    ) -> Result<Vec<Self::T>>;
+    fn int64(
+        &self,
+        array: &Int64Array,
+        arrow_type: &DataType,
+        iceberg_type: &PrimitiveType,
+    ) -> Result<Vec<Self::T>>;
+    fn float(
+        &self,
+        array: &Float32Array,
+        arrow_type: &DataType,
+        iceberg_type: &PrimitiveType,
+    ) -> Result<Vec<Self::T>>;
+    fn double(
+        &self,
+        array: &Float64Array,
+        arrow_type: &DataType,
+        iceberg_type: &PrimitiveType,
+    ) -> Result<Vec<Self::T>>;
+    fn decimal(
+        &self,
+        array: &Decimal128Array,
+        arrow_type: &DataType,
+        iceberg_type: &PrimitiveType,
+    ) -> Result<Vec<Self::T>>;
+    fn date(
+        &self,
+        array: &Date32Array,
+        arrow_type: &DataType,
+        iceberg_type: &PrimitiveType,
+    ) -> Result<Vec<Self::T>>;
+    fn time(
+        &self,
+        array: &Time64MicrosecondArray,
+        arrow_type: &DataType,
+        iceberg_type: &PrimitiveType,
+    ) -> Result<Vec<Self::T>>;
+    fn timestamp(
+        &self,
+        array: &TimestampMicrosecondArray,
+        arrow_type: &DataType,
+        iceberg_type: &PrimitiveType,
+    ) -> Result<Vec<Self::T>>;
+    fn timestamp_nano(
+        &self,
+        array: &TimestampNanosecondArray,
+        arrow_type: &DataType,
+        iceberg_type: &PrimitiveType,
+    ) -> Result<Vec<Self::T>>;
+    fn string(
+        &self,
+        array: &StringArray,
+        arrow_type: &DataType,
+        iceberg_type: &PrimitiveType,
+    ) -> Result<Vec<Self::T>>;
+    fn large_string(
+        &self,
+        array: &LargeStringArray,
+        arrow_type: &DataType,
+        iceberg_type: &PrimitiveType,
+    ) -> Result<Vec<Self::T>>;
+    fn binary(
+        &self,
+        array: &BinaryArray,
+        arrow_type: &DataType,
+        iceberg_type: &PrimitiveType,
+    ) -> Result<Vec<Self::T>>;
+    fn large_binary(
+        &self,
+        array: &LargeBinaryArray,
+        arrow_type: &DataType,
+        iceberg_type: &PrimitiveType,
+    ) -> Result<Vec<Self::T>>;
+    fn combine_struct(
+        &self,
+        array: &StructArray,
+        columns: Vec<Vec<Self::T>>,
+    ) -> Result<Vec<Self::T>>;
+    fn r#struct(
+        &self,
+        array: &StructArray,
+        arrow_type: &DataType,
+        iceberg_type: &StructType,
+    ) -> Result<Vec<Self::T>> {
+        let DataType::Struct(arrow_struct_fields) = arrow_type else {
+            return Err(Error::new(
+                ErrorKind::DataInvalid,
+                "The type of arrow struct array is not a struct type",
+            ));
+        };
+
+        if array.columns().len() != iceberg_type.fields().len()
+            || arrow_struct_fields.len() != iceberg_type.fields().len()
+        {
+            return Err(Error::new(
+                ErrorKind::DataInvalid,
+                "The type of arrow struct array is not compatitable with 
iceberg struct type",
+            ));
+        }
+
+        let mut columns = Vec::with_capacity(array.columns().len());
+
+        for ((array, arrow_type), iceberg_field) in array

Review Comment:
   Looping back @liurenjie1024 question here:
   
   > We need have clear documentation about what kind of visitor it is.
   
   First of all, thanks for converting this into a visitor, I think that's the 
right approach here. One question regarding the lookup. It looks like we're 
expecting the Arrow data to be in the same position of the Iceberg schema. 
Instead, what would be more Iceberg-esque, is performing the lookup by field-ID.
   
   For example:
   
   
   ```python
   
   schema = Schema(
       NestedField(1, "city", StringType(), required=False),
       NestedField(2, "lat", DoubleType(), required=False),
       NestedField(3, "long", DoubleType(), required=False),
   )
   
   table {
     1: city: optional string
     2: lat: optional double
     3: long: optional double
   }
   
   table = catalog.create_table("default.locations", schema)
   
   with table.update_schema() as update:
       update.move_after("lat", "long")
   
   
   table {
     1: city: optional string
     3: lat: optional double
     2: long: optional double
   }
   ```
   
   Now the fields are switched. Data that's already written needs to be 
projected, and new data can be read as is. Both of the files will be read by 
this visitor without raising an error, but for the old files, the data the 
fields will be switched.
   
   This way, we could also do smart things, like fill in nullable fields:
   
   ```rust
           let struct_array = StructArray::new_null();
           let iceberg_struct_type = 
StructType::new(vec![Arc::new(NestedField::optional(
               0,
               "bool_field",
               Type::Primitive(PrimitiveType::Boolean),
           ))]);
           let result = arrow_struct_to_literal(&struct_array, 
iceberg_struct_type).unwrap();
           assert_eq!(result, vec![None; 3]);
   ```
   And in a step after that, https://github.com/apache/iceberg-rust/issues/737. 
You can find the Python equivalent 
[here](https://github.com/apache/iceberg-python/blob/bfc0d9a62176803094da0867ee793808f105d352/pyiceberg/io/pyarrow.py#L1726).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to