syun64 commented on code in PR #910:
URL: https://github.com/apache/iceberg-python/pull/910#discussion_r1674797591


##########
pyiceberg/io/pyarrow.py:
##########
@@ -1296,31 +1297,49 @@ def to_requested_schema(
 
 
 class ArrowProjectionVisitor(SchemaWithPartnerVisitor[pa.Array, 
Optional[pa.Array]]):
-    file_schema: Schema
+    _file_schema: Schema
     _include_field_ids: bool
+    _downcast_ns_timestamp_to_us: bool
 
     def __init__(self, file_schema: Schema, downcast_ns_timestamp_to_us: bool 
= False, include_field_ids: bool = False) -> None:
-        self.file_schema = file_schema
+        self._file_schema = file_schema
         self._include_field_ids = include_field_ids
-        self.downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us
+        self._downcast_ns_timestamp_to_us = downcast_ns_timestamp_to_us
 
     def _cast_if_needed(self, field: NestedField, values: pa.Array) -> 
pa.Array:
-        file_field = self.file_schema.find_field(field.field_id)
+        file_field = self._file_schema.find_field(field.field_id)
 
         if field.field_type.is_primitive:
             if field.field_type != file_field.field_type:
                 return values.cast(
                     schema_to_pyarrow(promote(file_field.field_type, 
field.field_type), include_field_ids=self._include_field_ids)
                 )
             elif (target_type := schema_to_pyarrow(field.field_type, 
include_field_ids=self._include_field_ids)) != values.type:
-                # Downcasting of nanoseconds to microseconds
-                if (
-                    pa.types.is_timestamp(target_type)
-                    and target_type.unit == "us"
-                    and pa.types.is_timestamp(values.type)
-                    and values.type.unit == "ns"
-                ):
-                    return values.cast(target_type, safe=False)
+                if field.field_type == TimestampType():

Review Comment:
   stricter and clearer `field_type` driven checks



##########
tests/io/test_pyarrow.py:
##########
@@ -1798,3 +1799,35 @@ def test_identity_partition_on_multi_columns() -> None:
             ("n_legs", "ascending"),
             ("animal", "ascending"),
         ]) == arrow_table.sort_by([("born_year", "ascending"), ("n_legs", 
"ascending"), ("animal", "ascending")])
+
+
+def test_to_requested_schema_timestamps(

Review Comment:
   I noticed that there weren't any tests for `to_requested_schema` although it 
is a public API, so I added this in.
   
   Do we want to keep `to_requested_schema` as a public API?



##########
pyiceberg/io/pyarrow.py:
##########
@@ -1320,7 +1321,16 @@ def _cast_if_needed(self, field: NestedField, values: 
pa.Array) -> pa.Array:
                     and pa.types.is_timestamp(values.type)
                     and values.type.unit == "ns"
                 ):
-                    return values.cast(target_type, safe=False)
+                    if target_type.tz == "UTC" and values.type.tz in 
UTC_ALIASES or not target_type.tz and not values.type.tz:
+                        return values.cast(target_type, safe=False)
+                if (
+                    pa.types.is_timestamp(target_type)
+                    and target_type.unit == "us"
+                    and pa.types.is_timestamp(values.type)
+                    and values.type.unit in {"s", "ms", "us"}

Review Comment:
   Sounds good @Fokko- thank you for the review.
   I've made these checks stricter and also clearer for users to follow.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to