HonahX commented on code in PR #252: URL: https://github.com/apache/iceberg-python/pull/252#discussion_r1464482344
########## pyiceberg/io/pyarrow.py: ########## @@ -1152,24 +1163,31 @@ def field(self, field: NestedField, _: Optional[pa.Array], field_array: Optional return field_array def list(self, list_type: ListType, list_array: Optional[pa.Array], value_array: Optional[pa.Array]) -> Optional[pa.Array]: - return ( - pa.ListArray.from_arrays(list_array.offsets, self.cast_if_needed(list_type.element_field, value_array)) - if isinstance(list_array, pa.ListArray) - else None - ) + if isinstance(list_array, pa.ListArray) and value_array is not None: + arrow_field = pa.list_(self._construct_field(list_type.element_field, value_array.type)) + if isinstance(value_array, pa.StructArray): + # Arrow does not allow reordering of fields, therefore we have to copy the array :( Review Comment: Just to confirm my understanding, there is another reason we have to copy the array: Arrow also does not allow a field mismatch, which happens when we have an optional schema field and no values for that field in the file. ########## pyiceberg/io/pyarrow.py: ########## @@ -1152,24 +1163,31 @@ def field(self, field: NestedField, _: Optional[pa.Array], field_array: Optional return field_array def list(self, list_type: ListType, list_array: Optional[pa.Array], value_array: Optional[pa.Array]) -> Optional[pa.Array]: - return ( - pa.ListArray.from_arrays(list_array.offsets, self.cast_if_needed(list_type.element_field, value_array)) - if isinstance(list_array, pa.ListArray) - else None - ) + if isinstance(list_array, pa.ListArray) and value_array is not None: + arrow_field = pa.list_(self._construct_field(list_type.element_field, value_array.type)) + if isinstance(value_array, pa.StructArray): + # Arrow does not allow reordering of fields, therefore we have to copy the array :( Review Comment: It seems that adding the following ```python if list_array.is_null(): return None ``` lets the above test pass. 
########## pyiceberg/io/pyarrow.py: ########## @@ -1152,24 +1163,31 @@ def field(self, field: NestedField, _: Optional[pa.Array], field_array: Optional return field_array def list(self, list_type: ListType, list_array: Optional[pa.Array], value_array: Optional[pa.Array]) -> Optional[pa.Array]: - return ( - pa.ListArray.from_arrays(list_array.offsets, self.cast_if_needed(list_type.element_field, value_array)) - if isinstance(list_array, pa.ListArray) - else None - ) + if isinstance(list_array, pa.ListArray) and value_array is not None: + arrow_field = pa.list_(self._construct_field(list_type.element_field, value_array.type)) + if isinstance(value_array, pa.StructArray): + # Arrow does not allow reordering of fields, therefore we have to copy the array :( Review Comment: And this limitation means the issue remains in this edge case (`col_list array<struct<test:int>>`): ```python spark.sql( f""" CREATE TABLE {catalog_name}.default.test_table_empty_list_and_map ( col_list array<struct<test:int>>, col_map map<int, int> ) USING iceberg TBLPROPERTIES ( 'format-version'='1' ); """ ) spark.sql( f""" INSERT INTO {catalog_name}.default.test_table_empty_list_and_map VALUES (null, null) """ ) def test_null_list_and_map(catalog: Catalog) -> None: table_test_empty_list_and_map = catalog.load_table("default.test_table_empty_list_and_map") arrow_table = table_test_empty_list_and_map.scan().to_arrow() > assert arrow_table["col_list"].to_pylist() == [None] E assert [[]] == [None] ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org