syun64 commented on code in PR #902:
URL: https://github.com/apache/iceberg-python/pull/902#discussion_r1669524329
##########
pyiceberg/io/pyarrow.py:
##########

@@ -1268,14 +1265,8 @@ def __init__(self, file_schema: Schema):
     def _cast_if_needed(self, field: NestedField, values: pa.Array) -> pa.Array:
         file_field = self.file_schema.find_field(field.field_id)
-        if field.field_type.is_primitive:
-            if field.field_type != file_field.field_type:
-                return values.cast(schema_to_pyarrow(promote(file_field.field_type, field.field_type), include_field_ids=False))
-            elif (target_type := schema_to_pyarrow(field.field_type, include_field_ids=False)) != values.type:

Review Comment:
The removal of this casting logic results in errors when writing the parquet files:

```
>       for data_file in _dataframe_to_data_files(table_metadata=txn.table_metadata, df=pa_table, io=txn._table.io):

tests/integration/test_writes/test_writes.py:732:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
pyiceberg/table/__init__.py:2944: in _dataframe_to_data_files
    yield from write_file(
/usr/local/python/3.10.13/lib/python3.10/concurrent/futures/_base.py:621: in result_iterator
    yield _result_or_cancel(fs.pop())
/usr/local/python/3.10.13/lib/python3.10/concurrent/futures/_base.py:319: in _result_or_cancel
    return fut.result(timeout)
/usr/local/python/3.10.13/lib/python3.10/concurrent/futures/_base.py:458: in result
    return self.__get_result()
/usr/local/python/3.10.13/lib/python3.10/concurrent/futures/_base.py:403: in __get_result
    raise self._exception
/usr/local/python/3.10.13/lib/python3.10/concurrent/futures/thread.py:58: in run
    result = self.fn(*self.args, **self.kwargs)
pyiceberg/io/pyarrow.py:1915: in write_parquet
    writer.write(arrow_table, row_group_size=row_group_size)
/home/codespace/.cache/pypoetry/virtualenvs/pyiceberg-FsHa-ZgB-py3.10/lib/python3.10/site-packages/pyarrow/parquet/core.py:1052: in write
    self.write_table(table_or_batch, row_group_size)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <pyarrow.parquet.core.ParquetWriter object at 0x7f79a96cc820>, table = pyarrow.Table
foo: string
----
foo: [["a",null,"z"]], row_group_size = 134217728

    def write_table(self, table, row_group_size=None):
        """
        Write Table to the Parquet file.

        Parameters
        ----------
        table : Table
        row_group_size : int, default None
            Maximum number of rows in each written row group. If None,
            the row group size will be the minimum of the Table size
            and 1024 * 1024. If set larger than 64Mi then 64Mi will be used
            instead.

        """
        if self.schema_changed:
            table = _sanitize_table(table, self.schema, self.flavor)
        assert self.is_open

        if not table.schema.equals(self.schema, check_metadata=False):
            msg = ('Table schema does not match schema used to create file: '
                   '\ntable:\n{!s} vs. \nfile:\n{!s}'
                   .format(table.schema, self.schema))
>           raise ValueError(msg)
E           ValueError: Table schema does not match schema used to create file:
E           table:
E           foo: string vs.
E           file:
E           foo: large_string
E             -- field metadata --
E             PARQUET:field_id: '1'

/home/codespace/.cache/pypoetry/virtualenvs/pyiceberg-FsHa-ZgB-py3.10/lib/python3.10/site-packages/pyarrow/parquet/core.py:1094: ValueError
=============================================================================== short test summary info ================================================================================
FAILED tests/integration/test_writes/test_writes.py::test_create_table_transaction[session_catalog_hive-1] - ValueError: Table schema does not match schema used to create file:
```

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org