Fokko commented on code in PR #848: URL: https://github.com/apache/iceberg-python/pull/848#discussion_r1666823874
########## pyiceberg/io/pyarrow.py: ########## @@ -918,11 +919,24 @@ def primitive(self, primitive: pa.DataType) -> PrimitiveType: return TimeType() elif pa.types.is_timestamp(primitive): primitive = cast(pa.TimestampType, primitive) - if primitive.unit == "us": - if primitive.tz == "UTC" or primitive.tz == "+00:00": - return TimestamptzType() - elif primitive.tz is None: - return TimestampType() + if primitive.unit in ("s", "ms", "us"): + # Supported types, will be upcast automatically to 'us' + pass + elif primitive.unit == "ns": + if Config().get_bool("downcast-ns-timestamp-on-write"): Review Comment: Thanks @HonahX for giving the example, I just gave this a spin and ran into the following: ```python @pytest.mark.integration def test_timestamp_tz( session_catalog: Catalog, format_version: int, mocker: MockerFixture ) -> None: nanoseconds_schema_iceberg = Schema( NestedField(1, "quux", TimestamptzType()) ) nanoseconds_schema = pa.schema([ ("quux", pa.timestamp("ns", tz="UTC")), ]) arrow_table = pa.Table.from_pylist( [ { "quux": 1615967687249846175, # 2021-03-17 07:54:47.249846159 } ], schema=nanoseconds_schema, ) mocker.patch.dict(os.environ, values={"PYICEBERG_DOWNCAST_NS_TIMESTAMP_ON_WRITE": "True"}) identifier = f"default.abccccc{format_version}" try: session_catalog.drop_table(identifier=identifier) except NoSuchTableError: pass tbl = session_catalog.create_table( identifier=identifier, schema=nanoseconds_schema_iceberg, properties={"format-version": str(format_version)}, partition_spec=PartitionSpec(), ) file_paths = [f"s3://warehouse/default/test_timestamp_tz/v{format_version}/test-{i}.parquet" for i in range(5)] # write parquet files for file_path in file_paths: fo = tbl.io.new_output(file_path) with fo.create(overwrite=True) as fos: with pq.ParquetWriter(fos, schema=nanoseconds_schema) as writer: writer.write_table(arrow_table) # add the parquet files as data files tbl.add_files(file_paths=file_paths) print(tbl.scan().to_arrow()) ``` I think we can force the cast to be unsafe: ```python return values.cast(target_type, safe=False) ``` We might want to check if we only apply this when doing the nanos to micros. I'm not sure what will happen when we do other lossy conversions. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org