Fokko commented on code in PR #921:
URL: https://github.com/apache/iceberg-python/pull/921#discussion_r1679424345
##########
tests/integration/test_add_files.py:
##########
@@ -617,3 +607,92 @@ def test_add_files_with_timestamp_tz_ns_fails(session_catalog: Catalog, format_v
             ),
         ):
             tbl.add_files(file_paths=[file_path])
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_table_write_schema_with_valid_nullability_diff(
+    spark: SparkSession, session_catalog: Catalog, format_version: int
+) -> None:
+    identifier = f"default.test_table_write_with_valid_nullability_diff{format_version}"
+    table_schema = Schema(
+        NestedField(field_id=1, name="long", field_type=LongType(), required=False),
+    )
+    other_schema = pa.schema((
+        pa.field("long", pa.int64(), nullable=False),  # can support writing required pyarrow field to optional Iceberg field
+    ))
+    arrow_table = pa.Table.from_pydict(
+        {
+            "long": [1, 9],
+        },
+        schema=other_schema,
+    )
+    tbl = _create_table(session_catalog, identifier, format_version, schema=table_schema)
+
+    file_path = f"s3://warehouse/default/test_valid_nullability_diff/v{format_version}/test.parquet"
+    # write parquet files
+    fo = tbl.io.new_output(file_path)
+    with fo.create(overwrite=True) as fos:
+        with pq.ParquetWriter(fos, schema=other_schema) as writer:
+            writer.write_table(arrow_table)
+
+    tbl.add_files(file_paths=[file_path])
+    # table's long field should cast to be optional on read
+    written_arrow_table = tbl.scan().to_arrow()
+    assert written_arrow_table == arrow_table.cast(pa.schema((pa.field("long", pa.int64(), nullable=True),)))
+    lhs = spark.table(f"{identifier}").toPandas()
+    rhs = written_arrow_table.to_pandas()
+
+    for column in written_arrow_table.column_names:
+        for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
+            assert left == right
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_table_write_schema_with_valid_upcast(
+    spark: SparkSession,
+    session_catalog: Catalog,
+    format_version: int,
+    table_schema_with_promoted_types: Schema,
+    pyarrow_schema_with_promoted_types: pa.Schema,
+    pyarrow_table_with_promoted_types: pa.Table,
+) -> None:
+    identifier = f"default.test_table_write_with_valid_upcast{format_version}"
+    tbl = _create_table(session_catalog, identifier, format_version, schema=table_schema_with_promoted_types)
+
+    file_path = f"s3://warehouse/default/test_valid_nullability_diff/v{format_version}/test.parquet"

Review Comment:
   ```suggestion
       file_path = f"s3://warehouse/default/test_table_write_schema_with_valid_upcast/v{format_version}/test.parquet"
   ```



##########
tests/integration/test_add_files.py:
##########
@@ -617,3 +607,92 @@ def test_add_files_with_timestamp_tz_ns_fails(session_catalog: Catalog, format_v
             ),
         ):
             tbl.add_files(file_paths=[file_path])
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_table_write_schema_with_valid_nullability_diff(
+    spark: SparkSession, session_catalog: Catalog, format_version: int
+) -> None:
+    identifier = f"default.test_table_write_with_valid_nullability_diff{format_version}"
+    table_schema = Schema(
+        NestedField(field_id=1, name="long", field_type=LongType(), required=False),
+    )
+    other_schema = pa.schema((
+        pa.field("long", pa.int64(), nullable=False),  # can support writing required pyarrow field to optional Iceberg field
+    ))
+    arrow_table = pa.Table.from_pydict(
+        {
+            "long": [1, 9],
+        },
+        schema=other_schema,
+    )
+    tbl = _create_table(session_catalog, identifier, format_version, schema=table_schema)
+
+    file_path = f"s3://warehouse/default/test_valid_nullability_diff/v{format_version}/test.parquet"

Review Comment:
   ```suggestion
       file_path = f"s3://warehouse/default/test_table_write_schema_with_valid_nullability_diff/v{format_version}/test.parquet"
   ```

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org