Fokko commented on code in PR #921:
URL: https://github.com/apache/iceberg-python/pull/921#discussion_r1679424345
##########
tests/integration/test_add_files.py:
##########
@@ -617,3 +607,92 @@ def test_add_files_with_timestamp_tz_ns_fails(session_catalog: Catalog, format_v
             ),
         ):
             tbl.add_files(file_paths=[file_path])
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_table_write_schema_with_valid_nullability_diff(
+    spark: SparkSession, session_catalog: Catalog, format_version: int
+) -> None:
+    identifier = f"default.test_table_write_with_valid_nullability_diff{format_version}"
+    table_schema = Schema(
+        NestedField(field_id=1, name="long", field_type=LongType(), required=False),
+    )
+    other_schema = pa.schema((
+        pa.field("long", pa.int64(), nullable=False),  # can support writing required pyarrow field to optional Iceberg field
+    ))
+    arrow_table = pa.Table.from_pydict(
+        {
+            "long": [1, 9],
+        },
+        schema=other_schema,
+    )
+    tbl = _create_table(session_catalog, identifier, format_version, schema=table_schema)
+
+    file_path = f"s3://warehouse/default/test_valid_nullability_diff/v{format_version}/test.parquet"
+    # write parquet files
+    fo = tbl.io.new_output(file_path)
+    with fo.create(overwrite=True) as fos:
+        with pq.ParquetWriter(fos, schema=other_schema) as writer:
+            writer.write_table(arrow_table)
+
+    tbl.add_files(file_paths=[file_path])
+    # table's long field should cast to be optional on read
+    written_arrow_table = tbl.scan().to_arrow()
+    assert written_arrow_table == arrow_table.cast(pa.schema((pa.field("long", pa.int64(), nullable=True),)))
+    lhs = spark.table(f"{identifier}").toPandas()
+    rhs = written_arrow_table.to_pandas()
+
+    for column in written_arrow_table.column_names:
+        for left, right in zip(lhs[column].to_list(), rhs[column].to_list()):
+            assert left == right
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_table_write_schema_with_valid_upcast(
+    spark: SparkSession,
+    session_catalog: Catalog,
+    format_version: int,
+    table_schema_with_promoted_types: Schema,
+    pyarrow_schema_with_promoted_types: pa.Schema,
+    pyarrow_table_with_promoted_types: pa.Table,
+) -> None:
+    identifier = f"default.test_table_write_with_valid_upcast{format_version}"
+    tbl = _create_table(session_catalog, identifier, format_version, schema=table_schema_with_promoted_types)
+
+    file_path = f"s3://warehouse/default/test_valid_nullability_diff/v{format_version}/test.parquet"

Review Comment:
   ```suggestion
       file_path = f"s3://warehouse/default/test_table_write_schema_with_valid_upcast/v{format_version}/test.parquet"
   ```



##########
tests/integration/test_add_files.py:
##########
@@ -617,3 +607,92 @@ def test_add_files_with_timestamp_tz_ns_fails(session_catalog: Catalog, format_v
             ),
         ):
             tbl.add_files(file_paths=[file_path])
+
+
+@pytest.mark.integration
+@pytest.mark.parametrize("format_version", [1, 2])
+def test_table_write_schema_with_valid_nullability_diff(
+    spark: SparkSession, session_catalog: Catalog, format_version: int
+) -> None:
+    identifier = f"default.test_table_write_with_valid_nullability_diff{format_version}"
+    table_schema = Schema(
+        NestedField(field_id=1, name="long", field_type=LongType(), required=False),
+    )
+    other_schema = pa.schema((
+        pa.field("long", pa.int64(), nullable=False),  # can support writing required pyarrow field to optional Iceberg field
+    ))
+    arrow_table = pa.Table.from_pydict(
+        {
+            "long": [1, 9],
+        },
+        schema=other_schema,
+    )
+    tbl = _create_table(session_catalog, identifier, format_version, schema=table_schema)
+
+    file_path = f"s3://warehouse/default/test_valid_nullability_diff/v{format_version}/test.parquet"

Review Comment:
   ```suggestion
       file_path = f"s3://warehouse/default/test_table_write_schema_with_valid_nullability_diff/v{format_version}/test.parquet"
   ```

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org