kevinjqliu commented on code in PR #1879:
URL: https://github.com/apache/iceberg-python/pull/1879#discussion_r2040215134
##########
tests/integration/test_deletes.py:
##########
@@ -467,21 +467,19 @@ def test_partitioned_table_positional_deletes_sequence_number(spark: SparkSessio
         assert snapshots[2].summary == Summary(
             Operation.OVERWRITE,

Review Comment:
maybe something in spark is off... the `total-records` here should be `3` based on the assert right below

##########
tests/integration/test_deletes.py:
##########
@@ -467,21 +467,19 @@ def test_partitioned_table_positional_deletes_sequence_number(spark: SparkSessio
         assert snapshots[2].summary == Summary(
             Operation.OVERWRITE,

Review Comment:
hm, i used spark to perform the same partial overwrite,
```
run_spark_commands(
    spark,
    [
        f"""
        ALTER TABLE {identifier} SET TBLPROPERTIES (
            'write.delete.mode' = 'copy-on-write'
        );
        """,
        # Replace data file
        f"""
        DELETE FROM {identifier} WHERE number = 201
        """,
    ],
)
```
and this is the snapshot summary
```
{
    "added-data-files": "1",
    "added-files-size": "763",
    "added-records": "2",
    "changed-partition-count": "1",
    "deleted-data-files": "1",
    "deleted-records": "3",
    "removed-files-size": "747",
    "total-data-files": "2",
    "total-delete-files": "1",
    "total-equality-deletes": "0",
    "total-files-size": "3214",
    "total-position-deletes": "1",
    "total-records": "4",
}
```
which is the same as above...

##########
tests/integration/test_deletes.py:
##########
@@ -467,21 +467,19 @@ def test_partitioned_table_positional_deletes_sequence_number(spark: SparkSessio
         assert snapshots[2].summary == Summary(
             Operation.OVERWRITE,

Review Comment:
this snapshot summary seems wrong
```
{
    'added-data-files': '1',
    'added-files-size': '949',
    'added-records': '2',           # ? this op only deleted 1 record
    'changed-partition-count': '1',
    'deleted-data-files': '1',
    'deleted-records': '3',         # ? this op only deleted 1 record
    'removed-files-size': '747',
    'total-data-files': '2',
    'total-delete-files': '1',      # ? should be no delete files
    'total-equality-deletes': '0',
    'total-files-size': '3402',
    'total-position-deletes': '1',  # ? should be no delete files
    'total-records': '4'            # ? should only have 3 records (see below assert)
}
```
`tbl.delete(EqualTo("number", 201))` is COW; it replaces the original data file for the `20` partition with a new data file containing one fewer record

##########
tests/integration/test_deletes.py:
##########
@@ -467,21 +467,19 @@ def test_partitioned_table_positional_deletes_sequence_number(spark: SparkSessio
         assert snapshots[2].summary == Summary(

Review Comment:
i ran this locally. snapshot 0 has the following metadata, which is correct
```
{
    'added-data-files': '2',
    'added-files-size': '1490',
    'added-records': '5',
    'changed-partition-count': '2',
    'total-data-files': '2',
    'total-delete-files': '0',
    'total-equality-deletes': '0',
    'total-files-size': '1490',
    'total-position-deletes': '0',
    'total-records': '5'
}
```
but snapshot 1, the DELETE op with the positional delete, has the following metadata,
```
{
    'added-delete-files': '1',
    'added-files-size': '1710',
    'added-position-delete-files': '1',
    'added-position-deletes': '1',
    'changed-partition-count': '1',
    'total-data-files': '2',
    'total-delete-files': '1',
    'total-equality-deletes': '0',
    'total-files-size': '3200',
    'total-position-deletes': '1',
    'total-records': '5'
}
```
everything looks right except for `total-records`. we started off with 5 records, and the DELETE op removed 1 record, so `total-records` should be `4` here. Is this a bug in the spark snapshot summary?
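For reference, a minimal PyIceberg sketch of the copy-on-write delete under discussion; the catalog name and table identifier below are placeholders rather than values from the PR, while `Table.delete` and `EqualTo` are the same calls the test uses:

```python
from pyiceberg.catalog import load_catalog
from pyiceberg.expressions import EqualTo

# Placeholder catalog and table names -- substitute your own.
catalog = load_catalog("default")
tbl = catalog.load_table("default.table_partitioned_delete_sequence_number")

# Copy-on-write delete: PyIceberg rewrites the data file that holds the
# matching row instead of adding a positional delete file.
tbl.delete(EqualTo("number", 201))

# Inspect the resulting snapshot summary; per the discussion above,
# `total-records` should drop by one relative to the previous snapshot,
# and no new delete files should appear.
print(tbl.current_snapshot().summary)
```

Comparing this summary with the one Spark produces for the equivalent `DELETE FROM` is exactly the cross-check the comments above perform.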
##########
tests/integration/test_writes/test_writes.py:
##########
@@ -262,6 +262,100 @@ def test_summaries(spark: SparkSession, session_catalog: Catalog, arrow_table_wi
 }
+
+@pytest.mark.integration
+def test_summaries_partial_overwrite(spark: SparkSession, session_catalog: Catalog) -> None:
+    identifier = "default.test_summaries_partial_overwrite"
+    TEST_DATA = {
+        "id": [1, 2, 3, 1, 1],
+        "name": ["AB", "CD", "EF", "CD", "EF"],
+    }
+    pa_schema = pa.schema(
+        [
+            pa.field("id", pa.int32()),
+            pa.field("name", pa.string()),
+        ]
+    )
+    arrow_table = pa.Table.from_pydict(TEST_DATA, schema=pa_schema)
+    tbl = _create_table(session_catalog, identifier, {"format-version": "2"}, schema=pa_schema)
+    with tbl.update_spec() as txn:
+        txn.add_identity("id")
+    tbl.append(arrow_table)
+
+    # TODO: We might want to check why this ends up in 3 files
+    assert len(tbl.inspect.data_files()) == 3

Review Comment:
it's in 3 files because there are 3 unique `id` values and we partition by `id`, right?
```suggestion
    assert len(tbl.inspect.data_files()) == 3
```
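To make the file-count reasoning concrete, here is a minimal sketch of why identity-partitioning on `id` turns the 5-row append into 3 data files; the table name and catalog config are hypothetical, while `update_spec`, `add_identity`, `append`, and `inspect.data_files` are the same PyIceberg calls the test exercises:

```python
import pyarrow as pa
from pyiceberg.catalog import load_catalog

catalog = load_catalog("default")  # placeholder catalog config
pa_schema = pa.schema([pa.field("id", pa.int32()), pa.field("name", pa.string())])
arrow_table = pa.Table.from_pydict(
    {"id": [1, 2, 3, 1, 1], "name": ["AB", "CD", "EF", "CD", "EF"]},
    schema=pa_schema,
)

# Hypothetical table name for illustration only.
tbl = catalog.create_table("default.partition_file_count_demo", schema=pa_schema)
with tbl.update_spec() as update:
    update.add_identity("id")  # identity-partition on `id`, as in the test
tbl.append(arrow_table)

# The writer fans rows out by partition value: ids {1, 2, 3} -> 3 partitions
# -> 3 data files, which is exactly what the assert in the test checks.
assert len(tbl.inspect.data_files()) == 3
```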