kevinjqliu commented on code in PR #1561: URL: https://github.com/apache/iceberg-python/pull/1561#discussion_r1938122584
########## tests/table/test_init.py: ########## @@ -793,6 +794,40 @@ def test_update_metadata_set_snapshot_ref(table_v2: Table) -> None: ) +def test_update_remove_snapshots(table_v2: Table) -> None: + # assert fixture data to easily understand the test assumptions + assert len(table_v2.metadata.snapshots) == 2 + assert len(table_v2.metadata.snapshot_log) == 2 + assert len(table_v2.metadata.refs) == 2 + update = RemoveSnapshotsUpdate(snapshot_ids=[3051729675574597004]) Review Comment: nit can you make `3051729675574597004` a constant for readability? ########## pyiceberg/table/update/__init__.py: ########## @@ -455,6 +456,61 @@ def _(update: SetSnapshotRefUpdate, base_metadata: TableMetadata, context: _Tabl return base_metadata.model_copy(update=metadata_updates) +@_apply_table_update.register(RemoveSnapshotsUpdate) +def _(update: RemoveSnapshotsUpdate, base_metadata: TableMetadata, context: _TableMetadataUpdateContext) -> TableMetadata: + for remove_snapshot_id in update.snapshot_ids: + if remove_snapshot_id == base_metadata.current_snapshot_id: + raise ValueError(f"Can't remove current snapshot id {remove_snapshot_id}") + if not any(s.snapshot_id == remove_snapshot_id for s in base_metadata.snapshots): + raise ValueError(f"Snapshot with snapshot id {remove_snapshot_id} does not exist: {base_metadata.snapshots}") + + snapshots = [ + (s.model_copy(update={"parent_snapshot_id": None}) if s.parent_snapshot_id in update.snapshot_ids else s) + for s in base_metadata.snapshots + if s.snapshot_id not in update.snapshot_ids + ] + snapshot_log = [ + snapshot_log_entry + for snapshot_log_entry in base_metadata.snapshot_log + if snapshot_log_entry.snapshot_id not in update.snapshot_ids + ] + + remove_ref_updates = ( + RemoveSnapshotRefUpdate(ref_name=ref_name) + for ref_name, ref in base_metadata.refs.items() + if ref.snapshot_id in update.snapshot_ids + ) + remove_statistics_updates = ( + RemoveStatisticsUpdate(statistics_file.snapshot_id) + for statistics_file in base_metadata.statistics + if statistics_file.snapshot_id in update.snapshot_ids + ) + updates = itertools.chain(remove_ref_updates, remove_statistics_updates) + new_metadata = base_metadata + for upd in updates: + new_metadata = _apply_table_update(upd, new_metadata, context) Review Comment: nit: im not sure if its a good idea to recursively call `_apply_table_update` here. ########## tests/table/test_init.py: ########## @@ -793,6 +794,40 @@ def test_update_metadata_set_snapshot_ref(table_v2: Table) -> None: ) +def test_update_remove_snapshots(table_v2: Table) -> None: + # assert fixture data to easily understand the test assumptions + assert len(table_v2.metadata.snapshots) == 2 + assert len(table_v2.metadata.snapshot_log) == 2 + assert len(table_v2.metadata.refs) == 2 + update = RemoveSnapshotsUpdate(snapshot_ids=[3051729675574597004]) + new_metadata = update_table_metadata(table_v2.metadata, (update,)) + assert len(new_metadata.snapshots) == 1 + assert new_metadata.snapshots[0].snapshot_id == 3055729675574597004 + assert new_metadata.snapshots[0].parent_snapshot_id is None + assert new_metadata.current_snapshot_id == 3055729675574597004 + assert new_metadata.last_updated_ms > table_v2.metadata.last_updated_ms + assert len(new_metadata.snapshot_log) == 1 + assert new_metadata.snapshot_log[0].snapshot_id == 3055729675574597004 + assert len(new_metadata.refs) == 1 + assert new_metadata.refs["main"].snapshot_id == 3055729675574597004 + + +def test_update_remove_snapshots_doesnt_exist(table_v2: Table) -> None: + update = RemoveSnapshotsUpdate( + snapshot_ids=[123], + ) + with pytest.raises(ValueError, match="Snapshot with snapshot id 123 does not exist"): + update_table_metadata(table_v2.metadata, (update,)) + + +def test_update_remove_snapshots_cant_remove_current_snapshot_id(table_v2: Table) -> None: + update = RemoveSnapshotsUpdate( + snapshot_ids=[3055729675574597004], + ) + with pytest.raises(ValueError, match="Can't remove current snapshot id 3055729675574597004"): + update_table_metadata(table_v2.metadata, (update,)) + + Review Comment: lets also add some tests for `RemoveSnapshotRefUpdate` ########## pyiceberg/table/update/__init__.py: ########## @@ -455,6 +455,19 @@ def _(update: SetSnapshotRefUpdate, base_metadata: TableMetadata, context: _Tabl return base_metadata.model_copy(update=metadata_updates) +@_apply_table_update.register(RemoveSnapshotsUpdate) +def _(update: RemoveSnapshotsUpdate, base_metadata: TableMetadata, context: _TableMetadataUpdateContext) -> TableMetadata: + for remove_snapshot_id in update.snapshot_ids: + if remove_snapshot_id == base_metadata.current_snapshot_id: + raise ValueError(f"Can't remove current snapshot id {remove_snapshot_id}") Review Comment: > it's not clear what should happen if you try to remove the current snapshot. im looking at the java implementation for answers, i think you can just remove the current snapshot... because you can have an empty table with no snapshots -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org