kevinjqliu commented on code in PR #1607: URL: https://github.com/apache/iceberg-python/pull/1607#discussion_r1948155351
########## pyiceberg/table/__init__.py: ########## @@ -1179,6 +1182,11 @@ def refs(self) -> Dict[str, SnapshotRef]: def _do_commit(self, updates: Tuple[TableUpdate, ...], requirements: Tuple[TableRequirement, ...]) -> None: response = self.catalog.commit_table(self, requirements, updates) + + # https://github.com/apache/iceberg/blob/f6faa58/core/src/main/java/org/apache/iceberg/CatalogUtil.java#L527 + # delete old metadata if METADATA_DELETE_AFTER_COMMIT_ENABLED is set to true + self.catalog._delete_old_metadata(self.io, self.metadata, response.metadata) Review Comment: similar to https://github.com/apache/iceberg/blob/f6faa58dac57e03be6e02a43937ac7c15c770225/core/src/main/java/org/apache/iceberg/CatalogUtil.java#L539-L544 can we add a comment here explaining how `METADATA_PREVIOUS_VERSIONS_MAX` is taken into account? https://github.com/apache/iceberg-python/blob/826a00628216993de50f7f0c2111b6c824b02939/pyiceberg/table/update/__init__.py#L576 ########## pyiceberg/table/__init__.py: ########## @@ -1179,6 +1182,11 @@ def refs(self) -> Dict[str, SnapshotRef]: def _do_commit(self, updates: Tuple[TableUpdate, ...], requirements: Tuple[TableRequirement, ...]) -> None: response = self.catalog.commit_table(self, requirements, updates) + + # https://github.com/apache/iceberg/blob/f6faa58/core/src/main/java/org/apache/iceberg/CatalogUtil.java#L527 + # delete old metadata if METADATA_DELETE_AFTER_COMMIT_ENABLED is set to true + self.catalog._delete_old_metadata(self.io, self.metadata, response.metadata) Review Comment: also maybe we want to wrap this in try/catch and throw a warning as to not block the commit process ########## tests/catalog/test_sql.py: ########## @@ -1613,3 +1614,56 @@ def test_merge_manifests_local_file_system(catalog: SqlCatalog, arrow_table_with tbl.append(arrow_table_with_null) assert len(tbl.scan().to_arrow()) == 5 * len(arrow_table_with_null) + + +@pytest.mark.parametrize( + "catalog", + [ + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), + ], +) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + ], +) +def test_delete_metadata_multiple(catalog: SqlCatalog, table_schema_nested: Schema, table_identifier: Identifier) -> None: + namespace = Catalog.namespace_from(table_identifier) + catalog.create_namespace(namespace) + table = catalog.create_table(table_identifier, table_schema_nested) + + original_metadata_location = table.metadata_location + + for i in range(5): + with table.transaction() as transaction: + with transaction.update_schema() as update: + update.add_column(path=f"new_column_{i}", field_type=IntegerType()) + + assert len(table.metadata.metadata_log) == 5 + assert os.path.exists(original_metadata_location[len("file://") :]) + + # Set the max versions property to 2, and delete after commit + new_property = { + TableProperties.METADATA_PREVIOUS_VERSIONS_MAX: "2", + TableProperties.METADATA_DELETE_AFTER_COMMIT_ENABLED: "true", + } + + with table.transaction() as transaction: + transaction.set_properties(properties=new_property) + + # Verify that only the most recent metadata files are kept + assert len(table.metadata.metadata_log) == 2 + updated_metadata_1, updated_metadata_2 = table.metadata.metadata_log + + with table.transaction() as transaction: Review Comment: nit: add a comment to mention that a new metadata log was added so the first one is removed ########## tests/catalog/test_sql.py: ########## @@ -1613,3 +1614,56 @@ def test_merge_manifests_local_file_system(catalog: SqlCatalog, arrow_table_with tbl.append(arrow_table_with_null) assert len(tbl.scan().to_arrow()) == 5 * len(arrow_table_with_null) + + +@pytest.mark.parametrize( + "catalog", + [ + lazy_fixture("catalog_memory"), + lazy_fixture("catalog_sqlite"), + lazy_fixture("catalog_sqlite_without_rowcount"), + ], +) +@pytest.mark.parametrize( + "table_identifier", + [ + lazy_fixture("random_table_identifier"), + lazy_fixture("random_hierarchical_identifier"), + ], +) Review Comment: nit: we dont really need this part for testing metadata log deletion ########## pyiceberg/catalog/__init__.py: ########## @@ -858,6 +875,7 @@ def _update_and_stage_table( enforce_validation=current_table is None, metadata_location=current_table.metadata_location if current_table else None, ) + io = self._load_file_io(properties=updated_metadata.properties, location=updated_metadata.location) Review Comment: should we revert this change since its unrelated to this PR? is `updated_metadata.location` the same as `new_metadata_location`? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org