Fokko commented on code in PR #1938: URL: https://github.com/apache/iceberg-python/pull/1938#discussion_r2098728278
########## pyiceberg/table/update/validate.py: ########## @@ -69,3 +75,78 @@ def validation_history( raise ValidationException("No matching snapshot found.") return manifests_files, snapshots + + +def _deleted_data_files( + table: Table, + starting_snapshot: Snapshot, + data_filter: Optional[BooleanExpression], + partition_set: Optional[dict[int, set[Record]]], + parent_snapshot: Optional[Snapshot], +) -> Iterator[ManifestEntry]: + """Find deleted data files matching a filter since a starting snapshot. + + Args: + table: Table to validate + starting_snapshot: Snapshot current at the start of the operation + data_filter: Expression used to find deleted data files + partition_set: dict of {spec_id: set[partition]} to filter on + parent_snapshot: Ending snapshot on the branch being validated + + Returns: + List of conflicting manifest-entries + """ + # if there is no current table state, no files have been deleted + if parent_snapshot is None: + return + + manifests, snapshot_ids = validation_history( + table, + parent_snapshot, + starting_snapshot, + VALIDATE_DATA_FILES_EXIST_OPERATIONS, + ManifestContent.DATA, + ) + + if data_filter is not None: + evaluator = _InclusiveMetricsEvaluator(table.schema(), data_filter).eval + + for manifest in manifests: + for entry in manifest.fetch_manifest_entry(table.io, discard_deleted=False): + if entry.snapshot_id not in snapshot_ids: + continue + + if entry.status != ManifestEntryStatus.DELETED: + continue + + if data_filter is not None and evaluator(entry.data_file) is ROWS_CANNOT_MATCH: + continue + + if partition_set is not None: + spec_id = entry.data_file.spec_id + partition = entry.data_file.partition + if spec_id not in partition_set or partition not in partition_set[spec_id]: + continue + + yield entry + + +def _validate_deleted_data_files( + table: Table, + starting_snapshot: Snapshot, + data_filter: Optional[BooleanExpression], + parent_snapshot: Snapshot, +) -> None: + """Validate that no files matching a filter have been deleted from the table since a starting snapshot. + + Args: + table: Table to validate + starting_snapshot: Snapshot current at the start of the operation + data_filter: Expression used to find deleted data files + parent_snapshot: Ending snapshot on the branch being validated + + """ + conflicting_entries = _deleted_data_files(table, starting_snapshot, data_filter, None, parent_snapshot) Review Comment: nit, I think style wise it is more elegant to switch over to keyword-arguments: ```suggestion conflicting_entries = _deleted_data_files(table, starting_snapshot, data_filter, parent_snapshot=parent_snapshot) ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org