Fokko commented on code in PR #2050: URL: https://github.com/apache/iceberg-python/pull/2050#discussion_r2164616244
########## pyiceberg/table/update/validate.py: ########## @@ -150,3 +151,54 @@ def _validate_deleted_data_files( if any(conflicting_entries): conflicting_snapshots = {entry.snapshot_id for entry in conflicting_entries} raise ValidationException(f"Deleted data files were found matching the filter for snapshots {conflicting_snapshots}!") + + +def _added_data_files( + table: Table, + starting_snapshot: Snapshot, + data_filter: Optional[BooleanExpression], + partition_set: Optional[dict[int, set[Record]]], + parent_snapshot: Optional[Snapshot], +) -> Iterator[ManifestEntry]: + if parent_snapshot is None: Review Comment: I don't think this `if` makes sense, why not make the `parent_snapshot` non-optional? ########## pyiceberg/table/update/validate.py: ########## @@ -77,6 +79,47 @@ def _validation_history( return manifests_files, snapshots +def _filter_manifest_entries( + entry: ManifestEntry, + snapshot_ids: set[int], + data_filter: Optional[BooleanExpression], + partition_set: Optional[dict[int, set[Record]]], + entry_status: Optional[ManifestEntryStatus], + schema: Schema, +) -> bool: + """Filter manifest entries based on data filter and partition set. + + Args: + entry: Manifest entry to filter + snapshot_ids: set of snapshot ids to match data files + data_filter: Optional filter to match data files + partition_set: Optional set of partitions to match data files + status: Optional status to match data files Review Comment: ```suggestion entry_status: Optional status to match data files ``` ########## pyiceberg/table/update/validate.py: ########## @@ -77,6 +79,47 @@ def _validation_history( return manifests_files, snapshots +def _filter_manifest_entries( + entry: ManifestEntry, + snapshot_ids: set[int], + data_filter: Optional[BooleanExpression], + partition_set: Optional[dict[int, set[Record]]], + entry_status: Optional[ManifestEntryStatus], + schema: Schema, +) -> bool: + """Filter manifest entries based on data filter and partition set. + + Args: + entry: Manifest entry to filter + snapshot_ids: set of snapshot ids to match data files + data_filter: Optional filter to match data files + partition_set: Optional set of partitions to match data files + status: Optional status to match data files + table: Table containing the schema for filtering Review Comment: ```suggestion schema: schema for filtering ``` ########## pyiceberg/table/update/validate.py: ########## @@ -108,27 +151,12 @@ def _deleted_data_files( ManifestContent.DATA, ) - if data_filter is not None: - evaluator = _InclusiveMetricsEvaluator(table.schema(), data_filter).eval - for manifest in manifests: for entry in manifest.fetch_manifest_entry(table.io, discard_deleted=False): - if entry.snapshot_id not in snapshot_ids: - continue - - if entry.status != ManifestEntryStatus.DELETED: - continue - - if data_filter is not None and evaluator(entry.data_file) is ROWS_CANNOT_MATCH: - continue - - if partition_set is not None: - spec_id = entry.data_file.spec_id - partition = entry.data_file.partition - if spec_id not in partition_set or partition not in partition_set[spec_id]: - continue - - yield entry + if _filter_manifest_entries( + entry, snapshot_ids, data_filter, partition_set, ManifestEntryStatus.DELETED, table.schema() + ): + yield entry Review Comment: Ignore this comment, but a missed opportunity for some syntactic sugar: ```python for manifest in manifests: yield from ( entry for entry in manifest.fetch_manifest_entry(table.io, discard_deleted=False) if _filter_manifest_entries( entry, snapshot_ids, data_filter, partition_set, ManifestEntryStatus.DELETED, table.schema() ) ) ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org