jayceslesar commented on code in PR #1958: URL: https://github.com/apache/iceberg-python/pull/1958#discussion_r2072436539
########## pyiceberg/table/inspect.py: ########## @@ -657,3 +665,37 @@ def all_manifests(self) -> "pa.Table": lambda args: self._generate_manifests_table(*args), [(snapshot, True) for snapshot in snapshots] ) return pa.concat_tables(manifests_by_snapshots) + + def orphaned_files(self, location: str, older_than: Optional[timedelta] = timedelta(days=3)) -> Set[str]: + try: + import pyarrow as pa # noqa: F401 + except ModuleNotFoundError as e: + raise ModuleNotFoundError("For deleting orphaned files PyArrow needs to be installed") from e + + from pyarrow.fs import FileSelector, FileType + + from pyiceberg.io.pyarrow import _fs_from_file_path + + all_known_files = set() + snapshots = self.tbl.snapshots() + manifests_paths = self.all_manifests(snapshots)["path"].to_pylist() + all_known_files.update(manifests_paths) + + executor = ExecutorFactory.get_or_create() + files_by_snapshots: Iterator[Set[str]] = executor.map( + lambda snapshot_id: set(self.files(snapshot_id)["file_path"].to_pylist()) + ) + datafile_paths: set[str] = reduce(set.union, files_by_snapshots, set()) + all_known_files.update(datafile_paths) + + fs = _fs_from_file_path(self.tbl.io, location) + + _, _, path = _parse_location(location) + selector = FileSelector(path, recursive=True) + # filter to just files as it may return directories, and filter on time + as_of = datetime.now(timezone.utc) - older_than if older_than else None + all_files = [f for f in fs.get_file_info(selector) if f.type == FileType.File and (as_of is None or (f.mtime < as_of))] + + orphaned_files = set(all_files).difference(all_known_files) Review Comment: ah good catch this happened in a little refactor, just need to call f.path -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org