kevinjqliu commented on code in PR #1958: URL: https://github.com/apache/iceberg-python/pull/1958#discussion_r2072493113
########## pyiceberg/table/__init__.py: ########## @@ -1371,6 +1376,28 @@ def to_polars(self) -> pl.LazyFrame: return pl.scan_iceberg(self) + def delete_orphaned_files(self, older_than: Optional[timedelta] = timedelta(days=3), dry_run: bool = False) -> None: + """Delete orphaned files in the table.""" + location = self.location() + orphaned_files = self.inspect.orphaned_files(location, older_than) + logger.info(f"Found {len(orphaned_files)} orphaned files at {location}!") + + def _delete(file: str) -> None: + # don't error if the file doesn't exist + # still catch ctrl-c, etc. + with contextlib.suppress(Exception): + self.io.delete(file) + + if orphaned_files: + if dry_run: + logger.info(f"(Dry Run) Deleted {len(orphaned_files)} orphaned files at {location}!") + else: + executor = ExecutorFactory.get_or_create() + deletes = executor.map(_delete, orphaned_files) + # exhaust + list(deletes) + logger.info(f"Deleted {len(orphaned_files)} orphaned files at {location}!") Review Comment: The Spark procedure outputs `orphan_file_location`, which lists all the files set to be deleted. This is pretty useful for logging; see https://iceberg.apache.org/docs/nightly/spark-procedures/#output_7 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org