ismailsimsek commented on code in PR #11906: URL: https://github.com/apache/iceberg/pull/11906#discussion_r1914531213
########## spark/v3.5/spark/src/main/java/org/apache/iceberg/spark/actions/DeleteOrphanFilesSparkAction.java: ########## @@ -292,19 +294,49 @@ private Dataset<FileURI> validFileIdentDS() { private Dataset<FileURI> actualFileIdentDS() { StringToFileURI toFileURI = new StringToFileURI(equalSchemes, equalAuthorities); + Dataset<String> dataList; if (compareToFileList == null) { - return toFileURI.apply(listedFileDS()); + dataList = + table.io() instanceof SupportsPrefixOperations ? listWithPrefix() : listWithoutPrefix(); } else { - return toFileURI.apply(filteredCompareToFileList()); + dataList = filteredCompareToFileList(); } + + return toFileURI.apply(dataList); + } + + @VisibleForTesting + Dataset<String> listWithPrefix() { + List<String> matchingFiles = Lists.newArrayList(); + // listPrefix only returns files. so we additionally need to check parent folders for each file + // in following example file itself is not filtered out, + // but it should be excluded due to its parent folder: `_c2_trunc` + // "/data/_c2_trunc/file.txt" + PathFilter pathFilter = PartitionAwareHiddenPathFilter.forSpecs(table.specs(), true); + + Iterator<org.apache.iceberg.io.FileInfo> iterator = + ((SupportsPrefixOperations) table.io()).listPrefix(location).iterator(); + while (iterator.hasNext()) { + org.apache.iceberg.io.FileInfo fileInfo = iterator.next(); + // NOTE: check the path relative to table location. To avoid checking un necessary root + // folders + Path relativeFilePath = new Path(fileInfo.location().replace(location, "")); Review Comment: This creates a relative path so that the parent folders of the table location itself are not checked. However, `replace(location, "")` might not be the best solution, because it removes every occurrence of `location` in the string rather than only the leading prefix. Open to any ideas. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org