dramaticlly commented on code in PR #12006: URL: https://github.com/apache/iceberg/pull/12006#discussion_r1945480537
########## core/src/main/java/org/apache/iceberg/RewriteTablePathUtil.java: ########## @@ -354,7 +354,10 @@ private static RewriteResult<DataFile> writeDataFileEntry( DataFile newDataFile = DataFiles.builder(spec).copy(entry.file()).withPath(targetDataFilePath).build(); appendEntryWithFile(entry, writer, newDataFile); - result.copyPlan().add(Pair.of(sourceDataFilePath, newDataFile.location())); + // Keep non-live entry but exclude deleted data files as part of copyPlan Review Comment: Sounds good, updated as suggested for all 3 comments ########## spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java: ########## @@ -533,14 +554,77 @@ public void testExpireSnapshotBeforeRewrite() throws Exception { checkFileNum(4, 1, 2, 9, result); } + @Test + public void testRewritePathWithNonLiveEntry() throws Exception { + String location = newTableLocation(); + // first overwrite generate 1 manifest and 1 data file + // each subsequent overwrite on unpartitioned table generate 2 manifests and 1 data file + Table tableWith3Snaps = createTableWithSnapshots(location, 3, Maps.newHashMap(), "overwrite"); + + Snapshot oldest = SnapshotUtil.oldestAncestor(tableWith3Snaps); + String oldestDataFilePath = + Iterables.getOnlyElement( + tableWith3Snaps.snapshot(oldest.snapshotId()).addedDataFiles(tableWith3Snaps.io())) + .location(); + String deletedDataFilePathInTargetLocation = + String.format("%sdata/%s", targetTableLocation(), fileName(oldestDataFilePath)); + + // expire the oldest snapshot and remove oldest DataFile + ExpireSnapshots.Result expireResult = + actions().expireSnapshots(tableWith3Snaps).expireSnapshotId(oldest.snapshotId()).execute(); + assertThat(expireResult) + .as("Should deleted 1 data files in root snapshot") + .extracting( + ExpireSnapshots.Result::deletedManifestListsCount, + ExpireSnapshots.Result::deletedManifestsCount, + ExpireSnapshots.Result::deletedDataFilesCount) + .contains(1L, 1L, 1L); + + RewriteTablePath.Result result = + actions() + .rewriteTablePath(tableWith3Snaps) + .stagingLocation(stagingLocation()) + .rewriteLocationPrefix(tableWith3Snaps.location(), targetTableLocation()) + .execute(); + + // 5 version files include 1 table creation 3 overwrite and 1 snapshot expiration + // 3 overwrites generate 3 manifest list and 5 manifests with 3 data files + // snapshot expiration removed 1 of each + checkFileNum(5, 2, 4, 13, result); + + // copy the metadata files and data files + copyTableFiles(result); + + // expect deleted data file is excluded from rewrite and copy + List<String> copiedDataFiles = + spark + .read() + .format("iceberg") + .load(targetTableLocation() + "#all_files") + .select("file_path") + .as(Encoders.STRING()) + .collectAsList(); + assertThat(copiedDataFiles).hasSize(2).doesNotContain(deletedDataFilePathInTargetLocation); + + // expect manifest entries still contain deleted entry + List<String> copiedEntries = + spark + .read() + .format("iceberg") + .load(targetTableLocation() + "#all_entries") + .filter("status == 2") + .select("data_file.file_path") + .as(Encoders.STRING()) + .collectAsList(); + assertThat(copiedEntries).contains(deletedDataFilePathInTargetLocation); + } + @Test public void testStartSnapshotWithoutValidSnapshot() throws Exception { // expire one snapshot actions().expireSnapshots(table).expireSnapshotId(table.currentSnapshot().parentId()).execute(); - assertThat(((List) table.snapshots()).size()) - .withFailMessage("1 out 2 snapshot has been removed") - .isEqualTo(1); + assertThat(table.snapshots()).withFailMessage("1 out 2 snapshot has been removed").hasSize(1); Review Comment: Sounds good, I think this is unrelated but IDE highlight the potential improvement and I cant resist apply it. Removed redundant message as suggested -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org