szehon-ho commented on code in PR #12006:
URL: https://github.com/apache/iceberg/pull/12006#discussion_r1938210083
##########
core/src/main/java/org/apache/iceberg/RewriteTablePathUtil.java:
##########
@@ -354,7 +354,10 @@ private static RewriteResult<DataFile> writeDataFileEntry(
     DataFile newDataFile =
         DataFiles.builder(spec).copy(entry.file()).withPath(targetDataFilePath).build();
     appendEntryWithFile(entry, writer, newDataFile);
-    result.copyPlan().add(Pair.of(sourceDataFilePath, newDataFile.location()));
+    // Keep non-live entry but exclude deleted data files as part of copyPlan

Review Comment:
   Nit: keep deleted data file entries but exclude them from copyPlan (same for the following few comments).

   My thought is, negative statements are always a bit harder to parse (non-live).
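   A sketch of how the suggested positive wording could read in context. The hunk above only shows the added comment line, so the isLive() guard below is an assumption about the surrounding PR code, not a quote of it:

   ```java
   // Keep deleted data file entries in the rewritten manifest, but exclude them
   // from the copy plan (their files were already removed at the source).
   DataFile newDataFile =
       DataFiles.builder(spec).copy(entry.file()).withPath(targetDataFilePath).build();
   appendEntryWithFile(entry, writer, newDataFile);
   if (entry.isLive()) { // hypothetical guard; the PR's actual condition is not shown
     result.copyPlan().add(Pair.of(sourceDataFilePath, newDataFile.location()));
   }
   ```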
##########
spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java:
##########
@@ -533,14 +554,77 @@ public void testExpireSnapshotBeforeRewrite() throws Exception {
     checkFileNum(4, 1, 2, 9, result);
   }
 
+  @Test
+  public void testRewritePathWithNonLiveEntry() throws Exception {
+    String location = newTableLocation();
+    // the first overwrite generates 1 manifest and 1 data file;
+    // each subsequent overwrite on an unpartitioned table generates 2 manifests and 1 data file
+    Table tableWith3Snaps = createTableWithSnapshots(location, 3, Maps.newHashMap(), "overwrite");
+
+    Snapshot oldest = SnapshotUtil.oldestAncestor(tableWith3Snaps);
+    String oldestDataFilePath =
+        Iterables.getOnlyElement(
+                tableWith3Snaps.snapshot(oldest.snapshotId()).addedDataFiles(tableWith3Snaps.io()))
+            .location();
+    String deletedDataFilePathInTargetLocation =
+        String.format("%sdata/%s", targetTableLocation(), fileName(oldestDataFilePath));
+
+    // expire the oldest snapshot and remove the oldest DataFile
+    ExpireSnapshots.Result expireResult =
+        actions().expireSnapshots(tableWith3Snaps).expireSnapshotId(oldest.snapshotId()).execute();
+    assertThat(expireResult)
+        .as("Should delete 1 manifest list, 1 manifest, and 1 data file")
+        .extracting(
+            ExpireSnapshots.Result::deletedManifestListsCount,
+            ExpireSnapshots.Result::deletedManifestsCount,
+            ExpireSnapshots.Result::deletedDataFilesCount)
+        .contains(1L, 1L, 1L);
+
+    RewriteTablePath.Result result =
+        actions()
+            .rewriteTablePath(tableWith3Snaps)
+            .stagingLocation(stagingLocation())
+            .rewriteLocationPrefix(tableWith3Snaps.location(), targetTableLocation())
+            .execute();
+
+    // 5 version files: 1 table creation, 3 overwrites, and 1 snapshot expiration
+    // the 3 overwrites generate 3 manifest lists and 5 manifests with 3 data files;
+    // snapshot expiration removed 1 of each
+    checkFileNum(5, 2, 4, 13, result);
+
+    // copy the metadata files and data files
+    copyTableFiles(result);
+
+    // expect the deleted data file to be excluded from the rewrite and copy
+    List<String> copiedDataFiles =
+        spark
+            .read()
+            .format("iceberg")
+            .load(targetTableLocation() + "#all_files")
+            .select("file_path")
+            .as(Encoders.STRING())
+            .collectAsList();
+    assertThat(copiedDataFiles).hasSize(2).doesNotContain(deletedDataFilePathInTargetLocation);
+
+    // expect the manifest entries to still contain the deleted entry
+    List<String> copiedEntries =
+        spark
+            .read()
+            .format("iceberg")
+            .load(targetTableLocation() + "#all_entries")
+            .filter("status == 2")
+            .select("data_file.file_path")
+            .as(Encoders.STRING())
+            .collectAsList();
+    assertThat(copiedEntries).contains(deletedDataFilePathInTargetLocation);
+  }
+
   @Test
   public void testStartSnapshotWithoutValidSnapshot() throws Exception {
     // expire one snapshot
     actions().expireSnapshots(table).expireSnapshotId(table.currentSnapshot().parentId()).execute();
-    assertThat(((List) table.snapshots()).size())
-        .withFailMessage("1 out 2 snapshot has been removed")
-        .isEqualTo(1);
+    assertThat(table.snapshots()).withFailMessage("1 out 2 snapshot has been removed").hasSize(1);

Review Comment:
   This is probably unrelated? But while we are here, does the fail message make sense? Can we just remove it, since the default error message would already say how many snapshots are expected? See the sketch at the end of this message.

##########
spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java:
##########
@@ -940,16 +1024,20 @@ protected void checkFileNum(
             .load(result.fileListLocation())
             .as(Encoders.STRING())
             .collectAsList();
-    assertThat(filesToMove.stream().filter(f -> f.endsWith(".metadata.json")).count())
-        .withFailMessage("Wrong rebuilt version file count")
+    Predicate<String> isManifest = f -> (f.endsWith("-m0.avro")) || (f.endsWith("-m1.avro"));

Review Comment:
   Nit: remove the extra parens here.
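   For both nits above, the simplifications might look like this — a minimal sketch, assuming AssertJ's default hasSize message (which reports the expected vs. actual size) is informative enough on failure:

   ```java
   // Second comment: drop withFailMessage and rely on the default message.
   assertThat(table.snapshots()).hasSize(1);

   // Third comment: the same predicate without the redundant parentheses.
   Predicate<String> isManifest = f -> f.endsWith("-m0.avro") || f.endsWith("-m1.avro");
   ```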