dramaticlly commented on code in PR #12006:
URL: https://github.com/apache/iceberg/pull/12006#discussion_r1945480537


##########
core/src/main/java/org/apache/iceberg/RewriteTablePathUtil.java:
##########
@@ -354,7 +354,10 @@ private static RewriteResult<DataFile> writeDataFileEntry(
     DataFile newDataFile =
         
DataFiles.builder(spec).copy(entry.file()).withPath(targetDataFilePath).build();
     appendEntryWithFile(entry, writer, newDataFile);
-    result.copyPlan().add(Pair.of(sourceDataFilePath, newDataFile.location()));
+    // Keep non-live entry but exclude deleted data files as part of copyPlan

Review Comment:
   Sounds good, updated as suggested for all 3 comments



##########
spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/actions/TestRewriteTablePathsAction.java:
##########
@@ -533,14 +554,77 @@ public void testExpireSnapshotBeforeRewrite() throws 
Exception {
     checkFileNum(4, 1, 2, 9, result);
   }
 
+  @Test
+  public void testRewritePathWithNonLiveEntry() throws Exception {
+    String location = newTableLocation();
+    // first overwrite generate 1 manifest and 1 data file
+    // each subsequent overwrite on unpartitioned table generate 2 manifests 
and 1 data file
+    Table tableWith3Snaps = createTableWithSnapshots(location, 3, 
Maps.newHashMap(), "overwrite");
+
+    Snapshot oldest = SnapshotUtil.oldestAncestor(tableWith3Snaps);
+    String oldestDataFilePath =
+        Iterables.getOnlyElement(
+                
tableWith3Snaps.snapshot(oldest.snapshotId()).addedDataFiles(tableWith3Snaps.io()))
+            .location();
+    String deletedDataFilePathInTargetLocation =
+        String.format("%sdata/%s", targetTableLocation(), 
fileName(oldestDataFilePath));
+
+    // expire the oldest snapshot and remove oldest DataFile
+    ExpireSnapshots.Result expireResult =
+        
actions().expireSnapshots(tableWith3Snaps).expireSnapshotId(oldest.snapshotId()).execute();
+    assertThat(expireResult)
+        .as("Should deleted 1 data files in root snapshot")
+        .extracting(
+            ExpireSnapshots.Result::deletedManifestListsCount,
+            ExpireSnapshots.Result::deletedManifestsCount,
+            ExpireSnapshots.Result::deletedDataFilesCount)
+        .contains(1L, 1L, 1L);
+
+    RewriteTablePath.Result result =
+        actions()
+            .rewriteTablePath(tableWith3Snaps)
+            .stagingLocation(stagingLocation())
+            .rewriteLocationPrefix(tableWith3Snaps.location(), 
targetTableLocation())
+            .execute();
+
+    // 5 version files include 1 table creation 3 overwrite and 1 snapshot 
expiration
+    // 3 overwrites generate 3 manifest list and 5 manifests with 3 data files
+    // snapshot expiration removed 1 of each
+    checkFileNum(5, 2, 4, 13, result);
+
+    // copy the metadata files and data files
+    copyTableFiles(result);
+
+    // expect deleted data file is excluded from rewrite and copy
+    List<String> copiedDataFiles =
+        spark
+            .read()
+            .format("iceberg")
+            .load(targetTableLocation() + "#all_files")
+            .select("file_path")
+            .as(Encoders.STRING())
+            .collectAsList();
+    
assertThat(copiedDataFiles).hasSize(2).doesNotContain(deletedDataFilePathInTargetLocation);
+
+    // expect manifest entries still contain deleted entry
+    List<String> copiedEntries =
+        spark
+            .read()
+            .format("iceberg")
+            .load(targetTableLocation() + "#all_entries")
+            .filter("status == 2")
+            .select("data_file.file_path")
+            .as(Encoders.STRING())
+            .collectAsList();
+    assertThat(copiedEntries).contains(deletedDataFilePathInTargetLocation);
+  }
+
   @Test
   public void testStartSnapshotWithoutValidSnapshot() throws Exception {
     // expire one snapshot
     
actions().expireSnapshots(table).expireSnapshotId(table.currentSnapshot().parentId()).execute();
 
-    assertThat(((List) table.snapshots()).size())
-        .withFailMessage("1 out 2 snapshot has been removed")
-        .isEqualTo(1);
+    assertThat(table.snapshots()).withFailMessage("1 out 2 snapshot has been 
removed").hasSize(1);

Review Comment:
   Sounds good, I think this is unrelated but IDE highlight the potential 
improvement and I cant resist apply it. Removed redundant message as suggested



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to