aokolnychyi commented on code in PR #11131: URL: https://github.com/apache/iceberg/pull/11131#discussion_r1851129804
########## core/src/main/java/org/apache/iceberg/ManifestFilterManager.java: ########## @@ -69,6 +70,7 @@ public String partition() { private final Map<Integer, PartitionSpec> specsById; private final PartitionSet deleteFilePartitions; private final Set<F> deleteFiles = newFileSet(); + private final Set<String> manifestsReferencedForDeletes = Sets.newHashSet(); Review Comment: Any shorter names like `manifestsWithDeletes` or `deleteFileManifests`? A few calls below split into multiple lines. ########## core/src/main/java/org/apache/iceberg/ManifestFilterManager.java: ########## @@ -185,6 +198,14 @@ List<ManifestFile> filterManifests(Schema tableSchema, List<ManifestFile> manife return ImmutableList.of(); } + // Use the current set of referenced manifests as a source of truth when it's a subset of all + // manifests and all removals which were performed reference manifests. + // If a manifest is not in the trusted referenced set and has no live files, this means that the + // manifest has no deleted entries and does not need to be rewritten Review Comment: Missing `.` for consistency with the first sentence? ########## core/src/main/java/org/apache/iceberg/ManifestFilterManager.java: ########## @@ -185,6 +198,14 @@ List<ManifestFile> filterManifests(Schema tableSchema, List<ManifestFile> manife return ImmutableList.of(); } + // Use the current set of referenced manifests as a source of truth when it's a subset of all + // manifests and all removals which were performed reference manifests. + // If a manifest is not in the trusted referenced set and has no live files, this means that the Review Comment: The first sentence is very clear. I am not sure I follow the bit about "and has no live files" in the second sentence, as the check for live files is done further. Even if we want to mention live files, shall `and` become `or`? ########## core/src/main/java/org/apache/iceberg/ManifestFilterManager.java: ########## @@ -327,62 +354,71 @@ private ManifestFile filterManifest(Schema tableSchema, ManifestFile manifest) { // this assumes that the manifest doesn't have files to remove and streams through the Review Comment: I wonder if the empty line above this comment still has a purpose, given it is one block now. ########## core/src/jmh/java/org/apache/iceberg/ReplaceDeleteFilesBenchmark.java: ########## @@ -104,27 +113,44 @@ private void dropTable() { TABLES.dropTable(TABLE_IDENT); } - private void initFiles() { - List<DeleteFile> generatedDeleteFiles = Lists.newArrayListWithExpectedSize(numFiles); + private void initFiles() throws IOException { List<DeleteFile> generatedPendingDeleteFiles = Lists.newArrayListWithExpectedSize(numFiles); - + int numDeleteFilesToReplace = (int) Math.ceil(numFiles * (percentDeleteFilesReplaced / 100.0)); + Map<String, DeleteFile> filesToReplace = + Maps.newHashMapWithExpectedSize(numDeleteFilesToReplace); RowDelta rowDelta = table.newRowDelta(); - for (int ordinal = 0; ordinal < numFiles; ordinal++) { DataFile dataFile = FileGenerationUtil.generateDataFile(table, null); rowDelta.addRows(dataFile); - DeleteFile deleteFile = FileGenerationUtil.generatePositionDeleteFile(table, dataFile); rowDelta.addDeletes(deleteFile); - generatedDeleteFiles.add(deleteFile); - - DeleteFile pendingDeleteFile = FileGenerationUtil.generatePositionDeleteFile(table, dataFile); - generatedPendingDeleteFiles.add(pendingDeleteFile); + if (numDeleteFilesToReplace > 0) { + filesToReplace.put(deleteFile.location(), deleteFile); + DeleteFile pendingDeleteFile = + FileGenerationUtil.generatePositionDeleteFile(table, dataFile); + generatedPendingDeleteFiles.add(pendingDeleteFile); + numDeleteFilesToReplace--; + } } rowDelta.commit(); - this.deleteFiles = generatedDeleteFiles; + List<DeleteFile> deleteFilesReadFromManifests = Lists.newArrayList(); Review Comment: It is a bit hard to follow the way we generate these values (like why use a map if only checking the key?). What if we restructure this logic a bit given that we have to read the manifests anyway? ``` RowDelta rowDelta = table.newRowDelta(); for (int ordinal = 0; ordinal < numFiles; ordinal++) { ... } rowDelta.commit(); int replacedDeleteFilesCount = (int) Math.ceil(numFiles * (percentDeleteFilesReplaced / 100.0)); List<DeleteFile> oldDeleteFiles = Lists.newArrayListWithExpectedSize(replacedDeleteFilesCount); List<DeleteFile> newDeleteFiles = Lists.newArrayListWithExpectedSize(replacedDeleteFilesCount); try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) { for (FileScanTask task : Iterables.limit(tasks, replacedDeleteFilesCount)) { DeleteFile oldDeletes = Iterables.getOnlyElement(task.deletes()); oldDeleteFiles.add(oldDeletes); DeleteFile newDeletes = FileGenerationUtil.generatePositionDeleteFile(table, task.file()); newDeleteFiles.add(newDeletes); } } this.deleteFilesToReplace = oldDeleteFiles; this.pendingDeleteFiles = newDeleteFiles; ``` ########## core/src/main/java/org/apache/iceberg/ManifestFilterManager.java: ########## @@ -185,6 +198,14 @@ List<ManifestFile> filterManifests(Schema tableSchema, List<ManifestFile> manife return ImmutableList.of(); } + // Use the current set of referenced manifests as a source of truth when it's a subset of all + // manifests and all removals which were performed reference manifests. + // If a manifest is not in the trusted referenced set and has no live files, this means that the + // manifest has no deleted entries and does not need to be rewritten + Set<String> manifestLocations = Review Comment: Question: Why not have this inside `canTrustManifestReferences`? ########## core/src/main/java/org/apache/iceberg/ManifestFilterManager.java: ########## @@ -327,62 +354,71 @@ private ManifestFile filterManifest(Schema tableSchema, ManifestFile manifest) { // this assumes that the manifest doesn't have files to remove and streams through the // manifest without copying data. if a manifest does have a file to remove, this will break // out of the loop and move on to filtering the manifest. - boolean hasDeletedFiles = manifestHasDeletedFiles(evaluator, reader); - if (!hasDeletedFiles) { + if (manifestHasDeletedFiles(evaluator, reader)) { + return filterManifestWithDeletedFiles(evaluator, manifest, reader); + } else { filteredManifests.put(manifest, manifest); return manifest; } - - return filterManifestWithDeletedFiles(evaluator, manifest, reader); - } catch (IOException e) { throw new RuntimeIOException(e, "Failed to close manifest: %s", manifest); } } - private boolean canContainDeletedFiles(ManifestFile manifest) { - boolean canContainExpressionDeletes; + private boolean canContainDeletedFiles(ManifestFile manifest, boolean trustManifestReferences) { + if (hasNoLiveFiles(manifest)) { + return false; + } + + if (trustManifestReferences) { + return manifestsReferencedForDeletes.contains(manifest.path()); + } + + return canContainDroppedFiles(manifest) + || canContainExpressionDeletes(manifest) + || canContainDroppedPartitions(manifest); + } + + private boolean hasNoLiveFiles(ManifestFile manifest) { + return !manifest.hasAddedFiles() && !manifest.hasExistingFiles(); + } + + private boolean canContainExpressionDeletes(ManifestFile manifest) { if (deleteExpression != null && deleteExpression != Expressions.alwaysFalse()) { ManifestEvaluator manifestEvaluator = ManifestEvaluator.forRowFilter( deleteExpression, specsById.get(manifest.partitionSpecId()), caseSensitive); - canContainExpressionDeletes = manifestEvaluator.eval(manifest); - } else { - canContainExpressionDeletes = false; + return manifestEvaluator.eval(manifest); } - boolean canContainDroppedPartitions; + return false; + } + + private boolean canContainDroppedPartitions(ManifestFile manifest) { if (!dropPartitions.isEmpty()) { - canContainDroppedPartitions = - ManifestFileUtil.canContainAny(manifest, dropPartitions, specsById); - } else { - canContainDroppedPartitions = false; + return ManifestFileUtil.canContainAny(manifest, dropPartitions, specsById); } - boolean canContainDroppedFiles; + return false; + } + + private boolean canContainDroppedFiles(ManifestFile manifest) { if (!deletePaths.isEmpty()) { - canContainDroppedFiles = true; + return true; } else if (!deleteFiles.isEmpty()) { - // because there were no path-only deletes, the set of deleted file partitions is valid - canContainDroppedFiles = - ManifestFileUtil.canContainAny(manifest, deleteFilePartitions, specsById); - } else { - canContainDroppedFiles = false; + return ManifestFileUtil.canContainAny(manifest, deleteFilePartitions, specsById); } - boolean canContainDropBySeq = - manifest.content() == ManifestContent.DELETES - && manifest.minSequenceNumber() < minSequenceNumber; - - return canContainExpressionDeletes - || canContainDroppedPartitions - || canContainDroppedFiles - || canContainDropBySeq; + return false; } @SuppressWarnings({"CollectionUndefinedEquality", "checkstyle:CyclomaticComplexity"}) private boolean manifestHasDeletedFiles( PartitionAndMetricsEvaluator evaluator, ManifestReader<F> reader) { + if (manifestsReferencedForDeletes.contains(reader.file().location())) { Review Comment: It was my bad to suggest `reader.file().location()`. It will be fragile as the location may undergo some validation or parsing in `FileIO`, which we don't control here. Probably better to pass `ManifestFile` to this method and use `manifest.path()`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org