singhpk234 commented on code in PR #11825: URL: https://github.com/apache/iceberg/pull/11825#discussion_r1893149965
########## core/src/main/java/org/apache/iceberg/actions/SizeBasedDataRewriter.java: ########## @@ -84,13 +86,30 @@ private boolean shouldRewrite(List<FileScanTask> group) { return enoughInputFiles(group) || enoughContent(group) || tooMuchContent(group) - || anyTaskHasTooManyDeletes(group); + || anyTaskHasTooManyDeletes(group) + || anyTaskHasTooHighDeleteRatio(group); } private boolean anyTaskHasTooManyDeletes(List<FileScanTask> group) { return group.stream().anyMatch(this::tooManyDeletes); } + private boolean anyTaskHasTooHighDeleteRatio(List<FileScanTask> group) { + return group.stream().anyMatch(this::tooHighDeleteRatio); + } + + private boolean tooHighDeleteRatio(FileScanTask task) { + if (ContentFileUtil.containsSingleDV(task.deletes())) { + DeleteFile file = Iterables.getOnlyElement(task.deletes()); + double deletedRecords = (double) Math.min(file.recordCount(), task.file().recordCount()); + double deleteRatio = deletedRecords / task.file().recordCount(); Review Comment: [question] wondering if adjusting deleteRecords by taking a min is the right thing for SplitScanTask ? as the recordCount is records in data files. my understanding was that delete ratio can be always: ``` file.recordCount() / task.file().recordCount() ``` IIUC, the following is Maths for this : long[] splitOffsets = splitOffsets(file); long splitOffset = splitOffsets != null ? splitOffsets[0] : 0L; // meaning this fileScan task contains only scannedFileFraction of the file in it double scannedFileFraction = ((double) length) / (file.fileSizeInBytes() - splitOffset); // number of deletes associated with this based on uniform distribution of deletes across the splitScan task long deletedRecords = (scannedFileFraction * deleteFile.recordCount()) ; long totalRecordWithoutDeletes = (scannedFileFraction * file.recordCount()); double deleteRatio = deletedRecords / totalRecordWithoutDeletes deleteRatio = (scannedFileFraction * deleteFile.recordCount()) / (scannedFileFraction * file.recordCount()) deleteRatio = (~~scannedFileFraction~~ * deleteFile.recordCount()) / (~~scannedFileFraction~~ * file.recordCount()) deleteRatio = deleteFile.recordCount() / file.recordCount() -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org