szehon-ho commented on code in PR #9251: URL: https://github.com/apache/iceberg/pull/9251#discussion_r1427495126
########## core/src/main/java/org/apache/iceberg/DeleteFileIndex.java: ########## @@ -582,93 +513,187 @@ private Iterable<CloseableIterable<ManifestEntry<DeleteFile>>> deleteManifestRea } } - // a group of indexed delete files sorted by the sequence number they apply to - private static class DeleteFileGroup { - private final long[] seqs; - private final IndexedDeleteFile[] files; - - DeleteFileGroup(IndexedDeleteFile[] files) { - this.seqs = Arrays.stream(files).mapToLong(IndexedDeleteFile::applySequenceNumber).toArray(); - this.files = files; + private static int findStartIndex(long[] seqs, long seq) { + int pos = Arrays.binarySearch(seqs, seq); + int start; + if (pos < 0) { + // the sequence number was not found, where it would be inserted is -(pos + 1) + start = -(pos + 1); + } else { + // the sequence number was found, but may not be the first + // find the first delete file with the given sequence number by decrementing the position + start = pos; + while (start > 0 && seqs[start - 1] >= seq) { + start -= 1; + } } - DeleteFileGroup(long[] seqs, IndexedDeleteFile[] files) { - this.seqs = seqs; - this.files = files; + return start; + } + + private static DeleteFile[] concat(DeleteFile[]... deletes) { + return ArrayUtil.concat(DeleteFile.class, deletes); + } + + // a group of position delete files sorted by the sequence number they apply to + private static class PositionDeletes { + private static final Comparator<DeleteFile> SEQ_COMPARATOR = + Comparator.comparingLong(DeleteFile::dataSequenceNumber); + + private long[] seqs = null; + private DeleteFile[] files = null; + private volatile List<DeleteFile> buffer = Lists.newArrayList(); Review Comment: Yea I was wondering if we could eliminate those member variables somehow. I feel like we always need the sorted version in the end, so is there any need to be lazy? For example, always keep everything in a sorted treeset. Or just have DeleteFileIndex::build() method populate intermediate maps of Lists, and then have a final pass that makes EqualityDeletes/PositionDeletes with sorted arrays? Yea if not possible to eliminate the member variables, i guess we can just try to add the precondition and benchmark, hopefully it may get optimized out by branch prediciton -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org