aokolnychyi commented on code in PR #8157:
URL: https://github.com/apache/iceberg/pull/8157#discussion_r1275729870
##########
core/src/main/java/org/apache/iceberg/DeleteFileIndex.java:
##########
@@ -569,4 +570,153 @@ private Iterable<CloseableIterable<ManifestEntry<DeleteFile>>> deleteManifestRea
.liveEntries());
}
}
+
+ // a group of indexed delete files sorted by the sequence number they apply to
+ private static class DeleteFileGroup {
+ private final long[] seqs;
+ private final IndexedDeleteFile[] files;
+
+ DeleteFileGroup(IndexedDeleteFile[] files) {
this.seqs = Arrays.stream(files).mapToLong(IndexedDeleteFile::applySequenceNumber).toArray();
+ this.files = files;
+ }
+
+ DeleteFileGroup(long[] seqs, IndexedDeleteFile[] files) {
+ this.seqs = seqs;
+ this.files = files;
+ }
+
+ public Stream<IndexedDeleteFile> limit(long seq) {
+ return limitBySequenceNumber(seq, seqs, files);
+ }
+
+ public Iterable<DeleteFile> referencedDeleteFiles() {
+ return
Arrays.stream(files).map(IndexedDeleteFile::wrapped).collect(Collectors.toList());
+ }
+ }
+
+ // a delete file wrapper that caches the converted boundaries for faster boundary checks
+ // this class is not meant to be exposed beyond the delete file index
+ private static class IndexedDeleteFile {
+ private final PartitionSpec spec;
+ private final DeleteFile wrapped;
+ private final long applySequenceNumber;
+ private volatile Map<Integer, Object> convertedLowerBounds = null;
+ private volatile Map<Integer, Object> convertedUpperBounds = null;
+
+ IndexedDeleteFile(PartitionSpec spec, DeleteFile file, long applySequenceNumber) {
+ this.spec = spec;
+ this.wrapped = file;
+ this.applySequenceNumber = applySequenceNumber;
+ }
+
+ IndexedDeleteFile(PartitionSpec spec, DeleteFile file) {
+ this.spec = spec;
+ this.wrapped = file;
+
+ if (file.content() == FileContent.EQUALITY_DELETES) {
+ this.applySequenceNumber = file.dataSequenceNumber() - 1;
+ } else {
+ this.applySequenceNumber = file.dataSequenceNumber();
+ }
+ }
+
+ public DeleteFile wrapped() {
+ return wrapped;
+ }
+
+ public long applySequenceNumber() {
+ return applySequenceNumber;
+ }
+
+ public FileContent content() {
+ return wrapped.content();
+ }
+
+ public List<Integer> equalityFieldIds() {
+ return wrapped.equalityFieldIds();
+ }
+
+ public Map<Integer, Long> valueCounts() {
+ return wrapped.valueCounts();
+ }
+
+ public Map<Integer, Long> nullValueCounts() {
+ return wrapped.nullValueCounts();
+ }
+
+ public Map<Integer, Long> nanValueCounts() {
+ return wrapped.nanValueCounts();
+ }
+
+ public boolean hasNoBounds() {
+ return wrapped.lowerBounds() == null || wrapped.upperBounds() == null;
+ }
+
+ public boolean hasBounds() {
+ return wrapped.lowerBounds() != null && wrapped.upperBounds() != null;
+ }
+
+ @SuppressWarnings("unchecked")
+ public <T> T lowerBound(int id) {
+ return (T) lowerBounds().get(id);
+ }
+
+ private Map<Integer, Object> lowerBounds() {
+ if (convertedLowerBounds == null) {
+ synchronized (this) {
+ if (convertedLowerBounds == null) {
+ this.convertedLowerBounds = convertBounds(wrapped.lowerBounds());
Review Comment:
I spent a bit more time thinking about this, and I don't think it would
be worth the extra complexity. Let's keep this as is for now: we only
index equality field IDs, and all columns must be checked anyway to
discard a file. I also checked Caffeine caches; they offer some
workarounds, but I don't think we need them here.
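
For context, two pieces referenced above are not visible in the hunk
itself. The snippet cuts off at the commented line, so the double-checked
locking around convertedLowerBounds is only half shown. A minimal sketch
of how it presumably completes, treating convertBounds as an assumed
private helper that maps the serialized bounds to comparable values (the
actual PR code may differ):

    // Sketch: lazily convert and cache the lower bounds.
    // The volatile read lets the common case skip the lock entirely; the
    // second null check inside the synchronized block prevents two
    // threads from converting the bounds twice.
    private Map<Integer, Object> lowerBounds() {
      if (convertedLowerBounds == null) {
        synchronized (this) {
          if (convertedLowerBounds == null) {
            this.convertedLowerBounds = convertBounds(wrapped.lowerBounds());
          }
        }
      }
      return convertedLowerBounds;
    }

Similarly, DeleteFileGroup.limit delegates to limitBySequenceNumber,
which this hunk does not show. Since seqs is sorted by apply sequence
number, a plausible sketch is a binary search for the first delete file
that can still apply (the name, signature, and exact semantics here are
assumptions, not necessarily the PR's code):

    // Sketch: return the delete files that may apply to a data file with
    // sequence number seq, i.e. those with apply sequence number >= seq.
    private static Stream<IndexedDeleteFile> limitBySequenceNumber(
        long seq, long[] seqs, IndexedDeleteFile[] files) {
      int pos = Arrays.binarySearch(seqs, seq);
      int start;
      if (pos < 0) {
        start = -pos - 1; // insertion point: first apply sequence > seq
      } else {
        start = pos;
        // step back over duplicates so every file at this apply sequence is kept
        while (start > 0 && seqs[start - 1] == seq) {
          start--;
        }
      }
      return Arrays.stream(files, start, files.length);
    }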