gf2121 commented on code in PR #13221: URL: https://github.com/apache/lucene/pull/13221#discussion_r1601216525
########## lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java: ########## @@ -405,5 +395,278 @@ public int advance(int target) throws IOException { protected abstract long bottomAsComparableLong(); protected abstract long topAsComparableLong(); + + class DisjunctionBuildVisitor extends RangeVisitor { + + final Deque<DisiAndMostCompetitiveValue> disis = new ArrayDeque<>(); + // most competitive entry stored last. + final Consumer<DisiAndMostCompetitiveValue> adder = + reverse == false ? disis::addFirst : disis::addLast; + + final int minBlockLength = minBlockLength(); + + final LSBRadixSorter sorter = new LSBRadixSorter(); + int[] docs = IntsRef.EMPTY_INTS; + int index = 0; + int blockMaxDoc = -1; + boolean docsInOrder = true; + long blockMinValue = Long.MAX_VALUE; + long blockMaxValue = Long.MIN_VALUE; + + private DisjunctionBuildVisitor() { + super(minValueAsLong, maxValueAsLong, maxDocVisited); + } + + @Override + public void grow(int count) { + docs = ArrayUtil.grow(docs, index + count + 1); + } + + @Override + protected void consumeDoc(int doc) { + docs[index++] = doc; + if (doc >= blockMaxDoc) { + blockMaxDoc = doc; + } else { + docsInOrder = false; + } + } + + void intersectLeaves(PointValues.PointTree pointTree) throws IOException { + PointValues.Relation r = + compare(pointTree.getMinPackedValue(), pointTree.getMaxPackedValue()); + switch (r) { + case CELL_INSIDE_QUERY, CELL_CROSSES_QUERY -> { + if (pointTree.moveToChild()) { + do { + intersectLeaves(pointTree); + } while (pointTree.moveToSibling()); + pointTree.moveToParent(); + } else { + if (r == PointValues.Relation.CELL_CROSSES_QUERY) { + pointTree.visitDocValues(this); + } else { + pointTree.visitDocIDs(this); + } + updateMinMax( + sortableBytesToLong(pointTree.getMinPackedValue()), + sortableBytesToLong(pointTree.getMaxPackedValue())); + } + } + case CELL_OUTSIDE_QUERY -> {} + default -> throw new IllegalStateException("unreachable code"); + } + } + + void updateMinMax(long leafMinValue, long leafMaxValue) throws IOException { + this.blockMinValue = Math.min(blockMinValue, leafMinValue); + this.blockMaxValue = Math.max(blockMaxValue, leafMaxValue); + if (index >= minBlockLength) { + update(); + this.blockMinValue = Long.MAX_VALUE; + this.blockMaxValue = Long.MIN_VALUE; + } + } + + void update() throws IOException { + if (blockMinValue > blockMaxValue) { + return; + } + long mostCompetitiveValue = + reverse == false + ? Math.max(blockMinValue, minValueAsLong) + : Math.min(blockMaxValue, maxValueAsLong); + + if (docsInOrder == false) { + sorter.sort(PackedInts.bitsRequired(blockMaxDoc), docs, index); + } + docs[index] = DocIdSetIterator.NO_MORE_DOCS; + DocIdSetIterator iter = new IntArrayDocIdSet(docs, index).iterator(); + adder.accept(new DisiAndMostCompetitiveValue(iter, mostCompetitiveValue)); + docs = IntsRef.EMPTY_INTS; + index = 0; + blockMaxDoc = -1; + docsInOrder = true; + } + + DocIdSetIterator generateCompetitiveIterator() throws IOException { + intersectLeaves(pointValues.getPointTree()); + update(); + + if (disis.isEmpty()) { + return DocIdSetIterator.empty(); + } + assert assertMostCompetitiveValuesSorted(disis); + + PriorityQueue<DisiAndMostCompetitiveValue> disjunction = + new PriorityQueue<>(disis.size()) { + @Override + protected boolean lessThan( + DisiAndMostCompetitiveValue a, DisiAndMostCompetitiveValue b) { + return a.disi.docID() < b.disi.docID(); + } + }; + disjunction.addAll(disis); + + return new CompetitiveIterator(maxDoc, disis, disjunction); + } + + /** + * Used for assert. When reverse is false, smaller values are more competitive, so + * mostCompetitiveValues should be in desc order. + */ + private boolean assertMostCompetitiveValuesSorted(Deque<DisiAndMostCompetitiveValue> deque) { + long lastValue = reverse == false ? Long.MAX_VALUE : Long.MIN_VALUE; + for (DisiAndMostCompetitiveValue value : deque) { + if (reverse == false) { + assert value.mostCompetitiveValue <= lastValue + : deque.stream().map(d -> d.mostCompetitiveValue).toList().toString(); + } else { + assert value.mostCompetitiveValue >= lastValue + : deque.stream().map(d -> d.mostCompetitiveValue).toList().toString(); + } + lastValue = value.mostCompetitiveValue; + } + return true; + } + + private int minBlockLength() { + // bottom value can be much more competitive than thresholdAsLong, recompute the cost. + int cost = + Math.toIntExact( Review Comment: > can we actually safely cast to an int? the field may be multi-valued and have more than Integer.MAX_VALUE values? Nice catch! I thought the count of points between `thresholdValueAsLong` and `topValue` is limited to `maxDoc >> 5` ([link](https://github.com/apache/lucene/blob/1886178a3ab572a4a39cb849295f57708ab027de/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java#L230)) so it can be cast safely. But if there are many many (more than ` Integer.MAX_VALUE - maxDoc / 128`) points' value equals `thresholdValueAsLong` the cast will not be safe. > FWIW I'd be ok with throwing an exception if there are more than MAX_DISJUNCTION_CLAUSE*Integer.MAX_VALUE values or something like that, so that minBlockLength can still be stored as an integer. +1. BTW I wonder if we should catch all exceptions when trying dynamic pruning and fallback to normal logic? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org