gf2121 commented on code in PR #13221:
URL: https://github.com/apache/lucene/pull/13221#discussion_r1601216525


##########
lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java:
##########
@@ -405,5 +395,278 @@ public int advance(int target) throws IOException {
     protected abstract long bottomAsComparableLong();
 
     protected abstract long topAsComparableLong();
+
+    class DisjunctionBuildVisitor extends RangeVisitor {
+
+      final Deque<DisiAndMostCompetitiveValue> disis = new ArrayDeque<>();
+      // most competitive entry stored last.
+      final Consumer<DisiAndMostCompetitiveValue> adder =
+          reverse == false ? disis::addFirst : disis::addLast;
+
+      final int minBlockLength = minBlockLength();
+
+      final LSBRadixSorter sorter = new LSBRadixSorter();
+      int[] docs = IntsRef.EMPTY_INTS;
+      int index = 0;
+      int blockMaxDoc = -1;
+      boolean docsInOrder = true;
+      long blockMinValue = Long.MAX_VALUE;
+      long blockMaxValue = Long.MIN_VALUE;
+
+      private DisjunctionBuildVisitor() {
+        super(minValueAsLong, maxValueAsLong, maxDocVisited);
+      }
+
+      @Override
+      public void grow(int count) {
+        docs = ArrayUtil.grow(docs, index + count + 1);
+      }
+
+      @Override
+      protected void consumeDoc(int doc) {
+        docs[index++] = doc;
+        if (doc >= blockMaxDoc) {
+          blockMaxDoc = doc;
+        } else {
+          docsInOrder = false;
+        }
+      }
+
+      void intersectLeaves(PointValues.PointTree pointTree) throws IOException 
{
+        PointValues.Relation r =
+            compare(pointTree.getMinPackedValue(), 
pointTree.getMaxPackedValue());
+        switch (r) {
+          case CELL_INSIDE_QUERY, CELL_CROSSES_QUERY -> {
+            if (pointTree.moveToChild()) {
+              do {
+                intersectLeaves(pointTree);
+              } while (pointTree.moveToSibling());
+              pointTree.moveToParent();
+            } else {
+              if (r == PointValues.Relation.CELL_CROSSES_QUERY) {
+                pointTree.visitDocValues(this);
+              } else {
+                pointTree.visitDocIDs(this);
+              }
+              updateMinMax(
+                  sortableBytesToLong(pointTree.getMinPackedValue()),
+                  sortableBytesToLong(pointTree.getMaxPackedValue()));
+            }
+          }
+          case CELL_OUTSIDE_QUERY -> {}
+          default -> throw new IllegalStateException("unreachable code");
+        }
+      }
+
+      void updateMinMax(long leafMinValue, long leafMaxValue) throws 
IOException {
+        this.blockMinValue = Math.min(blockMinValue, leafMinValue);
+        this.blockMaxValue = Math.max(blockMaxValue, leafMaxValue);
+        if (index >= minBlockLength) {
+          update();
+          this.blockMinValue = Long.MAX_VALUE;
+          this.blockMaxValue = Long.MIN_VALUE;
+        }
+      }
+
+      void update() throws IOException {
+        if (blockMinValue > blockMaxValue) {
+          return;
+        }
+        long mostCompetitiveValue =
+            reverse == false
+                ? Math.max(blockMinValue, minValueAsLong)
+                : Math.min(blockMaxValue, maxValueAsLong);
+
+        if (docsInOrder == false) {
+          sorter.sort(PackedInts.bitsRequired(blockMaxDoc), docs, index);
+        }
+        docs[index] = DocIdSetIterator.NO_MORE_DOCS;
+        DocIdSetIterator iter = new IntArrayDocIdSet(docs, index).iterator();
+        adder.accept(new DisiAndMostCompetitiveValue(iter, 
mostCompetitiveValue));
+        docs = IntsRef.EMPTY_INTS;
+        index = 0;
+        blockMaxDoc = -1;
+        docsInOrder = true;
+      }
+
+      DocIdSetIterator generateCompetitiveIterator() throws IOException {
+        intersectLeaves(pointValues.getPointTree());
+        update();
+
+        if (disis.isEmpty()) {
+          return DocIdSetIterator.empty();
+        }
+        assert assertMostCompetitiveValuesSorted(disis);
+
+        PriorityQueue<DisiAndMostCompetitiveValue> disjunction =
+            new PriorityQueue<>(disis.size()) {
+              @Override
+              protected boolean lessThan(
+                  DisiAndMostCompetitiveValue a, DisiAndMostCompetitiveValue 
b) {
+                return a.disi.docID() < b.disi.docID();
+              }
+            };
+        disjunction.addAll(disis);
+
+        return new CompetitiveIterator(maxDoc, disis, disjunction);
+      }
+
+      /**
+       * Used for assert. When reverse is false, smaller values are more 
competitive, so
+       * mostCompetitiveValues should be in desc order.
+       */
+      private boolean 
assertMostCompetitiveValuesSorted(Deque<DisiAndMostCompetitiveValue> deque) {
+        long lastValue = reverse == false ? Long.MAX_VALUE : Long.MIN_VALUE;
+        for (DisiAndMostCompetitiveValue value : deque) {
+          if (reverse == false) {
+            assert value.mostCompetitiveValue <= lastValue
+                : deque.stream().map(d -> 
d.mostCompetitiveValue).toList().toString();
+          } else {
+            assert value.mostCompetitiveValue >= lastValue
+                : deque.stream().map(d -> 
d.mostCompetitiveValue).toList().toString();
+          }
+          lastValue = value.mostCompetitiveValue;
+        }
+        return true;
+      }
+
+      private int minBlockLength() {
+        // bottom value can be much more competitive than thresholdAsLong, 
recompute the cost.
+        int cost =
+            Math.toIntExact(

Review Comment:
   > can we actually safely cast to an int? the field may be multi-valued and 
have more than Integer.MAX_VALUE values?
   
   Nice catch! I thought the count of points between `thresholdValueAsLong` and 
`topValue` is limited to `maxDoc >> 5` 
([link](https://github.com/apache/lucene/blob/1886178a3ab572a4a39cb849295f57708ab027de/lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java#L230))
 so it can be cast safely. But if there are many many (more than ` 
Integer.MAX_VALUE - maxDoc / 128`) points' value equals `thresholdValueAsLong` 
the cast will not be safe.
   
    > FWIW I'd be ok with throwing an exception if there are more than 
MAX_DISJUNCTION_CLAUSE*Integer.MAX_VALUE values or something like that, so that 
minBlockLength can still be stored as an integer.
    
    +1. BTW I wonder if we should catch all exceptions when trying dynamic 
pruning and fallback to normal logic?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Reply via email to