[GitHub] [lucene-solr] mayya-sharipova commented on a change in pull request #2063: LUCENE-9599 Make comparator aware of index sorting

GitBox Fri, 06 Nov 2020 07:09:17 -0800


mayya-sharipova commented on a change in pull request #2063:
URL: https://github.com/apache/lucene-solr/pull/2063#discussion_r518811869




##########
File path: lucene/core/src/java/org/apache/lucene/search/FieldComparator.java
##########
@@ -387,125 +346,225 @@ protected SortedDocValues 
getSortedDocValues(LeafReaderContext context, String f
     
     @Override
     public LeafFieldComparator getLeafComparator(LeafReaderContext context) 
throws IOException {
-      termsIndex = getSortedDocValues(context, field);
-      currentReaderGen++;
+      return new TermOrdValLeafComparator(context);
+    }
 
-      if (topValue != null) {
-        // Recompute topOrd/SameReader
-        int ord = termsIndex.lookupTerm(topValue);
-        if (ord >= 0) {
-          topSameReader = true;
-          topOrd = ord;
-        } else {
-          topSameReader = false;
-          topOrd = -ord-2;
+    @Override
+    public BytesRef value(int slot) {
+      return values[slot];
+    }
+
+    @Override
+    public int compareValues(BytesRef val1, BytesRef val2) {
+      if (val1 == null) {
+        if (val2 == null) {
+          return 0;
         }
-      } else {
-        topOrd = missingOrd;
-        topSameReader = true;
+        return missingSortCmp;
+      } else if (val2 == null) {
+        return -missingSortCmp;
       }
-      //System.out.println("  getLeafComparator topOrd=" + topOrd + " 
topSameReader=" + topSameReader);
+      return val1.compareTo(val2);
+    }
 
-      if (bottomSlot != -1) {
-        // Recompute bottomOrd/SameReader
-        setBottom(bottomSlot);
+    /**
+     * Leaf comparator for {@link TermOrdValComparator} that provides skipping 
functionality when index is sorted
+     */
+    public class TermOrdValLeafComparator implements LeafFieldComparator {
+      private final SortedDocValues termsIndex;
+      private boolean indexSort = false; // true if a query sort is a part of 
the index sort
+      private DocIdSetIterator competitiveIterator;
+      private boolean collectedAllCompetitiveHits = false;
+      private boolean iteratorUpdated = false;
+
+      public TermOrdValLeafComparator(LeafReaderContext context) throws 
IOException {
+        termsIndex = getSortedDocValues(context, field);
+        currentReaderGen++;
+        if (topValue != null) {
+          // Recompute topOrd/SameReader
+          int ord = termsIndex.lookupTerm(topValue);
+          if (ord >= 0) {
+            topSameReader = true;
+            topOrd = ord;
+          } else {
+            topSameReader = false;
+            topOrd = -ord-2;
+          }
+        } else {
+          topOrd = missingOrd;
+          topSameReader = true;
+        }
+        if (bottomSlot != -1) {
+          // Recompute bottomOrd/SameReader
+          setBottom(bottomSlot);
+        }
+        this.competitiveIterator = 
DocIdSetIterator.all(context.reader().maxDoc());
       }
 
-      return this;
-    }
-    
-    @Override
-    public void setBottom(final int bottom) throws IOException {
-      bottomSlot = bottom;
+      protected SortedDocValues getSortedDocValues(LeafReaderContext context, 
String field) throws IOException {
+        return DocValues.getSorted(context.reader(), field);
+      }
 
-      bottomValue = values[bottomSlot];
-      if (currentReaderGen == readerGen[bottomSlot]) {
-        bottomOrd = ords[bottomSlot];
-        bottomSameReader = true;
-      } else {
-        if (bottomValue == null) {
-          // missingOrd is null for all segments
-          assert ords[bottomSlot] == missingOrd;
-          bottomOrd = missingOrd;
+      @Override
+      public void setBottom(final int slot) throws IOException {
+        bottomSlot = slot;
+        bottomValue = values[bottomSlot];
+        if (currentReaderGen == readerGen[bottomSlot]) {
+          bottomOrd = ords[bottomSlot];
           bottomSameReader = true;
-          readerGen[bottomSlot] = currentReaderGen;
         } else {
-          final int ord = termsIndex.lookupTerm(bottomValue);
-          if (ord < 0) {
-            bottomOrd = -ord - 2;
-            bottomSameReader = false;
-          } else {
-            bottomOrd = ord;
-            // exact value match
+          if (bottomValue == null) {
+            // missingOrd is null for all segments
+            assert ords[bottomSlot] == missingOrd;
+            bottomOrd = missingOrd;
             bottomSameReader = true;
-            readerGen[bottomSlot] = currentReaderGen;            
-            ords[bottomSlot] = bottomOrd;
+            readerGen[bottomSlot] = currentReaderGen;
+          } else {
+            final int ord = termsIndex.lookupTerm(bottomValue);
+            if (ord < 0) {
+              bottomOrd = -ord - 2;
+              bottomSameReader = false;
+            } else {
+              bottomOrd = ord;
+              // exact value match
+              bottomSameReader = true;
+              readerGen[bottomSlot] = currentReaderGen;
+              ords[bottomSlot] = bottomOrd;
+            }
           }
         }
       }
-    }
 
-    @Override
-    public void setTopValue(BytesRef value) {
-      // null is fine: it means the last doc of the prior
-      // search was missing this value
-      topValue = value;
-      //System.out.println("setTopValue " + topValue);
-    }
+      @Override
+      public int compareBottom(int doc) throws IOException {
+        assert bottomSlot != -1;
+        int docOrd = getOrdForDoc(doc);
+        if (docOrd == -1) {
+          docOrd = missingOrd;
+        }
+        int result;
+        if (bottomSameReader) {
+          // ord is precisely comparable, even in the equal case
+          result = bottomOrd - docOrd;
+        } else if (bottomOrd >= docOrd) {
+          // the equals case always means bottom is > doc
+          // (because we set bottomOrd to the lower bound in setBottom):
+          result = 1;
+        } else {
+          result = -1;
+        }
+        // for the index sort case, if we encounter a first doc that is 
non-competitive,
+        // and the hits threshold is reached, we can update the iterator to 
skip the rest of docs
+        if (indexSort && (reverse ? result >= 0 : result <= 0)) {
+          collectedAllCompetitiveHits = true;
+          if (hitsThresholdReached && iteratorUpdated == false) {
+            competitiveIterator = DocIdSetIterator.empty();
+            iteratorUpdated = true;
+          }
+        }
+        return result;
+      }
 
-    @Override
-    public BytesRef value(int slot) {
-      return values[slot];
-    }
+      @Override
+      public int compareTop(int doc) throws IOException {
+        int ord = getOrdForDoc(doc);
+        if (ord == -1) {
+          ord = missingOrd;
+        }
+        if (topSameReader) {
+          // ord is precisely comparable, even in the equal case
+          return topOrd - ord;
+        } else if (ord <= topOrd) {
+          // the equals case always means doc is < value (because we set 
lastOrd to the lower bound)
+          return 1;
+        } else {
+          return -1;
+        }
+      }
 
-    @Override
-    public int compareTop(int doc) throws IOException {
+      @Override
+      public void copy(int slot, int doc) throws IOException {
+        int ord = getOrdForDoc(doc);
+        if (ord == -1) {
+          ord = missingOrd;
+          values[slot] = null;
+        } else {
+          assert ord >= 0;
+          if (tempBRs[slot] == null) {
+            tempBRs[slot] = new BytesRefBuilder();
+          }
+          tempBRs[slot].copyBytes(termsIndex.lookupOrd(ord));
+          values[slot] = tempBRs[slot].get();
+        }
+        ords[slot] = ord;
+        readerGen[slot] = currentReaderGen;
+      }
 
-      int ord = getOrdForDoc(doc);
-      if (ord == -1) {
-        ord = missingOrd;
+      @Override
+      public void setScorer(Scorable scorer) throws IOException {}
+
+      @Override
+      public void usesIndexSort() {
+        indexSort = true;
       }
 
-      if (topSameReader) {
-        // ord is precisely comparable, even in the equal
-        // case
-        //System.out.println("compareTop doc=" + doc + " ord=" + ord + " ret=" 
+ (topOrd-ord));
-        return topOrd - ord;
-      } else if (ord <= topOrd) {
-        // the equals case always means doc is < value
-        // (because we set lastOrd to the lower bound)
-        return 1;
-      } else {
-        return -1;
+      @Override
+      public void setHitsThresholdReached() {
+        hitsThresholdReached = true;
+        // for the index sort case, if we collected collected all competitive 
hits
+        // we can update the iterator to skip the rest of docs
+        if (indexSort && collectedAllCompetitiveHits && iteratorUpdated == 
false) {
+          competitiveIterator = DocIdSetIterator.empty();
+          iteratorUpdated = true;
+        }
       }
-    }
 
-    @Override
-    public int compareValues(BytesRef val1, BytesRef val2) {
-      if (val1 == null) {
-        if (val2 == null) {
-          return 0;
+      @Override
+      public DocIdSetIterator competitiveIterator() {
+        if (indexSort == false) return null;
+        return new DocIdSetIterator() {
+          private int docID = -1;
+
+          @Override
+          public int nextDoc() throws IOException {
+            return advance(docID + 1);
+          }
+
+          @Override
+          public int docID() {
+            return docID;
+          }
+
+          @Override
+          public long cost() {
+            return competitiveIterator.cost();
+          }
+
+          @Override
+          public int advance(int target) throws IOException {
+            return docID = competitiveIterator.advance(target);
+          }
+        };
+      }
+
+      private int getOrdForDoc(int doc) throws IOException {
+        if (termsIndex.advanceExact(doc)) {
+          return termsIndex.ordValue();
+        } else {
+          return -1;
         }
-        return missingSortCmp;
-      } else if (val2 == null) {
-        return -missingSortCmp;
       }
-      return val1.compareTo(val2);
     }
-
-    @Override
-    public void setScorer(Scorable scorer) {}
   }
   
   /** Sorts by field's natural Term sort order.  All
    *  comparisons are done using BytesRef.compareTo, which is
    *  slow for medium to large result sets but possibly
    *  very fast for very small results sets. */
-  public static class TermValComparator extends FieldComparator<BytesRef> 
implements LeafFieldComparator {
+  public static class TermValComparator extends FieldComparator<BytesRef> {

Review comment:
       It seems that we [don't support index sort](https://github.com/apache/lucene-solr/blob/master/lucene/core/src/java/org/apache/lucene/search/SortField.java#L524)
 on the `STRING_VAL` type, which is the type that uses the `TermValComparator`
 comparator; thus early termination on index sort is not necessary for this class.
   




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

[GitHub] [lucene-solr] mayya-sharipova commented on a change in pull request #2063: LUCENE-9599 Make comparator aware of index sorting

Reply via email to