[GitHub] [lucene-solr] jpountz commented on a change in pull request #1351: LUCENE-9280: Collectors to skip noncompetitive documents

GitBox Tue, 16 Jun 2020 01:49:16 -0700


jpountz commented on a change in pull request #1351:
URL: https://github.com/apache/lucene-solr/pull/1351#discussion_r440675787




##########
File path: 
lucene/core/src/java/org/apache/lucene/search/FilteringNumericComparator.java
##########
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import org.apache.lucene.index.LeafReaderContext;
+
+import java.io.IOException;
+
+/**
+ * A wrapper over {@code NumericComparator} that provides a leaf comparator 
that can filter non-competitive docs.
+ */
+class FilteringNumericComparator<T extends Number> extends 
FilteringFieldComparator<T> {
+  public FilteringNumericComparator(NumericComparator<T> in, boolean reverse, 
boolean singleSort) {
+    super(in, reverse, singleSort);
+  }
+
+  @Override
+  public final FilteringLeafFieldComparator 
getLeafComparator(LeafReaderContext context) throws IOException {
+    LeafFieldComparator inLeafComparator = in.getLeafComparator(context);
+    Class<?> comparatorClass = inLeafComparator.getClass();
+    if (comparatorClass == FieldComparator.LongComparator.class) {
+      return new 
FilteringNumericLeafComparator.FilteringLongLeafComparator((FieldComparator.LongComparator)
 inLeafComparator, context,
+          ((LongComparator) inLeafComparator).field, reverse, singleSort, 
hasTopValue);
+    } if (comparatorClass == FieldComparator.IntComparator.class) {
+      return new 
FilteringNumericLeafComparator.FilteringIntLeafComparator((FieldComparator.IntComparator)
 inLeafComparator, context,
+          ((IntComparator) inLeafComparator).field, reverse, singleSort, 
hasTopValue);
+    } else if (comparatorClass == FieldComparator.DoubleComparator.class) {
+      return new 
FilteringNumericLeafComparator.FilteringDoubleLeafComparator((FieldComparator.DoubleComparator)
 inLeafComparator, context,
+          ((DoubleComparator) inLeafComparator).field, reverse, singleSort, 
hasTopValue);
+    } else if (comparatorClass == FieldComparator.FloatComparator.class) {
+      return new 
FilteringNumericLeafComparator.FilteringFloatLeafComparator((FieldComparator.FloatComparator)
 inLeafComparator, context,
+          ((FloatComparator) inLeafComparator).field, reverse, singleSort, 
hasTopValue);
+    } else {
+      assert false: "Unexpected class for [FieldComparator]!";

Review comment:
       add the class to the error message?

##########
File path: 
lucene/core/src/java/org/apache/lucene/search/FilteringNumericLeafComparator.java
##########
@@ -0,0 +1,340 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import org.apache.lucene.document.DoublePoint;
+import org.apache.lucene.document.FloatPoint;
+import org.apache.lucene.document.IntPoint;
+import org.apache.lucene.document.LongPoint;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.PointValues;
+import org.apache.lucene.util.DocIdSetBuilder;
+
+import java.io.IOException;
+import java.util.Arrays;
+
+/**
+ * A {@code FilteringLeafFieldComparator} that provides a functionality to 
skip over non-competitive documents
+ * for numeric fields indexed with points.
+ */
+abstract class FilteringNumericLeafComparator implements 
FilteringLeafFieldComparator {
+  protected final LeafFieldComparator in;
+  protected final boolean reverse;
+  protected final boolean singleSort;
+  private final boolean hasTopValue;
+  private final PointValues pointValues;
+  private final int bytesCount;
+  private final int maxDoc;
+  private final byte[] minValueAsBytes;
+  private final byte[] maxValueAsBytes;
+
+  private long iteratorCost;
+  private int maxDocVisited = 0;
+  private int updateCounter = 0;
+  private boolean canUpdateIterator = false; // set to true when queue becomes 
full and hitsThreshold is reached
+  private DocIdSetIterator competitiveIterator;
+
+  public FilteringNumericLeafComparator(LeafFieldComparator in, 
LeafReaderContext context, String field,
+        boolean reverse, boolean singleSort, boolean hasTopValue, int 
bytesCount) throws IOException {
+    this.in = in;
+    this.pointValues = context.reader().getPointValues(field);
+    this.reverse = reverse;
+    this.singleSort = singleSort;
+    this.hasTopValue = hasTopValue;
+    this.maxDoc = context.reader().maxDoc();
+    this.bytesCount = bytesCount;
+    this.maxValueAsBytes = reverse == false ? new byte[bytesCount] : 
hasTopValue ? new byte[bytesCount] : null;
+    this.minValueAsBytes = reverse ? new byte[bytesCount] : hasTopValue ? new 
byte[bytesCount] : null;
+
+    // TODO: optimize a case when pointValues are missing only on this segment
+    this.competitiveIterator = pointValues == null ? null : 
DocIdSetIterator.all(maxDoc);
+    this.iteratorCost = maxDoc;
+  }
+
+  @Override
+  public void setBottom(int slot) throws IOException {
+    in.setBottom(slot);
+    updateCompetitiveIterator(); // update an iterator if we set a new bottom
+  }
+
+  @Override
+  public int compareBottom(int doc) throws IOException {
+    return in.compareBottom(doc);
+  }
+
+  @Override
+  public int compareTop(int doc) throws IOException {
+    return in.compareTop(doc);
+  }
+
+  @Override
+  public void copy(int slot, int doc) throws IOException {
+    in.copy(slot, doc);
+    maxDocVisited = doc;
+  }
+
+  @Override
+  public void setScorer(Scorable scorer) throws IOException {
+    in.setScorer(scorer);
+    if (scorer instanceof Scorer) {
+      iteratorCost = ((Scorer) scorer).iterator().cost(); // starting iterator 
cost is the scorer's cost
+      updateCompetitiveIterator(); // update an iterator when we have a new 
segment
+    }
+  }
+
+  @Override
+  public void setCanUpdateIterator() throws IOException {
+    this.canUpdateIterator = true;
+    updateCompetitiveIterator();
+  }
+
+  @Override
+  public DocIdSetIterator competitiveIterator() {
+    if (competitiveIterator == null) return null;
+    return new DocIdSetIterator() {
+      private int doc;
+
+      @Override
+      public int nextDoc() throws IOException {
+        return doc = competitiveIterator.nextDoc();
+      }
+
+      @Override
+      public int docID() {
+        return doc;
+      }
+
+      @Override
+      public long cost() {
+        return competitiveIterator.cost();
+      }
+
+      @Override
+      public int advance(int target) throws IOException {
+        return doc = competitiveIterator.advance(target);
+      }
+    };
+  }
+
+  // update its iterator to include possibly only docs that are "stronger" 
than the current bottom entry
+  private void updateCompetitiveIterator() throws IOException {
+    if (canUpdateIterator == false) return;
+    if (pointValues == null) return;
+    // if some documents have missing points, check that missing values 
prohibits optimization
+    if ((pointValues.getDocCount() < maxDoc) && isMissingValueCompetitive()) {
+      return; // we can't filter out documents, as documents with missing 
values are competitive
+    }
+
+    updateCounter++;
+    if (updateCounter > 256 && (updateCounter & 0x1f) != 0x1f) { // Start 
sampling if we get called too much
+      return;
+    }
+    if (reverse == false) {
+      encodeBottom(maxValueAsBytes);
+      if (hasTopValue) {
+        encodeTop(minValueAsBytes);
+      }
+    } else {
+      encodeBottom(minValueAsBytes);
+      if (hasTopValue) {
+        encodeTop(maxValueAsBytes);
+      }
+    }
+
+    DocIdSetBuilder result = new DocIdSetBuilder(maxDoc);
+    PointValues.IntersectVisitor visitor = new PointValues.IntersectVisitor() {
+      DocIdSetBuilder.BulkAdder adder;
+
+      @Override
+      public void grow(int count) {
+        adder = result.grow(count);
+      }
+
+      @Override
+      public void visit(int docID) {
+        if (docID <= maxDocVisited) {
+          return; // Already visited or skipped
+        }
+        adder.add(docID);
+      }
+
+      @Override
+      public void visit(int docID, byte[] packedValue) {
+        if (docID <= maxDocVisited) {
+          return;  // already visited or skipped
+        }
+        if (maxValueAsBytes != null) {
+          int cmp = Arrays.compareUnsigned(packedValue, 0, bytesCount, 
maxValueAsBytes, 0, bytesCount);
+          // if doc's value is too high or for single sort even equal, it is 
not competitive and the doc can be skipped
+          if (cmp > 0 || (singleSort && cmp == 0)) return;
+        }
+        if (minValueAsBytes != null) {
+          int cmp = Arrays.compareUnsigned(packedValue, 0, bytesCount, 
minValueAsBytes, 0, bytesCount);
+          // if doc's value is too low or for single sort even equal, it is 
not competitive and the doc can be skipped
+          if (cmp < 0 || (singleSort && cmp == 0)) return;
+        }
+        adder.add(docID); // doc is competitive
+      }
+
+      @Override
+      public PointValues.Relation compare(byte[] minPackedValue, byte[] 
maxPackedValue) {
+        boolean maxValueOutsideQuery = false;
+        boolean minValueOutsideQuery = false;
+        if (maxValueAsBytes != null) {
+          int cmp = Arrays.compareUnsigned(minPackedValue, 0, bytesCount, 
maxValueAsBytes, 0, bytesCount);
+          maxValueOutsideQuery = cmp > 0 || (singleSort && cmp == 0);

Review comment:
       should we return CELL_OUTSIDE_QUERY here when appropriate instead of 
delaying?

##########
File path: 
lucene/core/src/java/org/apache/lucene/search/FilteringNumericComparator.java
##########
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import org.apache.lucene.index.LeafReaderContext;
+
+import java.io.IOException;
+
+/**
+ * A wrapper over {@code NumericComparator} that provides a leaf comparator 
that can filter non-competitive docs.
+ */
+class FilteringNumericComparator<T extends Number> extends 
FilteringFieldComparator<T> {
+  public FilteringNumericComparator(NumericComparator<T> in, boolean reverse, 
boolean singleSort) {
+    super(in, reverse, singleSort);
+  }
+
+  @Override
+  public final FilteringLeafFieldComparator 
getLeafComparator(LeafReaderContext context) throws IOException {
+    LeafFieldComparator inLeafComparator = in.getLeafComparator(context);
+    Class<?> comparatorClass = inLeafComparator.getClass();
+    if (comparatorClass == FieldComparator.LongComparator.class) {
+      return new 
FilteringNumericLeafComparator.FilteringLongLeafComparator((FieldComparator.LongComparator)
 inLeafComparator, context,
+          ((LongComparator) inLeafComparator).field, reverse, singleSort, 
hasTopValue);
+    } if (comparatorClass == FieldComparator.IntComparator.class) {
+      return new 
FilteringNumericLeafComparator.FilteringIntLeafComparator((FieldComparator.IntComparator)
 inLeafComparator, context,
+          ((IntComparator) inLeafComparator).field, reverse, singleSort, 
hasTopValue);
+    } else if (comparatorClass == FieldComparator.DoubleComparator.class) {
+      return new 
FilteringNumericLeafComparator.FilteringDoubleLeafComparator((FieldComparator.DoubleComparator)
 inLeafComparator, context,
+          ((DoubleComparator) inLeafComparator).field, reverse, singleSort, 
hasTopValue);
+    } else if (comparatorClass == FieldComparator.FloatComparator.class) {
+      return new 
FilteringNumericLeafComparator.FilteringFloatLeafComparator((FieldComparator.FloatComparator)
 inLeafComparator, context,
+          ((FloatComparator) inLeafComparator).field, reverse, singleSort, 
hasTopValue);
+    } else {
+      assert false: "Unexpected class for [FieldComparator]!";
+      return null;

Review comment:
       should we throw an IllegalStateException instead of being lenient when 
assertions are disabled?

##########
File path: 
lucene/core/src/java/org/apache/lucene/search/FilteringLeafFieldComparator.java
##########
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+
+/**
+ * Decorates a wrapped LeafFieldComparator to add a functionality to skip over 
non-competitive docs.
+ * FilteringLeafFieldComparator provides two additional functions to a 
LeafFieldComparator:
+ *  {@code competitiveIterator()} and {@code setCanUpdateIterator()}.
+ */
+public interface FilteringLeafFieldComparator extends LeafFieldComparator {
+  /**
+   * Returns a competitive iterator
+   * @return an iterator over competitive docs that are stronger than already 
collected docs
+   * or {@code null} if such an iterator is not available for the current 
segment.
+   */
+  DocIdSetIterator competitiveIterator();

Review comment:
       maybe we should throw IOException here to not force some implementations 
to catch/rethrow an UncheckedIOException




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

[GitHub] [lucene-solr] jpountz commented on a change in pull request #1351: LUCENE-9280: Collectors to skip noncompetitive documents

Reply via email to