gsmiller commented on code in PR #12089:
URL: https://github.com/apache/lucene/pull/12089#discussion_r1072874867
##########
lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/TermInSetQuery.java:
##########
@@ -0,0 +1,527 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.sandbox.queries;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Objects;
+import java.util.Set;
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.DocValuesType;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.ConstantScoreScorer;
+import org.apache.lucene.search.ConstantScoreWeight;
+import org.apache.lucene.search.DisiPriorityQueue;
+import org.apache.lucene.search.DisiWrapper;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchNoDocsQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.QueryVisitor;
+import org.apache.lucene.search.ScoreMode;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.ScorerSupplier;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TwoPhaseIterator;
+import org.apache.lucene.search.Weight;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.DocIdSetBuilder;
+import org.apache.lucene.util.LongBitSet;
+import org.apache.lucene.util.PriorityQueue;
+
+public class TermInSetQuery extends Query {
+  // TODO: tunable coefficients. need to actually tune them (or maybe these are too complex and
+  // not useful)
+  private static final double J = 1.0;
+  private static final double K = 1.0;
+  // L: postings lists under this threshold will always be "pre-processed" into a bitset
+  private static final int L = 512;
+  // M: max number of clauses we'll manage/check during scoring (these remain "unprocessed")
+  private static final int M = Math.min(IndexSearcher.getMaxClauseCount(), 64);
+
+  private final String field;
+  // TODO: Not particularly memory-efficient; could use prefix-coding here but sorting isn't free
+  private final BytesRef[] terms;
+  private final int termsHashCode;
+
+  public TermInSetQuery(String field, Collection<BytesRef> terms) {
+    this.field = field;
+
+    final Set<BytesRef> uniqueTerms;
+    if (terms instanceof Set<BytesRef>) {
+      uniqueTerms = (Set<BytesRef>) terms;
+    } else {
+      uniqueTerms = new HashSet<>(terms);
+    }
+    this.terms = new BytesRef[uniqueTerms.size()];
+    Iterator<BytesRef> it = uniqueTerms.iterator();
+    for (int i = 0; i < uniqueTerms.size(); i++) {
+      assert it.hasNext();
+      this.terms[i] = it.next();
+    }
+    // TODO: compute lazily?
+    termsHashCode = Arrays.hashCode(this.terms);
+  }
+
+  @Override
+  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
+      throws IOException {
+
+    return new ConstantScoreWeight(this, boost) {
+
+      @Override
+      public Scorer scorer(LeafReaderContext context) throws IOException {
+        ScorerSupplier supplier = scorerSupplier(context);
+        if (supplier == null) {
+          return null;
+        } else {
+          return supplier.get(Long.MAX_VALUE);
+        }
+      }
+
+      @Override
+      public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
+        if (terms.length <= 1) {
+          throw new IllegalStateException("Must call IndexSearcher#rewrite");
+        }
+
+        // If the field doesn't exist in the segment, return null:
+        LeafReader reader = context.reader();
+        FieldInfo fi = reader.getFieldInfos().fieldInfo(field);
+        if (fi == null) {
+          return null;
+        }
+
+        return new ScorerSupplier() {
+
+          @Override
+          public Scorer get(long leadCost) throws IOException {
+            assert terms.length > 1;
+
+            // If there are no doc values indexed, we have to use a postings-based approach:
+            DocValuesType dvType = fi.getDocValuesType();
+            if (dvType != DocValuesType.SORTED && dvType != DocValuesType.SORTED_SET) {
+              return postingsScorer(reader);
+            }
+
+            if (terms.length > J * leadCost) {
+              // If the number of terms is > the number of candidates, a DV should perform better.
+              // Note that we don't know the actual number of terms here since terms may not be
+              // found in the segment:
+              return docValuesScorer(reader);
+            } else {
+              Terms t = reader.terms(field);
+              if (t == null) {
+                // If there are no postings, we have to use doc values:
+                return docValuesScorer(reader);
+              }
+
+              // Assuming documents are randomly distributed (note: they may not be), as soon as
+              // a given term's postings list size is >= the number of candidate documents, we
+              // would expect that term's postings to require advancing for every candidate doc
+              // (in the average case). This cost is multiplied out by the number of terms, so we
+              // can estimate the total amount of posting list advances we'll need to do by
+              // taking the sum of the postings sizes across all of our terms. Unfortunately, it's
+              // costly to get the actual postings sizes (we need to seek to each term), so we
+              // do a gross estimate up-front by computing the average postings size. Further
+              // complicating matters, we don't know how many of the terms are actually in the
+              // segment (we assume they all are), and of course, our candidate size is also an
+              // estimate:
+              double avgTermAdvances = Math.min(leadCost, (double) t.getSumDocFreq() / t.size());

Review Comment:
   This may be a very poor estimate depending on the use-case. If the field follows a Zipf distribution, this is making some pretty poor assumptions (that all terms have postings lists of approximately equal length). I think this bit will be challenging to do well.