gsmiller commented on code in PR #12089:
URL: https://github.com/apache/lucene/pull/12089#discussion_r1072874867
##########
lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/TermInSetQuery.java:
##########
@@ -0,0 +1,527 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.sandbox.queries;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Objects;
+import java.util.Set;
+import org.apache.lucene.index.DocValues;
+import org.apache.lucene.index.DocValuesType;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.ConstantScoreScorer;
+import org.apache.lucene.search.ConstantScoreWeight;
+import org.apache.lucene.search.DisiPriorityQueue;
+import org.apache.lucene.search.DisiWrapper;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchNoDocsQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.QueryVisitor;
+import org.apache.lucene.search.ScoreMode;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.ScorerSupplier;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TwoPhaseIterator;
+import org.apache.lucene.search.Weight;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.DocIdSetBuilder;
+import org.apache.lucene.util.LongBitSet;
+import org.apache.lucene.util.PriorityQueue;
+
+public class TermInSetQuery extends Query {
+  // TODO: tunable coefficients. need to actually tune them (or maybe these are too complex and
+  // not useful)
+  private static final double J = 1.0;
+  private static final double K = 1.0;
+  // L: postings lists under this threshold will always be "pre-processed" into a bitset
+  private static final int L = 512;
+  // M: max number of clauses we'll manage/check during scoring (these remain "unprocessed")
+  private static final int M = Math.min(IndexSearcher.getMaxClauseCount(), 64);
+
+  private final String field;
+  // TODO: Not particularly memory-efficient; could use prefix-coding here but sorting isn't free
+  private final BytesRef[] terms;
+  private final int termsHashCode;
+
+  public TermInSetQuery(String field, Collection<BytesRef> terms) {
+    this.field = field;
+
+    final Set<BytesRef> uniqueTerms;
+    if (terms instanceof Set<BytesRef>) {
+      uniqueTerms = (Set<BytesRef>) terms;
+    } else {
+      uniqueTerms = new HashSet<>(terms);
+    }
+    this.terms = new BytesRef[uniqueTerms.size()];
+    Iterator<BytesRef> it = uniqueTerms.iterator();
+    for (int i = 0; i < uniqueTerms.size(); i++) {
+      assert it.hasNext();
+      this.terms[i] = it.next();
+    }
+    // TODO: compute lazily?
+    termsHashCode = Arrays.hashCode(this.terms);
+  }
+
+  @Override
+  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
+      throws IOException {
+
+    return new ConstantScoreWeight(this, boost) {
+
+      @Override
+      public Scorer scorer(LeafReaderContext context) throws IOException {
+        ScorerSupplier supplier = scorerSupplier(context);
+        if (supplier == null) {
+          return null;
+        } else {
+          return supplier.get(Long.MAX_VALUE);
+        }
+      }
+
+      @Override
+      public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
+        if (terms.length <= 1) {
+          throw new IllegalStateException("Must call IndexSearcher#rewrite");
+        }
+
+        // If the field doesn't exist in the segment, return null:
+        LeafReader reader = context.reader();
+        FieldInfo fi = reader.getFieldInfos().fieldInfo(field);
+        if (fi == null) {
+          return null;
+        }
+
+        return new ScorerSupplier() {
+
+          @Override
+          public Scorer get(long leadCost) throws IOException {
+            assert terms.length > 1;
+
+            // If there are no doc values indexed, we have to use a postings-based approach:
+            DocValuesType dvType = fi.getDocValuesType();
+            if (dvType != DocValuesType.SORTED && dvType != DocValuesType.SORTED_SET) {
+              return postingsScorer(reader);
+            }
+
+            if (terms.length > J * leadCost) {
+              // If the number of terms is > the number of candidates, a DV should perform better.
+              // Note that we don't know the actual number of terms here since terms may not be
+              // found in the segment:
+              return docValuesScorer(reader);
+            } else {
+              Terms t = reader.terms(field);
+              if (t == null) {
+                // If there are no postings, we have to use doc values:
+                return docValuesScorer(reader);
+              }
+
+              // Assuming documents are randomly distributed (note: they may not be), as soon as
+              // a given term's postings list size is >= the number of candidate documents, we
+              // would expect that term's postings to require advancing for every candidate doc
+              // (in the average case). This cost is multiplied out by the number of terms, so we
+              // can estimate the total amount of posting list advances we'll need to do by
+              // taking the sum of the postings sizes across all of our terms. Unfortunately, it's
+              // costly to get the actual postings sizes (we need to seek to each term), so we
+              // do a gross estimate up-front by computing the average postings size. Further
+              // complicating matters, we don't know how many of the terms are actually in the
+              // segment (we assume they all are), and of course, our candidate size is also an
+              // estimate:
+              double avgTermAdvances = Math.min(leadCost, (double) t.getSumDocFreq() / t.size());

Review Comment:
   This may be a very poor estimate depending on the use-case. If the field follows a Zipf distribution, this is making some pretty poor assumptions (that all terms have postings lists of approximately equal length). I think this bit will be challenging to do well.