jpountz commented on code in PR #972: URL: https://github.com/apache/lucene/pull/972#discussion_r910683270
########## lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java: ########## @@ -0,0 +1,332 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Comparator; +import java.util.LinkedList; +import java.util.List; + +/** Scorer implementing Block-Max Maxscore algorithm */ +public class BlockMaxMaxscoreScorer extends Scorer { + // current doc ID of the leads + private int doc; + + // doc id boundary that all scorers maxScore are valid + private int upTo; + + // heap of scorers ordered by doc ID + private final DisiPriorityQueue essentialsScorers; + + // list of scorers ordered by maxScore + private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers; + + private final DisiWrapper[] allScorers; + + // sum of max scores of scorers in nonEssentialScorers list + private double nonEssentialMaxScoreSum; + + private long cost; Review Comment: nit: let's make it final ########## lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java: ########## @@ -0,0 +1,322 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Comparator; +import java.util.LinkedList; +import java.util.List; + +/** Scorer implementing Block-Max Maxscore algorithm */ +public class BlockMaxMaxscoreScorer extends Scorer { + // current doc ID of the leads + private int doc; + + // doc id boundary that all scorers maxScore are valid + private int upTo = -1; + + // heap of scorers ordered by doc ID + private final DisiPriorityQueue essentialsScorers; + // list of scorers ordered by maxScore + private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers; + + private final DisiWrapper[] allScorers; + + // sum of max scores of scorers in nonEssentialScorers list + private float nonEssentialMaxScoreSum; + + private long cost; + + private final MaxScoreSumPropagator maxScoreSumPropagator; + + // scaled min competitive score + private float minCompetitiveScore = 0; + + private int cachedScoredDoc = -1; + private float cachedScore = 0; + + /** + * Constructs a Scorer that scores doc based on Block-Max-Maxscore (BMM) algorithm + * http://engineering.nyu.edu/~suel/papers/bmm.pdf . This algorithm has lower overhead compared to + * WANDScorer, and could be used for simple disjunction queries. + * + * @param weight The weight to be used. + * @param scorers The sub scorers this Scorer should iterate on for optional clauses + */ + public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers) throws IOException { + super(weight); + + this.doc = -1; + this.allScorers = new DisiWrapper[scorers.size()]; + this.essentialsScorers = new DisiPriorityQueue(scorers.size()); + this.maxScoreSortedEssentialScorers = new LinkedList<>(); + + long cost = 0; + for (int i = 0; i < scorers.size(); i++) { + DisiWrapper w = new DisiWrapper(scorers.get(i)); + cost += w.cost; + allScorers[i] = w; + } + + this.cost = cost; + maxScoreSumPropagator = new MaxScoreSumPropagator(scorers); + } + + @Override + public DocIdSetIterator iterator() { + // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee + return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator()); + } + + @Override + public TwoPhaseIterator twoPhaseIterator() { + DocIdSetIterator approximation = + new DocIdSetIterator() { + + @Override + public int docID() { + return doc; + } + + @Override + public int nextDoc() throws IOException { + return advance(doc + 1); + } + + @Override + public int advance(int target) throws IOException { + while (true) { + + if (target > upTo) { + updateMaxScoresAndLists(target); + } else { + // minCompetitiveScore might have increased, + // move potentially no-longer-competitive scorers from essential to non-essential + // list + movePotentiallyNonCompetitiveScorers(); + } + + assert target <= upTo; + + DisiWrapper top = essentialsScorers.top(); + + if (top == null) { + // all scorers in non-essential list, skip to next boundary or return no_more_docs + if (upTo == NO_MORE_DOCS) { + return doc = NO_MORE_DOCS; + } else { + target = upTo + 1; + } + } else { + // position all scorers in essential list to on or after target + while (top.doc < target) { + top.doc = top.iterator.advance(target); + top = essentialsScorers.updateTop(); + } + + if (top.doc == NO_MORE_DOCS) { + return doc = NO_MORE_DOCS; + } else if (top.doc > upTo) { + target = upTo + 1; + } else { + double docScoreUpperBound = nonEssentialMaxScoreSum; + + for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) { + docScoreUpperBound += w.scorer.score(); + } + + if ((float) docScoreUpperBound < minCompetitiveScore) { + // skip straight to next candidate doc from essential scorer + int docId = top.doc; + do { + top.doc = top.iterator.nextDoc(); + top = essentialsScorers.updateTop(); + } while (top.doc == docId); + + target = top.doc; + } else { + return doc = top.doc; + } + } + } + } + } + + private void movePotentiallyNonCompetitiveScorers() { + while (maxScoreSortedEssentialScorers.size() > 0 + && nonEssentialMaxScoreSum + maxScoreSortedEssentialScorers.get(0).maxScoreFloat + < minCompetitiveScore) { + DisiWrapper nextLeastContributingScorer = + maxScoreSortedEssentialScorers.removeFirst(); + nonEssentialMaxScoreSum += nextLeastContributingScorer.maxScoreFloat; + } + + // list adjusted + if (essentialsScorers.size() != maxScoreSortedEssentialScorers.size()) { + essentialsScorers.clear(); + for (DisiWrapper w : maxScoreSortedEssentialScorers) { + essentialsScorers.add(w); + } + } + } + + private void updateMaxScoresAndLists(int target) throws IOException { + assert target > upTo; + // Next candidate doc id is above interval boundary, or minCompetitive has increased. + // Find next interval boundary. + // Block boundary alignment strategy is adapted from "Optimizing Top-k Document + // Retrieval Strategies for Block-Max Indexes" by Dimopoulos, Nepomnyachiy and Suel. + // Find the block interval boundary that is the minimum of all participating scorer's + // block boundary. Then run BMM within each interval. + updateUpToAndMaxScore(target); + repartitionLists(); + } + + private void updateUpToAndMaxScore(int target) throws IOException { + // reset upTo + upTo = -1; + for (DisiWrapper w : allScorers) { + // using Math.max here is a good approach when there are only two clauses, + // but when this scorer is used for more than two clauses, we may need to + // consider other approaches such as avg, as the further out the boundary, + // the higher maxScore would be for a scorer, which makes skipping based on + // comparison with minCompetitiveScore harder / less effective. + upTo = Math.max(w.scorer.advanceShallow(Math.max(w.doc, target)), upTo); + } + assert target <= upTo; + + for (DisiWrapper w : allScorers) { + if (w.doc <= upTo) { Review Comment: I have a slight preference for the second approach (remove plus a comment) so that our test coverage build doesn't show up this branch as a branch that is never tested. ########## lucene/core/src/java/org/apache/lucene/search/Boolean2ScorerSupplier.java: ########## @@ -118,6 +118,21 @@ private Scorer getInternal(long leadCost) throws IOException { leadCost); } + // pure two terms disjunction + if (scoreMode == ScoreMode.TOP_SCORES + && minShouldMatch == 0 Review Comment: `minShouldMatch = 1` also qualifies ```suggestion && minShouldMatch <= 1 ``` ########## lucene/core/src/java/org/apache/lucene/search/DisiWrapper.java: ########## @@ -39,6 +39,9 @@ public class DisiWrapper { // For WANDScorer long maxScore; + // For BlockMaxMaxscoreScorer + float maxScoreFloat; Review Comment: Maybe we should call this one `maxScore` and rename the other one `scaledMaxScore` or something like that ########## lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java: ########## @@ -0,0 +1,332 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Comparator; +import java.util.LinkedList; +import java.util.List; + +/** Scorer implementing Block-Max Maxscore algorithm */ +public class BlockMaxMaxscoreScorer extends Scorer { Review Comment: Could it be pkg-private? ########## lucene/core/src/java/org/apache/lucene/search/BlockMaxMaxscoreScorer.java: ########## @@ -0,0 +1,322 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Comparator; +import java.util.LinkedList; +import java.util.List; + +/** Scorer implementing Block-Max Maxscore algorithm */ +public class BlockMaxMaxscoreScorer extends Scorer { + // current doc ID of the leads + private int doc; + + // doc id boundary that all scorers maxScore are valid + private int upTo = -1; + + // heap of scorers ordered by doc ID + private final DisiPriorityQueue essentialsScorers; + // list of scorers ordered by maxScore + private final LinkedList<DisiWrapper> maxScoreSortedEssentialScorers; + + private final DisiWrapper[] allScorers; + + // sum of max scores of scorers in nonEssentialScorers list + private float nonEssentialMaxScoreSum; + + private long cost; + + private final MaxScoreSumPropagator maxScoreSumPropagator; + + // scaled min competitive score + private float minCompetitiveScore = 0; + + private int cachedScoredDoc = -1; + private float cachedScore = 0; + + /** + * Constructs a Scorer that scores doc based on Block-Max-Maxscore (BMM) algorithm + * http://engineering.nyu.edu/~suel/papers/bmm.pdf . This algorithm has lower overhead compared to + * WANDScorer, and could be used for simple disjunction queries. + * + * @param weight The weight to be used. + * @param scorers The sub scorers this Scorer should iterate on for optional clauses + */ + public BlockMaxMaxscoreScorer(Weight weight, List<Scorer> scorers) throws IOException { + super(weight); + + this.doc = -1; + this.allScorers = new DisiWrapper[scorers.size()]; + this.essentialsScorers = new DisiPriorityQueue(scorers.size()); + this.maxScoreSortedEssentialScorers = new LinkedList<>(); + + long cost = 0; + for (int i = 0; i < scorers.size(); i++) { + DisiWrapper w = new DisiWrapper(scorers.get(i)); + cost += w.cost; + allScorers[i] = w; + } + + this.cost = cost; + maxScoreSumPropagator = new MaxScoreSumPropagator(scorers); + } + + @Override + public DocIdSetIterator iterator() { + // twoPhaseIterator needed to honor scorer.setMinCompetitiveScore guarantee + return TwoPhaseIterator.asDocIdSetIterator(twoPhaseIterator()); + } + + @Override + public TwoPhaseIterator twoPhaseIterator() { + DocIdSetIterator approximation = + new DocIdSetIterator() { + + @Override + public int docID() { + return doc; + } + + @Override + public int nextDoc() throws IOException { + return advance(doc + 1); + } + + @Override + public int advance(int target) throws IOException { + while (true) { + + if (target > upTo) { + updateMaxScoresAndLists(target); + } else { + // minCompetitiveScore might have increased, + // move potentially no-longer-competitive scorers from essential to non-essential + // list + movePotentiallyNonCompetitiveScorers(); + } + + assert target <= upTo; + + DisiWrapper top = essentialsScorers.top(); + + if (top == null) { + // all scorers in non-essential list, skip to next boundary or return no_more_docs + if (upTo == NO_MORE_DOCS) { + return doc = NO_MORE_DOCS; + } else { + target = upTo + 1; + } + } else { + // position all scorers in essential list to on or after target + while (top.doc < target) { + top.doc = top.iterator.advance(target); + top = essentialsScorers.updateTop(); + } + + if (top.doc == NO_MORE_DOCS) { + return doc = NO_MORE_DOCS; + } else if (top.doc > upTo) { + target = upTo + 1; + } else { + double docScoreUpperBound = nonEssentialMaxScoreSum; + + for (DisiWrapper w = essentialsScorers.topList(); w != null; w = w.next) { + docScoreUpperBound += w.scorer.score(); + } Review Comment: :+1: -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org