jpountz commented on code in PR #13221: URL: https://github.com/apache/lucene/pull/13221#discussion_r1544316035
########## lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java: ########## @@ -193,170 +209,160 @@ public void setHitsThresholdReached() throws IOException { private void updateCompetitiveIterator() throws IOException { if (enableSkipping == false || hitsThresholdReached == false - || (queueFull == false && topValueSet == false)) return; + || (leafTopSet == false && queueFull == false)) return; // if some documents have missing points, check that missing values prohibits optimization - if ((pointValues.getDocCount() < maxDoc) && isMissingValueCompetitive()) { + boolean dense = pointValues.getDocCount() == maxDoc; + if (dense == false && isMissingValueCompetitive()) { return; // we can't filter out documents, as documents with missing values are competitive } - updateCounter++; - // Start sampling if we get called too much - if (updateCounter > 256 - && (updateCounter & (currentSkipInterval - 1)) != currentSkipInterval - 1) { + if (competitiveIterator instanceof CompetitiveIterator iter) { + encodeBottom(); + // CompetitiveIterator already built, try to reduce clause. + tryReduceDisjunctionClause(iter); return; } - if (reverse == false) { - if (queueFull) { // bottom is available only when queue is full - maxValueAsBytes = maxValueAsBytes == null ? new byte[bytesCount] : maxValueAsBytes; - encodeBottom(); - } - if (topValueSet) { - minValueAsBytes = minValueAsBytes == null ? new byte[bytesCount] : minValueAsBytes; - encodeTop(); - } - } else { - if (queueFull) { // bottom is available only when queue is full - minValueAsBytes = minValueAsBytes == null ? new byte[bytesCount] : minValueAsBytes; - encodeBottom(); - } - if (topValueSet) { - maxValueAsBytes = maxValueAsBytes == null ? new byte[bytesCount] : maxValueAsBytes; - encodeTop(); + + if (thresholdAsLong == -1) { + if (dense == false) { + competitiveIterator = getNumericDocValues(context, field); + leadCost = Math.min(leadCost, competitiveIterator.cost()); } + long threshold = Math.min(leadCost >> 3, maxDoc >> 5); + thresholdAsLong = intersectThresholdValue(threshold); } - DocIdSetBuilder result = new DocIdSetBuilder(maxDoc); - PointValues.IntersectVisitor visitor = - new PointValues.IntersectVisitor() { - DocIdSetBuilder.BulkAdder adder; - - @Override - public void grow(int count) { - adder = result.grow(count); - } + if ((reverse == false && bottomAsComparableLong() <= thresholdAsLong) + || (reverse && bottomAsComparableLong() >= thresholdAsLong)) { + encodeBottom(); + DisjunctionBuildVisitor visitor = new DisjunctionBuildVisitor(); + competitiveIterator = visitor.generateCompetitiveIterator(); + } + } - @Override - public void visit(int docID) { - if (docID <= maxDocVisited) { - return; // Already visited or skipped - } - adder.add(docID); - } + private void tryReduceDisjunctionClause(CompetitiveIterator iter) { + int originalSize = iter.disis.size(); + Predicate<DisiAndMostCompetitiveValue> isCompetitive = + d -> d.mostCompetitiveValue <= maxValueAsLong && d.mostCompetitiveValue >= minValueAsLong; - @Override - public void visit(int docID, byte[] packedValue) { - if (docID <= maxDocVisited) { - return; // already visited or skipped - } - if (maxValueAsBytes != null) { - int cmp = bytesComparator.compare(packedValue, 0, maxValueAsBytes, 0); - - if (cmp > 0) return; - } - if (minValueAsBytes != null) { - int cmp = bytesComparator.compare(packedValue, 0, minValueAsBytes, 0); + while (iter.disis.isEmpty() == false && isCompetitive.test(iter.disis.getFirst()) == false) { + iter.disis.removeFirst(); + } - if (cmp < 0) return; - } - adder.add(docID); // doc is competitive - } + if (originalSize != iter.disis.size()) { + iter.disjunction.clear(); + iter.disjunction.addAll(iter.disis); + } + } - @Override - public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { - if (maxValueAsBytes != null) { - int cmp = bytesComparator.compare(minPackedValue, 0, maxValueAsBytes, 0); - // 1. cmp ==0 and pruning==Pruning.GREATER_THAN_OR_EQUAL_TO : if the sort is - // ascending then maxValueAsBytes is bottom's next less value, so it is competitive - // 2. cmp ==0 and pruning==Pruning.GREATER_THAN: maxValueAsBytes equals to - // bottom, but there are multiple comparators, so it could be competitive - if (cmp > 0) return PointValues.Relation.CELL_OUTSIDE_QUERY; - } - if (minValueAsBytes != null) { - int cmp = bytesComparator.compare(maxPackedValue, 0, minValueAsBytes, 0); - if (cmp < 0) return PointValues.Relation.CELL_OUTSIDE_QUERY; - } - if ((maxValueAsBytes != null - && bytesComparator.compare(maxPackedValue, 0, maxValueAsBytes, 0) > 0) - || (minValueAsBytes != null - && bytesComparator.compare(minPackedValue, 0, minValueAsBytes, 0) < 0)) { - return PointValues.Relation.CELL_CROSSES_QUERY; - } - return PointValues.Relation.CELL_INSIDE_QUERY; - } - }; - final long threshold = iteratorCost >>> 3; - - if (PointValues.isEstimatedPointCountGreaterThanOrEqualTo(visitor, pointTree, threshold)) { - // the new range is not selective enough to be worth materializing, it doesn't reduce number - // of docs at least 8x - updateSkipInterval(false); - if (pointValues.getDocCount() < iteratorCost) { - // Use the set of doc with values to help drive iteration - competitiveIterator = getNumericDocValues(context, field); - iteratorCost = pointValues.getDocCount(); + /** + * Find out the value that threshold docs away from topValue/infinite. + * + * <p>TODO: we are assuming a binary tree Review Comment: It would be good to remove this assumption indeed. ########## lucene/core/src/java/org/apache/lucene/search/comparators/NumericComparator.java: ########## @@ -193,170 +209,160 @@ public void setHitsThresholdReached() throws IOException { private void updateCompetitiveIterator() throws IOException { if (enableSkipping == false || hitsThresholdReached == false - || (queueFull == false && topValueSet == false)) return; + || (leafTopSet == false && queueFull == false)) return; // if some documents have missing points, check that missing values prohibits optimization - if ((pointValues.getDocCount() < maxDoc) && isMissingValueCompetitive()) { + boolean dense = pointValues.getDocCount() == maxDoc; + if (dense == false && isMissingValueCompetitive()) { return; // we can't filter out documents, as documents with missing values are competitive } - updateCounter++; - // Start sampling if we get called too much - if (updateCounter > 256 - && (updateCounter & (currentSkipInterval - 1)) != currentSkipInterval - 1) { + if (competitiveIterator instanceof CompetitiveIterator iter) { + encodeBottom(); + // CompetitiveIterator already built, try to reduce clause. + tryReduceDisjunctionClause(iter); return; } - if (reverse == false) { - if (queueFull) { // bottom is available only when queue is full - maxValueAsBytes = maxValueAsBytes == null ? new byte[bytesCount] : maxValueAsBytes; - encodeBottom(); - } - if (topValueSet) { - minValueAsBytes = minValueAsBytes == null ? new byte[bytesCount] : minValueAsBytes; - encodeTop(); - } - } else { - if (queueFull) { // bottom is available only when queue is full - minValueAsBytes = minValueAsBytes == null ? new byte[bytesCount] : minValueAsBytes; - encodeBottom(); - } - if (topValueSet) { - maxValueAsBytes = maxValueAsBytes == null ? new byte[bytesCount] : maxValueAsBytes; - encodeTop(); + + if (thresholdAsLong == -1) { + if (dense == false) { + competitiveIterator = getNumericDocValues(context, field); + leadCost = Math.min(leadCost, competitiveIterator.cost()); } + long threshold = Math.min(leadCost >> 3, maxDoc >> 5); + thresholdAsLong = intersectThresholdValue(threshold); } - DocIdSetBuilder result = new DocIdSetBuilder(maxDoc); - PointValues.IntersectVisitor visitor = - new PointValues.IntersectVisitor() { - DocIdSetBuilder.BulkAdder adder; - - @Override - public void grow(int count) { - adder = result.grow(count); - } + if ((reverse == false && bottomAsComparableLong() <= thresholdAsLong) + || (reverse && bottomAsComparableLong() >= thresholdAsLong)) { + encodeBottom(); + DisjunctionBuildVisitor visitor = new DisjunctionBuildVisitor(); + competitiveIterator = visitor.generateCompetitiveIterator(); + } + } - @Override - public void visit(int docID) { - if (docID <= maxDocVisited) { - return; // Already visited or skipped - } - adder.add(docID); - } + private void tryReduceDisjunctionClause(CompetitiveIterator iter) { + int originalSize = iter.disis.size(); + Predicate<DisiAndMostCompetitiveValue> isCompetitive = + d -> d.mostCompetitiveValue <= maxValueAsLong && d.mostCompetitiveValue >= minValueAsLong; - @Override - public void visit(int docID, byte[] packedValue) { - if (docID <= maxDocVisited) { - return; // already visited or skipped - } - if (maxValueAsBytes != null) { - int cmp = bytesComparator.compare(packedValue, 0, maxValueAsBytes, 0); - - if (cmp > 0) return; - } - if (minValueAsBytes != null) { - int cmp = bytesComparator.compare(packedValue, 0, minValueAsBytes, 0); + while (iter.disis.isEmpty() == false && isCompetitive.test(iter.disis.getFirst()) == false) { + iter.disis.removeFirst(); + } - if (cmp < 0) return; - } - adder.add(docID); // doc is competitive - } + if (originalSize != iter.disis.size()) { + iter.disjunction.clear(); + iter.disjunction.addAll(iter.disis); + } + } - @Override - public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { - if (maxValueAsBytes != null) { - int cmp = bytesComparator.compare(minPackedValue, 0, maxValueAsBytes, 0); - // 1. cmp ==0 and pruning==Pruning.GREATER_THAN_OR_EQUAL_TO : if the sort is - // ascending then maxValueAsBytes is bottom's next less value, so it is competitive - // 2. cmp ==0 and pruning==Pruning.GREATER_THAN: maxValueAsBytes equals to - // bottom, but there are multiple comparators, so it could be competitive - if (cmp > 0) return PointValues.Relation.CELL_OUTSIDE_QUERY; - } - if (minValueAsBytes != null) { - int cmp = bytesComparator.compare(maxPackedValue, 0, minValueAsBytes, 0); - if (cmp < 0) return PointValues.Relation.CELL_OUTSIDE_QUERY; - } - if ((maxValueAsBytes != null - && bytesComparator.compare(maxPackedValue, 0, maxValueAsBytes, 0) > 0) - || (minValueAsBytes != null - && bytesComparator.compare(minPackedValue, 0, minValueAsBytes, 0) < 0)) { - return PointValues.Relation.CELL_CROSSES_QUERY; - } - return PointValues.Relation.CELL_INSIDE_QUERY; - } - }; - final long threshold = iteratorCost >>> 3; - - if (PointValues.isEstimatedPointCountGreaterThanOrEqualTo(visitor, pointTree, threshold)) { - // the new range is not selective enough to be worth materializing, it doesn't reduce number - // of docs at least 8x - updateSkipInterval(false); - if (pointValues.getDocCount() < iteratorCost) { - // Use the set of doc with values to help drive iteration - competitiveIterator = getNumericDocValues(context, field); - iteratorCost = pointValues.getDocCount(); + /** + * Find out the value that threshold docs away from topValue/infinite. + * + * <p>TODO: we are assuming a binary tree + */ + private long intersectThresholdValue(long threshold) throws IOException { Review Comment: My understanding is that we're trying to compute the threshold-th value from the tree. Do we actually need to distinguish the L2R and R2L cases, or could we more simply look for the `pointValues.size() - threshold`-th value in the reverse case? ########## lucene/core/src/java/org/apache/lucene/search/comparators/DoubleComparator.java: ########## @@ -54,6 +54,11 @@ public Double value(int slot) { return Double.valueOf(values[slot]); } + @Override + protected long missingValueAsComparableLong() { + return NumericUtils.doubleToSortableLong(missingValue); + } Review Comment: I'm looking at this change and it's not clear to me if it is strictly required to introduce this new API. It's annoying me a bit because I'm already not a fan of the fact that we mix comparisons in double space (`Double#compare`) and sometimes in binary encoded space (running the range query on points encoded as a `byte[]`), and now we're also comparing values as `long`s. Could your optimization work without touching this part of the API? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org