jpountz commented on PR #14077: URL: https://github.com/apache/lucene/pull/14077#issuecomment-2549190865
To benchmark this change, I applied a quick-and-dirty patch to luceneutil so that live docs are represented by a mix of three `Bits` implementations: a `FixedBitSet` on 75% of segments and two trivial match-all implementations (`MatchAllBits1` and `MatchAllBits2`) on the remaining 25%. Every bit is set in all three, so no document is actually treated as deleted: ```patch diff --git a/src/main/perf/SearchPerfTest.java b/src/main/perf/SearchPerfTest.java index 1819be5..2bbe46b 100755 --- a/src/main/perf/SearchPerfTest.java +++ b/src/main/perf/SearchPerfTest.java @@ -25,7 +25,6 @@ package perf; import java.io.IOException; import java.io.PrintStream; -import java.lang.management.ManagementFactory; import java.lang.management.ThreadInfo; import java.nio.file.Path; import java.nio.file.Paths; @@ -42,6 +41,7 @@ import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicReference; import org.apache.lucene.analysis.Analyzer; @@ -60,6 +60,8 @@ import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader; import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.ExitableDirectoryReader; +import org.apache.lucene.index.FilterDirectoryReader; +import org.apache.lucene.index.FilterLeafReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; @@ -80,7 +82,9 @@ import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.search.spell.DirectSpellChecker; import org.apache.lucene.store.Directory; import org.apache.lucene.store.NRTCachingDirectory; +import org.apache.lucene.util.Bits; import org.apache.lucene.util.Constants; +import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.InfoStream; import org.apache.lucene.util.NamedThreadFactory; import org.apache.lucene.util.PrintStreamInfoStream; @@ -470,7 +474,7 @@ public class 
SearchPerfTest { } else { dir = dir0; writer = null; - final DirectoryReader _reader; + DirectoryReader _reader; if (commit != null && commit.length() > 0) { System.out.println("Opening searcher on commit=" + commit); _reader = DirectoryReader.open(PerfUtils.findCommitPoint(commit, dir)); @@ -479,7 +483,9 @@ public class SearchPerfTest { _reader = DirectoryReader.open(dir); } // if exitable == true, wrap the directory readery by ExitableDirectoryReader with (almost) infinite timeout budget. - final DirectoryReader reader = exitable ? ExitableDirectoryReader.wrap(_reader, new QueryTimeoutImpl(-1L)) : _reader; + _reader = exitable ? ExitableDirectoryReader.wrap(_reader, new QueryTimeoutImpl(-1L)) : _reader; + _reader = new DeletesDirectoryReader(_reader); + final DirectoryReader reader = _reader; IndexSearcher s = createIndexSearcher(reader, executorService); s.setQueryCache(null); // don't bench the cache @@ -500,7 +506,7 @@ public class SearchPerfTest { // TODO: sort by descending segment size -- it makes it easier to eyeball the segment -> slice mapping. OR, maybe just // print the slices not the segments? 
for (LeafReaderContext leaf : s.getIndexReader().leaves()) { - System.out.println(" " + ((SegmentReader) leaf.reader()).getSegmentName() + " has maxDoc=" + leaf.reader().maxDoc()); + //System.out.println(" " + ((SegmentReader) leaf.reader()).getSegmentName() + " has maxDoc=" + leaf.reader().maxDoc()); } } finally { mgr.release(s); @@ -774,4 +780,102 @@ public class SearchPerfTest { private static IndexSearcher createIndexSearcher(IndexReader reader, ExecutorService executorService) { return new IndexSearcher(reader, executorService); } + + private static class MatchAllBits1 implements Bits { + + private final int length; + + MatchAllBits1(int length) { + this.length = length; + } + + @Override + public boolean get(int index) { + return true; + } + + @Override + public int length() { + return length; + } + + } + + private static class MatchAllBits2 implements Bits { + + private final int length; + + MatchAllBits2(int length) { + this.length = length; + } + + @Override + public boolean get(int index) { + return true; + } + + @Override + public int length() { + return length; + } + + } + + private static class DeletesDirectoryReader extends FilterDirectoryReader { + + private static final AtomicInteger COUNTER = new AtomicInteger(); + + public DeletesDirectoryReader(DirectoryReader in) throws IOException { + super(in, new SubReaderWrapper() { + + @Override + public LeafReader wrap(LeafReader reader) { + final Bits bits; + final int c = COUNTER.getAndIncrement(); + switch (c & 0x07) { + case 6: + bits = new MatchAllBits1(reader.maxDoc()); + break; + case 7: + bits = new MatchAllBits2(reader.maxDoc()); + break; + default: + FixedBitSet bitSet = new FixedBitSet(reader.maxDoc()); + bitSet.set(0, reader.maxDoc()); + bits = bitSet; + break; + } + return new FilterLeafReader(reader) { + + @Override + public Bits getLiveDocs() { + return bits; + } + + @Override + public CacheHelper getCoreCacheHelper() { + return in.getCoreCacheHelper(); + } + + @Override + public CacheHelper 
getReaderCacheHelper() { + return in.getReaderCacheHelper(); + } + + }; + } + }); + } + + @Override + protected DirectoryReader doWrapDirectoryReader(DirectoryReader in) throws IOException { + return new DeletesDirectoryReader(in); + } + + @Override + public CacheHelper getReaderCacheHelper() { + return in.getReaderCacheHelper(); + } + + } } ``` Here are the results on wikibigall: ``` TaskQPS baseline StdDevQPS my_modified_version StdDev Pct diff p-value Phrase 14.97 (4.7%) 14.74 (4.4%) -1.5% ( -10% - 7%) 0.286 OrStopWords 31.60 (5.7%) 31.28 (6.5%) -1.0% ( -12% - 11%) 0.598 CombinedTerm 30.82 (2.3%) 30.58 (2.0%) -0.8% ( -4% - 3%) 0.263 FilteredIntNRQ 109.42 (11.9%) 108.61 (10.7%) -0.7% ( -20% - 24%) 0.836 Or2Terms2StopWords 154.39 (3.3%) 153.44 (4.3%) -0.6% ( -7% - 7%) 0.614 Or3Terms 162.40 (3.4%) 161.57 (4.4%) -0.5% ( -8% - 7%) 0.677 OrHighMed 180.29 (3.1%) 179.46 (3.0%) -0.5% ( -6% - 5%) 0.635 TermDTSort 275.19 (6.3%) 273.98 (6.5%) -0.4% ( -12% - 13%) 0.830 CombinedAndHighMed 53.78 (1.3%) 53.55 (2.4%) -0.4% ( -4% - 3%) 0.486 IntNRQ 110.08 (11.1%) 109.72 (10.9%) -0.3% ( -20% - 24%) 0.925 OrHighHigh 48.85 (3.5%) 48.72 (3.5%) -0.3% ( -7% - 6%) 0.808 CombinedAndHighHigh 14.84 (1.5%) 14.81 (2.3%) -0.2% ( -3% - 3%) 0.757 CountTerm 9290.73 (3.1%) 9275.15 (3.8%) -0.2% ( -6% - 6%) 0.878 CountFilteredPhrase 24.94 (2.0%) 24.91 (1.9%) -0.1% ( -3% - 3%) 0.839 Fuzzy1 81.01 (2.6%) 80.98 (3.0%) -0.0% ( -5% - 5%) 0.962 CountPhrase 4.19 (1.3%) 4.19 (1.5%) -0.0% ( -2% - 2%) 0.981 FilteredOr2Terms2StopWords 145.89 (1.3%) 145.88 (1.0%) -0.0% ( -2% - 2%) 0.995 FilteredOr3Terms 162.52 (0.8%) 162.53 (1.0%) 0.0% ( -1% - 1%) 0.979 FilteredOrHighMed 151.45 (1.2%) 151.47 (1.0%) 0.0% ( -2% - 2%) 0.971 FilteredPhrase 29.61 (1.8%) 29.62 (1.6%) 0.0% ( -3% - 3%) 0.976 Wildcard 78.42 (3.9%) 78.45 (4.4%) 0.0% ( -7% - 8%) 0.978 TermMonthSort 3318.21 (2.8%) 3319.46 (2.2%) 0.0% ( -4% - 5%) 0.962 CombinedOrHighHigh 18.19 (1.5%) 18.20 (1.5%) 0.1% ( -2% - 3%) 0.844 CombinedOrHighMed 68.94 (1.2%) 69.02 
(1.5%) 0.1% ( -2% - 2%) 0.770 Fuzzy2 76.21 (2.3%) 76.31 (2.7%) 0.1% ( -4% - 5%) 0.863 DismaxOrHighMed 160.60 (2.1%) 160.82 (2.2%) 0.1% ( -4% - 4%) 0.838 CountFilteredOrHighHigh 61.33 (2.4%) 61.46 (1.9%) 0.2% ( -3% - 4%) 0.750 OrMany 18.89 (4.2%) 18.95 (4.9%) 0.3% ( -8% - 9%) 0.840 FilteredOrStopWords 42.60 (2.2%) 42.74 (1.8%) 0.3% ( -3% - 4%) 0.609 FilteredAndStopWords 45.95 (2.1%) 46.12 (4.2%) 0.4% ( -5% - 6%) 0.724 CountFilteredOrHighMed 66.66 (1.9%) 66.92 (1.7%) 0.4% ( -3% - 4%) 0.501 CountFilteredOrMany 8.38 (2.4%) 8.41 (2.2%) 0.4% ( -4% - 5%) 0.552 FilteredOrHighHigh 63.37 (2.1%) 63.67 (1.5%) 0.5% ( -3% - 4%) 0.423 FilteredTerm 151.98 (1.9%) 152.93 (2.2%) 0.6% ( -3% - 4%) 0.337 DismaxTerm 563.85 (3.9%) 567.45 (3.3%) 0.6% ( -6% - 8%) 0.579 FilteredAnd2Terms2StopWords 190.41 (1.5%) 191.89 (2.8%) 0.8% ( -3% - 5%) 0.277 AndMedOrHighHigh 58.23 (1.3%) 58.70 (2.5%) 0.8% ( -2% - 4%) 0.195 TermTitleSort 151.17 (2.0%) 152.42 (1.9%) 0.8% ( -2% - 4%) 0.176 FilteredAndHighHigh 59.87 (2.2%) 60.42 (4.1%) 0.9% ( -5% - 7%) 0.380 And2Terms2StopWords 155.29 (2.9%) 156.72 (4.0%) 0.9% ( -5% - 8%) 0.406 PKLookup 269.87 (2.5%) 272.71 (2.4%) 1.1% ( -3% - 6%) 0.177 TermDayOfYearSort 606.93 (2.4%) 613.34 (2.1%) 1.1% ( -3% - 5%) 0.143 Prefix3 137.51 (4.8%) 139.06 (4.4%) 1.1% ( -7% - 10%) 0.439 And3Terms 167.19 (3.0%) 169.08 (4.4%) 1.1% ( -6% - 8%) 0.344 FilteredAnd3Terms 187.71 (1.9%) 190.04 (3.2%) 1.2% ( -3% - 6%) 0.139 AndHighOrMedMed 42.66 (1.0%) 43.20 (1.2%) 1.3% ( 0% - 3%) 0.000 FilteredPrefix3 131.57 (4.5%) 133.28 (4.3%) 1.3% ( -7% - 10%) 0.352 Term 465.25 (4.6%) 471.42 (3.0%) 1.3% ( -5% - 9%) 0.278 DismaxOrHighHigh 112.06 (3.7%) 113.64 (2.7%) 1.4% ( -4% - 8%) 0.167 FilteredOrMany 16.76 (3.4%) 17.01 (3.4%) 1.5% ( -5% - 8%) 0.177 FilteredAndHighMed 123.47 (2.6%) 125.27 (4.5%) 1.5% ( -5% - 8%) 0.206 AndStopWords 29.84 (4.5%) 30.27 (6.7%) 1.5% ( -9% - 13%) 0.413 AndHighMed 120.16 (1.5%) 123.26 (2.4%) 2.6% ( -1% - 6%) 0.000 AndHighHigh 41.70 (1.8%) 42.82 (3.0%) 2.7% ( -2% - 7%) 0.001 
CountAndHighMed 149.35 (3.9%) 154.71 (2.5%) 3.6% ( -2% - 10%) 0.000 CountAndHighHigh 51.38 (3.4%) 53.27 (2.3%) 3.7% ( -1% - 9%) 0.000 OrHighRare 259.51 (9.2%) 270.60 (5.8%) 4.3% ( -9% - 21%) 0.078 CountOrHighMed 78.97 (6.6%) 96.90 (10.8%) 22.7% ( 5% - 42%) 0.000 CountOrMany 3.58 (8.1%) 4.58 (13.1%) 27.9% ( 6% - 53%) 0.000 CountOrHighHigh 35.97 (8.4%) 46.31 (14.2%) 28.7% ( 5% - 56%) 0.000 ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org