jpountz commented on PR #14077:
URL: https://github.com/apache/lucene/pull/14077#issuecomment-2549190865
To benchmark this change, I applied a (quick and dirty) patch to luceneutil
so that live docs are represented by a mix of 3 `Bits` implementations, with a
`FixedBitSet` on 75% of segments and two trivial match-all implementations
splitting the remaining 25%:
```patch
diff --git a/src/main/perf/SearchPerfTest.java
b/src/main/perf/SearchPerfTest.java
index 1819be5..2bbe46b 100755
--- a/src/main/perf/SearchPerfTest.java
+++ b/src/main/perf/SearchPerfTest.java
@@ -25,7 +25,6 @@ package perf;
import java.io.IOException;
import java.io.PrintStream;
-import java.lang.management.ManagementFactory;
import java.lang.management.ThreadInfo;
import java.nio.file.Path;
import java.nio.file.Paths;
@@ -42,6 +41,7 @@ import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.lucene.analysis.Analyzer;
@@ -60,6 +60,8 @@ import
org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.ExitableDirectoryReader;
+import org.apache.lucene.index.FilterDirectoryReader;
+import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
@@ -80,7 +82,9 @@ import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.spell.DirectSpellChecker;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NRTCachingDirectory;
+import org.apache.lucene.util.Bits;
import org.apache.lucene.util.Constants;
+import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.NamedThreadFactory;
import org.apache.lucene.util.PrintStreamInfoStream;
@@ -470,7 +474,7 @@ public class SearchPerfTest {
} else {
dir = dir0;
writer = null;
- final DirectoryReader _reader;
+ DirectoryReader _reader;
if (commit != null && commit.length() > 0) {
System.out.println("Opening searcher on commit=" + commit);
_reader = DirectoryReader.open(PerfUtils.findCommitPoint(commit,
dir));
@@ -479,7 +483,9 @@ public class SearchPerfTest {
_reader = DirectoryReader.open(dir);
}
// if exitable == true, wrap the directory readery by
ExitableDirectoryReader with (almost) infinite timeout budget.
- final DirectoryReader reader = exitable ?
ExitableDirectoryReader.wrap(_reader, new QueryTimeoutImpl(-1L)) : _reader;
+ _reader = exitable ? ExitableDirectoryReader.wrap(_reader, new
QueryTimeoutImpl(-1L)) : _reader;
+ _reader = new DeletesDirectoryReader(_reader);
+ final DirectoryReader reader = _reader;
IndexSearcher s = createIndexSearcher(reader, executorService);
s.setQueryCache(null); // don't bench the cache
@@ -500,7 +506,7 @@ public class SearchPerfTest {
// TODO: sort by descending segment size -- it makes it easier to
eyeball the segment -> slice mapping. OR, maybe just
// print the slices not the segments?
for (LeafReaderContext leaf : s.getIndexReader().leaves()) {
- System.out.println(" " + ((SegmentReader)
leaf.reader()).getSegmentName() + " has maxDoc=" + leaf.reader().maxDoc());
+ //System.out.println(" " + ((SegmentReader)
leaf.reader()).getSegmentName() + " has maxDoc=" + leaf.reader().maxDoc());
}
} finally {
mgr.release(s);
@@ -774,4 +780,102 @@ public class SearchPerfTest {
private static IndexSearcher createIndexSearcher(IndexReader reader,
ExecutorService executorService) {
return new IndexSearcher(reader, executorService);
}
+
+ private static class MatchAllBits1 implements Bits {
+
+ private final int length;
+
+ MatchAllBits1(int length) {
+ this.length = length;
+ }
+
+ @Override
+ public boolean get(int index) {
+ return true;
+ }
+
+ @Override
+ public int length() {
+ return length;
+ }
+
+ }
+
+ private static class MatchAllBits2 implements Bits {
+
+ private final int length;
+
+ MatchAllBits2(int length) {
+ this.length = length;
+ }
+
+ @Override
+ public boolean get(int index) {
+ return true;
+ }
+
+ @Override
+ public int length() {
+ return length;
+ }
+
+ }
+
+ private static class DeletesDirectoryReader extends FilterDirectoryReader
{
+
+ private static final AtomicInteger COUNTER = new AtomicInteger();
+
+ public DeletesDirectoryReader(DirectoryReader in) throws IOException {
+ super(in, new SubReaderWrapper() {
+
+ @Override
+ public LeafReader wrap(LeafReader reader) {
+ final Bits bits;
+ final int c = COUNTER.getAndIncrement();
+ switch (c & 0x07) {
+ case 6:
+ bits = new MatchAllBits1(reader.maxDoc());
+ break;
+ case 7:
+ bits = new MatchAllBits2(reader.maxDoc());
+ break;
+ default:
+ FixedBitSet bitSet = new FixedBitSet(reader.maxDoc());
+ bitSet.set(0, reader.maxDoc());
+ bits = bitSet;
+ break;
+ }
+ return new FilterLeafReader(reader) {
+
+ @Override
+ public Bits getLiveDocs() {
+ return bits;
+ }
+
+ @Override
+ public CacheHelper getCoreCacheHelper() {
+ return in.getCoreCacheHelper();
+ }
+
+ @Override
+ public CacheHelper getReaderCacheHelper() {
+ return in.getReaderCacheHelper();
+ }
+
+ };
+ }
+ });
+ }
+
+ @Override
+ protected DirectoryReader doWrapDirectoryReader(DirectoryReader in)
throws IOException {
+ return new DeletesDirectoryReader(in);
+ }
+
+ @Override
+ public CacheHelper getReaderCacheHelper() {
+ return in.getReaderCacheHelper();
+ }
+
+ }
}
```
Here are the results on wikibigall:
```
Task   QPS baseline   StdDev   QPS my_modified_version
StdDev   Pct diff   p-value
Phrase 14.97 (4.7%) 14.74
(4.4%) -1.5% ( -10% - 7%) 0.286
OrStopWords 31.60 (5.7%) 31.28
(6.5%) -1.0% ( -12% - 11%) 0.598
CombinedTerm 30.82 (2.3%) 30.58
(2.0%) -0.8% ( -4% - 3%) 0.263
FilteredIntNRQ 109.42 (11.9%) 108.61
(10.7%) -0.7% ( -20% - 24%) 0.836
Or2Terms2StopWords 154.39 (3.3%) 153.44
(4.3%) -0.6% ( -7% - 7%) 0.614
Or3Terms 162.40 (3.4%) 161.57
(4.4%) -0.5% ( -8% - 7%) 0.677
OrHighMed 180.29 (3.1%) 179.46
(3.0%) -0.5% ( -6% - 5%) 0.635
TermDTSort 275.19 (6.3%) 273.98
(6.5%) -0.4% ( -12% - 13%) 0.830
CombinedAndHighMed 53.78 (1.3%) 53.55
(2.4%) -0.4% ( -4% - 3%) 0.486
IntNRQ 110.08 (11.1%) 109.72
(10.9%) -0.3% ( -20% - 24%) 0.925
OrHighHigh 48.85 (3.5%) 48.72
(3.5%) -0.3% ( -7% - 6%) 0.808
CombinedAndHighHigh 14.84 (1.5%) 14.81
(2.3%) -0.2% ( -3% - 3%) 0.757
CountTerm 9290.73 (3.1%) 9275.15
(3.8%) -0.2% ( -6% - 6%) 0.878
CountFilteredPhrase 24.94 (2.0%) 24.91
(1.9%) -0.1% ( -3% - 3%) 0.839
Fuzzy1 81.01 (2.6%) 80.98
(3.0%) -0.0% ( -5% - 5%) 0.962
CountPhrase 4.19 (1.3%) 4.19
(1.5%) -0.0% ( -2% - 2%) 0.981
FilteredOr2Terms2StopWords 145.89 (1.3%) 145.88
(1.0%) -0.0% ( -2% - 2%) 0.995
FilteredOr3Terms 162.52 (0.8%) 162.53
(1.0%) 0.0% ( -1% - 1%) 0.979
FilteredOrHighMed 151.45 (1.2%) 151.47
(1.0%) 0.0% ( -2% - 2%) 0.971
FilteredPhrase 29.61 (1.8%) 29.62
(1.6%) 0.0% ( -3% - 3%) 0.976
Wildcard 78.42 (3.9%) 78.45
(4.4%) 0.0% ( -7% - 8%) 0.978
TermMonthSort 3318.21 (2.8%) 3319.46
(2.2%) 0.0% ( -4% - 5%) 0.962
CombinedOrHighHigh 18.19 (1.5%) 18.20
(1.5%) 0.1% ( -2% - 3%) 0.844
CombinedOrHighMed 68.94 (1.2%) 69.02
(1.5%) 0.1% ( -2% - 2%) 0.770
Fuzzy2 76.21 (2.3%) 76.31
(2.7%) 0.1% ( -4% - 5%) 0.863
DismaxOrHighMed 160.60 (2.1%) 160.82
(2.2%) 0.1% ( -4% - 4%) 0.838
CountFilteredOrHighHigh 61.33 (2.4%) 61.46
(1.9%) 0.2% ( -3% - 4%) 0.750
OrMany 18.89 (4.2%) 18.95
(4.9%) 0.3% ( -8% - 9%) 0.840
FilteredOrStopWords 42.60 (2.2%) 42.74
(1.8%) 0.3% ( -3% - 4%) 0.609
FilteredAndStopWords 45.95 (2.1%) 46.12
(4.2%) 0.4% ( -5% - 6%) 0.724
CountFilteredOrHighMed 66.66 (1.9%) 66.92
(1.7%) 0.4% ( -3% - 4%) 0.501
CountFilteredOrMany 8.38 (2.4%) 8.41
(2.2%) 0.4% ( -4% - 5%) 0.552
FilteredOrHighHigh 63.37 (2.1%) 63.67
(1.5%) 0.5% ( -3% - 4%) 0.423
FilteredTerm 151.98 (1.9%) 152.93
(2.2%) 0.6% ( -3% - 4%) 0.337
DismaxTerm 563.85 (3.9%) 567.45
(3.3%) 0.6% ( -6% - 8%) 0.579
FilteredAnd2Terms2StopWords 190.41 (1.5%) 191.89
(2.8%) 0.8% ( -3% - 5%) 0.277
AndMedOrHighHigh 58.23 (1.3%) 58.70
(2.5%) 0.8% ( -2% - 4%) 0.195
TermTitleSort 151.17 (2.0%) 152.42
(1.9%) 0.8% ( -2% - 4%) 0.176
FilteredAndHighHigh 59.87 (2.2%) 60.42
(4.1%) 0.9% ( -5% - 7%) 0.380
And2Terms2StopWords 155.29 (2.9%) 156.72
(4.0%) 0.9% ( -5% - 8%) 0.406
PKLookup 269.87 (2.5%) 272.71
(2.4%) 1.1% ( -3% - 6%) 0.177
TermDayOfYearSort 606.93 (2.4%) 613.34
(2.1%) 1.1% ( -3% - 5%) 0.143
Prefix3 137.51 (4.8%) 139.06
(4.4%) 1.1% ( -7% - 10%) 0.439
And3Terms 167.19 (3.0%) 169.08
(4.4%) 1.1% ( -6% - 8%) 0.344
FilteredAnd3Terms 187.71 (1.9%) 190.04
(3.2%) 1.2% ( -3% - 6%) 0.139
AndHighOrMedMed 42.66 (1.0%) 43.20
(1.2%) 1.3% ( 0% - 3%) 0.000
FilteredPrefix3 131.57 (4.5%) 133.28
(4.3%) 1.3% ( -7% - 10%) 0.352
Term 465.25 (4.6%) 471.42
(3.0%) 1.3% ( -5% - 9%) 0.278
DismaxOrHighHigh 112.06 (3.7%) 113.64
(2.7%) 1.4% ( -4% - 8%) 0.167
FilteredOrMany 16.76 (3.4%) 17.01
(3.4%) 1.5% ( -5% - 8%) 0.177
FilteredAndHighMed 123.47 (2.6%) 125.27
(4.5%) 1.5% ( -5% - 8%) 0.206
AndStopWords 29.84 (4.5%) 30.27
(6.7%) 1.5% ( -9% - 13%) 0.413
AndHighMed 120.16 (1.5%) 123.26
(2.4%) 2.6% ( -1% - 6%) 0.000
AndHighHigh 41.70 (1.8%) 42.82
(3.0%) 2.7% ( -2% - 7%) 0.001
CountAndHighMed 149.35 (3.9%) 154.71
(2.5%) 3.6% ( -2% - 10%) 0.000
CountAndHighHigh 51.38 (3.4%) 53.27
(2.3%) 3.7% ( -1% - 9%) 0.000
OrHighRare 259.51 (9.2%) 270.60
(5.8%) 4.3% ( -9% - 21%) 0.078
CountOrHighMed 78.97 (6.6%) 96.90
(10.8%) 22.7% ( 5% - 42%) 0.000
CountOrMany 3.58 (8.1%) 4.58
(13.1%) 27.9% ( 6% - 53%) 0.000
CountOrHighHigh 35.97 (8.4%) 46.31
(14.2%) 28.7% ( 5% - 56%) 0.000
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]