Re: [PR] Make inlining of FixedBitSet#get more predictable when checking live docs. [lucene]

via GitHub Tue, 17 Dec 2024 09:52:59 -0800


jpountz commented on PR #14077:
URL: https://github.com/apache/lucene/pull/14077#issuecomment-2549190865


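   To benchmark this change, I applied a (quick and dirty) patch to luceneutil 
that wraps each segment's live docs in one of three `Bits` implementations, 
using a `FixedBitSet` on 75% of segments (6 out of every 8) and one of two 
trivial match-all implementations on the rest. Every document stays live in 
all three cases, so only the `Bits` implementation type varies: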
   
   ```patch
   diff --git a/src/main/perf/SearchPerfTest.java 
b/src/main/perf/SearchPerfTest.java
   index 1819be5..2bbe46b 100755
   --- a/src/main/perf/SearchPerfTest.java
   +++ b/src/main/perf/SearchPerfTest.java
   @@ -25,7 +25,6 @@ package perf;
    
    import java.io.IOException;
    import java.io.PrintStream;
   -import java.lang.management.ManagementFactory;
    import java.lang.management.ThreadInfo;
    import java.nio.file.Path;
    import java.nio.file.Paths;
   @@ -42,6 +41,7 @@ import java.util.concurrent.LinkedBlockingQueue;
    import java.util.concurrent.ThreadPoolExecutor;
    import java.util.concurrent.TimeUnit;
    import java.util.concurrent.atomic.AtomicBoolean;
   +import java.util.concurrent.atomic.AtomicInteger;
    import java.util.concurrent.atomic.AtomicReference;
    
    import org.apache.lucene.analysis.Analyzer;
   @@ -60,6 +60,8 @@ import 
org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
    import org.apache.lucene.index.ConcurrentMergeScheduler;
    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.ExitableDirectoryReader;
   +import org.apache.lucene.index.FilterDirectoryReader;
   +import org.apache.lucene.index.FilterLeafReader;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
   @@ -80,7 +82,9 @@ import org.apache.lucene.search.similarities.Similarity;
    import org.apache.lucene.search.spell.DirectSpellChecker;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.NRTCachingDirectory;
   +import org.apache.lucene.util.Bits;
    import org.apache.lucene.util.Constants;
   +import org.apache.lucene.util.FixedBitSet;
    import org.apache.lucene.util.InfoStream;
    import org.apache.lucene.util.NamedThreadFactory;
    import org.apache.lucene.util.PrintStreamInfoStream;
   @@ -470,7 +474,7 @@ public class SearchPerfTest {
        } else {
          dir = dir0;
          writer = null;
   -      final DirectoryReader _reader;
   +      DirectoryReader _reader;
          if (commit != null && commit.length() > 0) {
            System.out.println("Opening searcher on commit=" + commit);
            _reader = DirectoryReader.open(PerfUtils.findCommitPoint(commit, 
dir));
   @@ -479,7 +483,9 @@ public class SearchPerfTest {
            _reader = DirectoryReader.open(dir);
          }
          // if exitable == true, wrap the directory readery by 
ExitableDirectoryReader with (almost) infinite timeout budget.
   -      final DirectoryReader reader = exitable ? 
ExitableDirectoryReader.wrap(_reader, new QueryTimeoutImpl(-1L)) : _reader;
   +      _reader = exitable ? ExitableDirectoryReader.wrap(_reader, new 
QueryTimeoutImpl(-1L)) : _reader;
   +      _reader = new DeletesDirectoryReader(_reader);
   +      final DirectoryReader reader = _reader;
    
          IndexSearcher s = createIndexSearcher(reader, executorService);
          s.setQueryCache(null); // don't bench the cache
   @@ -500,7 +506,7 @@ public class SearchPerfTest {
            // TODO: sort by descending segment size -- it makes it easier to 
eyeball the segment -> slice mapping.  OR, maybe just
            // print the slices not the segments?
            for (LeafReaderContext leaf : s.getIndexReader().leaves()) {
   -          System.out.println("  " + ((SegmentReader) 
leaf.reader()).getSegmentName() + " has maxDoc=" + leaf.reader().maxDoc());
   +          //System.out.println("  " + ((SegmentReader) 
leaf.reader()).getSegmentName() + " has maxDoc=" + leaf.reader().maxDoc());
            }
          } finally {
            mgr.release(s);
   @@ -774,4 +780,102 @@ public class SearchPerfTest {
      private static IndexSearcher createIndexSearcher(IndexReader reader, 
ExecutorService executorService) {
          return new IndexSearcher(reader, executorService);
      }
   +
   +  private static class MatchAllBits1 implements Bits {
   +
   +    private final int length;
   +
   +    MatchAllBits1(int length) {
   +      this.length = length;
   +    }
   +
   +    @Override
   +    public boolean get(int index) {
   +      return true;
   +    }
   +
   +    @Override
   +    public int length() {
   +      return length;
   +    }
   +
   +  }
   +
   +  private static class MatchAllBits2 implements Bits {
   +
   +    private final int length;
   +
   +    MatchAllBits2(int length) {
   +      this.length = length;
   +    }
   +
   +    @Override
   +    public boolean get(int index) {
   +      return true;
   +    }
   +
   +    @Override
   +    public int length() {
   +      return length;
   +    }
   +
   +  }
   +
   +  private static class DeletesDirectoryReader extends FilterDirectoryReader 
{
   +
   +    private static final AtomicInteger COUNTER = new AtomicInteger();
   +
   +    public DeletesDirectoryReader(DirectoryReader in) throws IOException {
   +      super(in, new SubReaderWrapper() {
   +
   +        @Override
   +        public LeafReader wrap(LeafReader reader) {
   +          final Bits bits;
   +          final int c = COUNTER.getAndIncrement();
   +          switch (c & 0x07) {
   +          case 6:
   +            bits = new MatchAllBits1(reader.maxDoc());
   +            break;
   +          case 7:
   +            bits = new MatchAllBits2(reader.maxDoc());
   +            break;
   +          default:
   +            FixedBitSet bitSet = new FixedBitSet(reader.maxDoc());
   +            bitSet.set(0, reader.maxDoc());
   +            bits = bitSet;
   +            break;
   +          }
   +          return new FilterLeafReader(reader) {
   +
   +            @Override
   +            public Bits getLiveDocs() {
   +              return bits;
   +            }
   +
   +            @Override
   +            public CacheHelper getCoreCacheHelper() {
   +              return in.getCoreCacheHelper();
   +            }
   +
   +            @Override
   +            public CacheHelper getReaderCacheHelper() {
   +              return in.getReaderCacheHelper();
   +            }
   +
   +          };
   +        }
   +      });
   +    }
   +
   +    @Override
   +    protected DirectoryReader doWrapDirectoryReader(DirectoryReader in) 
throws IOException {
   +      return new DeletesDirectoryReader(in);
   +    }
   +
   +    @Override
   +    public CacheHelper getReaderCacheHelper() {
   +      return in.getReaderCacheHelper();
   +    }
   +
   +  }
    }
   ```
   
   Here are the results on wikibigall:
   
   ```
                               TaskQPS baseline      StdDevQPS 
my_modified_version      StdDev                Pct diff p-value
                             Phrase       14.97      (4.7%)       14.74      
(4.4%)   -1.5% ( -10% -    7%) 0.286
                        OrStopWords       31.60      (5.7%)       31.28      
(6.5%)   -1.0% ( -12% -   11%) 0.598
                       CombinedTerm       30.82      (2.3%)       30.58      
(2.0%)   -0.8% (  -4% -    3%) 0.263
                     FilteredIntNRQ      109.42     (11.9%)      108.61     
(10.7%)   -0.7% ( -20% -   24%) 0.836
                 Or2Terms2StopWords      154.39      (3.3%)      153.44      
(4.3%)   -0.6% (  -7% -    7%) 0.614
                           Or3Terms      162.40      (3.4%)      161.57      
(4.4%)   -0.5% (  -8% -    7%) 0.677
                          OrHighMed      180.29      (3.1%)      179.46      
(3.0%)   -0.5% (  -6% -    5%) 0.635
                         TermDTSort      275.19      (6.3%)      273.98      
(6.5%)   -0.4% ( -12% -   13%) 0.830
                 CombinedAndHighMed       53.78      (1.3%)       53.55      
(2.4%)   -0.4% (  -4% -    3%) 0.486
                             IntNRQ      110.08     (11.1%)      109.72     
(10.9%)   -0.3% ( -20% -   24%) 0.925
                         OrHighHigh       48.85      (3.5%)       48.72      
(3.5%)   -0.3% (  -7% -    6%) 0.808
                CombinedAndHighHigh       14.84      (1.5%)       14.81      
(2.3%)   -0.2% (  -3% -    3%) 0.757
                          CountTerm     9290.73      (3.1%)     9275.15      
(3.8%)   -0.2% (  -6% -    6%) 0.878
                CountFilteredPhrase       24.94      (2.0%)       24.91      
(1.9%)   -0.1% (  -3% -    3%) 0.839
                             Fuzzy1       81.01      (2.6%)       80.98      
(3.0%)   -0.0% (  -5% -    5%) 0.962
                        CountPhrase        4.19      (1.3%)        4.19      
(1.5%)   -0.0% (  -2% -    2%) 0.981
         FilteredOr2Terms2StopWords      145.89      (1.3%)      145.88      
(1.0%)   -0.0% (  -2% -    2%) 0.995
                   FilteredOr3Terms      162.52      (0.8%)      162.53      
(1.0%)    0.0% (  -1% -    1%) 0.979
                  FilteredOrHighMed      151.45      (1.2%)      151.47      
(1.0%)    0.0% (  -2% -    2%) 0.971
                     FilteredPhrase       29.61      (1.8%)       29.62      
(1.6%)    0.0% (  -3% -    3%) 0.976
                           Wildcard       78.42      (3.9%)       78.45      
(4.4%)    0.0% (  -7% -    8%) 0.978
                      TermMonthSort     3318.21      (2.8%)     3319.46      
(2.2%)    0.0% (  -4% -    5%) 0.962
                 CombinedOrHighHigh       18.19      (1.5%)       18.20      
(1.5%)    0.1% (  -2% -    3%) 0.844
                  CombinedOrHighMed       68.94      (1.2%)       69.02      
(1.5%)    0.1% (  -2% -    2%) 0.770
                             Fuzzy2       76.21      (2.3%)       76.31      
(2.7%)    0.1% (  -4% -    5%) 0.863
                    DismaxOrHighMed      160.60      (2.1%)      160.82      
(2.2%)    0.1% (  -4% -    4%) 0.838
            CountFilteredOrHighHigh       61.33      (2.4%)       61.46      
(1.9%)    0.2% (  -3% -    4%) 0.750
                             OrMany       18.89      (4.2%)       18.95      
(4.9%)    0.3% (  -8% -    9%) 0.840
                FilteredOrStopWords       42.60      (2.2%)       42.74      
(1.8%)    0.3% (  -3% -    4%) 0.609
               FilteredAndStopWords       45.95      (2.1%)       46.12      
(4.2%)    0.4% (  -5% -    6%) 0.724
             CountFilteredOrHighMed       66.66      (1.9%)       66.92      
(1.7%)    0.4% (  -3% -    4%) 0.501
                CountFilteredOrMany        8.38      (2.4%)        8.41      
(2.2%)    0.4% (  -4% -    5%) 0.552
                 FilteredOrHighHigh       63.37      (2.1%)       63.67      
(1.5%)    0.5% (  -3% -    4%) 0.423
                       FilteredTerm      151.98      (1.9%)      152.93      
(2.2%)    0.6% (  -3% -    4%) 0.337
                         DismaxTerm      563.85      (3.9%)      567.45      
(3.3%)    0.6% (  -6% -    8%) 0.579
        FilteredAnd2Terms2StopWords      190.41      (1.5%)      191.89      
(2.8%)    0.8% (  -3% -    5%) 0.277
                   AndMedOrHighHigh       58.23      (1.3%)       58.70      
(2.5%)    0.8% (  -2% -    4%) 0.195
                      TermTitleSort      151.17      (2.0%)      152.42      
(1.9%)    0.8% (  -2% -    4%) 0.176
                FilteredAndHighHigh       59.87      (2.2%)       60.42      
(4.1%)    0.9% (  -5% -    7%) 0.380
                And2Terms2StopWords      155.29      (2.9%)      156.72      
(4.0%)    0.9% (  -5% -    8%) 0.406
                           PKLookup      269.87      (2.5%)      272.71      
(2.4%)    1.1% (  -3% -    6%) 0.177
                  TermDayOfYearSort      606.93      (2.4%)      613.34      
(2.1%)    1.1% (  -3% -    5%) 0.143
                            Prefix3      137.51      (4.8%)      139.06      
(4.4%)    1.1% (  -7% -   10%) 0.439
                          And3Terms      167.19      (3.0%)      169.08      
(4.4%)    1.1% (  -6% -    8%) 0.344
                  FilteredAnd3Terms      187.71      (1.9%)      190.04      
(3.2%)    1.2% (  -3% -    6%) 0.139
                    AndHighOrMedMed       42.66      (1.0%)       43.20      
(1.2%)    1.3% (   0% -    3%) 0.000
                    FilteredPrefix3      131.57      (4.5%)      133.28      
(4.3%)    1.3% (  -7% -   10%) 0.352
                               Term      465.25      (4.6%)      471.42      
(3.0%)    1.3% (  -5% -    9%) 0.278
                   DismaxOrHighHigh      112.06      (3.7%)      113.64      
(2.7%)    1.4% (  -4% -    8%) 0.167
                     FilteredOrMany       16.76      (3.4%)       17.01      
(3.4%)    1.5% (  -5% -    8%) 0.177
                 FilteredAndHighMed      123.47      (2.6%)      125.27      
(4.5%)    1.5% (  -5% -    8%) 0.206
                       AndStopWords       29.84      (4.5%)       30.27      
(6.7%)    1.5% (  -9% -   13%) 0.413
                         AndHighMed      120.16      (1.5%)      123.26      
(2.4%)    2.6% (  -1% -    6%) 0.000
                        AndHighHigh       41.70      (1.8%)       42.82      
(3.0%)    2.7% (  -2% -    7%) 0.001
                    CountAndHighMed      149.35      (3.9%)      154.71      
(2.5%)    3.6% (  -2% -   10%) 0.000
                   CountAndHighHigh       51.38      (3.4%)       53.27      
(2.3%)    3.7% (  -1% -    9%) 0.000
                         OrHighRare      259.51      (9.2%)      270.60      
(5.8%)    4.3% (  -9% -   21%) 0.078
                     CountOrHighMed       78.97      (6.6%)       96.90     
(10.8%)   22.7% (   5% -   42%) 0.000
                        CountOrMany        3.58      (8.1%)        4.58     
(13.1%)   27.9% (   6% -   53%) 0.000
                    CountOrHighHigh       35.97      (8.4%)       46.31     
(14.2%)   28.7% (   5% -   56%) 0.000
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Re: [PR] Make inlining of FixedBitSet#get more predictable when checking live docs. [lucene]

Reply via email to