jpountz commented on PR #13337:
URL: https://github.com/apache/lucene/pull/13337#issuecomment-2090882979

   
   
   <details>
   <summary>I created the following benchmark to simulate lookups in a terms 
dictionary that cannot fit in the page cache.</summary>
   
   ```java
   import java.io.IOException;
   import java.nio.file.Path;
   import java.nio.file.Paths;
   import java.util.ArrayList;
   import java.util.Arrays;
   import java.util.List;
   import java.util.Random;
   import java.util.concurrent.ThreadLocalRandom;
   
   import org.apache.lucene.store.Directory;
   import org.apache.lucene.store.IOContext;
   import org.apache.lucene.store.IndexInput;
   import org.apache.lucene.store.IndexOutput;
   import org.apache.lucene.store.NIOFSDirectory;
   
   public class PrefetchBench {
   
     private static final int NUM_TERMS = 3;
     private static final long FILE_SIZE = 100L * 1024 * 1024 * 1024; // 100GB
     private static final int NUM_BYTES = 16;
     public static int DUMMY;
   
     public static void main(String[] args) throws IOException {
       Path filePath = Paths.get(args[0]);
       Path dirPath = filePath.getParent();
       String fileName = filePath.getFileName().toString();
       Random r = ThreadLocalRandom.current();
   
       try (Directory dir = new NIOFSDirectory(dirPath)) {
         if (Arrays.asList(dir.listAll()).contains(fileName) == false) {
           try (IndexOutput out = dir.createOutput(fileName, 
IOContext.DEFAULT)) {
             byte[] buf = new byte[8196];
             for (long i = 0; i < FILE_SIZE; i += buf.length) {
               r.nextBytes(buf);
               out.writeBytes(buf, buf.length);
             }
           }
         }
   
         for (boolean dataFitsInCache : new boolean[] { false, true}) {
           try (IndexInput i0 = dir.openInput("file", IOContext.DEFAULT)) {
             byte[][] b = new byte[NUM_TERMS][];
             for (int i = 0; i < NUM_TERMS; ++i) {
               b[i] = new byte[NUM_BYTES];
             }
             IndexInput[] inputs = new IndexInput[NUM_TERMS];
             if (dataFitsInCache) {
               // 16MB slice that should easily fit in the page cache
               inputs[0] = i0.slice("slice", 0, 16 * 1024 * 1024);
             } else {
               inputs[0] = i0;
             }
             for (int i = 1; i < NUM_TERMS; ++i) {
               inputs[i] = inputs[0].clone();
             }
             final long length = inputs[0].length();
             List<Long>[] latencies = new List[2];
             latencies[0] = new ArrayList<>();
             latencies[1] = new ArrayList<>();
             for (int iter = 0; iter < 10_000; ++iter) {
               final boolean prefetch = (iter & 1) == 0;
   
               final long start = System.nanoTime();
   
               for (IndexInput ii : inputs) {
                 final long offset = r.nextLong(length - NUM_BYTES);
                 ii.seek(offset);
                 if (prefetch) {
                   ii.prefetch();
                 }
               }
   
               for (int i = 0; i < NUM_TERMS; ++i) {
                 inputs[i].readBytes(b[i], 0, b[i].length);
               }
   
               final long end = System.nanoTime();
   
               // Prevent the JVM from optimizing away the reads
               DUMMY = Arrays.stream(b).mapToInt(Arrays::hashCode).sum();
   
               latencies[iter & 1].add((end - start) / 1024);
             }
   
             latencies[0].sort(null);
             latencies[1].sort(null);
   
             System.out.println("Data " + (dataFitsInCache ? "fits" : "does not 
fit") + " in the page cache");
             long prefetchP50 = latencies[0].get(latencies[0].size() / 2);
             long prefetchP90 = latencies[0].get(latencies[0].size() * 9 / 10);
             long noPrefetchP50 = latencies[1].get(latencies[0].size() / 2);
             long noPrefetchP90 = latencies[1].get(latencies[0].size() * 9 / 
10);
   
             System.out.println("  With prefetching:    P50=" + prefetchP50 + 
"us P90=" + prefetchP90 + "us");
             System.out.println("  Without prefetching: P50=" + noPrefetchP50 + 
"us P90=" + noPrefetchP90 + "us");
           }
         }
       }
     }
   
   }
   ```
   
   </details>
   
   It assumes 3 terms that need to read 16 bytes each from the terms 
dictionary. We compare the time it takes to read these 16 bytes using today's 
sequential approach vs. taking advantage of the new `prefetch()` API, both in 
the case when the terms dictionary fits in the page cache, and when it doesn't. 
The benchmark prints the following on my machine:
   
   ```
   Data does not fit in the page cache
     With prefetching:    P50=103us P90=144us
     Without prefetching: P50=226us P90=247us
   Data fits in the page cache
     With prefetching:    P50=14us P90=83us
     Without prefetching: P50=3us P90=73us
   ```
   
   Using `prefetch()` adds a bit of overhead (10us) to each lookup in the 
best-case scenario when data fits in the page cache, but saves a non-negligible 
amount of time (100us) when data does not fit in the page cache. Note that 
these numbers would need to be multiplied by the number of segments to search. 
Also, this tests with 3 terms while some queries may have much higher numbers 
of terms. I'd need to test what happens with higher query concurrency, but this 
looks promising to me.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Reply via email to