Re: [PR] Add a Faiss codec for KNN searches [lucene]

via GitHub Thu, 01 May 2025 05:33:05 -0700


mikemccand commented on PR #14178:
URL: https://github.com/apache/lucene/pull/14178#issuecomment-2844751984


   I was able to test this PR, yay!
   
   I first installed `faiss-cpu` via the Anaconda `pytorch` channel into my dev 
box, then applied the patch diff from this PR, then tweaked luceneutil with 
this hackity patch:
   
   ```
   diff --git a/src/main/knn/KnnGraphTester.java 
b/src/main/knn/KnnGraphTester.java
   index 327b826c..441e80bb 100644
   --- a/src/main/knn/KnnGraphTester.java
   +++ b/src/main/knn/KnnGraphTester.java
   @@ -46,6 +46,9 @@ import java.util.concurrent.ForkJoinPool;
    import java.util.concurrent.TimeUnit;
    import java.util.function.BinaryOperator;
    
   +// nocommit
   +import org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat;
   +
    import org.apache.lucene.codecs.Codec;
    import org.apache.lucene.codecs.KnnVectorsFormat;
    import org.apache.lucene.codecs.KnnVectorsReader;
   @@ -531,6 +534,9 @@ public class KnnGraphTester {
        }
      }
    
   +  private void computeIndexStatistics(Path indexPath) throws IOException {
   +  }
   +  
      private void printIndexStatistics(Path indexPath) throws IOException {
        try (Directory dir = FSDirectory.open(indexPath);
             IndexReader reader = DirectoryReader.open(dir)) {
   @@ -711,14 +717,12 @@ public class KnnGraphTester {
            HnswGraph knnValues;
            if (vectorsReader instanceof Lucene99HnswVectorsReader 
hnswVectorsReader) {
              knnValues = hnswVectorsReader.getGraph(KNN_FIELD);
   -        } else {
   -          throw new IllegalStateException("unsupported vectors reader: " + 
vectorsReader);
   -        }
    
   -        log("Leaf %d has %d layers\n", context.ord, knnValues.numLevels());
   -        log("Leaf %d has %d documents\n", context.ord, leafReader.maxDoc());
   -        printGraphFanout(knnValues, leafReader.maxDoc());
   -        printGraphConnectedNess(knnValues);
   +          log("Leaf %d has %d layers\n", context.ord, 
knnValues.numLevels());
   +          log("Leaf %d has %d documents\n", context.ord, 
leafReader.maxDoc());
   +          printGraphFanout(knnValues, leafReader.maxDoc());
   +          printGraphConnectedNess(knnValues);
   +        }
          }
        }
      }
   @@ -1260,6 +1264,8 @@ public class KnnGraphTester {
          return new Lucene103Codec() {
            @Override
            public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
   +          // nocommit
   +          /*
              if (quantize) {
                if (quantizeBits == 1) {
                  return switch (indexType) {
   @@ -1272,12 +1278,16 @@ public class KnnGraphTester {
              } else {
                return new Lucene99HnswVectorsFormat(maxConn, beamWidth, 
numMergeWorker, null);
              }
   +          */
   +          return new FaissKnnVectorsFormat("IDMap,HNSW32", 
"efConstruction=200,efSearch=150");
            }
          };
        } else {
          return new Lucene103Codec() {
            @Override
            public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
   +          // nocommit
   +          /*
              if (quantize) {
                if (quantizeBits == 1) {
                  return switch (indexType) {
   @@ -1290,6 +1300,8 @@ public class KnnGraphTester {
              } else {
                return new Lucene99HnswVectorsFormat(maxConn, beamWidth, 
numMergeWorker, exec);
              }
   +          */
   +          return new FaissKnnVectorsFormat("IDMap,HNSW32", 
"efConstruction=200,efSearch=150");
            }
          };
        }
   diff --git a/src/python/benchUtil.py b/src/python/benchUtil.py
   index 45949cde..e105cbf7 100644
   --- a/src/python/benchUtil.py
   +++ b/src/python/benchUtil.py
   @@ -1699,6 +1699,10 @@ def getAntClassPath(checkout):
        raise RuntimeError("can't find core JAR file in %s" % 
("%s/lucene/build/core" % path))
    
      cp.append(core_jar_file)
   +
   +  # nocommit -- for Faiss Codec wrapper
   +  cp.append('/home/mike/miniforge3/lib'
   +            )
      cp.append("%s/lucene/build/sandbox/classes/java" % path)
      cp.append("%s/lucene/build/misc/classes/java" % path)
      cp.append("%s/lucene/build/facet/classes/java" % path)
   diff --git a/src/python/knnPerfTest.py b/src/python/knnPerfTest.py
   index 1e971aee..03d17125 100644
   --- a/src/python/knnPerfTest.py
   +++ b/src/python/knnPerfTest.py
   @@ -82,7 +82,7 @@ PARAMS = {
      "quantizeCompress": (True,),
      # "indexType": ("flat", "hnsw"), # index type, only works with singlt bit
      "queryStartIndex": (0,),  # seek to this start vector before searching, 
to sample different vectors
   -  # "forceMerge": (True, False),
   +  #"forceMerge": (True,),
      #'niter': (10,),
    }
    
   @@ -132,6 +132,10 @@ def run_knn_benchmark(checkout, values):
      jfr_output = f"{constants.LOGS_DIR}/knn-perf-test.jfr"
    
      cp = benchUtil.classPathToString(benchUtil.getClassPath(checkout) + 
(f"{constants.BENCH_BASE_DIR}/build",))
   +
   +  # nocommit -- must use JAR so SPI can find the codec:
   +  cp = 
'/l/faiss/lucene/sandbox/build/libs/lucene-sandbox-11.0.0-SNAPSHOT.jar:' + cp
   +  
      cmd = constants.JAVA_EXE.split(" ") + [
        "-cp",
        cp,
   @@ -143,6 +147,9 @@ def run_knn_benchmark(checkout, values):
        "-XX:+DebugNonSafepoints",
      ]
    
   +  # nocommit
   +  
cmd.append('-Djava.library.path=/home/mike/miniforge3/envs/faiss_lucene_codec/lib:/home/mike/miniforge3/lib/:/usr/java/packages/lib:/usr/lib64:/lib64:/lib:/usr/lib')
   +
      if DO_PROFILING:
        cmd += 
[f"-XX:StartFlightRecording=dumponexit=true,maxsize=250M,settings={constants.BENCH_BASE_DIR}/src/python/profiling.jfc"
 + f",filename={jfr_output}"]
    
   @@ -194,7 +201,7 @@ def run_knn_benchmark(checkout, values):
            str(dim),
            "-docs",
            doc_vectors,
   -        "-reindex",
   +        #"-reindex",
            "-search-and-stats",
            query_vectors,
            "-numIndexThreads",
   ```
   
   And then I was able to do a quick smoke test with Faiss, yay!
   
   With a single-segment index (this is `Cohere/wikipedia-22-12-en-embeddings`, 
768-dim vectors):
   
   ```
   Results:
   recall  latency(ms)    nDoc  topK  fanout  maxConn  beamWidth  quantized  
index(s)  index_docs/s  force_merge(s)  num_segments  index_size(MB)  
vec_disk(MB)  vec_RAM(MB)  indexType
    0.887        0.849  500000   100      50       64        250         no    
265.79       1881.21          150.02             1         3065.21      
1464.844     1464.844       HNSW
   ```
   
   and with a multi-segment (11 segments) index:
   
   ```
   Results:
   recall  latency(ms)    nDoc  topK  fanout  maxConn  beamWidth  quantized  
index(s)  index_docs/s  num_segments  index_size(MB)  vec_disk(MB)  vec_RAM(MB) 
 indexType
    0.970        6.506  500000   100      50       64        250         no    
231.42       2160.54            11         3065.23      1464.844     1464.844   
    HNSW
   ```
   
   This is with `new FaissKnnVectorsFormat("IDMap,HNSW32", 
"efConstruction=200,efSearch=150");` -- meaning (translating to the Lucene HNSW 
world), I think: `maxConn=32`, `beamWidth=200`, `topK+fanout=150`, so 
`topK=100` and `fanout=50`.
   
   It's also curious how much slower the 11-segment case is (6.5 vs 0.9 msec) -- 
maybe the search is not concurrent across segments?
   
   Also, it's annoying that recall always gets better with multiple segments 
(the same is true of Lucene HNSW, I think?) -- this is a leaky abstraction.  
Ideally, changes to the underlying segment geometry should not have such a big 
impact on the returned hits.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Re: [PR] Add a Faiss codec for KNN searches [lucene]

Reply via email to