mikemccand commented on PR #14178: URL: https://github.com/apache/lucene/pull/14178#issuecomment-2844751984
I was able to test this PR, yay! I first installed `faiss-cpu` via the Anaconda `pytorch` channel into my dev box, then applied the patch diff from this PR, then tweaked luceneutil with this hackity patch: ``` diff --git a/src/main/knn/KnnGraphTester.java b/src/main/knn/KnnGraphTester.java index 327b826c..441e80bb 100644 --- a/src/main/knn/KnnGraphTester.java +++ b/src/main/knn/KnnGraphTester.java @@ -46,6 +46,9 @@ import java.util.concurrent.ForkJoinPool; import java.util.concurrent.TimeUnit; import java.util.function.BinaryOperator; +// nocommit +import org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat; + import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; @@ -531,6 +534,9 @@ public class KnnGraphTester { } } + private void computeIndexStatistics(Path indexPath) throws IOException { + } + private void printIndexStatistics(Path indexPath) throws IOException { try (Directory dir = FSDirectory.open(indexPath); IndexReader reader = DirectoryReader.open(dir)) { @@ -711,14 +717,12 @@ public class KnnGraphTester { HnswGraph knnValues; if (vectorsReader instanceof Lucene99HnswVectorsReader hnswVectorsReader) { knnValues = hnswVectorsReader.getGraph(KNN_FIELD); - } else { - throw new IllegalStateException("unsupported vectors reader: " + vectorsReader); - } - log("Leaf %d has %d layers\n", context.ord, knnValues.numLevels()); - log("Leaf %d has %d documents\n", context.ord, leafReader.maxDoc()); - printGraphFanout(knnValues, leafReader.maxDoc()); - printGraphConnectedNess(knnValues); + log("Leaf %d has %d layers\n", context.ord, knnValues.numLevels()); + log("Leaf %d has %d documents\n", context.ord, leafReader.maxDoc()); + printGraphFanout(knnValues, leafReader.maxDoc()); + printGraphConnectedNess(knnValues); + } } } } @@ -1260,6 +1264,8 @@ public class KnnGraphTester { return new Lucene103Codec() { @Override public KnnVectorsFormat getKnnVectorsFormatForField(String field) 
{ + // nocommit + /* if (quantize) { if (quantizeBits == 1) { return switch (indexType) { @@ -1272,12 +1278,16 @@ public class KnnGraphTester { } else { return new Lucene99HnswVectorsFormat(maxConn, beamWidth, numMergeWorker, null); } + */ + return new FaissKnnVectorsFormat("IDMap,HNSW32", "efConstruction=200,efSearch=150"); } }; } else { return new Lucene103Codec() { @Override public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + // nocommit + /* if (quantize) { if (quantizeBits == 1) { return switch (indexType) { @@ -1290,6 +1300,8 @@ public class KnnGraphTester { } else { return new Lucene99HnswVectorsFormat(maxConn, beamWidth, numMergeWorker, exec); } + */ + return new FaissKnnVectorsFormat("IDMap,HNSW32", "efConstruction=200,efSearch=150"); } }; } diff --git a/src/python/benchUtil.py b/src/python/benchUtil.py index 45949cde..e105cbf7 100644 --- a/src/python/benchUtil.py +++ b/src/python/benchUtil.py @@ -1699,6 +1699,10 @@ def getAntClassPath(checkout): raise RuntimeError("can't find core JAR file in %s" % ("%s/lucene/build/core" % path)) cp.append(core_jar_file) + + # nocommit -- for Faiss Codec wrapper + cp.append('/home/mike/miniforge3/lib' + ) cp.append("%s/lucene/build/sandbox/classes/java" % path) cp.append("%s/lucene/build/misc/classes/java" % path) cp.append("%s/lucene/build/facet/classes/java" % path) diff --git a/src/python/knnPerfTest.py b/src/python/knnPerfTest.py index 1e971aee..03d17125 100644 --- a/src/python/knnPerfTest.py +++ b/src/python/knnPerfTest.py @@ -82,7 +82,7 @@ PARAMS = { "quantizeCompress": (True,), # "indexType": ("flat", "hnsw"), # index type, only works with singlt bit "queryStartIndex": (0,), # seek to this start vector before searching, to sample different vectors - # "forceMerge": (True, False), + #"forceMerge": (True,), #'niter': (10,), } @@ -132,6 +132,10 @@ def run_knn_benchmark(checkout, values): jfr_output = f"{constants.LOGS_DIR}/knn-perf-test.jfr" cp = 
benchUtil.classPathToString(benchUtil.getClassPath(checkout) + (f"{constants.BENCH_BASE_DIR}/build",)) + + # nocommit -- must use JAR so SPI can find the codec: + cp = '/l/faiss/lucene/sandbox/build/libs/lucene-sandbox-11.0.0-SNAPSHOT.jar:' + cp + cmd = constants.JAVA_EXE.split(" ") + [ "-cp", cp, @@ -143,6 +147,9 @@ def run_knn_benchmark(checkout, values): "-XX:+DebugNonSafepoints", ] + # nocommit + cmd.append('-Djava.library.path=/home/mike/miniforge3/envs/faiss_lucene_codec/lib:/home/mike/miniforge3/lib/:/usr/java/packages/lib:/usr/lib64:/lib64:/lib:/usr/lib') + if DO_PROFILING: cmd += [f"-XX:StartFlightRecording=dumponexit=true,maxsize=250M,settings={constants.BENCH_BASE_DIR}/src/python/profiling.jfc" + f",filename={jfr_output}"] @@ -194,7 +201,7 @@ def run_knn_benchmark(checkout, values): str(dim), "-docs", doc_vectors, - "-reindex", + #"-reindex", "-search-and-stats", query_vectors, "-numIndexThreads", ``` And then I was able to do a quick smoke test with Faiss, yay! With single-segment index (this is `Cohere/wikipedia-22-12-en-embeddings` 768 dim vectors): ``` Results: recall latency(ms) nDoc topK fanout maxConn beamWidth quantized index(s) index_docs/s force_merge(s) num_segments index_size(MB) vec_disk(MB) vec_RAM(MB) indexType 0.887 0.849 500000 100 50 64 250 no 265.79 1881.21 150.02 1 3065.21 1464.844 1464.844 HNSW ``` and multi (11) segment index: ``` Results: recall latency(ms) nDoc topK fanout maxConn beamWidth quantized index(s) index_docs/s num_segments index_size(MB) vec_disk(MB) vec_RAM(MB) indexType 0.970 6.506 500000 100 50 64 250 no 231.42 2160.54 11 3065.23 1464.844 1464.844 HNSW ``` This is with `new FaissKnnVectorsFormat("IDMap,HNSW32", "efConstruction=200,efSearch=150");` -- meaning (translating to Lucene HNSW world) I think: `maxConn=32`, `beamWidth=200`, `topK+fanout=150`, so, `topK=100` and `fanout=50`. It's also curious how much slower the 11 segment case is (6.5 vs .9 msec) -- maybe the search is not concurrent across segments? 
Also, it's annoying that recall always gets better with multiple segments (the same is true of Lucene HNSW, I think?) -- this is a leaky abstraction. Ideally, a change to the underlying segment geometry should not have such a big impact on the returned hits. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org