Re: [PR] Add a Faiss codec for KNN searches [lucene]

via GitHub Tue, 27 May 2025 10:28:53 -0700


kaivalnp commented on code in PR #14178:
URL: https://github.com/apache/lucene/pull/14178#discussion_r2109760361



##########
lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/faiss/FaissKnnVectorsWriter.java:
##########
@@ -0,0 +1,240 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.sandbox.codecs.faiss;
+
+import static 
org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.DATA_CODEC_NAME;
+import static 
org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.DATA_EXTENSION;
+import static 
org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.META_CODEC_NAME;
+import static 
org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.META_EXTENSION;
+import static 
org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat.VERSION_CURRENT;
+import static org.apache.lucene.sandbox.codecs.faiss.LibFaissC.createIndex;
+import static org.apache.lucene.sandbox.codecs.faiss.LibFaissC.indexWrite;
+
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.lang.foreign.MemorySegment;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.KnnFieldVectorsWriter;
+import org.apache.lucene.codecs.KnnVectorsWriter;
+import org.apache.lucene.codecs.hnsw.FlatFieldVectorsWriter;
+import org.apache.lucene.codecs.hnsw.FlatVectorsWriter;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FloatVectorValues;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.MergeState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Sorter;
+import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.hnsw.IntToIntFunction;
+
+/**
+ * Write per-segment Faiss indexes and associated metadata.
+ *
+ * @lucene.experimental
+ */
+final class FaissKnnVectorsWriter extends KnnVectorsWriter {
+  private final String description, indexParams; // set from the constructor arguments
+  private final FlatVectorsWriter rawVectorsWriter; // delegate used by addField / mergeOneField
+  private final IndexOutput meta, data; // opened in the constructor through openOutput
+  private final Map<FieldInfo, FlatFieldVectorsWriter<?>> rawFields; // filled by addField
+  private boolean closed, finished; // both start out false
+
+  /**
+   * Creates a writer that records the given settings and raw-vector delegate, then opens the
+   * metadata and data outputs for the segment.
+   *
+   * <p>If either output cannot be opened, this instance is passed to {@link
+   * IOUtils#closeWhileHandlingException} before the original exception propagates.
+   *
+   * @param description stored on the writer
+   * @param indexParams stored on the writer
+   * @param state segment write state used to name and create the output files
+   * @param rawVectorsWriter flat vectors writer that field registration and merging delegate to
+   * @throws IOException if an output cannot be created or its header cannot be written
+   */
+  public FaissKnnVectorsWriter(
+      String description,
+      String indexParams,
+      SegmentWriteState state,
+      FlatVectorsWriter rawVectorsWriter)
+      throws IOException {
+
+    this.rawVectorsWriter = rawVectorsWriter;
+    this.indexParams = indexParams;
+    this.description = description;
+    this.rawFields = new HashMap<>();
+    this.finished = false;
+    this.closed = false;
+
+    boolean success = false;
+    try {
+      this.meta = openOutput(state, META_EXTENSION, META_CODEC_NAME);
+      this.data = openOutput(state, DATA_EXTENSION, DATA_CODEC_NAME);
+      success = true;
+    } finally {
+      if (success == false) {
+        // Release whatever was opened so far without masking the exception in flight.
+        IOUtils.closeWhileHandlingException(this);
+      }
+    }
+  }
+
+  /**
+   * Creates the segment file for the given extension and writes the codec index header to it.
+   *
+   * @param state segment write state supplying the directory, segment name, suffix, id and context
+   * @param extension file extension used to build the segment file name
+   * @param codecName codec name written into the index header
+   * @return the newly created output, with the index header already written
+   * @throws IOException if the file cannot be created or the header cannot be written; in the
+   *     latter case the output is closed before the exception propagates
+   */
+  private IndexOutput openOutput(SegmentWriteState state, String extension, String codecName)
+      throws IOException {
+    String fileName =
+        IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, extension);
+    IndexOutput output = state.directory.createOutput(fileName, state.context);
+    boolean success = false;
+    try {
+      CodecUtil.writeIndexHeader(
+          output, codecName, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
+      success = true;
+      return output;
+    } finally {
+      if (success == false) {
+        // The caller never receives this output when the header write fails, so it has no way
+        // to close it; release it here so the file handle is not leaked.
+        IOUtils.closeWhileHandlingException(output);
+      }
+    }
+  }
+
+  /**
+   * Merges one vector field. The raw vectors are merged through the raw vectors writer first;
+   * float fields are then written from the merged float vector values.
+   *
+   * @param fieldInfo the field being merged
+   * @param mergeState state of the merge in progress
+   * @throws UnsupportedOperationException if the field uses byte vectors
+   * @throws IOException if merging or writing fails
+   */
+  @Override
+  public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException {
+    rawVectorsWriter.mergeOneField(fieldInfo, mergeState);
+    switch (fieldInfo.getVectorEncoding()) {
+      case BYTE ->
+          // TODO: Support using SQ8 quantization, see:
+          //  - https://github.com/opensearch-project/k-NN/pull/2425
+          throw new UnsupportedOperationException("Byte vectors not supported");
+      case FLOAT32 -> {
+        FloatVectorValues merged =
+            KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, mergeState);
+        // The third argument is the identity function (doc -> doc).
+        writeFloatField(fieldInfo, merged, doc -> doc);
+      }
+    }
+  }
+
+  /**
+   * Registers the field with the raw vectors writer and remembers the per-field writer it
+   * returns.
+   *
+   * @param fieldInfo the field to add
+   * @return the raw per-field writer, which is also recorded in {@code rawFields}
+   * @throws IOException if the raw vectors writer fails to add the field
+   */
+  @Override
+  public KnnFieldVectorsWriter<?> addField(FieldInfo fieldInfo) throws IOException {
+    FlatFieldVectorsWriter<?> delegate = rawVectorsWriter.addField(fieldInfo);
+    rawFields.put(fieldInfo, delegate);
+    return delegate;
+  }
+
+  @Override
+  public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException {

Review Comment:
   > Does Faiss also not need the full precision vectors at search time when 
using quantization? Does it have a rerank phase where it uses the original 
vectors? Is that phase optional?
   
   Faiss' quantized indexes do not store the full vectors, but the index factory 
allows a two-phase search in which a second index is used for reranking (a flat 
index can be specified there, to use the original vectors) -- so this is all 
configurable by the user.
   
   Faiss itself stores only what the index minimally requires, but 
we need to store an additional copy in Lucene (via a raw vector reader / 
writer) for the reasons listed above.
   
   > when we finally find a clean way to not replicate full precision vectors 
to searchers
   
   +1 -- it would be very similar to whatever solution is used for Lucene's 
default format (both of them use a raw vector reader / writer internally)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Re: [PR] Add a Faiss codec for KNN searches [lucene]

Reply via email to