benwtrent commented on code in PR #12582:
URL: https://github.com/apache/lucene/pull/12582#discussion_r1344513453


##########
lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsWriter.java:
##########
@@ -0,0 +1,1170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.codecs.lucene99;
+
+import static 
org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
+import static 
org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.calculateDefaultQuantile;
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.HnswGraphProvider;
+import org.apache.lucene.codecs.KnnFieldVectorsWriter;
+import org.apache.lucene.codecs.KnnVectorsReader;
+import org.apache.lucene.codecs.KnnVectorsWriter;
+import org.apache.lucene.codecs.lucene90.IndexedDISI;
+import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
+import org.apache.lucene.index.ByteVectorValues;
+import org.apache.lucene.index.DocsWithFieldSet;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FloatVectorValues;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.MergeState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Sorter;
+import org.apache.lucene.index.VectorEncoding;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.InfoStream;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.hnsw.HnswGraph;
+import org.apache.lucene.util.hnsw.HnswGraph.NodesIterator;
+import org.apache.lucene.util.hnsw.HnswGraphBuilder;
+import org.apache.lucene.util.hnsw.NeighborArray;
+import org.apache.lucene.util.hnsw.OnHeapHnswGraph;
+import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
+import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier;
+import org.apache.lucene.util.packed.DirectMonotonicWriter;
+
+/**
+ * Writes vector values and knn graphs to index segments.
+ *
+ * @lucene.experimental
+ */
+public final class Lucene99HnswVectorsWriter extends KnnVectorsWriter {
+
+  private final SegmentWriteState segmentWriteState;
+  private final IndexOutput meta, vectorData, quantizedVectorData, vectorIndex;
+  private final int M;
+  private final int beamWidth;
+  private final Lucene99ScalarQuantizedVectorsWriter quantizedVectorsWriter;
+
+  private final List<FieldWriter<?>> fields = new ArrayList<>();
+  private boolean finished;
+
+  Lucene99HnswVectorsWriter(
+      SegmentWriteState state,
+      int M,
+      int beamWidth,
+      Lucene99ScalarQuantizedVectorsFormat quantizedVectorsFormat)
+      throws IOException {
+    this.M = M;
+    this.beamWidth = beamWidth;
+    segmentWriteState = state;
+    String metaFileName =
+        IndexFileNames.segmentFileName(
+            state.segmentInfo.name, state.segmentSuffix, 
Lucene99HnswVectorsFormat.META_EXTENSION);
+
+    String vectorDataFileName =
+        IndexFileNames.segmentFileName(
+            state.segmentInfo.name,
+            state.segmentSuffix,
+            Lucene99HnswVectorsFormat.VECTOR_DATA_EXTENSION);
+
+    String indexDataFileName =
+        IndexFileNames.segmentFileName(
+            state.segmentInfo.name,
+            state.segmentSuffix,
+            Lucene99HnswVectorsFormat.VECTOR_INDEX_EXTENSION);
+
+    final String quantizedVectorDataFileName =
+        quantizedVectorsFormat != null
+            ? IndexFileNames.segmentFileName(
+                state.segmentInfo.name,
+                state.segmentSuffix,
+                
Lucene99ScalarQuantizedVectorsFormat.QUANTIZED_VECTOR_DATA_EXTENSION)
+            : null;
+    boolean success = false;
+    try {
+      meta = state.directory.createOutput(metaFileName, state.context);
+      vectorData = state.directory.createOutput(vectorDataFileName, 
state.context);
+      vectorIndex = state.directory.createOutput(indexDataFileName, 
state.context);
+
+      CodecUtil.writeIndexHeader(
+          meta,
+          Lucene99HnswVectorsFormat.META_CODEC_NAME,
+          Lucene99HnswVectorsFormat.VERSION_CURRENT,
+          state.segmentInfo.getId(),
+          state.segmentSuffix);
+      CodecUtil.writeIndexHeader(
+          vectorData,
+          Lucene99HnswVectorsFormat.VECTOR_DATA_CODEC_NAME,
+          Lucene99HnswVectorsFormat.VERSION_CURRENT,
+          state.segmentInfo.getId(),
+          state.segmentSuffix);
+      CodecUtil.writeIndexHeader(
+          vectorIndex,
+          Lucene99HnswVectorsFormat.VECTOR_INDEX_CODEC_NAME,
+          Lucene99HnswVectorsFormat.VERSION_CURRENT,
+          state.segmentInfo.getId(),
+          state.segmentSuffix);
+      if (quantizedVectorDataFileName != null) {
+        quantizedVectorData =
+            state.directory.createOutput(quantizedVectorDataFileName, 
state.context);
+        CodecUtil.writeIndexHeader(
+            quantizedVectorData,
+            
Lucene99ScalarQuantizedVectorsFormat.QUANTIZED_VECTOR_DATA_CODEC_NAME,
+            Lucene99ScalarQuantizedVectorsFormat.VERSION_CURRENT,
+            state.segmentInfo.getId(),
+            state.segmentSuffix);
+        quantizedVectorsWriter =
+            new Lucene99ScalarQuantizedVectorsWriter(
+                quantizedVectorData, quantizedVectorsFormat.quantile);
+      } else {
+        quantizedVectorData = null;
+        quantizedVectorsWriter = null;
+      }
+      success = true;
+    } finally {
+      if (success == false) {
+        IOUtils.closeWhileHandlingException(this);
+      }
+    }
+  }
+
+  @Override
+  public KnnFieldVectorsWriter<?> addField(FieldInfo fieldInfo) throws 
IOException {
+    Lucene99ScalarQuantizedVectorsWriter.QuantizationVectorWriter 
quantizedVectorFieldWriter = null;
+    // Quantization only supports FLOAT32 for now
+    if (quantizedVectorsWriter != null
+        && fieldInfo.getVectorEncoding().equals(VectorEncoding.FLOAT32)) {
+      quantizedVectorFieldWriter = quantizedVectorsWriter.addField(fieldInfo);
+    }
+    FieldWriter<?> newField =
+        FieldWriter.create(
+            fieldInfo, M, beamWidth, segmentWriteState.infoStream, 
quantizedVectorFieldWriter);
+    fields.add(newField);
+    return newField;
+  }
+
+  @Override
+  public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException {
+    for (FieldWriter<?> field : fields) {
+      long[] quantizedVectorOffsetAndLen = null;
+      if (field.quantizedWriter != null) {
+        assert quantizedVectorsWriter != null;
+        quantizedVectorOffsetAndLen =
+            quantizedVectorsWriter.flush(sortMap, field.quantizedWriter, 
field.docsWithField);
+      }
+      if (sortMap == null) {
+        writeField(field, maxDoc, quantizedVectorOffsetAndLen);
+      } else {
+        writeSortingField(field, maxDoc, sortMap, quantizedVectorOffsetAndLen);
+      }
+    }
+  }
+
+  @Override
+  public void finish() throws IOException {
+    if (finished) {
+      throw new IllegalStateException("already finished");
+    }
+    finished = true;
+    if (quantizedVectorsWriter != null) {
+      quantizedVectorsWriter.finish();
+    }
+
+    if (meta != null) {
+      // write end of fields marker
+      meta.writeInt(-1);
+      CodecUtil.writeFooter(meta);
+    }
+    if (vectorData != null) {
+      CodecUtil.writeFooter(vectorData);
+      CodecUtil.writeFooter(vectorIndex);
+    }
+  }
+
+  @Override
+  public long ramBytesUsed() {
+    long total = 0;
+    for (FieldWriter<?> field : fields) {
+      total += field.ramBytesUsed();
+    }
+    return total;
+  }
+
+  private void writeField(FieldWriter<?> fieldData, int maxDoc, long[] 
quantizedVecOffsetAndLen)
+      throws IOException {
+    // write vector values
+    long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
+    switch (fieldData.fieldInfo.getVectorEncoding()) {
+      case BYTE -> writeByteVectors(fieldData);
+      case FLOAT32 -> writeFloat32Vectors(fieldData);
+    }
+    long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
+
+    // write graph
+    long vectorIndexOffset = vectorIndex.getFilePointer();
+    OnHeapHnswGraph graph = fieldData.getGraph();
+    int[][] graphLevelNodeOffsets = writeGraph(graph);
+    long vectorIndexLength = vectorIndex.getFilePointer() - vectorIndexOffset;
+
+    writeMeta(
+        fieldData.isQuantized(),
+        fieldData.fieldInfo,
+        maxDoc,
+        fieldData.getConfiguredQuantile(),
+        fieldData.getMinQuantile(),
+        fieldData.getMaxQuantile(),
+        quantizedVecOffsetAndLen,
+        vectorDataOffset,
+        vectorDataLength,
+        vectorIndexOffset,
+        vectorIndexLength,
+        fieldData.docsWithField,
+        graph,
+        graphLevelNodeOffsets);
+  }
+
+  private void writeFloat32Vectors(FieldWriter<?> fieldData) throws 
IOException {
+    final ByteBuffer buffer =
+        ByteBuffer.allocate(fieldData.dim * 
Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
+    for (Object v : fieldData.vectors) {
+      buffer.asFloatBuffer().put((float[]) v);
+      vectorData.writeBytes(buffer.array(), buffer.array().length);
+    }
+  }
+
+  private void writeByteVectors(FieldWriter<?> fieldData) throws IOException {
+    for (Object v : fieldData.vectors) {
+      byte[] vector = (byte[]) v;
+      vectorData.writeBytes(vector, vector.length);
+    }
+  }
+
+  private void writeSortingField(
+      FieldWriter<?> fieldData,
+      int maxDoc,
+      Sorter.DocMap sortMap,
+      long[] quantizedVectorOffsetAndLen)
+      throws IOException {
+    final int[] docIdOffsets = new int[sortMap.size()];
+    int offset = 1; // 0 means no vector for this (field, document)
+    DocIdSetIterator iterator = fieldData.docsWithField.iterator();
+    for (int docID = iterator.nextDoc();
+        docID != DocIdSetIterator.NO_MORE_DOCS;
+        docID = iterator.nextDoc()) {
+      int newDocID = sortMap.oldToNew(docID);
+      docIdOffsets[newDocID] = offset++;
+    }
+    DocsWithFieldSet newDocsWithField = new DocsWithFieldSet();
+    final int[] ordMap = new int[offset - 1]; // new ord to old ord
+    final int[] oldOrdMap = new int[offset - 1]; // old ord to new ord
+    int ord = 0;
+    int doc = 0;
+    for (int docIdOffset : docIdOffsets) {
+      if (docIdOffset != 0) {
+        ordMap[ord] = docIdOffset - 1;
+        oldOrdMap[docIdOffset - 1] = ord;
+        newDocsWithField.add(doc);
+        ord++;
+      }
+      doc++;
+    }
+
+    // write vector values
+    long vectorDataOffset =
+        switch (fieldData.fieldInfo.getVectorEncoding()) {
+          case BYTE -> writeSortedByteVectors(fieldData, ordMap);
+          case FLOAT32 -> writeSortedFloat32Vectors(fieldData, ordMap);
+        };
+    long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
+
+    // write graph
+    long vectorIndexOffset = vectorIndex.getFilePointer();
+    OnHeapHnswGraph graph = fieldData.getGraph();
+    int[][] graphLevelNodeOffsets = graph == null ? new int[0][] : new 
int[graph.numLevels()][];
+    HnswGraph mockGraph = reconstructAndWriteGraph(graph, ordMap, oldOrdMap, 
graphLevelNodeOffsets);
+    long vectorIndexLength = vectorIndex.getFilePointer() - vectorIndexOffset;
+
+    writeMeta(
+        fieldData.isQuantized(),
+        fieldData.fieldInfo,
+        maxDoc,
+        fieldData.getConfiguredQuantile(),
+        fieldData.getMinQuantile(),
+        fieldData.getMaxQuantile(),
+        quantizedVectorOffsetAndLen,
+        vectorDataOffset,
+        vectorDataLength,
+        vectorIndexOffset,
+        vectorIndexLength,
+        newDocsWithField,
+        mockGraph,
+        graphLevelNodeOffsets);
+  }
+
+  private long writeSortedFloat32Vectors(FieldWriter<?> fieldData, int[] 
ordMap)
+      throws IOException {
+    long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
+    final ByteBuffer buffer =
+        ByteBuffer.allocate(fieldData.dim * 
Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
+    for (int ordinal : ordMap) {
+      float[] vector = (float[]) fieldData.vectors.get(ordinal);
+      buffer.asFloatBuffer().put(vector);
+      vectorData.writeBytes(buffer.array(), buffer.array().length);
+    }
+    return vectorDataOffset;
+  }
+
+  private long writeSortedByteVectors(FieldWriter<?> fieldData, int[] ordMap) 
throws IOException {
+    long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
+    for (int ordinal : ordMap) {
+      byte[] vector = (byte[]) fieldData.vectors.get(ordinal);
+      vectorData.writeBytes(vector, vector.length);
+    }
+    return vectorDataOffset;
+  }
+
+  /**
+   * Reconstructs the graph given the old and new node ids.
+   *
+   * <p>Additionally, the graph node connections are written to the 
vectorIndex.
+   *
+   * @param graph The current on heap graph
+   * @param newToOldMap the new node ids indexed to the old node ids
+   * @param oldToNewMap the old node ids indexed to the new node ids
+   * @param levelNodeOffsets where to place the new offsets for the nodes in 
the vector index.
+   * @return The graph
+   * @throws IOException if writing to vectorIndex fails
+   */
+  private HnswGraph reconstructAndWriteGraph(
+      OnHeapHnswGraph graph, int[] newToOldMap, int[] oldToNewMap, int[][] 
levelNodeOffsets)
+      throws IOException {
+    if (graph == null) return null;
+
+    List<int[]> nodesByLevel = new ArrayList<>(graph.numLevels());
+    nodesByLevel.add(null);
+
+    int maxOrd = graph.size();
+    NodesIterator nodesOnLevel0 = graph.getNodesOnLevel(0);
+    levelNodeOffsets[0] = new int[nodesOnLevel0.size()];
+    while (nodesOnLevel0.hasNext()) {
+      int node = nodesOnLevel0.nextInt();
+      NeighborArray neighbors = graph.getNeighbors(0, newToOldMap[node]);
+      long offset = vectorIndex.getFilePointer();
+      reconstructAndWriteNeigbours(neighbors, oldToNewMap, maxOrd);
+      levelNodeOffsets[0][node] = Math.toIntExact(vectorIndex.getFilePointer() 
- offset);
+    }
+
+    for (int level = 1; level < graph.numLevels(); level++) {
+      NodesIterator nodesOnLevel = graph.getNodesOnLevel(level);
+      int[] newNodes = new int[nodesOnLevel.size()];
+      for (int n = 0; nodesOnLevel.hasNext(); n++) {
+        newNodes[n] = oldToNewMap[nodesOnLevel.nextInt()];
+      }
+      Arrays.sort(newNodes);
+      nodesByLevel.add(newNodes);
+      levelNodeOffsets[level] = new int[newNodes.length];
+      int nodeOffsetIndex = 0;
+      for (int node : newNodes) {
+        NeighborArray neighbors = graph.getNeighbors(level, newToOldMap[node]);
+        long offset = vectorIndex.getFilePointer();
+        reconstructAndWriteNeigbours(neighbors, oldToNewMap, maxOrd);
+        levelNodeOffsets[level][nodeOffsetIndex++] =
+            Math.toIntExact(vectorIndex.getFilePointer() - offset);
+      }
+    }
+    return new HnswGraph() {
+      @Override
+      public int nextNeighbor() {
+        throw new UnsupportedOperationException("Not supported on a mock 
graph");
+      }
+
+      @Override
+      public void seek(int level, int target) {
+        throw new UnsupportedOperationException("Not supported on a mock 
graph");
+      }
+
+      @Override
+      public int size() {
+        return graph.size();
+      }
+
+      @Override
+      public int numLevels() {
+        return graph.numLevels();
+      }
+
+      @Override
+      public int entryNode() {
+        throw new UnsupportedOperationException("Not supported on a mock 
graph");
+      }
+
+      @Override
+      public NodesIterator getNodesOnLevel(int level) {
+        if (level == 0) {
+          return graph.getNodesOnLevel(0);
+        } else {
+          return new ArrayNodesIterator(nodesByLevel.get(level), 
nodesByLevel.get(level).length);
+        }
+      }
+    };
+  }
+
+  private void reconstructAndWriteNeigbours(NeighborArray neighbors, int[] 
oldToNewMap, int maxOrd)
+      throws IOException {
+    int size = neighbors.size();
+    vectorIndex.writeVInt(size);
+
+    // Destructively modify; it's ok we are discarding it after this
+    int[] nnodes = neighbors.node();
+    for (int i = 0; i < size; i++) {
+      nnodes[i] = oldToNewMap[nnodes[i]];
+    }
+    Arrays.sort(nnodes, 0, size);
+    // Now that we have sorted, do delta encoding to minimize the required 
bits to store the
+    // information
+    for (int i = size - 1; i > 0; --i) {
+      assert nnodes[i] < maxOrd : "node too large: " + nnodes[i] + ">=" + 
maxOrd;
+      nnodes[i] -= nnodes[i - 1];
+    }
+    for (int i = 0; i < size; i++) {
+      vectorIndex.writeVInt(nnodes[i]);
+    }
+  }
+
+  @Override
+  public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws 
IOException {
+    long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
+    IndexOutput tempVectorData =
+        segmentWriteState.directory.createTempOutput(
+            vectorData.getName(), "temp", segmentWriteState.context);
+    IndexInput vectorDataInput = null;
+    IndexInput quantizationDataInput = null;
+    Lucene99ScalarQuantizedVectorsWriter.MergedQuantileState quantizationState 
= null;
+    boolean success = false;
+    try {
+      long[] quantizedVectorDataOffsetAndLength = null;
+      if (quantizedVectorsWriter != null) {
+        quantizationState =
+            quantizedVectorsWriter.mergeOneField(segmentWriteState, fieldInfo, 
mergeState);
+        if (quantizationState != null) {
+          quantizedVectorDataOffsetAndLength = new long[2];
+          quantizedVectorDataOffsetAndLength[0] = 
quantizedVectorData.alignFilePointer(Float.BYTES);
+          quantizationDataInput =
+              segmentWriteState.directory.openInput(
+                  quantizationState.tempVectorFileName, 
segmentWriteState.context);
+          quantizedVectorData.copyBytes(
+              quantizationDataInput, quantizationDataInput.length() - 
CodecUtil.footerLength());
+          quantizedVectorDataOffsetAndLength[1] =
+              quantizedVectorData.getFilePointer() - 
quantizedVectorDataOffsetAndLength[0];
+          CodecUtil.retrieveChecksum(quantizationDataInput);
+        }
+      }
+      // write the vector data to a temporary file

Review Comment:
   updated @jimczi what do you think?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Reply via email to