zhaih commented on code in PR #12657:
URL: https://github.com/apache/lucene/pull/12657#discussion_r1361341835


##########
lucene/core/src/java/org/apache/lucene/util/hnsw/IncrementalHnswGraphMerger.java:
##########
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.hnsw;
+
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
+
+import java.io.IOException;
+import java.util.Map;
+import org.apache.lucene.codecs.HnswGraphProvider;
+import org.apache.lucene.codecs.KnnVectorsReader;
+import org.apache.lucene.codecs.KnnVectorsWriter;
+import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
+import org.apache.lucene.index.ByteVectorValues;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FloatVectorValues;
+import org.apache.lucene.index.MergeState;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.BitSet;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.CollectionUtil;
+import org.apache.lucene.util.FixedBitSet;
+
+/**
+ * This selects the biggest Hnsw graph from the provided merge state and 
initializes a new
+ * HnswGraphBuilder with that graph as a starting point.
+ *
+ * @lucene.experimental
+ */
+public class IncrementalHnswGraphMerger {
+
+  private KnnVectorsReader initReader;
+  private MergeState.DocMap initDocMap;
+  private int initGraphSize;
+  private final FieldInfo fieldInfo;
+  private final RandomVectorScorerSupplier scorerSupplier;
+  private final int M;
+  private final int beamWidth;
+
+  /**
+   * @param fieldInfo FieldInfo for the field being merged
+   */
+  public IncrementalHnswGraphMerger(
+      FieldInfo fieldInfo, RandomVectorScorerSupplier scorerSupplier, int M, 
int beamWidth) {
+    this.fieldInfo = fieldInfo;
+    this.scorerSupplier = scorerSupplier;
+    this.M = M;
+    this.beamWidth = beamWidth;
+  }
+
+  /**
+   * Adds a reader to the graph merger if it meets the following criteria: 1. 
Does not contain any
+   * deleted docs 2. Is a HnswGraphProvider/PerFieldKnnVectorReader 3. Has the 
most docs of any
+   * previous reader that met the above criteria
+   *
+   * @param reader KnnVectorsReader to add to the merger
+   * @param docMap MergeState.DocMap for the reader
+   * @param liveDocs Bits representing live docs, can be null
+   * @return this
+   * @throws IOException If an error occurs while reading from the merge state
+   */
+  public IncrementalHnswGraphMerger addReader(
+      KnnVectorsReader reader, MergeState.DocMap docMap, Bits liveDocs) throws 
IOException {
+    KnnVectorsReader currKnnVectorsReader = reader;
+    if (reader instanceof PerFieldKnnVectorsFormat.FieldsReader 
candidateReader) {
+      currKnnVectorsReader = candidateReader.getFieldReader(fieldInfo.name);
+    }
+
+    if (!(currKnnVectorsReader instanceof HnswGraphProvider) || 
!allMatch(liveDocs)) {
+      return this;
+    }
+
+    int candidateVectorCount = 0;
+    switch (fieldInfo.getVectorEncoding()) {
+      case BYTE -> {
+        ByteVectorValues byteVectorValues =
+            currKnnVectorsReader.getByteVectorValues(fieldInfo.name);
+        if (byteVectorValues == null) {
+          return this;
+        }
+        candidateVectorCount = byteVectorValues.size();
+      }
+      case FLOAT32 -> {
+        FloatVectorValues vectorValues = 
currKnnVectorsReader.getFloatVectorValues(fieldInfo.name);
+        if (vectorValues == null) {
+          return this;
+        }
+        candidateVectorCount = vectorValues.size();
+      }
+    }
+    if (candidateVectorCount > initGraphSize) {
+      initReader = currKnnVectorsReader;
+      initDocMap = docMap;
+      initGraphSize = candidateVectorCount;
+    }
+    return this;
+  }
+
+  /**
+   * Builds a new HnswGraphBuilder using the biggest graph from the merge 
state as a starting point.
+   * If no valid readers were added to the merge state, a new graph is created.
+   *
+   * @param mergeState MergeState for the merge
+   * @return HnswGraphBuilder
+   * @throws IOException If an error occurs while reading from the merge state
+   */
+  public HnswGraphBuilder createBuilder(MergeState mergeState) throws 
IOException {
+    if (initReader == null) {
+      return HnswGraphBuilder.create(scorerSupplier, M, beamWidth, 
HnswGraphBuilder.randSeed);
+    }
+
+    HnswGraph initializerGraph = ((HnswGraphProvider) 
initReader).getGraph(fieldInfo.name);
+    BitSet initializedNodes = new FixedBitSet(mergeState.segmentInfo.maxDoc() 
+ 1);
+    int[] ordBaseline = getNewOrdOffset(mergeState, initializedNodes);
+    return InitializedHnswGraphBuilder.fromGraph(
+        scorerSupplier,
+        M,
+        beamWidth,
+        HnswGraphBuilder.randSeed,
+        initializerGraph,
+        ordBaseline,
+        initializedNodes);
+  }
+
+  private int[] getNewOrdOffset(MergeState mergeState, BitSet 
initializedNodes) throws IOException {
+    DocIdSetIterator initializerIterator = null;
+
+    switch (fieldInfo.getVectorEncoding()) {
+      case BYTE -> initializerIterator = 
initReader.getByteVectorValues(fieldInfo.name);
+      case FLOAT32 -> initializerIterator = 
initReader.getFloatVectorValues(fieldInfo.name);
+    }
+
+    Map<Integer, Integer> newIdToOldOrdinal = 
CollectionUtil.newHashMap(initGraphSize);
+    int oldOrd = 0;
+    int maxNewDocID = -1;
+    for (int oldId = initializerIterator.nextDoc();
+        oldId != NO_MORE_DOCS;
+        oldId = initializerIterator.nextDoc()) {
+      int newId = initDocMap.get(oldId);
+      maxNewDocID = Math.max(newId, maxNewDocID);
+      newIdToOldOrdinal.put(newId, oldOrd);
+      oldOrd++;
+    }
+
+    if (maxNewDocID == -1) {
+      return new int[0];
+    }
+
+    int[] oldToNewOrdinalMap = new int[initGraphSize];
+
+    DocIdSetIterator vectorIterator = null;
+    switch (fieldInfo.getVectorEncoding()) {
+      case BYTE -> vectorIterator =
+          KnnVectorsWriter.MergedVectorValues.mergeByteVectorValues(fieldInfo, 
mergeState);
+      case FLOAT32 -> vectorIterator =
+          
KnnVectorsWriter.MergedVectorValues.mergeFloatVectorValues(fieldInfo, 
mergeState);
+    }
+
+    int newOrd = 0;
+    for (int newDocId = vectorIterator.nextDoc();
+        newDocId <= maxNewDocID;
+        newDocId = vectorIterator.nextDoc()) {
+      if (newIdToOldOrdinal.containsKey(newDocId)) {
+        initializedNodes.set(newOrd);
+        oldToNewOrdinalMap[newIdToOldOrdinal.get(newDocId)] = newOrd;
+      }
+      newOrd++;
+    }
+    return oldToNewOrdinalMap;
+  }
+
+  private static boolean allMatch(Bits bits) {

Review Comment:
   Let's just name it `noDeletion` and the parameter `liveDocs`?



##########
lucene/core/src/java/org/apache/lucene/util/hnsw/IncrementalHnswGraphMerger.java:
##########
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.hnsw;
+
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
+
+import java.io.IOException;
+import java.util.Map;
+import org.apache.lucene.codecs.HnswGraphProvider;
+import org.apache.lucene.codecs.KnnVectorsReader;
+import org.apache.lucene.codecs.KnnVectorsWriter;
+import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
+import org.apache.lucene.index.ByteVectorValues;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FloatVectorValues;
+import org.apache.lucene.index.MergeState;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.BitSet;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.CollectionUtil;
+import org.apache.lucene.util.FixedBitSet;
+
+/**
+ * This selects the biggest Hnsw graph from the provided merge state and 
initializes a new
+ * HnswGraphBuilder with that graph as a starting point.
+ *
+ * @lucene.experimental
+ */
+public class IncrementalHnswGraphMerger {
+
+  private KnnVectorsReader initReader;
+  private MergeState.DocMap initDocMap;
+  private int initGraphSize;
+  private final FieldInfo fieldInfo;
+  private final RandomVectorScorerSupplier scorerSupplier;
+  private final int M;
+  private final int beamWidth;
+
+  /**
+   * @param fieldInfo FieldInfo for the field being merged
+   */
+  public IncrementalHnswGraphMerger(
+      FieldInfo fieldInfo, RandomVectorScorerSupplier scorerSupplier, int M, 
int beamWidth) {
+    this.fieldInfo = fieldInfo;
+    this.scorerSupplier = scorerSupplier;
+    this.M = M;
+    this.beamWidth = beamWidth;
+  }
+
+  /**
+   * Adds a reader to the graph merger if it meets the following criteria: 1. 
Does not contain any
+   * deleted docs 2. Is a HnswGraphProvider/PerFieldKnnVectorReader 3. Has the 
most docs of any
+   * previous reader that met the above criteria
+   *
+   * @param reader KnnVectorsReader to add to the merger
+   * @param docMap MergeState.DocMap for the reader
+   * @param liveDocs Bits representing live docs, can be null
+   * @return this
+   * @throws IOException If an error occurs while reading from the merge state
+   */
+  public IncrementalHnswGraphMerger addReader(
+      KnnVectorsReader reader, MergeState.DocMap docMap, Bits liveDocs) throws 
IOException {
+    KnnVectorsReader currKnnVectorsReader = reader;
+    if (reader instanceof PerFieldKnnVectorsFormat.FieldsReader 
candidateReader) {
+      currKnnVectorsReader = candidateReader.getFieldReader(fieldInfo.name);
+    }
+
+    if (!(currKnnVectorsReader instanceof HnswGraphProvider) || 
!allMatch(liveDocs)) {
+      return this;
+    }
+
+    int candidateVectorCount = 0;
+    switch (fieldInfo.getVectorEncoding()) {
+      case BYTE -> {
+        ByteVectorValues byteVectorValues =
+            currKnnVectorsReader.getByteVectorValues(fieldInfo.name);
+        if (byteVectorValues == null) {
+          return this;
+        }
+        candidateVectorCount = byteVectorValues.size();
+      }
+      case FLOAT32 -> {
+        FloatVectorValues vectorValues = 
currKnnVectorsReader.getFloatVectorValues(fieldInfo.name);
+        if (vectorValues == null) {
+          return this;
+        }
+        candidateVectorCount = vectorValues.size();
+      }
+    }
+    if (candidateVectorCount > initGraphSize) {
+      initReader = currKnnVectorsReader;
+      initDocMap = docMap;
+      initGraphSize = candidateVectorCount;
+    }
+    return this;
+  }
+
+  /**
+   * Builds a new HnswGraphBuilder using the biggest graph from the merge 
state as a starting point.
+   * If no valid readers were added to the merge state, a new graph is created.
+   *
+   * @param mergeState MergeState for the merge
+   * @return HnswGraphBuilder
+   * @throws IOException If an error occurs while reading from the merge state
+   */
+  public HnswGraphBuilder createBuilder(MergeState mergeState) throws 
IOException {
+    if (initReader == null) {
+      return HnswGraphBuilder.create(scorerSupplier, M, beamWidth, 
HnswGraphBuilder.randSeed);
+    }
+
+    HnswGraph initializerGraph = ((HnswGraphProvider) 
initReader).getGraph(fieldInfo.name);
+    BitSet initializedNodes = new FixedBitSet(mergeState.segmentInfo.maxDoc() 
+ 1);
+    int[] ordBaseline = getNewOrdOffset(mergeState, initializedNodes);
+    return InitializedHnswGraphBuilder.fromGraph(
+        scorerSupplier,
+        M,
+        beamWidth,
+        HnswGraphBuilder.randSeed,
+        initializerGraph,
+        ordBaseline,
+        initializedNodes);
+  }
+
+  private int[] getNewOrdOffset(MergeState mergeState, BitSet 
initializedNodes) throws IOException {

Review Comment:
   Rename to `getNewOrdMap` and add some explanation of the returned int 
array?



##########
lucene/core/src/java/org/apache/lucene/util/hnsw/IncrementalHnswGraphMerger.java:
##########
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.hnsw;
+
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
+
+import java.io.IOException;
+import java.util.Map;
+import org.apache.lucene.codecs.HnswGraphProvider;
+import org.apache.lucene.codecs.KnnVectorsReader;
+import org.apache.lucene.codecs.KnnVectorsWriter;
+import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
+import org.apache.lucene.index.ByteVectorValues;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FloatVectorValues;
+import org.apache.lucene.index.MergeState;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.BitSet;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.CollectionUtil;
+import org.apache.lucene.util.FixedBitSet;
+
+/**
+ * This selects the biggest Hnsw graph from the provided merge state and 
initializes a new
+ * HnswGraphBuilder with that graph as a starting point.
+ *
+ * @lucene.experimental
+ */
+public class IncrementalHnswGraphMerger {
+
+  private KnnVectorsReader initReader;
+  private MergeState.DocMap initDocMap;
+  private int initGraphSize;
+  private final FieldInfo fieldInfo;
+  private final RandomVectorScorerSupplier scorerSupplier;
+  private final int M;
+  private final int beamWidth;
+
+  /**
+   * @param fieldInfo FieldInfo for the field being merged
+   */
+  public IncrementalHnswGraphMerger(
+      FieldInfo fieldInfo, RandomVectorScorerSupplier scorerSupplier, int M, 
int beamWidth) {
+    this.fieldInfo = fieldInfo;
+    this.scorerSupplier = scorerSupplier;
+    this.M = M;
+    this.beamWidth = beamWidth;
+  }
+
+  /**
+   * Adds a reader to the graph merger if it meets the following criteria: 1. 
Does not contain any
+   * deleted docs 2. Is a HnswGraphProvider/PerFieldKnnVectorReader 3. Has the 
most docs of any
+   * previous reader that met the above criteria
+   *
+   * @param reader KnnVectorsReader to add to the merger
+   * @param docMap MergeState.DocMap for the reader
+   * @param liveDocs Bits representing live docs, can be null
+   * @return this
+   * @throws IOException If an error occurs while reading from the merge state
+   */
+  public IncrementalHnswGraphMerger addReader(
+      KnnVectorsReader reader, MergeState.DocMap docMap, Bits liveDocs) throws 
IOException {
+    KnnVectorsReader currKnnVectorsReader = reader;
+    if (reader instanceof PerFieldKnnVectorsFormat.FieldsReader 
candidateReader) {
+      currKnnVectorsReader = candidateReader.getFieldReader(fieldInfo.name);
+    }
+
+    if (!(currKnnVectorsReader instanceof HnswGraphProvider) || 
!allMatch(liveDocs)) {
+      return this;
+    }
+
+    int candidateVectorCount = 0;
+    switch (fieldInfo.getVectorEncoding()) {
+      case BYTE -> {
+        ByteVectorValues byteVectorValues =
+            currKnnVectorsReader.getByteVectorValues(fieldInfo.name);
+        if (byteVectorValues == null) {
+          return this;
+        }
+        candidateVectorCount = byteVectorValues.size();
+      }
+      case FLOAT32 -> {
+        FloatVectorValues vectorValues = 
currKnnVectorsReader.getFloatVectorValues(fieldInfo.name);
+        if (vectorValues == null) {
+          return this;
+        }
+        candidateVectorCount = vectorValues.size();
+      }
+    }
+    if (candidateVectorCount > initGraphSize) {
+      initReader = currKnnVectorsReader;
+      initDocMap = docMap;
+      initGraphSize = candidateVectorCount;
+    }
+    return this;
+  }
+
+  /**
+   * Builds a new HnswGraphBuilder using the biggest graph from the merge 
state as a starting point.
+   * If no valid readers were added to the merge state, a new graph is created.
+   *
+   * @param mergeState MergeState for the merge
+   * @return HnswGraphBuilder
+   * @throws IOException If an error occurs while reading from the merge state
+   */
+  public HnswGraphBuilder createBuilder(MergeState mergeState) throws 
IOException {
+    if (initReader == null) {
+      return HnswGraphBuilder.create(scorerSupplier, M, beamWidth, 
HnswGraphBuilder.randSeed);
+    }
+
+    HnswGraph initializerGraph = ((HnswGraphProvider) 
initReader).getGraph(fieldInfo.name);
+    BitSet initializedNodes = new FixedBitSet(mergeState.segmentInfo.maxDoc() 
+ 1);

Review Comment:
   We probably don't need this many bits, but we can postpone that to 
another PR, I think? Could you create an issue for this after the PR is merged? 
(And maybe attach the "Good First Issue" label?)



##########
lucene/core/src/java/org/apache/lucene/util/hnsw/InitializedHnswGraphBuilder.java:
##########
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.util.hnsw;
+
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
+
+import java.io.IOException;
+import org.apache.lucene.util.BitSet;
+
+/**
+ * This creates a graph builder that is initialized with the provided 
HnswGraph. This is useful for
+ * merging HnswGraphs from multiple segments.
+ *
+ * @lucene.experimental
+ */
+public final class InitializedHnswGraphBuilder extends HnswGraphBuilder {
+
+  /**
+   * Create a new HnswGraphBuilder that is initialized with the provided 
HnswGraph.
+   *
+   * @param scorerSupplier the scorer to use for vectors
+   * @param M the number of connections to keep per node
+   * @param beamWidth the number of nodes to explore in the search
+   * @param seed the seed for the random number generator
+   * @param initializerGraph the graph to initialize the new graph builder
+   * @param newOrdMap a mapping from the old node ordinal to the new node 
ordinal
+   * @param initializedNodes a bitset of nodes that are already initialized in 
the initializerGraph
+   * @return a new HnswGraphBuilder that is initialized with the provided 
HnswGraph
+   * @throws IOException when reading the graph fails
+   */
+  public static InitializedHnswGraphBuilder fromGraph(
+      RandomVectorScorerSupplier scorerSupplier,
+      int M,
+      int beamWidth,
+      long seed,
+      HnswGraph initializerGraph,
+      int[] newOrdMap,
+      BitSet initializedNodes)
+      throws IOException {
+    OnHeapHnswGraph hnsw = new OnHeapHnswGraph(M);
+    for (int level = 0; level < initializerGraph.numLevels(); level++) {

Review Comment:
   FYI, I have just merged #12651, so the insertion sequence here might need to be 
changed.



##########
lucene/core/src/java/org/apache/lucene/util/hnsw/IncrementalHnswGraphMerger.java:
##########
@@ -0,0 +1,197 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.util.hnsw;
+
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
+
+import java.io.IOException;
+import java.util.Map;
+import org.apache.lucene.codecs.HnswGraphProvider;
+import org.apache.lucene.codecs.KnnVectorsReader;
+import org.apache.lucene.codecs.KnnVectorsWriter;
+import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
+import org.apache.lucene.index.ByteVectorValues;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FloatVectorValues;
+import org.apache.lucene.index.MergeState;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.BitSet;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.CollectionUtil;
+import org.apache.lucene.util.FixedBitSet;
+
+/**
+ * This selects the biggest Hnsw graph from the provided merge state and 
initializes a new
+ * HnswGraphBuilder with that graph as a starting point.
+ *
+ * @lucene.experimental
+ */
+public class IncrementalHnswGraphMerger {
+
+  private KnnVectorsReader initReader;
+  private MergeState.DocMap initDocMap;
+  private int initGraphSize;
+  private final FieldInfo fieldInfo;
+  private final RandomVectorScorerSupplier scorerSupplier;
+  private final int M;
+  private final int beamWidth;
+
+  /**
+   * @param fieldInfo FieldInfo for the field being merged
+   */
+  public IncrementalHnswGraphMerger(
+      FieldInfo fieldInfo, RandomVectorScorerSupplier scorerSupplier, int M, 
int beamWidth) {
+    this.fieldInfo = fieldInfo;
+    this.scorerSupplier = scorerSupplier;
+    this.M = M;
+    this.beamWidth = beamWidth;
+  }
+
+  /**
+   * Adds a reader to the graph merger if it meets the following criteria: 1. 
Does not contain any
+   * deleted docs 2. Is a HnswGraphProvider/PerFieldKnnVectorReader 3. Has the 
most docs of any
+   * previous reader that met the above criteria
+   *
+   * @param reader KnnVectorsReader to add to the merger
+   * @param docMap MergeState.DocMap for the reader
+   * @param liveDocs Bits representing live docs, can be null
+   * @return this
+   * @throws IOException If an error occurs while reading from the merge state
+   */
+  public IncrementalHnswGraphMerger addReader(
+      KnnVectorsReader reader, MergeState.DocMap docMap, Bits liveDocs) throws 
IOException {
+    KnnVectorsReader currKnnVectorsReader = reader;
+    if (reader instanceof PerFieldKnnVectorsFormat.FieldsReader 
candidateReader) {
+      currKnnVectorsReader = candidateReader.getFieldReader(fieldInfo.name);
+    }
+
+    if (!(currKnnVectorsReader instanceof HnswGraphProvider) || 
!allMatch(liveDocs)) {
+      return this;
+    }
+
+    int candidateVectorCount = 0;
+    switch (fieldInfo.getVectorEncoding()) {
+      case BYTE -> {
+        ByteVectorValues byteVectorValues =
+            currKnnVectorsReader.getByteVectorValues(fieldInfo.name);
+        if (byteVectorValues == null) {
+          return this;
+        }
+        candidateVectorCount = byteVectorValues.size();
+      }
+      case FLOAT32 -> {
+        FloatVectorValues vectorValues = 
currKnnVectorsReader.getFloatVectorValues(fieldInfo.name);
+        if (vectorValues == null) {
+          return this;
+        }
+        candidateVectorCount = vectorValues.size();
+      }
+    }
+    if (candidateVectorCount > initGraphSize) {
+      initReader = currKnnVectorsReader;
+      initDocMap = docMap;
+      initGraphSize = candidateVectorCount;
+    }
+    return this;
+  }
+
+  /**
+   * Builds a new HnswGraphBuilder using the biggest graph from the merge 
state as a starting point.
+   * If no valid readers were added to the merge state, a new graph is created.
+   *
+   * @param mergeState MergeState for the merge
+   * @return HnswGraphBuilder
+   * @throws IOException If an error occurs while reading from the merge state
+   */
+  public HnswGraphBuilder createBuilder(MergeState mergeState) throws 
IOException {
+    if (initReader == null) {
+      return HnswGraphBuilder.create(scorerSupplier, M, beamWidth, 
HnswGraphBuilder.randSeed);
+    }
+
+    HnswGraph initializerGraph = ((HnswGraphProvider) 
initReader).getGraph(fieldInfo.name);
+    BitSet initializedNodes = new FixedBitSet(mergeState.segmentInfo.maxDoc() 
+ 1);
+    int[] ordBaseline = getNewOrdOffset(mergeState, initializedNodes);

Review Comment:
   `ordBaseline` -> `newOrdMap`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Reply via email to