Pulkitg64 commented on code in PR #15003:
URL: https://github.com/apache/lucene/pull/15003#discussion_r2538726262


##########
lucene/core/src/java/org/apache/lucene/util/hnsw/InitializedHnswGraphBuilder.java:
##########
@@ -53,49 +110,338 @@ public static InitializedHnswGraphBuilder fromGraph(
       BitSet initializedNodes,
       int totalNumberOfVectors)
       throws IOException {
-    return new InitializedHnswGraphBuilder(
-        scorerSupplier,
-        beamWidth,
-        seed,
-        initGraph(initializerGraph, newOrdMap, totalNumberOfVectors),
-        initializedNodes);
+
+    InitializedHnswGraphBuilder builder =
+        new InitializedHnswGraphBuilder(
+            scorerSupplier,
+            beamWidth,
+            seed,
+            new OnHeapHnswGraph(initializerGraph.maxConn(), 
totalNumberOfVectors),
+            initializedNodes);
+
+    builder.initializeFromGraph(initializerGraph, newOrdMap);
+    return builder;
   }
 
+  /**
+   * Convenience method to create a fully initialized on-heap HNSW graph 
without tracking
+   * initialized nodes. This is useful when you just need the resulting graph 
structure without
+   * planning to add additional nodes incrementally.
+   *
+   * @param initializerGraph the source graph to copy structure from
+   * @param newOrdMap maps old ordinals to new ordinals; -1 indicates deleted 
documents
+   * @param totalNumberOfVectors the total number of vectors in the merged 
graph
+   * @param beamWidth the search beam width for graph construction
+   * @param scorerSupplier provides vector similarity scoring
+   * @return a fully initialized on-heap HNSW graph
+   * @throws IOException if an I/O error occurs during graph initialization
+   */
   public static OnHeapHnswGraph initGraph(
-      HnswGraph initializerGraph, int[] newOrdMap, int totalNumberOfVectors) 
throws IOException {
-    OnHeapHnswGraph hnsw = new OnHeapHnswGraph(initializerGraph.maxConn(), 
totalNumberOfVectors);
-    for (int level = initializerGraph.numLevels() - 1; level >= 0; level--) {
+      HnswGraph initializerGraph,
+      int[] newOrdMap,
+      int totalNumberOfVectors,
+      int beamWidth,
+      RandomVectorScorerSupplier scorerSupplier)
+      throws IOException {
+
+    InitializedHnswGraphBuilder builder =
+        fromGraph(
+            scorerSupplier,
+            beamWidth,
+            randSeed,
+            initializerGraph,
+            newOrdMap,
+            null,
+            totalNumberOfVectors);
+    return builder.getGraph();
+  }
+
+  private InitializedHnswGraphBuilder(
+      RandomVectorScorerSupplier scorerSupplier,
+      int beamWidth,
+      long seed,
+      OnHeapHnswGraph initializedGraph,
+      BitSet initializedNodes)
+      throws IOException {
+    super(scorerSupplier, beamWidth, seed, initializedGraph);
+    this.initializedNodes = initializedNodes;
+  }
+
+  /**
+   * Initializes the graph from the provided initializer graph through a 
three-phase process:
+   *
+   * <ol>
+   *   <li>Copy the graph structure with ordinal remapping, identifying 
disconnected nodes
+   *   <li>If deletions occurred, repair disconnected nodes by finding 
additional neighbors
+   *   <li>If deletions occurred, rebalance the graph hierarchy to maintain 
proper level
+   *       distribution
+   * </ol>
+   *
+   * @param initializerGraph the source graph to copy from
+   * @param newOrdMap ordinal mapping from old to new ordinals
+   * @throws IOException if an I/O error occurs during initialization
+   */
+  private void initializeFromGraph(HnswGraph initializerGraph, int[] 
newOrdMap) throws IOException {
+    hasDeletes = false;
+    // Phase 1: Copy structure and identify nodes that lost too many neighbors
+    Map<Integer, List<Integer>> disconnectedNodesByLevel =
+        copyGraphStructure(initializerGraph, newOrdMap);
+
+    // Repair graph if it has deletes
+    if (hasDeletes) {
+      // Phase 2: Repair nodes with insufficient connections
+      repairDisconnectedNodes(disconnectedNodesByLevel, 
initializerGraph.numLevels());
+
+      // Phase 3: Rebalance graph to maintain proper level distribution
+      rebalanceGraph();
+    }
+  }
+
+  /**
+   * Copies the graph structure from the initializer graph, applying ordinal 
remapping and
+   * identifying nodes that have lost neighbors.
+   *
+   * <p>A node is considered disconnected if it retains less than {@link 
#DISCONNECTED_NODE_FACTOR}
+   * of its original neighbors. This happens when many neighbors were deleted 
documents that
+   * couldn't be remapped (indicated by -1 in newOrdMap).
+   *
+   * <p><b>Example:</b> With DISCONNECTED_NODE_FACTOR = 0.9, if a node had 20 
neighbors in the
+   * source graph but only 17 remain after remapping (17/20 = 0.85 < 0.9), 
it's marked as
+   * disconnected and will be repaired.
+   *
+   * @param initializerGraph the source graph to copy from
+   * @param newOrdMap maps old ordinals to new ordinals; -1 indicates deleted 
documents
+   * @return map of level to list of disconnected node ordinals at that level
+   * @throws IOException if an I/O error occurs during graph traversal
+   */
+  private Map<Integer, List<Integer>> copyGraphStructure(
+      HnswGraph initializerGraph, int[] newOrdMap) throws IOException {
+    int numLevels = initializerGraph.numLevels();
+    levelToNodes = new IntArrayList[numLevels];
+    Map<Integer, List<Integer>> disconnectedNodesByLevel = new 
HashMap<>(numLevels);
+
+    for (int level = numLevels - 1; level >= 0; level--) {
+      levelToNodes[level] = new IntArrayList();
+      List<Integer> disconnectedNodes = new ArrayList<>();
       HnswGraph.NodesIterator it = initializerGraph.getNodesOnLevel(level);
+
       while (it.hasNext()) {
         int oldOrd = it.nextInt();
         int newOrd = newOrdMap[oldOrd];
+
+        // Skip deleted documents (mapped to -1)
+        if (newOrd == -1) {
+          hasDeletes = true;
+          continue;
+        }
+
         hnsw.addNode(level, newOrd);
+        levelToNodes[level].add(newOrd);
         hnsw.trySetNewEntryNode(newOrd, level);
+        scorer.setScoringOrdinal(newOrd);
+
+        // Copy neighbors
         NeighborArray newNeighbors = hnsw.getNeighbors(level, newOrd);
         initializerGraph.seek(level, oldOrd);
+        int oldNeighbourCount = 0;
         for (int oldNeighbor = initializerGraph.nextNeighbor();
             oldNeighbor != NO_MORE_DOCS;
             oldNeighbor = initializerGraph.nextNeighbor()) {
+          oldNeighbourCount++;
           int newNeighbor = newOrdMap[oldNeighbor];
-          // we will compute these scores later when we need to pop out the 
non-diverse nodes
-          newNeighbors.addOutOfOrder(newNeighbor, Float.NaN);
+
+          // Only add neighbors that weren't deleted
+          if (newNeighbor != -1) {
+            newNeighbors.addOutOfOrder(newNeighbor, Float.NaN);
+          }
+        }
+
+        // Mark as disconnected if node lost more than the acceptable 
threshold of neighbors
+        if (newNeighbors.size() < oldNeighbourCount * 
DISCONNECTED_NODE_FACTOR) {
+          disconnectedNodes.add(newOrd);
         }
       }
+      disconnectedNodesByLevel.put(level, disconnectedNodes);
     }
-    return hnsw;
+    return disconnectedNodesByLevel;
   }
 
-  private final BitSet initializedNodes;
+  /**
+   * Repairs disconnected nodes at all levels by finding additional neighbors 
to restore
+   * connectivity.
+   *
+   * @param disconnectedNodesByLevel map of level to disconnected nodes at 
that level
+   * @param numLevels total number of levels in the graph hierarchy
+   * @throws IOException if an I/O error occurs during repair operations
+   */
+  private void repairDisconnectedNodes(
+      Map<Integer, List<Integer>> disconnectedNodesByLevel, int numLevels) 
throws IOException {
+    for (int level = numLevels - 1; level >= 0; level--) {
+      fixDisconnectedNodes(disconnectedNodesByLevel.get(level), level, scorer);
+    }
+  }
 
-  public InitializedHnswGraphBuilder(
-      RandomVectorScorerSupplier scorerSupplier,
-      int beamWidth,
-      long seed,
-      OnHeapHnswGraph initializedGraph,
-      BitSet initializedNodes)
+  /**
+   * Fixes disconnected nodes at a specific level by performing graph searches 
from their existing

Review Comment:
   Thanks @mccullocht for idea and sharing this paper. The idea of using the 
neighbors and neighbor of neighbors of the deleted nodes for reconnection seems 
like a good idea (definitely faster) and I think worth pursuing. 
   But I wonder if we should do it as part of this PR or different one. I can 
work on this idea immediately as part of different issue if that makes sense to 
you.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to