zhaih commented on code in PR #12660:
URL: https://github.com/apache/lucene/pull/12660#discussion_r1367956726


##########
lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java:
##########
@@ -151,61 +159,124 @@ public OnHeapHnswGraph build(int maxOrd) throws 
IOException {
     return hnsw;
   }
 
-  /** Set info-stream to output debugging information * */
+  @Override
   public void setInfoStream(InfoStream infoStream) {
     this.infoStream = infoStream;
   }
 
+  @Override
   public OnHeapHnswGraph getGraph() {
     return hnsw;
   }
 
-  private void addVectors(int maxOrd) throws IOException {
+  protected void addVectors(int minOrd, int maxOrd) throws IOException {
     long start = System.nanoTime(), t = start;
-    for (int node = 0; node < maxOrd; node++) {
+    if (infoStream.isEnabled(HNSW_COMPONENT)) {
+      infoStream.message(HNSW_COMPONENT, "addVectors [" + minOrd + " " + 
maxOrd + ")");
+    }
+    //    System.out.println("addVectors [" + minOrd + " " + maxOrd + ") 
initialized.size=" +
+    // initializedNodes.size());
+    for (int node = minOrd; node < maxOrd; node++) {
+      // System.out.println("add node " + node + " t=" + 
Thread.currentThread().getName());
       addGraphNode(node);
+      // System.out.println("entry node " + hnsw.entryNode());
+      // System.out.println("node " + node + " nbrs.size()=" + 
hnsw.getNeighbors(0, node).size());
       if ((node % 10000 == 0) && infoStream.isEnabled(HNSW_COMPONENT)) {
         t = printGraphBuildStatus(node, start, t);
       }
     }
+    //    System.out.println("addVectors [" + minOrd + " " + maxOrd + ") done 
+ graph.size=" +
+    // hnsw.size());
+  }
+
+  @SuppressWarnings({"rawtypes", "unchecked"})
+  private void addVectors(int maxOrd) throws IOException {
+    addVectors(0, maxOrd);
   }
 
-  /** Inserts a doc with vector value to the graph */
+  @Override
   public void addGraphNode(int node) throws IOException {
+    /*
+    Note: this implementation is thread safe when graph size is fixed (e.g. 
when merging)
+    The process of adding a node is roughly:
+    1. Add the node to all level from top to the bottom, but do not connect it 
to any other node,
+       nor try to promote itself to an entry node before the connection is done
+    2. Do the search from top to bottom, remember all the possible neighbours 
on each level the node
+       is on.
+    3. Add the neighbor to the node from bottom to top level, when adding the 
neighbour,
+       we always add all the outgoing links first before adding incoming link 
such that
+       when a search visiting this node, it can always find a way out
+    4. If the node has level that is less or equal to graph level, then we're 
done here.
+       If the node has level larger than graph level, then we need to promote 
the node
+       as the entry node. If, while we add the node to the graph, the entry 
node has changed
+       (which means the graph level has changed as well), we need to reinsert 
the node
+       to the newly introduced levels (repeating step 2,3 for new levels) and 
again try to
+       promote the node to entry node.
+    */
     RandomVectorScorer scorer = scorerSupplier.scorer(node);
     final int nodeLevel = getRandomGraphLevel(ml, random);
-    int curMaxLevel = hnsw.numLevels() - 1;
-
-    // If entrynode is -1, then this should finish without adding neighbors
-    if (hnsw.entryNode() == -1) {
-      for (int level = nodeLevel; level >= 0; level--) {
-        hnsw.addNode(level, node);
-      }
+    // first add nodes to all levels
+    for (int level = nodeLevel; level >= 0; level--) {
+      hnsw.addNode(level, node);
+    }
+    // then promote itself as entry node if entry node is not set
+    if (hnsw.trySetNewEntryNode(node, nodeLevel)) {
       return;
     }
-    int[] eps = new int[] {hnsw.entryNode()};
+    // if the entry node is already set, then we have to do all connections 
first before we can
+    // promote ourselves as entry node
+    // do connections from bottom up
+    int lowestUnsetLevel = 0;
+    int curMaxLevel;
+    do {
+      curMaxLevel = hnsw.numLevels() - 1;
+      // NOTE: the entry node and max level may not be paired, but because we 
get the level first
+      // we ensure that the entry node we get later will always exist on the 
curMaxLevel
+      int[] eps = new int[] {hnsw.entryNode()};
+      // for levels > nodeLevel search with topk = 1
+      GraphBuilderKnnCollector candidates = entryCandidates;
+      for (int level = curMaxLevel; level > nodeLevel; level--) {
+        candidates.clear();
+        graphSearcher.searchLevel(candidates, scorer, level, eps, hnsw, null);
+        eps = new int[] {candidates.popNode()};
+      }
 
-    // if a node introduces new levels to the graph, add this new node on new 
levels
-    for (int level = nodeLevel; level > curMaxLevel; level--) {
-      hnsw.addNode(level, node);
-    }
+      // for levels <= nodeLevel search with topk = beamWidth, and add 
connections
+      candidates = beamCandidates;
+      NeighborArray[] scratchPerLevel =
+          new NeighborArray[Math.min(nodeLevel, curMaxLevel) - 
lowestUnsetLevel + 1];
+      for (int i = scratchPerLevel.length - 1; i >= 0; i--) {
+        int level = i + lowestUnsetLevel;
+        candidates.clear();
+        graphSearcher.searchLevel(candidates, scorer, level, eps, hnsw, null);
+        eps = candidates.popUntilNearestKNodes();
+        scratchPerLevel[i] = new NeighborArray(Math.max(beamCandidates.k(), M 
+ 1), false);
+        popToScratch(candidates, scratchPerLevel[i]);
+      }
 
-    // for levels > nodeLevel search with topk = 1
-    GraphBuilderKnnCollector candidates = entryCandidates;
-    for (int level = curMaxLevel; level > nodeLevel; level--) {
-      candidates.clear();
-      graphSearcher.searchLevel(candidates, scorer, level, eps, hnsw, null);
-      eps = new int[] {candidates.popNode()};
-    }
-    // for levels <= nodeLevel search with topk = beamWidth, and add 
connections
-    candidates = beamCandidates;
-    for (int level = Math.min(nodeLevel, curMaxLevel); level >= 0; level--) {
-      candidates.clear();
-      graphSearcher.searchLevel(candidates, scorer, level, eps, hnsw, null);
-      eps = candidates.popUntilNearestKNodes();
-      hnsw.addNode(level, node);
-      addDiverseNeighbors(level, node, candidates);
-    }
+      for (int i = 0; i < scratchPerLevel.length; i++) {
+        addDiverseNeighbors(i + lowestUnsetLevel, node, scratchPerLevel[i]);
+      }
+      lowestUnsetLevel = scratchPerLevel.length + lowestUnsetLevel;
+      assert lowestUnsetLevel == Math.min(nodeLevel, curMaxLevel) + 1;
+      if (lowestUnsetLevel > nodeLevel) {
+        return;
+      }
+      assert lowestUnsetLevel == curMaxLevel + 1 && nodeLevel > curMaxLevel;
+      if (hnsw.tryPromoteNewEntryNode(node, nodeLevel, curMaxLevel)) {
+        return;
+      }
+      if (hnsw.numLevels() == curMaxLevel + 1) {
+        throw new IllegalStateException(

Review Comment:
   Yes, if all the logic and calculation is right then this should never happen.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Reply via email to