zhaih commented on code in PR #12660: URL: https://github.com/apache/lucene/pull/12660#discussion_r1367956372
########## lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java: ########## @@ -151,61 +159,124 @@ public OnHeapHnswGraph build(int maxOrd) throws IOException { return hnsw; } - /** Set info-stream to output debugging information * */ + @Override public void setInfoStream(InfoStream infoStream) { this.infoStream = infoStream; } + @Override public OnHeapHnswGraph getGraph() { return hnsw; } - private void addVectors(int maxOrd) throws IOException { + protected void addVectors(int minOrd, int maxOrd) throws IOException { long start = System.nanoTime(), t = start; - for (int node = 0; node < maxOrd; node++) { + if (infoStream.isEnabled(HNSW_COMPONENT)) { + infoStream.message(HNSW_COMPONENT, "addVectors [" + minOrd + " " + maxOrd + ")"); + } + // System.out.println("addVectors [" + minOrd + " " + maxOrd + ") initialized.size=" + + // initializedNodes.size()); + for (int node = minOrd; node < maxOrd; node++) { + // System.out.println("add node " + node + " t=" + Thread.currentThread().getName()); addGraphNode(node); + // System.out.println("entry node " + hnsw.entryNode()); + // System.out.println("node " + node + " nbrs.size()=" + hnsw.getNeighbors(0, node).size()); if ((node % 10000 == 0) && infoStream.isEnabled(HNSW_COMPONENT)) { t = printGraphBuildStatus(node, start, t); } } + // System.out.println("addVectors [" + minOrd + " " + maxOrd + ") done + graph.size=" + + // hnsw.size()); + } + + @SuppressWarnings({"rawtypes", "unchecked"}) + private void addVectors(int maxOrd) throws IOException { + addVectors(0, maxOrd); } - /** Inserts a doc with vector value to the graph */ + @Override public void addGraphNode(int node) throws IOException { + /* + Note: this implementation is thread safe when graph size is fixed (e.g. when merging) + The process of adding a node is roughly: + 1. Add the node to all level from top to the bottom, but do not connect it to any other node, + nor try to promote itself to an entry node before the connection is done + 2. Do the search from top to bottom, remember all the possible neighbours on each level the node + is on. + 3. Add the neighbor to the node from bottom to top level, when adding the neighbour, + we always add all the outgoing links first before adding incoming link such that + when a search visiting this node, it can always find a way out + 4. If the node has level that is less or equal to graph level, then we're done here. + If the node has level larger than graph level, then we need to promote the node + as the entry node. If, while we add the node to the graph, the entry node has changed + (which means the graph level has changed as well), we need to reinsert the node + to the newly introduced levels (repeating step 2,3 for new levels) and again try to + promote the node to entry node. + */ RandomVectorScorer scorer = scorerSupplier.scorer(node); final int nodeLevel = getRandomGraphLevel(ml, random); - int curMaxLevel = hnsw.numLevels() - 1; - - // If entrynode is -1, then this should finish without adding neighbors - if (hnsw.entryNode() == -1) { - for (int level = nodeLevel; level >= 0; level--) { - hnsw.addNode(level, node); - } + // first add nodes to all levels + for (int level = nodeLevel; level >= 0; level--) { + hnsw.addNode(level, node); + } + // then promote itself as entry node if entry node is not set + if (hnsw.trySetNewEntryNode(node, nodeLevel)) { return; } - int[] eps = new int[] {hnsw.entryNode()}; + // if the entry node is already set, then we have to do all connections first before we can + // promote ourselves as entry node + // do connections from bottom up + int lowestUnsetLevel = 0; + int curMaxLevel; + do { + curMaxLevel = hnsw.numLevels() - 1; + // NOTE: the entry node and max level may not be paired, but because we get the level first + // we ensure that the entry node we get later will always exist on the curMaxLevel + int[] eps = new int[] {hnsw.entryNode()}; + // for levels > nodeLevel search with topk = 1 + GraphBuilderKnnCollector candidates = entryCandidates; + for (int level = curMaxLevel; level > nodeLevel; level--) { + candidates.clear(); + graphSearcher.searchLevel(candidates, scorer, level, eps, hnsw, null); + eps = new int[] {candidates.popNode()}; Review Comment: good catch, let me change it. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org