zhaih commented on code in PR #12660: URL: https://github.com/apache/lucene/pull/12660#discussion_r1369644221
########## lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java: ########## @@ -221,34 +296,50 @@ private long printGraphBuildStatus(int node, long start, long t) { return now; } - private void addDiverseNeighbors(int level, int node, GraphBuilderKnnCollector candidates) + private void addDiverseNeighbors(int level, int node, NeighborArray candidates) throws IOException { /* For each of the beamWidth nearest candidates (going from best to worst), select it only if it * is closer to target than it is to any of the already-selected neighbors (ie selected in this method, * since the node is new and has no prior neighbors). */ NeighborArray neighbors = hnsw.getNeighbors(level, node); assert neighbors.size() == 0; // new node - popToScratch(candidates); int maxConnOnLevel = level == 0 ? M * 2 : M; - selectAndLinkDiverse(neighbors, scratch, maxConnOnLevel); + boolean[] mask = selectAndLinkDiverse(neighbors, candidates, maxConnOnLevel); // Link the selected nodes to the new node, and the new node to the selected nodes (again // applying diversity heuristic) - int size = neighbors.size(); - for (int i = 0; i < size; i++) { - int nbr = neighbors.node[i]; + // NOTE: here we're using candidates and mask but not the neighbour array because once we have + // added incoming link there will be possibilities of this node being discovered and neighbour + // array being modified. So using local candidates and mask is a safer option. + for (int i = 0; i < candidates.size(); i++) { + if (mask[i] == false) { + continue; + } + int nbr = candidates.node[i]; NeighborArray nbrsOfNbr = hnsw.getNeighbors(level, nbr); - nbrsOfNbr.addOutOfOrder(node, neighbors.score[i]); - if (nbrsOfNbr.size() > maxConnOnLevel) { - int indexToRemove = findWorstNonDiverse(nbrsOfNbr, nbr); - nbrsOfNbr.removeIndex(indexToRemove); + long start = System.nanoTime(); + nbrsOfNbr.rwlock.writeLock().lock(); Review Comment: So when I'm testing it's like 20ms of overhead on grabing this lock for building roughly 20k docs with 384 dim vectors. So I think it's not bad given the HNSW build itself is much slower? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org