mccullocht commented on code in PR #15003:
URL: https://github.com/apache/lucene/pull/15003#discussion_r2538761550
##########
lucene/core/src/java/org/apache/lucene/util/hnsw/InitializedHnswGraphBuilder.java:
##########
@@ -53,49 +110,338 @@ public static InitializedHnswGraphBuilder fromGraph(
BitSet initializedNodes,
int totalNumberOfVectors)
throws IOException {
- return new InitializedHnswGraphBuilder(
- scorerSupplier,
- beamWidth,
- seed,
- initGraph(initializerGraph, newOrdMap, totalNumberOfVectors),
- initializedNodes);
+
+ InitializedHnswGraphBuilder builder =
+ new InitializedHnswGraphBuilder(
+ scorerSupplier,
+ beamWidth,
+ seed,
+ new OnHeapHnswGraph(initializerGraph.maxConn(),
totalNumberOfVectors),
+ initializedNodes);
+
+ builder.initializeFromGraph(initializerGraph, newOrdMap);
+ return builder;
}
+ /**
+ * Convenience method to create a fully initialized on-heap HNSW graph
without tracking
+ * initialized nodes. This is useful when you just need the resulting graph
structure without
+ * planning to add additional nodes incrementally.
+ *
+ * @param initializerGraph the source graph to copy structure from
+ * @param newOrdMap maps old ordinals to new ordinals; -1 indicates deleted
documents
+ * @param totalNumberOfVectors the total number of vectors in the merged
graph
+ * @param beamWidth the search beam width for graph construction
+ * @param scorerSupplier provides vector similarity scoring
+ * @return a fully initialized on-heap HNSW graph
+ * @throws IOException if an I/O error occurs during graph initialization
+ */
public static OnHeapHnswGraph initGraph(
- HnswGraph initializerGraph, int[] newOrdMap, int totalNumberOfVectors)
throws IOException {
- OnHeapHnswGraph hnsw = new OnHeapHnswGraph(initializerGraph.maxConn(),
totalNumberOfVectors);
- for (int level = initializerGraph.numLevels() - 1; level >= 0; level--) {
+ HnswGraph initializerGraph,
+ int[] newOrdMap,
+ int totalNumberOfVectors,
+ int beamWidth,
+ RandomVectorScorerSupplier scorerSupplier)
+ throws IOException {
+
+ InitializedHnswGraphBuilder builder =
+ fromGraph(
+ scorerSupplier,
+ beamWidth,
+ randSeed,
+ initializerGraph,
+ newOrdMap,
+ null,
+ totalNumberOfVectors);
+ return builder.getGraph();
+ }
+
+ private InitializedHnswGraphBuilder(
+ RandomVectorScorerSupplier scorerSupplier,
+ int beamWidth,
+ long seed,
+ OnHeapHnswGraph initializedGraph,
+ BitSet initializedNodes)
+ throws IOException {
+ super(scorerSupplier, beamWidth, seed, initializedGraph);
+ this.initializedNodes = initializedNodes;
+ }
+
+ /**
+ * Initializes the graph from the provided initializer graph through a
three-phase process:
+ *
+ * <ol>
+ * <li>Copy the graph structure with ordinal remapping, identifying
disconnected nodes
+ * <li>If deletions occurred, repair disconnected nodes by finding
additional neighbors
+ * <li>If deletions occurred, rebalance the graph hierarchy to maintain
proper level
+ * distribution
+ * </ol>
+ *
+ * @param initializerGraph the source graph to copy from
+ * @param newOrdMap ordinal mapping from old to new ordinals
+ * @throws IOException if an I/O error occurs during initialization
+ */
+ private void initializeFromGraph(HnswGraph initializerGraph, int[]
newOrdMap) throws IOException {
+ hasDeletes = false;
+ // Phase 1: Copy structure and identify nodes that lost too many neighbors
+ Map<Integer, List<Integer>> disconnectedNodesByLevel =
+ copyGraphStructure(initializerGraph, newOrdMap);
+
+ // Repair graph if it has deletes
+ if (hasDeletes) {
+ // Phase 2: Repair nodes with insufficient connections
+ repairDisconnectedNodes(disconnectedNodesByLevel,
initializerGraph.numLevels());
+
+ // Phase 3: Rebalance graph to maintain proper level distribution
+ rebalanceGraph();
+ }
+ }
+
+ /**
+ * Copies the graph structure from the initializer graph, applying ordinal
remapping and
+ * identifying nodes that have lost neighbors.
+ *
+ * <p>A node is considered disconnected if it retains less than {@link
#DISCONNECTED_NODE_FACTOR}
+ * of its original neighbors. This happens when many neighbors were deleted
documents that
+ * couldn't be remapped (indicated by -1 in newOrdMap).
+ *
+ * <p><b>Example:</b> With DISCONNECTED_NODE_FACTOR = 0.9, if a node had 20
neighbors in the
+ * source graph but only 17 remain after remapping (17/20 = 0.85 < 0.9),
it's marked as
+ * disconnected and will be repaired.
+ *
+ * @param initializerGraph the source graph to copy from
+ * @param newOrdMap maps old ordinals to new ordinals; -1 indicates deleted
documents
+ * @return map of level to list of disconnected node ordinals at that level
+ * @throws IOException if an I/O error occurs during graph traversal
+ */
+ private Map<Integer, List<Integer>> copyGraphStructure(
+ HnswGraph initializerGraph, int[] newOrdMap) throws IOException {
+ int numLevels = initializerGraph.numLevels();
+ levelToNodes = new IntArrayList[numLevels];
+ Map<Integer, List<Integer>> disconnectedNodesByLevel = new
HashMap<>(numLevels);
+
+ for (int level = numLevels - 1; level >= 0; level--) {
+ levelToNodes[level] = new IntArrayList();
+ List<Integer> disconnectedNodes = new ArrayList<>();
HnswGraph.NodesIterator it = initializerGraph.getNodesOnLevel(level);
+
while (it.hasNext()) {
int oldOrd = it.nextInt();
int newOrd = newOrdMap[oldOrd];
+
+ // Skip deleted documents (mapped to -1)
+ if (newOrd == -1) {
+ hasDeletes = true;
+ continue;
+ }
+
hnsw.addNode(level, newOrd);
+ levelToNodes[level].add(newOrd);
hnsw.trySetNewEntryNode(newOrd, level);
+ scorer.setScoringOrdinal(newOrd);
+
+ // Copy neighbors
NeighborArray newNeighbors = hnsw.getNeighbors(level, newOrd);
initializerGraph.seek(level, oldOrd);
+ int oldNeighbourCount = 0;
for (int oldNeighbor = initializerGraph.nextNeighbor();
oldNeighbor != NO_MORE_DOCS;
oldNeighbor = initializerGraph.nextNeighbor()) {
+ oldNeighbourCount++;
int newNeighbor = newOrdMap[oldNeighbor];
- // we will compute these scores later when we need to pop out the
non-diverse nodes
- newNeighbors.addOutOfOrder(newNeighbor, Float.NaN);
+
+ // Only add neighbors that weren't deleted
+ if (newNeighbor != -1) {
+ newNeighbors.addOutOfOrder(newNeighbor, Float.NaN);
+ }
+ }
+
+ // Mark as disconnected if node lost more than the acceptable
threshold of neighbors
+ if (newNeighbors.size() < oldNeighbourCount *
DISCONNECTED_NODE_FACTOR) {
+ disconnectedNodes.add(newOrd);
}
}
+ disconnectedNodesByLevel.put(level, disconnectedNodes);
}
- return hnsw;
+ return disconnectedNodesByLevel;
}
- private final BitSet initializedNodes;
+ /**
+ * Repairs disconnected nodes at all levels by finding additional neighbors
to restore
+ * connectivity.
+ *
+ * @param disconnectedNodesByLevel map of level to disconnected nodes at
that level
+ * @param numLevels total number of levels in the graph hierarchy
+ * @throws IOException if an I/O error occurs during repair operations
+ */
+ private void repairDisconnectedNodes(
+ Map<Integer, List<Integer>> disconnectedNodesByLevel, int numLevels)
throws IOException {
+ for (int level = numLevels - 1; level >= 0; level--) {
+ fixDisconnectedNodes(disconnectedNodesByLevel.get(level), level, scorer);
+ }
+ }
- public InitializedHnswGraphBuilder(
- RandomVectorScorerSupplier scorerSupplier,
- int beamWidth,
- long seed,
- OnHeapHnswGraph initializedGraph,
- BitSet initializedNodes)
+ /**
+ * Fixes disconnected nodes at a specific level by performing graph searches
from their existing
Review Comment:
It looks like this PR has been thoroughly tested and I didn't want to block
it but trying this algorithm would be a good follow up.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]