msokolov commented on code in PR #11781: URL: https://github.com/apache/lucene/pull/11781#discussion_r974411586
########## lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java: ########## @@ -316,49 +316,49 @@ private boolean isDiverse(BytesRef candidate, NeighborArray neighbors, float sco */ private int findWorstNonDiverse(NeighborArray neighbors) throws IOException { for (int i = neighbors.size() - 1; i > 0; i--) { - if (isWorstNonDiverse(i, neighbors, neighbors.score[i])) { + if (isWorstNonDiverse(i, neighbors)) { return i; } } return neighbors.size() - 1; } - private boolean isWorstNonDiverse( - int candidate, NeighborArray neighbors, float minAcceptedSimilarity) throws IOException { + private boolean isWorstNonDiverse(int candidateIndex, NeighborArray neighbors) + throws IOException { + int candidateNode = neighbors.node[candidateIndex]; return switch (vectorEncoding) { - case BYTE -> isWorstNonDiverse( - candidate, vectors.binaryValue(candidate), neighbors, minAcceptedSimilarity); + case BYTE -> isWorstNonDiverse(candidateIndex, vectors.binaryValue(candidateNode), neighbors); case FLOAT32 -> isWorstNonDiverse( - candidate, vectors.vectorValue(candidate), neighbors, minAcceptedSimilarity); + candidateIndex, vectors.vectorValue(candidateNode), neighbors); }; } private boolean isWorstNonDiverse( - int candidateIndex, float[] candidate, NeighborArray neighbors, float minAcceptedSimilarity) - throws IOException { - for (int i = candidateIndex - 1; i > -0; i--) { + int candidateIndex, float[] candidateVector, NeighborArray neighbors) throws IOException { + float minAcceptedSimilarity = neighbors.score[candidateIndex]; + for (int i = candidateIndex - 1; i >= 0; i--) { float neighborSimilarity = - similarityFunction.compare(candidate, vectorsCopy.vectorValue(neighbors.node[i])); - // node i is too similar to node j given its score relative to the base node + similarityFunction.compare(candidateVector, vectorsCopy.vectorValue(neighbors.node[i])); + // candidate node is too similar to node i given its score relative to the base node if (neighborSimilarity >= minAcceptedSimilarity) { - return false; + return true; } } - return true; + return false; } private boolean isWorstNonDiverse( - int candidateIndex, BytesRef candidate, NeighborArray neighbors, float minAcceptedSimilarity) - throws IOException { - for (int i = candidateIndex - 1; i > -0; i--) { + int candidateIndex, BytesRef candidateVector, NeighborArray neighbors) throws IOException { Review Comment: I know - how did this garbage even work at all! :frowning_face: It's kind of astonishing how insensitive this whole process is to the diversity checking. Initially we didn't have it at all though (just always pick the closest neighbors), and things still kind of work. Then I had the wonky implementation that did not sort the neighbors while indexing, but did some best effort kind of thing, and still it mostly worked. So we need good tests here to ensure we are doing the right thing! Because bugs here can lead to small degradation. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org