mayya-sharipova commented on code in PR #11781: URL: https://github.com/apache/lucene/pull/11781#discussion_r974364476
########## lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java: ########## @@ -316,49 +316,49 @@ private boolean isDiverse(BytesRef candidate, NeighborArray neighbors, float sco */ private int findWorstNonDiverse(NeighborArray neighbors) throws IOException { for (int i = neighbors.size() - 1; i > 0; i--) { - if (isWorstNonDiverse(i, neighbors, neighbors.score[i])) { + if (isWorstNonDiverse(i, neighbors)) { return i; } } return neighbors.size() - 1; } - private boolean isWorstNonDiverse( - int candidate, NeighborArray neighbors, float minAcceptedSimilarity) throws IOException { + private boolean isWorstNonDiverse(int candidateIndex, NeighborArray neighbors) + throws IOException { + int candidateNode = neighbors.node[candidateIndex]; return switch (vectorEncoding) { - case BYTE -> isWorstNonDiverse( - candidate, vectors.binaryValue(candidate), neighbors, minAcceptedSimilarity); + case BYTE -> isWorstNonDiverse(candidateIndex, vectors.binaryValue(candidateNode), neighbors); case FLOAT32 -> isWorstNonDiverse( - candidate, vectors.vectorValue(candidate), neighbors, minAcceptedSimilarity); + candidateIndex, vectors.vectorValue(candidateNode), neighbors); }; } private boolean isWorstNonDiverse( - int candidateIndex, float[] candidate, NeighborArray neighbors, float minAcceptedSimilarity) - throws IOException { - for (int i = candidateIndex - 1; i > -0; i--) { + int candidateIndex, float[] candidateVector, NeighborArray neighbors) throws IOException { + float minAcceptedSimilarity = neighbors.score[candidateIndex]; + for (int i = candidateIndex - 1; i >= 0; i--) { float neighborSimilarity = - similarityFunction.compare(candidate, vectorsCopy.vectorValue(neighbors.node[i])); - // node i is too similar to node j given its score relative to the base node + similarityFunction.compare(candidateVector, vectorsCopy.vectorValue(neighbors.node[i])); + // candidate node is too similar to node i given its score relative to the base node if (neighborSimilarity >= 
minAcceptedSimilarity) { - return false; + return true; } } - return true; + return false; } private boolean isWorstNonDiverse( - int candidateIndex, BytesRef candidate, NeighborArray neighbors, float minAcceptedSimilarity) - throws IOException { - for (int i = candidateIndex - 1; i > -0; i--) { + int candidateIndex, BytesRef candidateVector, NeighborArray neighbors) throws IOException { Review Comment: I am surprised that with this big change, we had only a small reduction in recall. I guess the reason could be that in our tests the diversity check was really relevant only for a small number of nodes; in the majority of cases the algorithm just eliminated the most distant node. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org