muse-dev[bot] commented on a change in pull request #2022: URL: https://github.com/apache/lucene-solr/pull/2022#discussion_r511202225
########## File path: lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90VectorReader.java ########## @@ -165,42 +191,88 @@ public VectorValues getVectorValues(String field) throws IOException { return new OffHeapVectorValues(fieldEntry, bytesSlice); } + // exposed for testing + public KnnGraphValues getGraphValues(String field) throws IOException { + FieldInfo info = fieldInfos.fieldInfo(field); + if (info == null) { + throw new IllegalArgumentException("No such field '" + field + "'"); + } + FieldEntry entry = fields.get(field); + if (entry != null && entry.indexDataLength > 0) { + return getGraphValues(entry); + } else { + return KnnGraphValues.EMPTY; + } + } + + private KnnGraphValues getGraphValues(FieldEntry entry) throws IOException { + if (isHnswStrategy(entry.searchStrategy)) { + HnswGraphFieldEntry graphEntry = (HnswGraphFieldEntry) entry; + IndexInput bytesSlice = vectorIndex.slice("graph-data", entry.indexDataOffset, entry.indexDataLength); + return new IndexedKnnGraphReader(graphEntry, bytesSlice); + } else { + return KnnGraphValues.EMPTY; + } + } + @Override public void close() throws IOException { - vectorData.close(); + IOUtils.close(vectorData, vectorIndex); } private static class FieldEntry { final int dimension; final VectorValues.SearchStrategy searchStrategy; - final int maxDoc; final long vectorDataOffset; final long vectorDataLength; + final long indexDataOffset; + final long indexDataLength; final int[] ordToDoc; - FieldEntry(int dimension, VectorValues.SearchStrategy searchStrategy, int maxDoc, - long vectorDataOffset, long vectorDataLength, int[] ordToDoc) { - this.dimension = dimension; + FieldEntry(DataInput input, VectorValues.SearchStrategy searchStrategy) throws IOException { this.searchStrategy = searchStrategy; - this.maxDoc = maxDoc; - this.vectorDataOffset = vectorDataOffset; - this.vectorDataLength = vectorDataLength; - this.ordToDoc = ordToDoc; + vectorDataOffset = input.readVLong(); + vectorDataLength = input.readVLong(); + indexDataOffset = input.readVLong(); + indexDataLength = input.readVLong(); + dimension = input.readInt(); + int size = input.readInt(); + ordToDoc = new int[size]; + for (int i = 0; i < size; i++) { + int doc = input.readVInt(); + ordToDoc[i] = doc; + } } int size() { return ordToDoc.length; } } + private static class HnswGraphFieldEntry extends FieldEntry { + + final long[] ordOffsets; + + HnswGraphFieldEntry(DataInput input, VectorValues.SearchStrategy searchStrategy) throws IOException { + super(input, searchStrategy); + ordOffsets = new long[size()]; + long offset = 0; + for (int i = 0; i < ordOffsets.length; i++) { + offset += input.readVLong(); + ordOffsets[i] = offset; + } + } + } + /** Read the vector values from the index input. This supports both iterated and random access. */ - private final static class OffHeapVectorValues extends VectorValues { + private final class OffHeapVectorValues extends VectorValues { final FieldEntry fieldEntry; final IndexInput dataIn; + final Random random = new Random(); Review comment: *PREDICTABLE_RANDOM:* This random generator (java.util.Random) is predictable [(details)](https://find-sec-bugs.github.io/bugs.htm#PREDICTABLE_RANDOM) ########## File path: lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphBuilder.java ########## @@ -0,0 +1,186 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.util.hnsw; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.Random; + +import org.apache.lucene.index.KnnGraphValues; +import org.apache.lucene.index.VectorValues; +import org.apache.lucene.util.BytesRef; + +/** + * Builder for HNSW graph. See {@link HnswGraph} for a gloss on the algorithm and the meaning of the hyperparameters. + */ +public final class HnswGraphBuilder { + + // default random seed for level generation + private static final long DEFAULT_RAND_SEED = System.currentTimeMillis(); + + // expose for testing. + public static long randSeed = DEFAULT_RAND_SEED; + + // These "default" hyperparameter settings are exposed (and nonfinal) to enable performance testing + // since the indexing API doesn't provide any control over them. + + // default max connections per node + static int DEFAULT_MAX_CONN = 16; + + // default candidate list size + static int DEFAULT_BEAM_WIDTH = 16; + + private final int maxConn; + private final int beamWidth; + private final BoundedVectorValues boundedVectors; + private final VectorValues.SearchStrategy searchStrategy; + private final HnswGraph hnsw; + private final Random random; + + /** + * Reads all the vectors from a VectorValues, builds a graph connecting them by their dense ordinals, using default + * hyperparameter settings, and returns the resulting graph. + * @param vectorValues the vectors whose relations are represented by the graph + */ + public static HnswGraph build(VectorValues vectorValues) throws IOException { + HnswGraphBuilder builder = new HnswGraphBuilder(vectorValues.randomAccess()); + return builder.build(vectorValues.randomAccess()); + } + + /** + * Reads all the vectors from a VectorValues, builds a graph connecting them by their dense ordinals, using the given + * hyperparameter settings, and returns the resulting graph. + * @param vectorValues the vectors whose relations are represented by the graph + * @param maxConn the number of connections to make when adding a new graph node; roughly speaking the graph fanout. + * @param beamWidth the size of the beam search to use when finding nearest neighbors. + * @param seed the seed for a random number generator used during graph construction. Provide this to ensure repeatable construction. + */ + public static HnswGraph build(VectorValues vectorValues, int maxConn, int beamWidth, long seed) throws IOException { + HnswGraphBuilder builder = new HnswGraphBuilder(vectorValues.randomAccess(), maxConn, beamWidth, seed); + return builder.build(vectorValues.randomAccess()); + } + + /** + * Reads all the vectors from two copies of a random access VectorValues. Providing two copies enables efficient retrieval + * without extra data copying, while avoiding collision of the returned values. + * @param vectors the vectors for which to build a nearest neighbors graph. Must be an independet accessor for the vectors + */ + private HnswGraph build(VectorValues.RandomAccess vectors) throws IOException { + for (int node = 1; node < vectors.size(); node++) { + insert(vectors.vectorValue(node)); + } + return hnsw; + } + + /** Construct the builder with default configurations */ + private HnswGraphBuilder(VectorValues.RandomAccess vectors) { + this(vectors, DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH, randSeed); + } + + /** Full constructor */ + private HnswGraphBuilder(VectorValues.RandomAccess vectors, int maxConn, int beamWidth, long seed) { + searchStrategy = vectors.searchStrategy(); + if (searchStrategy == VectorValues.SearchStrategy.NONE) { + throw new IllegalStateException("No distance function"); + } + this.maxConn = maxConn; + this.beamWidth = beamWidth; + boundedVectors = new BoundedVectorValues(vectors); + this.hnsw = new HnswGraph(); + random = new Random(seed); Review comment: *PREDICTABLE_RANDOM:* This random generator (java.util.Random) is predictable [(details)](https://find-sec-bugs.github.io/bugs.htm#PREDICTABLE_RANDOM) ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org