jpountz commented on code in PR #12529: URL: https://github.com/apache/lucene/pull/12529#discussion_r1319777844
########## lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java: ########## @@ -423,8 +422,12 @@ public RandomAccessVectorValues<float[]> copy() { @Override public float[] vectorValue(int targetOrd) throws IOException { + if (lastOrd == targetOrd) { + return value; + } Review Comment: I'm curious, is it common to read the same ord multiple times in a row? ########## lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecSynonymProvider.java: ########## @@ -23,14 +23,11 @@ import java.io.IOException; import java.util.LinkedList; import java.util.List; -import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.TopDocs; import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.hnsw.HnswGraphBuilder; -import org.apache.lucene.util.hnsw.HnswGraphSearcher; -import org.apache.lucene.util.hnsw.OnHeapHnswGraph; +import org.apache.lucene.util.hnsw.*; Review Comment: Nit: We prefer to avoid star imports. ########## lucene/core/src/java/org/apache/lucene/util/hnsw/RandomVectorScorerProvider.java: ########## @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.util.hnsw; + +import java.io.IOException; +import org.apache.lucene.index.VectorSimilarityFunction; + +/** A provider that creates {@link RandomVectorScorer} from an ordinal. */ +public interface RandomVectorScorerProvider { Review Comment: Naming nit: the naming convention seems to be to call things "Supplier"s rather than "Provider"s. ########## lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapByteVectorValues.java: ########## @@ -60,13 +61,17 @@ public int size() { @Override public byte[] vectorValue(int targetOrd) throws IOException { - readValue(targetOrd); + if (lastOrd != targetOrd) { + readValue(targetOrd); + lastOrd = targetOrd; + } return binaryValue; } private void readValue(int targetOrd) throws IOException { slice.seek((long) targetOrd * byteSize); slice.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize); + lastOrd = targetOrd; Review Comment: we already set `lastOrd` in `vectorValue()`, so it's not needed here? ########## lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene94/Lucene94HnswVectorsWriter.java: ########## @@ -47,12 +47,8 @@ import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.InfoStream; import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.hnsw.HnswGraph; +import org.apache.lucene.util.hnsw.*; Review Comment: nit: star import ########## lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene92/Lucene92HnswVectorsWriter.java: ########## @@ -34,16 +34,12 @@ import org.apache.lucene.index.FloatVectorValues; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentWriteState; -import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.hnsw.HnswGraphBuilder; -import org.apache.lucene.util.hnsw.NeighborArray; -import org.apache.lucene.util.hnsw.OnHeapHnswGraph; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; +import org.apache.lucene.util.hnsw.*; Review Comment: nit: star import ########## lucene/core/src/java/org/apache/lucene/util/hnsw/RandomVectorScorerProvider.java: ########## @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.util.hnsw; + +import java.io.IOException; +import org.apache.lucene.index.VectorSimilarityFunction; + +/** A provider that creates {@link RandomVectorScorer} from an ordinal. */ +public interface RandomVectorScorerProvider { + /** + * This creates a {@link RandomVectorScorer} for scoring random nodes in batches against the given + * ordinal. + * + * @param ord the ordinal of the node to compare + * @return a new {@link RandomVectorScorer} + */ + RandomVectorScorer scorer(int ord) throws IOException; + + /** + * Creates a {@link RandomVectorScorerProvider} to compare float vectors. + * + * <p>WARNING: The {@link RandomAccessVectorValues} given can contain stateful buffers. Avoid + * using it after calling this function. If you plan to use it again outside the returned {@link + * RandomVectorScorer}, think about passing a copied version ({@link + * RandomAccessVectorValues#copy}). + * + * @param vectors the underlying storage for vectors + * @param similarityFunction the similarity function to score vectors + */ + static RandomVectorScorerProvider createFloats( + final RandomAccessVectorValues<float[]> vectors, + final VectorSimilarityFunction similarityFunction) + throws IOException { + final RandomAccessVectorValues<float[]> vectorsCopy = vectors.copy(); + return queryOrd -> + (RandomVectorScorer) + cand -> + similarityFunction.compare( + vectors.vectorValue(queryOrd), vectorsCopy.vectorValue(cand)); Review Comment: Should we retrieve the query vector only once, rather than on every call? Then we could skip making a copy of the `RandomAccessVectorValues` as well? Or would it break something? ########## lucene/core/src/java/org/apache/lucene/codecs/lucene95/Lucene95HnswVectorsWriter.java: ########## @@ -41,12 +41,8 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.*; -import org.apache.lucene.util.hnsw.HnswGraph; +import org.apache.lucene.util.hnsw.*; Review Comment: nit: star import ########## lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsReader.java: ########## @@ -42,9 +41,7 @@ import org.apache.lucene.util.Bits; import org.apache.lucene.util.IOUtils; import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.hnsw.HnswGraph; -import org.apache.lucene.util.hnsw.HnswGraphSearcher; -import org.apache.lucene.util.hnsw.RandomAccessVectorValues; +import org.apache.lucene.util.hnsw.*; Review Comment: nit: star import -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org