benwtrent commented on code in PR #13339: URL: https://github.com/apache/lucene/pull/13339#discussion_r1594222886
########## lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99HnswVectorsFormat.java: ########## @@ -139,7 +139,7 @@ public final class Lucene99HnswVectorsFormat extends KnnVectorsFormat { /** The format for storing, reading, merging vectors on disk */ private static final FlatVectorsFormat flatVectorsFormat = - new Lucene99FlatVectorsFormat(new DefaultFlatVectorScorer()); + new Lucene99FlatVectorsFormat(FlatVectorScorerUtil.newFlatVectorScorer()); Review Comment: I would say we also update the `Lucene99ScalarQuantizedVectorsFormat`, we call ``` this.flatVectorScorer = new Lucene99ScalarQuantizedVectorScorer(new DefaultFlatVectorScorer()); ``` This way folks that send `byte[]` values to that scorer can still get the nice benefits here. ########## lucene/core/src/java/org/apache/lucene/store/FilterIndexInput.java: ########## @@ -40,6 +48,19 @@ public static IndexInput unwrap(IndexInput in) { return in; } + /** + * Unwraps all test FilterIndexInputs until the first non-test FilterIndexInput IndexInput + * instance and returns it + */ + public static IndexInput unwrapOnlyTest(IndexInput in) { + while (in instanceof FilterIndexInput && TEST_FILTER_INPUTS.contains(in.getClass())) { + in = ((FilterIndexInput) in).in; + } + return in; + } + + // org.apache.lucene.tests.store.MockIndexInputWrapper" + Review Comment: ```suggestion ``` Unnecessary? ########## lucene/core/src/test/org/apache/lucene/search/TestKnnByteVectorQueryMMap.java: ########## @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import java.io.IOException; +import java.io.UncheckedIOException; +import org.apache.lucene.store.MMapDirectory; +import org.apache.lucene.tests.store.BaseDirectoryWrapper; +import org.apache.lucene.tests.store.MockDirectoryWrapper; + +public class TestKnnByteVectorQueryMMap extends TestKnnByteVectorQuery { Review Comment: This is good instead of just having a sysprop or randomness. ########## lucene/core/src/java21/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java: ########## @@ -307,39 +310,44 @@ private float squareDistanceBody(float[] a, float[] b, int limit) { @Override public int dotProduct(byte[] a, byte[] b) { + return dotProduct(MemorySegment.ofArray(a), MemorySegment.ofArray(b)); + } + + public static int dotProduct(MemorySegment a, MemorySegment b) { + assert a.byteSize() == b.byteSize(); Review Comment: 😍 ########## lucene/core/src/java21/org/apache/lucene/internal/vectorization/MemorySegmentByteVectorScorerSupplier.java: ########## @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.internal.vectorization; + +import java.io.IOException; +import java.lang.foreign.MemorySegment; +import java.util.Optional; +import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.store.FilterIndexInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.hnsw.RandomAccessVectorValues; +import org.apache.lucene.util.hnsw.RandomVectorScorer; +import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier; + +/** A scorer of vectors whose element size is byte. */ +public abstract sealed class MemorySegmentByteVectorScorerSupplier + implements RandomVectorScorerSupplier, RandomVectorScorer permits DotProduct, Euclidean { + final int vectorByteSize; + final int dims; + final int maxOrd; + final IndexInput input; + final MemorySegmentAccess memorySegmentAccess; + + final RandomAccessVectorValues values; // to support ordToDoc/getAcceptOrds + final byte[] scratch1, scratch2; + + MemorySegment first; + + /** + * Return an optional whose value, if present, is the scorer. Otherwise, an empty optional is + * returned. + */ + public static Optional<MemorySegmentByteVectorScorerSupplier> create( + int dims, + int maxOrd, + int vectorByteSize, + VectorSimilarityFunction type, + IndexInput input, + RandomAccessVectorValues values) { + input = FilterIndexInput.unwrap(input); + if (!(input instanceof MemorySegmentAccess)) { + return Optional.empty(); + } + checkInvariants(maxOrd, vectorByteSize, input); + return switch (type) { + case DOT_PRODUCT -> Optional.of(new DotProduct(dims, maxOrd, vectorByteSize, input, values)); + case EUCLIDEAN -> Optional.of(new Euclidean(dims, maxOrd, vectorByteSize, input, values)); + case MAXIMUM_INNER_PRODUCT -> Optional.empty(); // TODO: implement MAXIMUM_INNER_PRODUCT + case COSINE -> Optional.empty(); // TODO: implement Cosine + }; + } + + MemorySegmentByteVectorScorerSupplier( + int dims, int maxOrd, int vectorByteSize, IndexInput input, RandomAccessVectorValues values) { + this.vectorByteSize = vectorByteSize; + this.dims = dims; + this.maxOrd = maxOrd; + this.input = input; + this.memorySegmentAccess = (MemorySegmentAccess) input; + this.values = values; + scratch1 = new byte[vectorByteSize]; + scratch2 = new byte[vectorByteSize]; + } + + static void checkInvariants(int maxOrd, int vectorByteLength, IndexInput input) { + if (input.length() < (long) vectorByteLength * maxOrd) { + throw new IllegalArgumentException("input length not equal to expected vector data"); + } + } + + final void checkOrdinal(int ord, int maxOrd) { + if (ord < 0 || ord >= maxOrd) { + throw new IllegalArgumentException("illegal ordinal: " + ord); + } + } + + protected final MemorySegment getSegment(int ord, byte[] scratch) throws IOException { + checkOrdinal(ord, maxOrd); + int byteOffset = ord * vectorByteSize; // TODO: random + meta size Review Comment: While I don't foresee us doing this, but it is conceivable that the flat vector storage will change. I think we have adequate test coverage to catch such a weird behavior, but maybe we need to change the name or add a comment reflecting that it relies on how Lucene95 stored the flat vectors? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org