uschindler commented on code in PR #13401: URL: https://github.com/apache/lucene/pull/13401#discussion_r1611226882
########## lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90FieldInfosFormat.java: ########## @@ -103,16 +104,27 @@ * <li>VectorSimilarityFunction: a byte containing distance function used for similarity * calculation. * <ul> - * <li>0: EUCLIDEAN distance. ({@link VectorSimilarityFunction#EUCLIDEAN}) - * <li>1: DOT_PRODUCT similarity. ({@link VectorSimilarityFunction#DOT_PRODUCT}) - * <li>2: COSINE similarity. ({@link VectorSimilarityFunction#COSINE}) + * <li>0: EUCLIDEAN distance. ({@link + * org.apache.lucene.index.EuclideanVectorSimilarityFunction}) + * <li>1: DOT_PRODUCT similarity. ({@link + * org.apache.lucene.index.DotProductVectorSimilarityFunction}) + * <li>2: COSINE similarity. ({@link + * org.apache.lucene.index.CosineVectorSimilarityFunction}) * </ul> * </ul> * * @lucene.experimental */ public final class Lucene90FieldInfosFormat extends FieldInfosFormat { + private static final Map<Integer, String> SIMILARITY_FUNCTIONS_MAP = new HashMap<>(); Review Comment: Use Java 9+ `Map.of()` here ########## lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseFieldInfoFormatTestCase.java: ########## @@ -328,6 +332,17 @@ private int getVectorsMaxDimensions(String fieldName) { return Codec.getDefault().knnVectorsFormat().getMaxDimensions(fieldName); } + private VectorSimilarityFunction randomSimilarity() { Review Comment: this should be rewritten to not use ServiceLoader directly and instead use the NamedSPILoader in the provider. If it is needed for tests, you can add a function to retrieve all registered similarities in VectorSimilarityFunction class. We have something similar for analyzers. ########## lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90FieldInfosFormat.java: ########## @@ -257,11 +269,12 @@ private static DocValuesType getDocValuesType(IndexInput input, byte b) throws I } } - private static VectorSimilarityFunction getDistFunc(IndexInput input, byte b) throws IOException { - if (b < 0 || b >= VectorSimilarityFunction.values().length) { - throw new CorruptIndexException("invalid distance function: " + b, input); + /** Returns VectorSimilarityFunction from index input and ordinal value */ + public static VectorSimilarityFunction getDistFunc(IndexInput input, byte b) throws IOException { + if ((int) b < 0 || (int) b >= SIMILARITY_FUNCTIONS_MAP.size()) { Review Comment: this check is not correct if we have a sparse set of IDs. It is better to just use `SIMILARITY_FUNCTIONS_MAP.contains(Integer.valueOf(b))` ########## lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java: ########## @@ -16,104 +16,73 @@ */ package org.apache.lucene.index; -import static org.apache.lucene.util.VectorUtil.cosine; -import static org.apache.lucene.util.VectorUtil.dotProduct; -import static org.apache.lucene.util.VectorUtil.dotProductScore; -import static org.apache.lucene.util.VectorUtil.scaleMaxInnerProductScore; -import static org.apache.lucene.util.VectorUtil.squareDistance; +import org.apache.lucene.util.NamedSPILoader; /** * Vector similarity function; used in search to return top K most similar vectors to a target - * vector. This is a label describing the method used during indexing and searching of the vectors - * in order to determine the nearest neighbors. + * vector. */ -public enum VectorSimilarityFunction { +public abstract class VectorSimilarityFunction implements NamedSPILoader.NamedSPI { - /** Euclidean distance */ - EUCLIDEAN { - @Override - public float compare(float[] v1, float[] v2) { - return 1 / (1 + squareDistance(v1, v2)); - } - - @Override - public float compare(byte[] v1, byte[] v2) { - return 1 / (1f + squareDistance(v1, v2)); - } - }, + private static class Holder { + private static final NamedSPILoader<VectorSimilarityFunction> LOADER = + new NamedSPILoader<>(VectorSimilarityFunction.class); - /** - * Dot product. NOTE: this similarity is intended as an optimized way to perform cosine - * similarity. In order to use it, all vectors must be normalized, including both document and - * query vectors. Using dot product with vectors that are not normalized can result in errors or - * poor search results. Floating point vectors must be normalized to be of unit length, while byte - * vectors should simply all have the same norm. - */ - DOT_PRODUCT { - @Override - public float compare(float[] v1, float[] v2) { - return Math.max((1 + dotProduct(v1, v2)) / 2, 0); + static NamedSPILoader<VectorSimilarityFunction> getLoader() { + if (LOADER == null) { + throw new IllegalStateException(); Review Comment: add useful message here like in the other SPI holder classes (Postingsformats, codec, docvalues) ########## lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90FieldInfosFormat.java: ########## @@ -257,11 +269,12 @@ private static DocValuesType getDocValuesType(IndexInput input, byte b) throws I } } - private static VectorSimilarityFunction getDistFunc(IndexInput input, byte b) throws IOException { - if (b < 0 || b >= VectorSimilarityFunction.values().length) { - throw new CorruptIndexException("invalid distance function: " + b, input); + /** Returns VectorSimilarityFunction from index input and ordinal value */ + public static VectorSimilarityFunction getDistFunc(IndexInput input, byte b) throws IOException { + if ((int) b < 0 || (int) b >= SIMILARITY_FUNCTIONS_MAP.size()) { + throw new CorruptIndexException("invalid distance function: " + (int) b, input); } - return VectorSimilarityFunction.values()[b]; + return VectorSimilarityFunction.forName(SIMILARITY_FUNCTIONS_MAP.get((int) b)); Review Comment: use 'Integer.valueOf(b)' so we have type safety, as we should not accidentally use wrong type when looking up in map. ########## lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsReader.java: ########## @@ -19,18 +19,35 @@ import java.io.Closeable; import java.io.IOException; +import java.util.HashMap; +import java.util.Map; import org.apache.lucene.index.ByteVectorValues; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FloatVectorValues; +import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.KnnCollector; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TotalHits; +import org.apache.lucene.store.DataInput; import org.apache.lucene.util.Bits; /** Reads vectors from an index. */ public abstract class KnnVectorsReader implements Closeable { + /** + * SIMILAIRTY_FUNCTION_MAP containing hardcoded mapping for ordinal to vectorSimilarityFunction + * name + */ + public static final Map<Integer, String> SIMILARITY_FUNCTIONS_MAP = new HashMap<>(); Review Comment: same here, use `Map.of()` ########## lucene/luke/src/java/org/apache/lucene/luke/app/desktop/components/DocumentsPanelProvider.java: ########## @@ -1235,17 +1235,17 @@ private static String flags(org.apache.lucene.luke.models.documents.DocumentFiel sb.append("K"); sb.append(String.format(Locale.ENGLISH, "%04d", f.getVectorDimension())); sb.append("/"); - switch (f.getVectorSimilarity()) { - case COSINE: + switch (f.getVectorSimilarity().getName()) { Review Comment: i would change this to just return the name and remove switch completely. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org