msokolov commented on a change in pull request #1930: URL: https://github.com/apache/lucene-solr/pull/1930#discussion_r501828351
########## File path: lucene/core/src/java/org/apache/lucene/index/VectorValues.java ########## @@ -0,0 +1,264 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.index; + +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.util.BytesRef; + +/** + * Access to per-document vector value. + */ +public abstract class VectorValues extends DocIdSetIterator { + + /** The maximum length of a vector */ + public static int MAX_DIMENSIONS = 1024; + + /** Sole constructor */ + protected VectorValues() {} + + /** + * Return the dimension of the vectors + */ + public abstract int dimension(); + + /** + * TODO: should we use cost() for this? We rely on its always being exactly the number + * of documents having a value for this field, which is not guaranteed by the cost() contract, + * but in all the implementations so far they are the same. 
+ * @return the number of vectors returned by this iterator + */ + public abstract int size(); + + /** + * Return the score function used to compare these vectors + */ + public abstract ScoreFunction scoreFunction(); + + /** + * Return the vector value for the current document ID. + * It is illegal to call this method after the iterator failed to advance. + * @return the vector value + */ + public abstract float[] vectorValue() throws IOException; + + /** + * Return the binary encoded vector value for the current document ID. + * It is illegal to call this method after the iterator failed to advance. + * @return the binary value + */ + public BytesRef binaryValue() throws IOException { + throw new UnsupportedOperationException(); + } + + /** + * Return a random access interface over this iterator's vectors. + */ + public abstract RandomAccess randomAccess(); + + /** + * Provides random access to vectors by dense ordinal + */ + public interface RandomAccess { + + /** + * Return the vector value as a floating point array. + * @param targetOrd a valid ordinal, ≥ 0 and < {@link #size()}. + */ + float[] vectorValue(int targetOrd) throws IOException; + + /** + * Return the vector value as a byte array; these are the bytes corresponding to the float array + * encoded using little-endian byte order. + * @param targetOrd a valid ordinal, ≥ 0 and < {@link #size()}. + */ + BytesRef binaryValue(int targetOrd) throws IOException; + + /** + * Return the k nearest neighbor documents as determined by comparison of their vector values + * for this field, to the given vector, by the field's score function. If the score function is + * reversed, lower values indicate nearer vectors, otherwise higher scores indicate nearer + * vectors. Unlike relevance scores, vector scores may be negative. 
+ * @param target the vector-valued query + * @param k the number of docs to return + * @param fanout control the accuracy/speed tradeoff - larger values give better recall at higher cost + * @return the k nearest neighbor documents, along with their (scoreFunction-specific) scores. + */ + TopDocs search(float[] target, int k, int fanout) throws IOException; + } + + /** + * Score function. This is used during indexing and searching of the vectors to determine the nearest neighbors. + * Score values may be negative. By default high scores indicate nearer documents, unless the function is reversed. + */ + public enum ScoreFunction { + /** No distance function is used. Note: {@link VectorValues.RandomAccess#search(float[], int, int)} Review comment: OK, I opened LUCENE-9573. I have to admit I don't fully understand the timing constraints/dependencies here. Maybe you could comment on that issue? Re: the ids, I opted to move to using the enum ordinal, as you suggested later. I can't see how that restricts us in any meaningful way. Perhaps we should add a back-compat test to verify that the enum ordinals don't change. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org