benwtrent commented on code in PR #13779: URL: https://github.com/apache/lucene/pull/13779#discussion_r1765048870
########## lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java: ########## @@ -303,29 +314,45 @@ private MergedFloat32VectorValues(List<FloatVectorValuesSub> subs, MergeState me } @Override - public int docID() { - return docId; - } + public DocIndexIterator iterator() { + return new DocIndexIterator() { + private int index = -1; - @Override - public int nextDoc() throws IOException { - current = docIdMerger.next(); - if (current == null) { - docId = NO_MORE_DOCS; - } else { - docId = current.mappedDocID; - } - return docId; - } + @Override + public int docID() { + return docId; + } - @Override - public float[] vectorValue() throws IOException { - return current.values.vectorValue(); + @Override + public int index() { + return index; + } + + @Override + public int nextDoc() throws IOException { + current = docIdMerger.next(); + if (current == null) { + docId = NO_MORE_DOCS; + index = NO_MORE_DOCS; + } else { + docId = current.mappedDocID; + ++index; + } + return docId; + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + } + }; } @Override - public int advance(int target) { - throw new UnsupportedOperationException(); + public float[] vectorValue(int ord) throws IOException { + // FIXME what can we assert here? + // assert ord == iterator.index(); + return current.values.vectorValue(current.index()); Review Comment: This is the biggest "gotcha" in this whole thing I think. Does `DocIdMerger` allow random access at all? It seems like `vectorValue(ord)` should also be able to jump between `current` sub iterators and move backwards and forwards. But, I don't think `DocIdMerger` allows backwards movement at all. 
########## lucene/core/src/java/org/apache/lucene/util/quantization/ScalarQuantizer.java: ########## @@ -269,11 +270,12 @@ static ScalarQuantizer fromVectors( if (totalVectorCount == 0) { return new ScalarQuantizer(0f, 0f, bits); } + KnnVectorValues.DocIndexIterator iterator = floatVectorValues.iterator(); Review Comment: I think this is ok for now, but this quantization code can be made much simpler if indeed we can randomly access even across various merged doc sub iterators. ########## lucene/core/src/java/org/apache/lucene/index/KnnVectorValues.java: ########## @@ -0,0 +1,281 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.index; + +import java.io.IOException; +import org.apache.lucene.codecs.lucene90.IndexedDISI; +import org.apache.lucene.document.KnnByteVectorField; +import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.Bits; + +/** + * This class abstracts addressing of document vector values indexed as {@link KnnFloatVectorField} + * or {@link KnnByteVectorField}. + * + * @lucene.experimental + */ +public abstract class KnnVectorValues { + + /** The iterator associated with these values. 
*/ + protected DocIndexIterator iterator; + + /** Return the dimension of the vectors */ + public abstract int dimension(); + + /** + * Return the number of vectors for this field. + * + * @return the number of vectors returned by this iterator + */ + public abstract int size(); + + /** + * Return the docid of the document indexed with the given vector ordinal. This default + * implementation returns the argument and is appropriate for dense values implementations where + * every doc has a single value. + */ + public int ordToDoc(int ord) { + return ord; + } + + /** + * Creates a new copy of this {@link KnnVectorValues}. This is helpful when you need to access + * different values at once, to avoid overwriting the underlying vector returned. + */ + public abstract KnnVectorValues copy() throws IOException; + + /** Returns the vector byte length, defaults to dimension multiplied by float byte size */ + public int getVectorByteLength() { + return dimension() * getEncoding().byteSize; + } + + /** The vector encoding of these values. */ + public abstract VectorEncoding getEncoding(); + + /** Returns a Bits accepting docs accepted by the argument and having a vector value */ + public Bits getAcceptOrds(Bits acceptDocs) { + // FIXME: change default to return acceptDocs and provide this impl + // somewhere more specialized (in every non-dense impl). + if (acceptDocs == null) { + return null; + } + return new Bits() { + @Override + public boolean get(int index) { + return acceptDocs.get(ordToDoc(index)); + } + + @Override + public int length() { + return size(); + } + }; + } + + /** + * Return the iterator for this instance. If you need multiple iterators, call <code> + * this.copy().iterator()</code>. + */ + public DocIndexIterator iterator() { + if (iterator == null) { + iterator = createIterator(); + } + return iterator; + } + + /** + * Create an iterator for this instance; typically called once by <code>iterator()</code>. 
Wrapper + * value classes delegate to their inner instance's iterator and shouldn't implement this. + */ + protected DocIndexIterator createIterator() { + throw new UnsupportedOperationException(); + } + + /** + * A DocIdSetIterator that also provides an index() method tracking a distinct ordinal for a + * vector associated with each doc. + */ + public abstract static class DocIndexIterator extends DocIdSetIterator { + + /** return the value index (aka "ordinal" or "ord") corresponding to the current doc */ + public abstract int index(); + + @Override + public int advance(int target) throws IOException { + return slowAdvance(target); + } + + @Override + public long cost() { + throw new UnsupportedOperationException(); Review Comment: I agree here. Either it should default to `size()` via some provided dependency or it shouldn't implement at all and force sub-classes. ########## lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java: ########## @@ -303,29 +314,45 @@ private MergedFloat32VectorValues(List<FloatVectorValuesSub> subs, MergeState me } @Override - public int docID() { - return docId; - } + public DocIndexIterator iterator() { + return new DocIndexIterator() { + private int index = -1; - @Override - public int nextDoc() throws IOException { - current = docIdMerger.next(); - if (current == null) { - docId = NO_MORE_DOCS; - } else { - docId = current.mappedDocID; - } - return docId; - } + @Override + public int docID() { + return docId; + } - @Override - public float[] vectorValue() throws IOException { - return current.values.vectorValue(); + @Override + public int index() { + return index; + } + + @Override + public int nextDoc() throws IOException { + current = docIdMerger.next(); + if (current == null) { + docId = NO_MORE_DOCS; + index = NO_MORE_DOCS; + } else { + docId = current.mappedDocID; + ++index; + } + return docId; + } + + @Override + public int advance(int target) throws IOException { + throw new UnsupportedOperationException(); + 
} + }; } @Override - public int advance(int target) { - throw new UnsupportedOperationException(); + public float[] vectorValue(int ord) throws IOException { + // FIXME what can we assert here? + // assert ord == iterator.index(); + return current.values.vectorValue(current.index()); Review Comment: I think unless we can fix `DocIdMerger`, we should throw an error here indicating that only forward iteration is allowed. ########## lucene/core/src/java/org/apache/lucene/index/KnnVectorValues.java: ########## @@ -0,0 +1,281 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.index; + +import java.io.IOException; +import org.apache.lucene.codecs.lucene90.IndexedDISI; +import org.apache.lucene.document.KnnByteVectorField; +import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.Bits; + +/** + * This class abstracts addressing of document vector values indexed as {@link KnnFloatVectorField} + * or {@link KnnByteVectorField}. + * + * @lucene.experimental + */ +public abstract class KnnVectorValues { + + /** The iterator associated with these values. 
*/ + protected DocIndexIterator iterator; + + /** Return the dimension of the vectors */ + public abstract int dimension(); + + /** + * Return the number of vectors for this field. + * + * @return the number of vectors returned by this iterator + */ + public abstract int size(); + + /** + * Return the docid of the document indexed with the given vector ordinal. This default + * implementation returns the argument and is appropriate for dense values implementations where + * every doc has a single value. + */ + public int ordToDoc(int ord) { + return ord; + } + + /** + * Creates a new copy of this {@link KnnVectorValues}. This is helpful when you need to access + * different values at once, to avoid overwriting the underlying vector returned. + */ + public abstract KnnVectorValues copy() throws IOException; + + /** Returns the vector byte length, defaults to dimension multiplied by float byte size */ + public int getVectorByteLength() { Review Comment: This cannot be final. It's dependent on parameters like `compressed`. ########## lucene/core/src/java/org/apache/lucene/util/quantization/QuantizedByteVectorValues.java: ########## @@ -17,41 +17,42 @@ package org.apache.lucene.util.quantization; import java.io.IOException; +import org.apache.lucene.codecs.lucene95.HasIndexSlice; import org.apache.lucene.index.ByteVectorValues; -import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.VectorScorer; +import org.apache.lucene.store.IndexInput; /** * A version of {@link ByteVectorValues}, but additionally retrieving score correction offset for * Scalar quantization scores. 
* * @lucene.experimental */ -public abstract class QuantizedByteVectorValues extends DocIdSetIterator { - public abstract float getScoreCorrectionConstant() throws IOException; +public abstract class QuantizedByteVectorValues extends ByteVectorValues implements HasIndexSlice { - public abstract byte[] vectorValue() throws IOException; + public ScalarQuantizer getScalarQuantizer() { + throw new UnsupportedOperationException(); + } - /** Return the dimension of the vectors */ - public abstract int dimension(); + public abstract float getScoreCorrectionConstant(int ord) throws IOException; /** - * Return the number of vectors for this field. + * Return a {@link VectorScorer} for the given query vector. * - * @return the number of vectors returned by this iterator + * @param query the query vector + * @return a {@link VectorScorer} instance or null */ - public abstract int size(); + public VectorScorer scorer(float[] query) throws IOException { + throw new UnsupportedOperationException(); + } @Override - public final long cost() { - return size(); + public QuantizedByteVectorValues copy() throws IOException { + return this; } Review Comment: defaulting copy to `this` is dangerous. I would recommend against it. ########## lucene/core/src/java/org/apache/lucene/index/KnnVectorValues.java: ########## @@ -0,0 +1,281 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.index; + +import java.io.IOException; +import org.apache.lucene.codecs.lucene90.IndexedDISI; +import org.apache.lucene.document.KnnByteVectorField; +import org.apache.lucene.document.KnnFloatVectorField; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.Bits; + +/** + * This class abstracts addressing of document vector values indexed as {@link KnnFloatVectorField} + * or {@link KnnByteVectorField}. + * + * @lucene.experimental + */ +public abstract class KnnVectorValues { + + /** The iterator associated with these values. */ + protected DocIndexIterator iterator; + + /** Return the dimension of the vectors */ + public abstract int dimension(); + + /** + * Return the number of vectors for this field. + * + * @return the number of vectors returned by this iterator + */ + public abstract int size(); + + /** + * Return the docid of the document indexed with the given vector ordinal. This default + * implementation returns the argument and is appropriate for dense values implementations where + * every doc has a single value. + */ + public int ordToDoc(int ord) { + return ord; + } + + /** + * Creates a new copy of this {@link KnnVectorValues}. This is helpful when you need to access + * different values at once, to avoid overwriting the underlying vector returned. 
+ */ + public abstract KnnVectorValues copy() throws IOException; + + /** Returns the vector byte length, defaults to dimension multiplied by float byte size */ + public int getVectorByteLength() { + return dimension() * getEncoding().byteSize; + } + + /** The vector encoding of these values. */ + public abstract VectorEncoding getEncoding(); + + /** Returns a Bits accepting docs accepted by the argument and having a vector value */ + public Bits getAcceptOrds(Bits acceptDocs) { + // FIXME: change default to return acceptDocs and provide this impl + // somewhere more specialized (in every non-dense impl). + if (acceptDocs == null) { + return null; + } + return new Bits() { + @Override + public boolean get(int index) { + return acceptDocs.get(ordToDoc(index)); + } + + @Override + public int length() { + return size(); + } + }; + } + + /** + * Return the iterator for this instance. If you need multiple iterators, call <code> + * this.copy().iterator()</code>. + */ + public DocIndexIterator iterator() { + if (iterator == null) { + iterator = createIterator(); + } + return iterator; + } + + /** + * Create an iterator for this instance; typically called once by <code>iterator()</code>. Wrapper + * value classes delegate to their inner instance's iterator and shouldn't implement this. + */ + protected DocIndexIterator createIterator() { Review Comment: This should be an abstract method and sub-classes purposefully indicate that this is unsupported. ########## lucene/core/src/java/org/apache/lucene/index/FloatVectorValues.java: ########## @@ -27,34 +27,23 @@ * * @lucene.experimental */ -public abstract class FloatVectorValues extends DocIdSetIterator { +public abstract class FloatVectorValues extends KnnVectorValues { /** Sole constructor */ protected FloatVectorValues() {} - /** Return the dimension of the vectors */ - public abstract int dimension(); - - /** - * Return the number of vectors for this field. 
- * - * @return the number of vectors returned by this iterator - */ - public abstract int size(); - @Override - public final long cost() { - return size(); + public FloatVectorValues copy() throws IOException { + return this; Review Comment: We shouldn't default to `this`; that is dangerous. If an off-heap implementation that assumes caching and doesn't allow multi-threaded access extends this class but forgets to override `copy()`, we are in a bad place. ########## lucene/core/src/java/org/apache/lucene/index/ByteVectorValues.java: ########## @@ -27,34 +27,23 @@ * * @lucene.experimental */ -public abstract class ByteVectorValues extends DocIdSetIterator { +public abstract class ByteVectorValues extends KnnVectorValues { /** Sole constructor */ protected ByteVectorValues() {} - /** Return the dimension of the vectors */ - public abstract int dimension(); - - /** - * Return the number of vectors for this field. - * - * @return the number of vectors returned by this iterator - */ - public abstract int size(); - @Override - public final long cost() { - return size(); + public ByteVectorValues copy() throws IOException { + return this; Review Comment: Again, this default is dangerous to me. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org