jtibshirani commented on code in PR #792: URL: https://github.com/apache/lucene/pull/792#discussion_r865291279
########## lucene/core/src/java/org/apache/lucene/codecs/lucene91/Lucene91HnswVectorsReader.java: ########## @@ -388,115 +400,239 @@ private static class FieldEntry { int size() { return size; } - - int ordToDoc(int ord) { - return ordToDocOperator.applyAsInt(ord); - } } - /** Read the vector values from the index input. This supports both iterated and random access. */ - static class OffHeapVectorValues extends VectorValues - implements RandomAccessVectorValues, RandomAccessVectorValuesProducer { - - private final int dimension; - private final int size; - private final int[] ordToDoc; - private final IntUnaryOperator ordToDocOperator; - private final IndexInput dataIn; - private final BytesRef binaryValue; - private final ByteBuffer byteBuffer; - private final int byteSize; - private final float[] value; + static class DenseOffHeapVectorValues extends OffHeapVectorValues { private int ord = -1; private int doc = -1; - OffHeapVectorValues(int dimension, int size, int[] ordToDoc, IndexInput dataIn) { - this.dimension = dimension; - this.size = size; - this.ordToDoc = ordToDoc; - ordToDocOperator = ordToDoc == null ? 
IntUnaryOperator.identity() : (ord) -> ordToDoc[ord]; - this.dataIn = dataIn; - byteSize = Float.BYTES * dimension; - byteBuffer = ByteBuffer.allocate(byteSize); - value = new float[dimension]; - binaryValue = new BytesRef(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize); + public DenseOffHeapVectorValues(int dimension, int size, IndexInput slice) { + super(dimension, size, slice); } @Override - public int dimension() { - return dimension; + public float[] vectorValue() throws IOException { + slice.seek((long) ord * byteSize); + slice.readFloats(value, 0, value.length); + return value; } @Override - public int size() { - return size; + public BytesRef binaryValue() throws IOException { + slice.seek((long) ord * byteSize); + slice.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize, false); + return binaryValue; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int nextDoc() throws IOException { + return advance(doc + 1); + } + + @Override + public int advance(int target) throws IOException { + assert docID() < target; + ord = target; + if (target >= size) { + return doc = NO_MORE_DOCS; + } + return doc = target; + } + + @Override + public RandomAccessVectorValues randomAccess() throws IOException { + return new DenseOffHeapVectorValues(dimension, size, slice.clone()); + } + + @Override + public int ordToDoc(int ord) { + return ord; + } + } + + static class SparseOffHeapVectorValues extends OffHeapVectorValues { Review Comment: It'd be good to make all these subclasses private. ########## lucene/core/src/java/org/apache/lucene/codecs/lucene91/Lucene91HnswVectorsReader.java: ########## @@ -388,115 +400,239 @@ private static class FieldEntry { int size() { return size; } - - int ordToDoc(int ord) { - return ordToDocOperator.applyAsInt(ord); - } } - /** Read the vector values from the index input. This supports both iterated and random access. 
*/ - static class OffHeapVectorValues extends VectorValues - implements RandomAccessVectorValues, RandomAccessVectorValuesProducer { - - private final int dimension; - private final int size; - private final int[] ordToDoc; - private final IntUnaryOperator ordToDocOperator; - private final IndexInput dataIn; - private final BytesRef binaryValue; - private final ByteBuffer byteBuffer; - private final int byteSize; - private final float[] value; + static class DenseOffHeapVectorValues extends OffHeapVectorValues { private int ord = -1; private int doc = -1; - OffHeapVectorValues(int dimension, int size, int[] ordToDoc, IndexInput dataIn) { - this.dimension = dimension; - this.size = size; - this.ordToDoc = ordToDoc; - ordToDocOperator = ordToDoc == null ? IntUnaryOperator.identity() : (ord) -> ordToDoc[ord]; - this.dataIn = dataIn; - byteSize = Float.BYTES * dimension; - byteBuffer = ByteBuffer.allocate(byteSize); - value = new float[dimension]; - binaryValue = new BytesRef(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize); + public DenseOffHeapVectorValues(int dimension, int size, IndexInput slice) { Review Comment: I think we can remove the `ord` variable now, since in the dense case `doc` and `ord` are always the same. ########## lucene/core/src/java/org/apache/lucene/codecs/lucene91/Lucene91HnswVectorsReader.java: ########## @@ -258,14 +257,20 @@ public TopDocs search(String field, float[] target, int k, Bits acceptDocs, int } private OffHeapVectorValues getOffHeapVectorValues(FieldEntry fieldEntry) throws IOException { + if (fieldEntry.docsWithFieldOffset == -2) { Review Comment: Some more suggestions to make the code cleaner: * We could move this method to the `OffHeapVectorValues` class as a static constructor. It would be like `static OffHeapVectorValues load(FieldEntry fieldEntry, IndexInput vectorData) { ... }`. * We can move `OffHeapVectorValues` to its own class now that it has grown. It would still be package-private. 
* We could move the `getAcceptOrds` method to `OffHeapVectorValues`. This lets us remove the check `if (vectorValues instanceof DenseOffHeapVectorValues) { .. }`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org