jtibshirani commented on a change in pull request #649:
URL: https://github.com/apache/lucene/pull/649#discussion_r806160008



##########
File path: 
lucene/core/src/java/org/apache/lucene/codecs/lucene91/Lucene91HnswVectorsWriter.java
##########
@@ -206,14 +203,22 @@ private void writeMeta(
     meta.writeVLong(vectorIndexOffset);
     meta.writeVLong(vectorIndexLength);
     meta.writeInt(field.getVectorDimension());
-    meta.writeInt(docIds.length);
-    for (int docId : docIds) {
-      // TODO: delta-encode, or write as bitset
-      meta.writeVInt(docId);
+
+    // write docIDs
+    int count = docsWithField.cardinality();
+    meta.writeInt(count);
+    if (count == maxDoc) {
+      meta.writeByte((byte) -1);
+      ; // dense marker, each document has a vector value

Review comment:
       I think you accidentally added a newline before the comment.

##########
File path: 
lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java
##########
@@ -1018,4 +1020,57 @@ public void testAdvance() throws Exception {
       }
     }
   }
+
+  public void testVectorValuesReportCorrectDocs() throws Exception {
+    final int numDocs = atLeast(1000);
+    final int dim = random().nextInt(20) + 1;
+    final VectorSimilarityFunction similarityFunction =
+        VectorSimilarityFunction.values()[
+            random().nextInt(VectorSimilarityFunction.values().length)];
+
+    float fieldValuesCheckSum = 0f;
+    int fieldDocCount = 0;
+    long fieldSumDocIDs = 0;
+
+    try (Directory dir = newDirectory();
+        RandomIndexWriter w = new RandomIndexWriter(random(), dir, 
newIndexWriterConfig())) {
+      for (int i = 0; i < numDocs; i++) {
+        Document doc = new Document();
+        int docID = random().nextInt(numDocs);
+        doc.add(new StoredField("id", docID));
+        if (random().nextInt(4) == 3) {
+          float[] vector = randomVector(dim);
+          doc.add(new KnnVectorField("knn_vector", vector, 
similarityFunction));
+          fieldValuesCheckSum += vector[0];
+          fieldDocCount++;
+          fieldSumDocIDs += docID;
+        }
+        w.addDocument(doc);
+      }
+
+      if (random().nextBoolean()) {
+        w.forceMerge(1);
+      }
+
+      try (IndexReader r = w.getReader()) {
+        float checksum = 0;

Review comment:
       Would the checksum and doc IDs still match if the vectors were out of 
order? Maybe worth strengthening this check. 

##########
File path: 
lucene/core/src/java/org/apache/lucene/codecs/lucene91/Lucene91HnswVectorsReader.java
##########
@@ -424,38 +448,45 @@ public int docID() {
 
     @Override
     public int nextDoc() {
-      if (++ord >= size()) {
+      if (++ord >= size) {
         doc = NO_MORE_DOCS;
       } else {
-        doc = ordToDoc[ord];
+        doc = ordToDocOperator.applyAsInt(ord);
       }
       return doc;
     }
 
     @Override
     public int advance(int target) {
       assert docID() < target;
-      ord = Arrays.binarySearch(ordToDoc, ord + 1, ordToDoc.length, target);
-      if (ord < 0) {
-        ord = -(ord + 1);
+
+      if (ordToDoc == null) {
+        ord = target;
+      } else {
+        ord = Arrays.binarySearch(ordToDoc, ord + 1, ordToDoc.length, target);
+        if (ord < 0) {
+          ord = -(ord + 1);
+        }
       }
-      assert ord <= ordToDoc.length;
-      if (ord == ordToDoc.length) {
+
+      assert ord <= size;
+      if (ord == size) {
         doc = NO_MORE_DOCS;
       } else {
-        doc = ordToDoc[ord];
+        doc = ordToDocOperator.applyAsInt(ord);
+        ;

Review comment:
       stray semicolon

##########
File path: 
lucene/core/src/java/org/apache/lucene/codecs/lucene91/Lucene91HnswVectorsReader.java
##########
@@ -266,12 +268,12 @@ private Bits getAcceptOrds(Bits acceptDocs, FieldEntry 
fieldEntry) {
     return new Bits() {
       @Override
       public boolean get(int index) {
-        return acceptDocs.get(fieldEntry.ordToDoc[index]);
+        return acceptDocs.get(fieldEntry.ordToDoc(index));

Review comment:
       We could add a small optimization here where we check 
`fieldEntry.ordToDocs == null`, and if so just return `acceptDocs`?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Reply via email to