jtibshirani commented on a change in pull request #649: URL: https://github.com/apache/lucene/pull/649#discussion_r806160008
########## File path: lucene/core/src/java/org/apache/lucene/codecs/lucene91/Lucene91HnswVectorsWriter.java ########## @@ -206,14 +203,22 @@ private void writeMeta( meta.writeVLong(vectorIndexOffset); meta.writeVLong(vectorIndexLength); meta.writeInt(field.getVectorDimension()); - meta.writeInt(docIds.length); - for (int docId : docIds) { - // TODO: delta-encode, or write as bitset - meta.writeVInt(docId); + + // write docIDs + int count = docsWithField.cardinality(); + meta.writeInt(count); + if (count == maxDoc) { + meta.writeByte((byte) -1); + ; // dense marker, each document has a vector value Review comment: I think you accidentally added a newline before the comment. ########## File path: lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java ########## @@ -1018,4 +1020,57 @@ public void testAdvance() throws Exception { } } } + + public void testVectorValuesReportCorrectDocs() throws Exception { + final int numDocs = atLeast(1000); + final int dim = random().nextInt(20) + 1; + final VectorSimilarityFunction similarityFunction = + VectorSimilarityFunction.values()[ + random().nextInt(VectorSimilarityFunction.values().length)]; + + float fieldValuesCheckSum = 0f; + int fieldDocCount = 0; + long fieldSumDocIDs = 0; + + try (Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig())) { + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + int docID = random().nextInt(numDocs); + doc.add(new StoredField("id", docID)); + if (random().nextInt(4) == 3) { + float[] vector = randomVector(dim); + doc.add(new KnnVectorField("knn_vector", vector, similarityFunction)); + fieldValuesCheckSum += vector[0]; + fieldDocCount++; + fieldSumDocIDs += docID; + } + w.addDocument(doc); + } + + if (random().nextBoolean()) { + w.forceMerge(1); + } + + try (IndexReader r = w.getReader()) { + float checksum = 0; Review comment: Would the checksum and doc IDs still match if the vectors were out of order? Maybe worth strengthening this check. ########## File path: lucene/core/src/java/org/apache/lucene/codecs/lucene91/Lucene91HnswVectorsReader.java ########## @@ -424,38 +448,45 @@ public int docID() { @Override public int nextDoc() { - if (++ord >= size()) { + if (++ord >= size) { doc = NO_MORE_DOCS; } else { - doc = ordToDoc[ord]; + doc = ordToDocOperator.applyAsInt(ord); } return doc; } @Override public int advance(int target) { assert docID() < target; - ord = Arrays.binarySearch(ordToDoc, ord + 1, ordToDoc.length, target); - if (ord < 0) { - ord = -(ord + 1); + + if (ordToDoc == null) { + ord = target; + } else { + ord = Arrays.binarySearch(ordToDoc, ord + 1, ordToDoc.length, target); + if (ord < 0) { + ord = -(ord + 1); + } } - assert ord <= ordToDoc.length; - if (ord == ordToDoc.length) { + + assert ord <= size; + if (ord == size) { doc = NO_MORE_DOCS; } else { - doc = ordToDoc[ord]; + doc = ordToDocOperator.applyAsInt(ord); + ; Review comment: stray semicolon ########## File path: lucene/core/src/java/org/apache/lucene/codecs/lucene91/Lucene91HnswVectorsReader.java ########## @@ -266,12 +268,12 @@ private Bits getAcceptOrds(Bits acceptDocs, FieldEntry fieldEntry) { return new Bits() { @Override public boolean get(int index) { - return acceptDocs.get(fieldEntry.ordToDoc[index]); + return acceptDocs.get(fieldEntry.ordToDoc(index)); Review comment: We could add a small optimization here where we check `fieldEntry.ordToDocs == null`, and if so just return `acceptDocs`? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org