Pulkitg64 commented on code in PR #15476:
URL: https://github.com/apache/lucene/pull/15476#discussion_r2605998706
##########
lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java:
##########
@@ -1910,6 +1929,163 @@ public void testVectorValuesReportCorrectDocs() throws
Exception {
}
}
+ private List<float[]> getRandomFloatVector(int numVectors, int dim, boolean
normalize) {
+ List<float[]> vectors = new ArrayList<>(numVectors);
+ for (int i = 0; i < numVectors; i++) {
+ float[] vec = randomVector(dim);
+ if (normalize) {
+ float[] copy = new float[vec.length];
+ System.arraycopy(vec, 0, copy, 0, copy.length);
+ VectorUtil.l2normalize(copy);
+ vec = copy;
+ }
+ vectors.add(vec);
+ }
+ return vectors;
+ }
+
+ /**
+ * Tests reading quantized vectors when raw vector data is empty. Verifies
that scalar quantized
+ * formats can properly dequantize vectors and maintain accuracy within
expected error bounds even
+ * when the original raw vector file is empty or corrupted.
+ */
+ public void testReadQuantizedVectorWithEmptyRawVectors() throws Exception {
+ assumeTrue("Test only applies to scalar quantized formats",
supportsFloatVectorFallback());
+
+ String vectorFieldName = "vec1";
+ int numVectors = 1 + random().nextInt(50);
+ int dim = random().nextInt(64) + 1;
+ if (dim % 2 == 1) {
+ dim++;
+ }
+ float eps = (1f / (float) (1 << getQuantizationBits()));
+ VectorSimilarityFunction similarityFunction = randomSimilarity();
+ List<float[]> vectors =
+ getRandomFloatVector(
+ numVectors, dim, similarityFunction ==
VectorSimilarityFunction.COSINE);
+
+ try (BaseDirectoryWrapper dir = newDirectory();
+ IndexWriter w =
+ new IndexWriter(
+ dir,
+ new IndexWriterConfig()
+ .setMaxBufferedDocs(numVectors + 1)
+ .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
+ .setMergePolicy(NoMergePolicy.INSTANCE)
+ .setUseCompoundFile(false)
+ .setCodec(getCodecForQuantizedTest()))) {
+ dir.setCheckIndexOnClose(false);
+
+ for (int i = 0; i < numVectors; i++) {
+ Document doc = new Document();
+ doc.add(new KnnFloatVectorField(vectorFieldName, vectors.get(i),
similarityFunction));
+ w.addDocument(doc);
+ }
+ w.commit();
+
+ simulateEmptyRawVectors(dir);
+
+ try (IndexReader reader = DirectoryReader.open(w)) {
+ LeafReader r = getOnlyLeafReader(reader);
+ if (r instanceof CodecReader codecReader) {
+ KnnVectorsReader knnVectorsReader = codecReader.getVectorReader();
+ knnVectorsReader =
knnVectorsReader.unwrapReaderForField(vectorFieldName);
+ FloatVectorValues floatVectorValues =
+ knnVectorsReader.getFloatVectorValues(vectorFieldName);
+ if (floatVectorValues.size() > 0) {
+ KnnVectorValues.DocIndexIterator iter =
floatVectorValues.iterator();
+ for (int docId = iter.nextDoc(); docId != NO_MORE_DOCS; docId =
iter.nextDoc()) {
+ float[] dequantizedVector =
floatVectorValues.vectorValue(iter.index());
+ float mae = 0;
+ for (int i = 0; i < dim; i++) {
+ mae += Math.abs(dequantizedVector[i] - vectors.get(docId)[i]);
+ }
+ mae /= dim;
+ assertTrue(
+ "bits: " + getQuantizationBits() + " mae: " + mae + " > eps:
" + eps, mae <= eps);
+ }
+ } else {
+ fail("floatVectorValues size should be non zero");
+ }
+ } else {
+ fail("reader is not CodecReader");
+ }
+ }
+ }
+ }
+
+ /** Simulates empty raw vectors by modifying index files. */
+ protected void simulateEmptyRawVectors(Directory dir) throws Exception {
+ final String[] indexFiles = dir.listAll();
+ final String RAW_VECTOR_EXTENSION = "vec";
+ final String VECTOR_META_EXTENSION = "vemf";
+
+ for (String file : indexFiles) {
+ if (file.endsWith("." + RAW_VECTOR_EXTENSION)) {
+ replaceWithEmptyVectorFile(dir, file);
+ } else if (file.endsWith("." + VECTOR_META_EXTENSION)) {
+ updateVectorMetadataFile(dir, file);
+ }
+ }
+ }
+
+ /** Replaces a raw vector file with an empty one that has valid
header/footer. */
+ protected void replaceWithEmptyVectorFile(Directory dir, String fileName)
throws Exception {
+ byte[] indexHeader;
+ try (IndexInput in = dir.openInput(fileName, IOContext.DEFAULT)) {
+ indexHeader = CodecUtil.readIndexHeader(in);
+ }
+ dir.deleteFile(fileName);
+ try (IndexOutput out = dir.createOutput(fileName, IOContext.DEFAULT)) {
+ // Write header
+ out.writeBytes(indexHeader, 0, indexHeader.length);
+ // Write footer (no content in between)
+ CodecUtil.writeFooter(out);
+ }
+ }
+
+ /** Updates vector metadata file to indicate zero vector length. */
+ protected void updateVectorMetadataFile(Directory dir, String fileName)
throws Exception {
Review Comment:
Thank you for the suggestion, Mike! I want to make sure I understand the
approach correctly before proceeding. As I read it, the idea is to
move this function to the FlatVectorWriter class. I see a few potential
challenges with this approach:
1. We'd need to determine how to invoke this function from tests, which may
require exposing additional APIs.
2. Alternatively, we could make creating empty vector files part of the
codec itself, but this functionality isn't currently supported and might
be too invasive a change for this particular fix.
For now, would it make sense to instead focus on addressing #13158
(creating a read-only index)? (I think I have an idea that we could pursue to
solve that issue.)
Please let me know if you meant something else in your comment.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]