benwtrent commented on code in PR #15476:
URL: https://github.com/apache/lucene/pull/15476#discussion_r2602351829


##########
lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java:
##########
@@ -1910,6 +1929,163 @@ public void testVectorValuesReportCorrectDocs() throws 
Exception {
     }
   }
 
+  private List<float[]> getRandomFloatVector(int numVectors, int dim, boolean 
normalize) {
+    List<float[]> vectors = new ArrayList<>(numVectors);
+    for (int i = 0; i < numVectors; i++) {
+      float[] vec = randomVector(dim);
+      if (normalize) {
+        float[] copy = new float[vec.length];
+        System.arraycopy(vec, 0, copy, 0, copy.length);
+        VectorUtil.l2normalize(copy);
+        vec = copy;
+      }
+      vectors.add(vec);
+    }
+    return vectors;
+  }
+
+  /**
+   * Tests reading quantized vectors when raw vector data is empty. Verifies 
that scalar quantized
+   * formats can properly dequantize vectors and maintain accuracy within 
expected error bounds even
+   * when the original raw vector file is empty or corrupted.
+   */
+  public void testReadQuantizedVectorWithEmptyRawVectors() throws Exception {
+    assumeTrue("Test only applies to scalar quantized formats", 
supportsFloatVectorFallback());
+
+    String vectorFieldName = "vec1";
+    int numVectors = 1 + random().nextInt(50);
+    int dim = random().nextInt(64) + 1;
+    if (dim % 2 == 1) {
+      dim++;
+    }
+    float eps = (1f / (float) (1 << getQuantizationBits()));
+    VectorSimilarityFunction similarityFunction = randomSimilarity();
+    List<float[]> vectors =
+        getRandomFloatVector(
+            numVectors, dim, similarityFunction == 
VectorSimilarityFunction.COSINE);
+
+    try (BaseDirectoryWrapper dir = newDirectory();
+        IndexWriter w =
+            new IndexWriter(
+                dir,
+                new IndexWriterConfig()
+                    .setMaxBufferedDocs(numVectors + 1)
+                    .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
+                    .setMergePolicy(NoMergePolicy.INSTANCE)
+                    .setUseCompoundFile(false)
+                    .setCodec(getCodecForQuantizedTest()))) {
+      dir.setCheckIndexOnClose(false);
+
+      for (int i = 0; i < numVectors; i++) {
+        Document doc = new Document();
+        doc.add(new KnnFloatVectorField(vectorFieldName, vectors.get(i), 
similarityFunction));
+        w.addDocument(doc);
+      }
+      w.commit();
+
+      simulateEmptyRawVectors(dir);
+
+      try (IndexReader reader = DirectoryReader.open(w)) {
+        LeafReader r = getOnlyLeafReader(reader);
+        if (r instanceof CodecReader codecReader) {
+          KnnVectorsReader knnVectorsReader = codecReader.getVectorReader();
+          knnVectorsReader = 
knnVectorsReader.unwrapReaderForField(vectorFieldName);
+          FloatVectorValues floatVectorValues =
+              knnVectorsReader.getFloatVectorValues(vectorFieldName);
+          if (floatVectorValues.size() > 0) {
+            KnnVectorValues.DocIndexIterator iter = 
floatVectorValues.iterator();
+            for (int docId = iter.nextDoc(); docId != NO_MORE_DOCS; docId = 
iter.nextDoc()) {
+              float[] dequantizedVector = 
floatVectorValues.vectorValue(iter.index());
+              float mae = 0;
+              for (int i = 0; i < dim; i++) {
+                mae += Math.abs(dequantizedVector[i] - vectors.get(docId)[i]);
+              }
+              mae /= dim;
+              assertTrue(
+                  "bits: " + getQuantizationBits() + " mae: " + mae + " > eps: 
" + eps, mae <= eps);
+            }
+          } else {
+            fail("floatVectorValues size should be non zero");
+          }
+        } else {
+          fail("reader is not CodecReader");
+        }
+      }
+    }
+  }
+
+  /** Simulates empty raw vectors by modifying index files. */
+  private void simulateEmptyRawVectors(Directory dir) throws Exception {
+    final String[] indexFiles = dir.listAll();
+    final String RAW_VECTOR_EXTENSION = "vec";
+    final String VECTOR_META_EXTENSION = "vemf";
+
+    for (String file : indexFiles) {
+      if (file.endsWith("." + RAW_VECTOR_EXTENSION)) {
+        replaceWithEmptyVectorFile(dir, file);
+      } else if (file.endsWith("." + VECTOR_META_EXTENSION)) {
+        updateVectorMetadataFile(dir, file);
+      }
+    }
+  }
+
+  /** Replaces a raw vector file with an empty one that has valid 
header/footer. */
+  private void replaceWithEmptyVectorFile(Directory dir, String fileName) 
throws Exception {

Review Comment:
   Could you make these helper methods `protected`? They make certain 
   assumptions about the layout of the format (e.g. file extensions) that 
   might not actually be true.
   
   What if we change the flat float format in the future, and we change the 
   file names, or we write additional info to the metadata? Then the 
   backward-compatibility (bwc) tests and the mainline tests would get out of 
   sync, and it would be frustrating to correct.
   
   I am fine with the "default" being the same as the current flat float 
   format, but let's make these helper methods overridable.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to