benwtrent commented on code in PR #15271:
URL: https://github.com/apache/lucene/pull/15271#discussion_r2395369282
##########
lucene/core/src/java/org/apache/lucene/codecs/lucene104/Lucene104ScalarQuantizedVectorsWriter.java:
##########
@@ -385,6 +389,63 @@ static DocsWithFieldSet writeVectorData(
return docsWithField;
}
+ static DocsWithFieldSet writeBinarizedVectorAndQueryData(
+ IndexOutput binarizedVectorData,
+ ScalarEncoding encoding,
+ IndexOutput binarizedQueryData,
+ ScalarEncoding queryEncoding,
+ FloatVectorValues floatVectorValues,
+ float[] centroid,
+ OptimizedScalarQuantizer binaryQuantizer)
+ throws IOException {
+ if (encoding == queryEncoding) {
+ throw new IllegalArgumentException("encoding and queryEncoding must be
different");
+ }
+ if (encoding != ScalarEncoding.SINGLE_BIT || queryEncoding !=
ScalarEncoding.PACKED_NIBBLE) {
+ throw new IllegalArgumentException(
+ "encoding must be SINGLE_BIT and queryEncoding must be
PACKED_NIBBLE");
+ }
+ DocsWithFieldSet docsWithField = new DocsWithFieldSet();
+ int discretizedDims =
+ Math.max(
+ encoding.getDiscreteDimensions(floatVectorValues.dimension()),
+
queryEncoding.getDiscreteDimensions(floatVectorValues.dimension()));
+ assert discretizedDims % encoding.getBits() == 0;
+ assert discretizedDims % queryEncoding.getBits() == 0;
+ byte[][] quantizationScratch = new byte[2][];
+ quantizationScratch[0] = new byte[discretizedDims];
+ quantizationScratch[1] = new byte[discretizedDims];
+ byte[] toIndex = new byte[encoding.getPackedLength(discretizedDims)];
+ byte[] toQuery = new byte[queryEncoding.getPackedLength(discretizedDims)];
+ KnnVectorValues.DocIndexIterator iterator = floatVectorValues.iterator();
+ for (int docV = iterator.nextDoc(); docV != NO_MORE_DOCS; docV =
iterator.nextDoc()) {
+ // write index vector
+ OptimizedScalarQuantizer.QuantizationResult[] r =
+ binaryQuantizer.multiScalarQuantize(
+ floatVectorValues.vectorValue(iterator.index()),
+ quantizationScratch,
+ new byte[] {INDEX_BITS, QUERY_BITS},
+ centroid);
+ // pack and store document bit vector
+ packAsBinary(quantizationScratch[0], toIndex);
+ binarizedVectorData.writeBytes(toIndex, toIndex.length);
+ binarizedVectorData.writeInt(Float.floatToIntBits(r[0].lowerInterval()));
+ binarizedVectorData.writeInt(Float.floatToIntBits(r[0].upperInterval()));
+
binarizedVectorData.writeInt(Float.floatToIntBits(r[0].additionalCorrection()));
+ binarizedVectorData.writeInt(r[0].quantizedComponentSum());
Review Comment:
I think this is OK. Two bytes is two bytes, for sure, but I would like to
err on the side of simplicity and speed for now.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]