mayya-sharipova commented on code in PR #12582:
URL: https://github.com/apache/lucene/pull/12582#discussion_r1365917081
##########
lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java:
##########
@@ -0,0 +1,824 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.codecs.lucene99;
+
+import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.QUANTIZED_VECTOR_COMPONENT;
+import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.calculateDefaultQuantile;
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
+import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.KnnVectorsReader;
+import org.apache.lucene.codecs.KnnVectorsWriter;
+import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
+import org.apache.lucene.index.DocIDMerger;
+import org.apache.lucene.index.DocsWithFieldSet;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FloatVectorValues;
+import org.apache.lucene.index.MergeState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Sorter;
+import org.apache.lucene.index.VectorEncoding;
+import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.InfoStream;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.ScalarQuantizer;
+import org.apache.lucene.util.VectorUtil;
+import org.apache.lucene.util.hnsw.CloseableRandomVectorScorerSupplier;
+import org.apache.lucene.util.hnsw.RandomVectorScorer;
+
+/**
+ * Writes quantized vector values and metadata to index segments.
+ *
+ * @lucene.experimental
+ */
+public final class Lucene99ScalarQuantizedVectorsWriter implements Accountable {
+
+  private static final long BASE_RAM_BYTES_USED =
+      shallowSizeOfInstance(Lucene99ScalarQuantizedVectorsWriter.class);
+
+  // Used for determining when merged quantiles shifted too far from individual segment quantiles.
+  // When merging quantiles from various segments, we need to ensure that the new quantiles
+  // are not exceptionally different from an individual segment's quantiles.
+  // This would imply that the quantization buckets would shift too much
+  // for floating point values and justify recalculating the quantiles. This helps preserve
+  // accuracy of the calculated quantiles, even in adversarial cases such as vector clustering.
+  // This number was determined via empirical testing.
+  private static final float QUANTILE_RECOMPUTE_LIMIT = 32;
+  // Used for determining if a new quantization state requires a re-quantization
+  // for a given segment.
+  // This ensures that in expectation 4/5 of the vector would be unchanged by requantization.
+  // Furthermore, only those values where the value is within 1/5 of the centre of a quantization
+  // bin will be changed. In these cases the error introduced by snapping one way or another
+  // is small compared to the error introduced by quantization in the first place. Furthermore,
+  // empirical testing showed that the relative error by not requantizing is small (compared to
+  // the quantization error) and the condition is sensitive enough to detect all adversarial cases,
+  // such as merging clustered data.
+  private static final float REQUANTIZATION_LIMIT = 0.2f;
+  private final IndexOutput quantizedVectorData;
+  private final Float quantile;
+  private boolean finished;
+
+  Lucene99ScalarQuantizedVectorsWriter(IndexOutput quantizedVectorData, Float quantile) {
+    this.quantile = quantile;
+    this.quantizedVectorData = quantizedVectorData;
+  }
+
+  QuantizationFieldVectorWriter addField(FieldInfo fieldInfo, InfoStream infoStream) {
+    if (fieldInfo.getVectorEncoding() != VectorEncoding.FLOAT32) {
+      throw new IllegalArgumentException(
+          "Only float32 vector fields are supported for quantization");
+    }
+    float quantile =
+        this.quantile == null
+            ? calculateDefaultQuantile(fieldInfo.getVectorDimension())
+            : this.quantile;
+    if (infoStream.isEnabled(QUANTIZED_VECTOR_COMPONENT)) {
+      infoStream.message(
+          QUANTIZED_VECTOR_COMPONENT,
+          "quantizing field="
+              + fieldInfo.name
+              + " dimension="
+              + fieldInfo.getVectorDimension()
+              + " quantile="
+              + quantile);
+    }
+    return QuantizationFieldVectorWriter.create(fieldInfo, quantile, infoStream);
+  }
+
+  long[] flush(
+      Sorter.DocMap sortMap, QuantizationFieldVectorWriter field, DocsWithFieldSet docsWithField)
+      throws IOException {
+    field.finish();
+    return sortMap == null
+        ? writeField(field)
+        : writeSortingField(field, sortMap, docsWithField);
+  }
+
+  void finish() throws IOException {
+    if (finished) {
+      throw new IllegalStateException("already finished");
+    }
+    finished = true;
+    if (quantizedVectorData != null) {
+      CodecUtil.writeFooter(quantizedVectorData);
+    }
+  }
+
+  private long[] writeField(QuantizationFieldVectorWriter fieldData) throws IOException {
+    long quantizedVectorDataOffset = quantizedVectorData.alignFilePointer(Float.BYTES);
+    writeQuantizedVectors(fieldData);
+    long quantizedVectorDataLength =
+        quantizedVectorData.getFilePointer() - quantizedVectorDataOffset;
+    return new long[] {quantizedVectorDataOffset, quantizedVectorDataLength};
+  }
+
+  private void writeQuantizedVectors(QuantizationFieldVectorWriter fieldData) throws IOException {
+    ScalarQuantizer scalarQuantizer = fieldData.createQuantizer();
+    byte[] vector = new byte[fieldData.dim];
+    final ByteBuffer offsetBuffer =
+        ByteBuffer.allocate(Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
+    for (float[] v : fieldData.floatVectors) {
+      float offsetCorrection =
+          scalarQuantizer.quantize(v, vector, fieldData.vectorSimilarityFunction);
+      quantizedVectorData.writeBytes(vector, vector.length);
+      offsetBuffer.putFloat(offsetCorrection);
+      quantizedVectorData.writeBytes(offsetBuffer.array(), offsetBuffer.array().length);

Review Comment:
   nit: We can just use `quantizedVectorData.writeInt(Float.floatToIntBits(offsetCorrection))` without using `offsetBuffer`.
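
   For readers following along, here is a minimal standalone sketch (mine, not part of the PR) of why the two write paths put the same four bytes in the index file. It relies on Lucene 9's `DataOutput.writeInt` writing its argument little-endian, matching the `ByteOrder.LITTLE_ENDIAN` buffer in the diff:

   ```java
   import java.nio.ByteBuffer;
   import java.nio.ByteOrder;
   import java.util.Arrays;

   public class OffsetWriteEquivalence {
     public static void main(String[] args) {
       float offsetCorrection = 0.1234f; // arbitrary sample value

       // Path taken in the diff: stage the float in a little-endian scratch
       // buffer, then write the buffer's backing array.
       ByteBuffer offsetBuffer = ByteBuffer.allocate(Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
       offsetBuffer.putFloat(offsetCorrection);
       byte[] viaBuffer = offsetBuffer.array();

       // Suggested path: reinterpret the float's bits as an int. A little-endian
       // writeInt emits those bytes least-significant first, i.e.:
       int bits = Float.floatToIntBits(offsetCorrection);
       byte[] viaWriteInt = {
         (byte) bits, (byte) (bits >>> 8), (byte) (bits >>> 16), (byte) (bits >>> 24)
       };

       System.out.println(Arrays.equals(viaBuffer, viaWriteInt)); // prints: true
     }
   }
   ```

   Besides dropping the scratch allocation, the `writeInt` form avoids buffer position bookkeeping: the relative `putFloat` in the quoted loop advances the buffer's position on each iteration, so the buffer would need a reset between vectors (any such reset presumably happens further down the hunk, which is truncated here).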
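
   One more note for anyone new to this code path: `writeField` calls `quantizedVectorData.alignFilePointer(Float.BYTES)`, which pads the file with zero bytes until the pointer is a multiple of four and returns the aligned offset, so each field's quantized block starts on a float-sized boundary. A toy sketch of the rounding arithmetic (my illustration, not Lucene code):

   ```java
   public class AlignmentSketch {
     // Round offset up to the next multiple of alignment (a power of two).
     // IndexOutput#alignFilePointer does this to the file pointer, emitting
     // (aligned - offset) zero bytes of padding along the way.
     static long align(long offset, int alignment) {
       return ((offset - 1) | (alignment - 1)) + 1;
     }

     public static void main(String[] args) {
       System.out.println(align(17, 4)); // 20 -> three padding bytes written
       System.out.println(align(20, 4)); // 20 -> already aligned, no padding
     }
   }
   ```

   Keeping the quantized data at fixed-size boundaries is presumably what makes later memory-mapped reads of the block cheap.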