mayya-sharipova commented on code in PR #12582:
URL: https://github.com/apache/lucene/pull/12582#discussion_r1365917081
##########
lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99ScalarQuantizedVectorsWriter.java:
##########
@@ -0,0 +1,824 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.codecs.lucene99;
+
+import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.QUANTIZED_VECTOR_COMPONENT;
+import static org.apache.lucene.codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat.calculateDefaultQuantile;
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
+import static org.apache.lucene.util.RamUsageEstimator.shallowSizeOfInstance;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.KnnVectorsReader;
+import org.apache.lucene.codecs.KnnVectorsWriter;
+import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat;
+import org.apache.lucene.index.DocIDMerger;
+import org.apache.lucene.index.DocsWithFieldSet;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FloatVectorValues;
+import org.apache.lucene.index.MergeState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Sorter;
+import org.apache.lucene.index.VectorEncoding;
+import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.InfoStream;
+import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.ScalarQuantizer;
+import org.apache.lucene.util.VectorUtil;
+import org.apache.lucene.util.hnsw.CloseableRandomVectorScorerSupplier;
+import org.apache.lucene.util.hnsw.RandomVectorScorer;
+
+/**
+ * Writes quantized vector values and metadata to index segments.
+ *
+ * @lucene.experimental
+ */
+public final class Lucene99ScalarQuantizedVectorsWriter implements Accountable {
+
+  private static final long BASE_RAM_BYTES_USED =
+      shallowSizeOfInstance(Lucene99ScalarQuantizedVectorsWriter.class);
+
+  // Used for determining when merged quantiles shifted too far from individual segment quantiles.
+  // When merging quantiles from various segments, we need to ensure that the new quantiles
+  // are not exceptionally different from an individual segment's quantiles.
+  // This would imply that the quantization buckets would shift too much
+  // for floating point values and justify recalculating the quantiles. This helps preserve
+  // accuracy of the calculated quantiles, even in adversarial cases such as vector clustering.
+  // This number was determined via empirical testing.
+  private static final float QUANTILE_RECOMPUTE_LIMIT = 32;
+  // Used for determining if a new quantization state requires a re-quantization
+  // for a given segment.
+  // This ensures that in expectation 4/5 of the vector would be unchanged by requantization.
+  // Furthermore, only those values where the value is within 1/5 of the centre of a quantization
+  // bin will be changed. In these cases the error introduced by snapping one way or another
+  // is small compared to the error introduced by quantization in the first place. Furthermore,
+  // empirical testing showed that the relative error by not requantizing is small (compared to
+  // the quantization error) and the condition is sensitive enough to detect all adversarial cases,
+  // such as merging clustered data.
+  private static final float REQUANTIZATION_LIMIT = 0.2f;
+  private final IndexOutput quantizedVectorData;
+  private final Float quantile;
+  private boolean finished;
+
+  Lucene99ScalarQuantizedVectorsWriter(IndexOutput quantizedVectorData, Float quantile) {
+    this.quantile = quantile;
+    this.quantizedVectorData = quantizedVectorData;
+  }
+
+  QuantizationFieldVectorWriter addField(FieldInfo fieldInfo, InfoStream infoStream) {
+    if (fieldInfo.getVectorEncoding() != VectorEncoding.FLOAT32) {
+      throw new IllegalArgumentException(
+          "Only float32 vector fields are supported for quantization");
+    }
+    float quantile =
+        this.quantile == null
+            ? calculateDefaultQuantile(fieldInfo.getVectorDimension())
+            : this.quantile;
+    if (infoStream.isEnabled(QUANTIZED_VECTOR_COMPONENT)) {
+      infoStream.message(
+          QUANTIZED_VECTOR_COMPONENT,
+          "quantizing field="
+              + fieldInfo.name
+              + " dimension="
+              + fieldInfo.getVectorDimension()
+              + " quantile="
+              + quantile);
+    }
+    return QuantizationFieldVectorWriter.create(fieldInfo, quantile, infoStream);
+  }
+
+  long[] flush(
+      Sorter.DocMap sortMap, QuantizationFieldVectorWriter field, DocsWithFieldSet docsWithField)
+      throws IOException {
+    field.finish();
+    return sortMap == null
+        ? writeField(field)
+        : writeSortingField(field, sortMap, docsWithField);
+  }
+
+  void finish() throws IOException {
+    if (finished) {
+      throw new IllegalStateException("already finished");
+    }
+    finished = true;
+    if (quantizedVectorData != null) {
+      CodecUtil.writeFooter(quantizedVectorData);
+    }
+  }
+
+  private long[] writeField(QuantizationFieldVectorWriter fieldData) throws IOException {
+    long quantizedVectorDataOffset = quantizedVectorData.alignFilePointer(Float.BYTES);
+    writeQuantizedVectors(fieldData);
+    long quantizedVectorDataLength =
+        quantizedVectorData.getFilePointer() - quantizedVectorDataOffset;
+    return new long[] {quantizedVectorDataOffset, quantizedVectorDataLength};
+  }
+
+  private void writeQuantizedVectors(QuantizationFieldVectorWriter fieldData) throws IOException {
+    ScalarQuantizer scalarQuantizer = fieldData.createQuantizer();
+    byte[] vector = new byte[fieldData.dim];
+    final ByteBuffer offsetBuffer =
+        ByteBuffer.allocate(Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
+    for (float[] v : fieldData.floatVectors) {
+      float offsetCorrection =
+          scalarQuantizer.quantize(v, vector, fieldData.vectorSimilarityFunction);
+      quantizedVectorData.writeBytes(vector, vector.length);
+      offsetBuffer.putFloat(offsetCorrection);
+      quantizedVectorData.writeBytes(offsetBuffer.array(), offsetBuffer.array().length);

Review Comment:
   nit: We can just use `quantizedVectorData.writeInt(Float.floatToIntBits(offsetCorrection))` without using `offsetBuffer`.
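
   For readers following along, here is a minimal standalone sketch (mine, not part of the PR) of why the two write paths put the same four bytes in the index file. It relies on Lucene 9's `DataOutput.writeInt` writing its argument little-endian, matching the `ByteOrder.LITTLE_ENDIAN` buffer in the diff:

   ```java
   import java.nio.ByteBuffer;
   import java.nio.ByteOrder;
   import java.util.Arrays;

   public class OffsetWriteEquivalence {
     public static void main(String[] args) {
       float offsetCorrection = 0.1234f; // arbitrary sample value

       // Path taken in the diff: stage the float in a little-endian scratch
       // buffer, then write the buffer's backing array.
       ByteBuffer offsetBuffer = ByteBuffer.allocate(Float.BYTES).order(ByteOrder.LITTLE_ENDIAN);
       offsetBuffer.putFloat(offsetCorrection);
       byte[] viaBuffer = offsetBuffer.array();

       // Suggested path: reinterpret the float's bits as an int. A little-endian
       // writeInt emits those bytes least-significant first, i.e.:
       int bits = Float.floatToIntBits(offsetCorrection);
       byte[] viaWriteInt = {
         (byte) bits, (byte) (bits >>> 8), (byte) (bits >>> 16), (byte) (bits >>> 24)
       };

       System.out.println(Arrays.equals(viaBuffer, viaWriteInt)); // prints: true
     }
   }
   ```

   Besides dropping the scratch allocation, the `writeInt` form avoids buffer position bookkeeping: the relative `putFloat` in the quoted loop advances the buffer's position on each iteration, so the buffer would need a reset between vectors (any such reset presumably happens further down the hunk, which is truncated here).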
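
   One more note for anyone new to this code path: `writeField` calls `quantizedVectorData.alignFilePointer(Float.BYTES)`, which pads the file with zero bytes until the pointer is a multiple of four and returns the aligned offset, so each field's quantized block starts on a float-sized boundary. A toy sketch of the rounding arithmetic (my illustration, not Lucene code):

   ```java
   public class AlignmentSketch {
     // Round offset up to the next multiple of alignment (a power of two).
     // IndexOutput#alignFilePointer does this to the file pointer, emitting
     // (aligned - offset) zero bytes of padding along the way.
     static long align(long offset, int alignment) {
       return ((offset - 1) | (alignment - 1)) + 1;
     }

     public static void main(String[] args) {
       System.out.println(align(17, 4)); // 20 -> three padding bytes written
       System.out.println(align(20, 4)); // 20 -> already aligned, no padding
     }
   }
   ```

   Keeping the quantized data at fixed-size boundaries is presumably what makes later memory-mapped reads of the block cheap.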