jtibshirani commented on code in PR #992:
URL: https://github.com/apache/lucene/pull/992#discussion_r910836731


##########
lucene/core/src/java/org/apache/lucene/index/VectorValuesWriter.java:
##########
@@ -26,233 +26,153 @@
 import org.apache.lucene.codecs.KnnVectorsWriter;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.Counter;
 import org.apache.lucene.util.RamUsageEstimator;
 
 /**
- * Buffers up pending vector value(s) per doc, then flushes when segment flushes.
+ * Buffers up pending vector value(s) per doc, then flushes when segment flushes. Used for {@code
+ * SimpleTextKnnVectorsWriter} and for vectors writers before v9.3.
  *
  * @lucene.experimental
  */
-class VectorValuesWriter {
-
-  private final FieldInfo fieldInfo;
-  private final Counter iwBytesUsed;
-  private final List<float[]> vectors = new ArrayList<>();
-  private final DocsWithFieldSet docsWithField;
-
-  private int lastDocID = -1;
-
-  private long bytesUsed;
-
-  VectorValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
-    this.fieldInfo = fieldInfo;
-    this.iwBytesUsed = iwBytesUsed;
-    this.docsWithField = new DocsWithFieldSet();
-    this.bytesUsed = docsWithField.ramBytesUsed();
-    if (iwBytesUsed != null) {
-      iwBytesUsed.addAndGet(bytesUsed);
+public abstract class VectorValuesWriter extends KnnVectorsWriter {

Review Comment:
   Would renaming this to `BufferingKnnVectorsWriter` be clearer? I assumed it did something different because of the very general name `VectorValuesWriter`.
   
   I also wonder if we could update `SimpleTextKnnVectorsWriter` to use the new writer interface. Then we could move this class to the backwards-codecs package, because it would only be used in the old codec tests.



##########
lucene/core/src/java/org/apache/lucene/index/VectorValuesConsumer.java:
##########
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.KnnVectorsFormat;
+import org.apache.lucene.codecs.KnnVectorsWriter;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.util.Accountable;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.InfoStream;
+
+/**
+ * Streams vector values for indexing to the given codec's vectors writer. The codec's vectors
+ * writer is responsible for buffering and processing vectors.
+ */
+class VectorValuesConsumer {
+  private final Codec codec;
+  private final Directory directory;
+  private final SegmentInfo segmentInfo;
+  private final InfoStream infoStream;
+
+  private Accountable accountable = Accountable.NULL_ACCOUNTABLE;
+  private KnnVectorsWriter writer;
+
+  VectorValuesConsumer(
+      Codec codec, Directory directory, SegmentInfo segmentInfo, InfoStream infoStream) {
+    this.codec = codec;
+    this.directory = directory;
+    this.segmentInfo = segmentInfo;
+    this.infoStream = infoStream;
+  }
+
+  private void initKnnVectorsWriter(String fieldName) throws IOException {
+    if (writer == null) {
+      KnnVectorsFormat fmt = codec.knnVectorsFormat();
+      if (fmt == null) {
+        throw new IllegalStateException(
+            "field=\""
+                + fieldName
+                + "\" was indexed as vectors but codec does not support 
vectors");
+      }
+      SegmentWriteState initialWriteState =
+          new SegmentWriteState(infoStream, directory, segmentInfo, null, null, IOContext.DEFAULT);
+      writer = fmt.fieldsWriter(initialWriteState);
+      accountable = writer;
+    }
+  }
+
+  public void addField(FieldInfo fieldInfo) throws IOException {
+    initKnnVectorsWriter(fieldInfo.name);
+    writer.addField(fieldInfo);
+  }
+
+  public void addValue(FieldInfo fieldInfo, int docID, float[] vectorValue) throws IOException {
+    writer.addValue(fieldInfo, docID, vectorValue);
+  }
+
+  void flush(SegmentWriteState state, Sorter.DocMap sortMap) throws IOException {

Review Comment:
   Do we also need something like `StoredFieldsConsumer#finish` in case more docs are added after the last doc that contains vectors?
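
   For context, `StoredFieldsConsumer#finish` walks forward and writes empty documents up to `maxDoc`, covering docs added after the last one that had a stored field. Vectors need no padding, but a similar hook could hand the final doc count to the writer. A rough sketch of what that might look like here; `KnnVectorsWriter#finish` is hypothetical and not part of this PR:

   ```java
   // Hypothetical sketch only: a finish hook on VectorValuesConsumer, invoked
   // once every document in the segment has been processed. The writer.finish
   // call below does not exist in this PR; it only illustrates the question.
   void finish(int maxDoc) throws IOException {
     if (writer != null) {
       writer.finish(maxDoc); // let the codec account for trailing docs without vectors
     }
   }
   ```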



##########
lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java:
##########
@@ -24,28 +24,40 @@
 import org.apache.lucene.index.DocIDMerger;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.MergeState;
+import org.apache.lucene.index.Sorter;
 import org.apache.lucene.index.VectorValues;
 import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 
 /** Writes vectors to an index. */
-public abstract class KnnVectorsWriter implements Closeable {
+public abstract class KnnVectorsWriter implements Accountable, Closeable {
 
   /** Sole constructor */
   protected KnnVectorsWriter() {}
 
-  /** Write all values contained in the provided reader */
-  public abstract void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader)
+  /** Add new field for indexing */
+  public abstract void addField(FieldInfo fieldInfo) throws IOException;
+
+  /** Add new docID with its vector value to the given field for indexing */
+  public abstract void addValue(FieldInfo fieldInfo, int docID, float[] vectorValue)
+      throws IOException;
+
+  /** Flush all buffered data on disk */
+  public abstract void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException;

Review Comment:
   Do we need separate methods `flush` and `finish`? Maybe we should just have `flush`, which also performs the steps to finish the codec (I think this is what `StoredFieldsConsumer` does, for example).
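
   For comparison, a sketch of a single `flush` that also finalizes the codec files, so no separate `finish` is needed; the helper names below are illustrative, not from this PR:

   ```java
   // Hypothetical sketch: flush writes the buffered data and then performs the
   // finalization steps that a separate finish() would otherwise do. Both
   // writeField and writeFooterAndCloseMeta are illustrative helper names.
   @Override
   public void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException {
     for (FieldData field : fields) {
       writeField(field, maxDoc, sortMap); // write buffered vectors and graph
     }
     writeFooterAndCloseMeta(); // seal the meta/data files for this segment
   }
   ```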



##########
lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java:
##########
@@ -203,8 +204,11 @@ private NeighborQueue searchLevel(
     return results;
   }
 
-  private void clearScratchState() {
+  private void clearScratchState(int capacity) {

Review Comment:
   Maybe we can rename this to `prepareScratchState` since it resizes the bit set too.



##########
lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene91/Lucene91HnswVectorsWriter.java:
##########
@@ -58,7 +57,6 @@ public final class Lucene91HnswVectorsWriter extends KnnVectorsWriter {
     this.maxConn = maxConn;
     this.beamWidth = beamWidth;
 
-    assert state.fieldInfos.hasVectorValues();

Review Comment:
   Wondering why we can remove this assertion?



##########
lucene/core/src/java/org/apache/lucene/util/hnsw/HnswGraphSearcher.java:
##########
@@ -203,8 +204,11 @@ private NeighborQueue searchLevel(
     return results;
   }
 
-  private void clearScratchState() {
+  private void clearScratchState(int capacity) {
     candidates.clear();
+    if (visited.length() < capacity) {
+      visited = FixedBitSet.ensureCapacity((FixedBitSet) visited, capacity);

Review Comment:
   This seems like a good strategy to me, but it will be interesting to see how this affects GC during indexing.
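
   To make the allocation behavior concrete (this snippet is an illustration, not code from the PR): `FixedBitSet.ensureCapacity` returns the existing instance when it is already large enough and only allocates a larger copy otherwise, so reallocations should become rare once the graph stops growing quickly:

   ```java
   import org.apache.lucene.util.FixedBitSet;

   public class EnsureCapacityDemo {
     public static void main(String[] args) {
       FixedBitSet visited = new FixedBitSet(64);
       // Already large enough: the same instance comes back, no allocation.
       FixedBitSet same = FixedBitSet.ensureCapacity(visited, 32);
       // Too small: a new, larger bit set is allocated and the old bits copied.
       FixedBitSet grown = FixedBitSet.ensureCapacity(visited, 1024);
       System.out.println(same == visited);  // true
       System.out.println(grown == visited); // false
     }
   }
   ```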



##########
lucene/core/src/java/org/apache/lucene/codecs/lucene93/Lucene93HnswVectorsWriter.java:
##########
@@ -116,7 +119,193 @@ public final class Lucene93HnswVectorsWriter extends KnnVectorsWriter {
   }
 
   @Override
-  public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader)
+  public void addField(FieldInfo fieldInfo) throws IOException {
+    if (fields == null) {
+      fields = new FieldData[1];
+    } else {
+      FieldData[] newFields = new FieldData[fields.length + 1];
+      System.arraycopy(fields, 0, newFields, 0, fields.length);
+      fields = newFields;
+    }
+    fields[fields.length - 1] =
+        new FieldData(fieldInfo, M, beamWidth, segmentWriteState.infoStream);
+  }
+
+  @Override
+  public void addValue(FieldInfo fieldInfo, int docID, float[] vectorValue) throws IOException {
+    for (FieldData field : fields) {

Review Comment:
   Maybe we should maintain a map from field names to `FieldData` to ensure this is efficient.
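
   Roughly what that could look like; this is a sketch of the suggestion, not code from the PR, and it assumes `FieldData` exposes an `addValue` method (plus `java.util.Map`/`java.util.HashMap` imports):

   ```java
   // Hedged sketch: key FieldData by field name so addValue is an O(1) map
   // lookup instead of a linear scan over a FieldData[]. Assumes these members
   // live in Lucene93HnswVectorsWriter next to M, beamWidth, etc.
   private final Map<String, FieldData> fields = new HashMap<>();

   @Override
   public void addField(FieldInfo fieldInfo) {
     fields.put(
         fieldInfo.name, new FieldData(fieldInfo, M, beamWidth, segmentWriteState.infoStream));
   }

   @Override
   public void addValue(FieldInfo fieldInfo, int docID, float[] vectorValue) throws IOException {
     FieldData field = fields.get(fieldInfo.name); // O(1) instead of scanning an array
     field.addValue(docID, vectorValue);           // assumed FieldData API
   }
   ```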



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

