[GitHub] [lucene] msokolov commented on a change in pull request #728: LUCENE-10194 Buffer KNN vectors on disk

GitBox Thu, 03 Mar 2022 13:01:42 -0800


msokolov commented on a change in pull request #728:
URL: https://github.com/apache/lucene/pull/728#discussion_r819056510




##########
File path: lucene/core/src/java/org/apache/lucene/index/VectorValuesWriter.java
##########
@@ -20,39 +20,53 @@
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
-import java.util.ArrayList;
-import java.util.List;
+import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.KnnVectorsReader;
 import org.apache.lucene.codecs.KnnVectorsWriter;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.TopDocs;
-import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Counter;
-import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.IOUtils;
 
 /**
- * Buffers up pending vector value(s) per doc, then flushes when segment 
flushes.
+ * Buffers up pending vector value per doc on disk until segment flushes.
  *
  * @lucene.experimental
  */
 class VectorValuesWriter {
 
   private final FieldInfo fieldInfo;
   private final Counter iwBytesUsed;
-  private final List<float[]> vectors = new ArrayList<>();
   private final DocsWithFieldSet docsWithField;
+  private final int dim;
+  private final int byteSize;
+  private final ByteBuffer buffer;
+  private final Directory directory;
+  private final IndexOutput dataOut;
 
   private int lastDocID = -1;
 
   private long bytesUsed;
 
-  VectorValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
+  VectorValuesWriter(
+      FieldInfo fieldInfo, Counter iwBytesUsed, Directory directory, String 
segmentName)
+      throws IOException {
     this.fieldInfo = fieldInfo;
     this.iwBytesUsed = iwBytesUsed;
-    this.docsWithField = new DocsWithFieldSet();
-    this.bytesUsed = docsWithField.ramBytesUsed();
+    docsWithField = new DocsWithFieldSet();
+    this.directory = directory;
+    String fileName = segmentName + "_" + fieldInfo.getName() + 
"_buffered_vectors";

Review comment:
       I think field names can contain pretty much any character. Perhaps 
instead of using the field name, we should use the field number in the filename?

##########
File path: lucene/core/src/java/org/apache/lucene/index/IndexingChain.java
##########
@@ -522,6 +526,18 @@ void abort() throws IOException {
     // finalizer will e.g. close any open files in the term vectors writer:

Review comment:
       Maybe this comment should be updated?

##########
File path: lucene/core/src/java/org/apache/lucene/index/VectorValuesWriter.java
##########
@@ -20,39 +20,53 @@
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
-import java.util.ArrayList;
-import java.util.List;
+import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.codecs.KnnVectorsReader;
 import org.apache.lucene.codecs.KnnVectorsWriter;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.TopDocs;
-import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Counter;
-import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.IOUtils;
 
 /**
- * Buffers up pending vector value(s) per doc, then flushes when segment 
flushes.
+ * Buffers up pending vector value per doc on disk until segment flushes.
  *
  * @lucene.experimental
  */
 class VectorValuesWriter {
 
   private final FieldInfo fieldInfo;
   private final Counter iwBytesUsed;
-  private final List<float[]> vectors = new ArrayList<>();
   private final DocsWithFieldSet docsWithField;
+  private final int dim;
+  private final int byteSize;
+  private final ByteBuffer buffer;
+  private final Directory directory;
+  private final IndexOutput dataOut;
 
   private int lastDocID = -1;
 
   private long bytesUsed;
 
-  VectorValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
+  VectorValuesWriter(
+      FieldInfo fieldInfo, Counter iwBytesUsed, Directory directory, String 
segmentName)
+      throws IOException {
     this.fieldInfo = fieldInfo;
     this.iwBytesUsed = iwBytesUsed;
-    this.docsWithField = new DocsWithFieldSet();
-    this.bytesUsed = docsWithField.ramBytesUsed();
+    docsWithField = new DocsWithFieldSet();
+    this.directory = directory;
+    String fileName = segmentName + "_" + fieldInfo.getName() + 
"_buffered_vectors";
+    dataOut = directory.createTempOutput(fileName, "temp", IOContext.DEFAULT);

Review comment:
       I'm curious: what does `createTempOutput` do? Does it mean that if we 
crash, these files would get cleaned up on re-opening the index?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

[GitHub] [lucene] msokolov commented on a change in pull request #728: LUCENE-10194 Buffer KNN vectors on disk

Reply via email to