msokolov commented on a change in pull request #728: URL: https://github.com/apache/lucene/pull/728#discussion_r819056510
########## File path: lucene/core/src/java/org/apache/lucene/index/VectorValuesWriter.java ########## @@ -20,39 +20,53 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.nio.ByteOrder; -import java.util.ArrayList; -import java.util.List; +import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.KnnVectorsWriter; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.TopDocs; -import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Counter; -import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.IOUtils; /** - * Buffers up pending vector value(s) per doc, then flushes when segment flushes. + * Buffers up pending vector value per doc on disk until segment flushes. 
* * @lucene.experimental */ class VectorValuesWriter { private final FieldInfo fieldInfo; private final Counter iwBytesUsed; - private final List<float[]> vectors = new ArrayList<>(); private final DocsWithFieldSet docsWithField; + private final int dim; + private final int byteSize; + private final ByteBuffer buffer; + private final Directory directory; + private final IndexOutput dataOut; private int lastDocID = -1; private long bytesUsed; - VectorValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) { + VectorValuesWriter( + FieldInfo fieldInfo, Counter iwBytesUsed, Directory directory, String segmentName) + throws IOException { this.fieldInfo = fieldInfo; this.iwBytesUsed = iwBytesUsed; - this.docsWithField = new DocsWithFieldSet(); - this.bytesUsed = docsWithField.ramBytesUsed(); + docsWithField = new DocsWithFieldSet(); + this.directory = directory; + String fileName = segmentName + "_" + fieldInfo.getName() + "_buffered_vectors"; Review comment: I think fields can have pretty much any character in their name. Perhaps instead of using the field name, we should use its number in the filename? ########## File path: lucene/core/src/java/org/apache/lucene/index/IndexingChain.java ########## @@ -522,6 +526,18 @@ void abort() throws IOException { // finalizer will e.g. close any open files in the term vectors writer: Review comment: Maybe this comment should be updated? 
########## File path: lucene/core/src/java/org/apache/lucene/index/VectorValuesWriter.java ########## @@ -20,39 +20,53 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.nio.ByteOrder; -import java.util.ArrayList; -import java.util.List; +import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.KnnVectorsWriter; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.TopDocs; -import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Counter; -import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.IOUtils; /** - * Buffers up pending vector value(s) per doc, then flushes when segment flushes. + * Buffers up pending vector value per doc on disk until segment flushes. 
* * @lucene.experimental */ class VectorValuesWriter { private final FieldInfo fieldInfo; private final Counter iwBytesUsed; - private final List<float[]> vectors = new ArrayList<>(); private final DocsWithFieldSet docsWithField; + private final int dim; + private final int byteSize; + private final ByteBuffer buffer; + private final Directory directory; + private final IndexOutput dataOut; private int lastDocID = -1; private long bytesUsed; - VectorValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) { + VectorValuesWriter( + FieldInfo fieldInfo, Counter iwBytesUsed, Directory directory, String segmentName) + throws IOException { this.fieldInfo = fieldInfo; this.iwBytesUsed = iwBytesUsed; - this.docsWithField = new DocsWithFieldSet(); - this.bytesUsed = docsWithField.ramBytesUsed(); + docsWithField = new DocsWithFieldSet(); + this.directory = directory; + String fileName = segmentName + "_" + fieldInfo.getName() + "_buffered_vectors"; + dataOut = directory.createTempOutput(fileName, "temp", IOContext.DEFAULT); Review comment: I'm curious: what does `createTempOutput` do? Does it mean that, if we crash, these files would get cleaned up on re-opening the index? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org