LantaoJin commented on code in PR #16214:
URL: https://github.com/apache/lucene/pull/16214#discussion_r3377635868


##########
lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java:
##########
@@ -460,6 +510,187 @@ public long cost() {
     }
   }
 
+  /**
+   * Writes a new generation of KNN vector files for each field that has pending vector updates. For
+   * each such field we bump a new {@code vectorGen}, write {@code .vec/.vemf/.vex/.vem} files at
+   * that gen suffix where the vectors are the existing field vectors overlaid with the updates, and
+   * the HNSW graph is rebuilt eagerly (via the standard {@link KnnVectorsWriter} flush path) unless
+   * {@code deferVectorGraphRebuild} is set, in which case only the flat vectors are written and the
+   * graph is left empty until the next merge. The base reader (gen == -1) plus other untouched
+   * fields are left in place. Vector analogue of {@link #handleDVUpdates}.
+   */
+  @SuppressWarnings("unchecked")
+  private synchronized void handleVectorUpdates(
+      FieldInfos infos,
+      Directory dir,
+      KnnVectorsFormat vectorsFormat,
+      final SegmentReader reader,
+      Map<Integer, Set<String>> fieldFiles,
+      long maxDelGen,
+      InfoStream infoStream,
+      boolean deferVectorGraphRebuild)
+      throws IOException {
+    for (Entry<String, List<KnnVectorFieldUpdates>> ent : 
pendingVectorUpdates.entrySet()) {
+      final String field = ent.getKey();
+      final List<KnnVectorFieldUpdates> updates = ent.getValue();
+      final List<KnnVectorFieldUpdates> updatesToApply = new ArrayList<>();
+      for (KnnVectorFieldUpdates update : updates) {
+        if (update.delGen <= maxDelGen) {
+          updatesToApply.add(update);
+        }
+      }
+      if (updatesToApply.isEmpty()) {
+        continue;
+      }
+
+      // Reject quantized formats: only the unquantized Lucene99 HNSW format 
is supported for
+      // in-place vector updates.
+      KnnVectorsFormat perFieldFormat = vectorsFormat;
+      if (vectorsFormat instanceof PerFieldKnnVectorsFormat perField) {
+        perFieldFormat = perField.getKnnVectorsFormatForField(field);
+      }
+      if (perFieldFormat instanceof Lucene99HnswVectorsFormat == false) {
+        throw new UnsupportedOperationException(
+            "in-place vector update is only supported for "
+                + Lucene99HnswVectorsFormat.class.getSimpleName()
+                + " but field ["
+                + field
+                + "] uses "
+                + perFieldFormat.getClass().getSimpleName());
+      }
+
+      final long nextVectorGen = info.getNextVectorGen();
+      final String segmentSuffix = Long.toString(nextVectorGen, 
Character.MAX_RADIX);
+      final IOContext updatesContext = IOContext.flush(new 
FlushInfo(info.info.maxDoc(), 0));
+
+      // Stamp the new vectorGen onto the (shared, cloned) FieldInfo so it is 
persisted by
+      // writeFieldInfosGen and picked up at read time. A single-field 
FieldInfos is handed to the
+      // vectors writer for this gen.
+      final FieldInfo fieldInfo = infos.fieldInfo(field);
+      assert fieldInfo != null && fieldInfo.hasVectorValues();
+      fieldInfo.setVectorGen(nextVectorGen);
+      final FieldInfos fieldInfos = new FieldInfos(new FieldInfo[] 
{fieldInfo});
+
+      // Choose the writer format for this generation. When deferring the graph rebuild we write
+      // only the flat vectors and skip building the HNSW graph: the gen's ".vex/.vem" carry an
+      // empty graph (vectorIndexLength == 0), so the reader falls back to an exact scan on this
+      // segment until the next merge rebuilds the graph using the codec's normal format. The graph
+      // build is suppressed by raising the "tiny segment" threshold above the segment size (so
+      // shouldCreateGraph() is always false for this write). We wrap it in a
+      // PerFieldKnnVectorsFormat so file naming and the per-field suffix attributes match what the
+      // (always per-field) reader reconstructs.
+      final KnnVectorsFormat writeFormat;
+      if (deferVectorGraphRebuild) {
+        final KnnVectorsFormat noGraphFormat =
+            new Lucene99HnswVectorsFormat(
+                Lucene99HnswVectorsFormat.DEFAULT_MAX_CONN,
+                Lucene99HnswVectorsFormat.DEFAULT_BEAM_WIDTH,
+                info.info.maxDoc() + 1);
+        writeFormat =
+            new PerFieldKnnVectorsFormat() {
+              @Override
+              public KnnVectorsFormat getKnnVectorsFormatForField(String f) {
+                return noGraphFormat;
+              }
+            };
+      } else {
+        writeFormat = vectorsFormat;
+      }
+
+      final TrackingDirectoryWrapper trackingDir = new 
TrackingDirectoryWrapper(dir);
+      final SegmentWriteState state =
+          new SegmentWriteState(
+              null, trackingDir, info.info, fieldInfos, null, updatesContext, 
segmentSuffix);
+
+      try (KnnVectorsWriter writer = writeFormat.fieldsWriter(state)) {
+        final KnnVectorFieldUpdates.Iterator mergedIterator =
+            
KnnVectorFieldUpdates.mergedIterator(toIteratorArray(updatesToApply));
+        if (fieldInfo.getVectorEncoding() == VectorEncoding.FLOAT32) {
+          KnnFieldVectorsWriter<float[]> fieldWriter =
+              (KnnFieldVectorsWriter<float[]>) writer.addField(fieldInfo);
+          writeOverlayFloatVectors(reader, field, mergedIterator, fieldWriter);
+        } else {
+          KnnFieldVectorsWriter<byte[]> fieldWriter =
+              (KnnFieldVectorsWriter<byte[]>) writer.addField(fieldInfo);
+          writeOverlayByteVectors(reader, field, mergedIterator, fieldWriter);
+        }
+        writer.flush(info.info.maxDoc(), null);
+        writer.finish();
+      }
+
+      info.advanceVectorGen();
+      assert !fieldFiles.containsKey(fieldInfo.number);
+      fieldFiles.put(fieldInfo.number, trackingDir.getCreatedFiles());
+    }
+  }
+
+  private static KnnVectorFieldUpdates.Iterator[] toIteratorArray(
+      List<KnnVectorFieldUpdates> updatesToApply) {
+    KnnVectorFieldUpdates.Iterator[] subs =
+        new KnnVectorFieldUpdates.Iterator[updatesToApply.size()];
+    for (int i = 0; i < subs.length; i++) {
+      subs[i] = updatesToApply.get(i).iterator();
+    }
+    return subs;
+  }
+
+  /**
+   * Feeds the field writer, in docID order, every doc that currently has a 
float vector for {@code
+   * field}, substituting the updated vector when the merged update iterator 
has one for that doc.
+   */
+  private static void writeOverlayFloatVectors(

Review Comment:
   Good catch. The overlay now throws an `IllegalArgumentException` instead of 
skipping an unmatched update.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to