aruggero commented on code in PR #16034:
URL: https://github.com/apache/lucene/pull/16034#discussion_r3202372236
##########
lucene/join/src/java/org/apache/lucene/search/join/DiversifyingNearestChildrenKnnCollectorManager.java:
##########
@@ -74,12 +89,103 @@ public KnnCollector newOptimisticCollector(
if (parentBitSet == null) {
return null;
}
+ int[] docToOrd = getCachedDocToOrd(context);
return new DiversifyingNearestChildrenKnnCollector(
- k, visitedLimit, searchStrategy, parentBitSet);
+ k, visitedLimit, searchStrategy, parentBitSet, docToOrd);
}
@Override
public boolean isOptimistic() {
return true;
}
+
+ /**
+ * Returns the docId-to-ordinal array for the given leaf, building and
caching it on first access.
+ * The cached array is evicted automatically when the segment closes.
+ */
+ private int[] getCachedDocToOrd(LeafReaderContext context) throws
IOException {
+ IndexReader.CacheHelper cacheHelper =
context.reader().getCoreCacheHelper();
+ if (cacheHelper == null) {
+ return buildDocToOrd(context);
+ }
+ IndexReader.CacheKey cacheKey = cacheHelper.getKey();
+ ConcurrentHashMap<String, int[]> fieldMap = new ConcurrentHashMap<>();
+ ConcurrentHashMap<String, int[]> existing =
DOC_TO_ORD_CACHE.putIfAbsent(cacheKey, fieldMap);
+ if (existing == null) {
+ // We inserted the new entry — register cleanup when the segment closes
+ cacheHelper.addClosedListener(DOC_TO_ORD_CACHE::remove);
+ } else {
+ fieldMap = existing;
+ }
+ int[] cached = fieldMap.get(field);
+ if (cached != null) {
+ return cached;
+ }
+ int[] built = buildDocToOrd(context);
+ int[] race = fieldMap.putIfAbsent(field, built);
+ return race != null ? race : built;
+ }
+
+ /**
+ * Builds a docId-to-ordinal array for the given leaf, mapping each docId to
its vector ordinal.
+ *
+ * <p>Returns an empty array if the field has no vector values in this
segment at all — sibling
+ * expansion will be disabled for this leaf.
+ *
+ * <p>Otherwise returns an array of size {@code maxDoc} where each entry is
the vector ordinal for
+ * that docId, or {@code -1} if that specific document has no vector (sparse
indexing).
+ */
+ // Step 1 — Index time: ordinals are assigned by insertion order
+ // In Lucene99FlatVectorsWriter.addValue(), each vector is appended to an
ArrayList (vectors.add(copy)) and its docId
+ // is recorded in docsWithField. Documents are always added in ascending
docId order (enforced by assert docID >
+ // lastDocID). So ordinal 0 = first doc with a vector, ordinal 1 = second,
etc.
+ //
+ // Step 2 — Index time: ordToDoc mapping is written in the same order
+ // In OrdToDocDISIReaderConfiguration.writeStoredMeta(),
docsWithField.iterator() is iterated in ascending docId
+ // order, and each docId is written to DirectMonotonicWriter sequentially.
The i-th value written becomes ordinal
+ // i — so the ordToDoc array stored on disk is exactly: ordToDoc[0] = first
docId, ordToDoc[1] = second docId, ...
+ //
+ // Step 3 — Query time: buildDocToOrd inverts the same ordering
+ // getFloatVectorValues(field).iterator() also yields docIds in ascending
order (same set, same order as
+ // docsWithField at index time). The loop:
+ // while (iter.nextDoc() != NO_MORE_DOCS) {
+ // docToOrd[iter.docID()] = ord++;
+ // }
+ // assigns ord = 0 to the first docId, ord = 1 to the second — exactly
inverting the ordToDoc array written at step 2.
+ //
+ // Step 4 — The HNSW graph uses these same ordinals as node IDs
+ // HNSW nodes are identified by their ordinal (the position in the flat
vector store). So when the searcher returns
+ // ordinal k as a graph node, docToOrd[docId] = k being correct means
docIdToOrdinal will find the right HNSW node
+ // for any sibling docId.
+ private int[] buildDocToOrd(LeafReaderContext context) throws IOException {
Review Comment:
Sorry for the big comment above @benwtrent
Just to remember why some choices were made :)
Could you check if this method is correctly implemented?
We should be inside a single segment at this point, right?
And both ordinals and docids are segment-related (not globally unique),
right?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]