original-brownbear commented on code in PR #13860:
URL: https://github.com/apache/lucene/pull/13860#discussion_r1986157505
########## lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java: ##########
@@ -360,102 +350,95 @@ public static LeafSlice[] slices(
       boolean allowSegmentPartitions) {
     // Make a copy so we can sort:
-    List<LeafReaderContext> sortedLeaves = new ArrayList<>(leaves);
+    LeafReaderContext[] sortedLeaves = leaves.toArray(new LeafReaderContext[0]);
     // Sort by maxDoc, descending:
-    sortedLeaves.sort(Collections.reverseOrder(Comparator.comparingInt(l -> l.reader().maxDoc())));
+    Arrays.sort(
+        sortedLeaves, (l1, l2) -> Integer.compare(l2.reader().maxDoc(), l1.reader().maxDoc()));
 
     if (allowSegmentPartitions) {
-      final List<List<LeafReaderContextPartition>> groupedLeafPartitions = new ArrayList<>();
-      int currentSliceNumDocs = 0;
-      List<LeafReaderContextPartition> group = null;
-      for (LeafReaderContext ctx : sortedLeaves) {
-        if (ctx.reader().maxDoc() > maxDocsPerSlice) {
-          assert group == null;
-          // if the segment does not fit in a single slice, we split it into maximum 5 partitions of
-          // equal size
-          int numSlices = Math.min(5, Math.ceilDiv(ctx.reader().maxDoc(), maxDocsPerSlice));
-          int numDocs = ctx.reader().maxDoc() / numSlices;
-          int maxDocId = numDocs;
-          int minDocId = 0;
-          for (int i = 0; i < numSlices - 1; i++) {
-            groupedLeafPartitions.add(
-                Collections.singletonList(
-                    LeafReaderContextPartition.createFromAndTo(ctx, minDocId, maxDocId)));
-            minDocId = maxDocId;
-            maxDocId += numDocs;
-          }
-          // the last slice gets all the remaining docs
-          groupedLeafPartitions.add(
-              Collections.singletonList(
-                  LeafReaderContextPartition.createFromAndTo(
-                      ctx, minDocId, ctx.reader().maxDoc())));
-        } else {
-          if (group == null) {
-            group = new ArrayList<>();
-            groupedLeafPartitions.add(group);
-          }
-          group.add(LeafReaderContextPartition.createForEntireSegment(ctx));
-
-          currentSliceNumDocs += ctx.reader().maxDoc();
-          // We only split a segment when it does not fit entirely in a slice. We don't partition
-          // the
-          // segment that makes the current slice (which holds multiple segments) go over
-          // maxDocsPerSlice. This means that a slice either contains multiple entire segments, or a
-          // single partition of a segment.
-          if (group.size() >= maxSegmentsPerSlice || currentSliceNumDocs > maxDocsPerSlice) {
-            group = null;
-            currentSliceNumDocs = 0;
-          }
-        }
-      }
+      return slicesWithPartitionedSegments(maxDocsPerSlice, maxSegmentsPerSlice, sortedLeaves);
+    }
 
-      LeafSlice[] slices = new LeafSlice[groupedLeafPartitions.size()];
-      int upto = 0;
-      for (List<LeafReaderContextPartition> currentGroup : groupedLeafPartitions) {
-        slices[upto] = new LeafSlice(currentGroup);
-        ++upto;
+    final List<LeafSlice> groupedLeaves = new ArrayList<>();
+    long docSum = 0;
+    List<LeafReaderContextPartition> group = null;
+    for (LeafReaderContext ctx : sortedLeaves) {
+      final int maxDoc = ctx.reader().maxDoc();
+      var partition = LeafReaderContextPartition.createForEntireSegment(ctx);
+      if (maxDoc > maxDocsPerSlice) {
+        assert group == null;
+        groupedLeaves.add(new LeafSlice(Collections.singletonList(partition)));
+      } else {
+        if (group == null) {
+          group = new ArrayList<>();
+        }
+        group.add(partition);
+        docSum += maxDoc;
+        if (docSum > maxDocsPerSlice || group.size() >= maxSegmentsPerSlice) {
+          groupedLeaves.add(new LeafSlice(group));
+          group = null;
+          docSum = 0;
+        }
       }
-      return slices;
     }
+    if (group != null) {
+      groupedLeaves.add(new LeafSlice(group));
+    }
+    return groupedLeaves.toArray(LeafSlice.EMPTY_ARRAY);
+  }
 
-    final List<List<LeafReaderContext>> groupedLeaves = new ArrayList<>();
-    long docSum = 0;
-    List<LeafReaderContext> group = null;
+  private static LeafSlice[] slicesWithPartitionedSegments(
+      int maxDocsPerSlice, int maxSegmentsPerSlice, LeafReaderContext[] sortedLeaves) {
+    final List<List<LeafReaderContextPartition>> groupedLeafPartitions = new ArrayList<>();
+    int currentSliceNumDocs = 0;
+    List<LeafReaderContextPartition> group = null;
     for (LeafReaderContext ctx : sortedLeaves) {
       if (ctx.reader().maxDoc() > maxDocsPerSlice) {
         assert group == null;
-        groupedLeaves.add(Collections.singletonList(ctx));
+        // if the segment does not fit in a single slice, we split it into maximum 5 partitions of
+        // equal size
+        int numSlices = Math.min(5, Math.ceilDiv(ctx.reader().maxDoc(), maxDocsPerSlice));
+        int numDocs = ctx.reader().maxDoc() / numSlices;
+        int maxDocId = numDocs;
+        int minDocId = 0;
+        for (int i = 0; i < numSlices - 1; i++) {
+          groupedLeafPartitions.add(
+              Collections.singletonList(
+                  LeafReaderContextPartition.createFromAndTo(ctx, minDocId, maxDocId)));
+          minDocId = maxDocId;
+          maxDocId += numDocs;
+        }
+        // the last slice gets all the remaining docs
+        groupedLeafPartitions.add(
+            Collections.singletonList(
+                LeafReaderContextPartition.createFromAndTo(ctx, minDocId, ctx.reader().maxDoc())));
       } else {
         if (group == null) {
           group = new ArrayList<>();
-          group.add(ctx);
-
-          groupedLeaves.add(group);
-        } else {
-          group.add(ctx);
+          groupedLeafPartitions.add(group);
         }
-
-        docSum += ctx.reader().maxDoc();
-        if (group.size() >= maxSegmentsPerSlice || docSum > maxDocsPerSlice) {
+        group.add(LeafReaderContextPartition.createForEntireSegment(ctx));
+
+        currentSliceNumDocs += ctx.reader().maxDoc();
+        // We only split a segment when it does not fit entirely in a slice. We don't partition
+        // the
+        // segment that makes the current slice (which holds multiple segments) go over
+        // maxDocsPerSlice. This means that a slice either contains multiple entire segments, or a
+        // single partition of a segment.
+        if (group.size() >= maxSegmentsPerSlice || currentSliceNumDocs > maxDocsPerSlice) {
           group = null;
-          docSum = 0;
+          currentSliceNumDocs = 0;
         }
       }
     }
-    LeafSlice[] slices = new LeafSlice[groupedLeaves.size()];
+    LeafSlice[] slices = new LeafSlice[groupedLeafPartitions.size()];
     int upto = 0;
-    for (List<LeafReaderContext> currentLeaf : groupedLeaves) {
-      slices[upto] =
-          new LeafSlice(
-              new ArrayList<>(
-                  currentLeaf.stream()
-                      .map(LeafReaderContextPartition::createForEntireSegment)
-                      .toList()));
+    for (List<LeafReaderContextPartition> currentGroup : groupedLeafPartitions) {
+      slices[upto] = new LeafSlice(currentGroup);

Review Comment:
   Done in https://github.com/apache/lucene/pull/14336, sorry for the delay on this one :) After benchmarking a little more as of late, this is more valuable than ever for us. For large segment-count use cases especially, this logic tends to make up a non-trivial part of the total execution time of the query phase without this fix (up to on the order of 10%). With all the fixes in here it ends up close to an order of magnitude cheaper in ES use cases, seemingly due to instruction-cache/code-size issues as it turned out :)
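   To make the partitioning arithmetic concrete, here is a small self-contained sketch of just the range computation from `slicesWithPartitionedSegments` (a hypothetical `PartitionSplitSketch` class with made-up numbers, not the actual Lucene code): a segment larger than `maxDocsPerSlice` is cut into at most 5 doc-id ranges of roughly equal width, and the last range absorbs the remainder of the integer division.

```java
import java.util.ArrayList;
import java.util.List;

// Hypothetical illustration class, not part of Lucene.
public class PartitionSplitSketch {

  // Mirrors the range computation in the diff above: at most 5 partitions,
  // each numDocs wide, with the last one extended to cover the remainder.
  static List<int[]> split(int maxDoc, int maxDocsPerSlice) {
    int numSlices = Math.min(5, Math.ceilDiv(maxDoc, maxDocsPerSlice)); // Java 18+
    int numDocs = maxDoc / numSlices; // integer division; remainder handled below
    List<int[]> ranges = new ArrayList<>();
    int minDocId = 0;
    int maxDocId = numDocs;
    for (int i = 0; i < numSlices - 1; i++) {
      ranges.add(new int[] {minDocId, maxDocId}); // [min, max) doc-id range
      minDocId = maxDocId;
      maxDocId += numDocs;
    }
    // the last range gets all the remaining docs
    ranges.add(new int[] {minDocId, maxDoc});
    return ranges;
  }

  public static void main(String[] args) {
    // Made-up sizes: a 1,000,003-doc segment with maxDocsPerSlice = 250,000.
    for (int[] r : split(1_000_003, 250_000)) {
      System.out.println("[" + r[0] + ", " + r[1] + ")");
    }
  }
}
```

   With the made-up inputs above this prints `[0, 200000)`, `[200000, 400000)`, `[400000, 600000)`, `[600000, 800000)` and `[800000, 1000003)`: four even ranges plus a last one that also covers the 3 leftover docs.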