zhaih commented on code in PR #13542: URL: https://github.com/apache/lucene/pull/13542#discussion_r1702166806
########## lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java: ########## @@ -890,11 +945,70 @@ public static class LeafSlice { * * @lucene.experimental */ - public final LeafReaderContext[] leaves; + public final LeafReaderContextPartition[] leaves; - public LeafSlice(List<LeafReaderContext> leavesList) { - Collections.sort(leavesList, Comparator.comparingInt(l -> l.docBase)); - this.leaves = leavesList.toArray(new LeafReaderContext[0]); + public LeafSlice(List<LeafReaderContextPartition> leafReaderContextPartitions) { + leafReaderContextPartitions.sort(Comparator.comparingInt(l -> l.ctx.docBase)); + // TODO should we sort by minDocId too? + this.leaves = leafReaderContextPartitions.toArray(new LeafReaderContextPartition[0]); + } + + /** + * Returns the total number of docs that a slice targets, by summing the number of docs that + * each of its leaf context partitions targets. + */ + public int getNumDocs() { + return Arrays.stream(leaves) Review Comment: We don't need to loop through it every time, just remember the sum at construction? ########## lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java: ########## @@ -890,11 +945,70 @@ public static class LeafSlice { * * @lucene.experimental */ - public final LeafReaderContext[] leaves; + public final LeafReaderContextPartition[] leaves; - public LeafSlice(List<LeafReaderContext> leavesList) { - Collections.sort(leavesList, Comparator.comparingInt(l -> l.docBase)); - this.leaves = leavesList.toArray(new LeafReaderContext[0]); + public LeafSlice(List<LeafReaderContextPartition> leafReaderContextPartitions) { + leafReaderContextPartitions.sort(Comparator.comparingInt(l -> l.ctx.docBase)); + // TODO should we sort by minDocId too? + this.leaves = leafReaderContextPartitions.toArray(new LeafReaderContextPartition[0]); + } + + /** + * Returns the total number of docs that a slice targets, by summing the number of docs that + * each of its leaf context partitions targets. 
+ */ + public int getNumDocs() { + return Arrays.stream(leaves) + .map(LeafReaderContextPartition::getNumDocs) + .reduce(Integer::sum) + .get(); + } + } + + /** + * Holds information about a specific leaf context and the corresponding range of doc ids to + * search within. + * + * @lucene.experimental + */ + public static final class LeafReaderContextPartition { Review Comment: Can we maybe move `LeafSlice` and this class into a standalone file? The current `IndexSearcher` is already too long... ########## lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java: ########## @@ -1024,10 +1138,50 @@ public LeafSlice[] get() { leafSlices = Objects.requireNonNull( sliceProvider.apply(leaves), "slices computed by the provider is null"); + checkSlices(leafSlices); } } } return leafSlices; } + + /** + * Enforce that there aren't multiple slices pointing to the same physical segment. It is a + * requirement that {@link Collector#getLeafCollector(LeafReaderContext)} gets called once per + * leaf context. Also, it does not make sense to partition a segment to then search those + * partitions as part of the same slice, because the goal of partitioning is parallel searching + * which happens at the slices level. + */ + private static void checkSlices(LeafSlice[] leafSlices) { + for (LeafSlice leafSlice : leafSlices) { + Set<LeafReaderContext> distinctLeaves = new HashSet<>(); + for (LeafReaderContextPartition leafPartition : leafSlice.leaves) { + distinctLeaves.add(leafPartition.ctx); + } + if (leafSlice.leaves.length != distinctLeaves.size()) { + throw new IllegalStateException( + "The same slice targets multiple partitions of the same leaf reader. " + + "A segment should rather get partitioned to be searched concurrently from as many slices as the " + + "number of partitions it is split into."); + } + } + } + } + + /** + * Whether the provided leaf slices include leaf partitions. 
+ * + * @param leafSlices the leaf slices + * @return true if the provided leaf slices include leaf partition, false if they all target + * entire segments. + */ + public static boolean hasLeafPartitions(IndexSearcher.LeafSlice[] leafSlices) { Review Comment: If we allow passing in the total leaf number from indexReader, we only need to sum up to see whether `numPartitions == numLeaves` instead of using a HashMap? ########## lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java: ########## @@ -890,11 +945,70 @@ public static class LeafSlice { * * @lucene.experimental */ - public final LeafReaderContext[] leaves; + public final LeafReaderContextPartition[] leaves; - public LeafSlice(List<LeafReaderContext> leavesList) { - Collections.sort(leavesList, Comparator.comparingInt(l -> l.docBase)); - this.leaves = leavesList.toArray(new LeafReaderContext[0]); + public LeafSlice(List<LeafReaderContextPartition> leafReaderContextPartitions) { + leafReaderContextPartitions.sort(Comparator.comparingInt(l -> l.ctx.docBase)); + // TODO should we sort by minDocId too? + this.leaves = leafReaderContextPartitions.toArray(new LeafReaderContextPartition[0]); + } + + /** + * Returns the total number of docs that a slice targets, by summing the number of docs that + * each of its leaf context partitions targets. + */ + public int getNumDocs() { Review Comment: Also maybe use `maxDocs` instead? This number is not counting any deletions right? ########## lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java: ########## @@ -890,11 +945,70 @@ public static class LeafSlice { * Review Comment: Can we add javadoc to this class explaining the relationship between `LeafSlice`, `LeafReaderContextPartition` and `LeafReader`? 
########## lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java: ########## @@ -890,11 +945,70 @@ public static class LeafSlice { * * @lucene.experimental */ - public final LeafReaderContext[] leaves; + public final LeafReaderContextPartition[] leaves; - public LeafSlice(List<LeafReaderContext> leavesList) { - Collections.sort(leavesList, Comparator.comparingInt(l -> l.docBase)); - this.leaves = leavesList.toArray(new LeafReaderContext[0]); + public LeafSlice(List<LeafReaderContextPartition> leafReaderContextPartitions) { + leafReaderContextPartitions.sort(Comparator.comparingInt(l -> l.ctx.docBase)); + // TODO should we sort by minDocId too? + this.leaves = leafReaderContextPartitions.toArray(new LeafReaderContextPartition[0]); + } + + /** + * Returns the total number of docs that a slice targets, by summing the number of docs that + * each of its leaf context partitions targets. + */ + public int getNumDocs() { + return Arrays.stream(leaves) + .map(LeafReaderContextPartition::getNumDocs) + .reduce(Integer::sum) + .get(); + } + } + + /** + * Holds information about a specific leaf context and the corresponding range of doc ids to + * search within. + * + * @lucene.experimental + */ + public static final class LeafReaderContextPartition { + private final int minDocId; + private final int maxDocId; + private final int numDocs; + public final LeafReaderContext ctx; + + private LeafReaderContextPartition( + LeafReaderContext leafReaderContext, int minDocId, int maxDocId, int numDocs) { + this.ctx = leafReaderContext; + this.minDocId = minDocId; + this.maxDocId = maxDocId; + this.numDocs = numDocs; Review Comment: +1 I think we only need either `maxDocId` or `numDocs` not both? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org