zhaih commented on code in PR #13542: URL: https://github.com/apache/lucene/pull/13542#discussion_r1702166806
########## lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java: ########## @@ -890,11 +945,70 @@ public static class LeafSlice { * * @lucene.experimental */ - public final LeafReaderContext[] leaves; + public final LeafReaderContextPartition[] leaves; - public LeafSlice(List<LeafReaderContext> leavesList) { - Collections.sort(leavesList, Comparator.comparingInt(l -> l.docBase)); - this.leaves = leavesList.toArray(new LeafReaderContext[0]); + public LeafSlice(List<LeafReaderContextPartition> leafReaderContextPartitions) { + leafReaderContextPartitions.sort(Comparator.comparingInt(l -> l.ctx.docBase)); + // TODO should we sort by minDocId too? + this.leaves = leafReaderContextPartitions.toArray(new LeafReaderContextPartition[0]); + } + + /** + * Returns the total number of docs that a slice targets, by summing the number of docs that + * each of its leaf context partitions targets. + */ + public int getNumDocs() { + return Arrays.stream(leaves) Review Comment: We don't need to loop through it every time, just remember the sum at construction? ########## lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java: ########## @@ -890,11 +945,70 @@ public static class LeafSlice { * * @lucene.experimental */ - public final LeafReaderContext[] leaves; + public final LeafReaderContextPartition[] leaves; - public LeafSlice(List<LeafReaderContext> leavesList) { - Collections.sort(leavesList, Comparator.comparingInt(l -> l.docBase)); - this.leaves = leavesList.toArray(new LeafReaderContext[0]); + public LeafSlice(List<LeafReaderContextPartition> leafReaderContextPartitions) { + leafReaderContextPartitions.sort(Comparator.comparingInt(l -> l.ctx.docBase)); + // TODO should we sort by minDocId too? + this.leaves = leafReaderContextPartitions.toArray(new LeafReaderContextPartition[0]); + } + + /** + * Returns the total number of docs that a slice targets, by summing the number of docs that + * each of its leaf context partitions targets. 
+ */ + public int getNumDocs() { + return Arrays.stream(leaves) + .map(LeafReaderContextPartition::getNumDocs) + .reduce(Integer::sum) + .get(); + } + } + + /** + * Holds information about a specific leaf context and the corresponding range of doc ids to + * search within. + * + * @lucene.experimental + */ + public static final class LeafReaderContextPartition { Review Comment: Can we maybe move `LeafSlice` and this class into a standalone file? The current `IndexSearcher` is already too long... ########## lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java: ########## @@ -1024,10 +1138,50 @@ public LeafSlice[] get() { leafSlices = Objects.requireNonNull( sliceProvider.apply(leaves), "slices computed by the provider is null"); + checkSlices(leafSlices); } } } return leafSlices; } + + /** + * Enforce that there aren't multiple slices pointing to the same physical segment. It is a + * requirement that {@link Collector#getLeafCollector(LeafReaderContext)} gets called once per + * leaf context. Also, it does not make sense to partition a segment to then search those + * partitions as part of the same slice, because the goal of partitioning is parallel searching + * which happens at the slices level. + */ + private static void checkSlices(LeafSlice[] leafSlices) { + for (LeafSlice leafSlice : leafSlices) { + Set<LeafReaderContext> distinctLeaves = new HashSet<>(); + for (LeafReaderContextPartition leafPartition : leafSlice.leaves) { + distinctLeaves.add(leafPartition.ctx); + } + if (leafSlice.leaves.length != distinctLeaves.size()) { + throw new IllegalStateException( + "The same slice targets multiple partitions of the same leaf reader. " + + "A segment should rather get partitioned to be searched concurrently from as many slices as the " + + "number of partitions it is split into."); + } + } + } + } + + /** + * Whether the provided leaf slices include leaf partitions. 
+ * + * @param leafSlices the leaf slices + * @return true if the provided leaf slices include leaf partition, false if they all target + * entire segments. + */ + public static boolean hasLeafPartitions(IndexSearcher.LeafSlice[] leafSlices) { Review Comment: If we allow passing in the total leaf number from indexReader, we only need to sum up to see whether `numPartitions == numLeaves` instead of using a HashMap? ########## lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java: ########## @@ -890,11 +945,70 @@ public static class LeafSlice { * * @lucene.experimental */ - public final LeafReaderContext[] leaves; + public final LeafReaderContextPartition[] leaves; - public LeafSlice(List<LeafReaderContext> leavesList) { - Collections.sort(leavesList, Comparator.comparingInt(l -> l.docBase)); - this.leaves = leavesList.toArray(new LeafReaderContext[0]); + public LeafSlice(List<LeafReaderContextPartition> leafReaderContextPartitions) { + leafReaderContextPartitions.sort(Comparator.comparingInt(l -> l.ctx.docBase)); + // TODO should we sort by minDocId too? + this.leaves = leafReaderContextPartitions.toArray(new LeafReaderContextPartition[0]); + } + + /** + * Returns the total number of docs that a slice targets, by summing the number of docs that + * each of its leaf context partitions targets. + */ + public int getNumDocs() { Review Comment: Also maybe use `maxDocs` instead? This number is not counting any deletions right? ########## lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java: ########## @@ -890,11 +945,70 @@ public static class LeafSlice { * Review Comment: Can we add javadoc to this class explaining the relationship between `LeafSlice`, `LeafReaderContextPartition` and `LeafReader`? 
########## lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java: ########## @@ -890,11 +945,70 @@ public static class LeafSlice { * * @lucene.experimental */ - public final LeafReaderContext[] leaves; + public final LeafReaderContextPartition[] leaves; - public LeafSlice(List<LeafReaderContext> leavesList) { - Collections.sort(leavesList, Comparator.comparingInt(l -> l.docBase)); - this.leaves = leavesList.toArray(new LeafReaderContext[0]); + public LeafSlice(List<LeafReaderContextPartition> leafReaderContextPartitions) { + leafReaderContextPartitions.sort(Comparator.comparingInt(l -> l.ctx.docBase)); + // TODO should we sort by minDocId too? + this.leaves = leafReaderContextPartitions.toArray(new LeafReaderContextPartition[0]); + } + + /** + * Returns the total number of docs that a slice targets, by summing the number of docs that + * each of its leaf context partitions targets. + */ + public int getNumDocs() { + return Arrays.stream(leaves) + .map(LeafReaderContextPartition::getNumDocs) + .reduce(Integer::sum) + .get(); + } + } + + /** + * Holds information about a specific leaf context and the corresponding range of doc ids to + * search within. + * + * @lucene.experimental + */ + public static final class LeafReaderContextPartition { + private final int minDocId; + private final int maxDocId; + private final int numDocs; + public final LeafReaderContext ctx; + + private LeafReaderContextPartition( + LeafReaderContext leafReaderContext, int minDocId, int maxDocId, int numDocs) { + this.ctx = leafReaderContext; + this.minDocId = minDocId; + this.maxDocId = maxDocId; + this.numDocs = numDocs; Review Comment: +1 I think we only need either `maxDocId` or `numDocs` not both? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org