carlosdelest commented on code in PR #13430: URL: https://github.com/apache/lucene/pull/13430#discussion_r1626064947
########## lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java: ########## @@ -522,21 +550,28 @@ private MergeSpecification doFindMerges( final List<SegmentCommitInfo> candidate = new ArrayList<>(); boolean hitTooLarge = false; long bytesThisMerge = 0; + long docCountThisMerge = 0; for (int idx = startIdx; idx < sortedEligible.size() && candidate.size() < mergeFactor - && bytesThisMerge < maxMergedSegmentBytes; + && bytesThisMerge < maxMergedSegmentBytes + && docCountThisMerge < allowedDocCount; idx++) { final SegmentSizeAndDocs segSizeDocs = sortedEligible.get(idx); final long segBytes = segSizeDocs.sizeInBytes; - + int segDocCount = segSizeDocs.maxDoc - segSizeDocs.delCount; + if (docCountThisMerge + segDocCount > allowedDocCount) { + // We don't want to merge segments that will produce more documents than allowedDocCount + continue; Review Comment: Makes sense, thanks! ########## lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java: ########## @@ -77,20 +77,23 @@ protected void assertSegmentInfos(MergePolicy policy, SegmentInfos infos) throws long levelSizeBytes = Math.max(minSegmentBytes, (long) (tmp.getFloorSegmentMB() * 1024 * 1024)); long bytesLeft = totalBytes; double allowedSegCount = 0; + final int maxNumSegmentsOnHighestTier = + (int) Math.max(tmp.getSegmentsPerTier(), tmp.getTargetSearchConcurrency()); // below we make the assumption that segments that reached the max segment // size divided by 2 don't need merging anymore int mergeFactor = (int) Math.min(tmp.getSegmentsPerTier(), tmp.getMaxMergeAtOnce()); while (true) { final double segCountLevel = bytesLeft / (double) levelSizeBytes; - if (segCountLevel < tmp.getSegmentsPerTier() || levelSizeBytes >= maxMergedSegmentBytes / 2) { + if (segCountLevel < maxNumSegmentsOnHighestTier Review Comment: Good catch - updating ########## lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java: ########## @@ -522,21 +550,28 @@ private MergeSpecification doFindMerges( final List<SegmentCommitInfo> candidate = new ArrayList<>(); Review Comment: > Merging is only necessary if there are more segments than necessary or more deletes than necessary. The problem is that sortedEligible has filtered out the segments that cannot be merged (like those that are bigger than max segment size), so this condition will skip trying to merge even if we have more segments than allowed. So for example we may have 17 segments total, from which we can just have 12 sortedEligibles, and have 15 allowed segment count. We would never try to merge the eligible segments as they're less than the number of allowed segment counts. Should we take into account the total number of segments vs the total eligibles for this condition? ########## lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java: ########## @@ -111,11 +114,29 @@ protected void assertSegmentInfos(MergePolicy policy, SegmentInfos infos) throws } } + // There can be more segments if we can't merge docs - they are balanced between segments + int maxDocsPerSegment = tmp.getMaxAllowedDocs(infos.totalMaxDoc()); Review Comment: Yes, we should. Thanks! ########## lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java: ########## @@ -111,11 +114,29 @@ protected void assertSegmentInfos(MergePolicy policy, SegmentInfos infos) throws } } + // There can be more segments if we can't merge docs - they are balanced between segments + int maxDocsPerSegment = tmp.getMaxAllowedDocs(infos.totalMaxDoc()); + List<Integer> segmentDocs = + infos.asList().stream() + .map(info -> info.info.maxDoc() - info.getDelCount()) + .sorted() + .toList(); + int numEligible = 0; + int currentSum = 0; + for (int i = 0; i < segmentDocs.size(); i++) { + currentSum += segmentDocs.get(i); + if (currentSum > maxDocsPerSegment) { + break; + } + numEligible++; + } + boolean eligibleDocsMerge = numEligible > 1; Review Comment: Correct, applied the change. ########## lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java: ########## @@ -916,14 +951,13 @@ public MergeSpecification findForcedDeletesMerges(SegmentInfos infos, MergeConte final Set<SegmentCommitInfo> merging = mergeContext.getMergingSegments(); boolean haveWork = false; + int totalDelCount = 0; for (SegmentCommitInfo info : infos) { int delCount = mergeContext.numDeletesToMerge(info); assert assertDelCount(delCount, info); + totalDelCount += delCount; double pctDeletes = 100. * ((double) delCount) / info.info.maxDoc(); - if (pctDeletes > forceMergeDeletesPctAllowed && !merging.contains(info)) { - haveWork = true; - break; - } + haveWork = haveWork || (pctDeletes > forceMergeDeletesPctAllowed && !merging.contains(info)); Review Comment: The idea is to use the loop to count all delete counts, and use it for calculating the max number of allowed docs down below. Is there any other way of retrieving the number of deleted docs without looping over all `SegmentCommitInfo`s? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org