gortiz commented on code in PR #8979: URL: https://github.com/apache/pinot/pull/8979#discussion_r980025857
########## pinot-core/src/main/java/org/apache/pinot/core/plan/SelectionPlanNode.java: ########## @@ -50,67 +55,119 @@ public Operator<SelectionResultsBlock> run() { List<ExpressionContext> expressions = SelectionOperatorUtils.extractExpressions(_queryContext, _indexSegment); int limit = _queryContext.getLimit(); - if (limit > 0) { - List<OrderByExpressionContext> orderByExpressions = _queryContext.getOrderByExpressions(); - if (orderByExpressions == null) { - // Selection only - TransformOperator transformOperator = new TransformPlanNode(_indexSegment, _queryContext, expressions, - Math.min(limit, DocIdSetPlanNode.MAX_DOC_PER_CALL)).run(); - return new SelectionOnlyOperator(_indexSegment, _queryContext, expressions, transformOperator); - } else { - // Selection order-by - if (isAllOrderByColumnsSorted(orderByExpressions)) { - // All order-by columns are sorted, no need to sort the records - TransformOperator transformOperator = new TransformPlanNode(_indexSegment, _queryContext, expressions, - Math.min(limit + _queryContext.getOffset(), DocIdSetPlanNode.MAX_DOC_PER_CALL)).run(); - return new SelectionOrderByOperator(_indexSegment, _queryContext, expressions, transformOperator, true); - } else if (orderByExpressions.size() == expressions.size()) { - // All output expressions are ordered - TransformOperator transformOperator = - new TransformPlanNode(_indexSegment, _queryContext, expressions, DocIdSetPlanNode.MAX_DOC_PER_CALL).run(); - return new SelectionOrderByOperator(_indexSegment, _queryContext, expressions, transformOperator, false); - } else { - // Not all output expressions are ordered, only fetch the order-by expressions and docId to avoid the - // unnecessary data fetch - List<ExpressionContext> expressionsToTransform = new ArrayList<>(orderByExpressions.size()); - for (OrderByExpressionContext orderByExpression : orderByExpressions) { - expressionsToTransform.add(orderByExpression.getExpression()); - } - TransformOperator transformOperator = - new TransformPlanNode(_indexSegment, _queryContext, expressionsToTransform, - DocIdSetPlanNode.MAX_DOC_PER_CALL).run(); - return new SelectionOrderByOperator(_indexSegment, _queryContext, expressions, transformOperator, false); - } - } - } else { + if (limit == 0) { // Empty selection (LIMIT 0) TransformOperator transformOperator = new TransformPlanNode(_indexSegment, _queryContext, expressions, 0).run(); return new EmptySelectionOperator(_indexSegment, expressions, transformOperator); } + List<OrderByExpressionContext> orderByExpressions = _queryContext.getOrderByExpressions(); + if (orderByExpressions == null) { + // Selection only + // ie: SELECT ... FROM Table WHERE ... LIMIT 10 + int maxDocsPerCall = Math.min(limit, DocIdSetPlanNode.MAX_DOC_PER_CALL); + TransformPlanNode planNode = new TransformPlanNode(_indexSegment, _queryContext, expressions, maxDocsPerCall); + TransformOperator transformOperator = planNode.run(); + + return new SelectionOnlyOperator(_indexSegment, _queryContext, expressions, transformOperator); + } + int numOrderByExpressions = orderByExpressions.size(); + // Although it is a break of abstraction, some code, specially merging, assumes that if there is an order by + // expression the operator will return a block whose selection result is a priority queue. + int sortedColumnsPrefixSize = getSortedColumnsPrefix(orderByExpressions, _queryContext.isNullHandlingEnabled()); + if (sortedColumnsPrefixSize > 0 && !isSkipOrderByOptimization()) { + // The first order by expressions are sorted (either asc or desc). + // ie: SELECT ... FROM Table WHERE predicates ORDER BY sorted_column DESC LIMIT 10 OFFSET 5 + // or: SELECT ... FROM Table WHERE predicates ORDER BY sorted_column, not_sorted LIMIT 10 OFFSET 5 + // but not SELECT ... FROM Table WHERE predicates ORDER BY not_sorted, sorted_column LIMIT 10 OFFSET 5 + if (orderByExpressions.get(0).isAsc()) { + int maxDocsPerCall = Math.min(limit + _queryContext.getOffset(), DocIdSetPlanNode.MAX_DOC_PER_CALL); + TransformPlanNode planNode = new TransformPlanNode(_indexSegment, _queryContext, expressions, maxDocsPerCall); + TransformOperator transformOperator = planNode.run(); + return new SelectionPartiallyOrderedByAscOperator(_indexSegment, _queryContext, expressions, + transformOperator, sortedColumnsPrefixSize); + } else { + int maxDocsPerCall = DocIdSetPlanNode.MAX_DOC_PER_CALL; Review Comment: I'm not sure about that. The cost of this optimization should almost always be better than the previous one. Trying to bring light in the topic, I've created a new benchmark to test that, these are the results (_orderByAlgorithm=naive implies to use the brute force algorithm, null means try to use all optimizations): ``` Benchmark (_limit) (_numRows) (_orderByAlgorithm) (_primaryRepetitions) Mode Cnt Score Error Units BenchmarkOrderByDescQueries.sortedDesc 101 1500000 naive 1 avgt 5 291.745 ± 32.896 ms/op BenchmarkOrderByDescQueries.sortedDesc 101 1500000 naive 1000 avgt 5 45.143 ± 3.794 ms/op BenchmarkOrderByDescQueries.sortedDesc 101 1500000 null 1 avgt 5 11.280 ± 0.688 ms/op BenchmarkOrderByDescQueries.sortedDesc 101 1500000 null 1000 avgt 5 7.919 ± 1.100 ms/op BenchmarkOrderByDescQueries.sortedDesc 10100 1500000 naive 1 avgt 5 486.737 ± 24.255 ms/op BenchmarkOrderByDescQueries.sortedDesc 10100 1500000 naive 1000 avgt 5 472.893 ± 18.295 ms/op BenchmarkOrderByDescQueries.sortedDesc 10100 1500000 null 1 avgt 5 46.034 ± 9.756 ms/op BenchmarkOrderByDescQueries.sortedDesc 10100 1500000 null 1000 avgt 5 42.245 ± 8.914 ms/op BenchmarkOrderByDescQueries.sortedDesc 101000 1500000 naive 1 avgt 5 667.891 ± 75.390 ms/op BenchmarkOrderByDescQueries.sortedDesc 101000 1500000 naive 1000 avgt 5 632.281 ± 68.473 ms/op BenchmarkOrderByDescQueries.sortedDesc 101000 1500000 null 1 avgt 5 254.195 ± 28.543 ms/op BenchmarkOrderByDescQueries.sortedDesc 101000 1500000 null 1000 avgt 5 213.058 ± 6.653 ms/op BenchmarkOrderByDescQueries.sortedDescPartially 101 1500000 naive 1 avgt 5 280.261 ± 28.662 ms/op BenchmarkOrderByDescQueries.sortedDescPartially 101 1500000 naive 1000 avgt 5 181.734 ± 5.478 ms/op BenchmarkOrderByDescQueries.sortedDescPartially 101 1500000 null 1 avgt 5 53.713 ± 2.282 ms/op BenchmarkOrderByDescQueries.sortedDescPartially 101 1500000 null 1000 avgt 5 101.987 ± 12.397 ms/op BenchmarkOrderByDescQueries.sortedDescPartially 10100 1500000 naive 1 avgt 5 495.354 ± 29.019 ms/op BenchmarkOrderByDescQueries.sortedDescPartially 10100 1500000 naive 1000 avgt 5 809.116 ± 56.675 ms/op BenchmarkOrderByDescQueries.sortedDescPartially 10100 1500000 null 1 avgt 5 144.217 ± 6.340 ms/op BenchmarkOrderByDescQueries.sortedDescPartially 10100 1500000 null 1000 avgt 5 307.823 ± 40.541 ms/op BenchmarkOrderByDescQueries.sortedDescPartially 101000 1500000 naive 1 avgt 5 622.101 ± 70.437 ms/op BenchmarkOrderByDescQueries.sortedDescPartially 101000 1500000 naive 1000 avgt 5 1042.126 ± 125.183 ms/op BenchmarkOrderByDescQueries.sortedDescPartially 101000 1500000 null 1 avgt 5 151.553 ± 24.081 ms/op BenchmarkOrderByDescQueries.sortedDescPartially 101000 1500000 null 1000 avgt 5 325.945 ± 14.207 ms/op ``` I need to study the results a little bit more because some numbers don't make sense (for example, with limit 10100, `BenchmarkOrderByDescQueries.sortedDesc ` seems to be slower than `BenchmarkOrderByDescQueries.sortedDescPartially`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org