gortiz commented on code in PR #8979:
URL: https://github.com/apache/pinot/pull/8979#discussion_r980025857


##########
pinot-core/src/main/java/org/apache/pinot/core/plan/SelectionPlanNode.java:
##########
@@ -50,67 +55,119 @@ public Operator<SelectionResultsBlock> run() {
     List<ExpressionContext> expressions = 
SelectionOperatorUtils.extractExpressions(_queryContext, _indexSegment);
     int limit = _queryContext.getLimit();
 
-    if (limit > 0) {
-      List<OrderByExpressionContext> orderByExpressions = 
_queryContext.getOrderByExpressions();
-      if (orderByExpressions == null) {
-        // Selection only
-        TransformOperator transformOperator = new 
TransformPlanNode(_indexSegment, _queryContext, expressions,
-            Math.min(limit, DocIdSetPlanNode.MAX_DOC_PER_CALL)).run();
-        return new SelectionOnlyOperator(_indexSegment, _queryContext, 
expressions, transformOperator);
-      } else {
-        // Selection order-by
-        if (isAllOrderByColumnsSorted(orderByExpressions)) {
-          // All order-by columns are sorted, no need to sort the records
-          TransformOperator transformOperator = new 
TransformPlanNode(_indexSegment, _queryContext, expressions,
-              Math.min(limit + _queryContext.getOffset(), 
DocIdSetPlanNode.MAX_DOC_PER_CALL)).run();
-          return new SelectionOrderByOperator(_indexSegment, _queryContext, 
expressions, transformOperator, true);
-        } else if (orderByExpressions.size() == expressions.size()) {
-          // All output expressions are ordered
-          TransformOperator transformOperator =
-              new TransformPlanNode(_indexSegment, _queryContext, expressions, 
DocIdSetPlanNode.MAX_DOC_PER_CALL).run();
-          return new SelectionOrderByOperator(_indexSegment, _queryContext, 
expressions, transformOperator, false);
-        } else {
-          // Not all output expressions are ordered, only fetch the order-by 
expressions and docId to avoid the
-          // unnecessary data fetch
-          List<ExpressionContext> expressionsToTransform = new 
ArrayList<>(orderByExpressions.size());
-          for (OrderByExpressionContext orderByExpression : 
orderByExpressions) {
-            expressionsToTransform.add(orderByExpression.getExpression());
-          }
-          TransformOperator transformOperator =
-              new TransformPlanNode(_indexSegment, _queryContext, 
expressionsToTransform,
-                  DocIdSetPlanNode.MAX_DOC_PER_CALL).run();
-          return new SelectionOrderByOperator(_indexSegment, _queryContext, 
expressions, transformOperator, false);
-        }
-      }
-    } else {
+    if (limit == 0) {
       // Empty selection (LIMIT 0)
       TransformOperator transformOperator = new 
TransformPlanNode(_indexSegment, _queryContext, expressions, 0).run();
       return new EmptySelectionOperator(_indexSegment, expressions, 
transformOperator);
     }
+    List<OrderByExpressionContext> orderByExpressions = 
_queryContext.getOrderByExpressions();
+    if (orderByExpressions == null) {
+      // Selection only
+      // ie: SELECT ... FROM Table WHERE ... LIMIT 10
+      int maxDocsPerCall = Math.min(limit, DocIdSetPlanNode.MAX_DOC_PER_CALL);
+      TransformPlanNode planNode = new TransformPlanNode(_indexSegment, 
_queryContext, expressions, maxDocsPerCall);
+      TransformOperator transformOperator = planNode.run();
+
+      return new SelectionOnlyOperator(_indexSegment, _queryContext, 
expressions, transformOperator);
+    }
+    int numOrderByExpressions = orderByExpressions.size();
+    // Although it is a break of abstraction, some code, specially merging, 
assumes that if there is an order by
+    // expression the operator will return a block whose selection result is a 
priority queue.
+    int sortedColumnsPrefixSize = getSortedColumnsPrefix(orderByExpressions, 
_queryContext.isNullHandlingEnabled());
+    if (sortedColumnsPrefixSize > 0 && !isSkipOrderByOptimization()) {
+      // The first order by expressions are sorted (either asc or desc).
+      // ie: SELECT ... FROM Table WHERE predicates ORDER BY sorted_column 
DESC LIMIT 10 OFFSET 5
+      // or: SELECT ... FROM Table WHERE predicates ORDER BY sorted_column, 
not_sorted LIMIT 10 OFFSET 5
+      // but not SELECT ... FROM Table WHERE predicates ORDER BY not_sorted, 
sorted_column LIMIT 10 OFFSET 5
+      if (orderByExpressions.get(0).isAsc()) {
+        int maxDocsPerCall = Math.min(limit + _queryContext.getOffset(), 
DocIdSetPlanNode.MAX_DOC_PER_CALL);
+        TransformPlanNode planNode = new TransformPlanNode(_indexSegment, 
_queryContext, expressions, maxDocsPerCall);
+        TransformOperator transformOperator = planNode.run();
+        return new SelectionPartiallyOrderedByAscOperator(_indexSegment, 
_queryContext, expressions,
+            transformOperator, sortedColumnsPrefixSize);
+      } else {
+        int maxDocsPerCall = DocIdSetPlanNode.MAX_DOC_PER_CALL;

Review Comment:
   I'm not sure about that. The cost of this optimization should almost always 
be better than the previous one. Trying to bring light in the topic, I've 
created a new benchmark to test that, these are the results 
(_orderByAlgorithm=naive implies to use the brute force algorithm, null means 
try to use all optimizations):
   
   ```
   
   Benchmark                                        (_limit)  (_numRows)  
(_orderByAlgorithm)  (_primaryRepetitions)  Mode  Cnt     Score     Error  Units
   BenchmarkOrderByDescQueries.sortedDesc                101     1500000        
        naive                      1  avgt    5   291.745 ±  32.896  ms/op
   BenchmarkOrderByDescQueries.sortedDesc                101     1500000        
        naive                   1000  avgt    5    45.143 ±   3.794  ms/op
   BenchmarkOrderByDescQueries.sortedDesc                101     1500000        
         null                      1  avgt    5    11.280 ±   0.688  ms/op
   BenchmarkOrderByDescQueries.sortedDesc                101     1500000        
         null                   1000  avgt    5     7.919 ±   1.100  ms/op
   BenchmarkOrderByDescQueries.sortedDesc              10100     1500000        
        naive                      1  avgt    5   486.737 ±  24.255  ms/op
   BenchmarkOrderByDescQueries.sortedDesc              10100     1500000        
        naive                   1000  avgt    5   472.893 ±  18.295  ms/op
   BenchmarkOrderByDescQueries.sortedDesc              10100     1500000        
         null                      1  avgt    5    46.034 ±   9.756  ms/op
   BenchmarkOrderByDescQueries.sortedDesc              10100     1500000        
         null                   1000  avgt    5    42.245 ±   8.914  ms/op
   BenchmarkOrderByDescQueries.sortedDesc             101000     1500000        
        naive                      1  avgt    5   667.891 ±  75.390  ms/op
   BenchmarkOrderByDescQueries.sortedDesc             101000     1500000        
        naive                   1000  avgt    5   632.281 ±  68.473  ms/op
   BenchmarkOrderByDescQueries.sortedDesc             101000     1500000        
         null                      1  avgt    5   254.195 ±  28.543  ms/op
   BenchmarkOrderByDescQueries.sortedDesc             101000     1500000        
         null                   1000  avgt    5   213.058 ±   6.653  ms/op
   BenchmarkOrderByDescQueries.sortedDescPartially       101     1500000        
        naive                      1  avgt    5   280.261 ±  28.662  ms/op
   BenchmarkOrderByDescQueries.sortedDescPartially       101     1500000        
        naive                   1000  avgt    5   181.734 ±   5.478  ms/op
   BenchmarkOrderByDescQueries.sortedDescPartially       101     1500000        
         null                      1  avgt    5    53.713 ±   2.282  ms/op
   BenchmarkOrderByDescQueries.sortedDescPartially       101     1500000        
         null                   1000  avgt    5   101.987 ±  12.397  ms/op
   BenchmarkOrderByDescQueries.sortedDescPartially     10100     1500000        
        naive                      1  avgt    5   495.354 ±  29.019  ms/op
   BenchmarkOrderByDescQueries.sortedDescPartially     10100     1500000        
        naive                   1000  avgt    5   809.116 ±  56.675  ms/op
   BenchmarkOrderByDescQueries.sortedDescPartially     10100     1500000        
         null                      1  avgt    5   144.217 ±   6.340  ms/op
   BenchmarkOrderByDescQueries.sortedDescPartially     10100     1500000        
         null                   1000  avgt    5   307.823 ±  40.541  ms/op
   BenchmarkOrderByDescQueries.sortedDescPartially    101000     1500000        
        naive                      1  avgt    5   622.101 ±  70.437  ms/op
   BenchmarkOrderByDescQueries.sortedDescPartially    101000     1500000        
        naive                   1000  avgt    5  1042.126 ± 125.183  ms/op
   BenchmarkOrderByDescQueries.sortedDescPartially    101000     1500000        
         null                      1  avgt    5   151.553 ±  24.081  ms/op
   BenchmarkOrderByDescQueries.sortedDescPartially    101000     1500000        
         null                   1000  avgt    5   325.945 ±  14.207  ms/op
   ```
   
   I need to study the results a little bit more because some numbers don't 
make sense (for example, with limit 10100, 
`BenchmarkOrderByDescQueries.sortedDesc ` seems to be slower than 
`BenchmarkOrderByDescQueries.sortedDescPartially`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org

Reply via email to