gortiz commented on code in PR #8979: URL: https://github.com/apache/pinot/pull/8979#discussion_r960764456
########## pinot-core/src/main/java/org/apache/pinot/core/operator/query/SelectionPartiallyOrderedByDescOperation.java: ########## @@ -0,0 +1,132 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.operator.query; + +import com.google.common.base.Preconditions; +import com.google.common.base.Supplier; +import java.util.ArrayList; +import java.util.List; +import org.apache.pinot.common.request.context.ExpressionContext; +import org.apache.pinot.core.common.BlockValSet; +import org.apache.pinot.core.common.RowBasedBlockValueFetcher; +import org.apache.pinot.core.operator.blocks.TransformBlock; +import org.apache.pinot.core.operator.transform.TransformOperator; +import org.apache.pinot.core.query.request.context.QueryContext; +import org.apache.pinot.segment.spi.IndexSegment; + + +/** + * An operator for order-by queries DESC that are partially sorted over the sorting keys. + * + * @see LinearSelectionOrderByOperator + */ +public class SelectionPartiallyOrderedByDescOperation extends LinearSelectionOrderByOperator { + + private static final String EXPLAIN_NAME = "SELECT_PARTIAL_ORDER_BY_DESC"; + + private int _numDocsScanned = 0; + private long _numEntriesScannedPostFilter = 0; + + public SelectionPartiallyOrderedByDescOperation(IndexSegment indexSegment, QueryContext queryContext, + List<ExpressionContext> expressions, TransformOperator transformOperator, int sortedExpr) { + super(indexSegment, queryContext, expressions, transformOperator, sortedExpr); + assert queryContext.getOrderByExpressions() != null; + Preconditions.checkArgument(queryContext.getOrderByExpressions().stream() + .filter(expr -> expr.getExpression().getType() == ExpressionContext.Type.IDENTIFIER) + .findFirst() + .orElseThrow(() -> new IllegalArgumentException("The query is not order by identifiers")) + .isDesc(), + "%s can only be used when the first column in order by is DESC", EXPLAIN_NAME); + } + + @Override + protected List<Object[]> fetch(Supplier<ListBuilder> listBuilderSupplier) { + + // Ideally we would use a descending cursor, but we don't actually have them + // Alternatively, we could store all blocks in a list and iterate them in reverse order, but ProjectionBlocks share + // the same DataBlockCache, so they may be ephemeral and being overridden by the next block. + // The only alternative we have right now is to retrieve the last LIMIT elements from each block + + TransformBlock block; + int numColumnsProjected = _transformOperator.getNumColumnsProjected(); + int numExpressions = _expressions.size(); + BlockValSet[] blockValSets = new BlockValSet[numExpressions]; + + List<List<Object[]>> rowsPerBlock = new ArrayList<>(); Review Comment: This should be fixed in the current and future versions. Now what we do is to keep at most 2 `ListBuilder`s whose size is limited by the number of rows the segment must conserve. One of this structures starts empty and contains the best values calculated so far. On the loop that iterates over the blocks in docId order, a new ListBuilder is created and populated with the best rows in the block. After that, still in the loop, the `localBest` elements are added to the new one and becomes the new best values we have. Not all elements in `localBest` have to be added. We can stop when a new partition is discovered and we reached the desired number of rows. Then this newly created ListBuilder substitutes the older `localBest`. This algorithm is easier to understand and it has a cost in memory close to `2 * max(L, min(B, P))` where B is the number of rows in the block, L is `_numRowsToKeep` and P is how many rows are in the larger partition. In the worse case scenario, where all elements are in the same partition and the limit is very large, the consumed memory is close to double the one used by the algorithm we were using before, but it is quite less CPU intensive (and produce less garbage). Please @Jackie-Jiang take a look in case I miss something else. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org