[GitHub] [pinot] walterddr commented on a diff in pull request #10286: [multistage] Initial (phase 1) Query runtime for window functions - empty OVER() and OVER(PARTITION BY)

via GitHub Fri, 24 Feb 2023 10:31:51 -0800


walterddr commented on code in PR #10286:
URL: https://github.com/apache/pinot/pull/10286#discussion_r1117427355



##########
pinot-query-runtime/src/main/java/org/apache/pinot/query/runtime/operator/AggregateOperator.java:
##########
@@ -308,6 +303,20 @@ public Object merge(Object agg, Object value) {
     }
   }
 
+  private static class MergeCounts implements Merger {
+
+    @Override
+    public Object initialize(Object other, DataSchema.ColumnDataType dataType) 
{
+      return other == null ? 0 : 1;
+    }
+
+    @Override
+    public Object merge(Object left, Object ignored) {
+      // TODO: COUNT(*) doesn't need to parse right object until we support 
NULL
+      return ((Number) left).doubleValue() + 1;

Review Comment:
   This (NULL handling) should be supported, right?
   ```suggestion
         return ((Number) left).doubleValue() + (ignored == null ? 0 : 1);
   ```
   Also rename the `ignored` parameter to `right`.



##########
pinot-query-runtime/src/main/java/org/apache/pinot/query/runtime/operator/WindowAggregateOperator.java:
##########
@@ -0,0 +1,436 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.query.runtime.operator;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import javax.annotation.Nullable;
+import org.apache.calcite.rel.RelFieldCollation;
+import org.apache.commons.collections.CollectionUtils;
+import org.apache.pinot.common.datablock.DataBlock;
+import org.apache.pinot.common.utils.DataSchema;
+import org.apache.pinot.core.data.table.Key;
+import org.apache.pinot.query.planner.logical.RexExpression;
+import org.apache.pinot.query.routing.VirtualServerAddress;
+import org.apache.pinot.query.runtime.blocks.TransferableBlock;
+import org.apache.pinot.query.runtime.blocks.TransferableBlockUtils;
+import org.apache.pinot.spi.data.FieldSpec;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * The WindowAggregateOperator is used to compute window function aggregations 
over a set of optional
+ * PARTITION BY keys, ORDER BY keys and a FRAME clause. The output data will 
include the projected
+ * columns and in addition will add the aggregation columns to the output data.
+ * [input columns, aggregate result1, ... aggregate resultN]
+ *
+ * The window functions supported today are SUM/COUNT/MIN/MAX aggregations. 
Window functions also include
+ * other types of functions such as rank and value functions.
+ *
+ * Unlike the AggregateOperator which will output one row per group, the 
WindowAggregateOperator
+ * will output as many rows as input rows.
+ *
+ * TODO:
+ *     1. Add support for OVER() clause with ORDER BY only or PARTITION BY 
ORDER BY
+ *     2. Add support for rank window functions
+ *     3. Add support for value window functions
+ *     4. Add support for custom frames
+ */
+public class WindowAggregateOperator extends MultiStageOperator {
+  private static final String EXPLAIN_NAME = "WINDOW";
+  private static final Logger LOGGER = 
LoggerFactory.getLogger(WindowAggregateOperator.class);
+
+  private final MultiStageOperator _inputOperator;
+  private final List<RexExpression> _groupSet;
+  private final OrderSetInfo _orderSetInfo;
+  private final WindowFrame _windowFrame;
+  private final List<RexExpression.FunctionCall> _aggCalls;
+  private final List<RexExpression> _constants;
+  private final DataSchema _resultSchema;
+  private final WindowAccumulator[] _windowAccumulators;
+  private final Map<Key, List<Object[]>> _partitionRows;
+
+  private TransferableBlock _upstreamErrorBlock;
+
+  private int _numRows;
+  private boolean _readyToConstruct;
+  private boolean _hasReturnedWindowAggregateBlock;
+
+  public WindowAggregateOperator(MultiStageOperator inputOperator, 
List<RexExpression> groupSet,
+      List<RexExpression> orderSet, List<RelFieldCollation.Direction> 
orderSetDirection,
+      List<RelFieldCollation.NullDirection> orderSetNullDirection, 
List<RexExpression> aggCalls, int lowerBound,
+      int upperBound, boolean isRows, List<RexExpression> constants, 
DataSchema resultSchema, DataSchema inputSchema,
+      long requestId, int stageId, VirtualServerAddress virtualServerAddress) {
+    this(inputOperator, groupSet, orderSet, orderSetDirection, 
orderSetNullDirection, aggCalls, lowerBound,
+        upperBound, isRows, constants, resultSchema, inputSchema, 
WindowAccumulator.WINDOW_MERGERS,
+        requestId, stageId, virtualServerAddress);
+  }
+
+  @VisibleForTesting
+  public WindowAggregateOperator(MultiStageOperator inputOperator, 
List<RexExpression> groupSet,
+      List<RexExpression> orderSet, List<RelFieldCollation.Direction> 
orderSetDirection,
+      List<RelFieldCollation.NullDirection> orderSetNullDirection, 
List<RexExpression> aggCalls, int lowerBound,
+      int upperBound, boolean isRows, List<RexExpression> constants, 
DataSchema resultSchema, DataSchema inputSchema,
+      Map<String, Function<DataSchema.ColumnDataType, WindowMerger>> mergers, 
long requestId, int stageId,
+      VirtualServerAddress virtualServerAddress) {
+    super(requestId, stageId, virtualServerAddress);
+
+    boolean isPartitionByOnly = isPartitionByOnlyQuery(groupSet, orderSet, 
orderSetDirection, orderSetNullDirection);
+    // TODO: add support for ORDER BY in the OVER() clause
+    Preconditions.checkState(orderSet == null || orderSet.isEmpty() || 
isPartitionByOnly,
+        "Order by is not yet supported in window functions");
+
+    _inputOperator = inputOperator;
+    _groupSet = groupSet;
+    _orderSetInfo = new OrderSetInfo(orderSet, orderSetDirection, 
orderSetNullDirection);
+    _windowFrame = new WindowFrame(lowerBound, upperBound, isRows);
+
+    // TODO: add support for custom frames, and for ORDER BY default frame 
(upperBound => currentRow)
+    Preconditions.checkState(!_windowFrame.isRows(), "Only RANGE type frames 
are supported at present");
+    Preconditions.checkState(_windowFrame.isUnboundedPreceding(),
+        "Only default frame is supported, lowerBound must be UNBOUNDED 
PRECEDING");
+    Preconditions.checkState(_windowFrame.isUnboundedFollowing()
+            || (_windowFrame.isUpperBoundCurrentRow() && isPartitionByOnly),
+        "Only default frame is supported, upperBound must be UNBOUNDED 
FOLLOWING or CURRENT ROW");
+
+    // we expect all agg calls to be aggregate function calls
+    _aggCalls = 
aggCalls.stream().map(RexExpression.FunctionCall.class::cast).collect(Collectors.toList());
+    _constants = constants;
+    _resultSchema = resultSchema;
+
+    // TODO: Not all window functions (e.g. ROW_NUMBER, LAG, etc) need 
aggregations. Such functions should be handled
+    //       differently.
+    _windowAccumulators = new WindowAccumulator[_aggCalls.size()];
+    for (int i = 0; i < _aggCalls.size(); i++) {
+      RexExpression.FunctionCall agg = _aggCalls.get(i);
+      String functionName = agg.getFunctionName();
+      if (!mergers.containsKey(functionName)) {
+        throw new IllegalStateException("Unexpected value: " + functionName);
+      }
+      _windowAccumulators[i] = new WindowAccumulator(agg, mergers, 
functionName, inputSchema);
+    }
+
+    _partitionRows = new HashMap<>();
+
+    _numRows = 0;
+    _readyToConstruct = false;
+    _hasReturnedWindowAggregateBlock = false;
+  }
+
+  @Override
+  public List<MultiStageOperator> getChildOperators() {
+    return ImmutableList.of(_inputOperator);
+  }
+
+  @Nullable
+  @Override
+  public String toExplainString() {
+    return EXPLAIN_NAME;
+  }
+
+  @Override
+  protected TransferableBlock getNextBlock() {
+    try {
+      if (!_readyToConstruct && !consumeInputBlocks()) {
+        return TransferableBlockUtils.getNoOpTransferableBlock();
+      }
+
+      if (_upstreamErrorBlock != null) {
+        return _upstreamErrorBlock;
+      }
+
+      if (!_hasReturnedWindowAggregateBlock) {
+        return produceWindowAggregateBlock();
+      } else {
+        // TODO: Move to close call.
+        return TransferableBlockUtils.getEndOfStreamTransferableBlock();
+      }
+    } catch (Exception e) {
+      LOGGER.error("Caught exception while executing 
WindowAggregationOperator, returning an error block", e);
+      return TransferableBlockUtils.getErrorTransferableBlock(e);
+    }
+  }
+
+  private boolean isPartitionByOnlyQuery(List<RexExpression> groupSet, 
List<RexExpression> orderSet,
+      List<RelFieldCollation.Direction> orderSetDirection,
+      List<RelFieldCollation.NullDirection> orderSetNullDirection) {
+    if (CollectionUtils.isEmpty(orderSet)) {
+      return true;
+    }
+
+    if (CollectionUtils.isEmpty(groupSet) || (groupSet.size() != 
orderSet.size())) {
+      return false;
+    }
+
+    Set<Integer> partitionByInputRefIndexes = new HashSet<>();
+    Set<Integer> orderByInputRefIndexes = new HashSet<>();
+    for (int i = 0; i < groupSet.size(); i++) {
+      partitionByInputRefIndexes.add(((RexExpression.InputRef) 
groupSet.get(i)).getIndex());
+      orderByInputRefIndexes.add(((RexExpression.InputRef) 
orderSet.get(i)).getIndex());
+    }
+
+    boolean isPartitionByOnly = 
partitionByInputRefIndexes.equals(orderByInputRefIndexes);
+    if (isPartitionByOnly) {
+      // Check the direction and null direction to ensure default ordering on 
the order by keys, which are:
+      // Direction: ASC
+      // Null Direction: LAST
+      for (int i = 0; i < orderSet.size(); i++) {
+        if (orderSetDirection.get(i) == RelFieldCollation.Direction.DESCENDING
+            || orderSetNullDirection.get(i) == 
RelFieldCollation.NullDirection.FIRST) {
+          isPartitionByOnly = false;
+          break;
+        }
+      }
+    }
+    return isPartitionByOnly;
+  }
+
+  private TransferableBlock produceWindowAggregateBlock() {
+    List<Object[]> rows = new ArrayList<>(_numRows);
+    for (Map.Entry<Key, List<Object[]>> e : _partitionRows.entrySet()) {
+      Key partitionKey = e.getKey();
+      List<Object[]> rowList = e.getValue();
+      for (Object[] existingRow : rowList) {
+        Object[] row = new Object[existingRow.length + _aggCalls.size()];
+        System.arraycopy(existingRow, 0, row, 0, existingRow.length);
+        for (int i = 0; i < _windowAccumulators.length; i++) {
+          row[i + existingRow.length] = 
_windowAccumulators[i]._results.get(partitionKey);
+        }
+        rows.add(row);
+      }
+    }
+    _hasReturnedWindowAggregateBlock = true;
+    if (rows.size() == 0) {
+      return new TransferableBlock(Collections.emptyList(), _resultSchema, 
DataBlock.Type.ROW);
+    } else {
+      return new TransferableBlock(rows, _resultSchema, DataBlock.Type.ROW);
+    }
+  }
+
+  /**
+   * @return whether or not the operator is ready to move on (EOS or ERROR)
+   */
+  private boolean consumeInputBlocks() {
+    TransferableBlock block = _inputOperator.nextBlock();
+    while (!block.isNoOpBlock()) {
+      // setting upstream error block
+      if (block.isErrorBlock()) {
+        _upstreamErrorBlock = block;
+        return true;
+      } else if (block.isEndOfStreamBlock()) {
+        _readyToConstruct = true;
+        return true;
+      }
+
+      List<Object[]> container = block.getContainer();
+      for (Object[] row : container) {
+        _numRows++;
+        // TODO: Revisit the aggregation logic once ORDER BY inside OVER() 
support is added
+        Key key = extractRowKey(row, _groupSet);
+        _partitionRows.putIfAbsent(key, new ArrayList<>());
+        _partitionRows.get(key).add(row);
+        for (int i = 0; i < _aggCalls.size(); i++) {
+          _windowAccumulators[i].accumulate(key, row);
+        }
+      }
+      block = _inputOperator.nextBlock();
+    }
+    return false;
+  }
+
+  private static Key extractRowKey(Object[] row, List<RexExpression> groupSet) 
{
+    Object[] keyElements = new Object[groupSet.size()];
+    for (int i = 0; i < groupSet.size(); i++) {
+      keyElements[i] = row[((RexExpression.InputRef) 
groupSet.get(i)).getIndex()];
+    }
+    return new Key(keyElements);
+  }
+
+  private static class OrderSetInfo {
+    final List<RexExpression> _orderSet;
+    final List<RelFieldCollation.Direction> _orderSetDirection;
+    final List<RelFieldCollation.NullDirection> _orderSetNullDirection;
+
+    OrderSetInfo(List<RexExpression> orderSet, 
List<RelFieldCollation.Direction> orderSetDirection,
+        List<RelFieldCollation.NullDirection> orderSetNullDirection) {
+      _orderSet = orderSet;
+      _orderSetDirection = orderSetDirection;
+      _orderSetNullDirection = orderSetNullDirection;
+    }
+
+    List<RexExpression> getOrderSet() {
+      return _orderSet;
+    }
+
+    List<RelFieldCollation.Direction> getOrderSetDirection() {
+      return _orderSetDirection;
+    }
+
+    List<RelFieldCollation.NullDirection> getOrderSetNullDirection() {
+      return _orderSetNullDirection;
+    }
+  }
+
+  private static class WindowFrame {
+    final int _lowerBound;
+    final int _upperBound;
+    final boolean _isRows;
+
+    WindowFrame(int lowerBound, int upperBound, boolean isRows) {
+      _lowerBound = lowerBound;
+      _upperBound = upperBound;
+      _isRows = isRows;
+    }
+
+    boolean isUnboundedPreceding() {
+      return _lowerBound == Integer.MIN_VALUE;
+    }
+
+    boolean isUnboundedFollowing() {
+      return _upperBound == Integer.MAX_VALUE;
+    }
+
+    boolean isUpperBoundCurrentRow() {
+      return _upperBound == 0;
+    }
+
+    boolean isRows() {
+      return _isRows;
+    }
+
+    int getLowerBound() {
+      return _lowerBound;
+    }
+
+    int getUpperBound() {
+      return _upperBound;
+    }
+  }
+
+  private static Object mergeSum(Object left, Object right) {
+    return ((Number) left).doubleValue() + ((Number) right).doubleValue();
+  }
+
+  private static Object mergeMin(Object left, Object right) {
+    return Math.min(((Number) left).doubleValue(), ((Number) 
right).doubleValue());
+  }
+
+  private static Object mergeMax(Object left, Object right) {
+    return Math.max(((Number) left).doubleValue(), ((Number) 
right).doubleValue());
+  }
+
+  private static class MergeCount implements WindowMerger {
+
+    @Override
+    public Object initialize(Object other, DataSchema.ColumnDataType dataType) 
{
+      return other == null ? 0 : 1d;
+    }
+
+    @Override
+    public Object merge(Object left, Object ignored) {
+      // TODO: COUNT(*) doesn't need to parse right object until we support 
NULL
+      return ((Number) left).doubleValue() + 1;
+    }
+  }
+
+  interface WindowMerger {
+    /**
+     * Initializes the merger based on the first input
+     */
+    default Object initialize(Object other, DataSchema.ColumnDataType 
dataType) {
+      return other == null ? ((Number) 
dataType.getNullPlaceholder()).doubleValue() : ((Number) other).doubleValue();
+    }
+
+    /**
+     * Merges the existing aggregate (the result of {@link #initialize(Object, 
DataSchema.ColumnDataType)}) with
+     * the new value coming in (which may be an aggregate in and of itself).
+     */
+    Object merge(Object agg, Object value);
+  }
+
+  private static class WindowAccumulator {
+    private static final Map<String, Function<DataSchema.ColumnDataType, 
WindowMerger>> WINDOW_MERGERS =
+        ImmutableMap.<String, Function<DataSchema.ColumnDataType, 
WindowMerger>>builder()
+            .put("SUM", cdt -> WindowAggregateOperator::mergeSum)
+            .put("$SUM", cdt -> WindowAggregateOperator::mergeSum)
+            .put("$SUM0", cdt -> WindowAggregateOperator::mergeSum)
+            .put("MIN", cdt -> WindowAggregateOperator::mergeMin)
+            .put("$MIN", cdt -> WindowAggregateOperator::mergeMin)
+            .put("$MIN0", cdt -> WindowAggregateOperator::mergeMin)
+            .put("MAX", cdt -> WindowAggregateOperator::mergeMax)
+            .put("$MAX", cdt -> WindowAggregateOperator::mergeMax)
+            .put("$MAX0", cdt -> WindowAggregateOperator::mergeMax)
+            .put("COUNT", cdt -> new MergeCount())
+            .build();

Review Comment:
   Let's extract these out into a common `Accumulator` interface and reuse it in 
both the agg operator and the window operator.



##########
pinot-query-runtime/src/main/java/org/apache/pinot/query/runtime/operator/WindowAggregateOperator.java:
##########
@@ -0,0 +1,436 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.query.runtime.operator;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import javax.annotation.Nullable;
+import org.apache.calcite.rel.RelFieldCollation;
+import org.apache.commons.collections.CollectionUtils;
+import org.apache.pinot.common.datablock.DataBlock;
+import org.apache.pinot.common.utils.DataSchema;
+import org.apache.pinot.core.data.table.Key;
+import org.apache.pinot.query.planner.logical.RexExpression;
+import org.apache.pinot.query.routing.VirtualServerAddress;
+import org.apache.pinot.query.runtime.blocks.TransferableBlock;
+import org.apache.pinot.query.runtime.blocks.TransferableBlockUtils;
+import org.apache.pinot.spi.data.FieldSpec;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * The WindowAggregateOperator is used to compute window function aggregations 
over a set of optional
+ * PARTITION BY keys, ORDER BY keys and a FRAME clause. The output data will 
include the projected
+ * columns and in addition will add the aggregation columns to the output data.
+ * [input columns, aggregate result1, ... aggregate resultN]
+ *
+ * The window functions supported today are SUM/COUNT/MIN/MAX aggregations. 
Window functions also include
+ * other types of functions such as rank and value functions.
+ *
+ * Unlike the AggregateOperator which will output one row per group, the 
WindowAggregateOperator
+ * will output as many rows as input rows.
+ *
+ * TODO:
+ *     1. Add support for OVER() clause with ORDER BY only or PARTITION BY 
ORDER BY
+ *     2. Add support for rank window functions
+ *     3. Add support for value window functions
+ *     4. Add support for custom frames
+ */
+public class WindowAggregateOperator extends MultiStageOperator {
+  private static final String EXPLAIN_NAME = "WINDOW";
+  private static final Logger LOGGER = 
LoggerFactory.getLogger(WindowAggregateOperator.class);
+
+  private final MultiStageOperator _inputOperator;
+  private final List<RexExpression> _groupSet;
+  private final OrderSetInfo _orderSetInfo;
+  private final WindowFrame _windowFrame;
+  private final List<RexExpression.FunctionCall> _aggCalls;
+  private final List<RexExpression> _constants;
+  private final DataSchema _resultSchema;
+  private final WindowAccumulator[] _windowAccumulators;
+  private final Map<Key, List<Object[]>> _partitionRows;
+
+  private TransferableBlock _upstreamErrorBlock;
+
+  private int _numRows;
+  private boolean _readyToConstruct;
+  private boolean _hasReturnedWindowAggregateBlock;
+
+  public WindowAggregateOperator(MultiStageOperator inputOperator, 
List<RexExpression> groupSet,
+      List<RexExpression> orderSet, List<RelFieldCollation.Direction> 
orderSetDirection,
+      List<RelFieldCollation.NullDirection> orderSetNullDirection, 
List<RexExpression> aggCalls, int lowerBound,
+      int upperBound, boolean isRows, List<RexExpression> constants, 
DataSchema resultSchema, DataSchema inputSchema,
+      long requestId, int stageId, VirtualServerAddress virtualServerAddress) {
+    this(inputOperator, groupSet, orderSet, orderSetDirection, 
orderSetNullDirection, aggCalls, lowerBound,
+        upperBound, isRows, constants, resultSchema, inputSchema, 
WindowAccumulator.WINDOW_MERGERS,
+        requestId, stageId, virtualServerAddress);
+  }
+
+  @VisibleForTesting
+  public WindowAggregateOperator(MultiStageOperator inputOperator, 
List<RexExpression> groupSet,
+      List<RexExpression> orderSet, List<RelFieldCollation.Direction> 
orderSetDirection,
+      List<RelFieldCollation.NullDirection> orderSetNullDirection, 
List<RexExpression> aggCalls, int lowerBound,
+      int upperBound, boolean isRows, List<RexExpression> constants, 
DataSchema resultSchema, DataSchema inputSchema,
+      Map<String, Function<DataSchema.ColumnDataType, WindowMerger>> mergers, 
long requestId, int stageId,
+      VirtualServerAddress virtualServerAddress) {
+    super(requestId, stageId, virtualServerAddress);
+
+    boolean isPartitionByOnly = isPartitionByOnlyQuery(groupSet, orderSet, 
orderSetDirection, orderSetNullDirection);
+    // TODO: add support for ORDER BY in the OVER() clause
+    Preconditions.checkState(orderSet == null || orderSet.isEmpty() || 
isPartitionByOnly,
+        "Order by is not yet supported in window functions");
+
+    _inputOperator = inputOperator;
+    _groupSet = groupSet;
+    _orderSetInfo = new OrderSetInfo(orderSet, orderSetDirection, 
orderSetNullDirection);
+    _windowFrame = new WindowFrame(lowerBound, upperBound, isRows);
+
+    // TODO: add support for custom frames, and for ORDER BY default frame 
(upperBound => currentRow)
+    Preconditions.checkState(!_windowFrame.isRows(), "Only RANGE type frames 
are supported at present");
+    Preconditions.checkState(_windowFrame.isUnboundedPreceding(),
+        "Only default frame is supported, lowerBound must be UNBOUNDED 
PRECEDING");
+    Preconditions.checkState(_windowFrame.isUnboundedFollowing()
+            || (_windowFrame.isUpperBoundCurrentRow() && isPartitionByOnly),
+        "Only default frame is supported, upperBound must be UNBOUNDED 
FOLLOWING or CURRENT ROW");
+
+    // we expect all agg calls to be aggregate function calls
+    _aggCalls = 
aggCalls.stream().map(RexExpression.FunctionCall.class::cast).collect(Collectors.toList());
+    _constants = constants;
+    _resultSchema = resultSchema;
+
+    // TODO: Not all window functions (e.g. ROW_NUMBER, LAG, etc) need 
aggregations. Such functions should be handled
+    //       differently.
+    _windowAccumulators = new WindowAccumulator[_aggCalls.size()];
+    for (int i = 0; i < _aggCalls.size(); i++) {
+      RexExpression.FunctionCall agg = _aggCalls.get(i);
+      String functionName = agg.getFunctionName();
+      if (!mergers.containsKey(functionName)) {
+        throw new IllegalStateException("Unexpected value: " + functionName);
+      }
+      _windowAccumulators[i] = new WindowAccumulator(agg, mergers, 
functionName, inputSchema);
+    }
+
+    _partitionRows = new HashMap<>();
+
+    _numRows = 0;
+    _readyToConstruct = false;
+    _hasReturnedWindowAggregateBlock = false;
+  }
+
+  @Override
+  public List<MultiStageOperator> getChildOperators() {
+    return ImmutableList.of(_inputOperator);
+  }
+
+  @Nullable
+  @Override
+  public String toExplainString() {
+    return EXPLAIN_NAME;
+  }
+
+  @Override
+  protected TransferableBlock getNextBlock() {
+    try {
+      if (!_readyToConstruct && !consumeInputBlocks()) {
+        return TransferableBlockUtils.getNoOpTransferableBlock();
+      }
+
+      if (_upstreamErrorBlock != null) {
+        return _upstreamErrorBlock;
+      }
+
+      if (!_hasReturnedWindowAggregateBlock) {
+        return produceWindowAggregateBlock();
+      } else {
+        // TODO: Move to close call.
+        return TransferableBlockUtils.getEndOfStreamTransferableBlock();
+      }
+    } catch (Exception e) {
+      LOGGER.error("Caught exception while executing 
WindowAggregationOperator, returning an error block", e);
+      return TransferableBlockUtils.getErrorTransferableBlock(e);
+    }
+  }
+
+  private boolean isPartitionByOnlyQuery(List<RexExpression> groupSet, 
List<RexExpression> orderSet,
+      List<RelFieldCollation.Direction> orderSetDirection,
+      List<RelFieldCollation.NullDirection> orderSetNullDirection) {
+    if (CollectionUtils.isEmpty(orderSet)) {
+      return true;
+    }
+
+    if (CollectionUtils.isEmpty(groupSet) || (groupSet.size() != 
orderSet.size())) {
+      return false;
+    }
+
+    Set<Integer> partitionByInputRefIndexes = new HashSet<>();
+    Set<Integer> orderByInputRefIndexes = new HashSet<>();
+    for (int i = 0; i < groupSet.size(); i++) {
+      partitionByInputRefIndexes.add(((RexExpression.InputRef) 
groupSet.get(i)).getIndex());
+      orderByInputRefIndexes.add(((RexExpression.InputRef) 
orderSet.get(i)).getIndex());
+    }
+
+    boolean isPartitionByOnly = 
partitionByInputRefIndexes.equals(orderByInputRefIndexes);
+    if (isPartitionByOnly) {
+      // Check the direction and null direction to ensure default ordering on 
the order by keys, which are:
+      // Direction: ASC
+      // Null Direction: LAST
+      for (int i = 0; i < orderSet.size(); i++) {
+        if (orderSetDirection.get(i) == RelFieldCollation.Direction.DESCENDING
+            || orderSetNullDirection.get(i) == 
RelFieldCollation.NullDirection.FIRST) {
+          isPartitionByOnly = false;
+          break;
+        }
+      }
+    }
+    return isPartitionByOnly;
+  }
+
+  private TransferableBlock produceWindowAggregateBlock() {
+    List<Object[]> rows = new ArrayList<>(_numRows);
+    for (Map.Entry<Key, List<Object[]>> e : _partitionRows.entrySet()) {
+      Key partitionKey = e.getKey();
+      List<Object[]> rowList = e.getValue();
+      for (Object[] existingRow : rowList) {
+        Object[] row = new Object[existingRow.length + _aggCalls.size()];
+        System.arraycopy(existingRow, 0, row, 0, existingRow.length);
+        for (int i = 0; i < _windowAccumulators.length; i++) {
+          row[i + existingRow.length] = 
_windowAccumulators[i]._results.get(partitionKey);
+        }
+        rows.add(row);
+      }
+    }
+    _hasReturnedWindowAggregateBlock = true;
+    if (rows.size() == 0) {
+      return new TransferableBlock(Collections.emptyList(), _resultSchema, 
DataBlock.Type.ROW);
+    } else {
+      return new TransferableBlock(rows, _resultSchema, DataBlock.Type.ROW);
+    }
+  }
+
+  /**
+   * @return whether or not the operator is ready to move on (EOS or ERROR)
+   */
+  private boolean consumeInputBlocks() {
+    TransferableBlock block = _inputOperator.nextBlock();
+    while (!block.isNoOpBlock()) {
+      // setting upstream error block
+      if (block.isErrorBlock()) {
+        _upstreamErrorBlock = block;
+        return true;
+      } else if (block.isEndOfStreamBlock()) {
+        _readyToConstruct = true;
+        return true;
+      }
+
+      List<Object[]> container = block.getContainer();
+      for (Object[] row : container) {
+        _numRows++;
+        // TODO: Revisit the aggregation logic once ORDER BY inside OVER() 
support is added
+        Key key = extractRowKey(row, _groupSet);
+        _partitionRows.putIfAbsent(key, new ArrayList<>());
+        _partitionRows.get(key).add(row);
+        for (int i = 0; i < _aggCalls.size(); i++) {
+          _windowAccumulators[i].accumulate(key, row);
+        }
+      }
+      block = _inputOperator.nextBlock();
+    }
+    return false;
+  }
+
+  private static Key extractRowKey(Object[] row, List<RexExpression> groupSet) 
{

Review Comment:
   Same here: please make a common Agg util for keeping these utils.
   We can even have a separate PR just to refactor these out.



##########
pinot-query-runtime/src/main/java/org/apache/pinot/query/runtime/operator/WindowAggregateOperator.java:
##########
@@ -0,0 +1,436 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.query.runtime.operator;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import javax.annotation.Nullable;
+import org.apache.calcite.rel.RelFieldCollation;
+import org.apache.commons.collections.CollectionUtils;
+import org.apache.pinot.common.datablock.DataBlock;
+import org.apache.pinot.common.utils.DataSchema;
+import org.apache.pinot.core.data.table.Key;
+import org.apache.pinot.query.planner.logical.RexExpression;
+import org.apache.pinot.query.routing.VirtualServerAddress;
+import org.apache.pinot.query.runtime.blocks.TransferableBlock;
+import org.apache.pinot.query.runtime.blocks.TransferableBlockUtils;
+import org.apache.pinot.spi.data.FieldSpec;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * The WindowAggregateOperator is used to compute window function aggregations 
over a set of optional
+ * PARTITION BY keys, ORDER BY keys and a FRAME clause. The output data will 
include the projected
+ * columns and in addition will add the aggregation columns to the output data.
+ * [input columns, aggregate result1, ... aggregate resultN]
+ *
+ * The window functions supported today are SUM/COUNT/MIN/MAX aggregations. 
Window functions also include
+ * other types of functions such as rank and value functions.
+ *
+ * Unlike the AggregateOperator which will output one row per group, the 
WindowAggregateOperator
+ * will output as many rows as input rows.
+ *
+ * TODO:
+ *     1. Add support for OVER() clause with ORDER BY only or PARTITION BY 
ORDER BY
+ *     2. Add support for rank window functions
+ *     3. Add support for value window functions
+ *     4. Add support for custom frames
+ */
+public class WindowAggregateOperator extends MultiStageOperator {
+  private static final String EXPLAIN_NAME = "WINDOW";
+  private static final Logger LOGGER = 
LoggerFactory.getLogger(WindowAggregateOperator.class);
+
+  private final MultiStageOperator _inputOperator;
+  private final List<RexExpression> _groupSet;
+  private final OrderSetInfo _orderSetInfo;
+  private final WindowFrame _windowFrame;
+  private final List<RexExpression.FunctionCall> _aggCalls;
+  private final List<RexExpression> _constants;
+  private final DataSchema _resultSchema;
+  private final WindowAccumulator[] _windowAccumulators;
+  private final Map<Key, List<Object[]>> _partitionRows;
+
+  private TransferableBlock _upstreamErrorBlock;
+
+  private int _numRows;
+  private boolean _readyToConstruct;
+  private boolean _hasReturnedWindowAggregateBlock;
+
+  public WindowAggregateOperator(MultiStageOperator inputOperator, 
List<RexExpression> groupSet,
+      List<RexExpression> orderSet, List<RelFieldCollation.Direction> 
orderSetDirection,
+      List<RelFieldCollation.NullDirection> orderSetNullDirection, 
List<RexExpression> aggCalls, int lowerBound,
+      int upperBound, boolean isRows, List<RexExpression> constants, 
DataSchema resultSchema, DataSchema inputSchema,
+      long requestId, int stageId, VirtualServerAddress virtualServerAddress) {
+    this(inputOperator, groupSet, orderSet, orderSetDirection, 
orderSetNullDirection, aggCalls, lowerBound,
+        upperBound, isRows, constants, resultSchema, inputSchema, 
WindowAccumulator.WINDOW_MERGERS,
+        requestId, stageId, virtualServerAddress);
+  }
+
+  @VisibleForTesting
+  public WindowAggregateOperator(MultiStageOperator inputOperator, 
List<RexExpression> groupSet,
+      List<RexExpression> orderSet, List<RelFieldCollation.Direction> 
orderSetDirection,
+      List<RelFieldCollation.NullDirection> orderSetNullDirection, 
List<RexExpression> aggCalls, int lowerBound,
+      int upperBound, boolean isRows, List<RexExpression> constants, 
DataSchema resultSchema, DataSchema inputSchema,
+      Map<String, Function<DataSchema.ColumnDataType, WindowMerger>> mergers, 
long requestId, int stageId,
+      VirtualServerAddress virtualServerAddress) {
+    super(requestId, stageId, virtualServerAddress);
+
+    boolean isPartitionByOnly = isPartitionByOnlyQuery(groupSet, orderSet, 
orderSetDirection, orderSetNullDirection);
+    // TODO: add support for ORDER BY in the OVER() clause
+    Preconditions.checkState(orderSet == null || orderSet.isEmpty() || 
isPartitionByOnly,
+        "Order by is not yet supported in window functions");
+
+    _inputOperator = inputOperator;
+    _groupSet = groupSet;
+    _orderSetInfo = new OrderSetInfo(orderSet, orderSetDirection, 
orderSetNullDirection);
+    _windowFrame = new WindowFrame(lowerBound, upperBound, isRows);
+
+    // TODO: add support for custom frames, and for ORDER BY default frame 
(upperBound => currentRow)
+    Preconditions.checkState(!_windowFrame.isRows(), "Only RANGE type frames 
are supported at present");
+    Preconditions.checkState(_windowFrame.isUnboundedPreceding(),
+        "Only default frame is supported, lowerBound must be UNBOUNDED 
PRECEDING");
+    Preconditions.checkState(_windowFrame.isUnboundedFollowing()
+            || (_windowFrame.isUpperBoundCurrentRow() && isPartitionByOnly),
+        "Only default frame is supported, upperBound must be UNBOUNDED 
FOLLOWING or CURRENT ROW");
+
+    // we expect all agg calls to be aggregate function calls
+    _aggCalls = 
aggCalls.stream().map(RexExpression.FunctionCall.class::cast).collect(Collectors.toList());
+    _constants = constants;
+    _resultSchema = resultSchema;
+
+    // TODO: Not all window functions (e.g. ROW_NUMBER, LAG, etc) need 
aggregations. Such functions should be handled
+    //       differently.
+    _windowAccumulators = new WindowAccumulator[_aggCalls.size()];
+    for (int i = 0; i < _aggCalls.size(); i++) {
+      RexExpression.FunctionCall agg = _aggCalls.get(i);
+      String functionName = agg.getFunctionName();
+      if (!mergers.containsKey(functionName)) {
+        throw new IllegalStateException("Unexpected value: " + functionName);
+      }
+      _windowAccumulators[i] = new WindowAccumulator(agg, mergers, 
functionName, inputSchema);
+    }
+
+    _partitionRows = new HashMap<>();
+
+    _numRows = 0;
+    _readyToConstruct = false;
+    _hasReturnedWindowAggregateBlock = false;
+  }
+
+  @Override
+  public List<MultiStageOperator> getChildOperators() {
+    return ImmutableList.of(_inputOperator);
+  }
+
+  @Nullable
+  @Override
+  public String toExplainString() {
+    return EXPLAIN_NAME;
+  }
+
+  @Override
+  protected TransferableBlock getNextBlock() {
+    try {
+      if (!_readyToConstruct && !consumeInputBlocks()) {

Review Comment:
   Let's keep it this way. 
   The `_readyToConstruct` flag is only here to ensure correctness. For example, 
if the data is already sorted and we are doing PARTITION BY with ORDER BY, then we 
can actually produce rows as we go. 
   
   let's revisit this later



##########
pinot-query-runtime/src/test/java/org/apache/pinot/query/runtime/QueryRunnerTestBase.java:
##########
@@ -364,6 +371,8 @@ public static class Query {
       public List<List<Object>> _outputs = null;
       @JsonProperty("expectedException")
       public String _expectedException;
+      @JsonProperty("keepOutputRowOrder")
+      public boolean _keepOutputRowOrder;

Review Comment:
   good catch. would you be able to also change the OrderBy.json and add the 
keepOutputRowOrder flag?



##########
pinot-query-planner/src/main/java/org/apache/calcite/rel/rules/PinotWindowExchangeNodeInsertRule.java:
##########
@@ -160,6 +162,19 @@ private boolean isPartitionByOnlyQuery(Window.Group 
windowGroup) {
       Set<Integer> partitionByKeyList = new 
HashSet<>(windowGroup.keys.toList());
       Set<Integer> orderByKeyList = new 
HashSet<>(windowGroup.orderKeys.getKeys());
       isPartitionByOnly = partitionByKeyList.equals(orderByKeyList);
+      if (isPartitionByOnly) {

Review Comment:
   All 3 of the examples above produce the exact same query results, right? 
   



##########
pinot-query-runtime/src/main/java/org/apache/pinot/query/runtime/operator/WindowAggregateOperator.java:
##########
@@ -0,0 +1,431 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.query.runtime.operator;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import javax.annotation.Nullable;
+import org.apache.calcite.rel.RelFieldCollation;
+import org.apache.pinot.common.datablock.DataBlock;
+import org.apache.pinot.common.utils.DataSchema;
+import org.apache.pinot.core.data.table.Key;
+import org.apache.pinot.query.planner.logical.RexExpression;
+import org.apache.pinot.query.runtime.blocks.TransferableBlock;
+import org.apache.pinot.query.runtime.blocks.TransferableBlockUtils;
+import org.apache.pinot.spi.data.FieldSpec;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * The WindowAggregateOperator is used to compute window function aggregations 
over a set of optional
+ * PARTITION BY keys, ORDER BY keys and a FRAME clause. The output data will 
include the projected
+ * columns and in addition will add the aggregation columns to the output data.
+ * [input columns, aggregate result1, ... aggregate resultN]
+ *
+ * The window functions supported today are SUM/COUNT/MIN/MAX aggregations. 
Window functions also include
+ * other types of functions such as rank and value functions.
+ *
+ * Unlike the AggregateOperator which will output one row per group, the 
WindowAggregateOperator
+ * will output as many rows as input rows.
+ *
+ * TODO:
+ *     1. Add support for OVER() clause with ORDER BY only or PARTITION BY 
ORDER BY
+ *     2. Add support for rank window functions
+ *     3. Add support for value window functions
+ *     4. Add support for custom frames
+ */
+public class WindowAggregateOperator extends MultiStageOperator {
+  private static final String EXPLAIN_NAME = "WINDOW";
+  private static final Logger LOGGER = 
LoggerFactory.getLogger(WindowAggregateOperator.class);
+
+  private final MultiStageOperator _inputOperator;
+  private final List<RexExpression> _groupSet;
+  private final OrderSetInfo _orderSetInfo;
+  private final WindowFrame _windowFrame;
+  private final List<RexExpression.FunctionCall> _aggCalls;
+  private final List<RexExpression> _constants;
+  private final DataSchema _resultSchema;
+  private final WindowAccumulator[] _windowAccumulators;
+  private final Map<Key, List<Object[]>> _partitionRows;
+
+  private TransferableBlock _upstreamErrorBlock;
+
+  private int _numRows;
+  private boolean _readyToConstruct;
+  private boolean _hasReturnedWindowAggregateBlock;
+
+  public WindowAggregateOperator(MultiStageOperator inputOperator, 
List<RexExpression> groupSet,
+      List<RexExpression> orderSet, List<RelFieldCollation.Direction> 
orderSetDirection,
+      List<RelFieldCollation.NullDirection> orderSetNullDirection, 
List<RexExpression> aggCalls, int lowerBound,
+      int upperBound, boolean isRows, List<RexExpression> constants, 
DataSchema resultSchema, DataSchema inputSchema,
+      long requestId, int stageId) {
+    this(inputOperator, groupSet, orderSet, orderSetDirection, 
orderSetNullDirection, aggCalls, lowerBound,
+        upperBound, isRows, constants, resultSchema, inputSchema, 
WindowAccumulator.WINDOW_MERGERS,
+        requestId, stageId);
+  }
+
+  @VisibleForTesting
+  public WindowAggregateOperator(MultiStageOperator inputOperator, 
List<RexExpression> groupSet,
+      List<RexExpression> orderSet, List<RelFieldCollation.Direction> 
orderSetDirection,
+      List<RelFieldCollation.NullDirection> orderSetNullDirection, 
List<RexExpression> aggCalls, int lowerBound,
+      int upperBound, boolean isRows, List<RexExpression> constants, 
DataSchema resultSchema, DataSchema inputSchema,
+      Map<String, Function<DataSchema.ColumnDataType, WindowMerger>> mergers, 
long requestId, int stageId) {
+    super(requestId, stageId);
+
+    boolean isPartitionByOnly = isPartitionByOnlyQuery(groupSet, orderSet, 
orderSetDirection, orderSetNullDirection);
+    // TODO: add support for ORDER BY in the OVER() clause
+    Preconditions.checkState(orderSet == null || orderSet.isEmpty() || 
isPartitionByOnly,
+        "Order by is not yet supported in window functions");
+
+    _inputOperator = inputOperator;
+    _groupSet = groupSet;
+    _orderSetInfo = new OrderSetInfo(orderSet, orderSetDirection, 
orderSetNullDirection);
+    _windowFrame = new WindowFrame(lowerBound, upperBound, isRows);
+
+    // TODO: add support for custom frames, and for ORDER BY default frame 
(upperBound => currentRow)
+    Preconditions.checkState(!_windowFrame.isRows(), "Only RANGE type frames 
are supported at present");
+    Preconditions.checkState(_windowFrame.isUnboundedPreceding(),
+        "Only default frame is supported, lowerBound must be UNBOUNDED 
PRECEDING");
+    Preconditions.checkState(_windowFrame.isUnboundedFollowing()
+            || (_windowFrame.isUpperBoundCurrentRow() && isPartitionByOnly),
+        "Only default frame is supported, upperBound must be UNBOUNDED 
FOLLOWING or CURRENT ROW");
+
+    // we expect all agg calls to be aggregate function calls
+    _aggCalls = 
aggCalls.stream().map(RexExpression.FunctionCall.class::cast).collect(Collectors.toList());
+    _constants = constants;
+    _resultSchema = resultSchema;
+
+    // TODO: Not all window functions (e.g. ROW_NUMBER, LAG, etc) need 
aggregations. Such functions should be handled
+    //       differently.
+    _windowAccumulators = new WindowAccumulator[_aggCalls.size()];
+    for (int i = 0; i < _aggCalls.size(); i++) {
+      RexExpression.FunctionCall agg = _aggCalls.get(i);
+      String functionName = agg.getFunctionName();
+      if (!mergers.containsKey(functionName)) {
+        throw new IllegalStateException("Unexpected value: " + functionName);
+      }
+      _windowAccumulators[i] = new WindowAccumulator(agg, mergers, 
functionName, inputSchema);
+    }
+
+    _partitionRows = new HashMap<>();
+
+    _numRows = 0;
+    _readyToConstruct = false;
+    _hasReturnedWindowAggregateBlock = false;
+  }
+
+  @Override
+  public List<MultiStageOperator> getChildOperators() {
+    return ImmutableList.of(_inputOperator);
+  }
+
+  @Nullable
+  @Override
+  public String toExplainString() {
+    return EXPLAIN_NAME;
+  }
+
+  @Override
+  protected TransferableBlock getNextBlock() {
+    try {
+      if (!_readyToConstruct && !consumeInputBlocks()) {
+        return TransferableBlockUtils.getNoOpTransferableBlock();
+      }
+
+      if (_upstreamErrorBlock != null) {
+        return _upstreamErrorBlock;
+      }
+
+      if (!_hasReturnedWindowAggregateBlock) {
+        return produceWindowAggregateBlock();
+      } else {
+        // TODO: Move to close call.
+        return TransferableBlockUtils.getEndOfStreamTransferableBlock();
+      }
+    } catch (Exception e) {
+      return TransferableBlockUtils.getErrorTransferableBlock(e);
+    }
+  }
+
+  private boolean isPartitionByOnlyQuery(List<RexExpression> groupSet, 
List<RexExpression> orderSet,
+      List<RelFieldCollation.Direction> orderSetDirection,
+      List<RelFieldCollation.NullDirection> orderSetNullDirection) {
+    if (orderSet == null || orderSet.isEmpty()) {
+      return true;
+    }
+
+    if (groupSet == null || groupSet.isEmpty() || (groupSet.size() != 
orderSet.size())) {
+      return false;
+    }
+
+    Set<Integer> partitionByInputRefIndexes = new HashSet<>();
+    Set<Integer> orderByInputRefIndexes = new HashSet<>();
+    for (int i = 0; i < groupSet.size(); i++) {
+      partitionByInputRefIndexes.add(((RexExpression.InputRef) 
groupSet.get(i)).getIndex());
+      orderByInputRefIndexes.add(((RexExpression.InputRef) 
orderSet.get(i)).getIndex());
+    }
+
+    boolean isPartitionByOnly = 
partitionByInputRefIndexes.equals(orderByInputRefIndexes);
+    if (isPartitionByOnly) {
+      // Check the direction and null direction to ensure default ordering on 
the order by keys, which are:
+      // Direction: ASC
+      // Null Direction: LAST
+      for (int i = 0; i < orderSet.size(); i++) {
+        if (orderSetDirection.get(i) == RelFieldCollation.Direction.DESCENDING
+            || orderSetNullDirection.get(i) == 
RelFieldCollation.NullDirection.FIRST) {
+          isPartitionByOnly = false;
+          break;
+        }
+      }
+    }
+    return isPartitionByOnly;
+  }
+
+  private TransferableBlock produceWindowAggregateBlock() {
+    List<Object[]> rows = new ArrayList<>(_numRows);
+    for (Map.Entry<Key, List<Object[]>> e : _partitionRows.entrySet()) {
+      Key partitionKey = e.getKey();
+      List<Object[]> rowList = e.getValue();
+      for (Object[] existingRow : rowList) {
+        Object[] row = new Object[existingRow.length + _aggCalls.size()];
+        System.arraycopy(existingRow, 0, row, 0, existingRow.length);
+        for (int i = 0; i < _windowAccumulators.length; i++) {
+          row[i + existingRow.length] = 
_windowAccumulators[i]._results.get(partitionKey);
+        }
+        rows.add(row);
+      }
+    }
+    _hasReturnedWindowAggregateBlock = true;
+    if (rows.size() == 0) {

Review Comment:
   I am not sure I follow the discussion here. What exact query are we trying 
to mitigate?
   Using Postgres, none of these queries returns a single row:
   ```
   SELECT id, SUM(val) OVER() FROM test where 0 = 1
   SELECT id, SUM(val) OVER(PARTITION BY id) FROM test where 0 = 1
   SELECT id, SUM(val) OVER(PARTITION BY id ORDER BY val) FROM test where 0 =1
   SELECT id, SUM(val) OVER(PARTITION BY id ORDER BY id) FROM test where 0 = 1
   ```



##########
pinot-query-runtime/src/main/java/org/apache/pinot/query/runtime/operator/WindowAggregateOperator.java:
##########
@@ -0,0 +1,436 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.query.runtime.operator;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+import javax.annotation.Nullable;
+import org.apache.calcite.rel.RelFieldCollation;
+import org.apache.commons.collections.CollectionUtils;
+import org.apache.pinot.common.datablock.DataBlock;
+import org.apache.pinot.common.utils.DataSchema;
+import org.apache.pinot.core.data.table.Key;
+import org.apache.pinot.query.planner.logical.RexExpression;
+import org.apache.pinot.query.routing.VirtualServerAddress;
+import org.apache.pinot.query.runtime.blocks.TransferableBlock;
+import org.apache.pinot.query.runtime.blocks.TransferableBlockUtils;
+import org.apache.pinot.spi.data.FieldSpec;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * The WindowAggregateOperator is used to compute window function aggregations 
over a set of optional
+ * PARTITION BY keys, ORDER BY keys and a FRAME clause. The output data will 
include the projected
+ * columns and in addition will add the aggregation columns to the output data.
+ * [input columns, aggregate result1, ... aggregate resultN]
+ *
+ * The window functions supported today are SUM/COUNT/MIN/MAX aggregations. 
Window functions also include
+ * other types of functions such as rank and value functions.
+ *
+ * Unlike the AggregateOperator which will output one row per group, the 
WindowAggregateOperator
+ * will output as many rows as input rows.
+ *
+ * TODO:
+ *     1. Add support for OVER() clause with ORDER BY only or PARTITION BY 
ORDER BY
+ *     2. Add support for rank window functions
+ *     3. Add support for value window functions
+ *     4. Add support for custom frames
+ */
+public class WindowAggregateOperator extends MultiStageOperator {
+  private static final String EXPLAIN_NAME = "WINDOW";
+  private static final Logger LOGGER = 
LoggerFactory.getLogger(WindowAggregateOperator.class);
+
+  private final MultiStageOperator _inputOperator;
+  private final List<RexExpression> _groupSet;
+  private final OrderSetInfo _orderSetInfo;
+  private final WindowFrame _windowFrame;
+  private final List<RexExpression.FunctionCall> _aggCalls;
+  private final List<RexExpression> _constants;
+  private final DataSchema _resultSchema;
+  private final WindowAccumulator[] _windowAccumulators;
+  private final Map<Key, List<Object[]>> _partitionRows;
+
+  private TransferableBlock _upstreamErrorBlock;
+
+  private int _numRows;
+  private boolean _readyToConstruct;
+  private boolean _hasReturnedWindowAggregateBlock;
+
+  public WindowAggregateOperator(MultiStageOperator inputOperator, 
List<RexExpression> groupSet,
+      List<RexExpression> orderSet, List<RelFieldCollation.Direction> 
orderSetDirection,
+      List<RelFieldCollation.NullDirection> orderSetNullDirection, 
List<RexExpression> aggCalls, int lowerBound,
+      int upperBound, boolean isRows, List<RexExpression> constants, 
DataSchema resultSchema, DataSchema inputSchema,
+      long requestId, int stageId, VirtualServerAddress virtualServerAddress) {
+    this(inputOperator, groupSet, orderSet, orderSetDirection, 
orderSetNullDirection, aggCalls, lowerBound,
+        upperBound, isRows, constants, resultSchema, inputSchema, 
WindowAccumulator.WINDOW_MERGERS,
+        requestId, stageId, virtualServerAddress);
+  }
+
+  @VisibleForTesting
+  public WindowAggregateOperator(MultiStageOperator inputOperator, 
List<RexExpression> groupSet,
+      List<RexExpression> orderSet, List<RelFieldCollation.Direction> 
orderSetDirection,
+      List<RelFieldCollation.NullDirection> orderSetNullDirection, 
List<RexExpression> aggCalls, int lowerBound,
+      int upperBound, boolean isRows, List<RexExpression> constants, 
DataSchema resultSchema, DataSchema inputSchema,
+      Map<String, Function<DataSchema.ColumnDataType, WindowMerger>> mergers, 
long requestId, int stageId,
+      VirtualServerAddress virtualServerAddress) {
+    super(requestId, stageId, virtualServerAddress);
+
+    boolean isPartitionByOnly = isPartitionByOnlyQuery(groupSet, orderSet, 
orderSetDirection, orderSetNullDirection);
+    // TODO: add support for ORDER BY in the OVER() clause
+    Preconditions.checkState(orderSet == null || orderSet.isEmpty() || 
isPartitionByOnly,
+        "Order by is not yet supported in window functions");
+
+    _inputOperator = inputOperator;
+    _groupSet = groupSet;
+    _orderSetInfo = new OrderSetInfo(orderSet, orderSetDirection, 
orderSetNullDirection);
+    _windowFrame = new WindowFrame(lowerBound, upperBound, isRows);
+
+    // TODO: add support for custom frames, and for ORDER BY default frame 
(upperBound => currentRow)
+    Preconditions.checkState(!_windowFrame.isRows(), "Only RANGE type frames 
are supported at present");
+    Preconditions.checkState(_windowFrame.isUnboundedPreceding(),
+        "Only default frame is supported, lowerBound must be UNBOUNDED 
PRECEDING");
+    Preconditions.checkState(_windowFrame.isUnboundedFollowing()
+            || (_windowFrame.isUpperBoundCurrentRow() && isPartitionByOnly),
+        "Only default frame is supported, upperBound must be UNBOUNDED 
FOLLOWING or CURRENT ROW");
+
+    // we expect all agg calls to be aggregate function calls
+    _aggCalls = 
aggCalls.stream().map(RexExpression.FunctionCall.class::cast).collect(Collectors.toList());
+    _constants = constants;
+    _resultSchema = resultSchema;
+
+    // TODO: Not all window functions (e.g. ROW_NUMBER, LAG, etc) need 
aggregations. Such functions should be handled
+    //       differently.
+    _windowAccumulators = new WindowAccumulator[_aggCalls.size()];
+    for (int i = 0; i < _aggCalls.size(); i++) {
+      RexExpression.FunctionCall agg = _aggCalls.get(i);
+      String functionName = agg.getFunctionName();
+      if (!mergers.containsKey(functionName)) {
+        throw new IllegalStateException("Unexpected value: " + functionName);
+      }
+      _windowAccumulators[i] = new WindowAccumulator(agg, mergers, 
functionName, inputSchema);
+    }
+
+    _partitionRows = new HashMap<>();
+
+    _numRows = 0;
+    _readyToConstruct = false;
+    _hasReturnedWindowAggregateBlock = false;
+  }
+
+  @Override
+  public List<MultiStageOperator> getChildOperators() {
+    return ImmutableList.of(_inputOperator);
+  }
+
+  @Nullable
+  @Override
+  public String toExplainString() {
+    return EXPLAIN_NAME;
+  }
+
+  @Override
+  protected TransferableBlock getNextBlock() {
+    try {
+      if (!_readyToConstruct && !consumeInputBlocks()) {
+        return TransferableBlockUtils.getNoOpTransferableBlock();
+      }
+
+      if (_upstreamErrorBlock != null) {
+        return _upstreamErrorBlock;
+      }
+
+      if (!_hasReturnedWindowAggregateBlock) {
+        return produceWindowAggregateBlock();
+      } else {
+        // TODO: Move to close call.
+        return TransferableBlockUtils.getEndOfStreamTransferableBlock();
+      }
+    } catch (Exception e) {
+      LOGGER.error("Caught exception while executing 
WindowAggregationOperator, returning an error block", e);
+      return TransferableBlockUtils.getErrorTransferableBlock(e);
+    }
+  }
+
+  private boolean isPartitionByOnlyQuery(List<RexExpression> groupSet, 
List<RexExpression> orderSet,
+      List<RelFieldCollation.Direction> orderSetDirection,
+      List<RelFieldCollation.NullDirection> orderSetNullDirection) {
+    if (CollectionUtils.isEmpty(orderSet)) {
+      return true;
+    }
+
+    if (CollectionUtils.isEmpty(groupSet) || (groupSet.size() != 
orderSet.size())) {
+      return false;
+    }
+
+    Set<Integer> partitionByInputRefIndexes = new HashSet<>();
+    Set<Integer> orderByInputRefIndexes = new HashSet<>();
+    for (int i = 0; i < groupSet.size(); i++) {
+      partitionByInputRefIndexes.add(((RexExpression.InputRef) 
groupSet.get(i)).getIndex());
+      orderByInputRefIndexes.add(((RexExpression.InputRef) 
orderSet.get(i)).getIndex());
+    }
+
+    boolean isPartitionByOnly = 
partitionByInputRefIndexes.equals(orderByInputRefIndexes);
+    if (isPartitionByOnly) {
+      // Check the direction and null direction to ensure default ordering on 
the order by keys, which are:
+      // Direction: ASC
+      // Null Direction: LAST
+      for (int i = 0; i < orderSet.size(); i++) {
+        if (orderSetDirection.get(i) == RelFieldCollation.Direction.DESCENDING
+            || orderSetNullDirection.get(i) == 
RelFieldCollation.NullDirection.FIRST) {
+          isPartitionByOnly = false;
+          break;
+        }
+      }
+    }
+    return isPartitionByOnly;
+  }
+
+  private TransferableBlock produceWindowAggregateBlock() {
+    List<Object[]> rows = new ArrayList<>(_numRows);
+    for (Map.Entry<Key, List<Object[]>> e : _partitionRows.entrySet()) {
+      Key partitionKey = e.getKey();
+      List<Object[]> rowList = e.getValue();
+      for (Object[] existingRow : rowList) {
+        Object[] row = new Object[existingRow.length + _aggCalls.size()];
+        System.arraycopy(existingRow, 0, row, 0, existingRow.length);
+        for (int i = 0; i < _windowAccumulators.length; i++) {
+          row[i + existingRow.length] = 
_windowAccumulators[i]._results.get(partitionKey);
+        }
+        rows.add(row);
+      }
+    }
+    _hasReturnedWindowAggregateBlock = true;
+    if (rows.size() == 0) {
+      return new TransferableBlock(Collections.emptyList(), _resultSchema, 
DataBlock.Type.ROW);
+    } else {
+      return new TransferableBlock(rows, _resultSchema, DataBlock.Type.ROW);
+    }
+  }
+
+  /**
+   * @return whether or not the operator is ready to move on (EOS or ERROR)
+   */
+  private boolean consumeInputBlocks() {
+    TransferableBlock block = _inputOperator.nextBlock();
+    while (!block.isNoOpBlock()) {
+      // setting upstream error block
+      if (block.isErrorBlock()) {
+        _upstreamErrorBlock = block;
+        return true;
+      } else if (block.isEndOfStreamBlock()) {
+        _readyToConstruct = true;
+        return true;
+      }
+
+      List<Object[]> container = block.getContainer();
+      for (Object[] row : container) {
+        _numRows++;
+        // TODO: Revisit the aggregation logic once ORDER BY inside OVER() 
support is added
+        Key key = extractRowKey(row, _groupSet);
+        _partitionRows.putIfAbsent(key, new ArrayList<>());
+        _partitionRows.get(key).add(row);
+        for (int i = 0; i < _aggCalls.size(); i++) {
+          _windowAccumulators[i].accumulate(key, row);
+        }
+      }
+      block = _inputOperator.nextBlock();
+    }
+    return false;
+  }
+
+  private static Key extractRowKey(Object[] row, List<RexExpression> groupSet) 
{
+    Object[] keyElements = new Object[groupSet.size()];
+    for (int i = 0; i < groupSet.size(); i++) {
+      keyElements[i] = row[((RexExpression.InputRef) 
groupSet.get(i)).getIndex()];
+    }
+    return new Key(keyElements);
+  }
+
+  private static class OrderSetInfo {
+    final List<RexExpression> _orderSet;
+    final List<RelFieldCollation.Direction> _orderSetDirection;
+    final List<RelFieldCollation.NullDirection> _orderSetNullDirection;
+
+    OrderSetInfo(List<RexExpression> orderSet, 
List<RelFieldCollation.Direction> orderSetDirection,
+        List<RelFieldCollation.NullDirection> orderSetNullDirection) {
+      _orderSet = orderSet;
+      _orderSetDirection = orderSetDirection;
+      _orderSetNullDirection = orderSetNullDirection;
+    }
+
+    List<RexExpression> getOrderSet() {
+      return _orderSet;
+    }
+
+    List<RelFieldCollation.Direction> getOrderSetDirection() {
+      return _orderSetDirection;
+    }
+
+    List<RelFieldCollation.NullDirection> getOrderSetNullDirection() {
+      return _orderSetNullDirection;
+    }
+  }
+
+  private static class WindowFrame {

Review Comment:
   Javadoc please. I am not sure what isRows means --> are we using this to 
determine whether the frame is RANGE-based or ROW-based?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org

[GitHub] [pinot] walterddr commented on a diff in pull request #10286: [multistage] Initial (phase 1) Query runtime for window functions - empty OVER() and OVER(PARTITION BY)

Reply via email to