[GitHub] [pinot] weixiangsun commented on a change in pull request #8029: Add Time-Series Gapfilling functionality.

GitBox Mon, 21 Mar 2022 13:42:45 -0700


weixiangsun commented on a change in pull request #8029:
URL: https://github.com/apache/pinot/pull/8029#discussion_r831525452




##########
File path: 
pinot-core/src/main/java/org/apache/pinot/core/query/reduce/GapfillProcessor.java
##########
@@ -0,0 +1,477 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.query.reduce;
+
+import com.google.common.base.Preconditions;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import org.apache.pinot.common.request.context.ExpressionContext;
+import org.apache.pinot.common.request.context.FunctionContext;
+import org.apache.pinot.common.response.broker.BrokerResponseNative;
+import org.apache.pinot.common.response.broker.ResultTable;
+import org.apache.pinot.common.utils.DataSchema;
+import org.apache.pinot.common.utils.DataSchema.ColumnDataType;
+import org.apache.pinot.core.common.BlockValSet;
+import org.apache.pinot.core.data.table.Key;
+import org.apache.pinot.core.query.aggregation.function.AggregationFunction;
+import 
org.apache.pinot.core.query.aggregation.function.AggregationFunctionFactory;
+import 
org.apache.pinot.core.query.aggregation.function.CountAggregationFunction;
+import org.apache.pinot.core.query.aggregation.groupby.GroupByResultHolder;
+import org.apache.pinot.core.query.request.context.QueryContext;
+import org.apache.pinot.core.util.GapfillUtils;
+import org.apache.pinot.spi.data.DateTimeFormatSpec;
+import org.apache.pinot.spi.data.DateTimeGranularitySpec;
+
+
+/**
+ * Helper class to reduce and set gap fill results into the 
BrokerResponseNative
+ */
+@SuppressWarnings({"rawtypes", "unchecked"})
+public class GapfillProcessor {
+  private final QueryContext _queryContext;
+
+  private final int _limitForAggregatedResult;
+  private final DateTimeGranularitySpec _gapfillDateTimeGranularity;
+  private final DateTimeGranularitySpec _postGapfillDateTimeGranularity;
+  private final DateTimeFormatSpec _dateTimeFormatter;
+  private final long _startMs;
+  private final long _endMs;
+  private final long _gapfillTimeBucketSize;
+  private final long _postGapfillTimeBucketSize;
+  private final int _numOfTimeBuckets;
+  private final List<Integer> _groupByKeyIndexes;
+  private final Set<Key> _groupByKeys;
+  private final Map<Key, Object[]> _previousByGroupKey;
+  private final Map<String, ExpressionContext> _fillExpressions;
+  private final List<ExpressionContext> _timeSeries;
+  private final GapfillUtils.GapfillType _gapfillType;
+  private int _limitForGapfilledResult;
+  private boolean[] _isGroupBySelections;
+  private final int _timeBucketColumnIndex;
+  private int[] _sourceColumnIndexForResultSchema = null;
+  private final int _aggregationSize;
+
+  GapfillProcessor(QueryContext queryContext, GapfillUtils.GapfillType 
gapfillType) {
+    _queryContext = queryContext;
+    _gapfillType = gapfillType;
+    _limitForAggregatedResult = queryContext.getLimit();
+    if (_gapfillType == GapfillUtils.GapfillType.AGGREGATE_GAP_FILL
+        || _gapfillType == GapfillUtils.GapfillType.GAP_FILL) {
+      _limitForGapfilledResult = queryContext.getLimit();
+    } else {
+      _limitForGapfilledResult = queryContext.getSubquery().getLimit();
+    }
+
+    ExpressionContext gapFillSelection = 
GapfillUtils.getGapfillExpressionContext(queryContext, _gapfillType);
+    _timeBucketColumnIndex = 
GapfillUtils.findTimeBucketColumnIndex(queryContext, _gapfillType);
+
+    List<ExpressionContext> args = 
gapFillSelection.getFunction().getArguments();
+
+    _dateTimeFormatter = new DateTimeFormatSpec(args.get(1).getLiteral());
+    _gapfillDateTimeGranularity = new 
DateTimeGranularitySpec(args.get(4).getLiteral());
+    if (args.get(5).getLiteral() == null) {
+      _postGapfillDateTimeGranularity = _gapfillDateTimeGranularity;
+    } else {
+      _postGapfillDateTimeGranularity = new 
DateTimeGranularitySpec(args.get(5).getLiteral());
+    }
+    String start = args.get(2).getLiteral();
+    _startMs = truncate(_dateTimeFormatter.fromFormatToMillis(start));
+    String end = args.get(3).getLiteral();
+    _endMs = truncate(_dateTimeFormatter.fromFormatToMillis(end));
+    _gapfillTimeBucketSize = _gapfillDateTimeGranularity.granularityToMillis();
+    _postGapfillTimeBucketSize = 
_postGapfillDateTimeGranularity.granularityToMillis();
+    _numOfTimeBuckets = (int) ((_endMs - _startMs) / _gapfillTimeBucketSize);
+
+    _fillExpressions = GapfillUtils.getFillExpressions(gapFillSelection);
+
+    _aggregationSize = (int) (_postGapfillTimeBucketSize / 
_gapfillTimeBucketSize);
+
+    _previousByGroupKey = new HashMap<>();
+    _groupByKeyIndexes = new ArrayList<>();
+    _groupByKeys = new HashSet<>();
+
+    ExpressionContext timeseriesOn = 
GapfillUtils.getTimeSeriesOnExpressionContext(gapFillSelection);
+    _timeSeries = timeseriesOn.getFunction().getArguments();
+  }
+
+  private int findGapfillBucketIndex(long time) {
+    return (int) ((time - _startMs) / _gapfillTimeBucketSize);
+  }
+
+  private void replaceColumnNameWithAlias(DataSchema dataSchema) {
+    QueryContext queryContext;
+    if (_gapfillType == GapfillUtils.GapfillType.AGGREGATE_GAP_FILL_AGGREGATE) 
{
+      queryContext = _queryContext.getSubquery().getSubquery();
+    } else if (_gapfillType == GapfillUtils.GapfillType.GAP_FILL) {
+      queryContext = _queryContext;
+    } else {
+      queryContext = _queryContext.getSubquery();
+    }
+    List<String> aliasList = queryContext.getAliasList();
+    Map<String, String> columnNameToAliasMap = new HashMap<>();
+    for (int i = 0; i < aliasList.size(); i++) {
+      if (aliasList.get(i) != null) {
+        ExpressionContext selection = 
queryContext.getSelectExpressions().get(i);
+        if (GapfillUtils.isGapfill(selection)) {
+          selection = selection.getFunction().getArguments().get(0);
+        }
+        columnNameToAliasMap.put(selection.toString(), aliasList.get(i));
+      }
+    }
+    for (int i = 0; i < dataSchema.getColumnNames().length; i++) {
+      if (columnNameToAliasMap.containsKey(dataSchema.getColumnNames()[i])) {
+        dataSchema.getColumnNames()[i] = 
columnNameToAliasMap.get(dataSchema.getColumnNames()[i]);
+      }
+    }
+  }
+
+  /**
+   * Here are three things that happen
+   * 1. Sort the result sets from all pinot servers based on timestamp
+   * 2. Gapfill the data for missing entities per time bucket
+   * 3. Aggregate the dataset per time bucket.
+   */
+  public void process(BrokerResponseNative brokerResponseNative) {
+    DataSchema dataSchema = 
brokerResponseNative.getResultTable().getDataSchema();
+    DataSchema resultTableSchema = getResultTableDataSchema(dataSchema);
+    if (brokerResponseNative.getResultTable().getRows().isEmpty()) {
+      brokerResponseNative.setResultTable(new ResultTable(resultTableSchema, 
Collections.emptyList()));
+      return;
+    }
+
+    String[] columns = dataSchema.getColumnNames();
+
+    Map<String, Integer> indexes = new HashMap<>();
+    for (int i = 0; i < columns.length; i++) {
+      indexes.put(columns[i], i);
+    }
+
+    _isGroupBySelections = new boolean[dataSchema.getColumnDataTypes().length];
+
+    // The first one argument of timeSeries is time column. The left ones are 
defining entity.
+    for (ExpressionContext entityColum : _timeSeries) {
+      int index = indexes.get(entityColum.getIdentifier());
+      _isGroupBySelections[index] = true;
+    }
+
+    for (int i = 0; i < _isGroupBySelections.length; i++) {
+      if (_isGroupBySelections[i]) {
+        _groupByKeyIndexes.add(i);
+      }
+    }
+
+    List<Object[]>[] timeBucketedRawRows = 
putRawRowsIntoTimeBucket(brokerResponseNative.getResultTable().getRows());
+
+    replaceColumnNameWithAlias(dataSchema);
+
+    if (_queryContext.getAggregationFunctions() == null) {
+
+      Map<String, Integer> sourceColumnsIndexes = new HashMap<>();
+      for (int i = 0; i < dataSchema.getColumnNames().length; i++) {
+        sourceColumnsIndexes.put(dataSchema.getColumnName(i), i);
+      }
+      _sourceColumnIndexForResultSchema = new 
int[resultTableSchema.getColumnNames().length];
+      for (int i = 0; i < _sourceColumnIndexForResultSchema.length; i++) {
+        _sourceColumnIndexForResultSchema[i] = 
sourceColumnsIndexes.get(resultTableSchema.getColumnName(i));
+      }
+    }
+
+    List<Object[]> resultRows = gapFillAndAggregate(timeBucketedRawRows, 
resultTableSchema, dataSchema);
+    brokerResponseNative.setResultTable(new ResultTable(resultTableSchema, 
resultRows));
+  }
+
+  /**
+   * Constructs the DataSchema for the ResultTable.
+   */
+  private DataSchema getResultTableDataSchema(DataSchema dataSchema) {
+    if (_gapfillType == GapfillUtils.GapfillType.GAP_FILL) {
+      return dataSchema;
+    }
+
+    int numOfColumns = _queryContext.getSelectExpressions().size();
+    String[] columnNames = new String[numOfColumns];
+    ColumnDataType[] columnDataTypes = new ColumnDataType[numOfColumns];
+    for (int i = 0; i < numOfColumns; i++) {
+      ExpressionContext expressionContext = 
_queryContext.getSelectExpressions().get(i);
+      if (GapfillUtils.isGapfill(expressionContext)) {
+        expressionContext = 
expressionContext.getFunction().getArguments().get(0);
+      }
+      if (expressionContext.getType() != ExpressionContext.Type.FUNCTION) {
+        columnNames[i] = expressionContext.getIdentifier();
+        columnDataTypes[i] = ColumnDataType.STRING;
+      } else {
+        FunctionContext functionContext = expressionContext.getFunction();
+        AggregationFunction aggregationFunction =
+            AggregationFunctionFactory.getAggregationFunction(functionContext, 
_queryContext);
+        columnDataTypes[i] = aggregationFunction.getFinalResultColumnType();
+        columnNames[i] = functionContext.toString();
+      }
+    }
+    return new DataSchema(columnNames, columnDataTypes);
+  }
+
+  private Key constructGroupKeys(Object[] row) {
+    Object[] groupKeys = new Object[_groupByKeyIndexes.size()];
+    for (int i = 0; i < _groupByKeyIndexes.size(); i++) {
+      groupKeys[i] = row[_groupByKeyIndexes.get(i)];
+    }
+    return new Key(groupKeys);
+  }
+
+  private long truncate(long epoch) {
+    int sz = _gapfillDateTimeGranularity.getSize();
+    return epoch / sz * sz;
+  }
+
+  private List<Object[]> gapFillAndAggregate(List<Object[]>[] 
timeBucketedRawRows,
+      DataSchema dataSchemaForAggregatedResult, DataSchema dataSchema) {
+    List<Object[]> result = new ArrayList<>();
+
+    GapfillFilterHandler postGapfillFilterHandler = null;
+    if (_queryContext.getSubquery() != null && _queryContext.getFilter() != 
null) {
+      postGapfillFilterHandler = new 
GapfillFilterHandler(_queryContext.getFilter(), dataSchema);
+    }
+    GapfillFilterHandler postAggregateHavingFilterHandler = null;
+    if (_queryContext.getHavingFilter() != null) {
+      postAggregateHavingFilterHandler =
+          new GapfillFilterHandler(_queryContext.getHavingFilter(), 
dataSchemaForAggregatedResult);
+    }
+    long start = _startMs;
+    ColumnDataType[] resultColumnDataTypes = dataSchema.getColumnDataTypes();
+    List<Object[]> bucketedResult = new ArrayList<>();
+    for (long time = _startMs; time < _endMs; time += _gapfillTimeBucketSize) {
+      int index = findGapfillBucketIndex(time);
+      gapfill(time, bucketedResult, timeBucketedRawRows[index], dataSchema, 
postGapfillFilterHandler);
+      if (_queryContext.getAggregationFunctions() == null) {

Review comment:
       When _queryContext.getAggregationFunctions() == null, the bucketed 
result will be added to final result. But when _aggregationSize == 1, it need 
go through the aggregation.
   
   So I cannot take this suggestion.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org

[GitHub] [pinot] weixiangsun commented on a change in pull request #8029: Add Time-Series Gapfilling functionality.

Reply via email to