Jackie-Jiang commented on a change in pull request #5786: URL: https://github.com/apache/incubator-pinot/pull/5786#discussion_r464645873
########## File path: pinot-core/src/main/java/org/apache/pinot/core/query/aggregation/function/PartitionedDistinctCountAggregationFunction.java ########## @@ -0,0 +1,425 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.core.query.aggregation.function; + +import it.unimi.dsi.fastutil.doubles.DoubleOpenHashSet; +import it.unimi.dsi.fastutil.floats.FloatOpenHashSet; +import it.unimi.dsi.fastutil.longs.LongOpenHashSet; +import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; +import java.util.Collection; +import java.util.Map; +import javax.annotation.Nullable; +import org.apache.pinot.common.function.AggregationFunctionType; +import org.apache.pinot.common.utils.DataSchema.ColumnDataType; +import org.apache.pinot.core.common.BlockValSet; +import org.apache.pinot.core.query.aggregation.AggregationResultHolder; +import org.apache.pinot.core.query.aggregation.ObjectAggregationResultHolder; +import org.apache.pinot.core.query.aggregation.groupby.GroupByResultHolder; +import org.apache.pinot.core.query.aggregation.groupby.ObjectGroupByResultHolder; +import org.apache.pinot.core.query.request.context.ExpressionContext; +import org.apache.pinot.spi.data.FieldSpec.DataType; +import 
org.apache.pinot.spi.utils.ByteArray; +import org.roaringbitmap.RoaringBitmap; + + +/** + * The {@code PartitionedDistinctCountAggregationFunction} calculates the number of distinct values for a given + * single-value expression. + * <p>IMPORTANT: This function relies on the expression values being partitioned for each segment, where there is no + * common values within different segments. + * <p>This function calculates the exact number of distinct values within the segment, then simply sums up the results + * from different segments to get the final result. + */ +public class PartitionedDistinctCountAggregationFunction extends BaseSingleInputAggregationFunction<Long, Long> { + + public PartitionedDistinctCountAggregationFunction(ExpressionContext expression) { + super(expression); + } + + @Override + public AggregationFunctionType getType() { + return AggregationFunctionType.PARTITIONEDDISTINCTCOUNT; + } + + @Override + public void accept(AggregationFunctionVisitorBase visitor) { + visitor.visit(this); + } + + @Override + public AggregationResultHolder createAggregationResultHolder() { + return new ObjectAggregationResultHolder(); + } + + @Override + public GroupByResultHolder createGroupByResultHolder(int initialCapacity, int maxCapacity) { + return new ObjectGroupByResultHolder(initialCapacity, maxCapacity); + } + + @Override + public void aggregate(int length, AggregationResultHolder aggregationResultHolder, + Map<ExpressionContext, BlockValSet> blockValSetMap) { + BlockValSet blockValSet = blockValSetMap.get(_expression); + + // For dictionary-encoded expression, store dictionary ids into a RoaringBitmap + if (blockValSet.getDictionary() != null) { + int[] dictIds = blockValSet.getDictionaryIdsSV(); + RoaringBitmap bitmap = aggregationResultHolder.getResult(); + if (bitmap == null) { + bitmap = new RoaringBitmap(); + aggregationResultHolder.setValue(bitmap); + } + bitmap.addN(dictIds, 0, length); + return; + } + + // For non-dictionary-encoded expression, 
store INT values into a RoaringBitmap, other types into an OpenHashSet + DataType valueType = blockValSet.getValueType(); + switch (valueType) { + case INT: + int[] intValues = blockValSet.getIntValuesSV(); + RoaringBitmap bitmap = aggregationResultHolder.getResult(); + if (bitmap == null) { + bitmap = new RoaringBitmap(); + aggregationResultHolder.setValue(bitmap); + } + bitmap.addN(intValues, 0, length); + break; + case LONG: + long[] longValues = blockValSet.getLongValuesSV(); + LongOpenHashSet longSet = aggregationResultHolder.getResult(); Review comment: There is a long version of roaring bitmap, `Roaring64NavigableMap`, which is still in an early stage in the current version of RoaringBitmap (`0.8.0`) that we import. We can test its performance when we upgrade the RoaringBitmap library to the latest version, and switch to it later, as doing so does not involve any backward-incompatible changes. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org