This is an automated email from the ASF dual-hosted git repository. englefly pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 4174d5a707 [opt](nereids) optimze aggregation estimation #18607 4174d5a707 is described below commit 4174d5a7077ca540ba0a70d7504d5f973819e2cf Author: minghong <engle...@gmail.com> AuthorDate: Fri Apr 14 16:13:25 2023 +0800 [opt](nereids) optimze aggregation estimation #18607 For `select count(*) from T group by A, B`, suppose `ndv(A) > ndv(B)`. The estimated row count of the aggregate is between ndv(A) and ndv(A) * ndv(B). In the previous version, we chose the upper bound, that is, ndv(A) * ndv(B). The drawback of this choice is that the estimated row count is often bigger than the row count of T. In this version, we choose the lower bound. --- .../rules/implementation/AggregateStrategies.java | 25 -------------- .../doris/nereids/stats/StatsCalculator.java | 7 ++-- .../data/nereids_tpchPlanShape_p0/shape/q10.out | 33 +++++++++---------- .../data/nereids_tpchPlanShape_p0/shape/q13.out | 17 +++++----- .../data/nereids_tpchPlanShape_p0/shape/q18.out | 38 ++++++++++------------ 5 files changed, 44 insertions(+), 76 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/AggregateStrategies.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/AggregateStrategies.java index aa39ad3467..be914f938e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/AggregateStrategies.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/AggregateStrategies.java @@ -63,7 +63,6 @@ import org.apache.doris.nereids.trees.plans.physical.PhysicalStorageLayerAggrega import org.apache.doris.nereids.util.ExpressionUtils; import org.apache.doris.nereids.util.TypeCoercionUtils; import org.apache.doris.qe.ConnectContext; -import org.apache.doris.statistics.Statistics; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; @@ -303,20 +302,6 @@ public class AggregateStrategies implements 
ImplementationRuleFactory { } } - private boolean aggregateOnUniqueColumn( - LogicalAggregate<? extends Plan> logicalAgg) { - if (logicalAgg.child() instanceof GroupPlan) { - Statistics childStats = ((GroupPlan) logicalAgg.child()).getGroup().getStatistics(); - if (childStats != null) { - return logicalAgg.getGroupByExpressions().stream().anyMatch( - expression -> - childStats.almostUniqueExpression(expression) - ); - } - } - return false; - } - /** * sql: select count(*) from tbl group by id * @@ -345,11 +330,6 @@ public class AggregateStrategies implements ImplementationRuleFactory { */ private List<PhysicalHashAggregate<Plan>> onePhaseAggregateWithoutDistinct( LogicalAggregate<? extends Plan> logicalAgg, ConnectContext connectContext) { - if (!logicalAgg.getGroupByExpressions().isEmpty() - && !aggregateOnUniqueColumn(logicalAgg)) { - // twoPhaseAggregate beats onePhaseAggregate - return null; - } RequireProperties requireGather = RequireProperties.of(PhysicalProperties.GATHER); AggregateParam inputToResultParam = AggregateParam.localResult(); List<NamedExpression> newOutput = ExpressionUtils.rewriteDownShortCircuit( @@ -776,11 +756,6 @@ public class AggregateStrategies implements ImplementationRuleFactory { */ private List<PhysicalHashAggregate<? extends Plan>> twoPhaseAggregateWithDistinct( LogicalAggregate<? 
extends Plan> logicalAgg, ConnectContext connectContext) { - if (!logicalAgg.getGroupByExpressions().isEmpty() - && !aggregateOnUniqueColumn(logicalAgg)) { - // threePhaseAggregate beats twoPhaseAggregate - return null; - } Set<AggregateFunction> aggregateFunctions = logicalAgg.getAggregateFunctions(); Set<Expression> distinctArguments = aggregateFunctions.stream() diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java index 5be38b5dde..4c53ff0c1d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java @@ -479,11 +479,8 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> { //all column stats are unknown, use default ratio resultSetCount = inputRowCount * DEFAULT_AGGREGATE_RATIO; } else { - resultSetCount = groupByKeyStats.stream() - .map(s -> s.ndv) - .reduce(1.0, (a, b) -> a * b); - //agg output tuples should be less than input tuples - resultSetCount = Math.min(resultSetCount, inputRowCount); + resultSetCount = groupByKeyStats.stream().map(s -> s.ndv) + .max(Double::compare).get(); } } } diff --git a/regression-test/data/nereids_tpchPlanShape_p0/shape/q10.out b/regression-test/data/nereids_tpchPlanShape_p0/shape/q10.out index 5cdb232001..b70a354e27 100644 --- a/regression-test/data/nereids_tpchPlanShape_p0/shape/q10.out +++ b/regression-test/data/nereids_tpchPlanShape_p0/shape/q10.out @@ -4,24 +4,23 @@ PhysicalTopN --PhysicalDistribute ----PhysicalTopN ------PhysicalProject ---------hashAgg[GLOBAL] +--------hashAgg[LOCAL] ----------PhysicalDistribute -------------hashAgg[LOCAL] ---------------PhysicalProject -----------------hashJoin[INNER_JOIN](lineitem.l_orderkey = orders.o_orderkey) -------------------PhysicalProject ---------------------filter((lineitem.l_returnflag = 'R')) 
-----------------------PhysicalOlapScan[lineitem] -------------------PhysicalDistribute ---------------------hashJoin[INNER_JOIN](customer.c_nationkey = nation.n_nationkey) -----------------------hashJoin[INNER_JOIN](customer.c_custkey = orders.o_custkey) -------------------------PhysicalProject ---------------------------PhysicalOlapScan[customer] -------------------------PhysicalDistribute ---------------------------PhysicalProject -----------------------------filter((orders.o_orderdate < 1994-01-01)(orders.o_orderdate >= 1993-10-01)) -------------------------------PhysicalOlapScan[orders] +------------PhysicalProject +--------------hashJoin[INNER_JOIN](lineitem.l_orderkey = orders.o_orderkey) +----------------PhysicalProject +------------------filter((lineitem.l_returnflag = 'R')) +--------------------PhysicalOlapScan[lineitem] +----------------PhysicalDistribute +------------------hashJoin[INNER_JOIN](customer.c_nationkey = nation.n_nationkey) +--------------------hashJoin[INNER_JOIN](customer.c_custkey = orders.o_custkey) +----------------------PhysicalProject +------------------------PhysicalOlapScan[customer] ----------------------PhysicalDistribute ------------------------PhysicalProject ---------------------------PhysicalOlapScan[nation] +--------------------------filter((orders.o_orderdate < 1994-01-01)(orders.o_orderdate >= 1993-10-01)) +----------------------------PhysicalOlapScan[orders] +--------------------PhysicalDistribute +----------------------PhysicalProject +------------------------PhysicalOlapScan[nation] diff --git a/regression-test/data/nereids_tpchPlanShape_p0/shape/q13.out b/regression-test/data/nereids_tpchPlanShape_p0/shape/q13.out index 6df8a92b08..d196ba817e 100644 --- a/regression-test/data/nereids_tpchPlanShape_p0/shape/q13.out +++ b/regression-test/data/nereids_tpchPlanShape_p0/shape/q13.out @@ -7,14 +7,13 @@ PhysicalQuickSort --------PhysicalDistribute ----------hashAgg[LOCAL] ------------PhysicalProject 
---------------hashAgg[GLOBAL] -----------------hashAgg[LOCAL] -------------------PhysicalProject ---------------------hashJoin[RIGHT_OUTER_JOIN](customer.c_custkey = orders.o_custkey) -----------------------PhysicalDistribute -------------------------PhysicalProject ---------------------------filter(( not (o_comment like '%special%requests%'))) -----------------------------PhysicalOlapScan[orders] +--------------hashAgg[LOCAL] +----------------PhysicalProject +------------------hashJoin[RIGHT_OUTER_JOIN](customer.c_custkey = orders.o_custkey) +--------------------PhysicalDistribute ----------------------PhysicalProject -------------------------PhysicalOlapScan[customer] +------------------------filter(( not (o_comment like '%special%requests%'))) +--------------------------PhysicalOlapScan[orders] +--------------------PhysicalProject +----------------------PhysicalOlapScan[customer] diff --git a/regression-test/data/nereids_tpchPlanShape_p0/shape/q18.out b/regression-test/data/nereids_tpchPlanShape_p0/shape/q18.out index 299b4bb581..3e05c3a7c0 100644 --- a/regression-test/data/nereids_tpchPlanShape_p0/shape/q18.out +++ b/regression-test/data/nereids_tpchPlanShape_p0/shape/q18.out @@ -3,25 +3,23 @@ PhysicalTopN --PhysicalDistribute ----PhysicalTopN -------hashAgg[GLOBAL] ---------hashAgg[LOCAL] -----------PhysicalProject -------------hashJoin[INNER_JOIN](orders.o_orderkey = lineitem.l_orderkey) +------hashAgg[LOCAL] +--------PhysicalProject +----------hashJoin[INNER_JOIN](orders.o_orderkey = lineitem.l_orderkey) +------------PhysicalProject +--------------PhysicalOlapScan[lineitem] +------------PhysicalDistribute --------------PhysicalProject -----------------PhysicalOlapScan[lineitem] ---------------PhysicalDistribute -----------------PhysicalProject -------------------hashJoin[INNER_JOIN](customer.c_custkey = orders.o_custkey) ---------------------PhysicalProject -----------------------PhysicalOlapScan[customer] ---------------------PhysicalDistribute 
-----------------------hashJoin[LEFT_SEMI_JOIN](orders.o_orderkey = lineitem.l_orderkey) -------------------------PhysicalProject ---------------------------PhysicalOlapScan[orders] -------------------------PhysicalProject ---------------------------filter((sum(l_quantity) > 300.000000000)) -----------------------------hashAgg[GLOBAL] -------------------------------hashAgg[LOCAL] ---------------------------------PhysicalProject -----------------------------------PhysicalOlapScan[lineitem] +----------------hashJoin[INNER_JOIN](customer.c_custkey = orders.o_custkey) +------------------PhysicalProject +--------------------PhysicalOlapScan[customer] +------------------PhysicalDistribute +--------------------hashJoin[LEFT_SEMI_JOIN](orders.o_orderkey = lineitem.l_orderkey) +----------------------PhysicalProject +------------------------PhysicalOlapScan[orders] +----------------------PhysicalProject +------------------------filter((sum(l_quantity) > 300.000000000)) +--------------------------hashAgg[LOCAL] +----------------------------PhysicalProject +------------------------------PhysicalOlapScan[lineitem] --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org