This is an automated email from the ASF dual-hosted git repository. morrysnow pushed a commit to branch branch-3.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.1 by this push: new a7548006ef1 branch-3.1: [opt](nereids) add cost penalty for pattern Aggregation-nestedLoopJoin #51127 (#53891) a7548006ef1 is described below commit a7548006ef1bac6850ca3ce0bc2e9e445a0c2890 Author: minghong <zhoumingh...@selectdb.com> AuthorDate: Mon Jul 28 17:14:02 2025 +0800 branch-3.1: [opt](nereids) add cost penalty for pattern Aggregation-nestedLoopJoin #51127 (#53891) pick #51127 --- .../org/apache/doris/nereids/cost/CostModelV1.java | 17 ++++- regression-test/data/shape_check/others/nlj.out | Bin 0 -> 481 bytes .../suites/shape_check/others/nlj.groovy | 74 +++++++++++++++++++++ 3 files changed, 90 insertions(+), 1 deletion(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostModelV1.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostModelV1.java index 4b876015522..db965485357 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostModelV1.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostModelV1.java @@ -493,9 +493,24 @@ class CostModelV1 extends PlanVisitor<Cost, PlanContext> { Preconditions.checkState(context.arity() == 2); Statistics leftStatistics = context.getChildStatistics(0); Statistics rightStatistics = context.getChildStatistics(1); + /* + * nljPenalty: + * The row count estimation for nested loop join (NLJ) results often has significant errors. + * When the estimated row count is higher than the actual value, the cost benefits of subsequent + * operators (e.g., aggregation) may be overestimated. This can lead the optimizer to choose a + * plan where a small table joins a large table, severely impacting the overall SQL execution efficiency. + * + * For example, if the subsequent operator is an aggregation (AGG) and the GROUP BY key aligns with + * the distribution key of the small table, the optimizer might prioritize avoiding shuffling the NLJ + * results by choosing to join the small table to the large table, even if this is suboptimal. + */ + double nljPenalty = 1.0; + if (leftStatistics.getRowCount() < 10 * rightStatistics.getRowCount()) { + nljPenalty = Math.min(leftStatistics.getRowCount(), rightStatistics.getRowCount()); + } return CostV1.of(context.getSessionVariable(), leftStatistics.getRowCount() * rightStatistics.getRowCount(), - rightStatistics.getRowCount(), + rightStatistics.getRowCount() * nljPenalty, 0); } diff --git a/regression-test/data/shape_check/others/nlj.out b/regression-test/data/shape_check/others/nlj.out new file mode 100644 index 00000000000..7a4caa03911 Binary files /dev/null and b/regression-test/data/shape_check/others/nlj.out differ diff --git a/regression-test/suites/shape_check/others/nlj.groovy b/regression-test/suites/shape_check/others/nlj.groovy new file mode 100644 index 00000000000..63920380a73 --- /dev/null +++ b/regression-test/suites/shape_check/others/nlj.groovy @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +suite("nlj") { + sql 'set enable_parallel_result_sink=false;' + sql """ + drop table if exists a; + create table a ( + `id` int, + a_date dateV2, + x int, + y int + ) + DISTRIBUTED BY HASH(id) BUCKETS 8 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + alter table a modify column a_date set stats ('row_count'='1428418116', 'ndv'='498', 'min_value'='2023-12-31', 'max_value'='2025-05-18', 'avg_size'='4'); + + + drop table if exists b; + create table b ( + `id` int, + b_date dateV2, + ) + DISTRIBUTED BY HASH(b_date) BUCKETS 8 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + + + alter table b modify column b_date set stats ('row_count'='505', 'ndv'='505', 'min_value'='2025-05-14', 'max_value'='2025-05-15', 'avg_size'='4'); + + drop table if exists c; + create table c ( + `id` int, + c_date dateV2, + ) + DISTRIBUTED BY HASH(id) BUCKETS 8 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + + alter table c modify column c_date set stats ('row_count'='505', 'ndv'='505', 'min_value'='2025-05-14', 'max_value'='2025-05-15', 'avg_size'='4'); + + set disable_nereids_rules='PRUNE_EMPTY_PARTITION'; + set runtime_filter_mode=off; + """ + + // expected join order : a b + // order b-a is good for aggregation, but usually this order is extremely bad for join. + qt_shape """ + explain shape plan + select b_date + from a join b on a_date>b_date + group by b_date, a_date, a.x + """ +} \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org