This is an automated email from the ASF dual-hosted git repository.

morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.1 by this push:
     new a7548006ef1 branch-3.1: [opt](nereids) add cost penalty for pattern 
Aggregation-nestedLoopJoin #51127 (#53891)
a7548006ef1 is described below

commit a7548006ef1bac6850ca3ce0bc2e9e445a0c2890
Author: minghong <zhoumingh...@selectdb.com>
AuthorDate: Mon Jul 28 17:14:02 2025 +0800

    branch-3.1: [opt](nereids) add cost penalty for pattern 
Aggregation-nestedLoopJoin #51127 (#53891)
    
    pick #51127
---
 .../org/apache/doris/nereids/cost/CostModelV1.java |  17 ++++-
 regression-test/data/shape_check/others/nlj.out    | Bin 0 -> 481 bytes
 .../suites/shape_check/others/nlj.groovy           |  74 +++++++++++++++++++++
 3 files changed, 90 insertions(+), 1 deletion(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostModelV1.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostModelV1.java
index 4b876015522..db965485357 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostModelV1.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostModelV1.java
@@ -493,9 +493,24 @@ class CostModelV1 extends PlanVisitor<Cost, PlanContext> {
         Preconditions.checkState(context.arity() == 2);
         Statistics leftStatistics = context.getChildStatistics(0);
         Statistics rightStatistics = context.getChildStatistics(1);
+        /*
+         * nljPenalty:
+         * The row count estimation for nested loop join (NLJ) results often 
has significant errors.
+         * When the estimated row count is higher than the actual value, the 
cost benefits of subsequent
+         * operators (e.g., aggregation) may be overestimated. This can lead 
the optimizer to choose a
+         * plan where a small table joins a large table, severely impacting 
the overall SQL execution efficiency.
+         *
+         * For example, if the subsequent operator is an aggregation (AGG) and 
the GROUP BY key aligns with
+         * the distribution key of the small table, the optimizer might 
prioritize avoiding shuffling the NLJ
+         * results by choosing to join the small table to the large table, 
even if this is suboptimal.
+         */
+        double nljPenalty = 1.0;
+        if (leftStatistics.getRowCount() < 10 * rightStatistics.getRowCount()) 
{
+            nljPenalty = Math.min(leftStatistics.getRowCount(), 
rightStatistics.getRowCount());
+        }
         return CostV1.of(context.getSessionVariable(),
                 leftStatistics.getRowCount() * rightStatistics.getRowCount(),
-                rightStatistics.getRowCount(),
+                rightStatistics.getRowCount() * nljPenalty,
                 0);
     }
 
diff --git a/regression-test/data/shape_check/others/nlj.out 
b/regression-test/data/shape_check/others/nlj.out
new file mode 100644
index 00000000000..7a4caa03911
Binary files /dev/null and b/regression-test/data/shape_check/others/nlj.out 
differ
diff --git a/regression-test/suites/shape_check/others/nlj.groovy 
b/regression-test/suites/shape_check/others/nlj.groovy
new file mode 100644
index 00000000000..63920380a73
--- /dev/null
+++ b/regression-test/suites/shape_check/others/nlj.groovy
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+suite("nlj") {
+    sql 'set enable_parallel_result_sink=false;'
+    sql """
+    drop table if exists a;
+    create table a (
+        `id` int, 
+        a_date dateV2,
+        x int,
+        y int
+    )
+    DISTRIBUTED BY HASH(id) BUCKETS 8
+    PROPERTIES (
+    "replication_allocation" = "tag.location.default: 1"
+    );
+    alter table a modify column a_date set stats ('row_count'='1428418116', 
'ndv'='498', 'min_value'='2023-12-31', 'max_value'='2025-05-18', 
'avg_size'='4');
+
+
+    drop table if exists b;
+    create table b (
+        `id` int, 
+        b_date dateV2,
+    )
+    DISTRIBUTED BY HASH(b_date) BUCKETS 8
+    PROPERTIES (
+    "replication_allocation" = "tag.location.default: 1"
+    );
+
+
+    alter table b modify column b_date set stats ('row_count'='505', 
'ndv'='505', 'min_value'='2025-05-14', 'max_value'='2025-05-15', 
'avg_size'='4');
+
+    drop table if exists c;
+    create table c (
+        `id` int, 
+        c_date dateV2,
+    )
+    DISTRIBUTED BY HASH(id) BUCKETS 8
+    PROPERTIES (
+    "replication_allocation" = "tag.location.default: 1"
+    );
+
+    alter table c modify column c_date set stats ('row_count'='505', 
'ndv'='505', 'min_value'='2025-05-14', 'max_value'='2025-05-15', 
'avg_size'='4');
+    
+    set disable_nereids_rules='PRUNE_EMPTY_PARTITION';
+    set runtime_filter_mode=off;
+    """
+
+    // expected join order : a b
+    // order b-a is good for aggregation, but usually this order is extremely 
bad for join.
+    qt_shape """
+    explain shape plan
+    select b_date
+    from a join b on a_date>b_date
+    group by b_date, a_date, a.x
+    """
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to