This is an automated email from the ASF dual-hosted git repository.

englefly pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 0e9fad4fe9 [stats](nereids) improve Anti join stats estimation #22444
0e9fad4fe9 is described below

commit 0e9fad4fe9e95e07b2337e8e5baa24528756e3a9
Author: minghong <engle...@gmail.com>
AuthorDate: Fri Aug 4 12:48:39 2023 +0800

    [stats](nereids) improve Anti join stats estimation #22444
    
    No impact on TPC-H.
    Impact on TPC-DS: the plans for queries 16, 69, and 94 are improved.
---
 .../apache/doris/nereids/stats/JoinEstimation.java |  7 ++--
 .../nereids_tpcds_shape_sf100_p0/shape/query16.out | 38 +++++++++++-----------
 .../nereids_tpcds_shape_sf100_p0/shape/query69.out |  5 +--
 .../nereids_tpcds_shape_sf100_p0/shape/query94.out |  7 ++--
 4 files changed, 31 insertions(+), 26 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java
index 36dc90b343..9c42acf5fd 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java
@@ -44,6 +44,7 @@ import java.util.stream.Collectors;
  * TODO: Update other props in the ColumnStats properly.
  */
 public class JoinEstimation {
+    private static double DEFAULT_ANTI_JOIN_SELECTIVITY_COEFFICIENT = 0.3;
 
     private static EqualTo normalizeHashJoinCondition(EqualTo equalTo, 
Statistics leftStats, Statistics rightStats) {
         boolean changeOrder = equalTo.left().getInputSlots().stream().anyMatch(
@@ -221,7 +222,8 @@ public class JoinEstimation {
             if (join.getJoinType().isSemiJoin()) {
                 rowCount = semiRowCount;
             } else {
-                rowCount = leftStats.getRowCount() - semiRowCount;
+                rowCount = Math.max(leftStats.getRowCount() - semiRowCount,
+                        leftStats.getRowCount() * 
DEFAULT_ANTI_JOIN_SELECTIVITY_COEFFICIENT);
             }
         } else {
             //right semi or anti
@@ -230,7 +232,8 @@ public class JoinEstimation {
             if (join.getJoinType().isSemiJoin()) {
                 rowCount = semiRowCount;
             } else {
-                rowCount = rightStats.getRowCount() - semiRowCount;
+                rowCount = Math.max(rightStats.getRowCount() - semiRowCount,
+                        rightStats.getRowCount() * 
DEFAULT_ANTI_JOIN_SELECTIVITY_COEFFICIENT);
             }
         }
         return Math.max(1, rowCount);
diff --git 
a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query16.out 
b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query16.out
index 967e3b6063..4b580416f2 100644
--- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query16.out
+++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query16.out
@@ -8,32 +8,32 @@ PhysicalResultSink
 ----------PhysicalDistribute
 ------------hashAgg[LOCAL]
 --------------PhysicalProject
-----------------hashJoin[INNER_JOIN](cs1.cs_call_center_sk = 
call_center.cc_call_center_sk)
-------------------PhysicalProject
---------------------filter(cc_county IN ('Ziebach County', 'Luce County', 
'Richland County', 'Daviess County', 'Barrow County'))
-----------------------PhysicalOlapScan[call_center]
+----------------hashJoin[INNER_JOIN](cs1.cs_ship_date_sk = date_dim.d_date_sk)
 ------------------PhysicalDistribute
 --------------------PhysicalProject
 ----------------------hashJoin[RIGHT_SEMI_JOIN](cs1.cs_order_number = 
cs2.cs_order_number)( not (cs_warehouse_sk = cs_warehouse_sk))
 ------------------------PhysicalDistribute
 --------------------------PhysicalProject
 ----------------------------PhysicalOlapScan[catalog_sales]
-------------------------PhysicalDistribute
---------------------------hashJoin[INNER_JOIN](cs1.cs_ship_date_sk = 
date_dim.d_date_sk)
-----------------------------PhysicalProject
-------------------------------filter((cast(d_date as DATETIMEV2(0)) <= 
cast(days_add(cast('2002-4-01' as DATEV2), INTERVAL 60 DAY) as 
DATETIMEV2(0)))(date_dim.d_date >= 2002-04-01))
---------------------------------PhysicalOlapScan[date_dim]
+------------------------hashJoin[INNER_JOIN](cs1.cs_call_center_sk = 
call_center.cc_call_center_sk)
+--------------------------hashJoin[RIGHT_ANTI_JOIN](cs1.cs_order_number = 
cr1.cr_order_number)
 ----------------------------PhysicalDistribute
-------------------------------hashJoin[RIGHT_ANTI_JOIN](cs1.cs_order_number = 
cr1.cr_order_number)
+------------------------------PhysicalProject
+--------------------------------PhysicalOlapScan[catalog_returns]
+----------------------------PhysicalDistribute
+------------------------------hashJoin[INNER_JOIN](cs1.cs_ship_addr_sk = 
customer_address.ca_address_sk)
+--------------------------------PhysicalProject
+----------------------------------PhysicalOlapScan[catalog_sales]
 --------------------------------PhysicalDistribute
 ----------------------------------PhysicalProject
-------------------------------------PhysicalOlapScan[catalog_returns]
---------------------------------PhysicalDistribute
-----------------------------------hashJoin[INNER_JOIN](cs1.cs_ship_addr_sk = 
customer_address.ca_address_sk)
-------------------------------------PhysicalProject
---------------------------------------PhysicalOlapScan[catalog_sales]
-------------------------------------PhysicalDistribute
---------------------------------------PhysicalProject
-----------------------------------------filter((cast(ca_state as VARCHAR(*)) = 
'WV'))
-------------------------------------------PhysicalOlapScan[customer_address]
+------------------------------------filter((cast(ca_state as VARCHAR(*)) = 
'WV'))
+--------------------------------------PhysicalOlapScan[customer_address]
+--------------------------PhysicalDistribute
+----------------------------PhysicalProject
+------------------------------filter(cc_county IN ('Ziebach County', 'Luce 
County', 'Richland County', 'Daviess County', 'Barrow County'))
+--------------------------------PhysicalOlapScan[call_center]
+------------------PhysicalDistribute
+--------------------PhysicalProject
+----------------------filter((cast(d_date as DATETIMEV2(0)) <= 
cast(days_add(cast('2002-4-01' as DATEV2), INTERVAL 60 DAY) as 
DATETIMEV2(0)))(date_dim.d_date >= 2002-04-01))
+------------------------PhysicalOlapScan[date_dim]
 
diff --git 
a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query69.out 
b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query69.out
index 653e6e166b..96ae616ceb 100644
--- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query69.out
+++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query69.out
@@ -32,8 +32,9 @@ PhysicalResultSink
 ------------------------------------PhysicalOlapScan[date_dim]
 ------------------------PhysicalDistribute
 
--------------------------hashJoin[INNER_JOIN](customer_demographics.cd_demo_sk 
= c.c_current_cdemo_sk)
-----------------------------PhysicalProject
-------------------------------PhysicalOlapScan[customer_demographics]
+----------------------------PhysicalDistribute
+------------------------------PhysicalProject
+--------------------------------PhysicalOlapScan[customer_demographics]
 ----------------------------PhysicalDistribute
 ------------------------------hashJoin[RIGHT_ANTI_JOIN](c.c_customer_sk = 
web_sales.ws_bill_customer_sk)
 --------------------------------PhysicalDistribute
diff --git 
a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query94.out 
b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query94.out
index b613247de9..26640d46a8 100644
--- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query94.out
+++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query94.out
@@ -9,9 +9,6 @@ PhysicalResultSink
 ------------hashAgg[LOCAL]
 --------------PhysicalProject
 ----------------hashJoin[INNER_JOIN](ws1.ws_ship_date_sk = date_dim.d_date_sk)
-------------------PhysicalProject
---------------------filter((date_dim.d_date >= 2000-02-01)(cast(d_date as 
DATETIMEV2(0)) <= cast(days_add(cast('2000-2-01' as DATEV2), INTERVAL 60 DAY) 
as DATETIMEV2(0))))
-----------------------PhysicalOlapScan[date_dim]
 ------------------PhysicalDistribute
 --------------------PhysicalProject
 ----------------------hashJoin[RIGHT_SEMI_JOIN](ws1.ws_order_number = 
ws2.ws_order_number)( not (ws_warehouse_sk = ws_warehouse_sk))
@@ -35,4 +32,8 @@ PhysicalResultSink
 --------------------------------PhysicalProject
 ----------------------------------filter((cast(web_company_name as VARCHAR(*)) 
= 'pri'))
 ------------------------------------PhysicalOlapScan[web_site]
+------------------PhysicalDistribute
+--------------------PhysicalProject
+----------------------filter((date_dim.d_date >= 2000-02-01)(cast(d_date as 
DATETIMEV2(0)) <= cast(days_add(cast('2000-2-01' as DATEV2), INTERVAL 60 DAY) 
as DATETIMEV2(0))))
+------------------------PhysicalOlapScan[date_dim]
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to