This is an automated email from the ASF dual-hosted git repository. englefly pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 0e9fad4fe9 [stats](nereids) improve Anti join stats estimation #22444 0e9fad4fe9 is described below commit 0e9fad4fe9e95e07b2337e8e5baa24528756e3a9 Author: minghong <engle...@gmail.com> AuthorDate: Fri Aug 4 12:48:39 2023 +0800 [stats](nereids) improve Anti join stats estimation #22444 No impact on TPC-H; on TPC-DS, queries 16/69/94 improved. --- .../apache/doris/nereids/stats/JoinEstimation.java | 7 ++-- .../nereids_tpcds_shape_sf100_p0/shape/query16.out | 38 +++++++++++----------- .../nereids_tpcds_shape_sf100_p0/shape/query69.out | 5 +-- .../nereids_tpcds_shape_sf100_p0/shape/query94.out | 7 ++-- 4 files changed, 31 insertions(+), 26 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java index 36dc90b343..9c42acf5fd 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java @@ -44,6 +44,7 @@ import java.util.stream.Collectors; * TODO: Update other props in the ColumnStats properly. 
*/ public class JoinEstimation { + private static double DEFAULT_ANTI_JOIN_SELECTIVITY_COEFFICIENT = 0.3; private static EqualTo normalizeHashJoinCondition(EqualTo equalTo, Statistics leftStats, Statistics rightStats) { boolean changeOrder = equalTo.left().getInputSlots().stream().anyMatch( @@ -221,7 +222,8 @@ public class JoinEstimation { if (join.getJoinType().isSemiJoin()) { rowCount = semiRowCount; } else { - rowCount = leftStats.getRowCount() - semiRowCount; + rowCount = Math.max(leftStats.getRowCount() - semiRowCount, + leftStats.getRowCount() * DEFAULT_ANTI_JOIN_SELECTIVITY_COEFFICIENT); } } else { //right semi or anti @@ -230,7 +232,8 @@ public class JoinEstimation { if (join.getJoinType().isSemiJoin()) { rowCount = semiRowCount; } else { - rowCount = rightStats.getRowCount() - semiRowCount; + rowCount = Math.max(rightStats.getRowCount() - semiRowCount, + rightStats.getRowCount() * DEFAULT_ANTI_JOIN_SELECTIVITY_COEFFICIENT); } } return Math.max(1, rowCount); diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query16.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query16.out index 967e3b6063..4b580416f2 100644 --- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query16.out +++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query16.out @@ -8,32 +8,32 @@ PhysicalResultSink ----------PhysicalDistribute ------------hashAgg[LOCAL] --------------PhysicalProject -----------------hashJoin[INNER_JOIN](cs1.cs_call_center_sk = call_center.cc_call_center_sk) -------------------PhysicalProject ---------------------filter(cc_county IN ('Ziebach County', 'Luce County', 'Richland County', 'Daviess County', 'Barrow County')) -----------------------PhysicalOlapScan[call_center] +----------------hashJoin[INNER_JOIN](cs1.cs_ship_date_sk = date_dim.d_date_sk) ------------------PhysicalDistribute --------------------PhysicalProject ----------------------hashJoin[RIGHT_SEMI_JOIN](cs1.cs_order_number = cs2.cs_order_number)( not 
(cs_warehouse_sk = cs_warehouse_sk)) ------------------------PhysicalDistribute --------------------------PhysicalProject ----------------------------PhysicalOlapScan[catalog_sales] -------------------------PhysicalDistribute ---------------------------hashJoin[INNER_JOIN](cs1.cs_ship_date_sk = date_dim.d_date_sk) -----------------------------PhysicalProject -------------------------------filter((cast(d_date as DATETIMEV2(0)) <= cast(days_add(cast('2002-4-01' as DATEV2), INTERVAL 60 DAY) as DATETIMEV2(0)))(date_dim.d_date >= 2002-04-01)) ---------------------------------PhysicalOlapScan[date_dim] +------------------------hashJoin[INNER_JOIN](cs1.cs_call_center_sk = call_center.cc_call_center_sk) +--------------------------hashJoin[RIGHT_ANTI_JOIN](cs1.cs_order_number = cr1.cr_order_number) ----------------------------PhysicalDistribute -------------------------------hashJoin[RIGHT_ANTI_JOIN](cs1.cs_order_number = cr1.cr_order_number) +------------------------------PhysicalProject +--------------------------------PhysicalOlapScan[catalog_returns] +----------------------------PhysicalDistribute +------------------------------hashJoin[INNER_JOIN](cs1.cs_ship_addr_sk = customer_address.ca_address_sk) +--------------------------------PhysicalProject +----------------------------------PhysicalOlapScan[catalog_sales] --------------------------------PhysicalDistribute ----------------------------------PhysicalProject -------------------------------------PhysicalOlapScan[catalog_returns] ---------------------------------PhysicalDistribute -----------------------------------hashJoin[INNER_JOIN](cs1.cs_ship_addr_sk = customer_address.ca_address_sk) -------------------------------------PhysicalProject ---------------------------------------PhysicalOlapScan[catalog_sales] -------------------------------------PhysicalDistribute ---------------------------------------PhysicalProject -----------------------------------------filter((cast(ca_state as VARCHAR(*)) = 'WV')) 
-------------------------------------------PhysicalOlapScan[customer_address] +------------------------------------filter((cast(ca_state as VARCHAR(*)) = 'WV')) +--------------------------------------PhysicalOlapScan[customer_address] +--------------------------PhysicalDistribute +----------------------------PhysicalProject +------------------------------filter(cc_county IN ('Ziebach County', 'Luce County', 'Richland County', 'Daviess County', 'Barrow County')) +--------------------------------PhysicalOlapScan[call_center] +------------------PhysicalDistribute +--------------------PhysicalProject +----------------------filter((cast(d_date as DATETIMEV2(0)) <= cast(days_add(cast('2002-4-01' as DATEV2), INTERVAL 60 DAY) as DATETIMEV2(0)))(date_dim.d_date >= 2002-04-01)) +------------------------PhysicalOlapScan[date_dim] diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query69.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query69.out index 653e6e166b..96ae616ceb 100644 --- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query69.out +++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query69.out @@ -32,8 +32,9 @@ PhysicalResultSink ------------------------------------PhysicalOlapScan[date_dim] ------------------------PhysicalDistribute --------------------------hashJoin[INNER_JOIN](customer_demographics.cd_demo_sk = c.c_current_cdemo_sk) -----------------------------PhysicalProject -------------------------------PhysicalOlapScan[customer_demographics] +----------------------------PhysicalDistribute +------------------------------PhysicalProject +--------------------------------PhysicalOlapScan[customer_demographics] ----------------------------PhysicalDistribute ------------------------------hashJoin[RIGHT_ANTI_JOIN](c.c_customer_sk = web_sales.ws_bill_customer_sk) --------------------------------PhysicalDistribute diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query94.out 
b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query94.out index b613247de9..26640d46a8 100644 --- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query94.out +++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query94.out @@ -9,9 +9,6 @@ PhysicalResultSink ------------hashAgg[LOCAL] --------------PhysicalProject ----------------hashJoin[INNER_JOIN](ws1.ws_ship_date_sk = date_dim.d_date_sk) -------------------PhysicalProject ---------------------filter((date_dim.d_date >= 2000-02-01)(cast(d_date as DATETIMEV2(0)) <= cast(days_add(cast('2000-2-01' as DATEV2), INTERVAL 60 DAY) as DATETIMEV2(0)))) -----------------------PhysicalOlapScan[date_dim] ------------------PhysicalDistribute --------------------PhysicalProject ----------------------hashJoin[RIGHT_SEMI_JOIN](ws1.ws_order_number = ws2.ws_order_number)( not (ws_warehouse_sk = ws_warehouse_sk)) @@ -35,4 +32,8 @@ PhysicalResultSink --------------------------------PhysicalProject ----------------------------------filter((cast(web_company_name as VARCHAR(*)) = 'pri')) ------------------------------------PhysicalOlapScan[web_site] +------------------PhysicalDistribute +--------------------PhysicalProject +----------------------filter((date_dim.d_date >= 2000-02-01)(cast(d_date as DATETIMEV2(0)) <= cast(days_add(cast('2000-2-01' as DATEV2), INTERVAL 60 DAY) as DATETIMEV2(0)))) +------------------------PhysicalOlapScan[date_dim] --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org