This is an automated email from the ASF dual-hosted git repository. morrysnow pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new d110859fad8 [fix](nereids) refine row count estimation for mark join (#38270) d110859fad8 is described below commit d110859fad8602c77001d16a867137ac77aae37c Author: xzj7019 <131111794+xzj7...@users.noreply.github.com> AuthorDate: Wed Jul 24 16:30:06 2024 +0800 [fix](nereids) refine row count estimation for mark join (#38270) Current semi/anti stats estimation doesn't consider the mark join case, whose row count should follow either side's stats without change. --- .../main/java/org/apache/doris/nereids/stats/JoinEstimation.java | 4 ++-- regression-test/data/nereids_hint_tpcds_p0/shape/query45.out | 2 +- .../nereids_tpcds_shape_sf1000_p0/bs_downgrade_shape/query45.out | 2 +- .../data/nereids_tpcds_shape_sf1000_p0/shape/query45.out | 2 +- .../data/nereids_tpcds_shape_sf1000_p0/shape/query51.out | 8 ++++---- .../data/nereids_tpcds_shape_sf100_p0/rf_prune/query45.out | 2 +- .../data/nereids_tpcds_shape_sf100_p0/shape/query45.out | 2 +- .../data/new_shapes_p0/tpcds_sf100/rf_prune/query45.out | 2 +- regression-test/data/new_shapes_p0/tpcds_sf100/shape/query45.out | 2 +- .../new_shapes_p0/tpcds_sf1000/bs_downgrade_shape/query45.out | 2 +- regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query45.out | 2 +- regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query51.out | 8 ++++---- 12 files changed, 19 insertions(+), 19 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java index 29e30b30f33..f8298871f0d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java @@ -267,8 +267,8 @@ public class JoinEstimation { } private static Statistics estimateSemiOrAnti(Statistics leftStats, Statistics rightStats, Join join) { - if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, join)) { - double sel = computeSelectivityForBuildSideWhenColStatsUnknown(rightStats, join); + if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, join) || join.isMarkJoin()) { + double sel = join.isMarkJoin() ? 1.0 : computeSelectivityForBuildSideWhenColStatsUnknown(rightStats, join); if (join.getJoinType().isLeftSemiOrAntiJoin()) { return new StatisticsBuilder().setRowCount(leftStats.getRowCount() * sel) .putColumnStatistics(leftStats.columnStatistics()) diff --git a/regression-test/data/nereids_hint_tpcds_p0/shape/query45.out b/regression-test/data/nereids_hint_tpcds_p0/shape/query45.out index b65fa9047c0..e032d162e9e 100644 --- a/regression-test/data/nereids_hint_tpcds_p0/shape/query45.out +++ b/regression-test/data/nereids_hint_tpcds_p0/shape/query45.out @@ -9,7 +9,7 @@ PhysicalResultSink ------------hashAgg[LOCAL] --------------PhysicalProject ----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1)) -------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] +------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] --------------------PhysicalProject ----------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk] ------------------------PhysicalProject diff --git a/regression-test/data/nereids_tpcds_shape_sf1000_p0/bs_downgrade_shape/query45.out b/regression-test/data/nereids_tpcds_shape_sf1000_p0/bs_downgrade_shape/query45.out index b65fa9047c0..e032d162e9e 100644 --- a/regression-test/data/nereids_tpcds_shape_sf1000_p0/bs_downgrade_shape/query45.out +++ b/regression-test/data/nereids_tpcds_shape_sf1000_p0/bs_downgrade_shape/query45.out @@ -9,7 +9,7 @@ PhysicalResultSink ------------hashAgg[LOCAL] --------------PhysicalProject ----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1)) -------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] +------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] --------------------PhysicalProject ----------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk] ------------------------PhysicalProject diff --git a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query45.out b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query45.out index b65fa9047c0..e032d162e9e 100644 --- a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query45.out +++ b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query45.out @@ -9,7 +9,7 @@ PhysicalResultSink ------------hashAgg[LOCAL] --------------PhysicalProject ----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1)) -------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] +------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] --------------------PhysicalProject ----------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk] ------------------------PhysicalProject diff --git a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query51.out b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query51.out index 6c22d2df308..38bec2403ec 100644 --- a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query51.out +++ b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query51.out @@ -19,9 +19,9 @@ PhysicalResultSink --------------------------------PhysicalDistribute[DistributionSpecHash] ----------------------------------hashAgg[LOCAL] ------------------------------------PhysicalProject ---------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk] +--------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ss_sold_date_sk] ----------------------------------------PhysicalProject -------------------------------------------PhysicalOlapScan[web_sales] apply RFs: RF1 +------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF1 ----------------------------------------PhysicalProject ------------------------------------------filter((date_dim.d_month_seq <= 1223) and (date_dim.d_month_seq >= 1212)) --------------------------------------------PhysicalOlapScan[date_dim] @@ -34,9 +34,9 @@ PhysicalResultSink --------------------------------PhysicalDistribute[DistributionSpecHash] ----------------------------------hashAgg[LOCAL] ------------------------------------PhysicalProject ---------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[ss_sold_date_sk] +--------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[ws_sold_date_sk] ----------------------------------------PhysicalProject -------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF0 +------------------------------------------PhysicalOlapScan[web_sales] apply RFs: RF0 ----------------------------------------PhysicalProject ------------------------------------------filter((date_dim.d_month_seq <= 1223) and (date_dim.d_month_seq >= 1212)) --------------------------------------------PhysicalOlapScan[date_dim] diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query45.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query45.out index 48886e631ea..377431110d4 100644 --- a/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query45.out +++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query45.out @@ -9,7 +9,7 @@ PhysicalResultSink ------------hashAgg[LOCAL] --------------PhysicalProject ----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1)) -------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] +------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] --------------------PhysicalProject ----------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() ------------------------PhysicalProject diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query45.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query45.out index a3a6b75ca6d..5c74bb70d39 100644 --- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query45.out +++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query45.out @@ -9,7 +9,7 @@ PhysicalResultSink ------------hashAgg[LOCAL] --------------PhysicalProject ----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1)) -------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] +------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] --------------------PhysicalProject ----------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk] ------------------------PhysicalProject diff --git a/regression-test/data/new_shapes_p0/tpcds_sf100/rf_prune/query45.out b/regression-test/data/new_shapes_p0/tpcds_sf100/rf_prune/query45.out index 83f6b9ca5df..b8f9dc6e8a0 100644 --- a/regression-test/data/new_shapes_p0/tpcds_sf100/rf_prune/query45.out +++ b/regression-test/data/new_shapes_p0/tpcds_sf100/rf_prune/query45.out @@ -9,7 +9,7 @@ PhysicalResultSink ------------hashAgg[LOCAL] --------------PhysicalProject ----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1)) -------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] +------------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] --------------------PhysicalProject ----------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() ------------------------PhysicalProject diff --git a/regression-test/data/new_shapes_p0/tpcds_sf100/shape/query45.out b/regression-test/data/new_shapes_p0/tpcds_sf100/shape/query45.out index e05c3f0537a..95b5d1168d7 100644 --- a/regression-test/data/new_shapes_p0/tpcds_sf100/shape/query45.out +++ b/regression-test/data/new_shapes_p0/tpcds_sf100/shape/query45.out @@ -9,7 +9,7 @@ PhysicalResultSink ------------hashAgg[LOCAL] --------------PhysicalProject ----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1)) -------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] +------------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] --------------------PhysicalProject ----------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk] ------------------------PhysicalProject diff --git a/regression-test/data/new_shapes_p0/tpcds_sf1000/bs_downgrade_shape/query45.out b/regression-test/data/new_shapes_p0/tpcds_sf1000/bs_downgrade_shape/query45.out index 6ac3b85090a..3995aa66e33 100644 --- a/regression-test/data/new_shapes_p0/tpcds_sf1000/bs_downgrade_shape/query45.out +++ b/regression-test/data/new_shapes_p0/tpcds_sf1000/bs_downgrade_shape/query45.out @@ -9,7 +9,7 @@ PhysicalResultSink ------------hashAgg[LOCAL] --------------PhysicalProject ----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1)) -------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] +------------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] --------------------PhysicalProject ----------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk] ------------------------PhysicalProject diff --git a/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query45.out b/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query45.out index 6ac3b85090a..3995aa66e33 100644 --- a/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query45.out +++ b/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query45.out @@ -9,7 +9,7 @@ PhysicalResultSink ------------hashAgg[LOCAL] --------------PhysicalProject ----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1)) -------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] +------------------hashJoin[INNER_JOIN shuffleBucket] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] --------------------PhysicalProject ----------------------hashJoin[INNER_JOIN shuffle] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk] ------------------------PhysicalProject diff --git a/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query51.out b/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query51.out index 6c22d2df308..38bec2403ec 100644 --- a/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query51.out +++ b/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query51.out @@ -19,9 +19,9 @@ PhysicalResultSink --------------------------------PhysicalDistribute[DistributionSpecHash] ----------------------------------hashAgg[LOCAL] ------------------------------------PhysicalProject ---------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk] +--------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ss_sold_date_sk] ----------------------------------------PhysicalProject -------------------------------------------PhysicalOlapScan[web_sales] apply RFs: RF1 +------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF1 ----------------------------------------PhysicalProject ------------------------------------------filter((date_dim.d_month_seq <= 1223) and (date_dim.d_month_seq >= 1212)) --------------------------------------------PhysicalOlapScan[date_dim] @@ -34,9 +34,9 @@ PhysicalResultSink --------------------------------PhysicalDistribute[DistributionSpecHash] ----------------------------------hashAgg[LOCAL] ------------------------------------PhysicalProject ---------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[ss_sold_date_sk] +--------------------------------------hashJoin[INNER_JOIN broadcast] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[ws_sold_date_sk] ----------------------------------------PhysicalProject -------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF0 +------------------------------------------PhysicalOlapScan[web_sales] apply RFs: RF0 ----------------------------------------PhysicalProject ------------------------------------------filter((date_dim.d_month_seq <= 1223) and (date_dim.d_month_seq >= 1212)) --------------------------------------------PhysicalOlapScan[date_dim] --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org