This is an automated email from the ASF dual-hosted git repository.

morrysnow pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new d110859fad8 [fix](nereids) refine row count estimation for mark join (#38270)
d110859fad8 is described below

commit d110859fad8602c77001d16a867137ac77aae37c
Author: xzj7019 <131111794+xzj7...@users.noreply.github.com>
AuthorDate: Wed Jul 24 16:30:06 2024 +0800

    [fix](nereids) refine row count estimation for mark join (#38270)
    
    The current semi/anti join stats estimation does not consider the mark
    join case. For a mark join, the estimated row count should follow the
    corresponding input side's row count unchanged (i.e. selectivity 1.0).
---
 .../main/java/org/apache/doris/nereids/stats/JoinEstimation.java  | 4 ++--
 regression-test/data/nereids_hint_tpcds_p0/shape/query45.out      | 2 +-
 .../nereids_tpcds_shape_sf1000_p0/bs_downgrade_shape/query45.out  | 2 +-
 .../data/nereids_tpcds_shape_sf1000_p0/shape/query45.out          | 2 +-
 .../data/nereids_tpcds_shape_sf1000_p0/shape/query51.out          | 8 ++++----
 .../data/nereids_tpcds_shape_sf100_p0/rf_prune/query45.out        | 2 +-
 .../data/nereids_tpcds_shape_sf100_p0/shape/query45.out           | 2 +-
 .../data/new_shapes_p0/tpcds_sf100/rf_prune/query45.out           | 2 +-
 regression-test/data/new_shapes_p0/tpcds_sf100/shape/query45.out  | 2 +-
 .../new_shapes_p0/tpcds_sf1000/bs_downgrade_shape/query45.out     | 2 +-
 regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query45.out | 2 +-
 regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query51.out | 8 ++++----
 12 files changed, 19 insertions(+), 19 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java
index 29e30b30f33..f8298871f0d 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java
@@ -267,8 +267,8 @@ public class JoinEstimation {
     }
 
     private static Statistics estimateSemiOrAnti(Statistics leftStats, 
Statistics rightStats, Join join) {
-        if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, 
join)) {
-            double sel = 
computeSelectivityForBuildSideWhenColStatsUnknown(rightStats, join);
+        if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, 
join) || join.isMarkJoin()) {
+            double sel = join.isMarkJoin() ? 1.0 : 
computeSelectivityForBuildSideWhenColStatsUnknown(rightStats, join);
             if (join.getJoinType().isLeftSemiOrAntiJoin()) {
                 return new 
StatisticsBuilder().setRowCount(leftStats.getRowCount() * sel)
                         .putColumnStatistics(leftStats.columnStatistics())
diff --git a/regression-test/data/nereids_hint_tpcds_p0/shape/query45.out 
b/regression-test/data/nereids_hint_tpcds_p0/shape/query45.out
index b65fa9047c0..e032d162e9e 100644
--- a/regression-test/data/nereids_hint_tpcds_p0/shape/query45.out
+++ b/regression-test/data/nereids_hint_tpcds_p0/shape/query45.out
@@ -9,7 +9,7 @@ PhysicalResultSink
 ------------hashAgg[LOCAL]
 --------------PhysicalProject
 ----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', 
'85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
-------------------hashJoin[INNER_JOIN broadcast] 
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build 
RFs:RF3 i_item_sk->[ws_item_sk]
+------------------hashJoin[INNER_JOIN shuffle] 
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build 
RFs:RF3 i_item_sk->[ws_item_sk]
 --------------------PhysicalProject
 ----------------------hashJoin[INNER_JOIN shuffle] 
hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) 
otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk]
 ------------------------PhysicalProject
diff --git 
a/regression-test/data/nereids_tpcds_shape_sf1000_p0/bs_downgrade_shape/query45.out
 
b/regression-test/data/nereids_tpcds_shape_sf1000_p0/bs_downgrade_shape/query45.out
index b65fa9047c0..e032d162e9e 100644
--- 
a/regression-test/data/nereids_tpcds_shape_sf1000_p0/bs_downgrade_shape/query45.out
+++ 
b/regression-test/data/nereids_tpcds_shape_sf1000_p0/bs_downgrade_shape/query45.out
@@ -9,7 +9,7 @@ PhysicalResultSink
 ------------hashAgg[LOCAL]
 --------------PhysicalProject
 ----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', 
'85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
-------------------hashJoin[INNER_JOIN broadcast] 
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build 
RFs:RF3 i_item_sk->[ws_item_sk]
+------------------hashJoin[INNER_JOIN shuffle] 
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build 
RFs:RF3 i_item_sk->[ws_item_sk]
 --------------------PhysicalProject
 ----------------------hashJoin[INNER_JOIN shuffle] 
hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) 
otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk]
 ------------------------PhysicalProject
diff --git 
a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query45.out 
b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query45.out
index b65fa9047c0..e032d162e9e 100644
--- a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query45.out
+++ b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query45.out
@@ -9,7 +9,7 @@ PhysicalResultSink
 ------------hashAgg[LOCAL]
 --------------PhysicalProject
 ----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', 
'85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
-------------------hashJoin[INNER_JOIN broadcast] 
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build 
RFs:RF3 i_item_sk->[ws_item_sk]
+------------------hashJoin[INNER_JOIN shuffle] 
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build 
RFs:RF3 i_item_sk->[ws_item_sk]
 --------------------PhysicalProject
 ----------------------hashJoin[INNER_JOIN shuffle] 
hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) 
otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk]
 ------------------------PhysicalProject
diff --git 
a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query51.out 
b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query51.out
index 6c22d2df308..38bec2403ec 100644
--- a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query51.out
+++ b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query51.out
@@ -19,9 +19,9 @@ PhysicalResultSink
 --------------------------------PhysicalDistribute[DistributionSpecHash]
 ----------------------------------hashAgg[LOCAL]
 ------------------------------------PhysicalProject
---------------------------------------hashJoin[INNER_JOIN broadcast] 
hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) 
otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
+--------------------------------------hashJoin[INNER_JOIN broadcast] 
hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) 
otherCondition=() build RFs:RF1 d_date_sk->[ss_sold_date_sk]
 ----------------------------------------PhysicalProject
-------------------------------------------PhysicalOlapScan[web_sales] apply 
RFs: RF1
+------------------------------------------PhysicalOlapScan[store_sales] apply 
RFs: RF1
 ----------------------------------------PhysicalProject
 ------------------------------------------filter((date_dim.d_month_seq <= 
1223) and (date_dim.d_month_seq >= 1212))
 --------------------------------------------PhysicalOlapScan[date_dim]
@@ -34,9 +34,9 @@ PhysicalResultSink
 --------------------------------PhysicalDistribute[DistributionSpecHash]
 ----------------------------------hashAgg[LOCAL]
 ------------------------------------PhysicalProject
---------------------------------------hashJoin[INNER_JOIN broadcast] 
hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) 
otherCondition=() build RFs:RF0 d_date_sk->[ss_sold_date_sk]
+--------------------------------------hashJoin[INNER_JOIN broadcast] 
hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) 
otherCondition=() build RFs:RF0 d_date_sk->[ws_sold_date_sk]
 ----------------------------------------PhysicalProject
-------------------------------------------PhysicalOlapScan[store_sales] apply 
RFs: RF0
+------------------------------------------PhysicalOlapScan[web_sales] apply 
RFs: RF0
 ----------------------------------------PhysicalProject
 ------------------------------------------filter((date_dim.d_month_seq <= 
1223) and (date_dim.d_month_seq >= 1212))
 --------------------------------------------PhysicalOlapScan[date_dim]
diff --git 
a/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query45.out 
b/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query45.out
index 48886e631ea..377431110d4 100644
--- a/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query45.out
+++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query45.out
@@ -9,7 +9,7 @@ PhysicalResultSink
 ------------hashAgg[LOCAL]
 --------------PhysicalProject
 ----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', 
'85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
-------------------hashJoin[INNER_JOIN broadcast] 
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build 
RFs:RF3 i_item_sk->[ws_item_sk]
+------------------hashJoin[INNER_JOIN shuffle] 
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build 
RFs:RF3 i_item_sk->[ws_item_sk]
 --------------------PhysicalProject
 ----------------------hashJoin[INNER_JOIN shuffle] 
hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) 
otherCondition=()
 ------------------------PhysicalProject
diff --git 
a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query45.out 
b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query45.out
index a3a6b75ca6d..5c74bb70d39 100644
--- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query45.out
+++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query45.out
@@ -9,7 +9,7 @@ PhysicalResultSink
 ------------hashAgg[LOCAL]
 --------------PhysicalProject
 ----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', 
'85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
-------------------hashJoin[INNER_JOIN broadcast] 
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build 
RFs:RF3 i_item_sk->[ws_item_sk]
+------------------hashJoin[INNER_JOIN shuffle] 
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build 
RFs:RF3 i_item_sk->[ws_item_sk]
 --------------------PhysicalProject
 ----------------------hashJoin[INNER_JOIN shuffle] 
hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) 
otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk]
 ------------------------PhysicalProject
diff --git 
a/regression-test/data/new_shapes_p0/tpcds_sf100/rf_prune/query45.out 
b/regression-test/data/new_shapes_p0/tpcds_sf100/rf_prune/query45.out
index 83f6b9ca5df..b8f9dc6e8a0 100644
--- a/regression-test/data/new_shapes_p0/tpcds_sf100/rf_prune/query45.out
+++ b/regression-test/data/new_shapes_p0/tpcds_sf100/rf_prune/query45.out
@@ -9,7 +9,7 @@ PhysicalResultSink
 ------------hashAgg[LOCAL]
 --------------PhysicalProject
 ----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', 
'85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
-------------------hashJoin[INNER_JOIN broadcast] 
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build 
RFs:RF3 i_item_sk->[ws_item_sk]
+------------------hashJoin[INNER_JOIN shuffleBucket] 
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build 
RFs:RF3 i_item_sk->[ws_item_sk]
 --------------------PhysicalProject
 ----------------------hashJoin[INNER_JOIN shuffle] 
hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) 
otherCondition=()
 ------------------------PhysicalProject
diff --git a/regression-test/data/new_shapes_p0/tpcds_sf100/shape/query45.out 
b/regression-test/data/new_shapes_p0/tpcds_sf100/shape/query45.out
index e05c3f0537a..95b5d1168d7 100644
--- a/regression-test/data/new_shapes_p0/tpcds_sf100/shape/query45.out
+++ b/regression-test/data/new_shapes_p0/tpcds_sf100/shape/query45.out
@@ -9,7 +9,7 @@ PhysicalResultSink
 ------------hashAgg[LOCAL]
 --------------PhysicalProject
 ----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', 
'85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
-------------------hashJoin[INNER_JOIN broadcast] 
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build 
RFs:RF3 i_item_sk->[ws_item_sk]
+------------------hashJoin[INNER_JOIN shuffleBucket] 
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build 
RFs:RF3 i_item_sk->[ws_item_sk]
 --------------------PhysicalProject
 ----------------------hashJoin[INNER_JOIN shuffle] 
hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) 
otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk]
 ------------------------PhysicalProject
diff --git 
a/regression-test/data/new_shapes_p0/tpcds_sf1000/bs_downgrade_shape/query45.out
 
b/regression-test/data/new_shapes_p0/tpcds_sf1000/bs_downgrade_shape/query45.out
index 6ac3b85090a..3995aa66e33 100644
--- 
a/regression-test/data/new_shapes_p0/tpcds_sf1000/bs_downgrade_shape/query45.out
+++ 
b/regression-test/data/new_shapes_p0/tpcds_sf1000/bs_downgrade_shape/query45.out
@@ -9,7 +9,7 @@ PhysicalResultSink
 ------------hashAgg[LOCAL]
 --------------PhysicalProject
 ----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', 
'85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
-------------------hashJoin[INNER_JOIN broadcast] 
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build 
RFs:RF3 i_item_sk->[ws_item_sk]
+------------------hashJoin[INNER_JOIN shuffleBucket] 
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build 
RFs:RF3 i_item_sk->[ws_item_sk]
 --------------------PhysicalProject
 ----------------------hashJoin[INNER_JOIN shuffle] 
hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) 
otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk]
 ------------------------PhysicalProject
diff --git a/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query45.out 
b/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query45.out
index 6ac3b85090a..3995aa66e33 100644
--- a/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query45.out
+++ b/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query45.out
@@ -9,7 +9,7 @@ PhysicalResultSink
 ------------hashAgg[LOCAL]
 --------------PhysicalProject
 ----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', 
'85392', '85460', '85669', '86197', '86475', '88274') OR $c$1))
-------------------hashJoin[INNER_JOIN broadcast] 
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build 
RFs:RF3 i_item_sk->[ws_item_sk]
+------------------hashJoin[INNER_JOIN shuffleBucket] 
hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build 
RFs:RF3 i_item_sk->[ws_item_sk]
 --------------------PhysicalProject
 ----------------------hashJoin[INNER_JOIN shuffle] 
hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) 
otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk]
 ------------------------PhysicalProject
diff --git a/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query51.out 
b/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query51.out
index 6c22d2df308..38bec2403ec 100644
--- a/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query51.out
+++ b/regression-test/data/new_shapes_p0/tpcds_sf1000/shape/query51.out
@@ -19,9 +19,9 @@ PhysicalResultSink
 --------------------------------PhysicalDistribute[DistributionSpecHash]
 ----------------------------------hashAgg[LOCAL]
 ------------------------------------PhysicalProject
---------------------------------------hashJoin[INNER_JOIN broadcast] 
hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) 
otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
+--------------------------------------hashJoin[INNER_JOIN broadcast] 
hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) 
otherCondition=() build RFs:RF1 d_date_sk->[ss_sold_date_sk]
 ----------------------------------------PhysicalProject
-------------------------------------------PhysicalOlapScan[web_sales] apply 
RFs: RF1
+------------------------------------------PhysicalOlapScan[store_sales] apply 
RFs: RF1
 ----------------------------------------PhysicalProject
 ------------------------------------------filter((date_dim.d_month_seq <= 
1223) and (date_dim.d_month_seq >= 1212))
 --------------------------------------------PhysicalOlapScan[date_dim]
@@ -34,9 +34,9 @@ PhysicalResultSink
 --------------------------------PhysicalDistribute[DistributionSpecHash]
 ----------------------------------hashAgg[LOCAL]
 ------------------------------------PhysicalProject
---------------------------------------hashJoin[INNER_JOIN broadcast] 
hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) 
otherCondition=() build RFs:RF0 d_date_sk->[ss_sold_date_sk]
+--------------------------------------hashJoin[INNER_JOIN broadcast] 
hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) 
otherCondition=() build RFs:RF0 d_date_sk->[ws_sold_date_sk]
 ----------------------------------------PhysicalProject
-------------------------------------------PhysicalOlapScan[store_sales] apply 
RFs: RF0
+------------------------------------------PhysicalOlapScan[web_sales] apply 
RFs: RF0
 ----------------------------------------PhysicalProject
 ------------------------------------------filter((date_dim.d_month_seq <= 
1223) and (date_dim.d_month_seq >= 1212))
 --------------------------------------------PhysicalOlapScan[date_dim]


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to