asolimando commented on code in PR #22762:
URL: https://github.com/apache/datafusion/pull/22762#discussion_r3357913305


##########
datafusion/physical-plan/src/joins/utils.rs:
##########
@@ -806,11 +978,21 @@ fn estimate_semi_join_cardinality(
         if let (Some(&o), Some(&i)) = (outer_ndv.get_value(), 
inner_ndv.get_value())
             && o > 0
         {
-            let null_frac = outer_stat
-                .null_count
-                .get_value()
-                .map(|&nc| nc as f64 / outer_rows as f64)
-                .unwrap_or(0.0);
+            let null_frac = if null_equality == 
NullEquality::NullEqualsNothing {

Review Comment:
   Nice catch



##########
datafusion/physical-plan/src/joins/utils.rs:
##########
@@ -610,12 +654,132 @@ fn estimate_join_cardinality(
             column_statistics.push(ColumnStatistics::new_unknown());
             Some(PartialJoinStatistics {
                 num_rows,
+                total_byte_size: Precision::Absent,
                 column_statistics,
             })
         }
     }
 }
 
+fn equijoin_column_indices(
+    left: &PhysicalExprRef,
+    right: &PhysicalExprRef,
+) -> Option<(usize, usize)> {
+    Some((
+        left.downcast_ref::<Column>()?.index(),
+        right.downcast_ref::<Column>()?.index(),
+    ))
+}
+
+/// Adjusts the preserved input's column statistics to describe the subset of
+/// rows a semi or anti join emits. Most values become estimates (marked
+/// inexact) bounded by the smaller output row count:
+///
+/// - `null_count` and `byte_size` are scaled by the output/input row ratio.
+/// - `distinct_count` is capped at the number of non-null output rows.
+/// - `sum_value` is dropped, since the input sum does not apply to the subset.
+///
+/// Join-key columns are the exception for `null_count`: under regular SQL
+/// equality, null keys never match, so a semi join keeps none of those rows 
and
+/// an anti join keeps all of them. Under null-equal joins, null keys can match
+/// and are treated like the rest of the subset.
+fn normalize_semi_anti_join_column_statistics(
+    column_statistics: Vec<ColumnStatistics>,
+    input_num_rows: &Precision<usize>,
+    output_num_rows: usize,
+    join_key_indices: &[usize],
+    is_anti: bool,
+    null_equality: NullEquality,
+) -> Vec<ColumnStatistics> {
+    let input_num_rows = input_num_rows.get_value().copied().unwrap_or(0);
+
+    column_statistics
+        .into_iter()
+        .enumerate()
+        .map(|(idx, stats)| {
+            let mut stats = stats.to_inexact();
+            stats.null_count = if join_key_indices.contains(&idx) {

Review Comment:
   Nit: Not a big problem as there won't be many items, but maybe we can use a 
`HashSet` here as it seems we are just using containment test.
   
   



##########
datafusion/physical-plan/src/joins/utils.rs:
##########
@@ -446,24 +449,34 @@ struct PartialJoinStatistics {
 /// - Does not account for selectivity of arbitrary join filter expressions
 ///   (e.g., `(t1.v1 + t2.v1) % 2 = 0`). Such filters, common in 
NestedLoopJoinExec,
 ///   are not factored into the cardinality estimation.
-/// - Column statistics for the output are simply combined from inputs without
-///   adjusting for join selectivity (acknowledged in the code as needing
-///   "filter selectivity analysis").
+/// - Column statistics for inner/outer joins are simply combined from inputs
+///   without adjusting for join selectivity (acknowledged in the code as
+///   needing "filter selectivity analysis").

Review Comment:
   Q: is there an issue filed already? It's easier to discover pending issues 
in GitHub rather than code comments



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to