neilconway commented on code in PR #21362:
URL: https://github.com/apache/datafusion/pull/21362#discussion_r3036954195


##########
datafusion/common/src/functional_dependencies.rs:
##########
@@ -590,6 +590,46 @@ pub fn get_required_group_by_exprs_indices(
         .collect()
 }
 
+/// Returns indices for the minimal subset of ORDER BY expressions that are
+/// functionally equivalent to the original set of ORDER BY expressions.
+pub fn get_required_sort_exprs_indices(
+    schema: &DFSchema,
+    sort_expr_names: &[String],
+) -> Option<Vec<usize>> {
+    let dependencies = schema.functional_dependencies();
+    let field_names = schema.field_names();
+    let sort_expr_indices = sort_expr_names
+        .iter()
+        .map(|sort_expr_name| {
+            field_names
+                .iter()
+                .position(|field_name| field_name == sort_expr_name)
+        })
+        .collect::<Option<Vec<_>>>()?;

Review Comment:
   Possible improvement: as written, the optimization bails out entirely if 
there are any ORDER BY expressions that don't match column names. That's a bit 
conservative; e.g.,
   
   ```sql
     SELECT deptno, SUM(sal) AS total_sal
     FROM emp
     GROUP BY deptno
     ORDER BY deptno, total_sal, ABS(deptno)
   ```
   
   We could still simplify this to `ORDER BY deptno, ABS(deptno)`. I'd guess 
this case isn't super common in practice though?



##########
datafusion/optimizer/src/eliminate_duplicated_expr.rs:
##########
@@ -76,12 +76,50 @@ impl OptimizerRule for EliminateDuplicatedExpr {
                     .map(|wrapper| wrapper.0)
                     .collect();
 
-                let transformed = if len != unique_exprs.len() {
+                let sort_expr_names = unique_exprs
+                    .iter()
+                    .map(|sort_expr| sort_expr.expr.schema_name().to_string())
+                    .collect::<Vec<_>>();
+                let required_indices = get_required_sort_exprs_indices(
+                    sort.input.schema().as_ref(),
+                    &sort_expr_names,
+                );
+
+                let (unique_exprs, fd_transformed) =
+                    if let Some(required_indices) = required_indices {
+                        if required_indices.len() != unique_exprs.len() {
+                            (
+                                required_indices
+                                    .into_iter()
+                                    .map(|idx| unique_exprs[idx].clone())
+                                    .collect(),
+                                true,
+                            )
+                        } else {
+                            (unique_exprs, false)
+                        }
+                    } else {
+                        (unique_exprs, false)
+                    };
+
+                let transformed = if len != unique_exprs.len() || 
fd_transformed {
                     Transformed::yes
                 } else {
                     Transformed::no
                 };

Review Comment:
   Could be simplified, e.g.,:
   
   
   ```suggestion
                     let unique_exprs = match required_indices {
                         Some(indices) if indices.len() < unique_exprs.len() => 
indices
                             .into_iter()
                             .map(|idx| unique_exprs[idx].clone())
                             .collect(),
                         _ => unique_exprs,
                     };
   
                     let transformed = if len != unique_exprs.len() {
                         Transformed::yes
                     } else {
                         Transformed::no
                     };
   ```



##########
datafusion/optimizer/src/eliminate_duplicated_expr.rs:
##########
@@ -76,12 +76,50 @@ impl OptimizerRule for EliminateDuplicatedExpr {
                     .map(|wrapper| wrapper.0)
                     .collect();
 
-                let transformed = if len != unique_exprs.len() {
+                let sort_expr_names = unique_exprs
+                    .iter()
+                    .map(|sort_expr| sort_expr.expr.schema_name().to_string())
+                    .collect::<Vec<_>>();
+                let required_indices = get_required_sort_exprs_indices(
+                    sort.input.schema().as_ref(),
+                    &sort_expr_names,
+                );
+
+                let (unique_exprs, fd_transformed) =
+                    if let Some(required_indices) = required_indices {
+                        if required_indices.len() != unique_exprs.len() {
+                            (
+                                required_indices
+                                    .into_iter()
+                                    .map(|idx| unique_exprs[idx].clone())
+                                    .collect(),
+                                true,
+                            )
+                        } else {
+                            (unique_exprs, false)
+                        }
+                    } else {
+                        (unique_exprs, false)
+                    };
+
+                let transformed = if len != unique_exprs.len() || 
fd_transformed {
                     Transformed::yes
                 } else {
                     Transformed::no
                 };
 
+                if unique_exprs.is_empty() {

Review Comment:
   `unique_exprs` can't be empty because FDs can never get us down to zero sort 
columns, right?



##########
datafusion/optimizer/src/eliminate_duplicated_expr.rs:
##########
@@ -175,4 +214,40 @@ mod tests {
             TableScan: test
         ")
     }
+
+    #[test]
+    fn eliminate_fd_redundant_sort_expr() -> Result<()> {
+        let table_scan = test_table_scan().unwrap();
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .aggregate(vec![col("a")], vec![sum(col("b")).alias("total_sal")])?
+            .sort(vec![
+                col("a").sort(true, true),
+                col("total_sal").sort(true, true),
+            ])?
+            .build()?;
+
+        assert_optimized_plan_equal!(plan, @r"
+        Sort: test.a ASC NULLS FIRST
+          Aggregate: groupBy=[[test.a]], aggr=[[sum(test.b) AS total_sal]]
+            TableScan: test
+        ")
+    }
+
+    #[test]
+    fn keep_order_by_when_dependency_comes_later() -> Result<()> {
+        let table_scan = test_table_scan().unwrap();
+        let plan = LogicalPlanBuilder::from(table_scan)
+            .aggregate(vec![col("a")], vec![sum(col("b")).alias("total_sal")])?
+            .sort(vec![
+                col("total_sal").sort(true, true),
+                col("a").sort(true, true),
+            ])?
+            .build()?;
+
+        assert_optimized_plan_equal!(plan, @r"
+        Sort: total_sal ASC NULLS FIRST, test.a ASC NULLS FIRST
+          Aggregate: groupBy=[[test.a]], aggr=[[sum(test.b) AS total_sal]]
+            TableScan: test
+        ")
+    }

Review Comment:
   Personally my preference is always to add tests as SLT (sqllogictest) files 
when all we're doing is checking `EXPLAIN` output, because SLT tests are easier 
to read, can be evaluated in parallel, and don't have to be compiled.



##########
datafusion/sqllogictest/test_files/window.slt:
##########


Review Comment:
   This comment needs updating.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to