(spark) branch branch-4.0 updated: Revert "[SPARK-47895][SQL] group by alias should be idempotent"

wenchen Sun, 13 Apr 2025 21:24:00 -0700

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new df25015fda16 Revert "[SPARK-47895][SQL] group by alias should be 
idempotent"
df25015fda16 is described below

commit df25015fda16d2228eb02ccb7bf257e8c2703a4d
Author: Mihailo Timotic <mihailo.timo...@databricks.com>
AuthorDate: Mon Apr 14 12:16:37 2025 +0800

    Revert "[SPARK-47895][SQL] group by alias should be idempotent"
    
    ### What changes were proposed in this pull request?
    This PR reverts https://github.com/apache/spark/pull/50461 because it 
introduces a correctness issue by replacing an `Alias` with incorrect literal. 
A followup will be made with a different way to fix this issue.
    
    ### Why are the changes needed?
    In the below example, alias `abc` is replaced with `2` resulting in 2 rows 
instead of correct 3.
    Before erronous PR:
    
![image](https://github.com/user-attachments/assets/dcc98323-369d-4f5e-b0ad-de5b76ffc5c3)
    
    After erronous PR:
    
![image](https://github.com/user-attachments/assets/31c24125-6654-48f8-9b55-40a6a667ed23)
    
    ### Does this PR introduce _any_ user-facing change?
    User now sees an error message instead of an incorrect result.
    
    ### How was this patch tested?
    Added a test case to check for this behavior in the future
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No
    
    Closes #50567 from mihailotim-db/mihailotim-db/revert_group_by.
    
    Authored-by: Mihailo Timotic <mihailo.timo...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
    (cherry picked from commit 8718eba1f6f7a1cf3efbffd0ec86c81f10e8e487)
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../analysis/ResolveReferencesInAggregate.scala        | 14 +-------------
 .../analysis/SubstituteUnresolvedOrdinalsSuite.scala   | 18 ------------------
 .../sql-tests/analyzer-results/group-by-alias.sql.out  |  7 +++++++
 .../test/resources/sql-tests/inputs/group-by-alias.sql |  3 +++
 .../resources/sql-tests/results/group-by-alias.sql.out | 10 ++++++++++
 5 files changed, 21 insertions(+), 31 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala
index d1aee4ca2a9d..7ea90854932e 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala
@@ -116,19 +116,7 @@ class ResolveReferencesInAggregate(val catalogManager: 
CatalogManager) extends S
       groupExprs.map { g =>
         g.transformWithPruning(_.containsPattern(UNRESOLVED_ATTRIBUTE)) {
           case u: UnresolvedAttribute =>
-            val (result, index) =
-              selectList.zipWithIndex.find(ne => conf.resolver(ne._1.name, 
u.name))
-                .getOrElse((u, -1))
-
-            trimAliases(result) match {
-              // HACK ALERT: If the expanded grouping expression is an integer 
literal, don't use it
-              //             but use an integer literal of the index. The 
reason is we may
-              //             repeatedly analyze the plan, and the original 
integer literal may cause
-              //             failures with a later GROUP BY ordinal 
resolution. GROUP BY constant is
-              //             meaningless so whatever value does not matter 
here.
-              case IntegerLiteral(_) => Literal(index + 1)
-              case _ => result
-            }
+            selectList.find(ne => conf.resolver(ne.name, u.name)).getOrElse(u)
         }
       }
     } else {
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala
index 106a607ba6df..39cf298aec43 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala
@@ -104,22 +104,4 @@ class SubstituteUnresolvedOrdinalsSuite extends 
AnalysisTest {
       testRelationWithData.groupBy(Literal(1))(Literal(100).as("a"))
     )
   }
-
-  test("SPARK-47895: group by alias repeated analysis") {
-    val plan = testRelation.groupBy($"b")(Literal(100).as("b")).analyze
-    comparePlans(
-      plan,
-      testRelation.groupBy(Literal(1))(Literal(100).as("b"))
-    )
-
-    val testRelationWithData = testRelation.copy(data = Seq(new 
GenericInternalRow(Array(1: Any))))
-    // Copy the plan to reset its `analyzed` flag, so that analyzer rules will 
re-apply.
-    val copiedPlan = plan.transform {
-      case _: LocalRelation => testRelationWithData
-    }
-    comparePlans(
-      copiedPlan.analyze, // repeated analysis
-      testRelationWithData.groupBy(Literal(1))(Literal(100).as("b"))
-    )
-  }
 }
diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-alias.sql.out 
b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-alias.sql.out
index 00ec4319ad0f..8a57acd50d48 100644
--- 
a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-alias.sql.out
+++ 
b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-alias.sql.out
@@ -331,6 +331,13 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
 }
 
 
+-- !query
+SELECT MAX(col1), 3 as abc FROM VALUES(1),(2),(3),(4) GROUP BY col1 % abc
+-- !query analysis
+Aggregate [(col1#x % 3)], [max(col1#x) AS max(col1)#x, 3 AS abc#x]
++- LocalRelation [col1#x]
+
+
 -- !query
 set spark.sql.groupByAliases=false
 -- !query analysis
diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by-alias.sql 
b/sql/core/src/test/resources/sql-tests/inputs/group-by-alias.sql
index 80a94b50bdb7..75afc82f998d 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/group-by-alias.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/group-by-alias.sql
@@ -43,6 +43,9 @@ SELECT a AS k, COUNT(non_existing) FROM testData GROUP BY k;
 -- Aggregate functions cannot be used in GROUP BY
 SELECT COUNT(b) AS k FROM testData GROUP BY k;
 
+-- Ordinal is replaced correctly when grouping by alias of a literal
+SELECT MAX(col1), 3 as abc FROM VALUES(1),(2),(3),(4) GROUP BY col1 % abc;
+
 -- turn off group by aliases
 set spark.sql.groupByAliases=false;
 
diff --git 
a/sql/core/src/test/resources/sql-tests/results/group-by-alias.sql.out 
b/sql/core/src/test/resources/sql-tests/results/group-by-alias.sql.out
index 12cb5b3056c7..fd740c88bc55 100644
--- a/sql/core/src/test/resources/sql-tests/results/group-by-alias.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/group-by-alias.sql.out
@@ -277,6 +277,16 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
 }
 
 
+-- !query
+SELECT MAX(col1), 3 as abc FROM VALUES(1),(2),(3),(4) GROUP BY col1 % abc
+-- !query schema
+struct<max(col1):int,abc:int>
+-- !query output
+2      3
+3      3
+4      3
+
+
 -- !query
 set spark.sql.groupByAliases=false
 -- !query schema


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

(spark) branch branch-4.0 updated: Revert "[SPARK-47895][SQL] group by alias should be idempotent"

Reply via email to