This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-4.0 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-4.0 by this push: new df25015fda16 Revert "[SPARK-47895][SQL] group by alias should be idempotent" df25015fda16 is described below commit df25015fda16d2228eb02ccb7bf257e8c2703a4d Author: Mihailo Timotic <mihailo.timo...@databricks.com> AuthorDate: Mon Apr 14 12:16:37 2025 +0800 Revert "[SPARK-47895][SQL] group by alias should be idempotent" ### What changes were proposed in this pull request? This PR reverts https://github.com/apache/spark/pull/50461 because it introduces a correctness issue by replacing an `Alias` with incorrect literal. A followup will be made with a different way to fix this issue. ### Why are the changes needed? In the below example, alias `abc` is replaced with `2` resulting in 2 rows instead of correct 3. Before erronous PR:  After erronous PR:  ### Does this PR introduce _any_ user-facing change? User now sees an error message instead of an incorrect result. ### How was this patch tested? Added a test case to check for this behavior in the future ### Was this patch authored or co-authored using generative AI tooling? No Closes #50567 from mihailotim-db/mihailotim-db/revert_group_by. Authored-by: Mihailo Timotic <mihailo.timo...@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> (cherry picked from commit 8718eba1f6f7a1cf3efbffd0ec86c81f10e8e487) Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../analysis/ResolveReferencesInAggregate.scala | 14 +------------- .../analysis/SubstituteUnresolvedOrdinalsSuite.scala | 18 ------------------ .../sql-tests/analyzer-results/group-by-alias.sql.out | 7 +++++++ .../test/resources/sql-tests/inputs/group-by-alias.sql | 3 +++ .../resources/sql-tests/results/group-by-alias.sql.out | 10 ++++++++++ 5 files changed, 21 insertions(+), 31 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala index d1aee4ca2a9d..7ea90854932e 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveReferencesInAggregate.scala @@ -116,19 +116,7 @@ class ResolveReferencesInAggregate(val catalogManager: CatalogManager) extends S groupExprs.map { g => g.transformWithPruning(_.containsPattern(UNRESOLVED_ATTRIBUTE)) { case u: UnresolvedAttribute => - val (result, index) = - selectList.zipWithIndex.find(ne => conf.resolver(ne._1.name, u.name)) - .getOrElse((u, -1)) - - trimAliases(result) match { - // HACK ALERT: If the expanded grouping expression is an integer literal, don't use it - // but use an integer literal of the index. The reason is we may - // repeatedly analyze the plan, and the original integer literal may cause - // failures with a later GROUP BY ordinal resolution. GROUP BY constant is - // meaningless so whatever value does not matter here. - case IntegerLiteral(_) => Literal(index + 1) - case _ => result - } + selectList.find(ne => conf.resolver(ne.name, u.name)).getOrElse(u) } } } else { diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala index 106a607ba6df..39cf298aec43 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/SubstituteUnresolvedOrdinalsSuite.scala @@ -104,22 +104,4 @@ class SubstituteUnresolvedOrdinalsSuite extends AnalysisTest { testRelationWithData.groupBy(Literal(1))(Literal(100).as("a")) ) } - - test("SPARK-47895: group by alias repeated analysis") { - val plan = testRelation.groupBy($"b")(Literal(100).as("b")).analyze - comparePlans( - plan, - testRelation.groupBy(Literal(1))(Literal(100).as("b")) - ) - - val testRelationWithData = testRelation.copy(data = Seq(new GenericInternalRow(Array(1: Any)))) - // Copy the plan to reset its `analyzed` flag, so that analyzer rules will re-apply. - val copiedPlan = plan.transform { - case _: LocalRelation => testRelationWithData - } - comparePlans( - copiedPlan.analyze, // repeated analysis - testRelationWithData.groupBy(Literal(1))(Literal(100).as("b")) - ) - } } diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-alias.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-alias.sql.out index 00ec4319ad0f..8a57acd50d48 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-alias.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/group-by-alias.sql.out @@ -331,6 +331,13 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException } +-- !query +SELECT MAX(col1), 3 as abc FROM VALUES(1),(2),(3),(4) GROUP BY col1 % abc +-- !query analysis +Aggregate [(col1#x % 3)], [max(col1#x) AS max(col1)#x, 3 AS abc#x] ++- LocalRelation [col1#x] + + -- !query set spark.sql.groupByAliases=false -- !query analysis diff --git a/sql/core/src/test/resources/sql-tests/inputs/group-by-alias.sql b/sql/core/src/test/resources/sql-tests/inputs/group-by-alias.sql index 80a94b50bdb7..75afc82f998d 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/group-by-alias.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/group-by-alias.sql @@ -43,6 +43,9 @@ SELECT a AS k, COUNT(non_existing) FROM testData GROUP BY k; -- Aggregate functions cannot be used in GROUP BY SELECT COUNT(b) AS k FROM testData GROUP BY k; +-- Ordinal is replaced correctly when grouping by alias of a literal +SELECT MAX(col1), 3 as abc FROM VALUES(1),(2),(3),(4) GROUP BY col1 % abc; + -- turn off group by aliases set spark.sql.groupByAliases=false; diff --git a/sql/core/src/test/resources/sql-tests/results/group-by-alias.sql.out b/sql/core/src/test/resources/sql-tests/results/group-by-alias.sql.out index 12cb5b3056c7..fd740c88bc55 100644 --- a/sql/core/src/test/resources/sql-tests/results/group-by-alias.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/group-by-alias.sql.out @@ -277,6 +277,16 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException } +-- !query +SELECT MAX(col1), 3 as abc FROM VALUES(1),(2),(3),(4) GROUP BY col1 % abc +-- !query schema +struct<max(col1):int,abc:int> +-- !query output +2 3 +3 3 +4 3 + + -- !query set spark.sql.groupByAliases=false -- !query schema --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org