This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 03537905f144 [SPARK-54871][SQL] Trim aliases from grouping and aggregate expressions before handling grouping analytics
03537905f144 is described below
commit 03537905f1444ef74e5ab3d6b1d5e04415e234c5
Author: mihailoale-db <[email protected]>
AuthorDate: Sat Jan 3 22:53:02 2026 +0800
[SPARK-54871][SQL] Trim aliases from grouping and aggregate expressions before handling grouping analytics
### What changes were proposed in this pull request?
In this PR I propose to trim aliases from grouping and aggregate
expressions before handling grouping analytics. This is needed for the
following query:
```
SELECT col1 AS k2 FROM values(1) GROUP BY CUBE(k2)
```
Here the single-pass analyzer resolves the grouping expression to `col1`, whereas the fixed-point analyzer still has `col1 AS k2` when constructing the `Aggregate` in `ResolveGroupingAnalytics`. The change removes the `AS k2` part, which keeps the single-pass and fixed-point analyzers compatible without changing query results (the analyzed plans differ only in names).
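Conceptually, the trimming applied to the `CUBE(k2)` case looks like the minimal sketch below. It uses a toy expression tree, not Catalyst's real `Expression`/`AliasHelper` classes, and the names are illustrative only:
```
// Hedged sketch: a tiny stand-in expression tree (not Catalyst's classes)
// to illustrate what trimming aliases from a grouping expression does.
object AliasTrimSketch extends App {
  sealed trait Expr
  case class Attr(name: String) extends Expr
  case class Alias(child: Expr, name: String) extends Expr

  // trimAliases-style helper: strip every Alias, so GROUP BY CUBE(k2),
  // where k2 resolved to `col1 AS k2`, ends up grouping on col1 itself.
  def trimAliases(e: Expr): Expr = e match {
    case Alias(child, _) => trimAliases(child)
    case other           => other
  }

  val grouping = Alias(Attr("col1"), "k2") // col1 AS k2
  assert(trimAliases(grouping) == Attr("col1"))
}
```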
The change also aligns the behavior with regular aggregates (without grouping analytics). For example:
```
SELECT col1 + col2 AS a FROM VALUES(1,2) GROUP BY a
```
Here `col1 + col2` (and not the aliased expression) is the grouping expression, as it already is today, and grouping analytics now behaves the same way.
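A rough way to see this from a spark-shell session (illustrative only; the exact plan strings depend on the Spark version and are not copied from this patch):
```
// Hedged spark-shell sketch: compare the analyzed plans of a regular GROUP BY
// and a GROUP BY CUBE over an aliased select item. Assumes an existing
// SparkSession named `spark`; plan shapes are illustrative only.
val regular = spark.sql("SELECT col1 + col2 AS a FROM VALUES (1, 2) GROUP BY a")
println(regular.queryExecution.analyzed)

val cubed = spark.sql("SELECT col1 AS k2 FROM VALUES (1) GROUP BY CUBE(k2)")
println(cubed.queryExecution.analyzed)
```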
### Why are the changes needed?
To keep compatibility between the fixed-point and single-pass analyzers.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Changed tests.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #53644 from mihailoale-db/trimaliasesgroupinganalytics.
Authored-by: mihailoale-db <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../apache/spark/sql/catalyst/analysis/Analyzer.scala | 3 ++-
.../spark/sql/catalyst/expressions/grouping.scala | 12 +++++++-----
.../sql-tests/analyzer-results/group-analytics.sql.out | 18 +++++++++---------
.../analyzer-results/udf/udf-group-analytics.sql.out | 18 +++++++++---------
4 files changed, 27 insertions(+), 24 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
index d6154b1e2666..980d07f86ecd 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala
@@ -755,6 +755,7 @@ class Analyzer(
groupByExprs: Seq[Expression],
aggregationExprs: Seq[NamedExpression],
child: LogicalPlan): LogicalPlan = {
+ val aggregationExprsNoAlias = aggregationExprs.map(trimNonTopLevelAliases)
if (groupByExprs.size > GroupingID.dataType.defaultSize * 8) {
throw QueryCompilationErrors.groupingSizeTooLargeError(GroupingID.dataType.defaultSize * 8)
@@ -775,7 +776,7 @@ class Analyzer(
val groupingAttrs = expand.output.drop(child.output.length)
val aggregations = constructAggregateExprs(
- groupByExprs, aggregationExprs, groupByAliases, groupingAttrs, gid)
+ groupByExprs, aggregationExprsNoAlias, groupByAliases, groupingAttrs, gid)
Aggregate(groupingAttrs, aggregations, expand)
}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala
index 4350c4da932f..15161b62758c 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/grouping.scala
@@ -267,24 +267,26 @@ object GroupingID {
}
}
-object GroupingAnalytics {
+object GroupingAnalytics extends AliasHelper {
def unapply(exprs: Seq[Expression])
: Option[(Seq[Seq[Expression]], Seq[Expression])] = {
- if (!exprs.exists(_.isInstanceOf[BaseGroupingSets])) {
+ val exprsNoAlias = exprs.map(trimAliases)
+
+ if (!exprsNoAlias.exists(_.isInstanceOf[BaseGroupingSets])) {
None
} else {
- val resolved = exprs.forall {
+ val resolved = exprsNoAlias.forall {
case gs: BaseGroupingSets => gs.childrenResolved
case other => other.resolved
}
if (!resolved) {
None
} else {
- val groups = exprs.flatMap {
+ val groups = exprsNoAlias.flatMap {
case gs: BaseGroupingSets => gs.groupByExprs
case other: Expression => other :: Nil
}
- val unmergedSelectedGroupByExprs = exprs.map {
+ val unmergedSelectedGroupByExprs = exprsNoAlias.map {
case gs: BaseGroupingSets => gs.selectedGroupByExprs
case other: Expression => Seq(Seq(other))
}
diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/group-analytics.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/group-analytics.sql.out
index fc0f6fef8c16..543a5cc61133 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/group-analytics.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/group-analytics.sql.out
@@ -501,9 +501,9 @@ Project [course#x, year#x]
-- !query
SELECT a + b AS k1, b AS k2, SUM(a - b) FROM testData GROUP BY CUBE(k1, k2)
-- !query analysis
-Aggregate [k1#x, k2#x, spark_grouping_id#xL], [k1#x, k2#x, sum((a#x - b#x)) AS sum((a - b))#xL]
-+- Expand [[a#x, b#x, k1#x, k2#x, 0], [a#x, b#x, k1#x, null, 1], [a#x, b#x, null, k2#x, 2], [a#x, b#x, null, null, 3]], [a#x, b#x, k1#x, k2#x, spark_grouping_id#xL]
- +- Project [a#x, b#x, (a#x + b#x) AS k1#x, b#x AS k2#x]
+Aggregate [(a + b)#x, b#x, spark_grouping_id#xL], [(a + b)#x AS k1#x, b#x AS k2#x, sum((a#x - b#x)) AS sum((a - b))#xL]
++- Expand [[a#x, b#x, (a + b)#x, b#x, 0], [a#x, b#x, (a + b)#x, null, 1], [a#x, b#x, null, b#x, 2], [a#x, b#x, null, null, 3]], [a#x, b#x, (a + b)#x, b#x, spark_grouping_id#xL]
+ +- Project [a#x, b#x, (a#x + b#x) AS (a + b)#x, b#x AS b#x]
+- SubqueryAlias testdata
+- View (`testData`, [a#x, b#x])
+- Project [cast(a#x as int) AS a#x, cast(b#x as int) AS b#x]
@@ -515,9 +515,9 @@ Aggregate [k1#x, k2#x, spark_grouping_id#xL], [k1#x, k2#x, sum((a#x - b#x)) AS s
-- !query
SELECT a + b AS k, b, SUM(a - b) FROM testData GROUP BY ROLLUP(k, b)
-- !query analysis
-Aggregate [k#x, b#x, spark_grouping_id#xL], [k#x, b#x, sum((a#x - b#x)) AS sum((a - b))#xL]
-+- Expand [[a#x, b#x, k#x, b#x, 0], [a#x, b#x, k#x, null, 1], [a#x, b#x, null, null, 3]], [a#x, b#x, k#x, b#x, spark_grouping_id#xL]
- +- Project [a#x, b#x, (a#x + b#x) AS k#x, b#x AS b#x]
+Aggregate [(a + b)#x, b#x, spark_grouping_id#xL], [(a + b)#x AS k#x, b#x, sum((a#x - b#x)) AS sum((a - b))#xL]
++- Expand [[a#x, b#x, (a + b)#x, b#x, 0], [a#x, b#x, (a + b)#x, null, 1], [a#x, b#x, null, null, 3]], [a#x, b#x, (a + b)#x, b#x, spark_grouping_id#xL]
+ +- Project [a#x, b#x, (a#x + b#x) AS (a + b)#x, b#x AS b#x]
+- SubqueryAlias testdata
+- View (`testData`, [a#x, b#x])
+- Project [cast(a#x as int) AS a#x, cast(b#x as int) AS b#x]
@@ -529,9 +529,9 @@ Aggregate [k#x, b#x, spark_grouping_id#xL], [k#x, b#x, sum((a#x - b#x)) AS sum((
-- !query
SELECT a + b, b AS k, SUM(a - b) FROM testData GROUP BY a + b, k GROUPING SETS(k)
-- !query analysis
-Aggregate [(a + b)#x, k#x, spark_grouping_id#xL], [(a + b)#x AS (a + b)#x, k#x, sum((a#x - b#x)) AS sum((a - b))#xL]
-+- Expand [[a#x, b#x, null, k#x, 2]], [a#x, b#x, (a + b)#x, k#x, spark_grouping_id#xL]
- +- Project [a#x, b#x, (a#x + b#x) AS (a + b)#x, b#x AS k#x]
+Aggregate [(a + b)#x, b#x, spark_grouping_id#xL], [(a + b)#x AS (a + b)#x, b#x AS k#x, sum((a#x - b#x)) AS sum((a - b))#xL]
++- Expand [[a#x, b#x, null, b#x, 2]], [a#x, b#x, (a + b)#x, b#x, spark_grouping_id#xL]
+ +- Project [a#x, b#x, (a#x + b#x) AS (a + b)#x, b#x AS b#x]
+- SubqueryAlias testdata
+- View (`testData`, [a#x, b#x])
+- Project [cast(a#x as int) AS a#x, cast(b#x as int) AS b#x]
diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-group-analytics.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-group-analytics.sql.out
index fc8199333f09..9623bc0c0720 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-group-analytics.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/udf/udf-group-analytics.sql.out
@@ -374,9 +374,9 @@ Project [course#x, year#x]
-- !query
SELECT udf(a + b) AS k1, udf(b) AS k2, SUM(a - b) FROM testData GROUP BY CUBE(k1, k2)
-- !query analysis
-Aggregate [k1#x, k2#x, spark_grouping_id#xL], [k1#x, k2#x, sum((a#x - b#x)) AS sum((a - b))#xL]
-+- Expand [[a#x, b#x, k1#x, k2#x, 0], [a#x, b#x, k1#x, null, 1], [a#x, b#x, null, k2#x, 2], [a#x, b#x, null, null, 3]], [a#x, b#x, k1#x, k2#x, spark_grouping_id#xL]
- +- Project [a#x, b#x, cast(udf(cast((a#x + b#x) as string)) as int) AS k1#x, cast(udf(cast(b#x as string)) as int) AS k2#x]
+Aggregate [udf((a + b))#x, udf(b)#x, spark_grouping_id#xL], [udf((a + b))#x AS k1#x, udf(b)#x AS k2#x, sum((a#x - b#x)) AS sum((a - b))#xL]
++- Expand [[a#x, b#x, udf((a + b))#x, udf(b)#x, 0], [a#x, b#x, udf((a + b))#x, null, 1], [a#x, b#x, null, udf(b)#x, 2], [a#x, b#x, null, null, 3]], [a#x, b#x, udf((a + b))#x, udf(b)#x, spark_grouping_id#xL]
+ +- Project [a#x, b#x, cast(udf(cast((a#x + b#x) as string)) as int) AS udf((a + b))#x, cast(udf(cast(b#x as string)) as int) AS udf(b)#x]
+- SubqueryAlias testdata
+- View (`testData`, [a#x, b#x])
+- Project [cast(a#x as int) AS a#x, cast(b#x as int) AS b#x]
@@ -388,9 +388,9 @@ Aggregate [k1#x, k2#x, spark_grouping_id#xL], [k1#x, k2#x, sum((a#x - b#x)) AS s
-- !query
SELECT udf(udf(a + b)) AS k, b, SUM(a - b) FROM testData GROUP BY ROLLUP(k, b)
-- !query analysis
-Aggregate [k#x, b#x, spark_grouping_id#xL], [k#x, b#x, sum((a#x - b#x)) AS sum((a - b))#xL]
-+- Expand [[a#x, b#x, k#x, b#x, 0], [a#x, b#x, k#x, null, 1], [a#x, b#x, null, null, 3]], [a#x, b#x, k#x, b#x, spark_grouping_id#xL]
- +- Project [a#x, b#x, cast(udf(cast(cast(udf(cast((a#x + b#x) as string)) as int) as string)) as int) AS k#x, b#x AS b#x]
+Aggregate [udf(udf((a + b)))#x, b#x, spark_grouping_id#xL], [udf(udf((a + b)))#x AS k#x, b#x, sum((a#x - b#x)) AS sum((a - b))#xL]
++- Expand [[a#x, b#x, udf(udf((a + b)))#x, b#x, 0], [a#x, b#x, udf(udf((a + b)))#x, null, 1], [a#x, b#x, null, null, 3]], [a#x, b#x, udf(udf((a + b)))#x, b#x, spark_grouping_id#xL]
+ +- Project [a#x, b#x, cast(udf(cast(cast(udf(cast((a#x + b#x) as string)) as int) as string)) as int) AS udf(udf((a + b)))#x, b#x AS b#x]
+- SubqueryAlias testdata
+- View (`testData`, [a#x, b#x])
+- Project [cast(a#x as int) AS a#x, cast(b#x as int) AS b#x]
@@ -402,9 +402,9 @@ Aggregate [k#x, b#x, spark_grouping_id#xL], [k#x, b#x, sum((a#x - b#x)) AS sum((
-- !query
SELECT udf(a + b), udf(udf(b)) AS k, SUM(a - b) FROM testData GROUP BY a + b, k GROUPING SETS(k)
-- !query analysis
-Aggregate [(a + b)#x, k#x, spark_grouping_id#xL], [cast(udf(cast((a + b)#x as string)) as int) AS udf((a + b))#x, k#x, sum((a#x - b#x)) AS sum((a - b))#xL]
-+- Expand [[a#x, b#x, null, k#x, 2]], [a#x, b#x, (a + b)#x, k#x, spark_grouping_id#xL]
- +- Project [a#x, b#x, (a#x + b#x) AS (a + b)#x, cast(udf(cast(cast(udf(cast(b#x as string)) as int) as string)) as int) AS k#x]
+Aggregate [(a + b)#x, udf(udf(b))#x, spark_grouping_id#xL], [cast(udf(cast((a + b)#x as string)) as int) AS udf((a + b))#x, udf(udf(b))#x AS k#x, sum((a#x - b#x)) AS sum((a - b))#xL]
++- Expand [[a#x, b#x, null, udf(udf(b))#x, 2]], [a#x, b#x, (a + b)#x, udf(udf(b))#x, spark_grouping_id#xL]
+ +- Project [a#x, b#x, (a#x + b#x) AS (a + b)#x, cast(udf(cast(cast(udf(cast(b#x as string)) as int) as string)) as int) AS udf(udf(b))#x]
+- SubqueryAlias testdata
+- View (`testData`, [a#x, b#x])
+- Project [cast(a#x as int) AS a#x, cast(b#x as int) AS b#x]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]