This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 5f2d37e0b429 [SPARK-42746][FOLLOW-UP][SQL] Fix flaky listagg tests and
refactor code
5f2d37e0b429 is described below
commit 5f2d37e0b429a940dc52872c41ac71e4012f22dd
Author: Mihailo Milosevic <[email protected]>
AuthorDate: Fri Mar 21 21:18:10 2025 +0800
[SPARK-42746][FOLLOW-UP][SQL] Fix flaky listagg tests and refactor code
### What changes were proposed in this pull request?
It was noticed that some tests could be flaky for listagg. The flakiness
mainly can come due to non-deterministic behaviour of grouping by a column or
collations (i.e. utf8_lcase, unicode_rtrim...). This PR aims to make queries
more deterministic by checking only the required info in the result of listagg
queries.
### Why are the changes needed?
We do not want to waste resources on flaky tests. Also, we want the tests
to scope the minimum set of truth, which means we do not assume ordering in
specific result, when there was an aggregation or grouping.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Test only change.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #50338 from mihailom-db/listaggFollowUp.
Authored-by: Mihailo Milosevic <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../catalyst/expressions/aggregate/collect.scala | 12 +-
.../analyzer-results/listagg-collations.sql.out | 56 ++-
.../sql-tests/analyzer-results/listagg.sql.out | 417 ++++++++++++---------
.../sql-tests/inputs/listagg-collations.sql | 18 +-
.../test/resources/sql-tests/inputs/listagg.sql | 51 +--
.../sql-tests/results/listagg-collations.sql.out | 44 +--
.../resources/sql-tests/results/listagg.sql.out | 156 ++++----
.../apache/spark/sql/DataFrameAggregateSuite.scala | 33 +-
8 files changed, 433 insertions(+), 354 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala
index 7789c23b50a4..015bd1e3e142 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/collect.scala
@@ -277,14 +277,14 @@ private[aggregate] object CollectTopK {
@ExpressionDescription(
usage = """
_FUNC_(expr[, delimiter])[ WITHIN GROUP (ORDER BY key [ASC | DESC]
[,...])] - Returns
- the concatenation of non-null input values, separated by the delimiter
ordered by key.
- If all values are null, null is returned.
+ the concatenation of non-NULL input values, separated by the delimiter
ordered by key.
+ If all values are NULL, NULL is returned.
""",
arguments = """
Arguments:
* expr - a string or binary expression to be concatenated.
* delimiter - an optional string or binary foldable expression used to
separate the input values.
- If null, the concatenation will be performed without a delimiter.
Default is null.
+ If NULL, the concatenation will be performed without a delimiter.
Default is NULL.
* key - an optional expression for ordering the input values. Multiple
keys can be specified.
If none are specified, the order of the rows in the result is
non-deterministic.
""",
@@ -400,7 +400,7 @@ case class ListAgg(
)
)
} else if (delimiter.dataType == NullType) {
- // null is the default empty delimiter so type is not important
+ // Null is the default empty delimiter so type is not important
TypeCheckSuccess
} else {
TypeUtils.checkForSameTypeInputExpr(child.dataType :: delimiter.dataType
:: Nil, prettyName)
@@ -451,7 +451,7 @@ case class ListAgg(
}
/**
- * @return ordering by (orderValue0, orderValue1, ...)
+ * @return Ordering by (orderValue0, orderValue1, ...)
* for InternalRow with format [childValue, orderValue0,
orderValue1, ...]
*/
private[this] def bufferOrdering: Ordering[InternalRow] = {
@@ -477,7 +477,7 @@ case class ListAgg(
}
/**
- * @return delimiter value or default empty value if delimiter is null. Type
respects [[dataType]]
+ * @return Delimiter value or default empty value if delimiter is null. Type
respects [[dataType]]
*/
private[this] def getDelimiterValue: Either[UTF8String, Array[Byte]] = {
val delimiterValue = delimiter.eval()
diff --git
a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out
b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out
index ca471858a541..3d239f1bdb04 100644
---
a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out
+++
b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg-collations.sql.out
@@ -9,66 +9,82 @@ Aggregate [listagg(c1#x, null, collate(c1#x, utf8_binary) ASC
NULLS FIRST, 0, 0)
-- !query
-SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase) FROM (VALUES
('a'), ('A'), ('b'), ('B')) AS t(c1)
+SELECT listagg(DISTINCT c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('A'),
('b'), ('B')) AS t(c1)
-- !query analysis
-Aggregate [listagg(c1#x, null, collate(c1#x, utf8_lcase) ASC NULLS FIRST, 0,
0) AS listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase) ASC
NULLS FIRST)#x]
+Aggregate [listagg(distinct collate(c1#x, utf8_binary), null, 0, 0) AS
listagg(DISTINCT collate(c1, utf8_binary), NULL)#x]
+- SubqueryAlias t
+- Project [col1#x AS c1#x]
+- LocalRelation [col1#x]
-- !query
-SELECT listagg(DISTINCT c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('A'),
('b'), ('B')) AS t(c1)
+WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) FROM (VALUES
('abc '), ('abc '), ('abc\n'), ('abc'), ('x'))) SELECT replace(replace(c1, '
', ''), '\n', '$') FROM t
-- !query analysis
-Aggregate [listagg(distinct collate(c1#x, utf8_binary), null, 0, 0) AS
listagg(DISTINCT collate(c1, utf8_binary), NULL)#x]
-+- SubqueryAlias t
- +- Project [col1#x AS c1#x]
- +- LocalRelation [col1#x]
+WithCTE
+:- CTERelationDef xxxx, false
+: +- SubqueryAlias t
+: +- Project [listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 ASC NULLS
FIRST)#x AS c1#x]
+: +- Aggregate [listagg(col1#x, null, col1#x ASC NULLS FIRST, 0, 0) AS
listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 ASC NULLS FIRST)#x]
+: +- SubqueryAlias __auto_generated_subquery_name
+: +- LocalRelation [col1#x]
++- Project [replace(replace(c1#x, , ),
+, $) AS replace(replace(c1, , ),
+, $)#x]
+ +- SubqueryAlias t
+ +- CTERelationRef xxxx, true, [c1#x], false, false, 1
-- !query
-SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('A'),
('b'), ('B')) AS t(c1)
+SELECT lower(listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) FROM
(VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1)
-- !query analysis
-Aggregate [listagg(distinct collate(c1#x, utf8_lcase), null, 0, 0) AS
listagg(DISTINCT collate(c1, utf8_lcase), NULL)#x]
+Aggregate [lower(listagg(c1#x, null, collate(c1#x, utf8_lcase) ASC NULLS
FIRST, 0, 0)) AS lower(listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1,
utf8_lcase) ASC NULLS FIRST))#x]
+- SubqueryAlias t
+- Project [col1#x AS c1#x]
+- LocalRelation [col1#x]
-- !query
-SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1
COLLATE utf8_lcase) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1)
+SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase)) FROM (VALUES ('a'),
('A'), ('b'), ('B')) AS t(c1)
-- !query analysis
-Aggregate [listagg(distinct collate(c1#x, utf8_lcase), null, collate(c1#x,
utf8_lcase) ASC NULLS FIRST, 0, 0) AS listagg(DISTINCT collate(c1, utf8_lcase),
NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase) ASC NULLS FIRST)#x]
+Aggregate [lower(listagg(distinct collate(c1#x, utf8_lcase), null, 0, 0)) AS
lower(listagg(DISTINCT collate(c1, utf8_lcase), NULL))#x]
+- SubqueryAlias t
+- Project [col1#x AS c1#x]
+- LocalRelation [col1#x]
-- !query
-SELECT listagg(DISTINCT c1 COLLATE unicode_rtrim) FROM (VALUES ('abc '),
('abc '), ('x'), ('abc')) AS t(c1)
+SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1
COLLATE utf8_lcase)) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1)
-- !query analysis
-Aggregate [listagg(distinct collate(c1#x, unicode_rtrim), null, 0, 0) AS
listagg(DISTINCT collate(c1, unicode_rtrim), NULL)#x]
+Aggregate [lower(listagg(distinct collate(c1#x, utf8_lcase), null,
collate(c1#x, utf8_lcase) ASC NULLS FIRST, 0, 0)) AS lower(listagg(DISTINCT
collate(c1, utf8_lcase), NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase)
ASC NULLS FIRST))#x]
+- SubqueryAlias t
+- Project [col1#x AS c1#x]
+- LocalRelation [col1#x]
-- !query
-SELECT listagg(c1) WITHIN GROUP (ORDER BY c1) FROM (VALUES ('abc '), ('abc
'), ('abc\n'), ('abc'), ('x')) AS t(c1)
+SELECT rtrim(listagg(DISTINCT c1 COLLATE unicode_rtrim)) FROM (VALUES ('xbc
'), ('xbc '), ('a'), ('xbc')) AS t(c1)
-- !query analysis
-Aggregate [listagg(c1#x, null, c1#x ASC NULLS FIRST, 0, 0) AS listagg(c1,
NULL) WITHIN GROUP (ORDER BY c1 ASC NULLS FIRST)#x]
+Aggregate [rtrim(listagg(distinct collate(c1#x, unicode_rtrim), null, 0, 0),
None) AS rtrim(listagg(DISTINCT collate(c1, unicode_rtrim), NULL))#x]
+- SubqueryAlias t
+- Project [col1#x AS c1#x]
+- LocalRelation [col1#x]
-- !query
-SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE unicode_rtrim) FROM
(VALUES ('abc '), ('abc '), ('abc\n'), ('abc'), ('x')) AS t(c1)
+WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 COLLATE
unicode_rtrim) FROM (VALUES ('abc '), ('abc\n'), ('abc'), ('x'))) SELECT
replace(replace(c1, ' ', ''), '\n', '$') FROM t
-- !query analysis
-Aggregate [listagg(c1#x, null, collate(c1#x, unicode_rtrim) ASC NULLS FIRST,
0, 0) AS listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1, unicode_rtrim)
ASC NULLS FIRST)#x]
-+- SubqueryAlias t
- +- Project [col1#x AS c1#x]
- +- LocalRelation [col1#x]
+WithCTE
+:- CTERelationDef xxxx, false
+: +- SubqueryAlias t
+: +- Project [listagg(col1, NULL) WITHIN GROUP (ORDER BY collate(col1,
unicode_rtrim) ASC NULLS FIRST)#x AS c1#x]
+: +- Aggregate [listagg(col1#x, null, collate(col1#x, unicode_rtrim)
ASC NULLS FIRST, 0, 0) AS listagg(col1, NULL) WITHIN GROUP (ORDER BY
collate(col1, unicode_rtrim) ASC NULLS FIRST)#x]
+: +- SubqueryAlias __auto_generated_subquery_name
+: +- LocalRelation [col1#x]
++- Project [replace(replace(c1#x, , ),
+, $) AS replace(replace(c1, , ),
+, $)#x]
+ +- SubqueryAlias t
+ +- CTERelationRef xxxx, true, [c1#x], false, false, 1
-- !query
diff --git
a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out
b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out
index 71eb3f8ca76b..76aba15b4617 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/listagg.sql.out
@@ -1,223 +1,274 @@
-- Automatically generated by SQLQueryTestSuite
-- !query
CREATE TEMP VIEW df AS
-SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL,
NULL)) AS t(a, b)
+SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL,
NULL))
-- !query analysis
-CreateViewCommand `df`, SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b',
'c'), ('b', 'd'), (NULL, NULL)) AS t(a, b), false, false, LocalTempView,
UNSUPPORTED, true
- +- Project [a#x, b#x]
- +- SubqueryAlias t
- +- Project [col1#x AS a#x, col2#x AS b#x]
- +- LocalRelation [col1#x, col2#x]
+CreateViewCommand `df`, SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b',
'c'), ('b', 'd'), (NULL, NULL)), false, false, LocalTempView, UNSUPPORTED, true
+ +- Project [col1#x, col2#x]
+ +- SubqueryAlias __auto_generated_subquery_name
+ +- LocalRelation [col1#x, col2#x]
-- !query
CREATE TEMP VIEW df2 AS
-SELECT * FROM (VALUES (1, true), (2, false), (3, false)) AS t(a, b)
+SELECT * FROM (VALUES (1, true), (2, false), (3, false))
-- !query analysis
-CreateViewCommand `df2`, SELECT * FROM (VALUES (1, true), (2, false), (3,
false)) AS t(a, b), false, false, LocalTempView, UNSUPPORTED, true
- +- Project [a#x, b#x]
- +- SubqueryAlias t
- +- Project [col1#x AS a#x, col2#x AS b#x]
- +- LocalRelation [col1#x, col2#x]
+CreateViewCommand `df2`, SELECT * FROM (VALUES (1, true), (2, false), (3,
false)), false, false, LocalTempView, UNSUPPORTED, true
+ +- Project [col1#x, col2#x]
+ +- SubqueryAlias __auto_generated_subquery_name
+ +- LocalRelation [col1#x, col2#x]
-- !query
-SELECT listagg(b) FROM df GROUP BY a
+WITH t(col) AS (SELECT listagg(col2) FROM df GROUP BY col1) SELECT len(col),
regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 'c'),
regexp_count(col, 'd') FROM t
-- !query analysis
-Aggregate [a#x], [listagg(b#x, null, 0, 0) AS listagg(b, NULL)#x]
-+- SubqueryAlias df
- +- View (`df`, [a#x, b#x])
- +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
- +- Project [a#x, b#x]
- +- SubqueryAlias t
- +- Project [col1#x AS a#x, col2#x AS b#x]
- +- LocalRelation [col1#x, col2#x]
+WithCTE
+:- CTERelationDef xxxx, false
+: +- SubqueryAlias t
+: +- Project [listagg(col2, NULL)#x AS col#x]
+: +- Aggregate [col1#x], [listagg(col2#x, null, 0, 0) AS listagg(col2,
NULL)#x]
+: +- SubqueryAlias df
+: +- View (`df`, [col1#x, col2#x])
+: +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as
string) AS col2#x]
+: +- Project [col1#x, col2#x]
+: +- SubqueryAlias __auto_generated_subquery_name
+: +- LocalRelation [col1#x, col2#x]
++- Project [len(col#x) AS len(col)#x, regexp_count(col#x, a) AS
regexp_count(col, a)#x, regexp_count(col#x, b) AS regexp_count(col, b)#x,
regexp_count(col#x, c) AS regexp_count(col, c)#x, regexp_count(col#x, d) AS
regexp_count(col, d)#x]
+ +- SubqueryAlias t
+ +- CTERelationRef xxxx, true, [col#x], false, false
-- !query
-SELECT string_agg(b) FROM df GROUP BY a
+WITH t(col) AS (SELECT string_agg(col2) FROM df GROUP BY col1) SELECT
len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col,
'c'), regexp_count(col, 'd') FROM t
-- !query analysis
-Aggregate [a#x], [string_agg(b#x, null, 0, 0) AS string_agg(b, NULL)#x]
-+- SubqueryAlias df
- +- View (`df`, [a#x, b#x])
- +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
- +- Project [a#x, b#x]
- +- SubqueryAlias t
- +- Project [col1#x AS a#x, col2#x AS b#x]
- +- LocalRelation [col1#x, col2#x]
+WithCTE
+:- CTERelationDef xxxx, false
+: +- SubqueryAlias t
+: +- Project [string_agg(col2, NULL)#x AS col#x]
+: +- Aggregate [col1#x], [string_agg(col2#x, null, 0, 0) AS
string_agg(col2, NULL)#x]
+: +- SubqueryAlias df
+: +- View (`df`, [col1#x, col2#x])
+: +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as
string) AS col2#x]
+: +- Project [col1#x, col2#x]
+: +- SubqueryAlias __auto_generated_subquery_name
+: +- LocalRelation [col1#x, col2#x]
++- Project [len(col#x) AS len(col)#x, regexp_count(col#x, a) AS
regexp_count(col, a)#x, regexp_count(col#x, b) AS regexp_count(col, b)#x,
regexp_count(col#x, c) AS regexp_count(col, c)#x, regexp_count(col#x, d) AS
regexp_count(col, d)#x]
+ +- SubqueryAlias t
+ +- CTERelationRef xxxx, true, [col#x], false, false
-- !query
-SELECT listagg(b, NULL) FROM df GROUP BY a
+WITH t(col) AS (SELECT listagg(col2, NULL) FROM df GROUP BY col1) SELECT
len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col,
'c'), regexp_count(col, 'd') FROM t
-- !query analysis
-Aggregate [a#x], [listagg(b#x, null, 0, 0) AS listagg(b, NULL)#x]
-+- SubqueryAlias df
- +- View (`df`, [a#x, b#x])
- +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
- +- Project [a#x, b#x]
- +- SubqueryAlias t
- +- Project [col1#x AS a#x, col2#x AS b#x]
- +- LocalRelation [col1#x, col2#x]
+WithCTE
+:- CTERelationDef xxxx, false
+: +- SubqueryAlias t
+: +- Project [listagg(col2, NULL)#x AS col#x]
+: +- Aggregate [col1#x], [listagg(col2#x, null, 0, 0) AS listagg(col2,
NULL)#x]
+: +- SubqueryAlias df
+: +- View (`df`, [col1#x, col2#x])
+: +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as
string) AS col2#x]
+: +- Project [col1#x, col2#x]
+: +- SubqueryAlias __auto_generated_subquery_name
+: +- LocalRelation [col1#x, col2#x]
++- Project [len(col#x) AS len(col)#x, regexp_count(col#x, a) AS
regexp_count(col, a)#x, regexp_count(col#x, b) AS regexp_count(col, b)#x,
regexp_count(col#x, c) AS regexp_count(col, c)#x, regexp_count(col#x, d) AS
regexp_count(col, d)#x]
+ +- SubqueryAlias t
+ +- CTERelationRef xxxx, true, [col#x], false, false
-- !query
-SELECT listagg(b) FROM df WHERE 1 != 1
+SELECT listagg(col2) FROM df WHERE 1 != 1
-- !query analysis
-Aggregate [listagg(b#x, null, 0, 0) AS listagg(b, NULL)#x]
+Aggregate [listagg(col2#x, null, 0, 0) AS listagg(col2, NULL)#x]
+- Filter NOT (1 = 1)
+- SubqueryAlias df
- +- View (`df`, [a#x, b#x])
- +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
- +- Project [a#x, b#x]
- +- SubqueryAlias t
- +- Project [col1#x AS a#x, col2#x AS b#x]
- +- LocalRelation [col1#x, col2#x]
+ +- View (`df`, [col1#x, col2#x])
+ +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string)
AS col2#x]
+ +- Project [col1#x, col2#x]
+ +- SubqueryAlias __auto_generated_subquery_name
+ +- LocalRelation [col1#x, col2#x]
-- !query
-SELECT listagg(b, '|') FROM df GROUP BY a
+WITH t(col) AS (SELECT listagg(col2, '|') FROM df GROUP BY col1) SELECT
len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col,
'c'), regexp_count(col, 'd') FROM t
-- !query analysis
-Aggregate [a#x], [listagg(b#x, |, 0, 0) AS listagg(b, |)#x]
-+- SubqueryAlias df
- +- View (`df`, [a#x, b#x])
- +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
- +- Project [a#x, b#x]
- +- SubqueryAlias t
- +- Project [col1#x AS a#x, col2#x AS b#x]
- +- LocalRelation [col1#x, col2#x]
+WithCTE
+:- CTERelationDef xxxx, false
+: +- SubqueryAlias t
+: +- Project [listagg(col2, |)#x AS col#x]
+: +- Aggregate [col1#x], [listagg(col2#x, |, 0, 0) AS listagg(col2,
|)#x]
+: +- SubqueryAlias df
+: +- View (`df`, [col1#x, col2#x])
+: +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as
string) AS col2#x]
+: +- Project [col1#x, col2#x]
+: +- SubqueryAlias __auto_generated_subquery_name
+: +- LocalRelation [col1#x, col2#x]
++- Project [len(col#x) AS len(col)#x, regexp_count(col#x, a) AS
regexp_count(col, a)#x, regexp_count(col#x, b) AS regexp_count(col, b)#x,
regexp_count(col#x, c) AS regexp_count(col, c)#x, regexp_count(col#x, d) AS
regexp_count(col, d)#x]
+ +- SubqueryAlias t
+ +- CTERelationRef xxxx, true, [col#x], false, false
-- !query
-SELECT listagg(a) FROM df
+WITH t(col) AS (SELECT listagg(col1) FROM df) SELECT len(col),
regexp_count(col, 'a'), regexp_count(col, 'b') FROM t
-- !query analysis
-Aggregate [listagg(a#x, null, 0, 0) AS listagg(a, NULL)#x]
-+- SubqueryAlias df
- +- View (`df`, [a#x, b#x])
- +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
- +- Project [a#x, b#x]
- +- SubqueryAlias t
- +- Project [col1#x AS a#x, col2#x AS b#x]
- +- LocalRelation [col1#x, col2#x]
+WithCTE
+:- CTERelationDef xxxx, false
+: +- SubqueryAlias t
+: +- Project [listagg(col1, NULL)#x AS col#x]
+: +- Aggregate [listagg(col1#x, null, 0, 0) AS listagg(col1, NULL)#x]
+: +- SubqueryAlias df
+: +- View (`df`, [col1#x, col2#x])
+: +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as
string) AS col2#x]
+: +- Project [col1#x, col2#x]
+: +- SubqueryAlias __auto_generated_subquery_name
+: +- LocalRelation [col1#x, col2#x]
++- Project [len(col#x) AS len(col)#x, regexp_count(col#x, a) AS
regexp_count(col, a)#x, regexp_count(col#x, b) AS regexp_count(col, b)#x]
+ +- SubqueryAlias t
+ +- CTERelationRef xxxx, true, [col#x], false, false, 1
-- !query
-SELECT listagg(DISTINCT a) FROM df
+WITH t(col) AS (SELECT listagg(DISTINCT col1) FROM df) SELECT len(col),
regexp_count(col, 'a'), regexp_count(col, 'b') FROM t
-- !query analysis
-Aggregate [listagg(distinct a#x, null, 0, 0) AS listagg(DISTINCT a, NULL)#x]
-+- SubqueryAlias df
- +- View (`df`, [a#x, b#x])
- +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
- +- Project [a#x, b#x]
- +- SubqueryAlias t
- +- Project [col1#x AS a#x, col2#x AS b#x]
- +- LocalRelation [col1#x, col2#x]
+WithCTE
+:- CTERelationDef xxxx, false
+: +- SubqueryAlias t
+: +- Project [listagg(DISTINCT col1, NULL)#x AS col#x]
+: +- Aggregate [listagg(distinct col1#x, null, 0, 0) AS
listagg(DISTINCT col1, NULL)#x]
+: +- SubqueryAlias df
+: +- View (`df`, [col1#x, col2#x])
+: +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as
string) AS col2#x]
+: +- Project [col1#x, col2#x]
+: +- SubqueryAlias __auto_generated_subquery_name
+: +- LocalRelation [col1#x, col2#x]
++- Project [len(col#x) AS len(col)#x, regexp_count(col#x, a) AS
regexp_count(col, a)#x, regexp_count(col#x, b) AS regexp_count(col, b)#x]
+ +- SubqueryAlias t
+ +- CTERelationRef xxxx, true, [col#x], false, false, 1
-- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY a) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) FROM df
-- !query analysis
-Aggregate [listagg(a#x, null, a#x ASC NULLS FIRST, 0, 0) AS listagg(a, NULL)
WITHIN GROUP (ORDER BY a ASC NULLS FIRST)#x]
+Aggregate [listagg(col1#x, null, col1#x ASC NULLS FIRST, 0, 0) AS
listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 ASC NULLS FIRST)#x]
+- SubqueryAlias df
- +- View (`df`, [a#x, b#x])
- +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
- +- Project [a#x, b#x]
- +- SubqueryAlias t
- +- Project [col1#x AS a#x, col2#x AS b#x]
- +- LocalRelation [col1#x, col2#x]
+ +- View (`df`, [col1#x, col2#x])
+ +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS
col2#x]
+ +- Project [col1#x, col2#x]
+ +- SubqueryAlias __auto_generated_subquery_name
+ +- LocalRelation [col1#x, col2#x]
-- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 DESC) FROM df
-- !query analysis
-Aggregate [listagg(a#x, null, a#x DESC NULLS LAST, 0, 0) AS listagg(a, NULL)
WITHIN GROUP (ORDER BY a DESC NULLS LAST)#x]
+Aggregate [listagg(col1#x, null, col1#x DESC NULLS LAST, 0, 0) AS
listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 DESC NULLS LAST)#x]
+- SubqueryAlias df
- +- View (`df`, [a#x, b#x])
- +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
- +- Project [a#x, b#x]
- +- SubqueryAlias t
- +- Project [col1#x AS a#x, col2#x AS b#x]
- +- LocalRelation [col1#x, col2#x]
+ +- View (`df`, [col1#x, col2#x])
+ +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS
col2#x]
+ +- Project [col1#x, col2#x]
+ +- SubqueryAlias __auto_generated_subquery_name
+ +- LocalRelation [col1#x, col2#x]
-- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) OVER (PARTITION BY b) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 DESC) OVER (PARTITION BY
col2) FROM df
-- !query analysis
-Project [listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS LAST) OVER
(PARTITION BY b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x]
-+- Project [a#x, b#x, listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS
LAST) OVER (PARTITION BY b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED
FOLLOWING)#x, listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS LAST) OVER
(PARTITION BY b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x]
- +- Window [listagg(a#x, null, a#x DESC NULLS LAST, 0, 0)
windowspecdefinition(b#x, specifiedwindowframe(RowFrame, unboundedpreceding$(),
unboundedfollowing$())) AS listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS
LAST) OVER (PARTITION BY b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED
FOLLOWING)#x], [b#x]
- +- Project [a#x, b#x]
+Project [listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 DESC NULLS LAST) OVER
(PARTITION BY col2 ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x]
++- Project [col1#x, col2#x, listagg(col1, NULL) WITHIN GROUP (ORDER BY col1
DESC NULLS LAST) OVER (PARTITION BY col2 ROWS BETWEEN UNBOUNDED PRECEDING AND
UNBOUNDED FOLLOWING)#x, listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 DESC
NULLS LAST) OVER (PARTITION BY col2 ROWS BETWEEN UNBOUNDED PRECEDING AND
UNBOUNDED FOLLOWING)#x]
+ +- Window [listagg(col1#x, null, col1#x DESC NULLS LAST, 0, 0)
windowspecdefinition(col2#x, specifiedwindowframe(RowFrame,
unboundedpreceding$(), unboundedfollowing$())) AS listagg(col1, NULL) WITHIN
GROUP (ORDER BY col1 DESC NULLS LAST) OVER (PARTITION BY col2 ROWS BETWEEN
UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#x], [col2#x]
+ +- Project [col1#x, col2#x]
+- SubqueryAlias df
- +- View (`df`, [a#x, b#x])
- +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS
b#x]
- +- Project [a#x, b#x]
- +- SubqueryAlias t
- +- Project [col1#x AS a#x, col2#x AS b#x]
- +- LocalRelation [col1#x, col2#x]
+ +- View (`df`, [col1#x, col2#x])
+ +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as
string) AS col2#x]
+ +- Project [col1#x, col2#x]
+ +- SubqueryAlias __auto_generated_subquery_name
+ +- LocalRelation [col1#x, col2#x]
-- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY b) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col2) FROM df
-- !query analysis
-Aggregate [listagg(a#x, null, b#x ASC NULLS FIRST, 0, 0) AS listagg(a, NULL)
WITHIN GROUP (ORDER BY b ASC NULLS FIRST)#x]
+Aggregate [listagg(col1#x, null, col2#x ASC NULLS FIRST, 0, 0) AS
listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 ASC NULLS FIRST)#x]
+- SubqueryAlias df
- +- View (`df`, [a#x, b#x])
- +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
- +- Project [a#x, b#x]
- +- SubqueryAlias t
- +- Project [col1#x AS a#x, col2#x AS b#x]
- +- LocalRelation [col1#x, col2#x]
+ +- View (`df`, [col1#x, col2#x])
+ +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS
col2#x]
+ +- Project [col1#x, col2#x]
+ +- SubqueryAlias __auto_generated_subquery_name
+ +- LocalRelation [col1#x, col2#x]
-- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC) FROM df
+WITH t(col) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC) FROM
df) SELECT (col == 'baba') || (col == 'bbaa') FROM t
-- !query analysis
-Aggregate [listagg(a#x, null, b#x DESC NULLS LAST, 0, 0) AS listagg(a, NULL)
WITHIN GROUP (ORDER BY b DESC NULLS LAST)#x]
-+- SubqueryAlias df
- +- View (`df`, [a#x, b#x])
- +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
- +- Project [a#x, b#x]
- +- SubqueryAlias t
- +- Project [col1#x AS a#x, col2#x AS b#x]
- +- LocalRelation [col1#x, col2#x]
+WithCTE
+:- CTERelationDef xxxx, false
+: +- SubqueryAlias t
+: +- Project [listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 DESC NULLS
LAST)#x AS col#x]
+: +- Aggregate [listagg(col1#x, null, col2#x DESC NULLS LAST, 0, 0) AS
listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST)#x]
+: +- SubqueryAlias df
+: +- View (`df`, [col1#x, col2#x])
+: +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as
string) AS col2#x]
+: +- Project [col1#x, col2#x]
+: +- SubqueryAlias __auto_generated_subquery_name
+: +- LocalRelation [col1#x, col2#x]
++- Project [concat(cast((col#x = baba) as string), cast((col#x = bbaa) as
string)) AS concat((col = baba), (col = bbaa))#x]
+ +- SubqueryAlias t
+ +- CTERelationRef xxxx, true, [col#x], false, false, 1
-- !query
-SELECT listagg(a, '|') WITHIN GROUP (ORDER BY b DESC) FROM df
+WITH t(col) AS (SELECT listagg(col1, '|') WITHIN GROUP (ORDER BY col2 DESC)
FROM df) SELECT (col == 'b|a|b|a') || (col == 'b|b|a|a') FROM t
-- !query analysis
-Aggregate [listagg(a#x, |, b#x DESC NULLS LAST, 0, 0) AS listagg(a, |) WITHIN
GROUP (ORDER BY b DESC NULLS LAST)#x]
+WithCTE
+:- CTERelationDef xxxx, false
+: +- SubqueryAlias t
+: +- Project [listagg(col1, |) WITHIN GROUP (ORDER BY col2 DESC NULLS
LAST)#x AS col#x]
+: +- Aggregate [listagg(col1#x, |, col2#x DESC NULLS LAST, 0, 0) AS
listagg(col1, |) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST)#x]
+: +- SubqueryAlias df
+: +- View (`df`, [col1#x, col2#x])
+: +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as
string) AS col2#x]
+: +- Project [col1#x, col2#x]
+: +- SubqueryAlias __auto_generated_subquery_name
+: +- LocalRelation [col1#x, col2#x]
++- Project [concat(cast((col#x = b|a|b|a) as string), cast((col#x = b|b|a|a)
as string)) AS concat((col = b|a|b|a), (col = b|b|a|a))#x]
+ +- SubqueryAlias t
+ +- CTERelationRef xxxx, true, [col#x], false, false, 1
+
+
+-- !query
+SELECT listagg(col1, '|') WITHIN GROUP (ORDER BY col2 DESC) FROM df
+-- !query analysis
+Aggregate [listagg(col1#x, |, col2#x DESC NULLS LAST, 0, 0) AS listagg(col1,
|) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST)#x]
+- SubqueryAlias df
- +- View (`df`, [a#x, b#x])
- +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
- +- Project [a#x, b#x]
- +- SubqueryAlias t
- +- Project [col1#x AS a#x, col2#x AS b#x]
- +- LocalRelation [col1#x, col2#x]
+ +- View (`df`, [col1#x, col2#x])
+ +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS
col2#x]
+ +- Project [col1#x, col2#x]
+ +- SubqueryAlias __auto_generated_subquery_name
+ +- LocalRelation [col1#x, col2#x]
-- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a ASC) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC, col1 ASC) FROM df
-- !query analysis
-Aggregate [listagg(a#x, null, b#x DESC NULLS LAST, a#x ASC NULLS FIRST, 0, 0)
AS listagg(a, NULL) WITHIN GROUP (ORDER BY b DESC NULLS LAST, a ASC NULLS
FIRST)#x]
+Aggregate [listagg(col1#x, null, col2#x DESC NULLS LAST, col1#x ASC NULLS
FIRST, 0, 0) AS listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 DESC NULLS
LAST, col1 ASC NULLS FIRST)#x]
+- SubqueryAlias df
- +- View (`df`, [a#x, b#x])
- +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
- +- Project [a#x, b#x]
- +- SubqueryAlias t
- +- Project [col1#x AS a#x, col2#x AS b#x]
- +- LocalRelation [col1#x, col2#x]
+ +- View (`df`, [col1#x, col2#x])
+ +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS
col2#x]
+ +- Project [col1#x, col2#x]
+ +- SubqueryAlias __auto_generated_subquery_name
+ +- LocalRelation [col1#x, col2#x]
-- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a DESC) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC, col1 DESC) FROM df
-- !query analysis
-Aggregate [listagg(a#x, null, b#x DESC NULLS LAST, a#x DESC NULLS LAST, 0, 0)
AS listagg(a, NULL) WITHIN GROUP (ORDER BY b DESC NULLS LAST, a DESC NULLS
LAST)#x]
+Aggregate [listagg(col1#x, null, col2#x DESC NULLS LAST, col1#x DESC NULLS
LAST, 0, 0) AS listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST,
col1 DESC NULLS LAST)#x]
+- SubqueryAlias df
- +- View (`df`, [a#x, b#x])
- +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS b#x]
- +- Project [a#x, b#x]
- +- SubqueryAlias t
- +- Project [col1#x AS a#x, col2#x AS b#x]
- +- LocalRelation [col1#x, col2#x]
+ +- View (`df`, [col1#x, col2#x])
+ +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as string) AS
col2#x]
+ +- Project [col1#x, col2#x]
+ +- SubqueryAlias __auto_generated_subquery_name
+ +- LocalRelation [col1#x, col2#x]
-- !query
@@ -248,16 +299,15 @@ Aggregate [listagg(c1#x, 0x42, 0, 0) AS listagg(c1,
X'42')#x]
-- !query
-SELECT listagg(a), listagg(b, ',') FROM df2
+SELECT listagg(col1), listagg(col2, ',') FROM df2
-- !query analysis
-Aggregate [listagg(cast(a#x as string), null, 0, 0) AS listagg(a, NULL)#x,
listagg(cast(b#x as string), ,, 0, 0) AS listagg(b, ,)#x]
+Aggregate [listagg(cast(col1#x as string), null, 0, 0) AS listagg(col1,
NULL)#x, listagg(cast(col2#x as string), ,, 0, 0) AS listagg(col2, ,)#x]
+- SubqueryAlias df2
- +- View (`df2`, [a#x, b#x])
- +- Project [cast(a#x as int) AS a#x, cast(b#x as boolean) AS b#x]
- +- Project [a#x, b#x]
- +- SubqueryAlias t
- +- Project [col1#x AS a#x, col2#x AS b#x]
- +- LocalRelation [col1#x, col2#x]
+ +- View (`df2`, [col1#x, col2#x])
+ +- Project [cast(col1#x as int) AS col1#x, cast(col2#x as boolean) AS
col2#x]
+ +- Project [col1#x, col2#x]
+ +- SubqueryAlias __auto_generated_subquery_name
+ +- LocalRelation [col1#x, col2#x]
-- !query
@@ -307,129 +357,128 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
-- !query
-SELECT listagg(b, a) FROM df GROUP BY a
+SELECT listagg(col2, col1) FROM df GROUP BY col1
-- !query analysis
org.apache.spark.sql.catalyst.ExtendedAnalysisException
{
"errorClass" : "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT",
"sqlState" : "42K09",
"messageParameters" : {
- "inputExpr" : "\"a\"",
+ "inputExpr" : "\"col1\"",
"inputName" : "`delimiter`",
"inputType" : "\"STRING\"",
- "sqlExpr" : "\"listagg(b, a)\""
+ "sqlExpr" : "\"listagg(col2, col1)\""
},
"queryContext" : [ {
"objectType" : "",
"objectName" : "",
"startIndex" : 8,
- "stopIndex" : 20,
- "fragment" : "listagg(b, a)"
+ "stopIndex" : 26,
+ "fragment" : "listagg(col2, col1)"
} ]
}
-- !query
-SELECT listagg(a) OVER (ORDER BY a) FROM df
+SELECT listagg(col1) OVER (ORDER BY col1) FROM df
-- !query analysis
-Project [listagg(a, NULL) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN
UNBOUNDED PRECEDING AND CURRENT ROW)#x]
-+- Project [a#x, listagg(a, NULL) OVER (ORDER BY a ASC NULLS FIRST RANGE
BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#x, listagg(a, NULL) OVER (ORDER BY
a ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#x]
- +- Window [listagg(a#x, null, 0, 0) windowspecdefinition(a#x ASC NULLS
FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$()))
AS listagg(a, NULL) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN UNBOUNDED
PRECEDING AND CURRENT ROW)#x], [a#x ASC NULLS FIRST]
- +- Project [a#x]
+Project [listagg(col1, NULL) OVER (ORDER BY col1 ASC NULLS FIRST RANGE BETWEEN
UNBOUNDED PRECEDING AND CURRENT ROW)#x]
++- Project [col1#x, listagg(col1, NULL) OVER (ORDER BY col1 ASC NULLS FIRST
RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#x, listagg(col1, NULL) OVER
(ORDER BY col1 ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT
ROW)#x]
+ +- Window [listagg(col1#x, null, 0, 0) windowspecdefinition(col1#x ASC
NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(),
currentrow$())) AS listagg(col1, NULL) OVER (ORDER BY col1 ASC NULLS FIRST
RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#x], [col1#x ASC NULLS FIRST]
+ +- Project [col1#x]
+- SubqueryAlias df
- +- View (`df`, [a#x, b#x])
- +- Project [cast(a#x as string) AS a#x, cast(b#x as string) AS
b#x]
- +- Project [a#x, b#x]
- +- SubqueryAlias t
- +- Project [col1#x AS a#x, col2#x AS b#x]
- +- LocalRelation [col1#x, col2#x]
+ +- View (`df`, [col1#x, col2#x])
+ +- Project [cast(col1#x as string) AS col1#x, cast(col2#x as
string) AS col2#x]
+ +- Project [col1#x, col2#x]
+ +- SubqueryAlias __auto_generated_subquery_name
+ +- LocalRelation [col1#x, col2#x]
-- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1) FROM df
-- !query analysis
org.apache.spark.sql.catalyst.ExtendedAnalysisException
{
"errorClass" : "INVALID_WINDOW_SPEC_FOR_AGGREGATION_FUNC",
"sqlState" : "42601",
"messageParameters" : {
- "aggFunc" : "\"listagg(a, NULL, a ASC NULLS FIRST)\""
+ "aggFunc" : "\"listagg(col1, NULL, col1 ASC NULLS FIRST)\""
},
"queryContext" : [ {
"objectType" : "",
"objectName" : "",
"startIndex" : 8,
- "stopIndex" : 61,
- "fragment" : "listagg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a)"
+ "stopIndex" : 70,
+ "fragment" : "listagg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY
col1)"
} ]
}
-- !query
-SELECT string_agg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df
+SELECT string_agg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1) FROM
df
-- !query analysis
org.apache.spark.sql.catalyst.ExtendedAnalysisException
{
"errorClass" : "INVALID_WINDOW_SPEC_FOR_AGGREGATION_FUNC",
"sqlState" : "42601",
"messageParameters" : {
- "aggFunc" : "\"listagg(a, NULL, a ASC NULLS FIRST)\""
+ "aggFunc" : "\"listagg(col1, NULL, col1 ASC NULLS FIRST)\""
},
"queryContext" : [ {
"objectType" : "",
"objectName" : "",
"startIndex" : 8,
- "stopIndex" : 64,
- "fragment" : "string_agg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a)"
+ "stopIndex" : 73,
+ "fragment" : "string_agg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY
col1)"
} ]
}
-- !query
-SELECT listagg(DISTINCT a) OVER (ORDER BY a) FROM df
+SELECT listagg(DISTINCT col1) OVER (ORDER BY col1) FROM df
-- !query analysis
org.apache.spark.sql.catalyst.ExtendedAnalysisException
{
"errorClass" : "DISTINCT_WINDOW_FUNCTION_UNSUPPORTED",
"sqlState" : "0A000",
"messageParameters" : {
- "windowExpr" : "\"listagg(DISTINCT a, NULL) OVER (ORDER BY a ASC NULLS
FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)\""
+ "windowExpr" : "\"listagg(DISTINCT col1, NULL) OVER (ORDER BY col1 ASC
NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)\""
},
"queryContext" : [ {
"objectType" : "",
"objectName" : "",
"startIndex" : 8,
- "stopIndex" : 44,
- "fragment" : "listagg(DISTINCT a) OVER (ORDER BY a)"
+ "stopIndex" : 50,
+ "fragment" : "listagg(DISTINCT col1) OVER (ORDER BY col1)"
} ]
}
-- !query
-SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY b) FROM df
+SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col2) FROM df
-- !query analysis
org.apache.spark.sql.catalyst.ExtendedAnalysisException
{
"errorClass" :
"INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT",
"sqlState" : "42K0K",
"messageParameters" : {
- "funcArg" : "\"a\"",
+ "funcArg" : "\"col1\"",
"funcName" : "`listagg`",
- "orderingExpr" : "\"b\""
+ "orderingExpr" : "\"col2\""
}
}
-- !query
-SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY a, b) FROM df
+SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col1, col2) FROM df
-- !query analysis
org.apache.spark.sql.catalyst.ExtendedAnalysisException
{
"errorClass" :
"INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT",
"sqlState" : "42K0K",
"messageParameters" : {
- "funcArg" : "\"a\"",
+ "funcArg" : "\"col1\"",
"funcName" : "`listagg`",
- "orderingExpr" : "\"a\", \"b\""
+ "orderingExpr" : "\"col1\", \"col2\""
}
}
diff --git
a/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql
b/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql
index 35f86183c37b..a1949cec1ebf 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/listagg-collations.sql
@@ -1,12 +1,14 @@
--- Test cases with collations
+-- Test cases with utf8_binary
SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_binary) FROM (VALUES
('a'), ('A'), ('b'), ('B')) AS t(c1);
-SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase) FROM (VALUES
('a'), ('A'), ('b'), ('B')) AS t(c1);
SELECT listagg(DISTINCT c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('A'),
('b'), ('B')) AS t(c1);
-SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('A'),
('b'), ('B')) AS t(c1);
-SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1
COLLATE utf8_lcase) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1);
-SELECT listagg(DISTINCT c1 COLLATE unicode_rtrim) FROM (VALUES ('abc '),
('abc '), ('x'), ('abc')) AS t(c1);
-SELECT listagg(c1) WITHIN GROUP (ORDER BY c1) FROM (VALUES ('abc '), ('abc
'), ('abc\n'), ('abc'), ('x')) AS t(c1);
-SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE unicode_rtrim) FROM
(VALUES ('abc '), ('abc '), ('abc\n'), ('abc'), ('x')) AS t(c1);
+WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) FROM (VALUES
('abc '), ('abc '), ('abc\n'), ('abc'), ('x'))) SELECT replace(replace(c1, '
', ''), '\n', '$') FROM t;
+-- Test cases with utf8_lcase. Lower expression added for determinism
+SELECT lower(listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) FROM
(VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1);
+SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase)) FROM (VALUES ('a'),
('A'), ('b'), ('B')) AS t(c1);
+SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1
COLLATE utf8_lcase)) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1);
+-- Test cases with unicode_rtrim.
+SELECT rtrim(listagg(DISTINCT c1 COLLATE unicode_rtrim)) FROM (VALUES ('xbc
'), ('xbc '), ('a'), ('xbc')) AS t(c1);
+WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 COLLATE
unicode_rtrim) FROM (VALUES ('abc '), ('abc\n'), ('abc'), ('x'))) SELECT
replace(replace(c1, ' ', ''), '\n', '$') FROM t;
-- Error case with collations
-SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1
COLLATE utf8_binary) FROM (VALUES ('a'), ('b'), ('A'), ('B')) AS t(c1);
\ No newline at end of file
+SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1
COLLATE utf8_binary) FROM (VALUES ('a'), ('b'), ('A'), ('B')) AS t(c1);
diff --git a/sql/core/src/test/resources/sql-tests/inputs/listagg.sql
b/sql/core/src/test/resources/sql-tests/inputs/listagg.sql
index 573e0ddddf2f..7acac239cd9c 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/listagg.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/listagg.sql
@@ -2,39 +2,40 @@
-- Create temporary views
CREATE TEMP VIEW df AS
-SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL,
NULL)) AS t(a, b);
+SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL,
NULL));
CREATE TEMP VIEW df2 AS
-SELECT * FROM (VALUES (1, true), (2, false), (3, false)) AS t(a, b);
+SELECT * FROM (VALUES (1, true), (2, false), (3, false));
-- Test cases for listagg function
-SELECT listagg(b) FROM df GROUP BY a;
-SELECT string_agg(b) FROM df GROUP BY a;
-SELECT listagg(b, NULL) FROM df GROUP BY a;
-SELECT listagg(b) FROM df WHERE 1 != 1;
-SELECT listagg(b, '|') FROM df GROUP BY a;
-SELECT listagg(a) FROM df;
-SELECT listagg(DISTINCT a) FROM df;
-SELECT listagg(a) WITHIN GROUP (ORDER BY a) FROM df;
-SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) FROM df;
-SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) OVER (PARTITION BY b) FROM df;
-SELECT listagg(a) WITHIN GROUP (ORDER BY b) FROM df;
-SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC) FROM df;
-SELECT listagg(a, '|') WITHIN GROUP (ORDER BY b DESC) FROM df;
-SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a ASC) FROM df;
-SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a DESC) FROM df;
+WITH t(col) AS (SELECT listagg(col2) FROM df GROUP BY col1) SELECT len(col),
regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 'c'),
regexp_count(col, 'd') FROM t;
+WITH t(col) AS (SELECT string_agg(col2) FROM df GROUP BY col1) SELECT
len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col,
'c'), regexp_count(col, 'd') FROM t;
+WITH t(col) AS (SELECT listagg(col2, NULL) FROM df GROUP BY col1) SELECT
len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col,
'c'), regexp_count(col, 'd') FROM t;
+SELECT listagg(col2) FROM df WHERE 1 != 1;
+WITH t(col) AS (SELECT listagg(col2, '|') FROM df GROUP BY col1) SELECT
len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col,
'c'), regexp_count(col, 'd') FROM t;
+WITH t(col) AS (SELECT listagg(col1) FROM df) SELECT len(col),
regexp_count(col, 'a'), regexp_count(col, 'b') FROM t;
+WITH t(col) AS (SELECT listagg(DISTINCT col1) FROM df) SELECT len(col),
regexp_count(col, 'a'), regexp_count(col, 'b') FROM t;
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) FROM df;
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 DESC) FROM df;
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 DESC) OVER (PARTITION BY
col2) FROM df;
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col2) FROM df;
+WITH t(col) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC) FROM
df) SELECT (col == 'baba') || (col == 'bbaa') FROM t;
+WITH t(col) AS (SELECT listagg(col1, '|') WITHIN GROUP (ORDER BY col2 DESC)
FROM df) SELECT (col == 'b|a|b|a') || (col == 'b|b|a|a') FROM t;
+SELECT listagg(col1, '|') WITHIN GROUP (ORDER BY col2 DESC) FROM df;
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC, col1 ASC) FROM df;
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC, col1 DESC) FROM df;
SELECT listagg(c1) FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1);
SELECT listagg(c1, NULL) FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1);
SELECT listagg(c1, X'42') FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1);
-SELECT listagg(a), listagg(b, ',') FROM df2;
+SELECT listagg(col1), listagg(col2, ',') FROM df2;
-- Error cases
SELECT listagg(c1) FROM (VALUES (ARRAY('a', 'b'))) AS t(c1);
SELECT listagg(c1, ', ') FROM (VALUES (X'DEAD'), (X'BEEF')) AS t(c1);
-SELECT listagg(b, a) FROM df GROUP BY a;
-SELECT listagg(a) OVER (ORDER BY a) FROM df;
-SELECT listagg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df;
-SELECT string_agg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df;
-SELECT listagg(DISTINCT a) OVER (ORDER BY a) FROM df;
-SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY b) FROM df;
-SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY a, b) FROM df;
\ No newline at end of file
+SELECT listagg(col2, col1) FROM df GROUP BY col1;
+SELECT listagg(col1) OVER (ORDER BY col1) FROM df;
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1) FROM df;
+SELECT string_agg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1) FROM
df;
+SELECT listagg(DISTINCT col1) OVER (ORDER BY col1) FROM df;
+SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col2) FROM df;
+SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col1, col2) FROM df;
diff --git
a/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out
b/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out
index cf3bac04f09c..7d5b054217b4 100644
--- a/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/listagg-collations.sql.out
@@ -8,61 +8,61 @@ ABab
-- !query
-SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase) FROM (VALUES
('a'), ('A'), ('b'), ('B')) AS t(c1)
+SELECT listagg(DISTINCT c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('A'),
('b'), ('B')) AS t(c1)
-- !query schema
-struct<listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase) ASC
NULLS FIRST):string>
+struct<listagg(DISTINCT collate(c1, utf8_binary), NULL):string>
-- !query output
aAbB
-- !query
-SELECT listagg(DISTINCT c1 COLLATE utf8_binary) FROM (VALUES ('a'), ('A'),
('b'), ('B')) AS t(c1)
+WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) FROM (VALUES
('abc '), ('abc '), ('abc\n'), ('abc'), ('x'))) SELECT replace(replace(c1, '
', ''), '\n', '$') FROM t
-- !query schema
-struct<listagg(DISTINCT collate(c1, utf8_binary), NULL):string>
+struct<replace(replace(c1, , ),
+, $):string>
-- !query output
-aAbB
+abcabc$abcabcx
-- !query
-SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) FROM (VALUES ('a'), ('A'),
('b'), ('B')) AS t(c1)
+SELECT lower(listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE utf8_lcase)) FROM
(VALUES ('a'), ('A'), ('b'), ('B')) AS t(c1)
-- !query schema
-struct<listagg(DISTINCT collate(c1, utf8_lcase), NULL):string collate
UTF8_LCASE>
+struct<lower(listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1, utf8_lcase)
ASC NULLS FIRST)):string>
-- !query output
-ab
+aabb
-- !query
-SELECT listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1
COLLATE utf8_lcase) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1)
+SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase)) FROM (VALUES ('a'),
('A'), ('b'), ('B')) AS t(c1)
-- !query schema
-struct<listagg(DISTINCT collate(c1, utf8_lcase), NULL) WITHIN GROUP (ORDER BY
collate(c1, utf8_lcase) ASC NULLS FIRST):string collate UTF8_LCASE>
+struct<lower(listagg(DISTINCT collate(c1, utf8_lcase), NULL)):string collate
UTF8_LCASE>
-- !query output
-aB
+ab
-- !query
-SELECT listagg(DISTINCT c1 COLLATE unicode_rtrim) FROM (VALUES ('abc '),
('abc '), ('x'), ('abc')) AS t(c1)
+SELECT lower(listagg(DISTINCT c1 COLLATE utf8_lcase) WITHIN GROUP (ORDER BY c1
COLLATE utf8_lcase)) FROM (VALUES ('a'), ('B'), ('b'), ('A')) AS t(c1)
-- !query schema
-struct<listagg(DISTINCT collate(c1, unicode_rtrim), NULL):string collate
UNICODE_RTRIM>
+struct<lower(listagg(DISTINCT collate(c1, utf8_lcase), NULL) WITHIN GROUP
(ORDER BY collate(c1, utf8_lcase) ASC NULLS FIRST)):string collate UTF8_LCASE>
-- !query output
-abc x
+ab
-- !query
-SELECT listagg(c1) WITHIN GROUP (ORDER BY c1) FROM (VALUES ('abc '), ('abc
'), ('abc\n'), ('abc'), ('x')) AS t(c1)
+SELECT rtrim(listagg(DISTINCT c1 COLLATE unicode_rtrim)) FROM (VALUES ('xbc
'), ('xbc '), ('a'), ('xbc')) AS t(c1)
-- !query schema
-struct<listagg(c1, NULL) WITHIN GROUP (ORDER BY c1 ASC NULLS FIRST):string>
+struct<rtrim(listagg(DISTINCT collate(c1, unicode_rtrim), NULL)):string
collate UNICODE_RTRIM>
-- !query output
-abcabc
-abc abc x
+axbc
-- !query
-SELECT listagg(c1) WITHIN GROUP (ORDER BY c1 COLLATE unicode_rtrim) FROM
(VALUES ('abc '), ('abc '), ('abc\n'), ('abc'), ('x')) AS t(c1)
+WITH t(c1) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 COLLATE
unicode_rtrim) FROM (VALUES ('abc '), ('abc\n'), ('abc'), ('x'))) SELECT
replace(replace(c1, ' ', ''), '\n', '$') FROM t
-- !query schema
-struct<listagg(c1, NULL) WITHIN GROUP (ORDER BY collate(c1, unicode_rtrim) ASC
NULLS FIRST):string>
+struct<replace(replace(c1, , ),
+, $):string>
-- !query output
-abc abc abcabc
-x
+abcabcabc$x
-- !query
diff --git a/sql/core/src/test/resources/sql-tests/results/listagg.sql.out
b/sql/core/src/test/resources/sql-tests/results/listagg.sql.out
index cc4b568c9e4e..de5432939031 100644
--- a/sql/core/src/test/resources/sql-tests/results/listagg.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/listagg.sql.out
@@ -1,7 +1,7 @@
-- Automatically generated by SQLQueryTestSuite
-- !query
CREATE TEMP VIEW df AS
-SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL,
NULL)) AS t(a, b)
+SELECT * FROM (VALUES ('a', 'b'), ('a', 'c'), ('b', 'c'), ('b', 'd'), (NULL,
NULL))
-- !query schema
struct<>
-- !query output
@@ -10,7 +10,7 @@ struct<>
-- !query
CREATE TEMP VIEW df2 AS
-SELECT * FROM (VALUES (1, true), (2, false), (3, false)) AS t(a, b)
+SELECT * FROM (VALUES (1, true), (2, false), (3, false))
-- !query schema
struct<>
-- !query output
@@ -18,89 +18,89 @@ struct<>
-- !query
-SELECT listagg(b) FROM df GROUP BY a
+WITH t(col) AS (SELECT listagg(col2) FROM df GROUP BY col1) SELECT len(col),
regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col, 'c'),
regexp_count(col, 'd') FROM t
-- !query schema
-struct<listagg(b, NULL):string>
+struct<len(col):int,regexp_count(col, a):int,regexp_count(col,
b):int,regexp_count(col, c):int,regexp_count(col, d):int>
-- !query output
-NULL
-bc
-cd
+2 0 0 1 1
+2 0 1 1 0
+NULL NULL NULL NULL NULL
-- !query
-SELECT string_agg(b) FROM df GROUP BY a
+WITH t(col) AS (SELECT string_agg(col2) FROM df GROUP BY col1) SELECT
len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col,
'c'), regexp_count(col, 'd') FROM t
-- !query schema
-struct<string_agg(b, NULL):string>
+struct<len(col):int,regexp_count(col, a):int,regexp_count(col,
b):int,regexp_count(col, c):int,regexp_count(col, d):int>
-- !query output
-NULL
-bc
-cd
+2 0 0 1 1
+2 0 1 1 0
+NULL NULL NULL NULL NULL
-- !query
-SELECT listagg(b, NULL) FROM df GROUP BY a
+WITH t(col) AS (SELECT listagg(col2, NULL) FROM df GROUP BY col1) SELECT
len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col,
'c'), regexp_count(col, 'd') FROM t
-- !query schema
-struct<listagg(b, NULL):string>
+struct<len(col):int,regexp_count(col, a):int,regexp_count(col,
b):int,regexp_count(col, c):int,regexp_count(col, d):int>
-- !query output
-NULL
-bc
-cd
+2 0 0 1 1
+2 0 1 1 0
+NULL NULL NULL NULL NULL
-- !query
-SELECT listagg(b) FROM df WHERE 1 != 1
+SELECT listagg(col2) FROM df WHERE 1 != 1
-- !query schema
-struct<listagg(b, NULL):string>
+struct<listagg(col2, NULL):string>
-- !query output
NULL
-- !query
-SELECT listagg(b, '|') FROM df GROUP BY a
+WITH t(col) AS (SELECT listagg(col2, '|') FROM df GROUP BY col1) SELECT
len(col), regexp_count(col, 'a'), regexp_count(col, 'b'), regexp_count(col,
'c'), regexp_count(col, 'd') FROM t
-- !query schema
-struct<listagg(b, |):string>
+struct<len(col):int,regexp_count(col, a):int,regexp_count(col,
b):int,regexp_count(col, c):int,regexp_count(col, d):int>
-- !query output
-NULL
-b|c
-c|d
+3 0 0 1 1
+3 0 1 1 0
+NULL NULL NULL NULL NULL
-- !query
-SELECT listagg(a) FROM df
+WITH t(col) AS (SELECT listagg(col1) FROM df) SELECT len(col),
regexp_count(col, 'a'), regexp_count(col, 'b') FROM t
-- !query schema
-struct<listagg(a, NULL):string>
+struct<len(col):int,regexp_count(col, a):int,regexp_count(col, b):int>
-- !query output
-aabb
+4 2 2
-- !query
-SELECT listagg(DISTINCT a) FROM df
+WITH t(col) AS (SELECT listagg(DISTINCT col1) FROM df) SELECT len(col),
regexp_count(col, 'a'), regexp_count(col, 'b') FROM t
-- !query schema
-struct<listagg(DISTINCT a, NULL):string>
+struct<len(col):int,regexp_count(col, a):int,regexp_count(col, b):int>
-- !query output
-ab
+2 1 1
-- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY a) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) FROM df
-- !query schema
-struct<listagg(a, NULL) WITHIN GROUP (ORDER BY a ASC NULLS FIRST):string>
+struct<listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 ASC NULLS FIRST):string>
-- !query output
aabb
-- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 DESC) FROM df
-- !query schema
-struct<listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS LAST):string>
+struct<listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 DESC NULLS LAST):string>
-- !query output
bbaa
-- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY a DESC) OVER (PARTITION BY b) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1 DESC) OVER (PARTITION BY
col2) FROM df
-- !query schema
-struct<listagg(a, NULL) WITHIN GROUP (ORDER BY a DESC NULLS LAST) OVER
(PARTITION BY b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED
FOLLOWING):string>
+struct<listagg(col1, NULL) WITHIN GROUP (ORDER BY col1 DESC NULLS LAST) OVER
(PARTITION BY col2 ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED
FOLLOWING):string>
-- !query output
NULL
a
@@ -110,41 +110,49 @@ ba
-- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY b) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col2) FROM df
-- !query schema
-struct<listagg(a, NULL) WITHIN GROUP (ORDER BY b ASC NULLS FIRST):string>
+struct<listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 ASC NULLS FIRST):string>
-- !query output
aabb
-- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC) FROM df
+WITH t(col) AS (SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC) FROM
df) SELECT (col == 'baba') || (col == 'bbaa') FROM t
-- !query schema
-struct<listagg(a, NULL) WITHIN GROUP (ORDER BY b DESC NULLS LAST):string>
+struct<concat((col = baba), (col = bbaa)):string>
-- !query output
-baba
+truefalse
+
+
+-- !query
+WITH t(col) AS (SELECT listagg(col1, '|') WITHIN GROUP (ORDER BY col2 DESC)
FROM df) SELECT (col == 'b|a|b|a') || (col == 'b|b|a|a') FROM t
+-- !query schema
+struct<concat((col = b|a|b|a), (col = b|b|a|a)):string>
+-- !query output
+truefalse
-- !query
-SELECT listagg(a, '|') WITHIN GROUP (ORDER BY b DESC) FROM df
+SELECT listagg(col1, '|') WITHIN GROUP (ORDER BY col2 DESC) FROM df
-- !query schema
-struct<listagg(a, |) WITHIN GROUP (ORDER BY b DESC NULLS LAST):string>
+struct<listagg(col1, |) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST):string>
-- !query output
b|a|b|a
-- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a ASC) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC, col1 ASC) FROM df
-- !query schema
-struct<listagg(a, NULL) WITHIN GROUP (ORDER BY b DESC NULLS LAST, a ASC NULLS
FIRST):string>
+struct<listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST, col1
ASC NULLS FIRST):string>
-- !query output
baba
-- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY b DESC, a DESC) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col2 DESC, col1 DESC) FROM df
-- !query schema
-struct<listagg(a, NULL) WITHIN GROUP (ORDER BY b DESC NULLS LAST, a DESC NULLS
LAST):string>
+struct<listagg(col1, NULL) WITHIN GROUP (ORDER BY col2 DESC NULLS LAST, col1
DESC NULLS LAST):string>
-- !query output
bbaa
@@ -174,9 +182,9 @@ DEAD42BEEF
-- !query
-SELECT listagg(a), listagg(b, ',') FROM df2
+SELECT listagg(col1), listagg(col2, ',') FROM df2
-- !query schema
-struct<listagg(a, NULL):string,listagg(b, ,):string>
+struct<listagg(col1, NULL):string,listagg(col2, ,):string>
-- !query output
123 true,false,false
@@ -232,7 +240,7 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
-- !query
-SELECT listagg(b, a) FROM df GROUP BY a
+SELECT listagg(col2, col1) FROM df GROUP BY col1
-- !query schema
struct<>
-- !query output
@@ -241,25 +249,25 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
"errorClass" : "DATATYPE_MISMATCH.NON_FOLDABLE_INPUT",
"sqlState" : "42K09",
"messageParameters" : {
- "inputExpr" : "\"a\"",
+ "inputExpr" : "\"col1\"",
"inputName" : "`delimiter`",
"inputType" : "\"STRING\"",
- "sqlExpr" : "\"listagg(b, a)\""
+ "sqlExpr" : "\"listagg(col2, col1)\""
},
"queryContext" : [ {
"objectType" : "",
"objectName" : "",
"startIndex" : 8,
- "stopIndex" : 20,
- "fragment" : "listagg(b, a)"
+ "stopIndex" : 26,
+ "fragment" : "listagg(col2, col1)"
} ]
}
-- !query
-SELECT listagg(a) OVER (ORDER BY a) FROM df
+SELECT listagg(col1) OVER (ORDER BY col1) FROM df
-- !query schema
-struct<listagg(a, NULL) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN
UNBOUNDED PRECEDING AND CURRENT ROW):string>
+struct<listagg(col1, NULL) OVER (ORDER BY col1 ASC NULLS FIRST RANGE BETWEEN
UNBOUNDED PRECEDING AND CURRENT ROW):string>
-- !query output
NULL
aa
@@ -269,7 +277,7 @@ aabb
-- !query
-SELECT listagg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df
+SELECT listagg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1) FROM df
-- !query schema
struct<>
-- !query output
@@ -278,20 +286,20 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
"errorClass" : "INVALID_WINDOW_SPEC_FOR_AGGREGATION_FUNC",
"sqlState" : "42601",
"messageParameters" : {
- "aggFunc" : "\"listagg(a, NULL, a ASC NULLS FIRST)\""
+ "aggFunc" : "\"listagg(col1, NULL, col1 ASC NULLS FIRST)\""
},
"queryContext" : [ {
"objectType" : "",
"objectName" : "",
"startIndex" : 8,
- "stopIndex" : 61,
- "fragment" : "listagg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a)"
+ "stopIndex" : 70,
+ "fragment" : "listagg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY
col1)"
} ]
}
-- !query
-SELECT string_agg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a) FROM df
+SELECT string_agg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY col1) FROM
df
-- !query schema
struct<>
-- !query output
@@ -300,20 +308,20 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
"errorClass" : "INVALID_WINDOW_SPEC_FOR_AGGREGATION_FUNC",
"sqlState" : "42601",
"messageParameters" : {
- "aggFunc" : "\"listagg(a, NULL, a ASC NULLS FIRST)\""
+ "aggFunc" : "\"listagg(col1, NULL, col1 ASC NULLS FIRST)\""
},
"queryContext" : [ {
"objectType" : "",
"objectName" : "",
"startIndex" : 8,
- "stopIndex" : 64,
- "fragment" : "string_agg(a) WITHIN GROUP (ORDER BY a) OVER (ORDER BY a)"
+ "stopIndex" : 73,
+ "fragment" : "string_agg(col1) WITHIN GROUP (ORDER BY col1) OVER (ORDER BY
col1)"
} ]
}
-- !query
-SELECT listagg(DISTINCT a) OVER (ORDER BY a) FROM df
+SELECT listagg(DISTINCT col1) OVER (ORDER BY col1) FROM df
-- !query schema
struct<>
-- !query output
@@ -322,20 +330,20 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
"errorClass" : "DISTINCT_WINDOW_FUNCTION_UNSUPPORTED",
"sqlState" : "0A000",
"messageParameters" : {
- "windowExpr" : "\"listagg(DISTINCT a, NULL) OVER (ORDER BY a ASC NULLS
FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)\""
+ "windowExpr" : "\"listagg(DISTINCT col1, NULL) OVER (ORDER BY col1 ASC
NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)\""
},
"queryContext" : [ {
"objectType" : "",
"objectName" : "",
"startIndex" : 8,
- "stopIndex" : 44,
- "fragment" : "listagg(DISTINCT a) OVER (ORDER BY a)"
+ "stopIndex" : 50,
+ "fragment" : "listagg(DISTINCT col1) OVER (ORDER BY col1)"
} ]
}
-- !query
-SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY b) FROM df
+SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col2) FROM df
-- !query schema
struct<>
-- !query output
@@ -344,15 +352,15 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
"errorClass" :
"INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT",
"sqlState" : "42K0K",
"messageParameters" : {
- "funcArg" : "\"a\"",
+ "funcArg" : "\"col1\"",
"funcName" : "`listagg`",
- "orderingExpr" : "\"b\""
+ "orderingExpr" : "\"col2\""
}
}
-- !query
-SELECT listagg(DISTINCT a) WITHIN GROUP (ORDER BY a, b) FROM df
+SELECT listagg(DISTINCT col1) WITHIN GROUP (ORDER BY col1, col2) FROM df
-- !query schema
struct<>
-- !query output
@@ -361,8 +369,8 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
"errorClass" :
"INVALID_WITHIN_GROUP_EXPRESSION.MISMATCH_WITH_DISTINCT_INPUT",
"sqlState" : "42K0K",
"messageParameters" : {
- "funcArg" : "\"a\"",
+ "funcArg" : "\"col1\"",
"funcName" : "`listagg`",
- "orderingExpr" : "\"a\", \"b\""
+ "orderingExpr" : "\"col1\", \"col2\""
}
}
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
index 141d6b219f2a..6ce0a657d5b9 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala
@@ -624,36 +624,39 @@ class DataFrameAggregateSuite extends QueryTest
}
test("listagg function") {
- // normal case
- val df = Seq(("a", "b"), ("b", "c"), ("c", "d")).toDF("a", "b")
+ // Normal case.
+ val df = Seq(("a", "b"), ("b", "c"), ("c", "d")).toDF("col1", "col2")
checkAnswer(
- df.selectExpr("listagg(a)", "listagg(b)"),
+ df.selectExpr("listagg(col1)", "listagg(col2)"),
Seq(Row("abc", "bcd"))
)
checkAnswer(
- df.select(listagg($"a"), listagg($"b")),
+ df.select(listagg($"col1"), listagg($"col2")),
Seq(Row("abc", "bcd"))
)
- // distinct case
- val df2 = Seq(("a", "b"), ("a", "b"), ("b", "d")).toDF("a", "b")
+ // Distinct case.
+ val df2 = Seq(("a", "b"), ("a", "b"), ("b", "d")).toDF("col1", "col2")
checkAnswer(
- df2.select(listagg_distinct($"a"), listagg_distinct($"b")),
+ df2.select(listagg_distinct($"col1"), listagg_distinct($"col2")),
Seq(Row("ab", "bd"))
)
- // null case
- val df3 = Seq(("a", "b", null), ("a", "b", null), (null, null,
null)).toDF("a", "b", "c")
+ // Null case.
+ val df3 = Seq(("a", "b", null), ("a", "b", null), (null, null, null))
+ .toDF("col1", "col2", "col3")
checkAnswer(
- df3.select(listagg_distinct($"a"), listagg($"a"),
listagg_distinct($"b"), listagg($"b"),
- listagg($"c")),
- Seq(Row("a", "aa", "b", "bb", null))
+ df3.select(
+ listagg_distinct($"col1"), listagg($"col1"),
+ listagg_distinct($"col2"), listagg($"col2"),
+ listagg_distinct($"col3"), listagg($"col3")),
+ Seq(Row("a", "aa", "b", "bb", null, null))
)
- // custom delimiter
- val df4 = Seq(("a", "b"), ("b", "c"), ("c", "d")).toDF("a", "b")
+ // Custom delimiter.
+ val df4 = Seq(("a", "b"), ("b", "c"), ("c", "d")).toDF("col1", "col2")
checkAnswer(
- df4.selectExpr("listagg(a, '|')", "listagg(b, '|')"),
+ df4.selectExpr("listagg(col1, '|')", "listagg(col2, '|')"),
Seq(Row("a|b|c", "b|c|d"))
)
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]