This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 867250f68a1d [SPARK-52925][SQL] Return correct error message for anchor self references in rCTEs 867250f68a1d is described below commit 867250f68a1d23cfe9eeefc0d1ee37305211a552 Author: pavle-martinovic_data <pavle.martino...@databricks.com> AuthorDate: Thu Jul 24 12:26:19 2025 +0800 [SPARK-52925][SQL] Return correct error message for anchor self references in rCTEs ### What changes were proposed in this pull request? Catch the case where there is a self-reference in the anchor of a recursive CTE (rCTE), and return an error message stating that the rCTE is invalid. ### Why are the changes needed? Currently, when an rCTE references itself inside the anchor, the query fails with an unhelpful error message, because the recursive CTE is not yet defined at the point where the anchor is resolved. ### Does this PR introduce _any_ user-facing change? Yes, different error messages: such queries now fail with INVALID_RECURSIVE_CTE instead of an unrelated error such as UNRESOLVED_COLUMN.WITHOUT_SUGGESTION. ### How was this patch tested? New golden file tests that cover cases with self-references inside the anchor. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #51619 from Pajaraja/pavle-martinovic_data/AnchorSelfReference. 
Authored-by: pavle-martinovic_data <pavle.martino...@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../sql/catalyst/analysis/ResolveWithCTE.scala | 4 ++ .../sql/catalyst/plans/logical/cteOperators.scala | 22 +++++- .../analyzer-results/cte-recursion.sql.out | 74 +++++++++++++++---- .../analyzer-results/postgreSQL/with.sql.out | 16 +---- .../resources/sql-tests/inputs/cte-recursion.sql | 26 +++++++ .../sql-tests/results/cte-recursion.sql.out | 82 ++++++++++++++++++---- .../sql-tests/results/postgreSQL/with.sql.out | 16 +---- 7 files changed, 187 insertions(+), 53 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveWithCTE.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveWithCTE.scala index 5605519967d0..69b341b5574f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveWithCTE.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveWithCTE.scala @@ -59,6 +59,10 @@ object ResolveWithCTE extends Rule[LogicalPlan] { cteDefMap.put(cteDef.id, cteDef) } cteDef + case cteDef if cteDef.hasSelfReferenceInAnchor => + throw new AnalysisException( + errorClass = "INVALID_RECURSIVE_CTE", + messageParameters = Map.empty) case cteDef => // Multiple self-references are not allowed within one cteDef. 
cteDef.child match { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/cteOperators.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/cteOperators.scala index cea342d37c06..68e93455d453 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/cteOperators.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/cteOperators.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.plans.logical -import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation +import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, UnresolvedSubqueryColumnAliases} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.trees.TreePattern._ @@ -130,6 +130,26 @@ case class CTERelationDef( lazy val hasSelfReferenceAsCTERef: Boolean = child.collectFirstWithSubqueries { case CTERelationRef(this.id, _, _, _, _, true, _) => true }.getOrElse(false) + lazy val hasSelfReferenceInAnchor: Boolean = { + val unionNode: Option[Union] = child match { + case SubqueryAlias(_, union: Union) => + Some(union) + case SubqueryAlias(_, UnresolvedSubqueryColumnAliases(_, union: Union)) => + Some(union) + case SubqueryAlias(_, WithCTE(union: Union, _)) => + Some(union) + case SubqueryAlias(_, UnresolvedSubqueryColumnAliases(_, WithCTE(union: Union, _))) => + Some(union) + case _ => None + } + if (unionNode.isDefined) { + unionNode.get.children.head.collectFirstWithSubqueries { + case CTERelationRef(this.id, _, _, _, _, true, _) => true + }.getOrElse(false) + } else { + false + } + } lazy val hasSelfReferenceAsUnionLoopRef: Boolean = child.collectFirstWithSubqueries { case UnionLoopRef(this.id, _, _) => true }.getOrElse(false) diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-recursion.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-recursion.sql.out 
index 7dd6cc1b0ec9..66d01bc838aa 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-recursion.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-recursion.sql.out @@ -409,20 +409,10 @@ WITH RECURSIVE r(level) AS ( ) SELECT * FROM r -- !query analysis -org.apache.spark.sql.catalyst.ExtendedAnalysisException +org.apache.spark.sql.AnalysisException { - "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", - "sqlState" : "42703", - "messageParameters" : { - "objectName" : "`level`" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 62, - "stopIndex" : 66, - "fragment" : "level" - } ] + "errorClass" : "INVALID_RECURSIVE_CTE", + "sqlState" : "42836" } @@ -2085,3 +2075,61 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "errorClass" : "INVALID_RECURSIVE_REFERENCE.PLACE", "sqlState" : "42836" } + + +-- !query +WITH RECURSIVE t1(n) AS ( + SELECT 1 FROM t1 + UNION ALL + SELECT n+1 FROM t1 WHERE n < 5) +SELECT * FROM t1 +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_RECURSIVE_CTE", + "sqlState" : "42836" +} + + +-- !query +WITH RECURSIVE t1 AS ( + SELECT 1 AS n FROM t1 + UNION ALL + SELECT n+1 FROM t1 WHERE n < 5) +SELECT * FROM t1 +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_RECURSIVE_CTE", + "sqlState" : "42836" +} + + +-- !query +WITH RECURSIVE t1(n) AS ( + WITH t2(m) AS (SELECT 1) + SELECT 1 FROM t1 + UNION ALL + SELECT n+1 FROM t1 WHERE n < 5) +SELECT * FROM t1 +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_RECURSIVE_CTE", + "sqlState" : "42836" +} + + +-- !query +WITH RECURSIVE t1 AS ( + WITH t2(m) AS (SELECT 1) + SELECT 1 AS n FROM t1 + UNION ALL + SELECT n+1 FROM t1 WHERE n < 5) +SELECT * FROM t1 +-- !query analysis +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_RECURSIVE_CTE", + "sqlState" : "42836" +} diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/with.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/with.sql.out index de86c88f6d1b..00ee071abdce 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/with.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/with.sql.out @@ -1183,20 +1183,10 @@ org.apache.spark.sql.AnalysisException WITH RECURSIVE x(n) AS (SELECT n FROM x UNION ALL SELECT 1) SELECT * FROM x -- !query analysis -org.apache.spark.sql.catalyst.ExtendedAnalysisException +org.apache.spark.sql.AnalysisException { - "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", - "sqlState" : "42703", - "messageParameters" : { - "objectName" : "`n`" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 32, - "stopIndex" : 32, - "fragment" : "n" - } ] + "errorClass" : "INVALID_RECURSIVE_CTE", + "sqlState" : "42836" } diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte-recursion.sql b/sql/core/src/test/resources/sql-tests/inputs/cte-recursion.sql index ebb5c9e0bb2b..05ddd7fb89fa 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/cte-recursion.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/cte-recursion.sql @@ -756,3 +756,29 @@ WITH RECURSIVE t1(n) AS ( (SELECT n + 1 FROM t1 WHERE n < 5 ORDER BY n) ) SELECT * FROM t1; + +WITH RECURSIVE t1(n) AS ( + SELECT 1 FROM t1 + UNION ALL + SELECT n+1 FROM t1 WHERE n < 5) +SELECT * FROM t1; + +WITH RECURSIVE t1 AS ( + SELECT 1 AS n FROM t1 + UNION ALL + SELECT n+1 FROM t1 WHERE n < 5) +SELECT * FROM t1; + +WITH RECURSIVE t1(n) AS ( + WITH t2(m) AS (SELECT 1) + SELECT 1 FROM t1 + UNION ALL + SELECT n+1 FROM t1 WHERE n < 5) +SELECT * FROM t1; + +WITH RECURSIVE t1 AS ( + WITH t2(m) AS (SELECT 1) + SELECT 1 AS n FROM t1 + UNION ALL + SELECT n+1 FROM t1 WHERE n < 5) +SELECT * FROM t1; diff --git a/sql/core/src/test/resources/sql-tests/results/cte-recursion.sql.out 
b/sql/core/src/test/resources/sql-tests/results/cte-recursion.sql.out index f70a6ef3f9ad..6689bbeeefdd 100644 --- a/sql/core/src/test/resources/sql-tests/results/cte-recursion.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/cte-recursion.sql.out @@ -448,20 +448,10 @@ SELECT * FROM r -- !query schema struct<> -- !query output -org.apache.spark.sql.catalyst.ExtendedAnalysisException +org.apache.spark.sql.AnalysisException { - "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", - "sqlState" : "42703", - "messageParameters" : { - "objectName" : "`level`" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 62, - "stopIndex" : 66, - "fragment" : "level" - } ] + "errorClass" : "INVALID_RECURSIVE_CTE", + "sqlState" : "42836" } @@ -1875,3 +1865,69 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException "errorClass" : "INVALID_RECURSIVE_REFERENCE.PLACE", "sqlState" : "42836" } + + +-- !query +WITH RECURSIVE t1(n) AS ( + SELECT 1 FROM t1 + UNION ALL + SELECT n+1 FROM t1 WHERE n < 5) +SELECT * FROM t1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_RECURSIVE_CTE", + "sqlState" : "42836" +} + + +-- !query +WITH RECURSIVE t1 AS ( + SELECT 1 AS n FROM t1 + UNION ALL + SELECT n+1 FROM t1 WHERE n < 5) +SELECT * FROM t1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_RECURSIVE_CTE", + "sqlState" : "42836" +} + + +-- !query +WITH RECURSIVE t1(n) AS ( + WITH t2(m) AS (SELECT 1) + SELECT 1 FROM t1 + UNION ALL + SELECT n+1 FROM t1 WHERE n < 5) +SELECT * FROM t1 +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_RECURSIVE_CTE", + "sqlState" : "42836" +} + + +-- !query +WITH RECURSIVE t1 AS ( + WITH t2(m) AS (SELECT 1) + SELECT 1 AS n FROM t1 + UNION ALL + SELECT n+1 FROM t1 WHERE n < 5) +SELECT * FROM t1 +-- !query schema +struct<> +-- 
!query output +org.apache.spark.sql.AnalysisException +{ + "errorClass" : "INVALID_RECURSIVE_CTE", + "sqlState" : "42836" +} diff --git a/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out b/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out index 0ce77049905c..4094c69ec463 100644 --- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out @@ -901,20 +901,10 @@ WITH RECURSIVE x(n) AS (SELECT n FROM x UNION ALL SELECT 1) -- !query schema struct<> -- !query output -org.apache.spark.sql.catalyst.ExtendedAnalysisException +org.apache.spark.sql.AnalysisException { - "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION", - "sqlState" : "42703", - "messageParameters" : { - "objectName" : "`n`" - }, - "queryContext" : [ { - "objectType" : "", - "objectName" : "", - "startIndex" : 32, - "stopIndex" : 32, - "fragment" : "n" - } ] + "errorClass" : "INVALID_RECURSIVE_CTE", + "sqlState" : "42836" } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org