This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 867250f68a1d [SPARK-52925][SQL] Return correct error message for 
anchor self references in rCTEs
867250f68a1d is described below

commit 867250f68a1d23cfe9eeefc0d1ee37305211a552
Author: pavle-martinovic_data <pavle.martino...@databricks.com>
AuthorDate: Thu Jul 24 12:26:19 2025 +0800

    [SPARK-52925][SQL] Return correct error message for anchor self references 
in rCTEs
    
    ### What changes were proposed in this pull request?
    
    Catch the case where the anchor of a recursive CTE (rCTE) contains a
    self-reference, and fail with the `INVALID_RECURSIVE_CTE` error instead.
    
    ### Why are the changes needed?
    
    Currently, queries in which an rCTE references itself inside the anchor
    return unhelpful error messages (e.g. `UNRESOLVED_COLUMN.WITHOUT_SUGGESTION`),
    because the recursive CTE isn't defined yet at that point.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, different error messages: a self-reference in the anchor of an rCTE
    now fails with `INVALID_RECURSIVE_CTE`.
    
    ### How was this patch tested?
    
    New golden file tests that cover cases with self references inside the
    anchor.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #51619 from Pajaraja/pavle-martinovic_data/AnchorSelfReference.
    
    Authored-by: pavle-martinovic_data <pavle.martino...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../sql/catalyst/analysis/ResolveWithCTE.scala     |  4 ++
 .../sql/catalyst/plans/logical/cteOperators.scala  | 22 +++++-
 .../analyzer-results/cte-recursion.sql.out         | 74 +++++++++++++++----
 .../analyzer-results/postgreSQL/with.sql.out       | 16 +----
 .../resources/sql-tests/inputs/cte-recursion.sql   | 26 +++++++
 .../sql-tests/results/cte-recursion.sql.out        | 82 ++++++++++++++++++----
 .../sql-tests/results/postgreSQL/with.sql.out      | 16 +----
 7 files changed, 187 insertions(+), 53 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveWithCTE.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveWithCTE.scala
index 5605519967d0..69b341b5574f 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveWithCTE.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveWithCTE.scala
@@ -59,6 +59,10 @@ object ResolveWithCTE extends Rule[LogicalPlan] {
               cteDefMap.put(cteDef.id, cteDef)
             }
             cteDef
+          case cteDef if cteDef.hasSelfReferenceInAnchor =>
+            throw new AnalysisException(
+              errorClass = "INVALID_RECURSIVE_CTE",
+              messageParameters = Map.empty)
           case cteDef =>
             // Multiple self-references are not allowed within one cteDef.
             cteDef.child match {
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/cteOperators.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/cteOperators.scala
index cea342d37c06..68e93455d453 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/cteOperators.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/cteOperators.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.catalyst.plans.logical
 
-import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
+import org.apache.spark.sql.catalyst.analysis.{MultiInstanceRelation, 
UnresolvedSubqueryColumnAliases}
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.trees.TreePattern._
@@ -130,6 +130,26 @@ case class CTERelationDef(
   lazy val hasSelfReferenceAsCTERef: Boolean = 
child.collectFirstWithSubqueries {
     case CTERelationRef(this.id, _, _, _, _, true, _) => true
   }.getOrElse(false)
+  lazy val hasSelfReferenceInAnchor: Boolean = {
+    val unionNode: Option[Union] = child match {
+      case SubqueryAlias(_, union: Union) =>
+        Some(union)
+      case SubqueryAlias(_, UnresolvedSubqueryColumnAliases(_, union: Union)) 
=>
+        Some(union)
+      case SubqueryAlias(_, WithCTE(union: Union, _)) =>
+        Some(union)
+      case SubqueryAlias(_, UnresolvedSubqueryColumnAliases(_, WithCTE(union: 
Union, _))) =>
+        Some(union)
+      case _ => None
+    }
+    if (unionNode.isDefined) {
+      unionNode.get.children.head.collectFirstWithSubqueries {
+        case CTERelationRef(this.id, _, _, _, _, true, _) => true
+      }.getOrElse(false)
+    } else {
+      false
+    }
+  }
   lazy val hasSelfReferenceAsUnionLoopRef: Boolean = 
child.collectFirstWithSubqueries {
     case UnionLoopRef(this.id, _, _) => true
   }.getOrElse(false)
diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-recursion.sql.out 
b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-recursion.sql.out
index 7dd6cc1b0ec9..66d01bc838aa 100644
--- 
a/sql/core/src/test/resources/sql-tests/analyzer-results/cte-recursion.sql.out
+++ 
b/sql/core/src/test/resources/sql-tests/analyzer-results/cte-recursion.sql.out
@@ -409,20 +409,10 @@ WITH RECURSIVE r(level) AS (
 )
 SELECT * FROM r
 -- !query analysis
-org.apache.spark.sql.catalyst.ExtendedAnalysisException
+org.apache.spark.sql.AnalysisException
 {
-  "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION",
-  "sqlState" : "42703",
-  "messageParameters" : {
-    "objectName" : "`level`"
-  },
-  "queryContext" : [ {
-    "objectType" : "",
-    "objectName" : "",
-    "startIndex" : 62,
-    "stopIndex" : 66,
-    "fragment" : "level"
-  } ]
+  "errorClass" : "INVALID_RECURSIVE_CTE",
+  "sqlState" : "42836"
 }
 
 
@@ -2085,3 +2075,61 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
   "errorClass" : "INVALID_RECURSIVE_REFERENCE.PLACE",
   "sqlState" : "42836"
 }
+
+
+-- !query
+WITH RECURSIVE t1(n) AS (
+    SELECT 1 FROM t1
+    UNION ALL
+    SELECT n+1 FROM t1 WHERE n < 5)
+SELECT * FROM t1
+-- !query analysis
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "INVALID_RECURSIVE_CTE",
+  "sqlState" : "42836"
+}
+
+
+-- !query
+WITH RECURSIVE t1 AS (
+    SELECT 1 AS n FROM t1
+    UNION ALL
+    SELECT n+1 FROM t1 WHERE n < 5)
+SELECT * FROM t1
+-- !query analysis
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "INVALID_RECURSIVE_CTE",
+  "sqlState" : "42836"
+}
+
+
+-- !query
+WITH RECURSIVE t1(n) AS (
+    WITH t2(m) AS (SELECT 1)
+    SELECT 1 FROM t1
+    UNION ALL
+    SELECT n+1 FROM t1 WHERE n < 5)
+SELECT * FROM t1
+-- !query analysis
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "INVALID_RECURSIVE_CTE",
+  "sqlState" : "42836"
+}
+
+
+-- !query
+WITH RECURSIVE t1 AS (
+    WITH t2(m) AS (SELECT 1)
+    SELECT 1 AS n FROM t1
+    UNION ALL
+    SELECT n+1 FROM t1 WHERE n < 5)
+SELECT * FROM t1
+-- !query analysis
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "INVALID_RECURSIVE_CTE",
+  "sqlState" : "42836"
+}
diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/with.sql.out
 
b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/with.sql.out
index de86c88f6d1b..00ee071abdce 100644
--- 
a/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/with.sql.out
+++ 
b/sql/core/src/test/resources/sql-tests/analyzer-results/postgreSQL/with.sql.out
@@ -1183,20 +1183,10 @@ org.apache.spark.sql.AnalysisException
 WITH RECURSIVE x(n) AS (SELECT n FROM x UNION ALL SELECT 1)
        SELECT * FROM x
 -- !query analysis
-org.apache.spark.sql.catalyst.ExtendedAnalysisException
+org.apache.spark.sql.AnalysisException
 {
-  "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION",
-  "sqlState" : "42703",
-  "messageParameters" : {
-    "objectName" : "`n`"
-  },
-  "queryContext" : [ {
-    "objectType" : "",
-    "objectName" : "",
-    "startIndex" : 32,
-    "stopIndex" : 32,
-    "fragment" : "n"
-  } ]
+  "errorClass" : "INVALID_RECURSIVE_CTE",
+  "sqlState" : "42836"
 }
 
 
diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte-recursion.sql 
b/sql/core/src/test/resources/sql-tests/inputs/cte-recursion.sql
index ebb5c9e0bb2b..05ddd7fb89fa 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/cte-recursion.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/cte-recursion.sql
@@ -756,3 +756,29 @@ WITH RECURSIVE t1(n) AS (
     (SELECT n + 1 FROM t1 WHERE n < 5 ORDER BY n)
 )
 SELECT * FROM t1;
+
+WITH RECURSIVE t1(n) AS (
+    SELECT 1 FROM t1
+    UNION ALL
+    SELECT n+1 FROM t1 WHERE n < 5)
+SELECT * FROM t1;
+
+WITH RECURSIVE t1 AS (
+    SELECT 1 AS n FROM t1
+    UNION ALL
+    SELECT n+1 FROM t1 WHERE n < 5)
+SELECT * FROM t1;
+
+WITH RECURSIVE t1(n) AS (
+    WITH t2(m) AS (SELECT 1)
+    SELECT 1 FROM t1
+    UNION ALL
+    SELECT n+1 FROM t1 WHERE n < 5)
+SELECT * FROM t1;
+
+WITH RECURSIVE t1 AS (
+    WITH t2(m) AS (SELECT 1)
+    SELECT 1 AS n FROM t1
+    UNION ALL
+    SELECT n+1 FROM t1 WHERE n < 5)
+SELECT * FROM t1;
diff --git 
a/sql/core/src/test/resources/sql-tests/results/cte-recursion.sql.out 
b/sql/core/src/test/resources/sql-tests/results/cte-recursion.sql.out
index f70a6ef3f9ad..6689bbeeefdd 100644
--- a/sql/core/src/test/resources/sql-tests/results/cte-recursion.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/cte-recursion.sql.out
@@ -448,20 +448,10 @@ SELECT * FROM r
 -- !query schema
 struct<>
 -- !query output
-org.apache.spark.sql.catalyst.ExtendedAnalysisException
+org.apache.spark.sql.AnalysisException
 {
-  "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION",
-  "sqlState" : "42703",
-  "messageParameters" : {
-    "objectName" : "`level`"
-  },
-  "queryContext" : [ {
-    "objectType" : "",
-    "objectName" : "",
-    "startIndex" : 62,
-    "stopIndex" : 66,
-    "fragment" : "level"
-  } ]
+  "errorClass" : "INVALID_RECURSIVE_CTE",
+  "sqlState" : "42836"
 }
 
 
@@ -1875,3 +1865,69 @@ org.apache.spark.sql.catalyst.ExtendedAnalysisException
   "errorClass" : "INVALID_RECURSIVE_REFERENCE.PLACE",
   "sqlState" : "42836"
 }
+
+
+-- !query
+WITH RECURSIVE t1(n) AS (
+    SELECT 1 FROM t1
+    UNION ALL
+    SELECT n+1 FROM t1 WHERE n < 5)
+SELECT * FROM t1
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "INVALID_RECURSIVE_CTE",
+  "sqlState" : "42836"
+}
+
+
+-- !query
+WITH RECURSIVE t1 AS (
+    SELECT 1 AS n FROM t1
+    UNION ALL
+    SELECT n+1 FROM t1 WHERE n < 5)
+SELECT * FROM t1
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "INVALID_RECURSIVE_CTE",
+  "sqlState" : "42836"
+}
+
+
+-- !query
+WITH RECURSIVE t1(n) AS (
+    WITH t2(m) AS (SELECT 1)
+    SELECT 1 FROM t1
+    UNION ALL
+    SELECT n+1 FROM t1 WHERE n < 5)
+SELECT * FROM t1
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "INVALID_RECURSIVE_CTE",
+  "sqlState" : "42836"
+}
+
+
+-- !query
+WITH RECURSIVE t1 AS (
+    WITH t2(m) AS (SELECT 1)
+    SELECT 1 AS n FROM t1
+    UNION ALL
+    SELECT n+1 FROM t1 WHERE n < 5)
+SELECT * FROM t1
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.AnalysisException
+{
+  "errorClass" : "INVALID_RECURSIVE_CTE",
+  "sqlState" : "42836"
+}
diff --git 
a/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out 
b/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out
index 0ce77049905c..4094c69ec463 100644
--- a/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/postgreSQL/with.sql.out
@@ -901,20 +901,10 @@ WITH RECURSIVE x(n) AS (SELECT n FROM x UNION ALL SELECT 
1)
 -- !query schema
 struct<>
 -- !query output
-org.apache.spark.sql.catalyst.ExtendedAnalysisException
+org.apache.spark.sql.AnalysisException
 {
-  "errorClass" : "UNRESOLVED_COLUMN.WITHOUT_SUGGESTION",
-  "sqlState" : "42703",
-  "messageParameters" : {
-    "objectName" : "`n`"
-  },
-  "queryContext" : [ {
-    "objectType" : "",
-    "objectName" : "",
-    "startIndex" : 32,
-    "stopIndex" : 32,
-    "fragment" : "n"
-  } ]
+  "errorClass" : "INVALID_RECURSIVE_CTE",
+  "sqlState" : "42836"
 }
 
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to