(spark) branch master updated: [SPARK-54718][SQL] Preserve attributes names during CTE newInstance()

ptoth Wed, 17 Dec 2025 04:53:31 -0800

This is an automated email from the ASF dual-hosted git repository.

ptoth pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 7735522de0cd [SPARK-54718][SQL] Preserve attributes names during CTE 
newInstance()
7735522de0cd is described below

commit 7735522de0cd58e9650cb7f1ca0ab922059849f9
Author: Mikhail Nikoliukin <[email protected]>
AuthorDate: Wed Dec 17 13:53:00 2025 +0100

    [SPARK-54718][SQL] Preserve attributes names during CTE newInstance()
    
    ### What changes were proposed in this pull request?
    
    New implementation of `CTERelationRef.newInstance()` that preserves 
attributes' names (see tests).
    To control this small behavior change added the flag 
`LEGACY_CTE_DUPLICATE_ATTRIBUTE_NAMES`.
    
    ### Why are the changes needed?
    
    The output schema is much more predictable and does not depend on query 
shape.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, the column names case in the output schema has changed in case. Only 
affects queries with CTE that have several attributes with the same exprId but 
different names.
    
    Example of current output of tests (without fix, after fix you can see in 
golden files).
    
    ```
    WITH cte AS (SELECT 1 AS id, 'test' AS COLNAME),
         cte2 AS (SELECT id, COLNAME, colname FROM cte)
    SELECT * FROM cte2 a JOIN cte2 b ON (a.id = b.id);
    
    +---+-------+-------+---+-------+-------+
    | id|COLNAME|colname| id|colname|colname|
    +---+-------+-------+---+-------+-------+
    |  1|   test|   test|  1|   test|   test|
    +---+-------+-------+---+-------+-------+
    
    WITH cte AS (SELECT 1 AS id, 'test' AS COLNAME),
         cte2 AS (SELECT id, COLNAME, colname FROM cte),
         cte3 AS (SELECT id, COLNAME, colname FROM cte2)
    SELECT * FROM cte2 a JOIN cte2 b ON (a.id = b.id)
    
    +---+-------+-------+---+-------+-------+
    | id|colname|colname| id|colname|colname|
    +---+-------+-------+---+-------+-------+
    |  1|   test|   test|  1|   test|   test|
    +---+-------+-------+---+-------+-------+
    ```
    
    ### How was this patch tested?
    
    New tests + existing tests
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    Generated-by: Cursor 2.2.20
    
    Closes #53487 from mikhailnik-db/fix-cte-new-instance.
    
    Lead-authored-by: Mikhail Nikoliukin <[email protected]>
    Co-authored-by: Wenchen Fan <[email protected]>
    Signed-off-by: Peter Toth <[email protected]>
---
 .../sql/catalyst/plans/logical/cteOperators.scala  |  6 ++-
 .../org/apache/spark/sql/internal/SQLConf.scala    | 10 ++++
 .../sql/catalyst/analysis/AnalysisSuite.scala      | 22 +++++++++
 .../sql-tests/analyzer-results/cte.sql.out         | 56 ++++++++++++++++++++++
 .../src/test/resources/sql-tests/inputs/cte.sql    | 11 +++++
 .../test/resources/sql-tests/results/cte.sql.out   | 21 ++++++++
 6 files changed, 125 insertions(+), 1 deletion(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/cteOperators.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/cteOperators.scala
index 4b915ec60b77..4114bc290312 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/cteOperators.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/cteOperators.scala
@@ -217,7 +217,11 @@ case class CTERelationRef(
     // attributes `a` have the same id, but `Project('a, CTERelationRef(a#2, 
a#3))` can't be
     // resolved.
     val oldAttrToNewAttr = 
AttributeMap(output.zip(output.map(_.newInstance())))
-    copy(output = output.map(attr => oldAttrToNewAttr(attr)))
+    if (conf.getConf(SQLConf.LEGACY_CTE_DUPLICATE_ATTRIBUTE_NAMES)) {
+      copy(output = output.map(attr => oldAttrToNewAttr(attr)))
+    } else {
+      copy(output = output.map(attr => 
attr.withExprId(oldAttrToNewAttr(attr).exprId)))
+    }
   }
 
   def withNewStats(statsOpt: Option[Statistics]): CTERelationRef = 
copy(statsOpt = statsOpt)
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index bfdc6b193281..a93cdb1e7828 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -5414,6 +5414,16 @@ object SQLConf {
     .booleanConf
     .createWithDefault(false)
 
+  val LEGACY_CTE_DUPLICATE_ATTRIBUTE_NAMES =
+    buildConf("spark.sql.legacy.cteDuplicateAttributeNames")
+      .internal()
+      .doc("When true, CTE references sometimes (e.g., self-joins) may 
incorrectly unify column" +
+        "names that differ only in casing (e.g., COLNAME and colname become 
the same). This is" +
+        "the legacy behavior. When false, column name casing is preserved 
correctly.")
+      .version("4.2.0")
+      .booleanConf
+      .createWithDefault(false)
+
   val LEGACY_TIME_PARSER_POLICY = 
buildConf(SqlApiConfHelper.LEGACY_TIME_PARSER_POLICY_KEY)
     .internal()
     .doc("When LEGACY, java.text.SimpleDateFormat is used for formatting and 
parsing " +
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
index f45ed718a247..e25777b230ce 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisSuite.scala
@@ -1712,6 +1712,28 @@ class AnalysisSuite extends AnalysisTest with Matchers {
     }
   }
 
+  test("SPARK-54718: CTERelationRef.newInstance preserves attribute metadata") 
{
+    val baseAttr = AttributeReference("COLNAME", StringType)()
+    val attr1 = baseAttr
+    val attr2 = baseAttr.withName("colname")
+    val cteDef = CTERelationDef(testRelation.select($"a".as("COLNAME")))
+    val cteRef = CTERelationRef(cteDef.id, true, Seq(attr1, attr2), false)
+
+    val newInstance = cteRef.newInstance().asInstanceOf[CTERelationRef]
+    assert(newInstance.output(0).name == "COLNAME")
+    assert(newInstance.output(1).name == "colname")
+    assert(newInstance.output(0).exprId != attr1.exprId)
+    assert(newInstance.output(0).exprId == newInstance.output(1).exprId)
+
+    withSQLConf(SQLConf.LEGACY_CTE_DUPLICATE_ATTRIBUTE_NAMES.key -> "true") {
+      val legacyInstance = cteRef.newInstance().asInstanceOf[CTERelationRef]
+      assert(legacyInstance.output(0).name == "colname")
+      assert(legacyInstance.output(1).name == "colname")
+      assert(legacyInstance.output(0).exprId != attr1.exprId)
+      assert(legacyInstance.output(0).exprId == 
legacyInstance.output(1).exprId)
+    }
+  }
+
   test("SPARK-43190: ListQuery.childOutput should be consistent with child 
output") {
     val listQuery1 = ListQuery(testRelation2.select($"a"))
     val listQuery2 = ListQuery(testRelation2.select($"b"))
diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/cte.sql.out 
b/sql/core/src/test/resources/sql-tests/analyzer-results/cte.sql.out
index 8f195185556f..b1cdec7e7359 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/cte.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/cte.sql.out
@@ -1125,6 +1125,62 @@ Project [col1#x]
                   +- Relation spark_catalog.default.t4[col1#x] parquet
 
 
+-- !query
+WITH cte AS (SELECT 1 AS id, 'test' AS COLNAME),
+     cte2 AS (SELECT id, COLNAME, colname FROM cte)
+SELECT * FROM cte2 a JOIN cte2 b ON (a.id = b.id)
+-- !query analysis
+WithCTE
+:- CTERelationDef xxxx, false
+:  +- SubqueryAlias cte
+:     +- Project [1 AS id#x, test AS COLNAME#x]
+:        +- OneRowRelation
+:- CTERelationDef xxxx, false
+:  +- SubqueryAlias cte2
+:     +- Project [id#x, COLNAME#x, colname#x]
+:        +- SubqueryAlias cte
+:           +- CTERelationRef xxxx, true, [id#x, COLNAME#x], false, false, 1
++- Project [id#x, COLNAME#x, colname#x, id#x, COLNAME#x, colname#x]
+   +- Join Inner, (id#x = id#x)
+      :- SubqueryAlias a
+      :  +- SubqueryAlias cte2
+      :     +- CTERelationRef xxxx, true, [id#x, COLNAME#x, colname#x], false, 
false, 1
+      +- SubqueryAlias b
+         +- SubqueryAlias cte2
+            +- CTERelationRef xxxx, true, [id#x, COLNAME#x, colname#x], false, 
false, 1
+
+
+-- !query
+WITH cte AS (SELECT 1 AS id, 'test' AS COLNAME),
+     cte2 AS (SELECT id, COLNAME, colname FROM cte),
+     cte3 AS (SELECT id, COLNAME, colname FROM cte2)
+SELECT * FROM cte2 a JOIN cte2 b ON (a.id = b.id)
+-- !query analysis
+WithCTE
+:- CTERelationDef xxxx, false
+:  +- SubqueryAlias cte
+:     +- Project [1 AS id#x, test AS COLNAME#x]
+:        +- OneRowRelation
+:- CTERelationDef xxxx, false
+:  +- SubqueryAlias cte2
+:     +- Project [id#x, COLNAME#x, colname#x]
+:        +- SubqueryAlias cte
+:           +- CTERelationRef xxxx, true, [id#x, COLNAME#x], false, false, 1
+:- CTERelationDef xxxx, false
+:  +- SubqueryAlias cte3
+:     +- Project [id#x, COLNAME#x, colname#x]
+:        +- SubqueryAlias cte2
+:           +- CTERelationRef xxxx, true, [id#x, COLNAME#x, colname#x], false, 
false, 1
++- Project [id#x, COLNAME#x, colname#x, id#x, COLNAME#x, colname#x]
+   +- Join Inner, (id#x = id#x)
+      :- SubqueryAlias a
+      :  +- SubqueryAlias cte2
+      :     +- CTERelationRef xxxx, true, [id#x, COLNAME#x, colname#x], false, 
false, 1
+      +- SubqueryAlias b
+         +- SubqueryAlias cte2
+            +- CTERelationRef xxxx, true, [id#x, COLNAME#x, colname#x], false, 
false, 1
+
+
 -- !query
 DROP VIEW IF EXISTS t
 -- !query analysis
diff --git a/sql/core/src/test/resources/sql-tests/inputs/cte.sql 
b/sql/core/src/test/resources/sql-tests/inputs/cte.sql
index 625933302843..2bd4a0cbb7be 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/cte.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/cte.sql
@@ -286,6 +286,17 @@ SELECT * FROM (
   WITH cte1 AS (SELECT * FROM t4) SELECT cte1.col1 FROM cte1 JOIN t4 USING 
(col1)
 );
 
+-- Column reference with different casing should preserve alias casing in 
self-join
+WITH cte AS (SELECT 1 AS id, 'test' AS COLNAME),
+     cte2 AS (SELECT id, COLNAME, colname FROM cte)
+SELECT * FROM cte2 a JOIN cte2 b ON (a.id = b.id);
+
+-- Same test with an unused CTE, used to return all columns in lower case
+WITH cte AS (SELECT 1 AS id, 'test' AS COLNAME),
+     cte2 AS (SELECT id, COLNAME, colname FROM cte),
+     cte3 AS (SELECT id, COLNAME, colname FROM cte2)
+SELECT * FROM cte2 a JOIN cte2 b ON (a.id = b.id);
+
 -- Clean up
 DROP VIEW IF EXISTS t;
 DROP VIEW IF EXISTS t2;
diff --git a/sql/core/src/test/resources/sql-tests/results/cte.sql.out 
b/sql/core/src/test/resources/sql-tests/results/cte.sql.out
index 39398b26a0b5..28832838df19 100644
--- a/sql/core/src/test/resources/sql-tests/results/cte.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/cte.sql.out
@@ -774,6 +774,27 @@ struct<col1:timestamp>
 
 
 
+-- !query
+WITH cte AS (SELECT 1 AS id, 'test' AS COLNAME),
+     cte2 AS (SELECT id, COLNAME, colname FROM cte)
+SELECT * FROM cte2 a JOIN cte2 b ON (a.id = b.id)
+-- !query schema
+struct<id:int,COLNAME:string,colname:string,id:int,COLNAME:string,colname:string>
+-- !query output
+1      test    test    1       test    test
+
+
+-- !query
+WITH cte AS (SELECT 1 AS id, 'test' AS COLNAME),
+     cte2 AS (SELECT id, COLNAME, colname FROM cte),
+     cte3 AS (SELECT id, COLNAME, colname FROM cte2)
+SELECT * FROM cte2 a JOIN cte2 b ON (a.id = b.id)
+-- !query schema
+struct<id:int,COLNAME:string,colname:string,id:int,COLNAME:string,colname:string>
+-- !query output
+1      test    test    1       test    test
+
+
 -- !query
 DROP VIEW IF EXISTS t
 -- !query schema


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) branch master updated: [SPARK-54718][SQL] Preserve attributes names during CTE newInstance()

Reply via email to