This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 46ac78ea367c [SPARK-53734][SQL] Prefer table column over LCA when 
resolving array index
46ac78ea367c is described below

commit 46ac78ea367cfa9a7acc04482770aaca33f5a575
Author: Mihailo Timotic <[email protected]>
AuthorDate: Tue Sep 30 13:23:45 2025 +0800

    [SPARK-53734][SQL] Prefer table column over LCA when resolving array index
    
    ### What changes were proposed in this pull request?
    Prefer table column over LCA when resolving array index.
    
    ### Why are the changes needed?
    For a query like:
    
    ```
    SELECT 1 AS col1, col2[col1] FROM VALUES(0, ARRAY(1,2));
    ```
    
    the output should be (1,1), but the current Spark implementation outputs (1,2). 
This is because `[col1]` is resolved as an LCA (lateral column alias) instead of 
being resolved to a column from the underlying relation: we never actually resolve 
the `field` of `UnresolvedExtractValue` in `innerResolve`, so the resolution of 
`field` falls through to the next item in the precedence chain, which is the LCA.
    
    ### Does this PR introduce _any_ user-facing change?
    Yes, users now see the correct result for the impacted query shape.
    
    ### How was this patch tested?
    Added test case for the impacted query.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No
    
    Closes #52472 from 
mihailotim-db/mihailo-timotic_data/array_index_lca_correctness.
    
    Authored-by: Mihailo Timotic <[email protected]>
    Signed-off-by: Wenchen Fan <[email protected]>
---
 .../spark/sql/catalyst/analysis/ColumnResolutionHelper.scala  | 11 ++++++++---
 .../main/scala/org/apache/spark/sql/internal/SQLConf.scala    | 10 ++++++++++
 .../src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala   | 11 +++++++++++
 3 files changed, 29 insertions(+), 3 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala
index 3224ccafafec..0502f7f67078 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala
@@ -167,12 +167,17 @@ trait ColumnResolutionHelper extends Logging with 
DataTypeErrorsBase {
           }
         }
 
-        case u @ UnresolvedExtractValue(child, fieldName) =>
+        case u @ UnresolvedExtractValue(child, field) =>
           val newChild = innerResolve(child, isTopLevel = false)
+          val resolvedField = if 
(conf.getConf(SQLConf.PREFER_COLUMN_OVER_LCA_IN_ARRAY_INDEX)) {
+            innerResolve(field, isTopLevel = false)
+          } else {
+            field
+          }
           if (newChild.resolved) {
-            ExtractValue(newChild, fieldName, resolver)
+            ExtractValue(child = newChild, extraction = resolvedField, 
resolver = resolver)
           } else {
-            u.copy(child = newChild)
+            u.copy(child = newChild, extraction = resolvedField)
           }
 
         case _ => e.mapChildren(innerResolve(_, isTopLevel = false))
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 477d09d29a05..eea92dffb048 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -241,6 +241,16 @@ object SQLConf {
     }
   }
 
+  val PREFER_COLUMN_OVER_LCA_IN_ARRAY_INDEX =
+    buildConf("spark.sql.analyzer.preferColumnOverLcaInArrayIndex")
+    .internal()
+    .doc(
+      "When true, prefer the column from the underlying relation over the 
lateral column alias " +
+      "reference with the same name (see SPARK-53734)."
+    )
+    .booleanConf
+    .createWithDefault(true)
+
   val DONT_DEDUPLICATE_EXPRESSION_IF_EXPR_ID_IN_OUTPUT =
     buildConf("spark.sql.analyzer.dontDeduplicateExpressionIfExprIdInOutput")
     .internal()
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
index 89a6a12a7e4e..90375d0e0873 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
@@ -5079,6 +5079,17 @@ class SQLQuerySuite extends QueryTest with 
SharedSparkSession with AdaptiveSpark
 
     checkAnswer(df, Row(1))
   }
+
+  test("SPARK-53734: Prefer table column over LCA when resolving array index") 
{
+    val query = "SELECT 1 AS col1, col2[col1] FROM VALUES(0, ARRAY(1, 2));"
+    withSQLConf(SQLConf.PREFER_COLUMN_OVER_LCA_IN_ARRAY_INDEX.key -> "true") {
+      checkAnswer(sql(query), Row(1, 1))
+    }
+
+    withSQLConf(SQLConf.PREFER_COLUMN_OVER_LCA_IN_ARRAY_INDEX.key -> "false") {
+      checkAnswer(sql(query), Row(1, 2))
+    }
+  }
 }
 
 case class Foo(bar: Option[String])


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to