(spark) branch master updated: [SPARK-45509][SQL] Fix df column reference behavior for Spark Connect

wenchen Tue, 07 Nov 2023 00:36:50 -0800

This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 8c416290335 [SPARK-45509][SQL] Fix df column reference behavior for 
Spark Connect
8c416290335 is described below

commit 8c41629033508197b80f6a50c78fcf4d5f9752ff
Author: Wenchen Fan <[email protected]>
AuthorDate: Tue Nov 7 16:35:48 2023 +0800

    [SPARK-45509][SQL] Fix df column reference behavior for Spark Connect
    
    ### What changes were proposed in this pull request?
    
    This PR fixes a few problems of column resolution for Spark Connect, to 
make the behavior closer to classic Spark SQL (unfortunately we still have some 
behavior differences in corner cases).
    1. resolve df column references in both `resolveExpressionByPlanChildren` 
and `resolveExpressionByPlanOutput`. Previously it's only in 
`resolveExpressionByPlanChildren`.
    2. when the plan id has multiple matches, fail with 
`AMBIGUOUS_COLUMN_REFERENCE`
    
    ### Why are the changes needed?
    
    fix behavior differences between spark connect and classic spark sql
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, for spark connect scala client
    
    ### How was this patch tested?
    
    new tests
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    no
    
    Closes #43465 from cloud-fan/column.
    
    Authored-by: Wenchen Fan <[email protected]>
    Signed-off-by: Wenchen Fan <[email protected]>
---
 .../src/main/resources/error/error-classes.json    |  9 ++
 .../org/apache/spark/sql/ClientE2ETestSuite.scala  | 62 +++++++++++++-
 docs/sql-error-conditions.md                       |  9 ++
 python/pyspark/pandas/indexes/multi.py             |  2 +-
 python/pyspark/sql/connect/plan.py                 |  4 +-
 .../catalyst/analysis/ColumnResolutionHelper.scala | 98 +++++++++++++---------
 .../sql/catalyst/analysis/ResolveSetVariable.scala |  1 -
 7 files changed, 140 insertions(+), 45 deletions(-)

diff --git a/common/utils/src/main/resources/error/error-classes.json 
b/common/utils/src/main/resources/error/error-classes.json
index 3e0743d366a..db46ee8ca20 100644
--- a/common/utils/src/main/resources/error/error-classes.json
+++ b/common/utils/src/main/resources/error/error-classes.json
@@ -31,6 +31,15 @@
     ],
     "sqlState" : "42702"
   },
+  "AMBIGUOUS_COLUMN_REFERENCE" : {
+    "message" : [
+      "Column <name> is ambiguous. It's because you joined several DataFrame 
together, and some of these DataFrames are the same.",
+      "This column points to one of the DataFrame but Spark is unable to 
figure out which one.",
+      "Please alias the DataFrames with different names via `DataFrame.alias` 
before joining them,",
+      "and specify the column using qualified name, e.g. 
`df.alias(\"a\").join(df.alias(\"b\"), col(\"a.id\") > col(\"b.id\"))`."
+    ],
+    "sqlState" : "42702"
+  },
   "AMBIGUOUS_LATERAL_COLUMN_ALIAS" : {
     "message" : [
       "Lateral column alias <name> is ambiguous and has <n> matches."
diff --git 
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala
 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala
index d9a77f2830b..b9fa415034c 100644
--- 
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala
+++ 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala
@@ -44,7 +44,7 @@ import org.apache.spark.sql.types._
 
 class ClientE2ETestSuite extends RemoteSparkSession with SQLHelper with 
PrivateMethodTester {
 
-  test(s"throw SparkException with null filename in stack trace elements") {
+  test("throw SparkException with null filename in stack trace elements") {
     withSQLConf("spark.sql.connect.enrichError.enabled" -> "true") {
       val session = spark
       import session.implicits._
@@ -94,7 +94,7 @@ class ClientE2ETestSuite extends RemoteSparkSession with 
SQLHelper with PrivateM
     }
   }
 
-  test(s"throw SparkException with large cause exception") {
+  test("throw SparkException with large cause exception") {
     withSQLConf("spark.sql.connect.enrichError.enabled" -> "true") {
       val session = spark
       import session.implicits._
@@ -872,6 +872,64 @@ class ClientE2ETestSuite extends RemoteSparkSession with 
SQLHelper with PrivateM
     assert(joined2.schema.catalogString === "struct<id:bigint,a:double>")
   }
 
+  test("SPARK-45509: ambiguous column reference") {
+    val session = spark
+    import session.implicits._
+    val df1 = Seq(1 -> "a").toDF("i", "j")
+    val df1_filter = df1.filter(df1("i") > 0)
+    val df2 = Seq(2 -> "b").toDF("i", "y")
+
+    checkSameResult(
+      Seq(Row(1)),
+      // df1("i") is not ambiguous, and it's still valid in the filtered df.
+      df1_filter.select(df1("i")))
+
+    val e1 = intercept[AnalysisException] {
+      // df1("i") is not ambiguous, but it's not valid in the projected df.
+      df1.select((df1("i") + 1).as("plus")).select(df1("i")).collect()
+    }
+    
assert(e1.getMessage.contains("MISSING_ATTRIBUTES.RESOLVED_ATTRIBUTE_MISSING_FROM_INPUT"))
+
+    checkSameResult(
+      Seq(Row(1, "a")),
+      // All these column references are not ambiguous and are still valid 
after join.
+      df1.join(df2, df1("i") + 1 === 
df2("i")).sort(df1("i").desc).select(df1("i"), df1("j")))
+
+    val e2 = intercept[AnalysisException] {
+      // df1("i") is ambiguous as df1 appears in both join sides.
+      df1.join(df1, df1("i") === 1).collect()
+    }
+    assert(e2.getMessage.contains("AMBIGUOUS_COLUMN_REFERENCE"))
+
+    val e3 = intercept[AnalysisException] {
+      // df1("i") is ambiguous as df1 appears in both join sides.
+      df1.join(df1).select(df1("i")).collect()
+    }
+    assert(e3.getMessage.contains("AMBIGUOUS_COLUMN_REFERENCE"))
+
+    val e4 = intercept[AnalysisException] {
+      // df1("i") is ambiguous as df1 appears in both join sides (df1_filter 
contains df1).
+      df1.join(df1_filter, df1("i") === 1).collect()
+    }
+    assert(e4.getMessage.contains("AMBIGUOUS_COLUMN_REFERENCE"))
+
+    checkSameResult(
+      Seq(Row("a")),
+      // df1_filter("i") is not ambiguous as df1_filter does not exist in the 
join left side.
+      df1.join(df1_filter, df1_filter("i") === 1).select(df1_filter("j")))
+
+    val e5 = intercept[AnalysisException] {
+      // df1("i") is ambiguous as df1 appears in both sides of the first join.
+      df1.join(df1_filter, df1_filter("i") === 1).join(df2, df1("i") === 
1).collect()
+    }
+    assert(e5.getMessage.contains("AMBIGUOUS_COLUMN_REFERENCE"))
+
+    checkSameResult(
+      Seq(Row("a")),
+      // df1_filter("i") is not ambiguous as df1_filter only appears once.
+      df1.join(df1_filter).join(df2, df1_filter("i") === 
1).select(df1_filter("j")))
+  }
+
   test("broadcast join") {
     withSQLConf("spark.sql.autoBroadcastJoinThreshold" -> "-1") {
       val left = spark.range(100).select(col("id"), rand(10).as("a"))
diff --git a/docs/sql-error-conditions.md b/docs/sql-error-conditions.md
index a6f003647dd..7b0bc8ceb2b 100644
--- a/docs/sql-error-conditions.md
+++ b/docs/sql-error-conditions.md
@@ -55,6 +55,15 @@ See '`<docroot>`/sql-migration-guide.html#query-engine'.
 
 Column or field `<name>` is ambiguous and has `<n>` matches.
 
+### AMBIGUOUS_COLUMN_REFERENCE
+
+[SQLSTATE: 
42702](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)
+
+Column `<name>` is ambiguous. It's because you joined several DataFrame 
together, and some of these DataFrames are the same.
+This column points to one of the DataFrame but Spark is unable to figure out 
which one.
+Please alias the DataFrames with different names via `DataFrame.alias` before 
joining them,
+and specify the column using qualified name, e.g. 
`df.alias("a").join(df.alias("b"), col("a.id") > col("b.id"))`.
+
 ### AMBIGUOUS_LATERAL_COLUMN_ALIAS
 
 [SQLSTATE: 
42702](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)
diff --git a/python/pyspark/pandas/indexes/multi.py 
b/python/pyspark/pandas/indexes/multi.py
index 41c3b93ed51..9fbc608c12a 100644
--- a/python/pyspark/pandas/indexes/multi.py
+++ b/python/pyspark/pandas/indexes/multi.py
@@ -813,7 +813,7 @@ class MultiIndex(Index):
         sdf_symdiff = 
sdf_self.union(sdf_other).subtract(sdf_self.intersect(sdf_other))
 
         if sort:
-            sdf_symdiff = sdf_symdiff.sort(*self._internal.index_spark_columns)
+            sdf_symdiff = 
sdf_symdiff.sort(*self._internal.index_spark_column_names)
 
         internal = InternalFrame(
             spark_frame=sdf_symdiff,
diff --git a/python/pyspark/sql/connect/plan.py 
b/python/pyspark/sql/connect/plan.py
index d888422d29f..607d1429a9e 100644
--- a/python/pyspark/sql/connect/plan.py
+++ b/python/pyspark/sql/connect/plan.py
@@ -2251,7 +2251,9 @@ class CoGroupMap(LogicalPlan):
         self._input_grouping_cols = input_grouping_cols
         self._other_grouping_cols = other_grouping_cols
         self._other = cast(LogicalPlan, other)
-        self._func = function._build_common_inline_user_defined_function(*cols)
+        # The function takes entire DataFrame as inputs, no need to do
+        # column binding (no input columns).
+        self._func = function._build_common_inline_user_defined_function()
 
     def plan(self, session: "SparkConnectClient") -> proto.Relation:
         assert self._child is not None
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala
index 54a9c6ca018..edfc60fc6ea 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ColumnResolutionHelper.scala
@@ -30,10 +30,10 @@ import 
org.apache.spark.sql.catalyst.trees.CurrentOrigin.withOrigin
 import org.apache.spark.sql.catalyst.trees.TreePattern._
 import org.apache.spark.sql.catalyst.util.toPrettySQL
 import org.apache.spark.sql.connector.catalog.{CatalogManager, Identifier}
-import org.apache.spark.sql.errors.QueryCompilationErrors
+import org.apache.spark.sql.errors.{DataTypeErrorsBase, QueryCompilationErrors}
 import org.apache.spark.sql.internal.SQLConf
 
-trait ColumnResolutionHelper extends Logging {
+trait ColumnResolutionHelper extends Logging with DataTypeErrorsBase {
 
   def conf: SQLConf
 
@@ -426,7 +426,7 @@ trait ColumnResolutionHelper extends Logging {
       throws: Boolean = false,
       includeLastResort: Boolean = false): Expression = {
     resolveExpression(
-      expr,
+      tryResolveColumnByPlanId(expr, plan),
       resolveColumnByName = nameParts => {
         plan.resolve(nameParts, conf.resolver)
       },
@@ -447,21 +447,8 @@ trait ColumnResolutionHelper extends Logging {
       e: Expression,
       q: LogicalPlan,
       includeLastResort: Boolean = false): Expression = {
-    val newE = if (e.exists(_.getTagValue(LogicalPlan.PLAN_ID_TAG).nonEmpty)) {
-      // If the TreeNodeTag 'LogicalPlan.PLAN_ID_TAG' is attached, it means 
that the plan and
-      // expression are from Spark Connect, and need to be resolved in this 
way:
-      //    1, extract the attached plan id from the expression 
(UnresolvedAttribute only for now);
-      //    2, top-down traverse the query plan to find the plan node that 
matches the plan id;
-      //    3, if can not find the matching node, fail the analysis due to 
illegal references;
-      //    4, resolve the expression with the matching node, if any error 
occurs here, apply the
-      //    old code path;
-      resolveExpressionByPlanId(e, q)
-    } else {
-      e
-    }
-
     resolveExpression(
-      newE,
+      tryResolveColumnByPlanId(e, q),
       resolveColumnByName = nameParts => {
         q.resolveChildren(nameParts, conf.resolver)
       },
@@ -490,39 +477,46 @@ trait ColumnResolutionHelper extends Logging {
     }
   }
 
-  private def resolveExpressionByPlanId(
+  // If the TreeNodeTag 'LogicalPlan.PLAN_ID_TAG' is attached, it means that 
the plan and
+  // expression are from Spark Connect, and need to be resolved in this way:
+  //    1. extract the attached plan id from UnresolvedAttribute;
+  //    2. top-down traverse the query plan to find the plan node that matches 
the plan id;
+  //    3. if can not find the matching node, fail the analysis due to illegal 
references;
+  //    4. if more than one matching nodes are found, fail due to ambiguous 
column reference;
+  //    5. resolve the expression with the matching node, if any error occurs 
here, return the
+  //       original expression as it is.
+  private def tryResolveColumnByPlanId(
       e: Expression,
-      q: LogicalPlan): Expression = {
-    if (!e.exists(_.getTagValue(LogicalPlan.PLAN_ID_TAG).nonEmpty)) {
-      return e
-    }
-
-    e match {
-      case u: UnresolvedAttribute =>
-        resolveUnresolvedAttributeByPlanId(u, q).getOrElse(u)
-      case _ =>
-        e.mapChildren(c => resolveExpressionByPlanId(c, q))
-    }
+      q: LogicalPlan,
+      idToPlan: mutable.HashMap[Long, LogicalPlan] = mutable.HashMap.empty): 
Expression = e match {
+    case u: UnresolvedAttribute =>
+      resolveUnresolvedAttributeByPlanId(
+        u, q, idToPlan: mutable.HashMap[Long, LogicalPlan]
+      ).getOrElse(u)
+    case _ if e.containsPattern(UNRESOLVED_ATTRIBUTE) =>
+      e.mapChildren(c => tryResolveColumnByPlanId(c, q, idToPlan))
+    case _ => e
   }
 
   private def resolveUnresolvedAttributeByPlanId(
       u: UnresolvedAttribute,
-      q: LogicalPlan): Option[NamedExpression] = {
+      q: LogicalPlan,
+      idToPlan: mutable.HashMap[Long, LogicalPlan]): Option[NamedExpression] = 
{
     val planIdOpt = u.getTagValue(LogicalPlan.PLAN_ID_TAG)
     if (planIdOpt.isEmpty) return None
     val planId = planIdOpt.get
     logDebug(s"Extract plan_id $planId from $u")
 
-    val planOpt = 
q.find(_.getTagValue(LogicalPlan.PLAN_ID_TAG).contains(planId))
-    if (planOpt.isEmpty) {
-      // For example:
-      //  df1 = spark.createDataFrame([Row(a = 1, b = 2, c = 3)]])
-      //  df2 = spark.createDataFrame([Row(a = 1, b = 2)]])
-      //  df1.select(df2.a)   <-   illegal reference df2.a
-      throw new AnalysisException(s"When resolving $u, " +
-        s"fail to find subplan with plan_id=$planId in $q")
-    }
-    val plan = planOpt.get
+    val plan = idToPlan.getOrElseUpdate(planId, {
+      findPlanById(u, planId, q).getOrElse {
+        // For example:
+        //  df1 = spark.createDataFrame([Row(a = 1, b = 2, c = 3)]])
+        //  df2 = spark.createDataFrame([Row(a = 1, b = 2)]])
+        //  df1.select(df2.a)   <-   illegal reference df2.a
+        throw new AnalysisException(s"When resolving $u, " +
+          s"fail to find subplan with plan_id=$planId in $q")
+      }
+    })
 
     val isMetadataAccess = u.getTagValue(LogicalPlan.IS_METADATA_COL).isDefined
     try {
@@ -539,4 +533,28 @@ trait ColumnResolutionHelper extends Logging {
         None
     }
   }
+
+  private def findPlanById(
+      u: UnresolvedAttribute,
+      id: Long,
+      plan: LogicalPlan): Option[LogicalPlan] = {
+    if (plan.getTagValue(LogicalPlan.PLAN_ID_TAG).contains(id)) {
+      Some(plan)
+    } else if (plan.children.length == 1) {
+      findPlanById(u, id, plan.children.head)
+    } else if (plan.children.length > 1) {
+      val matched = plan.children.flatMap(findPlanById(u, id, _))
+      if (matched.length > 1) {
+        throw new AnalysisException(
+          errorClass = "AMBIGUOUS_COLUMN_REFERENCE",
+          messageParameters = Map("name" -> toSQLId(u.nameParts)),
+          origin = u.origin
+        )
+      } else {
+        matched.headOption
+      }
+    } else {
+      None
+    }
+  }
 }
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSetVariable.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSetVariable.scala
index ebf56ef1cc4..bd0204ba06f 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSetVariable.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/ResolveSetVariable.scala
@@ -24,7 +24,6 @@ import org.apache.spark.sql.catalyst.plans.logical.{Limit, 
LogicalPlan, SetVaria
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.catalyst.trees.TreePattern.COMMAND
 import org.apache.spark.sql.connector.catalog.CatalogManager
-import org.apache.spark.sql.errors.DataTypeErrors.toSQLId
 import 
org.apache.spark.sql.errors.QueryCompilationErrors.unresolvedVariableError
 import org.apache.spark.sql.types.IntegerType
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) branch master updated: [SPARK-45509][SQL] Fix df column reference behavior for Spark Connect

Reply via email to