This is an automated email from the ASF dual-hosted git repository.

peter-toth pushed a commit to branch branch-3.5
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.5 by this push:
     new 38164f182706 [SPARK-56840][SQL][3.5] Avoid unresolved NullIf type lookup
38164f182706 is described below

commit 38164f182706ac8f8ef2f4a1e7f2975006664298
Author: Chao Sun <[email protected]>
AuthorDate: Mon May 18 13:46:03 2026 +0200

    [SPARK-56840][SQL][3.5] Avoid unresolved NullIf type lookup
    
    ### Why are the changes needed?
    
    `NULLIF` builds its replacement expression before analysis has resolved all child expressions.
    For nested field references, the existing implementation can read the left operand's data type
    too early while constructing the null branch, which can fail analysis even though the SQL shape
    is valid.
    
    SPARK-56840 tracks this analyzer failure.
    
    ### What changes were proposed in this PR?
    
    - Build the `NULLIF` null branch with a lazy typed-null placeholder so construction does not eagerly
      read the unresolved left operand type, while `NullIf.replacement.dataType` remains valid once the
      operand type is available.
    - Make that placeholder `RuntimeReplaceable`, so `ReplaceExpressions` restores an ordinary typed
      `Literal(null, ...)` before later optimizer rules run and existing null-literal simplifications
      continue to apply.
    - Add focused regressions for:
      - nested struct-field `nullif(c.provider, lower(...))` analysis in both
        `ALWAYS_INLINE_COMMON_EXPR` modes;
      - `NullIf` replacement type reporting before type coercion;
      - optimizer replacement back to a normal null literal;
      - explain output avoiding exposure of the internal helper name.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes. Valid `NULLIF` expressions over unresolved nested field references that could fail during
    analysis now resolve and execute successfully.
    
    ### How was this patch tested?
    
    - `build/sbt 'catalyst/testOnly org.apache.spark.sql.catalyst.expressions.NullExpressionsSuite -- -z "NullIf replacement preserves its data type before type coercion"'`
    - `build/sbt 'catalyst/testOnly org.apache.spark.sql.catalyst.optimizer.OptimizerSuite -- -z "NullIf typed null branch is replaced with a null literal"'`
    - `build/sbt 'sql/testOnly org.apache.spark.sql.DataFrameFunctionsSuite -- -z "nullif function"'`
    - `build/sbt 'sql/testOnly org.apache.spark.sql.ExplainSuite -- -z "explain for these functions; use range to avoid constant folding"'`
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    Generated-by: Codex (GPT-5.5)
    
    Closes #55926 from sunchao/dev/chao/codex/spark-56840-branch-3.5.
    
    Authored-by: Chao Sun <[email protected]>
    Signed-off-by: Peter Toth <[email protected]>
---
 .../sql/catalyst/expressions/nullExpressions.scala   | 17 ++++++++++++++++-
 .../catalyst/expressions/NullExpressionsSuite.scala  | 19 ++++++++++++++++++-
 .../sql/catalyst/optimizer/OptimizerSuite.scala      | 20 +++++++++++++++++++-
 .../apache/spark/sql/DataFrameFunctionsSuite.scala   |  7 +++++++
 .../scala/org/apache/spark/sql/ExplainSuite.scala    |  1 +
 5 files changed, 61 insertions(+), 3 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala
index 948cb6fbedd3..edf8ee00e708 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala
@@ -140,6 +140,21 @@ case class Coalesce(children: Seq[Expression])
     copy(children = newChildren)
 }
 
+private case class TypedNullLiteral(child: Expression)
+    extends UnaryExpression with RuntimeReplaceable {
+  override def nullable: Boolean = true
+
+  override def dataType: DataType = child.dataType
+
+  override def toString: String = "null"
+
+  override def sql: String = "NULL"
+
+  override lazy val replacement: Expression = Literal.create(null, 
child.dataType)
+
+  override protected def withNewChildInternal(newChild: Expression): 
TypedNullLiteral =
+    copy(child = newChild)
+}
 
 @ExpressionDescription(
   usage = "_FUNC_(expr1, expr2) - Returns null if `expr1` equals to `expr2`, 
or `expr1` otherwise.",
@@ -154,7 +169,7 @@ case class NullIf(left: Expression, right: Expression, 
replacement: Expression)
   extends RuntimeReplaceable with InheritAnalysisRules {
 
   def this(left: Expression, right: Expression) = {
-    this(left, right, If(EqualTo(left, right), Literal.create(null, 
left.dataType), left))
+    this(left, right, If(EqualTo(left, right), TypedNullLiteral(left), left))
   }
 
   override def parameters: Seq[Expression] = Seq(left, right)
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala
index da8e11c0433e..f97af0835f3f 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala
@@ -20,7 +20,8 @@ package org.apache.spark.sql.catalyst.expressions
 import java.sql.Timestamp
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer
+import org.apache.spark.sql.catalyst.FunctionIdentifier
+import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry, 
SimpleAnalyzer, UnresolvedAttribute}
 import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext
 import org.apache.spark.sql.catalyst.expressions.objects.AssertNotNull
 import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, Project}
@@ -140,6 +141,22 @@ class NullExpressionsSuite extends SparkFunSuite with 
ExpressionEvalHelper {
     assert(analyze(new Nvl(floatLit, doubleLit)).dataType == DoubleType)
   }
 
+  test("NullIf replacement preserves its data type before type coercion") {
+    val nullIf = new NullIf(Literal(1), Literal(1))
+    assert(nullIf.dataType == IntegerType)
+    assert(nullIf.replacement.dataType == IntegerType)
+  }
+
+  test("NullIf accepts unresolved nested fields during function construction") 
{
+    val nullIf = FunctionRegistry.builtin.lookupFunction(
+      FunctionIdentifier("nullif"),
+      Seq(
+        UnresolvedAttribute(Seq("c", "provider")),
+        Lower(Literal("ERROR_MULTIPLE_PROVIDERS"))))
+
+    assert(nullIf.isInstanceOf[NullIf])
+  }
+
   test("AtLeastNNonNulls") {
     val mix = Seq(Literal("x"),
       Literal.create(null, StringType),
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerSuite.scala
index 6b63f860b7da..fb9a0f6f6e6c 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerSuite.scala
@@ -19,11 +19,12 @@ package org.apache.spark.sql.catalyst.optimizer
 
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.dsl.plans._
-import org.apache.spark.sql.catalyst.expressions.{Alias, IntegerLiteral, 
Literal}
+import org.apache.spark.sql.catalyst.expressions.{Alias, IntegerLiteral, 
Literal, NullIf, RuntimeReplaceable}
 import org.apache.spark.sql.catalyst.plans.PlanTest
 import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, 
OneRowRelation, Project}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.BooleanType
 
 /**
  * A dummy optimizer rule for testing that decrements integer literals until 0.
@@ -71,4 +72,21 @@ class OptimizerSuite extends PlanTest {
         s"test, please set '${SQLConf.OPTIMIZER_MAX_ITERATIONS.key}' to a 
larger value."))
     }
   }
+  test("NullIf typed null branch is replaced with a null literal") {
+    val optimizer = new SimpleTestOptimizer() {
+      override def defaultBatches: Seq[Batch] =
+        Batch("test", fixedPoint,
+          ReplaceExpressions) :: Nil
+    }
+
+    val nullIf = new NullIf(Literal(true), Literal(true))
+    val plan = Project(Alias(nullIf, "out")() :: Nil, OneRowRelation()).analyze
+    val optimized = optimizer.execute(plan)
+
+    assert(optimized.expressions.exists(_.exists {
+      case Literal(null, BooleanType) => true
+      case _ => false
+    }))
+    
assert(optimized.expressions.forall(!_.exists(_.isInstanceOf[RuntimeReplaceable])))
+  }
 }
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 71ad4a25578e..251b5429102c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -325,6 +325,13 @@ class DataFrameFunctionsSuite extends QueryTest with 
SharedSparkSession {
 
     checkAnswer(df.selectExpr("nullif(a, a)"), Seq(Row(null)))
     checkAnswer(df.select(nullif(lit(5), lit(5))), Seq(Row(null)))
+
+    val nestedDf = Seq("error_multiple_providers", "openai")
+      .toDF("provider")
+      .select(struct(col("provider")).as("c"))
+    checkAnswer(
+      nestedDf.select(nullif(col("c.provider"), 
lower(lit("ERROR_MULTIPLE_PROVIDERS")))),
+      Seq(Row(null), Row("openai")))
   }
 
   test("nvl") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
index a206e97c3536..739557bef301 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
@@ -248,6 +248,7 @@ class ExplainSuite extends ExplainSuiteHelper with 
DisableAdaptiveExecutionSuite
     checkKeywordsExistsInExplain(df,
       "Project [id#xL AS ifnull(id, 1)#xL, if ((id#xL = 1)) null " +
         "else id#xL AS nullif(id, 1)#xL, id#xL AS nvl(id, 1)#xL, 1 AS nvl2(id, 
1, 2)#x]")
+    checkKeywordsNotExistsInExplain(df, ExtendedMode, "typednullliteral")
   }
 
   test("SPARK-26659: explain of DataWritingCommandExec should not contain 
duplicate cmd.nodeName") {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to