This is an automated email from the ASF dual-hosted git repository.
peter-toth pushed a commit to branch branch-3.5
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.5 by this push:
new 38164f182706 [SPARK-56840][SQL][3.5] Avoid unresolved NullIf type
lookup
38164f182706 is described below
commit 38164f182706ac8f8ef2f4a1e7f2975006664298
Author: Chao Sun <[email protected]>
AuthorDate: Mon May 18 13:46:03 2026 +0200
[SPARK-56840][SQL][3.5] Avoid unresolved NullIf type lookup
### Why are the changes needed?
`NULLIF` builds its replacement expression before analysis has resolved all
child expressions. For nested field references, the existing implementation
can read the left operand's data type too early while constructing the null
branch, which can fail analysis even though the SQL itself is valid.
SPARK-56840 tracks this analyzer failure.
### What changes were proposed in this PR?
- Build the `NULLIF` null branch with a lazy typed-null placeholder so that
  construction does not eagerly read the unresolved left operand type, while
  `NullIf.replacement.dataType` remains valid once the operand type is
  available.
- Make that placeholder `RuntimeReplaceable`, so that `ReplaceExpressions`
  restores an ordinary typed `Literal(null, ...)` before later optimizer rules
  run and existing null-literal simplifications continue to apply.
- Add focused regression tests for:
  - nested struct-field `nullif(c.provider, lower(...))` analysis in both
    `ALWAYS_INLINE_COMMON_EXPR` modes;
  - `NullIf` replacement type reporting before type coercion;
  - optimizer replacement back to a normal null literal;
  - explain output avoiding exposure of the internal helper name.
### Does this PR introduce _any_ user-facing change?
Yes. Valid `NULLIF` expressions over unresolved nested field references that
could fail during analysis now resolve and execute successfully.
### How was this patch tested?
- `build/sbt 'catalyst/testOnly org.apache.spark.sql.catalyst.expressions.NullExpressionsSuite -- -z "NullIf replacement preserves its data type before type coercion"'`
- `build/sbt 'catalyst/testOnly org.apache.spark.sql.catalyst.optimizer.OptimizerSuite -- -z "NullIf typed null branch is replaced with a null literal"'`
- `build/sbt 'sql/testOnly org.apache.spark.sql.DataFrameFunctionsSuite -- -z "nullif function"'`
- `build/sbt 'sql/testOnly org.apache.spark.sql.ExplainSuite -- -z "explain for these functions; use range to avoid constant folding"'`
### Was this patch authored or co-authored using generative AI tooling?
Generated-by: Codex (GPT-5.5)
Closes #55926 from sunchao/dev/chao/codex/spark-56840-branch-3.5.
Authored-by: Chao Sun <[email protected]>
Signed-off-by: Peter Toth <[email protected]>
---
.../sql/catalyst/expressions/nullExpressions.scala | 17 ++++++++++++++++-
.../catalyst/expressions/NullExpressionsSuite.scala | 19 ++++++++++++++++++-
.../sql/catalyst/optimizer/OptimizerSuite.scala | 20 +++++++++++++++++++-
.../apache/spark/sql/DataFrameFunctionsSuite.scala | 7 +++++++
.../scala/org/apache/spark/sql/ExplainSuite.scala | 1 +
5 files changed, 61 insertions(+), 3 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala
index 948cb6fbedd3..edf8ee00e708 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/nullExpressions.scala
@@ -140,6 +140,21 @@ case class Coalesce(children: Seq[Expression])
copy(children = newChildren)
}
+private case class TypedNullLiteral(child: Expression)
+ extends UnaryExpression with RuntimeReplaceable {
+ override def nullable: Boolean = true
+
+ override def dataType: DataType = child.dataType
+
+ override def toString: String = "null"
+
+ override def sql: String = "NULL"
+
+ override lazy val replacement: Expression = Literal.create(null,
child.dataType)
+
+ override protected def withNewChildInternal(newChild: Expression):
TypedNullLiteral =
+ copy(child = newChild)
+}
@ExpressionDescription(
usage = "_FUNC_(expr1, expr2) - Returns null if `expr1` equals to `expr2`,
or `expr1` otherwise.",
@@ -154,7 +169,7 @@ case class NullIf(left: Expression, right: Expression,
replacement: Expression)
extends RuntimeReplaceable with InheritAnalysisRules {
def this(left: Expression, right: Expression) = {
- this(left, right, If(EqualTo(left, right), Literal.create(null,
left.dataType), left))
+ this(left, right, If(EqualTo(left, right), TypedNullLiteral(left), left))
}
override def parameters: Seq[Expression] = Seq(left, right)
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala
index da8e11c0433e..f97af0835f3f 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/NullExpressionsSuite.scala
@@ -20,7 +20,8 @@ package org.apache.spark.sql.catalyst.expressions
import java.sql.Timestamp
import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.catalyst.analysis.SimpleAnalyzer
+import org.apache.spark.sql.catalyst.FunctionIdentifier
+import org.apache.spark.sql.catalyst.analysis.{FunctionRegistry,
SimpleAnalyzer, UnresolvedAttribute}
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext
import org.apache.spark.sql.catalyst.expressions.objects.AssertNotNull
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, Project}
@@ -140,6 +141,22 @@ class NullExpressionsSuite extends SparkFunSuite with
ExpressionEvalHelper {
assert(analyze(new Nvl(floatLit, doubleLit)).dataType == DoubleType)
}
+ test("NullIf replacement preserves its data type before type coercion") {
+ val nullIf = new NullIf(Literal(1), Literal(1))
+ assert(nullIf.dataType == IntegerType)
+ assert(nullIf.replacement.dataType == IntegerType)
+ }
+
+ test("NullIf accepts unresolved nested fields during function construction")
{
+ val nullIf = FunctionRegistry.builtin.lookupFunction(
+ FunctionIdentifier("nullif"),
+ Seq(
+ UnresolvedAttribute(Seq("c", "provider")),
+ Lower(Literal("ERROR_MULTIPLE_PROVIDERS"))))
+
+ assert(nullIf.isInstanceOf[NullIf])
+ }
+
test("AtLeastNNonNulls") {
val mix = Seq(Literal("x"),
Literal.create(null, StringType),
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerSuite.scala
index 6b63f860b7da..fb9a0f6f6e6c 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/OptimizerSuite.scala
@@ -19,11 +19,12 @@ package org.apache.spark.sql.catalyst.optimizer
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.dsl.plans._
-import org.apache.spark.sql.catalyst.expressions.{Alias, IntegerLiteral,
Literal}
+import org.apache.spark.sql.catalyst.expressions.{Alias, IntegerLiteral,
Literal, NullIf, RuntimeReplaceable}
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan,
OneRowRelation, Project}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.BooleanType
/**
* A dummy optimizer rule for testing that decrements integer literals until 0.
@@ -71,4 +72,21 @@ class OptimizerSuite extends PlanTest {
s"test, please set '${SQLConf.OPTIMIZER_MAX_ITERATIONS.key}' to a
larger value."))
}
}
+ test("NullIf typed null branch is replaced with a null literal") {
+ val optimizer = new SimpleTestOptimizer() {
+ override def defaultBatches: Seq[Batch] =
+ Batch("test", fixedPoint,
+ ReplaceExpressions) :: Nil
+ }
+
+ val nullIf = new NullIf(Literal(true), Literal(true))
+ val plan = Project(Alias(nullIf, "out")() :: Nil, OneRowRelation()).analyze
+ val optimized = optimizer.execute(plan)
+
+ assert(optimized.expressions.exists(_.exists {
+ case Literal(null, BooleanType) => true
+ case _ => false
+ }))
+
assert(optimized.expressions.forall(!_.exists(_.isInstanceOf[RuntimeReplaceable])))
+ }
}
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
index 71ad4a25578e..251b5429102c 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameFunctionsSuite.scala
@@ -325,6 +325,13 @@ class DataFrameFunctionsSuite extends QueryTest with
SharedSparkSession {
checkAnswer(df.selectExpr("nullif(a, a)"), Seq(Row(null)))
checkAnswer(df.select(nullif(lit(5), lit(5))), Seq(Row(null)))
+
+ val nestedDf = Seq("error_multiple_providers", "openai")
+ .toDF("provider")
+ .select(struct(col("provider")).as("c"))
+ checkAnswer(
+ nestedDf.select(nullif(col("c.provider"),
lower(lit("ERROR_MULTIPLE_PROVIDERS")))),
+ Seq(Row(null), Row("openai")))
}
test("nvl") {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
index a206e97c3536..739557bef301 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ExplainSuite.scala
@@ -248,6 +248,7 @@ class ExplainSuite extends ExplainSuiteHelper with
DisableAdaptiveExecutionSuite
checkKeywordsExistsInExplain(df,
"Project [id#xL AS ifnull(id, 1)#xL, if ((id#xL = 1)) null " +
"else id#xL AS nullif(id, 1)#xL, id#xL AS nvl(id, 1)#xL, 1 AS nvl2(id,
1, 2)#x]")
+ checkKeywordsNotExistsInExplain(df, ExtendedMode, "typednullliteral")
}
test("SPARK-26659: explain of DataWritingCommandExec should not contain
duplicate cmd.nodeName") {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]