This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-4.1 by this push:
new 91ed66854e07 [SPARK-55647][SQL] Fix `ConstantPropagation` incorrectly
replacing attributes with non-binary-stable collations
91ed66854e07 is described below
commit 91ed66854e076198f22eaa2d84112cc42345becc
Author: ilicmarkodb <[email protected]>
AuthorDate: Tue Feb 24 13:24:27 2026 +0800
[SPARK-55647][SQL] Fix `ConstantPropagation` incorrectly replacing
attributes with non-binary-stable collations
### What changes were proposed in this pull request?
* `ConstantPropagation` optimizer rule substitutes attributes with literals
derived from equality
predicates (e.g. `c = 'hello'`), then propagates them into other
conditions in the same
conjunction. This is unsafe for non-binary-stable collations (e.g.
`UTF8_LCASE`) where
equality is non-identity: `c = 'hello'` (case-insensitive) does not
imply `c` holds exactly
the bytes `'hello'` - it could also be `'HELLO'`, `'Hello'`, etc.
* Substituting `c → 'hello'` in a second condition like `c = 'HELLO'
COLLATE UNICODE` turns it
into the constant expression `'hello' = 'HELLO' COLLATE UNICODE`, which
is always `false`,
producing incorrect results.
* Fixed by guarding `safeToReplace` with `isBinaryStable(ar.dataType)` so
propagation is skipped
for attributes whose type is not binary-stable.
### Why are the changes needed?
Bug fix.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
New unit test.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #54435 from ilicmarkodb/fix_collation_.
Authored-by: ilicmarkodb <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
(cherry picked from commit ec35791b3d987fed2f7b2a872fbbbaf30ada20d5)
Signed-off-by: Wenchen Fan <[email protected]>
---
.../apache/spark/sql/catalyst/optimizer/expressions.scala | 12 +++++++++++-
.../org/apache/spark/sql/collation/CollationSuite.scala | 13 +++++++++++++
2 files changed, 24 insertions(+), 1 deletion(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
index 71eb3e5ea2bd..c0fc8d0bae42 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
@@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.rules._
import org.apache.spark.sql.catalyst.trees.{AlwaysProcess, TreeNodeTag}
import org.apache.spark.sql.catalyst.trees.TreePattern._
import
org.apache.spark.sql.catalyst.util.CharVarcharUtils.CHAR_VARCHAR_TYPE_STRING_METADATA_KEY
+import org.apache.spark.sql.catalyst.util.UnsafeRowUtils.isBinaryStable
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String
@@ -217,8 +218,17 @@ object ConstantPropagation extends Rule[LogicalPlan] {
// substituted into `1 + 1 = 1` if 'c' isn't nullable. If 'c' is nullable
then the enclosing
// NOT prevents us to do the substitution as NOT flips the context
(`nullIsFalse`) of what a
// null result of the enclosed expression means.
+ //
+ // Also, we shouldn't replace attributes with non-binary-stable data types,
since this can lead
+ // to incorrect results. For example:
+ // `CREATE TABLE t (c STRING COLLATE UTF8_LCASE);`
+ // `INSERT INTO t VALUES ('HELLO'), ('hello');`
+ // `SELECT * FROM t WHERE c = 'hello' AND c = 'HELLO' COLLATE UNICODE;`
+ // If we replace `c` with `'hello'`, we get `'hello' = 'HELLO' COLLATE
UNICODE` for the right
+ // condition, which is false, while the original `c = 'HELLO' COLLATE
UNICODE` is true for
+ // 'HELLO' and false for 'hello'.
private def safeToReplace(ar: AttributeReference, nullIsFalse: Boolean) =
- !ar.nullable || nullIsFalse
+ (!ar.nullable || nullIsFalse) && isBinaryStable(ar.dataType)
private def replaceConstants(
condition: Expression,
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSuite.scala
index c84647066f25..8f7a68bcbe6f 100644
---
a/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSuite.scala
+++
b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSuite.scala
@@ -2195,4 +2195,17 @@ class CollationSuite extends DatasourceV2SQLBase with
AdaptiveSparkPlanHelper {
Seq())
}
}
+
+ test("ConstantPropagation does not replace attributes with non-binary-stable
collation") {
+ val tableName = "t1"
+ withTable(tableName) {
+ sql(s"CREATE TABLE $tableName (c STRING COLLATE UTF8_LCASE)")
+ sql(s"INSERT INTO $tableName VALUES ('hello'), ('HELLO')")
+
+ checkAnswer(
+ sql(s"SELECT * FROM $tableName WHERE c = 'hello' AND c = 'HELLO'
COLLATE UNICODE"),
+ Row("HELLO")
+ )
+ }
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]