This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-4.1 by this push:
     new 91ed66854e07 [SPARK-55647][SQL] Fix `ConstantPropagation` incorrectly 
replacing attributes with non-binary-stable collations
91ed66854e07 is described below

commit 91ed66854e076198f22eaa2d84112cc42345becc
Author: ilicmarkodb <[email protected]>
AuthorDate: Tue Feb 24 13:24:27 2026 +0800

    [SPARK-55647][SQL] Fix `ConstantPropagation` incorrectly replacing 
attributes with non-binary-stable collations
    
    ### What changes were proposed in this pull request?
    * `ConstantPropagation` optimizer rule substitutes attributes with literals 
derived from equality
        predicates (e.g. `c = 'hello'`), then propagates them into other 
conditions in the same
        conjunction. This is unsafe for non-binary-stable collations (e.g. 
`UTF8_LCASE`) where
        equality is non-identity: `c = 'hello'` (case-insensitive) does not 
imply `c` holds exactly
        the bytes `'hello'` - it could also be `'HELLO'`, `'Hello'`, etc.
     * Substituting `c → 'hello'` in a second condition like `c = 'HELLO' 
COLLATE UNICODE` turns it
        into the constant expression `'hello' = 'HELLO' COLLATE UNICODE`, which 
is always `false`,
        producing incorrect results.
     * Fixed by guarding `safeToReplace` with `isBinaryStable(ar.dataType)` so 
propagation is skipped
        for attributes whose type is not binary-stable.
    
    ### Why are the changes needed?
    Bug fix.
    
    ### Does this PR introduce _any_ user-facing change?
    No.
    
    ### How was this patch tested?
    New unit test.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No.
    
    Closes #54435 from ilicmarkodb/fix_collation_.
    
    Authored-by: ilicmarkodb <[email protected]>
    Signed-off-by: Wenchen Fan <[email protected]>
    (cherry picked from commit ec35791b3d987fed2f7b2a872fbbbaf30ada20d5)
    Signed-off-by: Wenchen Fan <[email protected]>
---
 .../apache/spark/sql/catalyst/optimizer/expressions.scala   | 12 +++++++++++-
 .../org/apache/spark/sql/collation/CollationSuite.scala     | 13 +++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
index 71eb3e5ea2bd..c0fc8d0bae42 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/expressions.scala
@@ -32,6 +32,7 @@ import org.apache.spark.sql.catalyst.rules._
 import org.apache.spark.sql.catalyst.trees.{AlwaysProcess, TreeNodeTag}
 import org.apache.spark.sql.catalyst.trees.TreePattern._
 import 
org.apache.spark.sql.catalyst.util.CharVarcharUtils.CHAR_VARCHAR_TYPE_STRING_METADATA_KEY
+import org.apache.spark.sql.catalyst.util.UnsafeRowUtils.isBinaryStable
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
@@ -217,8 +218,17 @@ object ConstantPropagation extends Rule[LogicalPlan] {
   // substituted into `1 + 1 = 1` if 'c' isn't nullable. If 'c' is nullable 
then the enclosing
   // NOT prevents us to do the substitution as NOT flips the context 
(`nullIsFalse`) of what a
   // null result of the enclosed expression means.
+  //
+  // Also, we shouldn't replace attributes with non-binary-stable data types, 
since this can lead
+  // to incorrect results. For example:
+  // `CREATE TABLE t (c STRING COLLATE UTF8_LCASE);`
+  // `INSERT INTO t VALUES ('HELLO'), ('hello');`
+  // `SELECT * FROM t WHERE c = 'hello' AND c = 'HELLO' COLLATE UNICODE;`
+  // If we replace `c` with `'hello'`, we get `'hello' = 'HELLO' COLLATE 
UNICODE` for the right
+  // condition, which is false, while the original `c = 'HELLO' COLLATE 
UNICODE` is true for
+  // 'HELLO' and false for 'hello'.
   private def safeToReplace(ar: AttributeReference, nullIsFalse: Boolean) =
-    !ar.nullable || nullIsFalse
+    (!ar.nullable || nullIsFalse) && isBinaryStable(ar.dataType)
 
   private def replaceConstants(
       condition: Expression,
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSuite.scala
index c84647066f25..8f7a68bcbe6f 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/collation/CollationSuite.scala
@@ -2195,4 +2195,17 @@ class CollationSuite extends DatasourceV2SQLBase with 
AdaptiveSparkPlanHelper {
         Seq())
     }
   }
+
+  test("ConstantPropagation does not replace attributes with non-binary-stable 
collation") {
+    val tableName = "t1"
+    withTable(tableName) {
+      sql(s"CREATE TABLE $tableName (c STRING COLLATE UTF8_LCASE)")
+      sql(s"INSERT INTO $tableName VALUES ('hello'), ('HELLO')")
+
+      checkAnswer(
+        sql(s"SELECT * FROM $tableName WHERE c = 'hello' AND c = 'HELLO' 
COLLATE UNICODE"),
+        Row("HELLO")
+      )
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to