This is an automated email from the ASF dual-hosted git repository.

ueshin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 5075ea6a85f3 [SPARK-54665][PS] Fix boolean vs string comparison to 
match pandas behavior
5075ea6a85f3 is described below

commit 5075ea6a85f3f1689766cf08a7d5b2ce500be1fb
Author: Devin Petersohn <[email protected]>
AuthorDate: Thu Mar 5 11:18:35 2026 -0800

    [SPARK-54665][PS] Fix boolean vs string comparison to match pandas behavior
    
    ### What changes were proposed in this pull request?
    Move the `_should_return_all_false` type-mismatch check outside the ANSI 
mode guard in `DataTypeOps.eq/ne` and `NumericOps.eq/ne` so it runs regardless 
of `spark.sql.ansi.enabled`.
    
    ### Why are the changes needed?
    It is a bug: the boolean vs. string comparison doesn't match pandas behavior 
when ANSI mode is off.
    
    ### Does this PR introduce _any_ user-facing change?
    Yes, this fixes the bug.
    
    ### How was this patch tested?
    CI
    
    ### Was this patch authored or co-authored using generative AI tooling?
    Co-authored-by: Claude Opus 4
    
    Closes #54456 from devin-petersohn/devin/fix-bool-string-comparison.
    
    Authored-by: Devin Petersohn <[email protected]>
    Signed-off-by: Takuya Ueshin <[email protected]>
---
 python/pyspark/pandas/data_type_ops/base.py        | 23 +++++++++++++++------
 python/pyspark/pandas/data_type_ops/num_ops.py     | 24 ++++++++++++++--------
 .../pandas/tests/data_type_ops/test_boolean_ops.py |  6 ++++++
 3 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/python/pyspark/pandas/data_type_ops/base.py 
b/python/pyspark/pandas/data_type_ops/base.py
index fbd6b8a30a1e..21177847a312 100644
--- a/python/pyspark/pandas/data_type_ops/base.py
+++ b/python/pyspark/pandas/data_type_ops/base.py
@@ -17,7 +17,7 @@
 
 import numbers
 from abc import ABCMeta
-from typing import Any, Optional, Union
+from typing import Any, Optional, Union, cast
 from itertools import chain
 
 import numpy as np
@@ -53,7 +53,6 @@ from pyspark.pandas.typedef.typehints import (
     handle_dtype_as_extension_dtype,
     spark_type_to_pandas_dtype,
 )
-from pyspark.pandas.utils import is_ansi_mode_enabled
 
 if extension_dtypes_available:
     from pandas import Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype
@@ -424,9 +423,14 @@ class DataTypeOps(object, metaclass=ABCMeta):
         raise TypeError(">= can not be applied to %s." % self.pretty_name)
 
     def eq(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
-        if is_ansi_mode_enabled(left._internal.spark_frame.sparkSession):
-            if _should_return_all_false(left, right):
-                return left._with_new_scol(F.lit(False)).rename(None)  # type: 
ignore[attr-defined]
+        from pyspark.pandas.base import IndexOpsMixin
+
+        if _should_return_all_false(left, right):
+            left_scol = left._with_new_scol(F.lit(False))
+            if isinstance(right, IndexOpsMixin):
+                return left_scol.rename(None)  # type: ignore[attr-defined]
+            else:
+                return cast(SeriesOrIndex, left_scol)
 
         if isinstance(right, (list, tuple)):
             from pyspark.pandas.series import first_series, scol_for
@@ -521,10 +525,17 @@ class DataTypeOps(object, metaclass=ABCMeta):
             return column_op(PySparkColumn.__eq__)(left, right)
 
     def ne(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
-        from pyspark.pandas.base import column_op
+        from pyspark.pandas.base import column_op, IndexOpsMixin
 
         _sanitize_list_like(right)
 
+        if _should_return_all_false(left, right):
+            left_scol = left._with_new_scol(F.lit(True))
+            if isinstance(right, IndexOpsMixin):
+                return left_scol.rename(None)  # type: ignore[attr-defined]
+            else:
+                return cast(SeriesOrIndex, left_scol)
+
         return column_op(PySparkColumn.__ne__)(left, right)
 
     def invert(self, operand: IndexOpsLike) -> IndexOpsLike:
diff --git a/python/pyspark/pandas/data_type_ops/num_ops.py 
b/python/pyspark/pandas/data_type_ops/num_ops.py
index 17f17cd76d8a..8c3b9ab66bc3 100644
--- a/python/pyspark/pandas/data_type_ops/num_ops.py
+++ b/python/pyspark/pandas/data_type_ops/num_ops.py
@@ -276,16 +276,16 @@ class NumericOps(DataTypeOps):
         if not isinstance(right, IndexOpsMixin) and is_list_like(right):
             return super().eq(left, right)
         else:
+            if _should_return_all_false(left, right):
+                left_scol = left._with_new_scol(F.lit(False))
+                if isinstance(right, IndexOpsMixin):
+                    # When comparing with another Series/Index, drop the name
+                    # to align with pandas behavior
+                    return left_scol.rename(None)  # type: ignore[attr-defined]
+                else:
+                    # When comparing with scalar-like, keep the name of left 
operand
+                    return cast(SeriesOrIndex, left_scol)
             if is_ansi_mode_enabled(left._internal.spark_frame.sparkSession):
-                if _should_return_all_false(left, right):
-                    left_scol = left._with_new_scol(F.lit(False))
-                    if isinstance(right, IndexOpsMixin):
-                        # When comparing with another Series/Index, drop the 
name
-                        # to align with pandas behavior
-                        return left_scol.rename(None)  # type: 
ignore[attr-defined]
-                    else:
-                        # When comparing with scalar-like, keep the name of 
left operand
-                        return cast(SeriesOrIndex, left_scol)
                 if _is_boolean_type(right):  # numeric vs. bool
                     right = transform_boolean_operand_to_numeric(
                         right, spark_type=left.spark.data_type
@@ -294,6 +294,12 @@ class NumericOps(DataTypeOps):
 
     def ne(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
         _sanitize_list_like(right)
+        if _should_return_all_false(left, right):
+            left_scol = left._with_new_scol(F.lit(True))
+            if isinstance(right, IndexOpsMixin):
+                return left_scol.rename(None)  # type: ignore[attr-defined]
+            else:
+                return cast(SeriesOrIndex, left_scol)
         return pyspark_column_op("__ne__", left, right, fillna=True)
 
     def lt(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py 
b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
index f4b069426caa..9911b5dc4976 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
@@ -364,6 +364,9 @@ class BooleanOpsTestsMixin:
         psser, other_psser = psdf["this"], psdf["that"]
         self.assert_eq(pser == other_pser, psser == other_psser)
         self.assert_eq(pser == pser, psser == psser)
+        # SPARK-54665: boolean vs string comparison should match pandas 
behavior
+        self.assert_eq(pser == "True", psser == "True")
+        self.assert_eq(pser == "False", psser == "False")
 
     def test_ne(self):
         pdf, psdf = self.bool_pdf, self.bool_psdf
@@ -371,6 +374,9 @@ class BooleanOpsTestsMixin:
         psser, other_psser = psdf["this"], psdf["that"]
         self.assert_eq(pser != other_pser, psser != other_psser)
         self.assert_eq(pser != pser, psser != psser)
+        # SPARK-54665: boolean vs string comparison should match pandas 
behavior
+        self.assert_eq(pser != "True", psser != "True")
+        self.assert_eq(pser != "False", psser != "False")
 
     def test_lt(self):
         pdf, psdf = self.bool_pdf, self.bool_psdf


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to