This is an automated email from the ASF dual-hosted git repository.

ueshin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new b0c2ba357bf0 [SPARK-52249][PS] Enable divide-by-zero for numeric truediv with ANSI enabled
b0c2ba357bf0 is described below

commit b0c2ba357bf080dd328b95e4a6402b134a641a1a
Author: Xinrong Meng <xinr...@apache.org>
AuthorDate: Fri May 30 15:00:49 2025 -0700

[SPARK-52249][PS] Enable divide-by-zero for numeric truediv with ANSI enabled

### What changes were proposed in this pull request?

Enable divide-by-zero for numeric truediv when ANSI mode is enabled.

### Why are the changes needed?

Part of https://issues.apache.org/jira/browse/SPARK-52169.

### Does this PR introduce _any_ user-facing change?

Yes. With ANSI mode enabled, divide-by-zero for truediv now returns pandas-style results instead of raising:

```py
>>> spark.conf.get("spark.sql.ansi.enabled")
'true'
>>> pdf = pd.DataFrame({"a": [1.0, -1.0, 0.0, np.nan], "b": [0.0, 0.0, 0.0, 0.0]})
>>> psdf = ps.from_pandas(pdf)
```

FROM

```py
>>> psdf["a"] / psdf["b"]
...
pyspark.errors.exceptions.captured.ArithmeticException: [DIVIDE_BY_ZERO] Division by zero. Use `try_divide` to tolerate divisor being 0 and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error. SQLSTATE: 22012
== DataFrame ==
"__div__" was called from
<stdin>:1
```

TO

```py
>>> psdf["a"] / psdf["b"]
0    inf
1   -inf
2    NaN
3    NaN
dtype: float64
```

### How was this patch tested?

Unit tests.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #50972 from xinrong-meng/divide_0.

Authored-by: Xinrong Meng <xinr...@apache.org>
Signed-off-by: Takuya Ueshin <ues...@databricks.com>
---
 python/pyspark/pandas/data_type_ops/num_ops.py     | 45 +++++++++++++++-------
 .../pandas/tests/computation/test_binary_ops.py    |  1 -
 python/pyspark/pandas/utils.py                     |  7 ++++
 3 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/python/pyspark/pandas/data_type_ops/num_ops.py b/python/pyspark/pandas/data_type_ops/num_ops.py
index 8e8dfee9990e..34d313af8232 100644
--- a/python/pyspark/pandas/data_type_ops/num_ops.py
+++ b/python/pyspark/pandas/data_type_ops/num_ops.py
@@ -43,6 +43,7 @@ from pyspark.pandas.data_type_ops.base import (
     _is_boolean_type,
 )
 from pyspark.pandas.typedef.typehints import extension_dtypes, pandas_on_spark_type
+from pyspark.pandas.utils import is_ansi_mode_enabled
 from pyspark.sql import functions as F, Column as PySparkColumn
 from pyspark.sql.types import (
     BooleanType,
@@ -247,14 +248,23 @@ class IntegralOps(NumericOps):
         _sanitize_list_like(right)
         if not is_valid_operand_for_numeric_arithmetic(right):
             raise TypeError("True division can not be applied to given types.")
+        spark_session = left._internal.spark_frame.sparkSession
+        right = transform_boolean_operand_to_numeric(right, spark_type=left.spark.data_type)
 
         def truediv(left: PySparkColumn, right: Any) -> PySparkColumn:
-            return F.when(
-                F.lit(right != 0) | F.lit(right).isNull(),
-                left.__div__(right),
-            ).otherwise(F.lit(np.inf).__div__(left))
+            if is_ansi_mode_enabled(spark_session):
+                return F.when(
+                    F.lit(right == 0),
+                    F.when(left < 0, F.lit(float("-inf")))
+                    .when(left > 0, F.lit(float("inf")))
+                    .otherwise(F.lit(np.nan)),
+                ).otherwise(left / right)
+            else:
+                return F.when(
+                    F.lit(right != 0) | F.lit(right).isNull(),
+                    left.__div__(right),
+                ).otherwise(F.lit(np.inf).__div__(left))
 
-        right = transform_boolean_operand_to_numeric(right, spark_type=left.spark.data_type)
         return numpy_column_op(truediv)(left, right)
 
     def floordiv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
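Both truediv hunks (this one and the FractionalOps hunk that follows) guard the division behind the same CASE WHEN when ANSI mode is on. A minimal standalone sketch of that expression, assuming a local SparkSession with `spark.sql.ansi.enabled` set to `true`; the DataFrame and column names here are illustrative, not part of the patch:

```py
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.ansi.enabled", "true")

df = spark.createDataFrame(
    [(1.0, 0.0), (-1.0, 0.0), (0.0, 0.0), (1.0, 4.0)], ["a", "b"]
)

# Mirrors the new ANSI branch: CASE WHEN evaluates per row, so a / b is
# only computed where b != 0 and DIVIDE_BY_ZERO is never raised.
quotient = F.when(
    df.b == 0,
    F.when(df.a < 0, F.lit(float("-inf")))
    .when(df.a > 0, F.lit(float("inf")))
    .otherwise(F.lit(float("nan"))),
).otherwise(df.a / df.b)

# Expected rows: Infinity, -Infinity, NaN, 0.25
df.select(quotient.alias("q")).show()
```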
@@ -332,18 +342,27 @@ class FractionalOps(NumericOps):
         _sanitize_list_like(right)
         if not is_valid_operand_for_numeric_arithmetic(right):
             raise TypeError("True division can not be applied to given types.")
+        spark_session = left._internal.spark_frame.sparkSession
+        right = transform_boolean_operand_to_numeric(right, spark_type=left.spark.data_type)
 
         def truediv(left: PySparkColumn, right: Any) -> PySparkColumn:
-            return F.when(
-                F.lit(right != 0) | F.lit(right).isNull(),
-                left.__div__(right),
-            ).otherwise(
-                F.when(F.lit(left == np.inf) | F.lit(left == -np.inf), left).otherwise(
-                    F.lit(np.inf).__div__(left)
+            if is_ansi_mode_enabled(spark_session):
+                return F.when(
+                    F.lit(right == 0),
+                    F.when(left < 0, F.lit(float("-inf")))
+                    .when(left > 0, F.lit(float("inf")))
+                    .otherwise(F.lit(np.nan)),
+                ).otherwise(left / right)
+            else:
+                return F.when(
+                    F.lit(right != 0) | F.lit(right).isNull(),
+                    left.__div__(right),
+                ).otherwise(
+                    F.when(F.lit(left == np.inf) | F.lit(left == -np.inf), left).otherwise(
+                        F.lit(np.inf).__div__(left)
+                    )
                 )
-            )
 
-        right = transform_boolean_operand_to_numeric(right, spark_type=left.spark.data_type)
         return numpy_column_op(truediv)(left, right)
 
     def floordiv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
diff --git a/python/pyspark/pandas/tests/computation/test_binary_ops.py b/python/pyspark/pandas/tests/computation/test_binary_ops.py
index c2a773e1c2a8..3c9b7293d5d5 100644
--- a/python/pyspark/pandas/tests/computation/test_binary_ops.py
+++ b/python/pyspark/pandas/tests/computation/test_binary_ops.py
@@ -111,7 +111,6 @@ class FrameBinaryOpsMixin:
         psdf = ps.DataFrame({"a": ["x"], "b": ["y"]})
         self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["a"] - psdf["b"])
 
-    @unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
     def test_divide_by_zero_behavior(self):
         # float / float
         # np.float32
diff --git a/python/pyspark/pandas/utils.py b/python/pyspark/pandas/utils.py
index f4528ed63cf7..23350c06a147 100644
--- a/python/pyspark/pandas/utils.py
+++ b/python/pyspark/pandas/utils.py
@@ -1070,6 +1070,13 @@ def xor(df1: PySparkDataFrame, df2: PySparkDataFrame) -> PySparkDataFrame:
     )
 
 
+def is_ansi_mode_enabled(spark: SparkSession) -> bool:
+    return (
+        ps.get_option("compute.ansi_mode_support", spark_session=spark)
+        and spark.conf.get("spark.sql.ansi.enabled") == "true"
+    )
+
+
 def _test() -> None:
     import os
     import doctest
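Taken together: `is_ansi_mode_enabled` selects the guarded expression in `num_ops.py`, and the previously skipped divide-by-zero test now runs under ANSI mode. A doctest-style sketch of the expected behavior on the integral path, assuming a session with ANSI mode enabled (per the new branch: positive/0 gives inf, negative/0 gives -inf, and 0/0 gives NaN):

```py
>>> import pandas as pd
>>> import pyspark.pandas as ps
>>> psdf = ps.from_pandas(pd.DataFrame({"a": [1, -1, 0], "b": [0, 0, 0]}))
>>> psdf["a"] / psdf["b"]  # no ArithmeticException under ANSI anymore
0    inf
1   -inf
2    NaN
dtype: float64
```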