This is an automated email from the ASF dual-hosted git repository.

ueshin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new b0c2ba357bf0 [SPARK-52249][PS] Enable divide-by-zero for numeric truediv with ANSI enabled
b0c2ba357bf0 is described below

commit b0c2ba357bf080dd328b95e4a6402b134a641a1a
Author: Xinrong Meng <xinr...@apache.org>
AuthorDate: Fri May 30 15:00:49 2025 -0700

[SPARK-52249][PS] Enable divide-by-zero for numeric truediv with ANSI enabled

### What changes were proposed in this pull request?

Enable divide-by-zero for numeric truediv when ANSI mode is enabled.

### Why are the changes needed?

Part of https://issues.apache.org/jira/browse/SPARK-52169.

### Does this PR introduce _any_ user-facing change?

Yes. With ANSI mode enabled, divide-by-zero for truediv now returns pandas-style results instead of raising:

```py
>>> spark.conf.get("spark.sql.ansi.enabled")
'true'
>>> pdf = pd.DataFrame({"a": [1.0, -1.0, 0.0, np.nan], "b": [0.0, 0.0, 0.0, 0.0]})
>>> psdf = ps.from_pandas(pdf)
```

FROM

```py
>>> psdf["a"] / psdf["b"]
...
pyspark.errors.exceptions.captured.ArithmeticException: [DIVIDE_BY_ZERO] Division by zero. Use `try_divide` to tolerate divisor being 0 and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error. SQLSTATE: 22012
== DataFrame ==
"__div__" was called from
<stdin>:1
```

TO

```py
>>> psdf["a"] / psdf["b"]
0    inf
1   -inf
2    NaN
3    NaN
dtype: float64
```

### How was this patch tested?

Unit tests.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #50972 from xinrong-meng/divide_0.

Authored-by: Xinrong Meng <xinr...@apache.org>
Signed-off-by: Takuya Ueshin <ues...@databricks.com>
---
 python/pyspark/pandas/data_type_ops/num_ops.py     | 45 +++++++++++++++-------
 .../pandas/tests/computation/test_binary_ops.py    |  1 -
 python/pyspark/pandas/utils.py                     |  7 ++++
 3 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/python/pyspark/pandas/data_type_ops/num_ops.py b/python/pyspark/pandas/data_type_ops/num_ops.py
index 8e8dfee9990e..34d313af8232 100644
--- a/python/pyspark/pandas/data_type_ops/num_ops.py
+++ b/python/pyspark/pandas/data_type_ops/num_ops.py
@@ -43,6 +43,7 @@ from pyspark.pandas.data_type_ops.base import (
     _is_boolean_type,
 )
 from pyspark.pandas.typedef.typehints import extension_dtypes, pandas_on_spark_type
+from pyspark.pandas.utils import is_ansi_mode_enabled
 from pyspark.sql import functions as F, Column as PySparkColumn
 from pyspark.sql.types import (
     BooleanType,
@@ -247,14 +248,23 @@ class IntegralOps(NumericOps):
         _sanitize_list_like(right)
         if not is_valid_operand_for_numeric_arithmetic(right):
             raise TypeError("True division can not be applied to given types.")
+        spark_session = left._internal.spark_frame.sparkSession
+        right = transform_boolean_operand_to_numeric(right, spark_type=left.spark.data_type)
 
         def truediv(left: PySparkColumn, right: Any) -> PySparkColumn:
-            return F.when(
-                F.lit(right != 0) | F.lit(right).isNull(),
-                left.__div__(right),
-            ).otherwise(F.lit(np.inf).__div__(left))
+            if is_ansi_mode_enabled(spark_session):
+                return F.when(
+                    F.lit(right == 0),
+                    F.when(left < 0, F.lit(float("-inf")))
+                    .when(left > 0, F.lit(float("inf")))
+                    .otherwise(F.lit(np.nan)),
+                ).otherwise(left / right)
+            else:
+                return F.when(
+                    F.lit(right != 0) | F.lit(right).isNull(),
+                    left.__div__(right),
+                ).otherwise(F.lit(np.inf).__div__(left))
 
-        right = transform_boolean_operand_to_numeric(right, spark_type=left.spark.data_type)
         return numpy_column_op(truediv)(left, right)
 
     def floordiv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
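Both truediv hunks (this one and the FractionalOps hunk that follows) guard the division behind the same CASE WHEN when ANSI mode is on. A minimal standalone sketch of that expression, assuming a local SparkSession with `spark.sql.ansi.enabled` set to `true`; the DataFrame and column names here are illustrative, not part of the patch:

```py
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.ansi.enabled", "true")

df = spark.createDataFrame(
    [(1.0, 0.0), (-1.0, 0.0), (0.0, 0.0), (1.0, 4.0)], ["a", "b"]
)

# Mirrors the new ANSI branch: CASE WHEN evaluates per row, so a / b is
# only computed where b != 0 and DIVIDE_BY_ZERO is never raised.
quotient = F.when(
    df.b == 0,
    F.when(df.a < 0, F.lit(float("-inf")))
    .when(df.a > 0, F.lit(float("inf")))
    .otherwise(F.lit(float("nan"))),
).otherwise(df.a / df.b)

# Expected rows: Infinity, -Infinity, NaN, 0.25
df.select(quotient.alias("q")).show()
```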
@@ -332,18 +342,27 @@ class FractionalOps(NumericOps):
         _sanitize_list_like(right)
         if not is_valid_operand_for_numeric_arithmetic(right):
             raise TypeError("True division can not be applied to given types.")
+        spark_session = left._internal.spark_frame.sparkSession
+        right = transform_boolean_operand_to_numeric(right, spark_type=left.spark.data_type)
 
         def truediv(left: PySparkColumn, right: Any) -> PySparkColumn:
-            return F.when(
-                F.lit(right != 0) | F.lit(right).isNull(),
-                left.__div__(right),
-            ).otherwise(
-                F.when(F.lit(left == np.inf) | F.lit(left == -np.inf), left).otherwise(
-                    F.lit(np.inf).__div__(left)
+            if is_ansi_mode_enabled(spark_session):
+                return F.when(
+                    F.lit(right == 0),
+                    F.when(left < 0, F.lit(float("-inf")))
+                    .when(left > 0, F.lit(float("inf")))
+                    .otherwise(F.lit(np.nan)),
+                ).otherwise(left / right)
+            else:
+                return F.when(
+                    F.lit(right != 0) | F.lit(right).isNull(),
+                    left.__div__(right),
+                ).otherwise(
+                    F.when(F.lit(left == np.inf) | F.lit(left == -np.inf), left).otherwise(
+                        F.lit(np.inf).__div__(left)
+                    )
                 )
-            )
 
-        right = transform_boolean_operand_to_numeric(right, spark_type=left.spark.data_type)
         return numpy_column_op(truediv)(left, right)
 
     def floordiv(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
diff --git a/python/pyspark/pandas/tests/computation/test_binary_ops.py b/python/pyspark/pandas/tests/computation/test_binary_ops.py
index c2a773e1c2a8..3c9b7293d5d5 100644
--- a/python/pyspark/pandas/tests/computation/test_binary_ops.py
+++ b/python/pyspark/pandas/tests/computation/test_binary_ops.py
@@ -111,7 +111,6 @@ class FrameBinaryOpsMixin:
         psdf = ps.DataFrame({"a": ["x"], "b": ["y"]})
         self.assertRaisesRegex(TypeError, ks_err_msg, lambda: psdf["a"] - psdf["b"])
 
-    @unittest.skipIf(is_ansi_mode_test, ansi_mode_not_supported_message)
     def test_divide_by_zero_behavior(self):
         # float / float
         # np.float32
diff --git a/python/pyspark/pandas/utils.py b/python/pyspark/pandas/utils.py
index f4528ed63cf7..23350c06a147 100644
--- a/python/pyspark/pandas/utils.py
+++ b/python/pyspark/pandas/utils.py
@@ -1070,6 +1070,13 @@ def xor(df1: PySparkDataFrame, df2: PySparkDataFrame) -> PySparkDataFrame:
     )
 
 
+def is_ansi_mode_enabled(spark: SparkSession) -> bool:
+    return (
+        ps.get_option("compute.ansi_mode_support", spark_session=spark)
+        and spark.conf.get("spark.sql.ansi.enabled") == "true"
+    )
+
+
 def _test() -> None:
     import os
     import doctest
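Taken together: `is_ansi_mode_enabled` selects the guarded expression in `num_ops.py`, and the previously skipped divide-by-zero test now runs under ANSI mode. A doctest-style sketch of the expected behavior on the integral path, assuming a session with ANSI mode enabled (per the new branch: positive/0 gives inf, negative/0 gives -inf, and 0/0 gives NaN):

```py
>>> import pandas as pd
>>> import pyspark.pandas as ps
>>> psdf = ps.from_pandas(pd.DataFrame({"a": [1, -1, 0], "b": [0, 0, 0]}))
>>> psdf["a"] / psdf["b"]  # no ArithmeticException under ANSI anymore
0    inf
1   -inf
2    NaN
dtype: float64
```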