This is an automated email from the ASF dual-hosted git repository.
xinrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 7f2289909383 [SPARK-52570][PS] Enable divide-by-zero for numeric rmod with ANSI enabled
7f2289909383 is described below
commit 7f228990938319eb83753489367435e754c142ea
Author: Xinrong Meng <[email protected]>
AuthorDate: Fri Jun 27 11:59:53 2025 -0700
[SPARK-52570][PS] Enable divide-by-zero for numeric rmod with ANSI enabled
### What changes were proposed in this pull request?
Enable divide-by-zero for numeric rmod with ANSI enabled
### Why are the changes needed?
Part of https://issues.apache.org/jira/browse/SPARK-52169.
### Does this PR introduce _any_ user-facing change?
Yes.
```py
>>> ps.set_option("compute.fail_on_ansi_mode", False)
>>> ps.set_option("compute.ansi_mode_support", True)
>>> pdf = pd.DataFrame({"a": [0], "b": [False]})
>>> pdf.dtypes
a int64
b bool
dtype: object
>>> psdf = ps.from_pandas(pdf)
>>> 1 % psdf["a"]
0 NaN
Name: a, dtype: float64
>>> 1 % psdf["b"]
0 NaN
Name: b, dtype: float64
```
### How was this patch tested?
Unit tests.
```
(dev3.11) spark (bool_mod_new) % SPARK_ANSI_SQL_MODE=true ./python/run-tests --python-executables=python3.11 --testnames "pyspark.pandas.tests.data_type_ops.test_boolean_ops
...
Tests passed in 4 seconds
```
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #51275 from xinrong-meng/bool_mod_new.
Authored-by: Xinrong Meng <[email protected]>
Signed-off-by: Xinrong Meng <[email protected]>
---
python/pyspark/pandas/data_type_ops/boolean_ops.py | 28 ++++------------------
python/pyspark/pandas/data_type_ops/num_ops.py | 17 +++++++++----
.../pandas/tests/computation/test_binary_ops.py | 3 ++-
.../pandas/tests/data_type_ops/test_boolean_ops.py | 13 ++++------
4 files changed, 24 insertions(+), 37 deletions(-)
diff --git a/python/pyspark/pandas/data_type_ops/boolean_ops.py b/python/pyspark/pandas/data_type_ops/boolean_ops.py
index 765ec6a94634..c91dcc913080 100644
--- a/python/pyspark/pandas/data_type_ops/boolean_ops.py
+++ b/python/pyspark/pandas/data_type_ops/boolean_ops.py
@@ -21,7 +21,7 @@ from typing import Any, Union
import pandas as pd
from pandas.api.types import CategoricalDtype
-from pyspark.pandas.base import column_op, IndexOpsMixin, numpy_column_op
+from pyspark.pandas.base import column_op, IndexOpsMixin
from pyspark.pandas._typing import Dtype, IndexOpsLike, SeriesOrIndex
from pyspark.pandas.data_type_ops.base import (
DataTypeOps,
@@ -35,7 +35,6 @@ from pyspark.pandas.data_type_ops.base import (
_is_boolean_type,
)
from pyspark.pandas.typedef.typehints import as_spark_type, extension_dtypes, pandas_on_spark_type
-from pyspark.pandas.utils import is_ansi_mode_enabled
from pyspark.sql import functions as F, Column as PySparkColumn
from pyspark.sql.types import BooleanType, StringType
from pyspark.errors import PySparkValueError
@@ -137,21 +136,13 @@ class BooleanOps(DataTypeOps):
raise TypeError(
"Modulo can not be applied to %s and the given type." % self.pretty_name
)
- spark_session = left._internal.spark_frame.sparkSession
-
- def safe_mod(left_col: PySparkColumn, right_val: Any) -> PySparkColumn:
- if is_ansi_mode_enabled(spark_session):
- return F.when(F.lit(right_val == 0), F.lit(None)).otherwise(left_col % right_val)
- else:
- return left_col % right_val
-
if isinstance(right, numbers.Number):
left = transform_boolean_operand_to_numeric(left, spark_type=as_spark_type(type(right)))
- return numpy_column_op(safe_mod)(left, right)
+ return left % right
else:
assert isinstance(right, IndexOpsMixin)
left = transform_boolean_operand_to_numeric(left, spark_type=right.spark.data_type)
- return numpy_column_op(safe_mod)(left, right)
+ return left % right
def pow(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
_sanitize_list_like(right)
@@ -235,18 +226,7 @@ class BooleanOps(DataTypeOps):
_sanitize_list_like(right)
if isinstance(right, numbers.Number) and not isinstance(right, bool):
left = transform_boolean_operand_to_numeric(left, spark_type=as_spark_type(type(right)))
- spark_session = left._internal.spark_frame.sparkSession
-
- if is_ansi_mode_enabled(spark_session):
-
- def safe_rmod(left_col: PySparkColumn, right_val: Any) -> PySparkColumn:
- return F.when(left_col != 0, F.pmod(F.lit(right_val), left_col)).otherwise(
- F.lit(None)
- )
-
- return numpy_column_op(safe_rmod)(left, right)
- else:
- return right % left
+ return right % left
else:
raise TypeError(
"Modulo can not be applied to %s and the given type." % self.pretty_name
diff --git a/python/pyspark/pandas/data_type_ops/num_ops.py b/python/pyspark/pandas/data_type_ops/num_ops.py
index 508b1f4984ba..00d8cfc83286 100644
--- a/python/pyspark/pandas/data_type_ops/num_ops.py
+++ b/python/pyspark/pandas/data_type_ops/num_ops.py
@@ -160,12 +160,21 @@ class NumericOps(DataTypeOps):
_sanitize_list_like(right)
if not isinstance(right, numbers.Number):
raise TypeError("Modulo can not be applied to given types.")
-
- def rmod(left: PySparkColumn, right: Any) -> PySparkColumn:
- return ((right % left) + left) % left
+ spark_session = left._internal.spark_frame.sparkSession
right = transform_boolean_operand_to_numeric(right)
- return column_op(rmod)(left, right)
+
+ def safe_rmod(left_col: PySparkColumn, right_val: Any) -> PySparkColumn:
+ if is_ansi_mode_enabled(spark_session):
+ # Java-style modulo -> Python-style modulo
+ result = F.when(
+ left_col != 0, ((F.lit(right_val) % left_col) + left_col) % left_col
+ ).otherwise(F.lit(None))
+ return result
+ else:
+ return ((right % left) + left) % left
+
+ return column_op(safe_rmod)(left, right)
def neg(self, operand: IndexOpsLike) -> IndexOpsLike:
return operand._with_new_scol(-operand.spark.column,
field=operand._internal.data_fields[0])
diff --git a/python/pyspark/pandas/tests/computation/test_binary_ops.py b/python/pyspark/pandas/tests/computation/test_binary_ops.py
index bdcad33b4c51..1861e345b2b1 100644
--- a/python/pyspark/pandas/tests/computation/test_binary_ops.py
+++ b/python/pyspark/pandas/tests/computation/test_binary_ops.py
@@ -225,11 +225,12 @@ class FrameBinaryOpsMixin:
def test_binary_operator_mod(self):
# Positive
- pdf = pd.DataFrame({"a": [3], "b": [2]})
+ pdf = pd.DataFrame({"a": [3], "b": [2], "c": [0]})
psdf = ps.from_pandas(pdf)
self.assert_eq(psdf["a"] % psdf["b"], pdf["a"] % pdf["b"])
self.assert_eq(psdf["a"] % 0, pdf["a"] % 0)
+ self.assert_eq(1 % psdf["c"], 1 % pdf["c"])
# Negative
psdf = ps.DataFrame({"a": ["x"], "b": [1]})
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
index 8a2652f82e05..c4430fa33f19 100644
--- a/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
+++ b/python/pyspark/pandas/tests/data_type_ops/test_boolean_ops.py
@@ -235,16 +235,13 @@ class BooleanOpsTestsMixin:
def test_rmod(self):
psdf = self.psdf
+ pdf = self.pdf
b_psser = psdf["bool"]
- # 1 % False is 0.0 in pandas
- self.assert_eq(pd.Series([0, 0, None], dtype=float, name="bool"), 1 % b_psser)
- # 0.1 / True is 0.1 in pandas
- self.assert_eq(
- pd.Series([0.10000000000000009, 0.10000000000000009, None], dtype=float, name="bool"),
- 0.1 % b_psser,
- check_exact=False, # [0.1, 0.1, nan] for pandas-on-Spark
- )
+ b_pser = pdf["bool"]
+ self.assert_eq(1 % b_pser.astype(float), 1 % b_psser)
+ # # Allow float precision diff: pandas: 0.10000000000000009; pandas on spark: 0.1
+ self.assert_eq(0.1 % b_pser, 0.1 % b_psser, almost=True)
self.assertRaises(TypeError, lambda: datetime.date(1994, 1, 1) % b_psser)
self.assertRaises(TypeError, lambda: True % b_psser)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]