This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 58cb44686701 [SPARK-55624][PS][TESTS] Ignore ArrowDtype in tests with pandas 3
58cb44686701 is described below
commit 58cb446867014c6f40f7137acbacd7dad32c4280
Author: Takuya Ueshin <[email protected]>
AuthorDate: Mon Feb 23 07:20:10 2026 +0900
[SPARK-55624][PS][TESTS] Ignore ArrowDtype in tests with pandas 3
### What changes were proposed in this pull request?
Ignores `ArrowDtype` in tests with pandas 3.
### Why are the changes needed?
The `ArrowDtype` is used for the result of the predicates operations on
some dtypes in pandas 3 by default, but the values should be same as
`BooleanDtype`.
It should be ignored in tests.
```py
>>> pser = pd.Series(["x", "y", "z", None], dtype="string")
>>> other_pser = pd.Series([None, "z", "y", "x"], dtype="string")
>>>
>>> pser == other_pser
0 <NA>
1 False
2 False
3 <NA>
dtype: bool[pyarrow]
>>> (pser == other_pser).astype("boolean")
0 <NA>
1 False
2 False
3 <NA>
dtype: boolean
```
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Updated the test util.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #54412 from ueshin/issues/SPARK-55624/ignore_arrow_dtype.
Authored-by: Takuya Ueshin <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/testing/pandasutils.py | 36 +++++++++++++++++++++++++++++++----
1 file changed, 32 insertions(+), 4 deletions(-)
diff --git a/python/pyspark/testing/pandasutils.py b/python/pyspark/testing/pandasutils.py
index 0cd8f7f8cf46..ff5bfa8d0d48 100644
--- a/python/pyspark/testing/pandasutils.py
+++ b/python/pyspark/testing/pandasutils.py
@@ -32,6 +32,15 @@ try:
except ImportError:
pass
+try:
+ from pyspark.sql.pandas.utils import require_minimum_pyarrow_version
+
+ require_minimum_pyarrow_version()
+ import pyarrow as pa
+except ImportError:
+ pass
+
+from pyspark.loose_version import LooseVersion
import pyspark.pandas as ps
from pyspark.pandas.frame import DataFrame
from pyspark.pandas.indexes import Index
@@ -439,9 +448,9 @@ class PandasOnSparkTestUtils:
)
else:
if not isinstance(left, (pd.DataFrame, pd.Index, pd.Series)):
- left = left.to_pandas()
+ left = self._ignore_arrow_dtypes(left.to_pandas())
if not isinstance(right, (pd.DataFrame, pd.Index, pd.Series)):
- right = right.to_pandas()
+ right = self._ignore_arrow_dtypes(right.to_pandas())
if not check_row_order:
if isinstance(left, pd.DataFrame) and len(left.columns) > 0:
@@ -454,8 +463,8 @@ class PandasOnSparkTestUtils:
else:
_assert_pandas_equal(left, right, checkExact=check_exact)
- lobj = self._to_pandas(left)
- robj = self._to_pandas(right)
+ lobj = self._ignore_arrow_dtypes(self._to_pandas(left))
+ robj = self._ignore_arrow_dtypes(self._to_pandas(right))
if isinstance(lobj, (pd.DataFrame, pd.Series, pd.Index)):
if almost:
_assert_pandas_almost_equal(lobj, robj, rtol=rtol, atol=atol)
@@ -482,6 +491,25 @@ class PandasOnSparkTestUtils:
else:
return obj
+ @staticmethod
+ def _ignore_arrow_dtypes(obj: Any):
+ if LooseVersion(pd.__version__) < "3.0.0":
+ return obj
+ else:
+ if isinstance(obj, pd.DataFrame):
+ arrow_boolean_columns = [
+ col
+ for col in obj.columns
+ if isinstance(obj[col].dtype, pd.ArrowDtype)
+ and obj[col].dtype.pyarrow_dtype == pa.bool_()
+ ]
+ if arrow_boolean_columns:
+ return obj.astype({col: "boolean" for col in arrow_boolean_columns})
+ elif isinstance(obj, (pd.Series, pd.Index)):
+ if isinstance(obj.dtype, pd.ArrowDtype) and obj.dtype.pyarrow_dtype == pa.bool_():
+ return obj.astype("boolean")
+ return obj
+
class PandasOnSparkTestCase(ReusedSQLTestCase, PandasOnSparkTestUtils):
@classmethod
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]