This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new a77cdbdbcb9d
[SPARK-54481][SPARK-54495][SPARK-54483][4.0][TEST][PYTHON][CONNECT] Fix failing
tests
a77cdbdbcb9d is described below
commit a77cdbdbcb9d80e2b176037554efffbf825274a6
Author: xianzhe-databricks <[email protected]>
AuthorDate: Mon Dec 1 18:48:52 2025 +0900
[SPARK-54481][SPARK-54495][SPARK-54483][4.0][TEST][PYTHON][CONNECT] Fix
failing tests
### What changes were proposed in this pull request?
3 tests fail on cross-version tests: Spark connect 4.0 client against Spark
connect 4.2 server. We fixed them in this PR.
We basically cherry-picked fixes from these PRs:
* https://github.com/apache/spark/pull/51635
* https://github.com/apache/spark/pull/51120
### Why are the changes needed?
Make post-merge CI happy!
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Following
https://github.com/apache/spark/blob/master/.github/workflows/build_python_connect40.yml#L85-L106,
I manually tested it on my local machine.
### Was this patch authored or co-authored using generative AI tooling?
I finished this PR with the help of Claude Code.
Closes #53250 from xianzhe-databricks/SPARK-54481-reenable-tests.
Lead-authored-by: xianzhe-databricks <[email protected]>
Co-authored-by: Hyukjin Kwon <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/pandas/tests/test_typedef.py | 8 ++--
.../sql/tests/pandas/test_pandas_cogrouped_map.py | 6 +--
python/pyspark/sql/tests/pandas/test_pandas_map.py | 49 +++++++++++++---------
3 files changed, 35 insertions(+), 28 deletions(-)
diff --git a/python/pyspark/pandas/tests/test_typedef.py
b/python/pyspark/pandas/tests/test_typedef.py
index 62ba2672de19..afed59660d74 100644
--- a/python/pyspark/pandas/tests/test_typedef.py
+++ b/python/pyspark/pandas/tests/test_typedef.py
@@ -311,13 +311,9 @@ class TypeHintTestsMixin:
self.assertRaisesRegex(TypeError, "object.*not understood",
try_infer_return_type)
- @unittest.skipIf(
- os.environ.get("SPARK_SKIP_CONNECT_COMPAT_TESTS") == "1",
"SPARK-54495: To be reenabled"
- )
def test_as_spark_type_pandas_on_spark_dtype(self):
type_mapper = {
# binary
- np.character: (np.character, BinaryType()),
np.bytes_: (np.bytes_, BinaryType()),
bytes: (np.bytes_, BinaryType()),
# integer
@@ -352,6 +348,10 @@ class TypeHintTestsMixin:
),
}
+ if LooseVersion(np.__version__) < LooseVersion("2.3"):
+ # binary
+ type_mapper.update({np.character: (np.character, BinaryType())})
+
for numpy_or_python_type, (dtype, spark_type) in type_mapper.items():
self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
self.assertEqual(pandas_on_spark_type(numpy_or_python_type),
(dtype, spark_type))
diff --git a/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py
b/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py
index ab937cb68f74..3a6ab9c98ebb 100644
--- a/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py
+++ b/python/pyspark/sql/tests/pandas/test_pandas_cogrouped_map.py
@@ -17,7 +17,6 @@
import unittest
from typing import cast
-import os
from pyspark.sql.functions import array, explode, col, lit, udf, pandas_udf,
sum
from pyspark.sql.types import (
@@ -240,9 +239,6 @@ class CogroupedApplyInPandasTestsMixin:
self._test_merge_empty(fn=merge_pandas)
- @unittest.skipIf(
- os.environ.get("SPARK_SKIP_CONNECT_COMPAT_TESTS") == "1",
"SPARK-54481: To be reenabled"
- )
def test_apply_in_pandas_returning_incompatible_type(self):
with self.quiet():
self.check_apply_in_pandas_returning_incompatible_type()
@@ -266,7 +262,7 @@ class CogroupedApplyInPandasTestsMixin:
"`spark.sql.execution.pandas.convertToArrowArraySafely`."
)
self._test_merge_error(
- fn=lambda lft, rgt: pd.DataFrame({"id": [1], "k":
["2.0"]}),
+ fn=lambda lft, rgt: pd.DataFrame({"id": [1], "k":
["test_string"]}),
output_schema="id long, k double",
errorClass=PythonException,
error_message_regex=expected,
diff --git a/python/pyspark/sql/tests/pandas/test_pandas_map.py
b/python/pyspark/sql/tests/pandas/test_pandas_map.py
index a9f66b161403..e5d0b56be691 100644
--- a/python/pyspark/sql/tests/pandas/test_pandas_map.py
+++ b/python/pyspark/sql/tests/pandas/test_pandas_map.py
@@ -246,24 +246,22 @@ class MapInPandasTestsMixin:
actual = df.repartition(1).mapInPandas(f, "id long, value
long").collect()
self.assertEqual(actual, expected)
- @unittest.skipIf(
- os.environ.get("SPARK_SKIP_CONNECT_COMPAT_TESTS") == "1",
"SPARK-54483: To be reenabled"
- )
def test_dataframes_with_incompatible_types(self):
with self.quiet():
self.check_dataframes_with_incompatible_types()
def check_dataframes_with_incompatible_types(self):
- def func(iterator):
- for pdf in iterator:
- yield pdf.assign(id=pdf["id"].apply(str))
-
for safely in [True, False]:
with self.subTest(convertToArrowArraySafely=safely), self.sql_conf(
{"spark.sql.execution.pandas.convertToArrowArraySafely":
safely}
):
# sometimes we see ValueErrors
with self.subTest(convert="string to double"):
+
+ def func(iterator):
+ for pdf in iterator:
+ yield pdf.assign(id="test_string")
+
expected = (
r"ValueError: Exception thrown when converting
pandas.Series "
r"\(object\) with name 'id' to Arrow Array \(double\)."
@@ -282,18 +280,31 @@ class MapInPandasTestsMixin:
.collect()
)
- # sometimes we see TypeErrors
- with self.subTest(convert="double to string"):
- with self.assertRaisesRegex(
- PythonException,
- r"TypeError: Exception thrown when converting
pandas.Series "
- r"\(float64\) with name 'id' to Arrow Array
\(string\).\n",
- ):
- (
- self.spark.range(10, numPartitions=3)
- .select(col("id").cast("double"))
- .mapInPandas(self.identity_dataframes_iter("id"),
"id string")
- .collect()
+ with self.subTest(convert="float to int precision loss"):
+
+ def func(iterator):
+ for pdf in iterator:
+ yield pdf.assign(id=pdf["id"] + 0.1)
+
+ df = (
+ self.spark.range(10, numPartitions=3)
+ .select(col("id").cast("double"))
+ .mapInPandas(func, "id int")
+ )
+ if safely:
+ expected = (
+ r"ValueError: Exception thrown when converting
pandas.Series "
+ r"\(float64\) with name 'id' to Arrow Array
\(int32\)."
+ " It can be caused by overflows or other "
+ "unsafe conversions warned by Arrow. Arrow safe
type check "
+ "can be disabled by using SQL config "
+
"`spark.sql.execution.pandas.convertToArrowArraySafely`."
+ )
+ with self.assertRaisesRegex(PythonException, expected
+ "\n"):
+ df.collect()
+ else:
+ self.assertEqual(
+ df.collect(), self.spark.range(10,
numPartitions=3).collect()
)
def test_empty_iterator(self):
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]