This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 032677cb329d [SPARK-52928][PYTHON][FOLLOW-UP] Remove unreachable code after upgrading pyarrow minimum version to 15.0.0
032677cb329d is described below

commit 032677cb329d76363add0d2c1915cd8c2d714186
Author: Ruifeng Zheng <ruife...@apache.org>
AuthorDate: Fri Aug 1 06:57:09 2025 -0700

    [SPARK-52928][PYTHON][FOLLOW-UP] Remove unreachable code after upgrading pyarrow minimum version to 15.0.0

    ### What changes were proposed in this pull request?
    Remove unreachable code after upgrading pyarrow minimum version to 15.0.0

    ### Why are the changes needed?
    code clean up

    ### Does this PR introduce _any_ user-facing change?
    no

    ### How was this patch tested?
    ci

    ### Was this patch authored or co-authored using generative AI tooling?
    no

    Closes #51769 from zhengruifeng/pyarrow_15_cleanup.

    Authored-by: Ruifeng Zheng <ruife...@apache.org>
    Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
---
 python/pyspark/sql/connect/client/core.py    | 12 +-----------
 python/pyspark/sql/pandas/conversion.py      | 17 ++++-------------
 python/pyspark/sql/pandas/serializers.py     | 19 ++++---------------
 python/pyspark/sql/pandas/types.py           |  8 --------
 python/pyspark/sql/tests/arrow/test_arrow.py |  9 ++-------
 5 files changed, 11 insertions(+), 54 deletions(-)

diff --git a/python/pyspark/sql/connect/client/core.py b/python/pyspark/sql/connect/client/core.py
index 122372877a7b..6ac4cc1894c7 100644
--- a/python/pyspark/sql/connect/client/core.py
+++ b/python/pyspark/sql/connect/client/core.py
@@ -63,7 +63,6 @@ from google.rpc import error_details_pb2
 
 from pyspark.util import is_remote_only
 from pyspark.accumulators import SpecialAccumulatorIds
-from pyspark.loose_version import LooseVersion
 from pyspark.version import __version__
 from pyspark.resource.information import ResourceInformation
 from pyspark.sql.metrics import MetricValue, PlanMetrics, ExecutionInfo, ObservedMetrics
@@ -968,7 +967,7 @@ class SparkConnectClient(object):
         # Rename columns to avoid duplicated column names.
         renamed_table = table.rename_columns([f"col_{i}" for i in range(table.num_columns)])
 
-        pandas_options = {}
+        pandas_options = {"coerce_temporal_nanoseconds": True}
         if self_destruct:
             # Configure PyArrow to use as little memory as possible:
             # self_destruct - free columns as they are converted
@@ -981,15 +980,6 @@ class SparkConnectClient(object):
                     "use_threads": False,
                 }
             )
-        if LooseVersion(pa.__version__) >= LooseVersion("13.0.0"):
-            # A legacy option to coerce date32, date64, duration, and timestamp
-            # time units to nanoseconds when converting to pandas.
-            # This option can only be added since 13.0.0.
-            pandas_options.update(
-                {
-                    "coerce_temporal_nanoseconds": True,
-                }
-            )
 
         pdf = renamed_table.to_pandas(**pandas_options)
         pdf.columns = schema.names
diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py
index 0a1b1c917c36..cb7dfa555260 100644
--- a/python/pyspark/sql/pandas/conversion.py
+++ b/python/pyspark/sql/pandas/conversion.py
@@ -28,7 +28,6 @@ from typing import (
 from warnings import warn
 
 from pyspark.errors.exceptions.captured import unwrap_spark_exception
-from pyspark.loose_version import LooseVersion
 from pyspark.util import _load_from_socket
 from pyspark.sql.pandas.serializers import ArrowCollectSerializer
 from pyspark.sql.pandas.types import _dedup_names
@@ -121,18 +120,10 @@ class PandasConversionMixin:
         # Pandas DataFrame created from PyArrow uses datetime64[ns] for date type
         # values, but we should use datetime.date to match the behavior with when
         # Arrow optimization is disabled.
-        pandas_options = {"date_as_object": True}
-
-        if LooseVersion(pa.__version__) >= LooseVersion("13.0.0"):
-            # A legacy option to coerce date32, date64, duration, and timestamp
-            # time units to nanoseconds when converting to pandas.
-            # This option can only be added since 13.0.0.
-            pandas_options.update(
-                {
-                    "coerce_temporal_nanoseconds": True,
-                }
-            )
-
+        pandas_options = {
+            "date_as_object": True,
+            "coerce_temporal_nanoseconds": True,
+        }
         if self_destruct:
             # Configure PyArrow to use as little memory as possible:
             # self_destruct - free columns as they are converted
diff --git a/python/pyspark/sql/pandas/serializers.py b/python/pyspark/sql/pandas/serializers.py
index 459bd726df22..2ec2a36ee520 100644
--- a/python/pyspark/sql/pandas/serializers.py
+++ b/python/pyspark/sql/pandas/serializers.py
@@ -25,7 +25,6 @@ from typing import TYPE_CHECKING, Optional
 
 import pyspark
 from pyspark.errors import PySparkRuntimeError, PySparkTypeError, PySparkValueError
-from pyspark.loose_version import LooseVersion
 from pyspark.serializers import (
     Serializer,
     read_int,
@@ -304,20 +303,10 @@ class ArrowStreamPandasSerializer(ArrowStreamSerializer):
         # instead of creating datetime64[ns] as intermediate data to avoid overflow caused by
         # datetime64[ns] type handling.
         # Cast dates to objects instead of datetime64[ns] dtype to avoid overflow.
-        pandas_options = {"date_as_object": True}
-
-        import pyarrow as pa
-
-        if LooseVersion(pa.__version__) >= LooseVersion("13.0.0"):
-            # A legacy option to coerce date32, date64, duration, and timestamp
-            # time units to nanoseconds when converting to pandas.
-            # This option can only be added since 13.0.0.
-            pandas_options.update(
-                {
-                    "coerce_temporal_nanoseconds": True,
-                }
-            )
-
+        pandas_options = {
+            "date_as_object": True,
+            "coerce_temporal_nanoseconds": True,
+        }
         s = arrow_column.to_pandas(**pandas_options)
 
         # TODO(SPARK-43579): cache the converter for reuse
diff --git a/python/pyspark/sql/pandas/types.py b/python/pyspark/sql/pandas/types.py
index 771819d3c9ba..fbbe3fb81ae6 100644
--- a/python/pyspark/sql/pandas/types.py
+++ b/python/pyspark/sql/pandas/types.py
@@ -314,14 +314,6 @@ def from_arrow_type(at: "pa.DataType", prefer_timestamp_ntz: bool = False) -> Da
     elif types.is_list(at):
         spark_type = ArrayType(from_arrow_type(at.value_type, prefer_timestamp_ntz))
     elif types.is_fixed_size_list(at):
-        import pyarrow as pa
-
-        if LooseVersion(pa.__version__) < LooseVersion("14.0.0"):
-            # PyArrow versions before 14.0.0 do not support casting FixedSizeListArray to ListArray
-            raise PySparkTypeError(
-                errorClass="UNSUPPORTED_DATA_TYPE_FOR_ARROW_CONVERSION",
-                messageParameters={"data_type": str(at)},
-            )
         spark_type = ArrayType(from_arrow_type(at.value_type, prefer_timestamp_ntz))
     elif types.is_large_list(at):
         spark_type = ArrayType(from_arrow_type(at.value_type, prefer_timestamp_ntz))
diff --git a/python/pyspark/sql/tests/arrow/test_arrow.py b/python/pyspark/sql/tests/arrow/test_arrow.py
index a17c4c6d2567..c74ca121f26d 100644
--- a/python/pyspark/sql/tests/arrow/test_arrow.py
+++ b/python/pyspark/sql/tests/arrow/test_arrow.py
@@ -1713,13 +1713,8 @@ class ArrowTestsMixin:
     def test_createDataFrame_arrow_fixed_size_list(self):
         a = pa.array([[-1, 3]] * 5, type=pa.list_(pa.int32(), 2))
         t = pa.table([a], ["fsl"])
-        if LooseVersion(pa.__version__) < LooseVersion("14.0.0"):
-            # PyArrow versions before 14.0.0 do not support casting FixedSizeListArray to ListArray
-            with self.assertRaises(PySparkTypeError):
-                df = self.spark.createDataFrame(t)
-        else:
-            df = self.spark.createDataFrame(t)
-            self.assertIsInstance(df.schema["fsl"].dataType, ArrayType)
+        df = self.spark.createDataFrame(t)
+        self.assertIsInstance(df.schema["fsl"].dataType, ArrayType)
 
     @unittest.skipIf(

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
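
[Editor's illustration, not part of the commit email]

The version guards above became unreachable because coerce_temporal_nanoseconds has existed since PyArrow 13.0.0 and FixedSizeListArray casting since 14.0.0, both below the new 15.0.0 minimum, so the options can now be passed unconditionally. The following is a minimal standalone sketch of how the pyarrow.Table.to_pandas() options used in the simplified code paths fit together; the example table, column names, and values are made up for illustration.

    # Sketch: pyarrow.Table.to_pandas() options passed unconditionally on PyArrow >= 15.0.0.
    import datetime

    import pyarrow as pa

    # Hypothetical example data; the real code converts tables collected from Spark.
    table = pa.table(
        {
            "d": pa.array([datetime.date(2025, 8, 1)], type=pa.date32()),
            "ts": pa.array([datetime.datetime(2025, 8, 1, 6, 57)], type=pa.timestamp("us")),
        }
    )

    pandas_options = {
        # Keep date columns as datetime.date objects instead of datetime64[ns].
        "date_as_object": True,
        # Coerce date32, date64, duration, and timestamp time units to nanoseconds
        # when converting to pandas (available since PyArrow 13.0.0).
        "coerce_temporal_nanoseconds": True,
        # self_destruct/split_blocks/use_threads: free Arrow columns as they are
        # converted to minimize peak memory (only set when self-destruct is enabled
        # in the real code).
        "self_destruct": True,
        "split_blocks": True,
        "use_threads": False,
    }

    pdf = table.to_pandas(**pandas_options)
    print(pdf.dtypes)  # d -> object, ts -> datetime64[ns]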