This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 032677cb329d [SPARK-52928][PYTHON][FOLLOW-UP] Remove unreachable code after upgrading pyarrow minimum version to 15.0.0
032677cb329d is described below

commit 032677cb329d76363add0d2c1915cd8c2d714186
Author: Ruifeng Zheng <ruife...@apache.org>
AuthorDate: Fri Aug 1 06:57:09 2025 -0700

    [SPARK-52928][PYTHON][FOLLOW-UP] Remove unreachable code after upgrading pyarrow minimum version to 15.0.0

    ### What changes were proposed in this pull request?
    Remove unreachable code after upgrading pyarrow minimum version to 15.0.0

    ### Why are the changes needed?
    code clean up

    ### Does this PR introduce _any_ user-facing change?
    no

    ### How was this patch tested?
    ci

    ### Was this patch authored or co-authored using generative AI tooling?
    no

    Closes #51769 from zhengruifeng/pyarrow_15_cleanup.

    Authored-by: Ruifeng Zheng <ruife...@apache.org>
    Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
---
 python/pyspark/sql/connect/client/core.py    | 12 +-----------
 python/pyspark/sql/pandas/conversion.py      | 17 ++++-------------
 python/pyspark/sql/pandas/serializers.py     | 19 ++++---------------
 python/pyspark/sql/pandas/types.py           |  8 --------
 python/pyspark/sql/tests/arrow/test_arrow.py |  9 ++-------
 5 files changed, 11 insertions(+), 54 deletions(-)

diff --git a/python/pyspark/sql/connect/client/core.py b/python/pyspark/sql/connect/client/core.py
index 122372877a7b..6ac4cc1894c7 100644
--- a/python/pyspark/sql/connect/client/core.py
+++ b/python/pyspark/sql/connect/client/core.py
@@ -63,7 +63,6 @@ from google.rpc import error_details_pb2
 
 from pyspark.util import is_remote_only
 from pyspark.accumulators import SpecialAccumulatorIds
-from pyspark.loose_version import LooseVersion
 from pyspark.version import __version__
 from pyspark.resource.information import ResourceInformation
 from pyspark.sql.metrics import MetricValue, PlanMetrics, ExecutionInfo, ObservedMetrics
@@ -968,7 +967,7 @@ class SparkConnectClient(object):
         # Rename columns to avoid duplicated column names.
         renamed_table = table.rename_columns([f"col_{i}" for i in range(table.num_columns)])
 
-        pandas_options = {}
+        pandas_options = {"coerce_temporal_nanoseconds": True}
         if self_destruct:
             # Configure PyArrow to use as little memory as possible:
             # self_destruct - free columns as they are converted
@@ -981,15 +980,6 @@ class SparkConnectClient(object):
                     "use_threads": False,
                 }
             )
-        if LooseVersion(pa.__version__) >= LooseVersion("13.0.0"):
-            # A legacy option to coerce date32, date64, duration, and timestamp
-            # time units to nanoseconds when converting to pandas.
-            # This option can only be added since 13.0.0.
-            pandas_options.update(
-                {
-                    "coerce_temporal_nanoseconds": True,
-                }
-            )
 
         pdf = renamed_table.to_pandas(**pandas_options)
         pdf.columns = schema.names
diff --git a/python/pyspark/sql/pandas/conversion.py b/python/pyspark/sql/pandas/conversion.py
index 0a1b1c917c36..cb7dfa555260 100644
--- a/python/pyspark/sql/pandas/conversion.py
+++ b/python/pyspark/sql/pandas/conversion.py
@@ -28,7 +28,6 @@ from typing import (
 from warnings import warn
 
 from pyspark.errors.exceptions.captured import unwrap_spark_exception
-from pyspark.loose_version import LooseVersion
 from pyspark.util import _load_from_socket
 from pyspark.sql.pandas.serializers import ArrowCollectSerializer
 from pyspark.sql.pandas.types import _dedup_names
@@ -121,18 +120,10 @@ class PandasConversionMixin:
         # Pandas DataFrame created from PyArrow uses datetime64[ns] for date type
         # values, but we should use datetime.date to match the behavior with when
         # Arrow optimization is disabled.
-        pandas_options = {"date_as_object": True}
-
-        if LooseVersion(pa.__version__) >= LooseVersion("13.0.0"):
-            # A legacy option to coerce date32, date64, duration, and timestamp
-            # time units to nanoseconds when converting to pandas.
-            # This option can only be added since 13.0.0.
-            pandas_options.update(
-                {
-                    "coerce_temporal_nanoseconds": True,
-                }
-            )
-
+        pandas_options = {
+            "date_as_object": True,
+            "coerce_temporal_nanoseconds": True,
+        }
         if self_destruct:
             # Configure PyArrow to use as little memory as possible:
             # self_destruct - free columns as they are converted
diff --git a/python/pyspark/sql/pandas/serializers.py b/python/pyspark/sql/pandas/serializers.py
index 459bd726df22..2ec2a36ee520 100644
--- a/python/pyspark/sql/pandas/serializers.py
+++ b/python/pyspark/sql/pandas/serializers.py
@@ -25,7 +25,6 @@ from typing import TYPE_CHECKING, Optional
 
 import pyspark
 from pyspark.errors import PySparkRuntimeError, PySparkTypeError, PySparkValueError
-from pyspark.loose_version import LooseVersion
 from pyspark.serializers import (
     Serializer,
     read_int,
@@ -304,20 +303,10 @@ class ArrowStreamPandasSerializer(ArrowStreamSerializer):
         # instead of creating datetime64[ns] as intermediate data to avoid overflow caused by
         # datetime64[ns] type handling.
         # Cast dates to objects instead of datetime64[ns] dtype to avoid overflow.
-        pandas_options = {"date_as_object": True}
-
-        import pyarrow as pa
-
-        if LooseVersion(pa.__version__) >= LooseVersion("13.0.0"):
-            # A legacy option to coerce date32, date64, duration, and timestamp
-            # time units to nanoseconds when converting to pandas.
-            # This option can only be added since 13.0.0.
-            pandas_options.update(
-                {
-                    "coerce_temporal_nanoseconds": True,
-                }
-            )
-
+        pandas_options = {
+            "date_as_object": True,
+            "coerce_temporal_nanoseconds": True,
+        }
         s = arrow_column.to_pandas(**pandas_options)
 
         # TODO(SPARK-43579): cache the converter for reuse
diff --git a/python/pyspark/sql/pandas/types.py b/python/pyspark/sql/pandas/types.py
index 771819d3c9ba..fbbe3fb81ae6 100644
--- a/python/pyspark/sql/pandas/types.py
+++ b/python/pyspark/sql/pandas/types.py
@@ -314,14 +314,6 @@ def from_arrow_type(at: "pa.DataType", prefer_timestamp_ntz: bool = False) -> Da
     elif types.is_list(at):
         spark_type = ArrayType(from_arrow_type(at.value_type, prefer_timestamp_ntz))
     elif types.is_fixed_size_list(at):
-        import pyarrow as pa
-
-        if LooseVersion(pa.__version__) < LooseVersion("14.0.0"):
-            # PyArrow versions before 14.0.0 do not support casting FixedSizeListArray to ListArray
-            raise PySparkTypeError(
-                errorClass="UNSUPPORTED_DATA_TYPE_FOR_ARROW_CONVERSION",
-                messageParameters={"data_type": str(at)},
-            )
         spark_type = ArrayType(from_arrow_type(at.value_type, prefer_timestamp_ntz))
     elif types.is_large_list(at):
         spark_type = ArrayType(from_arrow_type(at.value_type, prefer_timestamp_ntz))
diff --git a/python/pyspark/sql/tests/arrow/test_arrow.py b/python/pyspark/sql/tests/arrow/test_arrow.py
index a17c4c6d2567..c74ca121f26d 100644
--- a/python/pyspark/sql/tests/arrow/test_arrow.py
+++ b/python/pyspark/sql/tests/arrow/test_arrow.py
@@ -1713,13 +1713,8 @@ class ArrowTestsMixin:
     def test_createDataFrame_arrow_fixed_size_list(self):
         a = pa.array([[-1, 3]] * 5, type=pa.list_(pa.int32(), 2))
         t = pa.table([a], ["fsl"])
-        if LooseVersion(pa.__version__) < LooseVersion("14.0.0"):
-            # PyArrow versions before 14.0.0 do not support casting FixedSizeListArray to ListArray
-            with self.assertRaises(PySparkTypeError):
-                df = self.spark.createDataFrame(t)
-        else:
-            df = self.spark.createDataFrame(t)
-            self.assertIsInstance(df.schema["fsl"].dataType, ArrayType)
+        df = self.spark.createDataFrame(t)
+        self.assertIsInstance(df.schema["fsl"].dataType, ArrayType)
 
     @unittest.skipIf(

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
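
[Editor's illustration, not part of the commit email]

The version guards above became unreachable because coerce_temporal_nanoseconds has existed since PyArrow 13.0.0 and FixedSizeListArray casting since 14.0.0, both below the new 15.0.0 minimum, so the options can now be passed unconditionally. The following is a minimal standalone sketch of how the pyarrow.Table.to_pandas() options used in the simplified code paths fit together; the example table, column names, and values are made up for illustration.

    # Sketch: pyarrow.Table.to_pandas() options passed unconditionally on PyArrow >= 15.0.0.
    import datetime

    import pyarrow as pa

    # Hypothetical example data; the real code converts tables collected from Spark.
    table = pa.table(
        {
            "d": pa.array([datetime.date(2025, 8, 1)], type=pa.date32()),
            "ts": pa.array([datetime.datetime(2025, 8, 1, 6, 57)], type=pa.timestamp("us")),
        }
    )

    pandas_options = {
        # Keep date columns as datetime.date objects instead of datetime64[ns].
        "date_as_object": True,
        # Coerce date32, date64, duration, and timestamp time units to nanoseconds
        # when converting to pandas (available since PyArrow 13.0.0).
        "coerce_temporal_nanoseconds": True,
        # self_destruct/split_blocks/use_threads: free Arrow columns as they are
        # converted to minimize peak memory (only set when self-destruct is enabled
        # in the real code).
        "self_destruct": True,
        "split_blocks": True,
        "use_threads": False,
    }

    pdf = table.to_pandas(**pandas_options)
    print(pdf.dtypes)  # d -> object, ts -> datetime64[ns]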