This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 3b343b4d5c52 [SPARK-55713][PYTHON][TESTS] Add
`LongArrowToPandasBenchmark` for long type conversions
3b343b4d5c52 is described below
commit 3b343b4d5c5294e145d4d8d936fba2b49bb76b36
Author: Ruifeng Zheng <[email protected]>
AuthorDate: Fri Feb 27 09:16:27 2026 +0800
[SPARK-55713][PYTHON][TESTS] Add `LongArrowToPandasBenchmark` for long type
conversions
### What changes were proposed in this pull request?
Add benchmark for long type conversions
### Why are the changes needed?
to check the performance of critical code path
### Does this PR introduce _any_ user-facing change?
no, test-only
### How was this patch tested?
manually check for now, the ASV is not yet set up in CI
```sh
(spark-dev-313) ➜ benchmarks git:(update_benchmark_null_int) asv run
--python=same --quick -b 'bench_arrow.LongArrowToPandasBenchmark'
· Discovering benchmarks
· Running 2 total benchmarks (1 commits * 1 environments * 2 benchmarks)
[ 0.00%] ·· Benchmarking
existing-py_Users_ruifeng.zheng_.dev_miniconda3_envs_spark-dev-313_bin_python3.13
[25.00%] ··· bench_arrow.LongArrowToPandasBenchmark.peakmem_long_to_pandas
ok
[25.00%] ··· ========= ======== ==================== ===========
-- method
--------- -----------------------------------------
n_rows simple arrow_types_mapper pd.Series
========= ======== ==================== ===========
10000 109M 110M 111M
100000 117M 115M 110M
1000000 164M 165M 165M
========= ======== ==================== ===========
[50.00%] ··· bench_arrow.LongArrowToPandasBenchmark.time_long_to_pandas
ok
[50.00%] ··· ========= ========= ==================== ===========
-- method
--------- ------------------------------------------
n_rows simple arrow_types_mapper pd.Series
========= ========= ==================== ===========
10000 131±0μs 310±0μs 162±0μs
100000 134±0μs 482±0μs 173±0μs
1000000 155±0μs 1.35±0ms 273±0μs
========= ========= ==================== ===========
(spark-dev-313) ➜ benchmarks git:(update_benchmark_null_int) asv run
--python=same --quick -b 'bench_arrow.NullableLongArrowToPandasBenchmark'
· Discovering benchmarks
· Running 2 total benchmarks (1 commits * 1 environments * 2 benchmarks)
[ 0.00%] ·· Benchmarking
existing-py_Users_ruifeng.zheng_.dev_miniconda3_envs_spark-dev-313_bin_python3.13
[25.00%] ···
bench_arrow.NullableLongArrowToPandasBenchmark.peakmem_long_with_nulls_to_pandas_ext
ok
[25.00%] ··· ========= ====================== ====================
===========
-- method
---------
-------------------------------------------------------
n_rows integer_object_nulls arrow_types_mapper
pd.Series
========= ====================== ====================
===========
10000 110M 110M 108M
100000 132M 115M 113M
1000000 246M 201M 201M
========= ====================== ====================
===========
[50.00%] ···
bench_arrow.NullableLongArrowToPandasBenchmark.time_long_with_nulls_to_pandas_ext
ok
[50.00%] ··· ========= ====================== ====================
===========
-- method
---------
-------------------------------------------------------
n_rows integer_object_nulls arrow_types_mapper
pd.Series
========= ====================== ====================
===========
10000 1.49±0ms 1.81±0ms
3.19±0ms
100000 13.2±0ms 12.2±0ms
30.0±0ms
1000000 158±0ms 123±0ms 296±0ms
========= ====================== ====================
===========
```
### Was this patch authored or co-authored using generative AI tooling?
no
Closes #54513 from zhengruifeng/update_benchmark_null_int.
Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
---
python/benchmarks/bench_arrow.py | 65 ++++++++++++++++++++++++++++++++++++++++
python/pyspark/sql/conversion.py | 28 +----------------
2 files changed, 66 insertions(+), 27 deletions(-)
diff --git a/python/benchmarks/bench_arrow.py b/python/benchmarks/bench_arrow.py
index 9791e2cdf1d0..7ace66f22965 100644
--- a/python/benchmarks/bench_arrow.py
+++ b/python/benchmarks/bench_arrow.py
@@ -51,3 +51,68 @@ class ArrowToPandasBenchmark:
def peakmem_int_with_nulls_to_pandas(self, n_rows, types_mapper):
self.int_array_with_nulls.to_pandas(types_mapper=self.types_mapper)
+
+
+class LongArrowToPandasBenchmark:
+ """Benchmark for Arrow long array -> Pandas conversions."""
+
+ params = [
+ [10000, 100000, 1000000],
+ ["simple", "arrow_types_mapper", "pd.Series"],
+ ]
+ param_names = ["n_rows", "method"]
+
+ def setup(self, n_rows, method):
+        self.long_array = pa.array(list(range(n_rows - 1)) + [9223372036854775707], type=pa.int64())
+
+ # check 3 different ways to convert non-nullable longs to numpy int64
+ def run_long_to_pandas(self, n_rows, method):
+ if method == "simple":
+ ser = self.long_array.to_pandas()
+ elif method == "arrow_types_mapper":
+            ser = self.long_array.to_pandas(types_mapper=pd.ArrowDtype).astype(np.int64)
+ else:
+ ser = pd.Series(self.long_array, dtype=np.int64)
+ assert ser.dtype == np.int64
+
+ def time_long_to_pandas(self, n_rows, method):
+ self.run_long_to_pandas(n_rows, method)
+
+ def peakmem_long_to_pandas(self, n_rows, method):
+ self.run_long_to_pandas(n_rows, method)
+
+
+class NullableLongArrowToPandasBenchmark:
+ """Benchmark for Arrow long array with nulls -> Pandas conversions."""
+
+ params = [
+ [10000, 100000, 1000000],
+ ["integer_object_nulls", "arrow_types_mapper", "pd.Series"],
+ ]
+ param_names = ["n_rows", "method"]
+
+ def setup(self, n_rows, method):
+ self.long_array_with_nulls = pa.array(
+            [i if i % 10 != 0 else None for i in range(n_rows - 1)] + [9223372036854775707],
+ type=pa.int64(),
+ )
+
+    # check 3 different ways to convert nullable longs to nullable extension type
+ def run_long_with_nulls_to_pandas_ext(self, n_rows, method):
+ if method == "integer_object_nulls":
+            ser = self.long_array_with_nulls.to_pandas(integer_object_nulls=True).astype(
+ pd.Int64Dtype()
+ )
+ elif method == "arrow_types_mapper":
+            ser = self.long_array_with_nulls.to_pandas(types_mapper=pd.ArrowDtype).astype(
+ pd.Int64Dtype()
+ )
+ else:
+            ser = pd.Series(self.long_array_with_nulls.to_pylist(), dtype=pd.Int64Dtype())
+ assert ser.dtype == pd.Int64Dtype()
+
+ def time_long_with_nulls_to_pandas_ext(self, n_rows, method):
+ self.run_long_with_nulls_to_pandas_ext(n_rows, method)
+
+ def peakmem_long_with_nulls_to_pandas_ext(self, n_rows, method):
+ self.run_long_with_nulls_to_pandas_ext(n_rows, method)
diff --git a/python/pyspark/sql/conversion.py b/python/pyspark/sql/conversion.py
index 34e047ae52d5..e0708716a46c 100644
--- a/python/pyspark/sql/conversion.py
+++ b/python/pyspark/sql/conversion.py
@@ -1654,33 +1654,7 @@ class ArrowArrayToPandasConversion:
series: pd.Series
-        # TODO(SPARK-55332): Create benchmark for pa.array -> pd.series integer conversion
- # 1, benchmark a nullable integral array
-        # a = pa.array(list(range(10000000)) + [9223372036854775707, None], type=pa.int64())
- # %timeit a.to_pandas(types_mapper=pd.ArrowDtype)
-        # 11.9 μs ± 407 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
-        # %timeit a.to_pandas(types_mapper=pd.ArrowDtype).astype(pd.Int64Dtype())
- # 589 ms ± 9.35 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
- # %timeit pd.Series(a.to_pylist(), dtype=pd.Int64Dtype())
- # 2.94 s ± 19.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
-        # %timeit a.to_pandas(integer_object_nulls=True).astype(pd.Int64Dtype())
- # 2.05 s ± 22.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
- # pd.Series(a, dtype=pd.Int64Dtype())
- # fails due to internal np.float64 coercion
- # OverflowError: Python int too large to convert to C long
- #
- # 2, benchmark a nullable integral array
-        # b = pa.array(list(range(10000000)) + [9223372036854775707, 1], type=pa.int64())
- # %timeit b.to_pandas(types_mapper=pd.ArrowDtype).astype(np.int64)
-        # 30.2 μs ± 831 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
-        # %timeit pd.Series(b.to_pandas(types_mapper=pd.ArrowDtype), dtype=np.int64)
-        # 33.3 μs ± 928 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
- # %timeit pd.Series(b, dtype=np.int64) <- lose the name
-        # 11.9 μs ± 125 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
- # %timeit b.to_pandas()
-        # 7.56 μs ± 96.5 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
- # %timeit b.to_pandas().astype(np.int64) <- astype is non-trivial
-        # 19.1 μs ± 242 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
+        # conversion methods are selected based on benchmark python/benchmarks/bench_arrow.py
if isinstance(spark_type, ByteType):
if arr.null_count > 0:
series = arr.to_pandas(types_mapper=pd.ArrowDtype).astype(pd.Int8Dtype())
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]