This is an automated email from the ASF dual-hosted git repository.
gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 43575015be30 [SPARK-54787][PS] Use list comprehension instead of for
loops in pandas
43575015be30 is described below
commit 43575015be307f6b3ec992c38b2ba4fd2536d49a
Author: Devin Petersohn <[email protected]>
AuthorDate: Tue Dec 23 10:37:27 2025 +0900
[SPARK-54787][PS] Use list comprehension instead of for loops in pandas
### What changes were proposed in this pull request?
Use list comprehension in multiple pandas DataFrame methods for performance
and code quality.
### Why are the changes needed?
For maintainability and performance
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
CI
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #53565 from devin-petersohn/devin/pandas_maintain_02.
Authored-by: Devin Petersohn <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
---
python/pyspark/pandas/frame.py | 31 +++++++++++++++----------------
1 file changed, 15 insertions(+), 16 deletions(-)
diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index 7f0a516d5963..df68e31d4f33 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -11281,15 +11281,14 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
"""
from pyspark.pandas.series import first_series
- cols = []
result_scol_name = "value"
- for label, applied_col in zip(column_labels, scols):
- cols.append(
- F.struct(
- *[F.lit(col).alias(SPARK_INDEX_NAME_FORMAT(i)) for i, col
in enumerate(label)],
- *[applied_col.alias(result_scol_name)],
- )
+ cols = [
+ F.struct(
+ *[F.lit(col).alias(SPARK_INDEX_NAME_FORMAT(i)) for i, col in
enumerate(label)],
+ *[applied_col.alias(result_scol_name)],
)
+ for label, applied_col in zip(column_labels, scols)
+ ]
# Statements under this comment implement spark frame transformations
as below:
# From:
#
+-------------------------------------------------------------------------------------+
@@ -11434,12 +11433,11 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
3 4.0
"""
if numeric_only:
- numeric_col_names = []
- for label in self._internal.column_labels:
- psser = self._psser_for(label)
- if isinstance(psser.spark.data_type, (NumericType,
BooleanType)):
- numeric_col_names.append(psser.name)
-
+ numeric_col_names = [
+ self._psser_for(label).name
+ for label in self._internal.column_labels
+ if isinstance(self._psser_for(label).spark.data_type,
(NumericType, BooleanType))
+ ]
psdf = self[numeric_col_names] if numeric_only else self
return psdf._apply_series_op(
lambda psser: psser._rank(method=method, ascending=ascending),
should_resolve=True
@@ -12530,9 +12528,10 @@ defaultdict(<class 'list'>, {'col..., 'col...})]
cols_dict[column].append(scol_for(sdf,
column)[i].alias(column))
internal_index_column = SPARK_DEFAULT_INDEX_NAME
- cols = []
- for i, col in enumerate(zip(*cols_dict.values())):
-
cols.append(F.struct(F.lit(qq[i]).alias(internal_index_column), *col))
+ cols = [
+ F.struct(F.lit(qq[i]).alias(internal_index_column), *col)
+ for i, col in enumerate(zip(*cols_dict.values()))
+ ]
sdf = sdf.select(F.array(*cols).alias("arrays"))
# And then, explode it and manually set the index.
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]