This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 8cbd520e24db [SPARK-55896][PS] Use numpy functions instead of builtins
8cbd520e24db is described below

commit 8cbd520e24dbf996a951aa120ba2d67f2521ebad
Author: Takuya Ueshin <[email protected]>
AuthorDate: Tue Mar 10 09:48:52 2026 -0700

    [SPARK-55896][PS] Use numpy functions instead of builtins
    
    ### What changes were proposed in this pull request?
    
    Uses `numpy` functions instead of builtins and fixes groupby-apply.
    
    ### Why are the changes needed?
    
    In pandas 3, the given builtin functions won't implicitly be replaced with 
the corresponding `numpy` function anymore.
    
    For example:
    
    ```py
    >>> pdf = pd.DataFrame(
    ...     {"d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0], "v": [1.0, 2.0, 3.0, 4.0, 
5.0, 6.0]}
    ... )
    ```
    
    - pandas 2
    
    ```py
    >>> pdf.groupby("d").apply(sum)
    <stdin>:1: FutureWarning: The provided callable <built-in function sum> is 
currently using np.sum. In a future version of pandas, the provided callable 
will be used directly. To keep current behavior pass the string np.sum instead.
           d     v
    d
    1.0  3.0   6.0
    2.0  6.0  15.0
    ```
    
    - pandas 3
    
    ```py
    >>> pdf.groupby("d").apply(sum)
    Traceback (most recent call last):
    ...
    TypeError: unsupported operand type(s) for +: 'int' and 'str'
    ```
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, it will behave more like pandas 3.
    
    ### How was this patch tested?
    
    Updated the related tests.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #54699 from ueshin/issues/SPARK-55896/builtin.
    
    Authored-by: Takuya Ueshin <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 python/pyspark/pandas/groupby.py                   |  7 ++--
 .../pandas/tests/groupby/test_apply_func.py        | 38 +++++++++++++++++-----
 2 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index 5e47f9840811..f23422b43a22 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -1988,8 +1988,11 @@ class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
             if include_groups:
                 raise ValueError("include_groups=True is no longer allowed.")
 
-        spec = inspect.getfullargspec(func)
-        return_sig = spec.annotations.get("return", None)
+        try:
+            spec = inspect.getfullargspec(func)
+            return_sig = spec.annotations.get("return", None)
+        except TypeError:
+            return_sig = None
         should_infer_schema = return_sig is None
         should_retain_index = should_infer_schema
 
diff --git a/python/pyspark/pandas/tests/groupby/test_apply_func.py 
b/python/pyspark/pandas/tests/groupby/test_apply_func.py
index 5716e574cb44..c8e2bf41b62e 100644
--- a/python/pyspark/pandas/tests/groupby/test_apply_func.py
+++ b/python/pyspark/pandas/tests/groupby/test_apply_func.py
@@ -345,14 +345,18 @@ class GroupbyApplyFuncMixin:
         )
         psdf = ps.from_pandas(pdf)
 
+        if LooseVersion(pd.__version__) < "3.0.0":
+            sum_f = sum
+        else:
+            sum_f = np.sum
+
         self.assert_eq(
-            psdf.groupby("d").apply(sum).sort_index(), 
pdf.groupby("d").apply(sum).sort_index()
+            psdf.groupby("d").apply(sum_f).sort_index(), 
pdf.groupby("d").apply(sum_f).sort_index()
         )
 
-        with ps.option_context("compute.shortcut_limit", 1):
-            self.assert_eq(
-                psdf.groupby("d").apply(sum).sort_index(), 
pdf.groupby("d").apply(sum).sort_index()
-            )
+    def test_apply_key_handling_without_shortcut(self):
+        with ps.option_context("compute.shortcut_limit", 0):
+            self.test_apply_key_handling()
 
     def test_apply_with_side_effect(self):
         pdf = pd.DataFrame(
@@ -370,6 +374,11 @@ class GroupbyApplyFuncMixin:
     def _check_apply_with_side_effect(self, psdf, pdf, include_groups):
         acc = ps.utils.default_session().sparkContext.accumulator(0)
 
+        if LooseVersion(pd.__version__) < "3.0.0":
+            sum_f = sum
+        else:
+            sum_f = np.sum
+
         if include_groups:
 
             def sum_with_acc_frame(x) -> ps.DataFrame[np.float64, np.float64]:
@@ -378,18 +387,25 @@ class GroupbyApplyFuncMixin:
                 return np.sum(x)
 
         else:
+            if LooseVersion(pd.__version__) < "3.0.0":
+                ret_type = ps.DataFrame[np.float64]
+            else:
+                ret_type = np.float64
 
-            def sum_with_acc_frame(x) -> ps.DataFrame[np.float64]:
+            def sum_with_acc_frame(x) -> ret_type:
                 nonlocal acc
                 acc += 1
                 return np.sum(x)
 
         actual = psdf.groupby("d").apply(sum_with_acc_frame, 
include_groups=include_groups)
-        actual.columns = ["d", "v"] if include_groups else ["v"]
+        if LooseVersion(pd.__version__) < "3.0.0":
+            actual.columns = ["d", "v"] if include_groups else ["v"]
+        else:
+            actual = actual.rename()
         self.assert_eq(
             actual._to_pandas().sort_index(),
             pdf.groupby("d")
-            .apply(sum, include_groups=include_groups)
+            .apply(sum_f, include_groups=include_groups)
             .sort_index()
             .reset_index(drop=True),
         )
@@ -406,12 +422,16 @@ class GroupbyApplyFuncMixin:
             ._to_pandas()
             .sort_index(),
             pdf.groupby("d")["v"]
-            .apply(sum, include_groups=include_groups)
+            .apply(sum_f, include_groups=include_groups)
             .sort_index()
             .reset_index(drop=True),
         )
         self.assert_eq(acc.value, 4)
 
+    def test_apply_with_side_effect_without_shortcut(self):
+        with ps.option_context("compute.shortcut_limit", 0):
+            self.test_apply_with_side_effect()
+
     def test_apply_return_series(self):
         # SPARK-36907: Fix DataFrameGroupBy.apply without shortcut.
         pdf = pd.DataFrame(


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to