This is an automated email from the ASF dual-hosted git repository.

kosiew pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-python.git


The following commit(s) were added to refs/heads/main by this push:
     new 93f4c34b Add docstring examples for Aggregate basic and 
bitwise/boolean functions (#1416)
93f4c34b is described below

commit 93f4c34bf5a4afae2547d5ccb677143d1833ebf0
Author: Nick <[email protected]>
AuthorDate: Tue Mar 17 02:14:42 2026 -0400

    Add docstring examples for Aggregate basic and bitwise/boolean functions 
(#1416)
    
    * Add docstring examples for Aggregate basic and bitwise/boolean functions
    
    Add example usage to docstrings for Aggregate basic and bitwise/boolean 
functions to improve documentation.
    
    Co-Authored-By: Claude Opus 4.6 <[email protected]>
    
    * Add tighter bound on approx_distinct for small sizes
    
    ---------
    
    Co-authored-by: Claude Opus 4.6 <[email protected]>
---
 python/datafusion/functions.py | 141 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 141 insertions(+)

diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py
index 4738061c..765d1365 100644
--- a/python/datafusion/functions.py
+++ b/python/datafusion/functions.py
@@ -2370,6 +2370,15 @@ def approx_distinct(
     Args:
         expression: Values to check for distinct entries
         filter: If provided, only compute against rows for which the filter is 
True
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [1, 1, 2, 3]})
+    >>> result = df.aggregate(
+    ...     [], [dfn.functions.approx_distinct(dfn.col("a")).alias("v")])
+    >>> result.collect_column("v")[0].as_py() == 3
+    True
     """
     filter_raw = filter.expr if filter is not None else None
 
@@ -2388,6 +2397,15 @@ def approx_median(expression: Expr, filter: Expr | None 
= None) -> Expr:
     Args:
         expression: Values to find the median for
         filter: If provided, only compute against rows for which the filter is 
True
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
+    >>> result = df.aggregate(
+    ...     [], [dfn.functions.approx_median(dfn.col("a")).alias("v")])
+    >>> result.collect_column("v")[0].as_py()
+    2.0
     """
     filter_raw = filter.expr if filter is not None else None
     return Expr(f.approx_median(expression.expr, filter=filter_raw))
@@ -2419,6 +2437,15 @@ def approx_percentile_cont(
         percentile: This must be between 0.0 and 1.0, inclusive
         num_centroids: Max bin size for the t-digest algorithm
         filter: If provided, only compute against rows for which the filter is 
True
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0, 4.0, 5.0]})
+    >>> result = df.aggregate(
+    ...     [], [dfn.functions.approx_percentile_cont(dfn.col("a"), 
0.5).alias("v")])
+    >>> result.collect_column("v")[0].as_py()
+    3.0
     """
     sort_expr_raw = sort_or_default(sort_expression)
     filter_raw = filter.expr if filter is not None else None
@@ -2451,6 +2478,15 @@ def approx_percentile_cont_with_weight(
         num_centroids: Max bin size for the t-digest algorithm
         filter: If provided, only compute against rows for which the filter is 
True
 
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0], "w": [1.0, 1.0, 1.0]})
+    >>> result = df.aggregate(
+    ...     [], [dfn.functions.approx_percentile_cont_with_weight(dfn.col("a"),
+    ...     dfn.col("w"), 0.5).alias("v")])
+    >>> result.collect_column("v")[0].as_py()
+    2.0
     """
     sort_expr_raw = sort_or_default(sort_expression)
     filter_raw = filter.expr if filter is not None else None
@@ -2514,6 +2550,14 @@ def avg(
     Args:
         expression: Values to combine into an array
         filter: If provided, only compute against rows for which the filter is 
True
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
+    >>> result = df.aggregate([], [dfn.functions.avg(dfn.col("a")).alias("v")])
+    >>> result.collect_column("v")[0].as_py()
+    2.0
     """
     filter_raw = filter.expr if filter is not None else None
     return Expr(f.avg(expression.expr, filter=filter_raw))
@@ -2552,6 +2596,14 @@ def count(
         expressions: Argument to perform bitwise calculation on
         distinct: If True, a single entry for each distinct value will be in 
the result
         filter: If provided, only compute against rows for which the filter is 
True
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [1, 2, 3]})
+    >>> result = df.aggregate([], 
[dfn.functions.count(dfn.col("a")).alias("v")])
+    >>> result.collect_column("v")[0].as_py()
+    3
     """
     filter_raw = filter.expr if filter is not None else None
 
@@ -2616,6 +2668,14 @@ def max(expression: Expr, filter: Expr | None = None) -> 
Expr:
     Args:
         expression: The value to find the maximum of
         filter: If provided, only compute against rows for which the filter is 
True
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [1, 2, 3]})
+    >>> result = df.aggregate([], [dfn.functions.max(dfn.col("a")).alias("v")])
+    >>> result.collect_column("v")[0].as_py()
+    3
     """
     filter_raw = filter.expr if filter is not None else None
     return Expr(f.max(expression.expr, filter=filter_raw))
@@ -2625,6 +2685,14 @@ def mean(expression: Expr, filter: Expr | None = None) 
-> Expr:
     """Returns the average (mean) value of the argument.
 
     This is an alias for :py:func:`avg`.
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
+    >>> result = df.aggregate([], 
[dfn.functions.mean(dfn.col("a")).alias("v")])
+    >>> result.collect_column("v")[0].as_py()
+    2.0
     """
     return avg(expression, filter)
 
@@ -2644,6 +2712,14 @@ def median(
         expression: The value to compute the median of
         distinct: If True, a single entry for each distinct value will be in 
the result
         filter: If provided, only compute against rows for which the filter is 
True
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [1.0, 2.0, 3.0]})
+    >>> result = df.aggregate([], 
[dfn.functions.median(dfn.col("a")).alias("v")])
+    >>> result.collect_column("v")[0].as_py()
+    2.0
     """
     filter_raw = filter.expr if filter is not None else None
     return Expr(f.median(expression.expr, distinct=distinct, 
filter=filter_raw))
@@ -2658,6 +2734,14 @@ def min(expression: Expr, filter: Expr | None = None) -> 
Expr:
     Args:
         expression: The value to find the minimum of
         filter: If provided, only compute against rows for which the filter is 
True
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [1, 2, 3]})
+    >>> result = df.aggregate([], [dfn.functions.min(dfn.col("a")).alias("v")])
+    >>> result.collect_column("v")[0].as_py()
+    1
     """
     filter_raw = filter.expr if filter is not None else None
     return Expr(f.min(expression.expr, filter=filter_raw))
@@ -2677,6 +2761,14 @@ def sum(
     Args:
         expression: Values to combine into an array
         filter: If provided, only compute against rows for which the filter is 
True
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [1, 2, 3]})
+    >>> result = df.aggregate([], [dfn.functions.sum(dfn.col("a")).alias("v")])
+    >>> result.collect_column("v")[0].as_py()
+    6
     """
     filter_raw = filter.expr if filter is not None else None
     return Expr(f.sum(expression.expr, filter=filter_raw))
@@ -3094,6 +3186,14 @@ def bit_and(expression: Expr, filter: Expr | None = 
None) -> Expr:
     Args:
         expression: Argument to perform bitwise calculation on
         filter: If provided, only compute against rows for which the filter is 
True
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [7, 3]})
+    >>> result = df.aggregate([], 
[dfn.functions.bit_and(dfn.col("a")).alias("v")])
+    >>> result.collect_column("v")[0].as_py()
+    3
     """
     filter_raw = filter.expr if filter is not None else None
     return Expr(f.bit_and(expression.expr, filter=filter_raw))
@@ -3110,6 +3210,14 @@ def bit_or(expression: Expr, filter: Expr | None = None) 
-> Expr:
     Args:
         expression: Argument to perform bitwise calculation on
         filter: If provided, only compute against rows for which the filter is 
True
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [1, 2]})
+    >>> result = df.aggregate([], 
[dfn.functions.bit_or(dfn.col("a")).alias("v")])
+    >>> result.collect_column("v")[0].as_py()
+    3
     """
     filter_raw = filter.expr if filter is not None else None
     return Expr(f.bit_or(expression.expr, filter=filter_raw))
@@ -3129,6 +3237,14 @@ def bit_xor(
         expression: Argument to perform bitwise calculation on
         distinct: If True, evaluate each unique value of expression only once
         filter: If provided, only compute against rows for which the filter is 
True
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [5, 3]})
+    >>> result = df.aggregate([], 
[dfn.functions.bit_xor(dfn.col("a")).alias("v")])
+    >>> result.collect_column("v")[0].as_py()
+    6
     """
     filter_raw = filter.expr if filter is not None else None
     return Expr(f.bit_xor(expression.expr, distinct=distinct, 
filter=filter_raw))
@@ -3146,6 +3262,14 @@ def bool_and(expression: Expr, filter: Expr | None = 
None) -> Expr:
     Args:
         expression: Argument to perform calculation on
         filter: If provided, only compute against rows for which the filter is 
True
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [True, True, False]})
+    >>> result = df.aggregate([], 
[dfn.functions.bool_and(dfn.col("a")).alias("v")])
+    >>> result.collect_column("v")[0].as_py()
+    False
     """
     filter_raw = filter.expr if filter is not None else None
     return Expr(f.bool_and(expression.expr, filter=filter_raw))
@@ -3163,6 +3287,14 @@ def bool_or(expression: Expr, filter: Expr | None = 
None) -> Expr:
     Args:
         expression: Argument to perform calculation on
         filter: If provided, only compute against rows for which the filter is 
True
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": [False, False, True]})
+    >>> result = df.aggregate([], 
[dfn.functions.bool_or(dfn.col("a")).alias("v")])
+    >>> result.collect_column("v")[0].as_py()
+    True
     """
     filter_raw = filter.expr if filter is not None else None
     return Expr(f.bool_or(expression.expr, filter=filter_raw))
@@ -3553,6 +3685,15 @@ def string_agg(
     For example::
 
         df.aggregate([], string_agg(col("a"), ",", order_by="b"))
+
+    Examples:
+    ---------
+    >>> ctx = dfn.SessionContext()
+    >>> df = ctx.from_pydict({"a": ["x", "y", "z"]})
+    >>> result = df.aggregate(
+    ...     [], [dfn.functions.string_agg(dfn.col("a"), ",", 
order_by="a").alias("s")])
+    >>> result.collect_column("s")[0].as_py()
+    'x,y,z'
     """
     order_by_raw = sort_list_to_raw_sort_list(order_by)
     filter_raw = filter.expr if filter is not None else None


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to