This is an automated email from the ASF dual-hosted git repository.
yangjie01 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new a809740cf6ec [SPARK-48893][SQL][PYTHON][DOCS] Add some examples for `linearRegression` built-in functions
a809740cf6ec is described below
commit a809740cf6ec039141d416f6fb27a6deb66b3d2c
Author: Wei Guo <[email protected]>
AuthorDate: Tue Jul 23 10:27:21 2024 +0800
[SPARK-48893][SQL][PYTHON][DOCS] Add some examples for `linearRegression` built-in functions
### What changes were proposed in this pull request?
This PR aims to add some extra examples for `linearRegression` built-in functions.
### Why are the changes needed?
- Align the usage examples across this series of functions.
- Help users better understand how to use the `linearRegression`-related functions from the SQL built-in functions docs (https://spark.apache.org/docs/latest/api/sql/index.html). A minimal sketch in that style follows this list.
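For illustration only, a minimal PySpark sketch in the style of the doctests added below; it assumes a locally created `SparkSession` (the doctests themselves rely on a pre-existing `spark` session), and the commented values mirror the "some x or y values are null" examples for `regr_count`, `regr_avgx`, and `regr_avgy` in this diff:

```python
from pyspark.sql import SparkSession
import pyspark.sql.functions as sf

# Assumption: a local SparkSession; the doctests in the diff rely on an existing `spark`.
spark = SparkSession.builder.master("local[1]").appName("regr-examples").getOrCreate()

# Mixed null pairs: the regr_* aggregates only consider rows where both y and x are non-null.
df = spark.sql("SELECT * FROM VALUES (1, 2), (2, null), (null, 3), (2, 4) AS tab(y, x)")
df.select(
    sf.regr_count("y", "x"),  # 2   -- only the two fully non-null pairs are counted
    sf.regr_avgx("y", "x"),   # 3.0 -- mean of x over those pairs
    sf.regr_avgy("y", "x"),   # 1.5 -- mean of y over those pairs
).show()

spark.stop()
```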
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Passed GA and manual testing of the new examples.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #47343 from wayneguow/regr_series.
Authored-by: Wei Guo <[email protected]>
Signed-off-by: yangjie01 <[email protected]>
---
python/pyspark/sql/functions/builtin.py | 545 ++++++++++++++++++---
.../expressions/aggregate/linearRegression.scala | 28 +-
.../sql-functions/sql-expression-schema.md | 4 +-
3 files changed, 494 insertions(+), 83 deletions(-)
diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
index 5b9d0dd87002..3d094dd38c50 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -3671,16 +3671,59 @@ def regr_avgx(y: "ColumnOrName", x: "ColumnOrName") -> Column:
Examples
--------
- >>> from pyspark.sql import functions as sf
- >>> x = (sf.col("id") % 3).alias("x")
- >>> y = (sf.randn(42) + x * 10).alias("y")
- >>> spark.range(0, 1000, 1, 1).select(x, y).select(
- ... sf.regr_avgx("y", "x"), sf.avg("x")
- ... ).show()
+ Example 1: All pairs are non-null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_avgx("y", "x"), sf.avg("x")).show()
+---------------+------+
|regr_avgx(y, x)|avg(x)|
+---------------+------+
- | 0.999| 0.999|
+ | 2.75| 2.75|
+ +---------------+------+
+
+ Example 2: All pairs' x values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, null) AS tab(y, x)")
+ >>> df.select(sf.regr_avgx("y", "x"), sf.avg("x")).show()
+ +---------------+------+
+ |regr_avgx(y, x)|avg(x)|
+ +---------------+------+
+ | NULL| NULL|
+ +---------------+------+
+
+ Example 3: All pairs' y values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (null, 1) AS tab(y, x)")
+ >>> df.select(sf.regr_avgx("y", "x"), sf.avg("x")).show()
+ +---------------+------+
+ |regr_avgx(y, x)|avg(x)|
+ +---------------+------+
+ | NULL| 1.0|
+ +---------------+------+
+
+ Example 4: Some pairs' x values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 2), (2, null), (2, 3), (2, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_avgx("y", "x"), sf.avg("x")).show()
+ +---------------+------+
+ |regr_avgx(y, x)|avg(x)|
+ +---------------+------+
+ | 3.0| 3.0|
+ +---------------+------+
+
+ Example 5: Some pairs' x or y values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 2), (2, null), (null, 3), (2, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_avgx("y", "x"), sf.avg("x")).show()
+ +---------------+------+
+ |regr_avgx(y, x)|avg(x)|
+ +---------------+------+
+ | 3.0| 3.0|
+---------------+------+
"""
return _invoke_function_over_columns("regr_avgx", y, x)
@@ -3708,17 +3751,60 @@ def regr_avgy(y: "ColumnOrName", x: "ColumnOrName") -> Column:
Examples
--------
- >>> from pyspark.sql import functions as sf
- >>> x = (sf.col("id") % 3).alias("x")
- >>> y = (sf.randn(42) + x * 10).alias("y")
- >>> spark.range(0, 1000, 1, 1).select(x, y).select(
- ... sf.regr_avgy("y", "x"), sf.avg("y")
- ... ).show()
- +-----------------+-----------------+
- | regr_avgy(y, x)| avg(y)|
- +-----------------+-----------------+
- |9.980732994136...|9.980732994136...|
- +-----------------+-----------------+
+ Example 1: All pairs are non-null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_avgy("y", "x"), sf.avg("y")).show()
+ +---------------+------+
+ |regr_avgy(y, x)|avg(y)|
+ +---------------+------+
+ | 1.75| 1.75|
+ +---------------+------+
+
+ Example 2: All pairs' x values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, null) AS tab(y, x)")
+ >>> df.select(sf.regr_avgy("y", "x"), sf.avg("y")).show()
+ +---------------+------+
+ |regr_avgy(y, x)|avg(y)|
+ +---------------+------+
+ | NULL| 1.0|
+ +---------------+------+
+
+ Example 3: All pairs' y values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (null, 1) AS tab(y, x)")
+ >>> df.select(sf.regr_avgy("y", "x"), sf.avg("y")).show()
+ +---------------+------+
+ |regr_avgy(y, x)|avg(y)|
+ +---------------+------+
+ | NULL| NULL|
+ +---------------+------+
+
+ Example 4: Some pairs' x values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 2), (2, null), (2, 3), (2, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_avgy("y", "x"), sf.avg("y")).show()
+ +------------------+------+
+ | regr_avgy(y, x)|avg(y)|
+ +------------------+------+
+ |1.6666666666666...| 1.75|
+ +------------------+------+
+
+ Example 5: Some pairs' x or y values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 2), (2, null), (null, 3), (2, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_avgy("y", "x"), sf.avg("y")).show()
+ +---------------+------------------+
+ |regr_avgy(y, x)| avg(y)|
+ +---------------+------------------+
+ | 1.5|1.6666666666666...|
+ +---------------+------------------+
"""
return _invoke_function_over_columns("regr_avgy", y, x)
@@ -3745,16 +3831,59 @@ def regr_count(y: "ColumnOrName", x: "ColumnOrName") -> Column:
Examples
--------
- >>> from pyspark.sql import functions as sf
- >>> x = (sf.col("id") % 3).alias("x")
- >>> y = (sf.randn(42) + x * 10).alias("y")
- >>> spark.range(0, 1000, 1, 1).select(x, y).select(
- ... sf.regr_count("y", "x"), sf.count(sf.lit(0))
- ... ).show()
+ Example 1: All pairs are non-null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_count("y", "x"), sf.count(sf.lit(0))).show()
+ +----------------+--------+
+ |regr_count(y, x)|count(0)|
+ +----------------+--------+
+ | 4| 4|
+ +----------------+--------+
+
+ Example 2: All pairs' x values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, null) AS tab(y, x)")
+ >>> df.select(sf.regr_count("y", "x"), sf.count(sf.lit(0))).show()
+ +----------------+--------+
+ |regr_count(y, x)|count(0)|
+ +----------------+--------+
+ | 0| 1|
+ +----------------+--------+
+
+ Example 3: All pairs' y values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (null, 1) AS tab(y, x)")
+ >>> df.select(sf.regr_count("y", "x"), sf.count(sf.lit(0))).show()
+ +----------------+--------+
+ |regr_count(y, x)|count(0)|
+ +----------------+--------+
+ | 0| 1|
+ +----------------+--------+
+
+ Example 4: Some pairs' x values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 2), (2, null), (2, 3), (2, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_count("y", "x"), sf.count(sf.lit(0))).show()
+----------------+--------+
|regr_count(y, x)|count(0)|
+----------------+--------+
- | 1000| 1000|
+ | 3| 4|
+ +----------------+--------+
+
+ Example 5: Some pairs' x or y values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 2), (2, null), (null, 3), (2, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_count("y", "x"), sf.count(sf.lit(0))).show()
+ +----------------+--------+
+ |regr_count(y, x)|count(0)|
+ +----------------+--------+
+ | 2| 4|
+----------------+--------+
"""
return _invoke_function_over_columns("regr_count", y, x)
@@ -3783,16 +3912,59 @@ def regr_intercept(y: "ColumnOrName", x: "ColumnOrName") -> Column:
Examples
--------
- >>> from pyspark.sql import functions as sf
- >>> x = (sf.col("id") % 3).alias("x")
- >>> y = (sf.randn(42) + x * 10).alias("y")
- >>> spark.range(0, 1000, 1, 1).select(x, y).select(
- ... sf.regr_intercept("y", "x")
- ... ).show()
+ Example 1: All pairs are non-null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, 2), (3, 3), (4, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_intercept("y", "x")).show()
+ +--------------------+
+ |regr_intercept(y, x)|
+ +--------------------+
+ | 0.0|
+ +--------------------+
+
+ Example 2: All pairs' x values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, null) AS tab(y, x)")
+ >>> df.select(sf.regr_intercept("y", "x")).show()
+ +--------------------+
+ |regr_intercept(y, x)|
+ +--------------------+
+ | NULL|
+ +--------------------+
+
+ Example 3: All pairs' y values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (null, 1) AS tab(y, x)")
+ >>> df.select(sf.regr_intercept("y", "x")).show()
+--------------------+
|regr_intercept(y, x)|
+--------------------+
- |-0.04961745990969568|
+ | NULL|
+ +--------------------+
+
+ Example 4: Some pairs' x values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (3, 3), (4, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_intercept("y", "x")).show()
+ +--------------------+
+ |regr_intercept(y, x)|
+ +--------------------+
+ | 0.0|
+ +--------------------+
+
+ Example 5: Some pairs' x or y values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (null, 3), (4, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_intercept("y", "x")).show()
+ +--------------------+
+ |regr_intercept(y, x)|
+ +--------------------+
+ | 0.0|
+--------------------+
"""
return _invoke_function_over_columns("regr_intercept", y, x)
@@ -3820,17 +3992,60 @@ def regr_r2(y: "ColumnOrName", x: "ColumnOrName") -> Column:
Examples
--------
- >>> from pyspark.sql import functions as sf
- >>> x = (sf.col("id") % 3).alias("x")
- >>> y = (sf.randn(42) + x * 10).alias("y")
- >>> spark.range(0, 1000, 1, 1).select(x, y).select(
- ... sf.regr_r2("y", "x")
- ... ).show()
- +------------------+
- | regr_r2(y, x)|
- +------------------+
- |0.9851908293645...|
- +------------------+
+ Example 1: All pairs are non-null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, 2), (3, 3), (4, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_r2("y", "x")).show()
+ +-------------+
+ |regr_r2(y, x)|
+ +-------------+
+ | 1.0|
+ +-------------+
+
+ Example 2: All pairs' x values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, null) AS tab(y, x)")
+ >>> df.select(sf.regr_r2("y", "x")).show()
+ +-------------+
+ |regr_r2(y, x)|
+ +-------------+
+ | NULL|
+ +-------------+
+
+ Example 3: All pairs' y values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (null, 1) AS tab(y, x)")
+ >>> df.select(sf.regr_r2("y", "x")).show()
+ +-------------+
+ |regr_r2(y, x)|
+ +-------------+
+ | NULL|
+ +-------------+
+
+ Example 4: Some pairs' x values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (3, 3), (4, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_r2("y", "x")).show()
+ +-------------+
+ |regr_r2(y, x)|
+ +-------------+
+ | 1.0|
+ +-------------+
+
+ Example 5: Some pairs' x or y values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (null, 3), (4, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_r2("y", "x")).show()
+ +-------------+
+ |regr_r2(y, x)|
+ +-------------+
+ | 1.0|
+ +-------------+
"""
return _invoke_function_over_columns("regr_r2", y, x)
@@ -3857,17 +4072,60 @@ def regr_slope(y: "ColumnOrName", x: "ColumnOrName") -> Column:
Examples
--------
- >>> from pyspark.sql import functions as sf
- >>> x = (sf.col("id") % 3).alias("x")
- >>> y = (sf.randn(42) + x * 10).alias("y")
- >>> spark.range(0, 1000, 1, 1).select(x, y).select(
- ... sf.regr_slope("y", "x")
- ... ).show()
- +------------------+
- | regr_slope(y, x)|
- +------------------+
- |10.040390844891...|
- +------------------+
+ Example 1: All pairs are non-null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, 2), (3, 3), (4, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_slope("y", "x")).show()
+ +----------------+
+ |regr_slope(y, x)|
+ +----------------+
+ | 1.0|
+ +----------------+
+
+ Example 2: All pairs' x values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, null) AS tab(y, x)")
+ >>> df.select(sf.regr_slope("y", "x")).show()
+ +----------------+
+ |regr_slope(y, x)|
+ +----------------+
+ | NULL|
+ +----------------+
+
+ Example 3: All pairs' y values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (null, 1) AS tab(y, x)")
+ >>> df.select(sf.regr_slope("y", "x")).show()
+ +----------------+
+ |regr_slope(y, x)|
+ +----------------+
+ | NULL|
+ +----------------+
+
+ Example 4: Some pairs' x values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (3, 3), (4, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_slope("y", "x")).show()
+ +----------------+
+ |regr_slope(y, x)|
+ +----------------+
+ | 1.0|
+ +----------------+
+
+ Example 5: Some pairs' x or y values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (null, 3), (4, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_slope("y", "x")).show()
+ +----------------+
+ |regr_slope(y, x)|
+ +----------------+
+ | 1.0|
+ +----------------+
"""
return _invoke_function_over_columns("regr_slope", y, x)
@@ -3894,17 +4152,60 @@ def regr_sxx(y: "ColumnOrName", x: "ColumnOrName") -> Column:
Examples
--------
- >>> from pyspark.sql import functions as sf
- >>> x = (sf.col("id") % 3).alias("x")
- >>> y = (sf.randn(42) + x * 10).alias("y")
- >>> spark.range(0, 1000, 1, 1).select(x, y).select(
- ... sf.regr_sxx("y", "x")
- ... ).show()
+ Example 1: All pairs are non-null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, 2), (3, 3), (4, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_sxx("y", "x")).show()
+ +--------------+
+ |regr_sxx(y, x)|
+ +--------------+
+ | 5.0|
+ +--------------+
+
+ Example 2: All pairs' x values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, null) AS tab(y, x)")
+ >>> df.select(sf.regr_sxx("y", "x")).show()
+ +--------------+
+ |regr_sxx(y, x)|
+ +--------------+
+ | NULL|
+ +--------------+
+
+ Example 3: All pairs' y values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (null, 1) AS tab(y, x)")
+ >>> df.select(sf.regr_sxx("y", "x")).show()
+ +--------------+
+ |regr_sxx(y, x)|
+ +--------------+
+ | NULL|
+ +--------------+
+
+ Example 4: Some pairs' x values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (3, 3), (4, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_sxx("y", "x")).show()
+-----------------+
| regr_sxx(y, x)|
+-----------------+
- |666.9989999999...|
+ |4.666666666666...|
+-----------------+
+
+ Example 5: Some pairs' x or y values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (null, 3), (4, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_sxx("y", "x")).show()
+ +--------------+
+ |regr_sxx(y, x)|
+ +--------------+
+ | 4.5|
+ +--------------+
"""
return _invoke_function_over_columns("regr_sxx", y, x)
@@ -3931,17 +4232,60 @@ def regr_sxy(y: "ColumnOrName", x: "ColumnOrName") -> Column:
Examples
--------
- >>> from pyspark.sql import functions as sf
- >>> x = (sf.col("id") % 3).alias("x")
- >>> y = (sf.randn(42) + x * 10).alias("y")
- >>> spark.range(0, 1000, 1, 1).select(x, y).select(
- ... sf.regr_sxy("y", "x")
- ... ).show()
- +----------------+
- | regr_sxy(y, x)|
- +----------------+
- |6696.93065315...|
- +----------------+
+ Example 1: All pairs are non-null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, 2), (3, 3), (4, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_sxy("y", "x")).show()
+ +--------------+
+ |regr_sxy(y, x)|
+ +--------------+
+ | 5.0|
+ +--------------+
+
+ Example 2: All pairs' x values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, null) AS tab(y, x)")
+ >>> df.select(sf.regr_sxy("y", "x")).show()
+ +--------------+
+ |regr_sxy(y, x)|
+ +--------------+
+ | NULL|
+ +--------------+
+
+ Example 3: All pairs' y values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (null, 1) AS tab(y, x)")
+ >>> df.select(sf.regr_sxy("y", "x")).show()
+ +--------------+
+ |regr_sxy(y, x)|
+ +--------------+
+ | NULL|
+ +--------------+
+
+ Example 4: Some pairs' x values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (3, 3), (4, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_sxy("y", "x")).show()
+ +-----------------+
+ | regr_sxy(y, x)|
+ +-----------------+
+ |4.666666666666...|
+ +-----------------+
+
+ Example 5: Some pairs' x or y values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (null, 3), (4, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_sxy("y", "x")).show()
+ +--------------+
+ |regr_sxy(y, x)|
+ +--------------+
+ | 4.5|
+ +--------------+
"""
return _invoke_function_over_columns("regr_sxy", y, x)
@@ -3968,17 +4312,60 @@ def regr_syy(y: "ColumnOrName", x: "ColumnOrName") -> Column:
Examples
--------
- >>> from pyspark.sql import functions as sf
- >>> x = (sf.col("id") % 3).alias("x")
- >>> y = (sf.randn(42) + x * 10).alias("y")
- >>> spark.range(0, 1000, 1, 1).select(x, y).select(
- ... sf.regr_syy("y", "x")
- ... ).show()
+ Example 1: All pairs are non-null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, 2), (3, 3), (4, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_syy("y", "x")).show()
+ +--------------+
+ |regr_syy(y, x)|
+ +--------------+
+ | 5.0|
+ +--------------+
+
+ Example 2: All pairs' x values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, null) AS tab(y, x)")
+ >>> df.select(sf.regr_syy("y", "x")).show()
+ +--------------+
+ |regr_syy(y, x)|
+ +--------------+
+ | NULL|
+ +--------------+
+
+ Example 3: All pairs' y values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (null, 1) AS tab(y, x)")
+ >>> df.select(sf.regr_syy("y", "x")).show()
+ +--------------+
+ |regr_syy(y, x)|
+ +--------------+
+ | NULL|
+ +--------------+
+
+ Example 4: Some pairs' x values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (3, 3), (4, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_syy("y", "x")).show()
+-----------------+
| regr_syy(y, x)|
+-----------------+
- |68250.53503811...|
+ |4.666666666666...|
+-----------------+
+
+ Example 5: Some pairs' x or y values are null
+
+ >>> import pyspark.sql.functions as sf
+ >>> df = spark.sql("SELECT * FROM VALUES (1, 1), (2, null), (null, 3), (4, 4) AS tab(y, x)")
+ >>> df.select(sf.regr_syy("y", "x")).show()
+ +--------------+
+ |regr_syy(y, x)|
+ +--------------+
+ | 4.5|
+ +--------------+
"""
return _invoke_function_over_columns("regr_syy", y, x)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/linearRegression.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/linearRegression.scala
index 7d73cf211a6e..d82ffd12b9df 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/linearRegression.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/linearRegression.scala
@@ -29,6 +29,10 @@ import org.apache.spark.sql.types.{AbstractDataType, DataType, DoubleType, Numer
Examples:
> SELECT _FUNC_(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x);
4
+ > SELECT _FUNC_(y, x) FROM VALUES (1, null) AS tab(y, x);
+ 0
+ > SELECT _FUNC_(y, x) FROM VALUES (null, 1) AS tab(y, x);
+ 0
> SELECT _FUNC_(y, x) FROM VALUES (1, 2), (2, null), (2, 3), (2, 4) AS tab(y, x);
3
> SELECT _FUNC_(y, x) FROM VALUES (1, 2), (2, null), (null, 3), (2, 4) AS tab(y, x);
@@ -158,6 +162,10 @@ case class RegrR2(y: Expression, x: Expression) extends PearsonCorrelation(y, x,
Examples:
> SELECT _FUNC_(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x);
2.75
+ > SELECT _FUNC_(y, x) FROM VALUES (1, null) AS tab(y, x);
+ NULL
+ > SELECT _FUNC_(y, x) FROM VALUES (null, 1) AS tab(y, x);
+ NULL
> SELECT _FUNC_(y, x) FROM VALUES (1, 2), (2, null), (2, 3), (2, 4) AS tab(y, x);
2.0
> SELECT _FUNC_(y, x) FROM VALUES (1, 2), (2, null), (null, 3), (2, 4) AS tab(y, x);
@@ -189,6 +197,10 @@ case class RegrSXX(
Examples:
> SELECT _FUNC_(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x);
0.75
+ > SELECT _FUNC_(y, x) FROM VALUES (1, null) AS tab(y, x);
+ NULL
+ > SELECT _FUNC_(y, x) FROM VALUES (null, 1) AS tab(y, x);
+ NULL
> SELECT _FUNC_(y, x) FROM VALUES (1, 2), (2, null), (2, 3), (2, 4) AS tab(y, x);
1.0
> SELECT _FUNC_(y, x) FROM VALUES (1, 2), (2, null), (null, 3), (2, 4) AS tab(y, x);
@@ -214,6 +226,10 @@ case class RegrSXY(y: Expression, x: Expression) extends Covariance(y, x, true)
Examples:
> SELECT _FUNC_(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x);
0.75
+ > SELECT _FUNC_(y, x) FROM VALUES (1, null) AS tab(y, x);
+ NULL
+ > SELECT _FUNC_(y, x) FROM VALUES (null, 1) AS tab(y, x);
+ NULL
> SELECT _FUNC_(y, x) FROM VALUES (1, 2), (2, null), (2, 3), (2, 4) AS tab(y, x);
0.6666666666666666
> SELECT _FUNC_(y, x) FROM VALUES (1, 2), (2, null), (null, 3), (2, 4) AS tab(y, x);
@@ -243,12 +259,16 @@ case class RegrSYY(
usage = "_FUNC_(y, x) - Returns the slope of the linear regression line for
non-null pairs in a group, where `y` is the dependent variable and `x` is the
independent variable.",
examples = """
Examples:
- > SELECT _FUNC_(y, x) FROM VALUES (1,1), (2,2), (3,3) AS tab(y, x);
+ > SELECT _FUNC_(y, x) FROM VALUES (1, 1), (2, 2), (3, 3), (4, 4) AS tab(y, x);
1.0
> SELECT _FUNC_(y, x) FROM VALUES (1, null) AS tab(y, x);
NULL
> SELECT _FUNC_(y, x) FROM VALUES (null, 1) AS tab(y, x);
NULL
+ > SELECT _FUNC_(y, x) FROM VALUES (1, 1), (2, null), (3, 3), (4, 4) AS tab(y, x);
+ 1.0
+ > SELECT _FUNC_(y, x) FROM VALUES (1, 1), (2, null), (null, 3), (4, 4) AS tab(y, x);
+ 1.0
""",
group = "agg_funcs",
since = "3.4.0")
@@ -302,12 +322,16 @@ case class RegrSlope(left: Expression, right: Expression) extends DeclarativeAgg
usage = "_FUNC_(y, x) - Returns the intercept of the univariate linear regression line for non-null pairs in a group, where `y` is the dependent variable and `x` is the independent variable.",
examples = """
Examples:
- > SELECT _FUNC_(y, x) FROM VALUES (1,1), (2,2), (3,3) AS tab(y, x);
+ > SELECT _FUNC_(y, x) FROM VALUES (1, 1), (2, 2), (3, 3), (4, 4) AS tab(y, x);
0.0
> SELECT _FUNC_(y, x) FROM VALUES (1, null) AS tab(y, x);
NULL
> SELECT _FUNC_(y, x) FROM VALUES (null, 1) AS tab(y, x);
NULL
+ > SELECT _FUNC_(y, x) FROM VALUES (1, 1), (2, null), (3, 3), (4, 4) AS tab(y, x);
+ 0.0
+ > SELECT _FUNC_(y, x) FROM VALUES (1, 1), (2, null), (null, 3), (4, 4) AS tab(y, x);
+ 0.0
""",
group = "agg_funcs",
since = "3.4.0")
diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
index 7f0e20ba345e..88013d41a575 100644
--- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
+++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
@@ -428,12 +428,12 @@
| org.apache.spark.sql.catalyst.expressions.aggregate.RegrAvgX | regr_avgx | SELECT regr_avgx(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x) | struct<regr_avgx(y, x):double> |
| org.apache.spark.sql.catalyst.expressions.aggregate.RegrAvgY | regr_avgy | SELECT regr_avgy(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x) | struct<regr_avgy(y, x):double> |
| org.apache.spark.sql.catalyst.expressions.aggregate.RegrCount | regr_count | SELECT regr_count(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x) | struct<regr_count(y, x):bigint> |
-| org.apache.spark.sql.catalyst.expressions.aggregate.RegrIntercept | regr_intercept | SELECT regr_intercept(y, x) FROM VALUES (1,1), (2,2), (3,3) AS tab(y, x) | struct<regr_intercept(y, x):double> |
+| org.apache.spark.sql.catalyst.expressions.aggregate.RegrIntercept | regr_intercept | SELECT regr_intercept(y, x) FROM VALUES (1, 1), (2, 2), (3, 3), (4, 4) AS tab(y, x) | struct<regr_intercept(y, x):double> |
| org.apache.spark.sql.catalyst.expressions.aggregate.RegrR2 | regr_r2 | SELECT regr_r2(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x) | struct<regr_r2(y, x):double> |
| org.apache.spark.sql.catalyst.expressions.aggregate.RegrSXX | regr_sxx | SELECT regr_sxx(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x) | struct<regr_sxx(y, x):double> |
| org.apache.spark.sql.catalyst.expressions.aggregate.RegrSXY | regr_sxy | SELECT regr_sxy(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x) | struct<regr_sxy(y, x):double> |
| org.apache.spark.sql.catalyst.expressions.aggregate.RegrSYY | regr_syy | SELECT regr_syy(y, x) FROM VALUES (1, 2), (2, 2), (2, 3), (2, 4) AS tab(y, x) | struct<regr_syy(y, x):double> |
-| org.apache.spark.sql.catalyst.expressions.aggregate.RegrSlope | regr_slope | SELECT regr_slope(y, x) FROM VALUES (1,1), (2,2), (3,3) AS tab(y, x) | struct<regr_slope(y, x):double> |
+| org.apache.spark.sql.catalyst.expressions.aggregate.RegrSlope | regr_slope | SELECT regr_slope(y, x) FROM VALUES (1, 1), (2, 2), (3, 3), (4, 4) AS tab(y, x) | struct<regr_slope(y, x):double> |
| org.apache.spark.sql.catalyst.expressions.aggregate.Skewness | skewness | SELECT skewness(col) FROM VALUES (-10), (-20), (100), (1000) AS tab(col) | struct<skewness(col):double> |
| org.apache.spark.sql.catalyst.expressions.aggregate.StddevPop | stddev_pop | SELECT stddev_pop(col) FROM VALUES (1), (2), (3) AS tab(col) | struct<stddev_pop(col):double> |
| org.apache.spark.sql.catalyst.expressions.aggregate.StddevSamp | std | SELECT std(col) FROM VALUES (1), (2), (3) AS tab(col) | struct<std(col):double> |