This is an automated email from the ASF dual-hosted git repository.
ruifengz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new d279dbf [SPARK-31243][ML][PYSPARK] Add ANOVATest and FValueTest to
PySpark
d279dbf is described below
commit d279dbf09c18f37a2660dbc822763ebb54e4459a
Author: Huaxin Gao <[email protected]>
AuthorDate: Fri Mar 27 14:05:49 2020 +0800
[SPARK-31243][ML][PYSPARK] Add ANOVATest and FValueTest to PySpark
### What changes were proposed in this pull request?
Add ANOVATest and FValueTest to PySpark
### Why are the changes needed?
Parity between Scala and Python.
### Does this PR introduce any user-facing change?
Yes. Python ANOVATest and FValueTest
### How was this patch tested?
doctest
Closes #28012 from huaxingao/stats-python.
Authored-by: Huaxin Gao <[email protected]>
Signed-off-by: zhengruifeng <[email protected]>
---
python/pyspark/ml/stat.py | 100 ++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 100 insertions(+)
diff --git a/python/pyspark/ml/stat.py b/python/pyspark/ml/stat.py
index 53a57af..0581469 100644
--- a/python/pyspark/ml/stat.py
+++ b/python/pyspark/ml/stat.py
@@ -411,6 +411,106 @@ class MultivariateGaussian(object):
self.cov = cov
class ANOVATest(object):
    """
    Conduct an ANOVA classification test of continuous features against
    categorical labels.

    .. versionadded:: 3.1.0
    """
    @staticmethod
    @since("3.1.0")
    def test(dataset, featuresCol, labelCol):
        """
        Run the ANOVA test on the given dataset.

        :param dataset:
          DataFrame of categorical labels and continuous features.
        :param featuresCol:
          Name of features column in dataset, of type `Vector` (`VectorUDT`).
        :param labelCol:
          Name of label column in dataset, of any numerical type.
        :return:
          DataFrame containing the test result for every feature against the label.
          This DataFrame will contain a single Row with the following fields:
          - `pValues: Vector`
          - `degreesOfFreedom: Array[Long]`
          - `fValues: Vector`
          Each of these fields has one value per feature.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import ANOVATest
        >>> dataset = [[2.0, Vectors.dense([0.43486404, 0.57153633, 0.43175686,
        ...                                 0.51418671, 0.61632374, 0.96565515])],
        ...            [1.0, Vectors.dense([0.49162732, 0.6785187, 0.85460572,
        ...                                 0.59784822, 0.12394819, 0.53783355])],
        ...            [2.0, Vectors.dense([0.30879653, 0.54904515, 0.17103889,
        ...                                 0.40492506, 0.18957493, 0.5440016])],
        ...            [3.0, Vectors.dense([0.68114391, 0.60549825, 0.69094651,
        ...                                 0.62102109, 0.05471483, 0.96449167])]]
        >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
        >>> anovaResult = ANOVATest.test(dataset, 'features', 'label')
        >>> row = anovaResult.select("fValues", "pValues").collect()
        >>> row[0].fValues
        DenseVector([4.0264, 18.4713, 3.4659, 1.9042, 0.5532, 0.512])
        >>> row[0].pValues
        DenseVector([0.3324, 0.1623, 0.3551, 0.456, 0.689, 0.7029])
        """
        # Bridge into the JVM: convert each Python argument to its Java
        # counterpart, delegate to the Scala ANOVATest, and convert the
        # resulting Java DataFrame back to a Python DataFrame.
        active_sc = SparkContext._active_spark_context
        jvm_test = _jvm().org.apache.spark.ml.stat.ANOVATest
        jdataset, jfeatures, jlabel = (
            _py2java(active_sc, value) for value in (dataset, featuresCol, labelCol))
        return _java2py(active_sc, jvm_test.test(jdataset, jfeatures, jlabel))
+
class FValueTest(object):
    """
    Conduct an F-value regression test of continuous features against
    continuous labels.

    .. versionadded:: 3.1.0
    """
    @staticmethod
    @since("3.1.0")
    def test(dataset, featuresCol, labelCol):
        """
        Run the F-value regression test on the given dataset.

        :param dataset:
          DataFrame of continuous labels and continuous features.
        :param featuresCol:
          Name of features column in dataset, of type `Vector` (`VectorUDT`).
        :param labelCol:
          Name of label column in dataset, of any numerical type.
        :return:
          DataFrame containing the test result for every feature against the label.
          This DataFrame will contain a single Row with the following fields:
          - `pValues: Vector`
          - `degreesOfFreedom: Array[Long]`
          - `fValues: Vector`
          Each of these fields has one value per feature.

        >>> from pyspark.ml.linalg import Vectors
        >>> from pyspark.ml.stat import FValueTest
        >>> dataset = [[0.57495218, Vectors.dense([0.43486404, 0.57153633, 0.43175686,
        ...                                        0.51418671, 0.61632374, 0.96565515])],
        ...            [0.84619853, Vectors.dense([0.49162732, 0.6785187, 0.85460572,
        ...                                        0.59784822, 0.12394819, 0.53783355])],
        ...            [0.39777647, Vectors.dense([0.30879653, 0.54904515, 0.17103889,
        ...                                        0.40492506, 0.18957493, 0.5440016])],
        ...            [0.79201573, Vectors.dense([0.68114391, 0.60549825, 0.69094651,
        ...                                        0.62102109, 0.05471483, 0.96449167])]]
        >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
        >>> fValueResult = FValueTest.test(dataset, 'features', 'label')
        >>> row = fValueResult.select("fValues", "pValues").collect()
        >>> row[0].fValues
        DenseVector([3.741, 7.5807, 142.0684, 34.9849, 0.4112, 0.0539])
        >>> row[0].pValues
        DenseVector([0.1928, 0.1105, 0.007, 0.0274, 0.5871, 0.838])
        """
        # Bridge into the JVM: convert each Python argument to its Java
        # counterpart, delegate to the Scala FValueTest, and convert the
        # resulting Java DataFrame back to a Python DataFrame.
        active_sc = SparkContext._active_spark_context
        jvm_test = _jvm().org.apache.spark.ml.stat.FValueTest
        jdataset, jfeatures, jlabel = (
            _py2java(active_sc, value) for value in (dataset, featuresCol, labelCol))
        return _java2py(active_sc, jvm_test.test(jdataset, jfeatures, jlabel))
+
if __name__ == "__main__":
import doctest
import numpy
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]