Repository: spark Updated Branches: refs/heads/master 0b713e045 -> 234f781ae
[SPARK-13787][ML][PYSPARK] Pyspark feature importances for decision tree and random forest ## What changes were proposed in this pull request? This patch adds a `featureImportances` property to the Pyspark API for `DecisionTreeRegressionModel`, `DecisionTreeClassificationModel`, `RandomForestRegressionModel` and `RandomForestClassificationModel`. ## How was this patch tested? Python doc tests for the affected classes were updated to check feature importances. Author: sethah <[email protected]> Closes #11622 from sethah/SPARK-13787. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/234f781a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/234f781a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/234f781a Branch: refs/heads/master Commit: 234f781ae19370ce1ab485e364a6fdab7ed2598c Parents: 0b713e0 Author: sethah <[email protected]> Authored: Fri Mar 11 09:54:23 2016 +0200 Committer: Nick Pentreath <[email protected]> Committed: Fri Mar 11 09:54:23 2016 +0200 ---------------------------------------------------------------------- python/pyspark/ml/classification.py | 44 ++++++++++++++++++++++++++++++++ python/pyspark/ml/regression.py | 44 ++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/234f781a/python/pyspark/ml/classification.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 29d1d20..ec8834a 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -285,6 +285,8 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred 3 >>> model.depth 1 + >>> model.featureImportances + SparseVector(1, {0: 1.0}) >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) 
>>> result = model.transform(test0).head() >>> result.prediction @@ -352,6 +354,27 @@ class DecisionTreeClassificationModel(DecisionTreeModel): .. versionadded:: 1.4.0 """ + @property + @since("2.0.0") + def featureImportances(self): + """ + Estimate of the importance of each feature. + + This generalizes the idea of "Gini" importance to other losses, + following the explanation of Gini importance from "Random Forests" documentation + by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn. + + This feature importance is calculated as follows: + - importance(feature j) = sum (over nodes which split on feature j) of the gain, + where gain is scaled by the number of instances passing through node + - Normalize importances for tree to sum to 1. + + Note: Feature importance for single decision trees can have high variance due to + correlated predictor variables. Consider using a :class:`RandomForestClassifier` + to determine feature importance instead. + """ + return self._call_java("featureImportances") + @inherit_doc class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasSeed, @@ -375,6 +398,8 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred >>> td = si_model.transform(df) >>> rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed", seed=42) >>> model = rf.fit(td) + >>> model.featureImportances + SparseVector(1, {0: 1.0}) >>> allclose(model.treeWeights, [1.0, 1.0, 1.0]) True >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) @@ -443,6 +468,25 @@ class RandomForestClassificationModel(TreeEnsembleModels): .. versionadded:: 1.4.0 """ + @property + @since("2.0.0") + def featureImportances(self): + """ + Estimate of the importance of each feature. 
+ + This generalizes the idea of "Gini" importance to other losses, + following the explanation of Gini importance from "Random Forests" documentation + by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn. + + This feature importance is calculated as follows: + - Average over trees: + - importance(feature j) = sum (over nodes which split on feature j) of the gain, + where gain is scaled by the number of instances passing through node + - Normalize importances for tree to sum to 1. + - Normalize feature importance vector to sum to 1. + """ + return self._call_java("featureImportances") + @inherit_doc class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, http://git-wip-us.apache.org/repos/asf/spark/blob/234f781a/python/pyspark/ml/regression.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 6b994fe..6e23393 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -401,6 +401,8 @@ class DecisionTreeRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi 1 >>> model.numNodes 3 + >>> model.featureImportances + SparseVector(1, {0: 1.0}) >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) >>> model.transform(test0).head().prediction 0.0 @@ -499,6 +501,27 @@ class DecisionTreeRegressionModel(DecisionTreeModel): .. versionadded:: 1.4.0 """ + @property + @since("2.0.0") + def featureImportances(self): + """ + Estimate of the importance of each feature. + + This generalizes the idea of "Gini" importance to other losses, + following the explanation of Gini importance from "Random Forests" documentation + by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn. 
+ + This feature importance is calculated as follows: + - importance(feature j) = sum (over nodes which split on feature j) of the gain, + where gain is scaled by the number of instances passing through node + - Normalize importances for tree to sum to 1. + + Note: Feature importance for single decision trees can have high variance due to + correlated predictor variables. Consider using a :class:`RandomForestRegressor` + to determine feature importance instead. + """ + return self._call_java("featureImportances") + @inherit_doc class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasSeed, @@ -515,6 +538,8 @@ class RandomForestRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredi ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) >>> rf = RandomForestRegressor(numTrees=2, maxDepth=2, seed=42) >>> model = rf.fit(df) + >>> model.featureImportances + SparseVector(1, {0: 1.0}) >>> allclose(model.treeWeights, [1.0, 1.0]) True >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) @@ -579,6 +604,25 @@ class RandomForestRegressionModel(TreeEnsembleModels): .. versionadded:: 1.4.0 """ + @property + @since("2.0.0") + def featureImportances(self): + """ + Estimate of the importance of each feature. + + This generalizes the idea of "Gini" importance to other losses, + following the explanation of Gini importance from "Random Forests" documentation + by Leo Breiman and Adele Cutler, and following the implementation from scikit-learn. + + This feature importance is calculated as follows: + - Average over trees: + - importance(feature j) = sum (over nodes which split on feature j) of the gain, + where gain is scaled by the number of instances passing through node + - Normalize importances for tree to sum to 1. + - Normalize feature importance vector to sum to 1. 
+ """ + return self._call_java("featureImportances") + @inherit_doc class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
