Repository: spark Updated Branches: refs/heads/master fdd460f5f -> 585097716
[SPARK-14107][PYSPARK][ML] Add seed as named argument to GBTs in pyspark ## What changes were proposed in this pull request? GBTs in pyspark previously had seed parameters, but they could not be passed as keyword arguments through the class constructor. This patch adds seed as a keyword argument and also sets a default value. ## How was this patch tested? Doc tests were updated to pass a random seed through the GBTClassifier and GBTRegressor constructors. Author: sethah <[email protected]> Closes #11944 from sethah/SPARK-14107. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/58509771 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/58509771 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/58509771 Branch: refs/heads/master Commit: 585097716c1979ea538ef733cf33225ef7be06f5 Parents: fdd460f Author: sethah <[email protected]> Authored: Thu Mar 24 19:14:24 2016 -0700 Committer: Xiangrui Meng <[email protected]> Committed: Thu Mar 24 19:14:24 2016 -0700 ---------------------------------------------------------------------- python/pyspark/ml/classification.py | 12 ++++++------ python/pyspark/ml/regression.py | 13 +++++++------ 2 files changed, 13 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/58509771/python/pyspark/ml/classification.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index fdeccf8..850d775 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -520,7 +520,7 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol >>> stringIndexer = StringIndexer(inputCol="label", outputCol="indexed") >>> si_model = stringIndexer.fit(df) >>> td = si_model.transform(df) - >>> gbt = GBTClassifier(maxIter=5, 
maxDepth=2, labelCol="indexed") + >>> gbt = GBTClassifier(maxIter=5, maxDepth=2, labelCol="indexed", seed=42) >>> model = gbt.fit(td) >>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1]) True @@ -543,19 +543,19 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, lossType="logistic", - maxIter=20, stepSize=0.1): + maxIter=20, stepSize=0.1, seed=None): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, \ - lossType="logistic", maxIter=20, stepSize=0.1) + lossType="logistic", maxIter=20, stepSize=0.1, seed=None) """ super(GBTClassifier, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.classification.GBTClassifier", self.uid) self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - lossType="logistic", maxIter=20, stepSize=0.1) + lossType="logistic", maxIter=20, stepSize=0.1, seed=None) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -564,12 +564,12 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, - lossType="logistic", maxIter=20, stepSize=0.1): + lossType="logistic", maxIter=20, stepSize=0.1, seed=None): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, 
cacheNodeIds=False, checkpointInterval=10, \ - lossType="logistic", maxIter=20, stepSize=0.1) + lossType="logistic", maxIter=20, stepSize=0.1, seed=None) Sets params for Gradient Boosted Tree Classification. """ kwargs = self.setParams._input_kwargs http://git-wip-us.apache.org/repos/asf/spark/blob/58509771/python/pyspark/ml/regression.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/regression.py b/python/pyspark/ml/regression.py index 8982608..59d4fe3 100644 --- a/python/pyspark/ml/regression.py +++ b/python/pyspark/ml/regression.py @@ -641,7 +641,7 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, >>> df = sqlContext.createDataFrame([ ... (1.0, Vectors.dense(1.0)), ... (0.0, Vectors.sparse(1, [], []))], ["label", "features"]) - >>> gbt = GBTRegressor(maxIter=5, maxDepth=2) + >>> gbt = GBTRegressor(maxIter=5, maxDepth=2, seed=42) >>> model = gbt.fit(df) >>> allclose(model.treeWeights, [1.0, 0.1, 0.1, 0.1, 0.1]) True @@ -664,18 +664,19 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, - checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1): + checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, \ - checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1) + checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None) """ super(GBTRegressor, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.regression.GBTRegressor", self.uid) 
self._setDefault(maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, - checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1) + checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, + seed=None) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -684,12 +685,12 @@ class GBTRegressor(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, - checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1): + checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, subsamplingRate=1.0, \ - checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1) + checkpointInterval=10, lossType="squared", maxIter=20, stepSize=0.1, seed=None) Sets params for Gradient Boosted Tree Regression. """ kwargs = self.setParams._input_kwargs --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
