Repository: spark
Updated Branches:
  refs/heads/branch-1.6 d7b3d5785 -> 0a878ad0e


[SPARK-11875][ML][PYSPARK] Update doc for PySpark HasCheckpointInterval

* Update doc for PySpark ```HasCheckpointInterval``` so that users can
understand how to disable checkpointing.
* Update doc for PySpark ```cacheNodeIds``` of ```DecisionTreeParams``` to 
explain the relationship between ```cacheNodeIds``` and ```checkpointInterval```.

Author: Yanbo Liang <[email protected]>

Closes #9856 from yanboliang/spark-11875.

(cherry picked from commit 7216f405454f6f3557b5b1f72df8f393605faf60)
Signed-off-by: Xiangrui Meng <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0a878ad0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0a878ad0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0a878ad0

Branch: refs/heads/branch-1.6
Commit: 0a878ad0e422cdf00c4beedb5bea01ebba135347
Parents: d7b3d57
Author: Yanbo Liang <[email protected]>
Authored: Thu Nov 19 22:14:01 2015 -0800
Committer: Xiangrui Meng <[email protected]>
Committed: Thu Nov 19 22:14:26 2015 -0800

----------------------------------------------------------------------
 python/pyspark/ml/param/_shared_params_code_gen.py |  6 ++++--
 python/pyspark/ml/param/shared.py                  | 14 +++++++-------
 2 files changed, 11 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/0a878ad0/python/pyspark/ml/param/_shared_params_code_gen.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py 
b/python/pyspark/ml/param/_shared_params_code_gen.py
index 070c5db..0528dc1 100644
--- a/python/pyspark/ml/param/_shared_params_code_gen.py
+++ b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -118,7 +118,8 @@ if __name__ == "__main__":
         ("inputCols", "input column names.", None),
         ("outputCol", "output column name.", "self.uid + '__output'"),
         ("numFeatures", "number of features.", None),
-        ("checkpointInterval", "checkpoint interval (>= 1).", None),
+        ("checkpointInterval", "set checkpoint interval (>= 1) or disable 
checkpoint (-1). " +
+         "E.g. 10 means that the cache will get checkpointed every 10 
iterations.", None),
         ("seed", "random seed.", "hash(type(self).__name__)"),
         ("tol", "the convergence tolerance for iterative algorithms.", None),
         ("stepSize", "Step size to be used for each iteration of 
optimization.", None),
@@ -157,7 +158,8 @@ if __name__ == "__main__":
         ("maxMemoryInMB", "Maximum memory in MB allocated to histogram 
aggregation."),
         ("cacheNodeIds", "If false, the algorithm will pass trees to executors 
to match " +
          "instances with nodes. If true, the algorithm will cache node IDs for 
each instance. " +
-         "Caching can speed up training of deeper trees.")]
+         "Caching can speed up training of deeper trees. Users can set how 
often should the " +
+         "cache be checkpointed or disable it by setting checkpointInterval.")]
 
     decisionTreeCode = '''class DecisionTreeParams(Params):
     """

http://git-wip-us.apache.org/repos/asf/spark/blob/0a878ad0/python/pyspark/ml/param/shared.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/param/shared.py 
b/python/pyspark/ml/param/shared.py
index 4bdf2a8..4d96080 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -325,16 +325,16 @@ class HasNumFeatures(Params):
 
 class HasCheckpointInterval(Params):
     """
-    Mixin for param checkpointInterval: checkpoint interval (>= 1).
+    Mixin for param checkpointInterval: set checkpoint interval (>= 1) or 
disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed 
every 10 iterations.
     """
 
     # a placeholder to make it appear in the generated doc
-    checkpointInterval = Param(Params._dummy(), "checkpointInterval", 
"checkpoint interval (>= 1).")
+    checkpointInterval = Param(Params._dummy(), "checkpointInterval", "set 
checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the 
cache will get checkpointed every 10 iterations.")
 
     def __init__(self):
         super(HasCheckpointInterval, self).__init__()
-        #: param for checkpoint interval (>= 1).
-        self.checkpointInterval = Param(self, "checkpointInterval", 
"checkpoint interval (>= 1).")
+        #: param for set checkpoint interval (>= 1) or disable checkpoint 
(-1). E.g. 10 means that the cache will get checkpointed every 10 iterations.
+        self.checkpointInterval = Param(self, "checkpointInterval", "set 
checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the 
cache will get checkpointed every 10 iterations.")
 
     def setCheckpointInterval(self, value):
         """
@@ -636,7 +636,7 @@ class DecisionTreeParams(Params):
     minInstancesPerNode = Param(Params._dummy(), "minInstancesPerNode", 
"Minimum number of instances each child must have after split. If a split 
causes the left or right child to have fewer than minInstancesPerNode, the 
split will be discarded as invalid. Should be >= 1.")
     minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information 
gain for a split to be considered at a tree node.")
     maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in 
MB allocated to histogram aggregation.")
-    cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the 
algorithm will pass trees to executors to match instances with nodes. If true, 
the algorithm will cache node IDs for each instance. Caching can speed up 
training of deeper trees.")
+    cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the 
algorithm will pass trees to executors to match instances with nodes. If true, 
the algorithm will cache node IDs for each instance. Caching can speed up 
training of deeper trees. Users can set how often should the cache be 
checkpointed or disable it by setting checkpointInterval.")
     
 
     def __init__(self):
@@ -651,8 +651,8 @@ class DecisionTreeParams(Params):
         self.minInfoGain = Param(self, "minInfoGain", "Minimum information 
gain for a split to be considered at a tree node.")
         #: param for Maximum memory in MB allocated to histogram aggregation.
         self.maxMemoryInMB = Param(self, "maxMemoryInMB", "Maximum memory in 
MB allocated to histogram aggregation.")
-        #: param for If false, the algorithm will pass trees to executors to 
match instances with nodes. If true, the algorithm will cache node IDs for each 
instance. Caching can speed up training of deeper trees.
-        self.cacheNodeIds = Param(self, "cacheNodeIds", "If false, the 
algorithm will pass trees to executors to match instances with nodes. If true, 
the algorithm will cache node IDs for each instance. Caching can speed up 
training of deeper trees.")
+        #: param for If false, the algorithm will pass trees to executors to 
match instances with nodes. If true, the algorithm will cache node IDs for each 
instance. Caching can speed up training of deeper trees. Users can set how 
often should the cache be checkpointed or disable it by setting 
checkpointInterval.
+        self.cacheNodeIds = Param(self, "cacheNodeIds", "If false, the 
algorithm will pass trees to executors to match instances with nodes. If true, 
the algorithm will cache node IDs for each instance. Caching can speed up 
training of deeper trees. Users can set how often should the cache be 
checkpointed or disable it by setting checkpointInterval.")
         
     def setMaxDepth(self, value):
         """


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to