Repository: spark
Updated Branches:
  refs/heads/master 2a3d39f48 -> 302a18686


[SPARK-11559][MLLIB] Make `runs` have no effect in mllib.KMeans

## What changes were proposed in this pull request?
We deprecated ```runs``` of mllib.KMeans in Spark 1.6 (SPARK-11358). In 2.0, 
we make it have no effect (with warning messages). We did not remove 
```setRuns/getRuns```, for better binary compatibility.
This PR changes the uses of `runs` that appear in the public API. Usage inside 
```KMeans.runAlgorithm()``` will be resolved in #10806.

## How was this patch tested?
Existing unit tests.

cc jkbradley

Author: Yanbo Liang <[email protected]>

Closes #12608 from yanboliang/spark-11559.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/302a1868
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/302a1868
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/302a1868

Branch: refs/heads/master
Commit: 302a18686998b8b96546526bfccec9cf5b667386
Parents: 2a3d39f
Author: Yanbo Liang <[email protected]>
Authored: Tue Apr 26 11:55:21 2016 -0700
Committer: Joseph K. Bradley <[email protected]>
Committed: Tue Apr 26 11:55:21 2016 -0700

----------------------------------------------------------------------
 .../spark/mllib/api/python/PythonMLLibAPI.scala |  1 -
 .../apache/spark/mllib/clustering/KMeans.scala  | 42 +++++---------------
 python/pyspark/ml/clustering.py                 |  5 +--
 python/pyspark/mllib/clustering.py              |  9 ++---
 4 files changed, 16 insertions(+), 41 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/302a1868/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
index 32dc16d..8daee7b 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala
@@ -357,7 +357,6 @@ private[python] class PythonMLLibAPI extends Serializable {
     val kMeansAlg = new KMeans()
       .setK(k)
       .setMaxIterations(maxIterations)
-      .internalSetRuns(runs)
       .setInitializationMode(initializationMode)
       .setInitializationSteps(initializationSteps)
       .setEpsilon(epsilon)

http://git-wip-us.apache.org/repos/asf/spark/blob/302a1868/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
index ff77090..60f13d2 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -32,9 +32,8 @@ import org.apache.spark.util.Utils
 import org.apache.spark.util.random.XORShiftRandom
 
 /**
- * K-means clustering with support for multiple parallel runs and a k-means++ 
like initialization
- * mode (the k-means|| algorithm by Bahmani et al). When multiple concurrent 
runs are requested,
- * they are executed together with joint passes over the data for efficiency.
+ * K-means clustering with a k-means++ like initialization mode
+ * (the k-means|| algorithm by Bahmani et al).
  *
  * This is an iterative algorithm that will make multiple passes over the 
data, so any RDDs given
  * to it should be cached by the user.
@@ -109,35 +108,20 @@ class KMeans private (
   }
 
   /**
-   * :: Experimental ::
-   * Number of runs of the algorithm to execute in parallel.
+   * This function has no effect since Spark 2.0.0.
    */
   @Since("1.4.0")
-  @deprecated("Support for runs is deprecated. This param will have no effect 
in 2.0.0.", "1.6.0")
-  def getRuns: Int = runs
+  def getRuns: Int = {
+    logWarning("Getting number of runs has no effect since Spark 2.0.0.")
+    runs
+  }
 
   /**
-   * :: Experimental ::
-   * Set the number of runs of the algorithm to execute in parallel. We 
initialize the algorithm
-   * this many times with random starting conditions (configured by the 
initialization mode), then
-   * return the best clustering found over any run. Default: 1.
+   * This function has no effect since Spark 2.0.0.
    */
   @Since("0.8.0")
-  @deprecated("Support for runs is deprecated. This param will have no effect 
in 2.0.0.", "1.6.0")
   def setRuns(runs: Int): this.type = {
-    internalSetRuns(runs)
-  }
-
-  // Internal version of setRuns for Python API, this should be removed at the 
same time as setRuns
-  // this is done to avoid deprecation warnings in our build.
-  private[mllib] def internalSetRuns(runs: Int): this.type = {
-    if (runs <= 0) {
-      throw new IllegalArgumentException("Number of runs must be positive")
-    }
-    if (runs != 1) {
-      logWarning("Setting number of runs is deprecated and will have no effect 
in 2.0.0")
-    }
-    this.runs = runs
+    logWarning("Setting number of runs has no effect since Spark 2.0.0.")
     this
   }
 
@@ -511,8 +495,7 @@ object KMeans {
    * @param data Training points as an `RDD` of `Vector` types.
    * @param k Number of clusters to create.
    * @param maxIterations Maximum number of iterations allowed.
-   * @param runs Number of runs to execute in parallel. The best model 
according to the cost
-   *             function will be returned. (default: 1)
+   * @param runs This param has no effect since Spark 2.0.0.
    * @param initializationMode The initialization algorithm. This can either 
be "random" or
    *                           "k-means||". (default: "k-means||")
    * @param seed Random seed for cluster initialization. Default is to 
generate seed based
@@ -528,7 +511,6 @@ object KMeans {
       seed: Long): KMeansModel = {
     new KMeans().setK(k)
       .setMaxIterations(maxIterations)
-      .internalSetRuns(runs)
       .setInitializationMode(initializationMode)
       .setSeed(seed)
       .run(data)
@@ -540,8 +522,7 @@ object KMeans {
    * @param data Training points as an `RDD` of `Vector` types.
    * @param k Number of clusters to create.
    * @param maxIterations Maximum number of iterations allowed.
-   * @param runs Number of runs to execute in parallel. The best model 
according to the cost
-   *             function will be returned. (default: 1)
+   * @param runs This param has no effect since Spark 2.0.0.
    * @param initializationMode The initialization algorithm. This can either 
be "random" or
    *                           "k-means||". (default: "k-means||")
    */
@@ -554,7 +535,6 @@ object KMeans {
       initializationMode: String): KMeansModel = {
     new KMeans().setK(k)
       .setMaxIterations(maxIterations)
-      .internalSetRuns(runs)
       .setInitializationMode(initializationMode)
       .run(data)
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/302a1868/python/pyspark/ml/clustering.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 4ce8012..9740ec4 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -194,9 +194,8 @@ class KMeansModel(JavaModel, JavaMLWritable, 
JavaMLReadable):
 class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, 
HasTol, HasSeed,
              JavaMLWritable, JavaMLReadable):
     """
-    K-means clustering with support for multiple parallel runs and a k-means++ 
like initialization
-    mode (the k-means|| algorithm by Bahmani et al). When multiple concurrent 
runs are requested,
-    they are executed together with joint passes over the data for efficiency.
+    K-means clustering with a k-means++ like initialization mode
+    (the k-means|| algorithm by Bahmani et al).
 
     >>> from pyspark.mllib.linalg import Vectors
     >>> data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),

http://git-wip-us.apache.org/repos/asf/spark/blob/302a1868/python/pyspark/mllib/clustering.py
----------------------------------------------------------------------
diff --git a/python/pyspark/mllib/clustering.py 
b/python/pyspark/mllib/clustering.py
index 23d118b..95f7278 100644
--- a/python/pyspark/mllib/clustering.py
+++ b/python/pyspark/mllib/clustering.py
@@ -179,7 +179,7 @@ class KMeansModel(Saveable, Loader):
 
     >>> data = array([0.0,0.0, 1.0,1.0, 9.0,8.0, 8.0,9.0]).reshape(4, 2)
     >>> model = KMeans.train(
-    ...     sc.parallelize(data), 2, maxIterations=10, runs=30, 
initializationMode="random",
+    ...     sc.parallelize(data), 2, maxIterations=10, 
initializationMode="random",
     ...                    seed=50, initializationSteps=5, epsilon=1e-4)
     >>> model.predict(array([0.0, 0.0])) == model.predict(array([1.0, 1.0]))
     True
@@ -323,9 +323,7 @@ class KMeans(object):
           Maximum number of iterations allowed.
           (default: 100)
         :param runs:
-          Number of runs to execute in parallel. The best model according
-          to the cost function will be returned (deprecated in 1.6.0).
-          (default: 1)
+          This param has no effect since Spark 2.0.0.
         :param initializationMode:
           The initialization algorithm. This can be either "random" or
           "k-means||".
@@ -350,8 +348,7 @@ class KMeans(object):
           (default: None)
         """
         if runs != 1:
-            warnings.warn(
-                "Support for runs is deprecated in 1.6.0. This param will have 
no effect in 2.0.0.")
+            warnings.warn("The param `runs` has no effect since Spark 2.0.0.")
         clusterInitialModel = []
         if initialModel is not None:
             if not isinstance(initialModel, KMeansModel):


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to