spark git commit: [SPARK-10164] [MLLIB] Fixed GMM distributed decomposition bug

jkbradley Sun, 23 Aug 2015 18:34:22 -0700

Repository: spark
Updated Branches:
  refs/heads/master c6df5f66d -> b963c19a8



[SPARK-10164] [MLLIB] Fixed GMM distributed decomposition bug

GaussianMixture now distributes matrix decompositions for certain problem 
sizes. Distributed computation actually fails, but this was not tested in unit 
tests.

This PR adds a unit test which checks this.  It failed previously but works 
with this fix.

CC: mengxr

Author: Joseph K. Bradley <jos...@databricks.com>

Closes #8370 from jkbradley/gmm-fix.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b963c19a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b963c19a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b963c19a

Branch: refs/heads/master
Commit: b963c19a803c5a26c9b65655d40ca6621acf8bd4
Parents: c6df5f6
Author: Joseph K. Bradley <jos...@databricks.com>
Authored: Sun Aug 23 18:34:07 2015 -0700
Committer: Joseph K. Bradley <jos...@databricks.com>
Committed: Sun Aug 23 18:34:07 2015 -0700

----------------------------------------------------------------------
 .../mllib/clustering/GaussianMixture.scala      | 22 +++++++++++++-------
 .../mllib/clustering/GaussianMixtureSuite.scala | 22 ++++++++++++++++++--
 2 files changed, 35 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/b963c19a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
index fcc9dfe..daa947e 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixture.scala
@@ -169,9 +169,7 @@ class GaussianMixture private (
     // Get length of the input vectors
     val d = breezeData.first().length
 
-    // Heuristic to distribute the computation of the 
[[MultivariateGaussian]]s, approximately when
-    // d > 25 except for when k is very small
-    val distributeGaussians = ((k - 1.0) / k) * d > 25
+    val shouldDistributeGaussians = 
GaussianMixture.shouldDistributeGaussians(k, d)
 
     // Determine initial weights and corresponding Gaussians.
     // If the user supplied an initial GMM, we use those values, otherwise
@@ -205,15 +203,15 @@ class GaussianMixture private (
       // (often referred to as the "M" step in literature)
       val sumWeights = sums.weights.sum
 
-      if (distributeGaussians) {
+      if (shouldDistributeGaussians) {
         val numPartitions = math.min(k, 1024)
         val tuples =
           Seq.tabulate(k)(i => (sums.means(i), sums.sigmas(i), 
sums.weights(i)))
         val (ws, gs) = sc.parallelize(tuples, numPartitions).map { case (mean, 
sigma, weight) =>
           updateWeightsAndGaussians(mean, sigma, weight, sumWeights)
-        }.collect.unzip
-        Array.copy(ws, 0, weights, 0, ws.length)
-        Array.copy(gs, 0, gaussians, 0, gs.length)
+        }.collect().unzip
+        Array.copy(ws.toArray, 0, weights, 0, ws.length)
+        Array.copy(gs.toArray, 0, gaussians, 0, gs.length)
       } else {
         var i = 0
         while (i < k) {
@@ -271,6 +269,16 @@ class GaussianMixture private (
   }
 }
 
+private[clustering] object GaussianMixture {
+  /**
+   * Heuristic to distribute the computation of the [[MultivariateGaussian]]s, 
approximately when
+   * d > 25 except for when k is very small.
+   * @param k  Number of topics
+   * @param d  Number of features
+   */
+  def shouldDistributeGaussians(k: Int, d: Int): Boolean = ((k - 1.0) / k) * d 
> 25
+}
+
 // companion class to provide zero constructor for ExpectationSum
 private object ExpectationSum {
   def zero(k: Int, d: Int): ExpectationSum = {

http://git-wip-us.apache.org/repos/asf/spark/blob/b963c19a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
index b636d02..a72723e 100644
--- 
a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
@@ -18,7 +18,7 @@
 package org.apache.spark.mllib.clustering
 
 import org.apache.spark.SparkFunSuite
-import org.apache.spark.mllib.linalg.{Vectors, Matrices}
+import org.apache.spark.mllib.linalg.{Vector, Vectors, Matrices}
 import org.apache.spark.mllib.stat.distribution.MultivariateGaussian
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
@@ -76,6 +76,20 @@ class GaussianMixtureSuite extends SparkFunSuite with 
MLlibTestSparkContext {
     assert(gmm.gaussians(1).sigma ~== Esigma(1) absTol 1E-3)
   }
 
+  test("two clusters with distributed decompositions") {
+    val data = sc.parallelize(GaussianTestData.data2, 2)
+
+    val k = 5
+    val d = data.first().size
+    assert(GaussianMixture.shouldDistributeGaussians(k, d))
+
+    val gmm = new GaussianMixture()
+      .setK(k)
+      .run(data)
+
+    assert(gmm.k === k)
+  }
+
   test("single cluster with sparse data") {
     val data = sc.parallelize(Array(
       Vectors.sparse(3, Array(0, 2), Array(4.0, 2.0)),
@@ -116,7 +130,7 @@ class GaussianMixtureSuite extends SparkFunSuite with 
MLlibTestSparkContext {
     val sparseGMM = new GaussianMixture()
       .setK(2)
       .setInitialModel(initialGmm)
-      .run(data)
+      .run(sparseData)
 
     assert(sparseGMM.weights(0) ~== Ew(0) absTol 1E-3)
     assert(sparseGMM.weights(1) ~== Ew(1) absTol 1E-3)
@@ -168,5 +182,9 @@ class GaussianMixtureSuite extends SparkFunSuite with 
MLlibTestSparkContext {
       Vectors.dense( 4.5605), Vectors.dense( 5.2043), Vectors.dense( 6.2734)
     )
 
+    val data2: Array[Vector] = Array.tabulate(25){ i: Int =>
+      Vectors.dense(Array.tabulate(50)(i + _.toDouble))
+    }
+
   }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-10164] [MLLIB] Fixed GMM distributed decomposition bug

Reply via email to