spark git commit: [SPARK-9122] [MLLIB] [PySpark] spark.mllib regression support batch predict

2015-07-23 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 8a94eb23d -> 52de3acca


[SPARK-9122] [MLLIB] [PySpark] spark.mllib regression support batch predict

spark.mllib now supports batch prediction for LinearRegressionModel, 
RidgeRegressionModel, and LassoModel: `predict` accepts an RDD of feature 
vectors in addition to a single vector.

Author: Yanbo Liang 

Closes #7614 from yanboliang/spark-9122 and squashes the following commits:

4e610c0 [Yanbo Liang] spark.mllib regression support batch predict


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/52de3acc
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/52de3acc
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/52de3acc

Branch: refs/heads/master
Commit: 52de3acca4ce8c36fd4c9ce162473a091701bbc7
Parents: 8a94eb2
Author: Yanbo Liang 
Authored: Thu Jul 23 18:53:07 2015 -0700
Committer: Joseph K. Bradley 
Committed: Thu Jul 23 18:53:07 2015 -0700

--
 python/pyspark/mllib/regression.py | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/52de3acc/python/pyspark/mllib/regression.py
--
diff --git a/python/pyspark/mllib/regression.py 
b/python/pyspark/mllib/regression.py
index 8e90ade..5b7afc1 100644
--- a/python/pyspark/mllib/regression.py
+++ b/python/pyspark/mllib/regression.py
@@ -97,9 +97,11 @@ class LinearRegressionModelBase(LinearModel):
 
 def predict(self, x):
 """
-Predict the value of the dependent variable given a vector x
-containing values for the independent variables.
+Predict the value of the dependent variable given a vector or
+an RDD of vectors containing values for the independent variables.
 """
+if isinstance(x, RDD):
+return x.map(self.predict)
 x = _convert_to_vector(x)
 return self.weights.dot(x) + self.intercept
 
@@ -124,6 +126,8 @@ class LinearRegressionModel(LinearRegressionModelBase):
 True
 >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
 True
+>>> abs(lrm.predict(sc.parallelize([[1.0]])).collect()[0] - 1) < 0.5
+True
 >>> import os, tempfile
 >>> path = tempfile.mkdtemp()
 >>> lrm.save(sc, path)
@@ -267,6 +271,8 @@ class LassoModel(LinearRegressionModelBase):
 True
 >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
 True
+>>> abs(lrm.predict(sc.parallelize([[1.0]])).collect()[0] - 1) < 0.5
+True
 >>> import os, tempfile
 >>> path = tempfile.mkdtemp()
 >>> lrm.save(sc, path)
@@ -382,6 +388,8 @@ class RidgeRegressionModel(LinearRegressionModelBase):
 True
 >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5
 True
+>>> abs(lrm.predict(sc.parallelize([[1.0]])).collect()[0] - 1) < 0.5
+True
 >>> import os, tempfile
 >>> path = tempfile.mkdtemp()
 >>> lrm.save(sc, path)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-8092] [ML] Allow OneVsRest Classifier feature and label column names to be configurable.

2015-07-23 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master d249636e5 -> d4d762f27


[SPARK-8092] [ML] Allow OneVsRest Classifier feature and label column names to 
be configurable.

The base classifier input and output columns are ignored in favor of  the ones 
specified in OneVsRest.
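
A minimal sketch of the new setters, assuming a DataFrame `training` with columns 
"indexedLabel" and "indexedFeatures" (the label column carrying nominal metadata):

```
import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest}

val ova = new OneVsRest()
  .setClassifier(new LogisticRegression())
  .setLabelCol("indexedLabel")        // forwarded to the base classifier at fit time
  .setFeaturesCol("indexedFeatures")
  .setPredictionCol("ovaPrediction")
val ovaModel = ova.fit(training)
```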

Author: Ram Sriharsha 

Closes #6631 from harsha2010/SPARK-8092 and squashes the following commits:

6591dc6 [Ram Sriharsha] add documentation for params
b7024b1 [Ram Sriharsha] cleanup
f0e2bfb [Ram Sriharsha] merge with master
108d3d7 [Ram Sriharsha] merge with master
4f74126 [Ram Sriharsha] Allow label/ features columns to be configurable


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d4d762f2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d4d762f2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d4d762f2

Branch: refs/heads/master
Commit: d4d762f275749a923356cd84de549b14c22cc3eb
Parents: d249636
Author: Ram Sriharsha 
Authored: Thu Jul 23 22:35:41 2015 -0700
Committer: Joseph K. Bradley 
Committed: Thu Jul 23 22:35:41 2015 -0700

--
 .../spark/ml/classification/OneVsRest.scala | 17 +-
 .../ml/classification/OneVsRestSuite.scala  | 24 
 2 files changed, 40 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d4d762f2/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala 
b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
index ea757c5..1741f19 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
@@ -47,6 +47,8 @@ private[ml] trait OneVsRestParams extends PredictorParams {
 
   /**
* param for the base binary classifier that we reduce multiclass 
classification into.
+   * The base classifier input and output columns are ignored in favor of
+   * the ones specified in [[OneVsRest]].
* @group param
*/
   val classifier: Param[ClassifierType] = new Param(this, "classifier", "base 
binary classifier")
@@ -160,6 +162,15 @@ final class OneVsRest(override val uid: String)
 set(classifier, value.asInstanceOf[ClassifierType])
   }
 
+  /** @group setParam */
+  def setLabelCol(value: String): this.type = set(labelCol, value)
+
+  /** @group setParam */
+  def setFeaturesCol(value: String): this.type = set(featuresCol, value)
+
+  /** @group setParam */
+  def setPredictionCol(value: String): this.type = set(predictionCol, value)
+
   override def transformSchema(schema: StructType): StructType = {
 validateAndTransformSchema(schema, fitting = true, 
getClassifier.featuresDataType)
   }
@@ -195,7 +206,11 @@ final class OneVsRest(override val uid: String)
   val labelUDFWithNewMeta = labelUDF(col($(labelCol))).as(labelColName, 
newLabelMeta)
   val trainingDataset = multiclassLabeled.withColumn(labelColName, 
labelUDFWithNewMeta)
   val classifier = getClassifier
-  classifier.fit(trainingDataset, classifier.labelCol -> labelColName)
+  val paramMap = new ParamMap()
+  paramMap.put(classifier.labelCol -> labelColName)
+  paramMap.put(classifier.featuresCol -> getFeaturesCol)
+  paramMap.put(classifier.predictionCol -> getPredictionCol)
+  classifier.fit(trainingDataset, paramMap)
 }.toArray[ClassificationModel[_, _]]
 
 if (handlePersistence) {

http://git-wip-us.apache.org/repos/asf/spark/blob/d4d762f2/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
index 75cf5bd..3775292 100644
--- 
a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
@@ -19,6 +19,7 @@ package org.apache.spark.ml.classification
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.attribute.NominalAttribute
+import org.apache.spark.ml.feature.StringIndexer
 import org.apache.spark.ml.param.{ParamMap, ParamsSuite}
 import org.apache.spark.ml.util.MetadataUtils
 import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
@@ -104,6 +105,29 @@ class OneVsRestSuite extends SparkFunSuite with 
MLlibTestSparkContext {
 ova.fit(datasetWithLabelMetadata)
   }
 
+  test("SPARK-8092: ensure label features and prediction cols are 
configurable") {
+val labelIndexer = new StringIndexer()
+  .setInputCol("label")

spark git commit: [SPARK-9222] [MLlib] Make class instantiation variables in DistributedLDAModel private[clustering]

2015-07-24 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master c2b50d693 -> e25312451


[SPARK-9222] [MLlib] Make class instantiation variables in DistributedLDAModel 
private[clustering]

This makes it easier to test all the class variables of the DistributedLDAModel.

Author: MechCoder 

Closes #7573 from MechCoder/lda_test and squashes the following commits:

2f1a293 [MechCoder] [SPARK-9222] [MLlib] Make class instantiation variables in 
DistributedLDAModel private[clustering]


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e2531245
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e2531245
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e2531245

Branch: refs/heads/master
Commit: e25312451322969ad716dddf8248b8c17f68323b
Parents: c2b50d6
Author: MechCoder 
Authored: Fri Jul 24 10:56:48 2015 -0700
Committer: Joseph K. Bradley 
Committed: Fri Jul 24 10:56:48 2015 -0700

--
 .../org/apache/spark/mllib/clustering/LDAModel.scala |  8 
 .../org/apache/spark/mllib/clustering/LDASuite.scala | 15 +++
 2 files changed, 19 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e2531245/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 920b577..31c1d52 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -283,12 +283,12 @@ object LocalLDAModel extends Loader[LocalLDAModel] {
  */
 @Experimental
 class DistributedLDAModel private (
-private val graph: Graph[LDA.TopicCounts, LDA.TokenCount],
-private val globalTopicTotals: LDA.TopicCounts,
+private[clustering] val graph: Graph[LDA.TopicCounts, LDA.TokenCount],
+private[clustering] val globalTopicTotals: LDA.TopicCounts,
 val k: Int,
 val vocabSize: Int,
-private val docConcentration: Double,
-private val topicConcentration: Double,
+private[clustering] val docConcentration: Double,
+private[clustering] val topicConcentration: Double,
 private[spark] val iterationTimes: Array[Double]) extends LDAModel {
 
   import LDA._

http://git-wip-us.apache.org/repos/asf/spark/blob/e2531245/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index da70d9b..376a87f 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -20,6 +20,7 @@ package org.apache.spark.mllib.clustering
 import breeze.linalg.{DenseMatrix => BDM}
 
 import org.apache.spark.SparkFunSuite
+import org.apache.spark.graphx.Edge
 import org.apache.spark.mllib.linalg.{DenseMatrix, Matrix, Vector, Vectors}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
@@ -318,6 +319,20 @@ class LDASuite extends SparkFunSuite with 
MLlibTestSparkContext {
   assert(distributedModel.k === sameDistributedModel.k)
   assert(distributedModel.vocabSize === sameDistributedModel.vocabSize)
   assert(distributedModel.iterationTimes === 
sameDistributedModel.iterationTimes)
+  assert(distributedModel.docConcentration === 
sameDistributedModel.docConcentration)
+  assert(distributedModel.topicConcentration === 
sameDistributedModel.topicConcentration)
+  assert(distributedModel.globalTopicTotals === 
sameDistributedModel.globalTopicTotals)
+
+  val graph = distributedModel.graph
+  val sameGraph = sameDistributedModel.graph
+  assert(graph.vertices.sortByKey().collect() === 
sameGraph.vertices.sortByKey().collect())
+  val edge = graph.edges.map {
+case Edge(sid: Long, did: Long, nos: Double) => (sid, did, nos)
+  }.sortBy(x => (x._1, x._2)).collect()
+  val sameEdge = sameGraph.edges.map {
+case Edge(sid: Long, did: Long, nos: Double) => (sid, did, nos)
+  }.sortBy(x => (x._1, x._2)).collect()
+  assert(edge === sameEdge)
 } finally {
   Utils.deleteRecursively(tempDir1)
   Utils.deleteRecursively(tempDir2)


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-7045] [MLLIB] Avoid intermediate representation when creating model

2015-07-24 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 64135cbb3 -> a400ab516


[SPARK-7045] [MLLIB] Avoid intermediate representation when creating model

Word2Vec used to convert from an Array[Float] representation to a Map[String, 
Array[Float]] and then back to an Array[Float] inside Word2VecModel.

This change avoids that round trip while still supporting the older way of 
supplying a Map.
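
With the new layout, a word's vector occupies the slice (i * vectorSize, (i + 1) * 
vectorSize) of the flat `wordVectors` array, where i comes from `wordIndex`. A small 
sketch of that lookup (not the model's own API, just an illustration of the layout):

```
// Sketch of the flat-array lookup implied by the new representation.
def vectorOf(word: String,
             wordIndex: Map[String, Int],
             wordVectors: Array[Float],
             vectorSize: Int): Option[Array[Float]] =
  wordIndex.get(word).map { i =>
    // The vector for index i is the slice [i * vectorSize, (i + 1) * vectorSize).
    java.util.Arrays.copyOfRange(wordVectors, i * vectorSize, (i + 1) * vectorSize)
  }
```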

Author: MechCoder 

Closes #5748 from MechCoder/spark-7045 and squashes the following commits:

e308913 [MechCoder] move docs
5703116 [MechCoder] minor
fa04313 [MechCoder] style fixes
b1d61c4 [MechCoder] better errors and tests
3b32c8c [MechCoder] [SPARK-7045] Avoid intermediate representation when 
creating model


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a400ab51
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a400ab51
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a400ab51

Branch: refs/heads/master
Commit: a400ab516fa93185aa683a596f9d7c6c1a02f330
Parents: 64135cb
Author: MechCoder 
Authored: Fri Jul 24 14:58:07 2015 -0700
Committer: Joseph K. Bradley 
Committed: Fri Jul 24 14:58:07 2015 -0700

--
 .../apache/spark/mllib/feature/Word2Vec.scala   | 85 +++-
 .../spark/mllib/feature/Word2VecSuite.scala |  6 ++
 2 files changed, 55 insertions(+), 36 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a400ab51/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
index f087d06..cbbd2b0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala
@@ -403,17 +403,8 @@ class Word2Vec extends Serializable with Logging {
 }
 newSentences.unpersist()
 
-val word2VecMap = mutable.HashMap.empty[String, Array[Float]]
-var i = 0
-while (i < vocabSize) {
-  val word = bcVocab.value(i).word
-  val vector = new Array[Float](vectorSize)
-  Array.copy(syn0Global, i * vectorSize, vector, 0, vectorSize)
-  word2VecMap += word -> vector
-  i += 1
-}
-
-new Word2VecModel(word2VecMap.toMap)
+val wordArray = vocab.map(_.word)
+new Word2VecModel(wordArray.zipWithIndex.toMap, syn0Global)
   }
 
   /**
@@ -429,38 +420,42 @@ class Word2Vec extends Serializable with Logging {
 /**
  * :: Experimental ::
  * Word2Vec model
+ * @param wordIndex maps each word to an index, which can retrieve the 
corresponding
+ *  vector from wordVectors
+ * @param wordVectors array of length numWords * vectorSize, vector 
corresponding
+ *to the word mapped with index i can be retrieved by the 
slice
+ *(i * vectorSize, i * vectorSize + vectorSize)
  */
 @Experimental
-class Word2VecModel private[spark] (
-model: Map[String, Array[Float]]) extends Serializable with Saveable {
-
-  // wordList: Ordered list of words obtained from model.
-  private val wordList: Array[String] = model.keys.toArray
-
-  // wordIndex: Maps each word to an index, which can retrieve the 
corresponding
-  //vector from wordVectors (see below).
-  private val wordIndex: Map[String, Int] = wordList.zip(0 until 
model.size).toMap
+class Word2VecModel private[mllib] (
+private val wordIndex: Map[String, Int],
+private val wordVectors: Array[Float]) extends Serializable with Saveable {
 
-  // vectorSize: Dimension of each word's vector.
-  private val vectorSize = model.head._2.size
   private val numWords = wordIndex.size
+  // vectorSize: Dimension of each word's vector.
+  private val vectorSize = wordVectors.length / numWords
+
+  // wordList: Ordered list of words obtained from wordIndex.
+  private val wordList: Array[String] = {
+val (wl, _) = wordIndex.toSeq.sortBy(_._2).unzip
+wl.toArray
+  }
 
-  // wordVectors: Array of length numWords * vectorSize, vector corresponding 
to the word
-  //  mapped with index i can be retrieved by the slice
-  //  (ind * vectorSize, ind * vectorSize + vectorSize)
   // wordVecNorms: Array of length numWords, each value being the Euclidean 
norm
   //   of the wordVector.
-  private val (wordVectors: Array[Float], wordVecNorms: Array[Double]) = {
-val wordVectors = new Array[Float](vectorSize * numWords)
+  private val wordVecNorms: Array[Double] = {
 val wordVecNorms = new Array[Double](numWords)
 var i = 0
 while (i < numWords) {
-  val vec = model.get(wordList(i)).get
-  Array.copy(vec, 0, wordVectors, i * vectorSize, vectorSize)
+  val vec = wordVectors.slic

spark git commit: [SPARK-6793] [MLLIB] OnlineLDAOptimizer LDA perplexity

2015-07-29 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 1b0099fc6 -> 2cc212d56


[SPARK-6793] [MLLIB] OnlineLDAOptimizer LDA perplexity

Implements `logPerplexity` in `OnlineLDAOptimizer`. Also refactors inference 
code into companion object to enable future reuse (e.g. `predict` method).
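
A usage sketch, assuming a term-count corpus `corpus: RDD[(Long, Vector)]` and that 
the bound is read off the LocalLDAModel produced by the online optimizer:

```
import org.apache.spark.mllib.clustering.{LDA, LocalLDAModel, OnlineLDAOptimizer}

// `corpus` is an RDD[(Long, Vector)] of (document ID, term-count vector).
val lda = new LDA().setK(10).setOptimizer(new OnlineLDAOptimizer())
val model = lda.run(corpus).asInstanceOf[LocalLDAModel]
// Variational bound on log perplexity, evaluated here on the training corpus.
val bound = model.logPerplexity(corpus)
```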

Author: Feynman Liang 

Closes #7705 from feynmanliang/SPARK-6793-perplexity and squashes the following 
commits:

6da2c99 [Feynman Liang] Remove get* from LDAModel public API
8381da6 [Feynman Liang] Code review comments
17f7000 [Feynman Liang] Documentation typo fixes
2f452a4 [Feynman Liang] Remove auxiliary DistributedLDAModel constructor
a275914 [Feynman Liang] Prevent empty counts calls to variationalInference
06d02d9 [Feynman Liang] Remove deprecated LocalLDAModel constructor
afecb46 [Feynman Liang] Fix regression bug in sstats accumulator
5a327a0 [Feynman Liang] Code review quick fixes
998c03e [Feynman Liang] Fix style
1cbb67d [Feynman Liang] Fix access modifier bug
4362daa [Feynman Liang] Organize imports
4f171f7 [Feynman Liang] Fix indentation
2f049ce [Feynman Liang] Fix failing save/load tests
7415e96 [Feynman Liang] Pick changes from big PR
11e7c33 [Feynman Liang] Merge remote-tracking branch 'apache/master' into 
SPARK-6793-perplexity
f8adc48 [Feynman Liang] Add logPerplexity, refactor variationalBound into a 
method
cd521d6 [Feynman Liang] Refactor methods into companion class
7f62a55 [Feynman Liang] --amend
c62cb1e [Feynman Liang] Outer product for stats, revert Range slicing
aead650 [Feynman Liang] Range slice, in-place update, reduce transposes


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2cc212d5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2cc212d5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2cc212d5

Branch: refs/heads/master
Commit: 2cc212d56a1d50fe68d5816f71b27803de1f6389
Parents: 1b0099f
Author: Feynman Liang 
Authored: Wed Jul 29 16:20:20 2015 -0700
Committer: Joseph K. Bradley 
Committed: Wed Jul 29 16:20:20 2015 -0700

--
 .../spark/mllib/clustering/LDAModel.scala   | 200 +++
 .../spark/mllib/clustering/LDAOptimizer.scala   | 138 +++--
 .../spark/mllib/clustering/LDAUtils.scala   |  55 +
 .../spark/mllib/clustering/JavaLDASuite.java|   6 +-
 .../spark/mllib/clustering/LDASuite.scala   |  53 -
 5 files changed, 348 insertions(+), 104 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/2cc212d5/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 31c1d52..059b52e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -17,10 +17,9 @@
 
 package org.apache.spark.mllib.clustering
 
-import breeze.linalg.{DenseMatrix => BDM, normalize, sum => brzSum, 
DenseVector => BDV}
-
+import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, normalize, sum}
+import breeze.numerics.{exp, lgamma}
 import org.apache.hadoop.fs.Path
-
 import org.json4s.DefaultFormats
 import org.json4s.JsonDSL._
 import org.json4s.jackson.JsonMethods._
@@ -28,14 +27,13 @@ import org.json4s.jackson.JsonMethods._
 import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.java.JavaPairRDD
-import org.apache.spark.graphx.{VertexId, Edge, EdgeContext, Graph}
-import org.apache.spark.mllib.linalg.{Vectors, Vector, Matrices, Matrix, 
DenseVector}
-import org.apache.spark.mllib.util.{Saveable, Loader}
+import org.apache.spark.graphx.{Edge, EdgeContext, Graph, VertexId}
+import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors}
+import org.apache.spark.mllib.util.{Loader, Saveable}
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{SQLContext, Row}
+import org.apache.spark.sql.{Row, SQLContext}
 import org.apache.spark.util.BoundedPriorityQueue
 
-
 /**
  * :: Experimental ::
  *
@@ -54,6 +52,31 @@ abstract class LDAModel private[clustering] extends Saveable 
{
   def vocabSize: Int
 
   /**
+   * Concentration parameter (commonly named "alpha") for the prior placed on 
documents'
+   * distributions over topics ("theta").
+   *
+   * This is the parameter to a Dirichlet distribution.
+   */
+  def docConcentration: Vector
+
+  /**
+   * Concentration parameter (commonly named "beta" or "eta") for the prior 
placed on topics'
+   * distributions over terms.
+   *
+   * This is the parameter to a symmetric Dirichlet distribution.
+   *
+   * Note: The topics' distributions over terms are called "beta" in the 
orig

spark git commit: [SPARK-9016] [ML] make random forest classifiers implement classification trait

2015-07-29 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 103d8cce7 -> 37c2d1927


[SPARK-9016] [ML] make random forest classifiers implement classification trait

Implement the classification trait for RandomForestClassifiers. The plan is to 
use this in the future to provide thresholding for RandomForestClassifiers 
(as well as other classifiers that implement that trait).
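
Because the fitted model now extends ClassificationModel, it exposes `numClasses` 
and a raw-prediction column; a rough sketch, assuming DataFrames `train` and `test` 
whose "label" column carries class metadata (e.g. from StringIndexer):

```
import org.apache.spark.ml.classification.RandomForestClassifier

val rf = new RandomForestClassifier().setNumTrees(3)
val model = rf.fit(train)
println(model.numClasses)           // available via the ClassificationModel trait
val scored = model.transform(test)  // includes rawPrediction alongside prediction
```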

Author: Holden Karau 

Closes #7432 from 
holdenk/SPARK-9016-make-random-forest-classifiers-implement-classification-trait
 and squashes the following commits:

bf22fa6 [Holden Karau] Add missing imports for testing suite
e948f0d [Holden Karau] Check the prediction generation from rawPrediction
25320c3 [Holden Karau] Don't supply numClasses when not needed, assert model 
classes are as expected
1a67e04 [Holden Karau] Use old decision tree stuff instead
673e0c3 [Holden Karau] Merge branch 'master' into 
SPARK-9016-make-random-forest-classifiers-implement-classification-trait
0d15b96 [Holden Karau] Fix typo
5eafad4 [Holden Karau] add a constructor for rootnode + num classes
fc6156f [Holden Karau] scala style fix
2597915 [Holden Karau] take num classes in constructor
3ccfe4a [Holden Karau] Merge in master, make pass numClasses through 
randomforest for training
222a10b [Holden Karau] Increase numtrees to 3 in the python test since before 
the two were equal and the argmax was selecting the last one
16aea1c [Holden Karau] Make tests match the new models
b454a02 [Holden Karau] Make the Tree classifiers extends the Classifier base 
class
77b4114 [Holden Karau] Import vectors lib


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/37c2d192
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/37c2d192
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/37c2d192

Branch: refs/heads/master
Commit: 37c2d1927cebdd19a14c054f670cb0fb9a263586
Parents: 103d8cc
Author: Holden Karau 
Authored: Wed Jul 29 18:18:29 2015 -0700
Committer: Joseph K. Bradley 
Committed: Wed Jul 29 18:18:29 2015 -0700

--
 .../classification/RandomForestClassifier.scala | 30 +++-
 .../RandomForestClassifierSuite.scala   | 18 +---
 python/pyspark/ml/classification.py |  4 +--
 3 files changed, 32 insertions(+), 20 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/37c2d192/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
index fc0693f..bc19bd6 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
@@ -25,7 +25,7 @@ import org.apache.spark.ml.{PredictionModel, Predictor}
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.ml.tree.{DecisionTreeModel, RandomForestParams, 
TreeClassifierParams, TreeEnsembleModel}
 import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
-import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo}
 import org.apache.spark.mllib.tree.model.{RandomForestModel => 
OldRandomForestModel}
@@ -43,7 +43,7 @@ import org.apache.spark.sql.types.DoubleType
  */
 @Experimental
 final class RandomForestClassifier(override val uid: String)
-  extends Predictor[Vector, RandomForestClassifier, 
RandomForestClassificationModel]
+  extends Classifier[Vector, RandomForestClassifier, 
RandomForestClassificationModel]
   with RandomForestParams with TreeClassifierParams {
 
   def this() = this(Identifiable.randomUID("rfc"))
@@ -98,7 +98,7 @@ final class RandomForestClassifier(override val uid: String)
 val trees =
   RandomForest.run(oldDataset, strategy, getNumTrees, 
getFeatureSubsetStrategy, getSeed)
 .map(_.asInstanceOf[DecisionTreeClassificationModel])
-new RandomForestClassificationModel(trees)
+new RandomForestClassificationModel(trees, numClasses)
   }
 
   override def copy(extra: ParamMap): RandomForestClassifier = 
defaultCopy(extra)
@@ -125,8 +125,9 @@ object RandomForestClassifier {
 @Experimental
 final class RandomForestClassificationModel private[ml] (
 override val uid: String,
-private val _trees: Array[DecisionTreeClassificationModel])
-  extends PredictionModel[Vector, RandomForestClassificationModel]
+private val _trees: Array[DecisionTreeClassificationModel],
+override val numClasses: Int)
+  extends ClassificationModel[Vector, RandomFo

spark git commit: [SPARK-9440] [MLLIB] Add hyperparameters to LocalLDAModel save/load

2015-07-29 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 2a9fe4a4e -> a200e6456


[SPARK-9440] [MLLIB] Add hyperparameters to LocalLDAModel save/load

jkbradley MechCoder

Resolves blocking issue for SPARK-6793. Please review after #7705 is merged.
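
A round-trip sketch of the effect (assuming `model` is a LocalLDAModel and `sc` the 
SparkContext); the reloaded model keeps the saved hyperparameters instead of falling 
back to defaults:

```
import org.apache.spark.mllib.clustering.LocalLDAModel

model.save(sc, "/tmp/lda-model")
val sameModel = LocalLDAModel.load(sc, "/tmp/lda-model")
// docConcentration and topicConcentration are now restored from the metadata.
assert(sameModel.docConcentration == model.docConcentration)
assert(sameModel.topicConcentration == model.topicConcentration)
```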

Author: Feynman Liang 

Closes #7757 from feynmanliang/SPARK-9940-localSaveLoad and squashes the 
following commits:

d0d8cf4 [Feynman Liang] Fix thisClassName
0f30109 [Feynman Liang] Fix tests after changing LDAModel public API
dc61981 [Feynman Liang] Add hyperparams to LocalLDAModel save/load


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a200e645
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a200e645
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a200e645

Branch: refs/heads/master
Commit: a200e64561c8803731578267df16906f6773cbea
Parents: 2a9fe4a
Author: Feynman Liang 
Authored: Wed Jul 29 19:02:15 2015 -0700
Committer: Joseph K. Bradley 
Committed: Wed Jul 29 19:02:15 2015 -0700

--
 .../spark/mllib/clustering/LDAModel.scala   | 40 ++--
 .../spark/mllib/clustering/LDASuite.scala   |  6 ++-
 2 files changed, 33 insertions(+), 13 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a200e645/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 059b52e..ece2884 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -215,7 +215,8 @@ class LocalLDAModel private[clustering] (
   override protected def formatVersion = "1.0"
 
   override def save(sc: SparkContext, path: String): Unit = {
-LocalLDAModel.SaveLoadV1_0.save(sc, path, topicsMatrix)
+LocalLDAModel.SaveLoadV1_0.save(sc, path, topicsMatrix, docConcentration, 
topicConcentration,
+  gammaShape)
   }
   // TODO
   // override def logLikelihood(documents: RDD[(Long, Vector)]): Double = ???
@@ -312,16 +313,23 @@ object LocalLDAModel extends Loader[LocalLDAModel] {
 // as a Row in data.
 case class Data(topic: Vector, index: Int)
 
-// TODO: explicitly save docConcentration, topicConcentration, and 
gammaShape for use in
-// model.predict()
-def save(sc: SparkContext, path: String, topicsMatrix: Matrix): Unit = {
+def save(
+sc: SparkContext,
+path: String,
+topicsMatrix: Matrix,
+docConcentration: Vector,
+topicConcentration: Double,
+gammaShape: Double): Unit = {
   val sqlContext = SQLContext.getOrCreate(sc)
   import sqlContext.implicits._
 
   val k = topicsMatrix.numCols
   val metadata = compact(render
 (("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~
-  ("k" -> k) ~ ("vocabSize" -> topicsMatrix.numRows)))
+  ("k" -> k) ~ ("vocabSize" -> topicsMatrix.numRows) ~
+  ("docConcentration" -> docConcentration.toArray.toSeq) ~
+  ("topicConcentration" -> topicConcentration) ~
+  ("gammaShape" -> gammaShape)))
   sc.parallelize(Seq(metadata), 
1).saveAsTextFile(Loader.metadataPath(path))
 
   val topicsDenseMatrix = topicsMatrix.toBreeze.toDenseMatrix
@@ -331,7 +339,12 @@ object LocalLDAModel extends Loader[LocalLDAModel] {
   sc.parallelize(topics, 1).toDF().write.parquet(Loader.dataPath(path))
 }
 
-def load(sc: SparkContext, path: String): LocalLDAModel = {
+def load(
+sc: SparkContext,
+path: String,
+docConcentration: Vector,
+topicConcentration: Double,
+gammaShape: Double): LocalLDAModel = {
   val dataPath = Loader.dataPath(path)
   val sqlContext = SQLContext.getOrCreate(sc)
   val dataFrame = sqlContext.read.parquet(dataPath)
@@ -348,8 +361,7 @@ object LocalLDAModel extends Loader[LocalLDAModel] {
   val topicsMat = Matrices.fromBreeze(brzTopics)
 
   // TODO: initialize with docConcentration, topicConcentration, and 
gammaShape after SPARK-9940
-  new LocalLDAModel(topicsMat,
-Vectors.dense(Array.fill(topicsMat.numRows)(1.0 / topicsMat.numRows)), 
1D, 100D)
+  new LocalLDAModel(topicsMat, docConcentration, topicConcentration, 
gammaShape)
 }
   }
 
@@ -358,11 +370,15 @@ object LocalLDAModel extends Loader[LocalLDAModel] {
 implicit val formats = DefaultFormats
 val expectedK = (metadata \ "k").extract[Int]
 val expectedVocabSize = (metadata \ "vocabSize").ex

spark git commit: [SPARK-5567] [MLLIB] Add predict method to LocalLDAModel

2015-07-30 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master a20e743fb -> d8cfd531c


[SPARK-5567] [MLLIB] Add predict method to LocalLDAModel

jkbradley hhbyyh

Adds `topicDistributions` to LocalLDAModel. Please review after #7757 is merged.
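
A usage sketch, assuming `model` is a LocalLDAModel and `docs` an RDD of 
(doc ID, term-count vector) pairs; empty documents come back as zero vectors:

```
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD

val topicMixtures: RDD[(Long, Vector)] = model.topicDistributions(docs)
topicMixtures.take(3).foreach { case (id, theta) => println(s"$id -> $theta") }
```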

Author: Feynman Liang 

Closes #7760 from feynmanliang/SPARK-5567-predict-in-LDA and squashes the 
following commits:

0ad1134 [Feynman Liang] Remove println
27b3877 [Feynman Liang] Code review fixes
6bfb87c [Feynman Liang] Remove extra newline
476f788 [Feynman Liang] Fix checks and doc for variationalInference
061780c [Feynman Liang] Code review cleanup
3be2947 [Feynman Liang] Rename topicDistribution -> topicDistributions
2a821a6 [Feynman Liang] Add predict methods to LocalLDAModel


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d8cfd531
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d8cfd531
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d8cfd531

Branch: refs/heads/master
Commit: d8cfd531c7c50c9b00ab546be458f44f84c386ae
Parents: a20e743
Author: Feynman Liang 
Authored: Thu Jul 30 13:17:54 2015 -0700
Committer: Joseph K. Bradley 
Committed: Thu Jul 30 13:17:54 2015 -0700

--
 .../spark/mllib/clustering/LDAModel.scala   | 42 +++--
 .../spark/mllib/clustering/LDAOptimizer.scala   |  5 +-
 .../spark/mllib/clustering/LDASuite.scala   | 63 
 3 files changed, 102 insertions(+), 8 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d8cfd531/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index ece2884..6cfad3f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -186,7 +186,6 @@ abstract class LDAModel private[clustering] extends 
Saveable {
  * This model stores only the inferred topics.
  * It may be used for computing topics for new documents, but it may give less 
accurate answers
  * than the [[DistributedLDAModel]].
- *
  * @param topics Inferred topics (vocabSize x k matrix).
  */
 @Experimental
@@ -221,9 +220,6 @@ class LocalLDAModel private[clustering] (
   // TODO
   // override def logLikelihood(documents: RDD[(Long, Vector)]): Double = ???
 
-  // TODO:
-  // override def topicDistributions(documents: RDD[(Long, Vector)]): 
RDD[(Long, Vector)] = ???
-
   /**
* Calculate the log variational bound on perplexity. See Equation (16) in 
original Online
* LDA paper.
@@ -269,7 +265,7 @@ class LocalLDAModel private[clustering] (
 // by topic (columns of lambda)
 val Elogbeta = LDAUtils.dirichletExpectation(lambda.t).t
 
-var score = documents.filter(_._2.numActives > 0).map { case (id: Long, 
termCounts: Vector) =>
+var score = documents.filter(_._2.numNonzeros > 0).map { case (id: Long, 
termCounts: Vector) =>
   var docScore = 0.0D
   val (gammad: BDV[Double], _) = 
OnlineLDAOptimizer.variationalTopicInference(
 termCounts, exp(Elogbeta), brzAlpha, gammaShape, k)
@@ -277,7 +273,7 @@ class LocalLDAModel private[clustering] (
 
   // E[log p(doc | theta, beta)]
   termCounts.foreachActive { case (idx, count) =>
-docScore += LDAUtils.logSumExp(Elogthetad + Elogbeta(idx, ::).t)
+docScore += count * LDAUtils.logSumExp(Elogthetad + Elogbeta(idx, 
::).t)
   }
   // E[log p(theta | alpha) - log q(theta | gamma)]; assumes alpha is a 
vector
   docScore += sum((brzAlpha - gammad) :* Elogthetad)
@@ -297,6 +293,40 @@ class LocalLDAModel private[clustering] (
 score
   }
 
+  /**
+   * Predicts the topic mixture distribution for each document (often called 
"theta" in the
+   * literature).  Returns a vector of zeros for an empty document.
+   *
+   * This uses a variational approximation following Hoffman et al. (2010), 
where the approximate
+   * distribution is called "gamma."  Technically, this method returns this 
approximation "gamma"
+   * for each document.
+   * @param documents documents to predict topic mixture distributions for
+   * @return An RDD of (document ID, topic mixture distribution for document)
+   */
+  // TODO: declare in LDAModel and override once implemented in 
DistributedLDAModel
+  def topicDistributions(documents: RDD[(Long, Vector)]): RDD[(Long, Vector)] 
= {
+// Double transpose because dirichletExpectation normalizes by row and we 
need to normalize
+// by topic (columns of lambda)
+val expElogbeta = 
exp(LDAUtils.dirichletExpectation(topicsMatrix.toBree

spark git commit: [SPARK-9454] Change LDASuite tests to use vector comparisons

2015-07-30 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 1abf7dc16 -> 89cda69ec


[SPARK-9454] Change LDASuite tests to use vector comparisons

jkbradley Replaces the current hacky string comparison with tolerance-based vector comparisons.
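
The rewritten checks lean on the existing `~==` / `absTol` matcher from 
`org.apache.spark.mllib.util.TestingUtils` (test scope only); a minimal sketch 
of the pattern:

```
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.TestingUtils._

// Compare vectors with an absolute tolerance instead of comparing formatted strings.
val expected = Vectors.dense(1.1101, 1.2076, 1.3050)
val actual = Vectors.dense(1.1102, 1.2075, 1.3049)
assert(actual ~== expected absTol 0.01)
```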

Author: Feynman Liang 

Closes #7775 from feynmanliang/SPARK-9454-ldasuite-vector-compare and squashes 
the following commits:

bd91a82 [Feynman Liang] Remove println
905c76e [Feynman Liang] Fix string compare in distributed EM
2f24c13 [Feynman Liang] Improve LDASuite tests


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/89cda69e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/89cda69e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/89cda69e

Branch: refs/heads/master
Commit: 89cda69ecd5ef942a68ad13fc4e1f4184010f087
Parents: 1abf7dc
Author: Feynman Liang 
Authored: Thu Jul 30 14:08:59 2015 -0700
Committer: Joseph K. Bradley 
Committed: Thu Jul 30 14:08:59 2015 -0700

--
 .../spark/mllib/clustering/LDASuite.scala   | 33 +---
 1 file changed, 14 insertions(+), 19 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/89cda69e/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index d74482d..c43e1e5 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -83,21 +83,14 @@ class LDASuite extends SparkFunSuite with 
MLlibTestSparkContext {
 assert(model.topicsMatrix === localModel.topicsMatrix)
 
 // Check: topic summaries
-//  The odd decimal formatting and sorting is a hack to do a robust 
comparison.
-val roundedTopicSummary = model.describeTopics().map { case (terms, 
termWeights) =>
-  // cut values to 3 digits after the decimal place
-  terms.zip(termWeights).map { case (term, weight) =>
-("%.3f".format(weight).toDouble, term.toInt)
-  }
-}.sortBy(_.mkString(""))
-val roundedLocalTopicSummary = localModel.describeTopics().map { case 
(terms, termWeights) =>
-  // cut values to 3 digits after the decimal place
-  terms.zip(termWeights).map { case (term, weight) =>
-("%.3f".format(weight).toDouble, term.toInt)
-  }
-}.sortBy(_.mkString(""))
-roundedTopicSummary.zip(roundedLocalTopicSummary).foreach { case (t1, t2) 
=>
-  assert(t1 === t2)
+val topicSummary = model.describeTopics().map { case (terms, termWeights) 
=>
+  Vectors.sparse(tinyVocabSize, terms, termWeights)
+}.sortBy(_.toString)
+val localTopicSummary = localModel.describeTopics().map { case (terms, 
termWeights) =>
+  Vectors.sparse(tinyVocabSize, terms, termWeights)
+}.sortBy(_.toString)
+topicSummary.zip(localTopicSummary).foreach { case (topics, topicsLocal) =>
+  assert(topics ~== topicsLocal absTol 0.01)
 }
 
 // Check: per-doc topic distributions
@@ -197,10 +190,12 @@ class LDASuite extends SparkFunSuite with 
MLlibTestSparkContext {
 
 // verify the result, Note this generate the identical result as
 // [[https://github.com/Blei-Lab/onlineldavb]]
-val topic1 = op.getLambda(0, 
::).inner.toArray.map("%.4f".format(_)).mkString(", ")
-val topic2 = op.getLambda(1, 
::).inner.toArray.map("%.4f".format(_)).mkString(", ")
-assert("1.1101, 1.2076, 1.3050, 0.8899, 0.7924, 0.6950" == topic1)
-assert("0.8899, 0.7924, 0.6950, 1.1101, 1.2076, 1.3050" == topic2)
+val topic1: Vector = Vectors.fromBreeze(op.getLambda(0, ::).t)
+val topic2: Vector = Vectors.fromBreeze(op.getLambda(1, ::).t)
+val expectedTopic1 = Vectors.dense(1.1101, 1.2076, 1.3050, 0.8899, 0.7924, 
0.6950)
+val expectedTopic2 = Vectors.dense(0.8899, 0.7924, 0.6950, 1.1101, 1.2076, 
1.3050)
+assert(topic1 ~== expectedTopic1 absTol 0.01)
+assert(topic2 ~== expectedTopic2 absTol 0.01)
   }
 
   test("OnlineLDAOptimizer with toy data") {


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-6684] [MLLIB] [ML] Add checkpointing to GBTs

2015-07-30 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 7f7a319c4 -> be7be6d4c


[SPARK-6684] [MLLIB] [ML] Add checkpointing to GBTs

Add checkpointing to GradientBoostedTrees, GBTClassifier, GBTRegressor

CC: mengxr
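
A usage sketch, assuming `sc` is the SparkContext and `data` an RDD[LabeledPoint]; 
the checkpoint interval is read from the tree Strategy (via `getCheckpointInterval` 
in the patch):

```
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy

// A checkpoint directory must be set for checkpointing to take effect.
sc.setCheckpointDir("/tmp/gbt-checkpoints")
val boostingStrategy = BoostingStrategy.defaultParams("Regression")
boostingStrategy.treeStrategy.checkpointInterval = 10  // checkpoint every 10 iterations
val model = GradientBoostedTrees.train(data, boostingStrategy)
```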

Author: Joseph K. Bradley 

Closes #7804 from jkbradley/gbt-checkpoint3 and squashes the following commits:

3fbd7ba [Joseph K. Bradley] tiny fix
b3e160c [Joseph K. Bradley] unset checkpoint dir after test
9cc3a04 [Joseph K. Bradley] added checkpointing to GBTs


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/be7be6d4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/be7be6d4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/be7be6d4

Branch: refs/heads/master
Commit: be7be6d4c7d978c20e601d1f5f56ecb3479814cb
Parents: 7f7a319
Author: Joseph K. Bradley 
Authored: Thu Jul 30 16:04:23 2015 -0700
Committer: Joseph K. Bradley 
Committed: Thu Jul 30 16:04:23 2015 -0700

--
 .../spark/mllib/clustering/LDAOptimizer.scala   |  1 +
 .../spark/mllib/tree/GradientBoostedTrees.scala | 48 ++--
 .../tree/configuration/BoostingStrategy.scala   |  3 +-
 .../ml/classification/GBTClassifierSuite.scala  | 20 +
 .../spark/ml/regression/GBTRegressorSuite.scala | 20 -
 .../mllib/tree/GradientBoostedTreesSuite.scala  | 79 
 6 files changed, 114 insertions(+), 57 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/be7be6d4/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index 9dbec41..d6f8b29 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -144,6 +144,7 @@ final class EMLDAOptimizer extends LDAOptimizer {
 this.checkpointInterval = lda.getCheckpointInterval
 this.graphCheckpointer = new PeriodicGraphCheckpointer[TopicCounts, 
TokenCount](
   checkpointInterval, graph.vertices.sparkContext)
+this.graphCheckpointer.update(this.graph)
 this.globalTopicTotals = computeGlobalTopicTotals()
 this
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/be7be6d4/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
index a835f96..9ce6faa 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala
@@ -20,6 +20,7 @@ package org.apache.spark.mllib.tree
 import org.apache.spark.Logging
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.java.JavaRDD
+import org.apache.spark.mllib.impl.PeriodicRDDCheckpointer
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.configuration.BoostingStrategy
 import org.apache.spark.mllib.tree.configuration.Algo._
@@ -184,22 +185,28 @@ object GradientBoostedTrees extends Logging {
   false
 }
 
+// Prepare periodic checkpointers
+val predErrorCheckpointer = new PeriodicRDDCheckpointer[(Double, Double)](
+  treeStrategy.getCheckpointInterval, input.sparkContext)
+val validatePredErrorCheckpointer = new PeriodicRDDCheckpointer[(Double, 
Double)](
+  treeStrategy.getCheckpointInterval, input.sparkContext)
+
 timer.stop("init")
 
 logDebug("##")
 logDebug("Building tree 0")
 logDebug("##")
-var data = input
 
 // Initialize tree
 timer.start("building tree 0")
-val firstTreeModel = new DecisionTree(treeStrategy).run(data)
+val firstTreeModel = new DecisionTree(treeStrategy).run(input)
 val firstTreeWeight = 1.0
 baseLearners(0) = firstTreeModel
 baseLearnerWeights(0) = firstTreeWeight
 
 var predError: RDD[(Double, Double)] = GradientBoostedTreesModel.
   computeInitialPredictionAndError(input, firstTreeWeight, firstTreeModel, 
loss)
+predErrorCheckpointer.update(predError)
 logDebug("error of gbt = " + predError.values.mean())
 
 // Note: A model of type regression is used since we require raw prediction
@@ -207,35 +214,34 @@ object GradientBoostedTrees extends Logging {
 
 var validatePredError: RDD[(Double, Double)] = GradientBoostedTreesModel.
   computeInitialPredictionAndError(validationI

spark git commit: [SPARK-9077] [MLLIB] Improve error message for decision trees when numExamples < maxCategoriesPerFeature

2015-07-30 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 351eda0e2 -> 65fa4181c


[SPARK-9077] [MLLIB] Improve error message for decision trees when numExamples 
< maxCategoriesPerFeature

Improve error message when number of examples is less than arity of high-arity 
categorical feature

CC jkbradley is this about what you had in mind? I know it's a starter, but was 
on my list to close out in the short term.

Author: Sean Owen 

Closes #7800 from srowen/SPARK-9077 and squashes the following commits:

b8f6cdb [Sean Owen] Improve error message when number of examples is less than 
arity of high-arity categorical feature


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/65fa4181
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/65fa4181
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/65fa4181

Branch: refs/heads/master
Commit: 65fa4181c35135080870c1e4c1f904ada3a8cf59
Parents: 351eda0
Author: Sean Owen 
Authored: Thu Jul 30 17:26:18 2015 -0700
Committer: Joseph K. Bradley 
Committed: Thu Jul 30 17:26:18 2015 -0700

--
 .../apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala  | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/65fa4181/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala
index 380291a..9fe2646 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala
@@ -128,9 +128,13 @@ private[spark] object DecisionTreeMetadata extends Logging 
{
 // based on the number of training examples.
 if (strategy.categoricalFeaturesInfo.nonEmpty) {
   val maxCategoriesPerFeature = strategy.categoricalFeaturesInfo.values.max
+  val maxCategory =
+strategy.categoricalFeaturesInfo.find(_._2 == 
maxCategoriesPerFeature).get._1
   require(maxCategoriesPerFeature <= maxPossibleBins,
-s"DecisionTree requires maxBins (= $maxPossibleBins) >= max categories 
" +
-  s"in categorical features (= $maxCategoriesPerFeature)")
+s"DecisionTree requires maxBins (= $maxPossibleBins) to be at least as 
large as the " +
+s"number of values in each categorical feature, but categorical 
feature $maxCategory " +
+s"has $maxCategoriesPerFeature values. Considering remove this and 
other categorical " +
+"features with a large number of values, or add more training 
examples.")
 }
 
 val unorderedFeatures = new mutable.HashSet[Int]()


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-7690] [ML] Multiclass classification Evaluator

2015-07-30 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 83670fc9e -> 4e5919bfb


[SPARK-7690] [ML] Multiclass classification Evaluator

Multiclass Classification Evaluator for ML Pipelines. F1 score, precision, 
recall, weighted precision and weighted recall are supported as available 
metrics.
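
A usage sketch, assuming `predictions` is a DataFrame with Double-typed 
"prediction" and "label" columns:

```
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

val evaluator = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("weightedRecall")  // default metric is "f1"
val score = evaluator.evaluate(predictions)
```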

Author: Ram Sriharsha 

Closes #7475 from harsha2010/SPARK-7690 and squashes the following commits:

9bf4ec7 [Ram Sriharsha] fix indentation
3f09a85 [Ram Sriharsha] cleanup doc
16115ae [Ram Sriharsha] code review fixes
032d2a3 [Ram Sriharsha] fix test
eec9865 [Ram Sriharsha] Fix Python Indentation
1dbeffd [Ram Sriharsha] Merge branch 'master' into SPARK-7690
68cea85 [Ram Sriharsha] Merge branch 'master' into SPARK-7690
54c03de [Ram Sriharsha] [SPARK-7690][ml][WIP] Multiclass Evaluator for ML 
Pipeline


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4e5919bf
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4e5919bf
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4e5919bf

Branch: refs/heads/master
Commit: 4e5919bfb47a58bcbda90ae01c1bed2128ded983
Parents: 83670fc
Author: Ram Sriharsha 
Authored: Thu Jul 30 23:02:11 2015 -0700
Committer: Joseph K. Bradley 
Committed: Thu Jul 30 23:02:11 2015 -0700

--
 .../MulticlassClassificationEvaluator.scala | 85 
 ...MulticlassClassificationEvaluatorSuite.scala | 28 +++
 python/pyspark/ml/evaluation.py | 66 +++
 3 files changed, 179 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4e5919bf/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala
new file mode 100644
index 000..44f779c
--- /dev/null
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.ml.evaluation
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.ml.param.{ParamMap, ParamValidators, Param}
+import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
+import org.apache.spark.ml.util.{SchemaUtils, Identifiable}
+import org.apache.spark.mllib.evaluation.MulticlassMetrics
+import org.apache.spark.sql.{Row, DataFrame}
+import org.apache.spark.sql.types.DoubleType
+
+/**
+ * :: Experimental ::
+ * Evaluator for multiclass classification, which expects two input columns: 
score and label.
+ */
+@Experimental
+class MulticlassClassificationEvaluator (override val uid: String)
+  extends Evaluator with HasPredictionCol with HasLabelCol {
+
+  def this() = this(Identifiable.randomUID("mcEval"))
+
+  /**
+   * param for metric name in evaluation (supports `"f1"` (default), 
`"precision"`, `"recall"`,
+   * `"weightedPrecision"`, `"weightedRecall"`)
+   * @group param
+   */
+  val metricName: Param[String] = {
+val allowedParams = ParamValidators.inArray(Array("f1", "precision",
+  "recall", "weightedPrecision", "weightedRecall"))
+new Param(this, "metricName", "metric name in evaluation " +
+  "(f1|precision|recall|weightedPrecision|weightedRecall)", allowedParams)
+  }
+
+  /** @group getParam */
+  def getMetricName: String = $(metricName)
+
+  /** @group setParam */
+  def setMetricName(value: String): this.type = set(metricName, value)
+
+  /** @group setParam */
+  def setPredictionCol(value: String): this.type = set(predictionCol, value)
+
+  /** @group setParam */
+  def setLabelCol(value: String): this.type = set(labelCol, value)
+
+  setDefault(metricName -> "f1")
+
+  override def evaluate(dataset: DataFrame): Double = {
+val schema = dataset.schema
+SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
+SchemaUtils.checkColumnType(schema, $(labe

spark git commit: [SPARK-9214] [ML] [PySpark] support ml.NaiveBayes for Python

2015-07-30 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 4e5919bfb -> 69b62f76f


[SPARK-9214] [ML] [PySpark] support ml.NaiveBayes for Python

support ml.NaiveBayes for Python
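
On the Scala side, the smoothing parameter is now called `smoothing` (formerly 
`lambda`); a small sketch, assuming `train` is a DataFrame with "label" and 
"features" columns:

```
import org.apache.spark.ml.classification.NaiveBayes

val nb = new NaiveBayes().setSmoothing(0.5).setModelType("multinomial")
val model = nb.fit(train)
```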

Author: Yanbo Liang 

Closes #7568 from yanboliang/spark-9214 and squashes the following commits:

5ee3fd6 [Yanbo Liang] fix typos
3ecd046 [Yanbo Liang] fix typos
f9c94d1 [Yanbo Liang] change lambda_ to smoothing and fix other issues
180452a [Yanbo Liang] fix typos
7dda1f4 [Yanbo Liang] support ml.NaiveBayes for Python


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/69b62f76
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/69b62f76
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/69b62f76

Branch: refs/heads/master
Commit: 69b62f76fced18efa35a107c9be4bc22eba72878
Parents: 4e5919b
Author: Yanbo Liang 
Authored: Thu Jul 30 23:03:48 2015 -0700
Committer: Joseph K. Bradley 
Committed: Thu Jul 30 23:03:48 2015 -0700

--
 .../spark/ml/classification/NaiveBayes.scala|  10 +-
 .../ml/classification/JavaNaiveBayesSuite.java  |   4 +-
 .../ml/classification/NaiveBayesSuite.scala |   6 +-
 python/pyspark/ml/classification.py | 116 ++-
 4 files changed, 125 insertions(+), 11 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/69b62f76/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala 
b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
index 1f547e4..5be35fe 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
@@ -38,11 +38,11 @@ private[ml] trait NaiveBayesParams extends PredictorParams {
* (default = 1.0).
* @group param
*/
-  final val lambda: DoubleParam = new DoubleParam(this, "lambda", "The 
smoothing parameter.",
+  final val smoothing: DoubleParam = new DoubleParam(this, "smoothing", "The 
smoothing parameter.",
 ParamValidators.gtEq(0))
 
   /** @group getParam */
-  final def getLambda: Double = $(lambda)
+  final def getSmoothing: Double = $(smoothing)
 
   /**
* The model type which is a string (case-sensitive).
@@ -79,8 +79,8 @@ class NaiveBayes(override val uid: String)
* Default is 1.0.
* @group setParam
*/
-  def setLambda(value: Double): this.type = set(lambda, value)
-  setDefault(lambda -> 1.0)
+  def setSmoothing(value: Double): this.type = set(smoothing, value)
+  setDefault(smoothing -> 1.0)
 
   /**
* Set the model type using a string (case-sensitive).
@@ -92,7 +92,7 @@ class NaiveBayes(override val uid: String)
 
   override protected def train(dataset: DataFrame): NaiveBayesModel = {
 val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset)
-val oldModel = OldNaiveBayes.train(oldDataset, $(lambda), $(modelType))
+val oldModel = OldNaiveBayes.train(oldDataset, $(smoothing), $(modelType))
 NaiveBayesModel.fromOld(oldModel, this)
   }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/69b62f76/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java
--
diff --git 
a/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java
 
b/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java
index 09a9fba..a700c9c 100644
--- 
a/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java
+++ 
b/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java
@@ -68,7 +68,7 @@ public class JavaNaiveBayesSuite implements Serializable {
 assert(nb.getLabelCol() == "label");
 assert(nb.getFeaturesCol() == "features");
 assert(nb.getPredictionCol() == "prediction");
-assert(nb.getLambda() == 1.0);
+assert(nb.getSmoothing() == 1.0);
 assert(nb.getModelType() == "multinomial");
   }
 
@@ -89,7 +89,7 @@ public class JavaNaiveBayesSuite implements Serializable {
 });
 
 DataFrame dataset = jsql.createDataFrame(jrdd, schema);
-NaiveBayes nb = new 
NaiveBayes().setLambda(0.5).setModelType("multinomial");
+NaiveBayes nb = new 
NaiveBayes().setSmoothing(0.5).setModelType("multinomial");
 NaiveBayesModel model = nb.fit(dataset);
 
 DataFrame predictionAndLabels = 
model.transform(dataset).select("prediction", "label");

http://git-wip-us.apache.org/repos/asf/spark/blob/69b62f76/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/classificati

spark git commit: [SPARK-9231] [MLLIB] DistributedLDAModel method for top topics per document

2015-07-31 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 6add4eddb -> 4011a9471


[SPARK-9231] [MLLIB] DistributedLDAModel method for top topics per document

jira: https://issues.apache.org/jira/browse/SPARK-9231

Helper method in DistributedLDAModel of this form:
```
/**
 * For each document, return the top k weighted topics for that document.
 * return RDD of (doc ID, topic indices, topic weights)
 */
def topTopicsPerDocument(k: Int): RDD[(Long, Array[Int], Array[Double])]
```
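A minimal usage sketch, assuming `distLDAModel` is an already-trained DistributedLDAModel (the variable name and k=3 are illustrative, not part of the patch):
```
// Returns an RDD of (doc ID, topic indices, topic weights).
val topTopics = distLDAModel.topTopicsPerDocument(3)
topTopics.take(5).foreach { case (docId, topicIndices, topicWeights) =>
  println(s"doc $docId -> topics ${topicIndices.mkString(",")} " +
    s"weights ${topicWeights.mkString(",")}")
}
```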

Author: Yuhao Yang 

Closes #7785 from hhbyyh/topTopicsPerdoc and squashes the following commits:

30ad153 [Yuhao Yang] small fix
fd24580 [Yuhao Yang] add topTopics per document to DistributedLDAModel


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4011a947
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4011a947
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4011a947

Branch: refs/heads/master
Commit: 4011a947154d97a9ffb5a71f077481a12534d36b
Parents: 6add4ed
Author: Yuhao Yang 
Authored: Fri Jul 31 11:50:15 2015 -0700
Committer: Joseph K. Bradley 
Committed: Fri Jul 31 11:50:15 2015 -0700

--
 .../apache/spark/mllib/clustering/LDAModel.scala | 19 ++-
 .../apache/spark/mllib/clustering/LDASuite.scala | 13 -
 2 files changed, 30 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4011a947/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 6cfad3f..82281a0 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.mllib.clustering
 
-import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, normalize, sum}
+import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argtopk, 
normalize, sum}
 import breeze.numerics.{exp, lgamma}
 import org.apache.hadoop.fs.Path
 import org.json4s.DefaultFormats
@@ -591,6 +591,23 @@ class DistributedLDAModel private[clustering] (
 JavaPairRDD.fromRDD(topicDistributions.asInstanceOf[RDD[(java.lang.Long, 
Vector)]])
   }
 
+  /**
+   * For each document, return the top k weighted topics for that document and 
their weights.
+   * @return RDD of (doc ID, topic indices, topic weights)
+   */
+  def topTopicsPerDocument(k: Int): RDD[(Long, Array[Int], Array[Double])] = {
+graph.vertices.filter(LDA.isDocumentVertex).map { case (docID, 
topicCounts) =>
+  val topIndices = argtopk(topicCounts, k)
+  val sumCounts = sum(topicCounts)
+  val weights = if (sumCounts != 0) {
+topicCounts(topIndices) / sumCounts
+  } else {
+topicCounts(topIndices)
+  }
+  (docID.toLong, topIndices.toArray, weights.toArray)
+}
+  }
+
   // TODO:
   // override def topicDistributions(documents: RDD[(Long, Vector)]): 
RDD[(Long, Vector)] = ???
 

http://git-wip-us.apache.org/repos/asf/spark/blob/4011a947/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index c43e1e5..695ee3b 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.mllib.clustering
 
-import breeze.linalg.{DenseMatrix => BDM, max, argmax}
+import breeze.linalg.{DenseMatrix => BDM, argtopk, max, argmax}
 
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.graphx.Edge
@@ -108,6 +108,17 @@ class LDASuite extends SparkFunSuite with 
MLlibTestSparkContext {
   assert(topicDistribution.toArray.sum ~== 1.0 absTol 1e-5)
 }
 
+val top2TopicsPerDoc = model.topTopicsPerDocument(2).map(t => (t._1, 
(t._2, t._3)))
+model.topicDistributions.join(top2TopicsPerDoc).collect().foreach {
+  case (docId, (topicDistribution, (indices, weights))) =>
+assert(indices.length == 2)
+assert(weights.length == 2)
+val bdvTopicDist = topicDistribution.toBreeze
+val top2Indices = argtopk(bdvTopicDist, 2)
+assert(top2Indices.toArray === indices)
+assert(bdvTopicDist(top2Indices).toArray === weights)
+}
+
 // Check: log probabilities
 assert(model.logLikelihood < 0.0)
 assert(model.logPrior < 0.0)



spark git commit: [SPARK-6885] [ML] decision tree support predict class probabilities

2015-07-31 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 4011a9471 -> e8bdcdeab


[SPARK-6885] [ML] decision tree support predict class probabilities

Decision trees now support predicting class probabilities.
The prediction-probability function follows the old DecisionTree API and the
[sklearn API](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/tree.py#L593).
DecisionTreeClassificationModel now inherits from ProbabilisticClassificationModel:
predictRaw returns the raw counts vector, and raw2probabilityInPlace/predictProbability
return the probabilities for each prediction.
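A minimal usage sketch, assuming `dtModel` is a fitted DecisionTreeClassificationModel and `test` is a DataFrame with a "features" column (both names are illustrative):
```
// transform() now also emits the rawPrediction and probability columns
// inherited from ProbabilisticClassificationModel.
dtModel.transform(test)
  .select("features", "rawPrediction", "probability", "prediction")
  .show()
```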

Author: Yanbo Liang 

Closes #7694 from yanboliang/spark-6885 and squashes the following commits:

08d5b7f [Yanbo Liang] fix ImpurityStats null parameters and 
raw2probabilityInPlace sum = 0 issue
2174278 [Yanbo Liang] solve merge conflicts
7e90ba8 [Yanbo Liang] fix typos
33ae183 [Yanbo Liang] fix annotation
ff043d3 [Yanbo Liang] raw2probabilityInPlace should operate in-place
c32d6ce [Yanbo Liang] optimize calculateImpurityStats function again
6167fb0 [Yanbo Liang] optimize calculateImpurityStats function
fbbe2ec [Yanbo Liang] eliminate duplicated struct and code
beb1634 [Yanbo Liang] try to eliminate impurityStats for each LearningNode
99e8943 [Yanbo Liang] code optimization
5ec3323 [Yanbo Liang] implement InformationGainAndImpurityStats
227c91b [Yanbo Liang] refactor LearningNode to store ImpurityCalculator
d746ffc [Yanbo Liang] decision tree support predict class probabilities


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e8bdcdea
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e8bdcdea
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e8bdcdea

Branch: refs/heads/master
Commit: e8bdcdeabb2df139a656f86686cdb53c891b1f4b
Parents: 4011a94
Author: Yanbo Liang 
Authored: Fri Jul 31 11:56:52 2015 -0700
Committer: Joseph K. Bradley 
Committed: Fri Jul 31 11:56:52 2015 -0700

--
 .../classification/DecisionTreeClassifier.scala |  40 --
 .../spark/ml/classification/GBTClassifier.scala |   2 +-
 .../classification/RandomForestClassifier.scala |   2 +-
 .../ml/regression/DecisionTreeRegressor.scala   |   2 +-
 .../spark/ml/regression/GBTRegressor.scala  |   2 +-
 .../ml/regression/RandomForestRegressor.scala   |   2 +-
 .../scala/org/apache/spark/ml/tree/Node.scala   |  80 ++--
 .../spark/ml/tree/impl/RandomForest.scala   | 126 ---
 .../spark/mllib/tree/impurity/Entropy.scala |   2 +-
 .../apache/spark/mllib/tree/impurity/Gini.scala |   2 +-
 .../spark/mllib/tree/impurity/Impurity.scala|   2 +-
 .../spark/mllib/tree/impurity/Variance.scala|   2 +-
 .../mllib/tree/model/InformationGainStats.scala |  61 -
 .../DecisionTreeClassifierSuite.scala   |  30 -
 .../ml/classification/GBTClassifierSuite.scala  |   2 +-
 .../RandomForestClassifierSuite.scala   |   2 +-
 16 files changed, 229 insertions(+), 130 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e8bdcdea/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
index 36fe1bd..f27cfd0 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
@@ -18,12 +18,11 @@
 package org.apache.spark.ml.classification
 
 import org.apache.spark.annotation.Experimental
-import org.apache.spark.ml.{PredictionModel, Predictor}
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.ml.tree.{DecisionTreeModel, DecisionTreeParams, Node, 
TreeClassifierParams}
 import org.apache.spark.ml.tree.impl.RandomForest
 import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
-import org.apache.spark.mllib.linalg.Vector
+import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, 
Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo, Strategy => 
OldStrategy}
 import org.apache.spark.mllib.tree.model.{DecisionTreeModel => 
OldDecisionTreeModel}
@@ -39,7 +38,7 @@ import org.apache.spark.sql.DataFrame
  */
 @Experimental
 final class DecisionTreeClassifier(override val uid: String)
-  extends Predictor[Vector, DecisionTreeClassifier, 
DecisionTreeClassificationModel]
+  extends ProbabilisticClassifier[Vector, DecisionTreeClassifier, 
DecisionTreeClassificationModel]
   with DecisionTreeParams with TreeClassifierParams {
 
   def this() = this

spark git commit: [SPARK-9481] Add logLikelihood to LocalLDAModel

2015-07-31 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master d04634701 -> a8340fa7d


[SPARK-9481] Add logLikelihood to LocalLDAModel

jkbradley Exposes `bound` (variational log likelihood bound) through public API 
as `logLikelihood`. Also adds unit tests, some DRYing of `LDASuite`, and 
includes unit tests mentioned in #7760
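A small sketch of the new call, assuming `localModel` is a LocalLDAModel and `corpus` is an RDD[(Long, Vector)] of (doc ID, term counts), as elsewhere in the LDA API:
```
// Variational lower bound on the log likelihood of the whole corpus.
val lowerBound = localModel.logLikelihood(corpus)
// logPerplexity is now expressed as -logLikelihood(corpus) / total word count.
val perplexity = localModel.logPerplexity(corpus)
```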

Author: Feynman Liang 

Closes #7801 from feynmanliang/SPARK-9481-logLikelihood and squashes the 
following commits:

6d1b2c9 [Feynman Liang] Negate perplexity definition
5f62b20 [Feynman Liang] Add logLikelihood


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a8340fa7
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a8340fa7
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a8340fa7

Branch: refs/heads/master
Commit: a8340fa7df17e3f0a3658f8b8045ab840845a72a
Parents: d046347
Author: Feynman Liang 
Authored: Fri Jul 31 12:12:22 2015 -0700
Committer: Joseph K. Bradley 
Committed: Fri Jul 31 12:12:22 2015 -0700

--
 .../spark/mllib/clustering/LDAModel.scala   |  20 ++-
 .../spark/mllib/clustering/LDASuite.scala   | 129 ++-
 2 files changed, 78 insertions(+), 71 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a8340fa7/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 82281a0..ff7035d 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -217,22 +217,28 @@ class LocalLDAModel private[clustering] (
 LocalLDAModel.SaveLoadV1_0.save(sc, path, topicsMatrix, docConcentration, 
topicConcentration,
   gammaShape)
   }
-  // TODO
-  // override def logLikelihood(documents: RDD[(Long, Vector)]): Double = ???
+
+  // TODO: declare in LDAModel and override once implemented in 
DistributedLDAModel
+  /**
+   * Calculates a lower bound on the log likelihood of the entire corpus.
+   * @param documents test corpus to use for calculating log likelihood
+   * @return variational lower bound on the log likelihood of the entire corpus
+   */
+  def logLikelihood(documents: RDD[(Long, Vector)]): Double = bound(documents,
+docConcentration, topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, 
gammaShape, k,
+vocabSize)
 
   /**
-   * Calculate the log variational bound on perplexity. See Equation (16) in 
original Online
+   * Calculate an upper bound on perplexity. See Equation (16) in 
original Online
* LDA paper.
* @param documents test corpus to use for calculating perplexity
-   * @return the log perplexity per word
+   * @return variational upper bound on log perplexity per word
*/
   def logPerplexity(documents: RDD[(Long, Vector)]): Double = {
 val corpusWords = documents
   .map { case (_, termCounts) => termCounts.toArray.sum }
   .sum()
-val batchVariationalBound = bound(documents, docConcentration,
-  topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k, 
vocabSize)
-val perWordBound = batchVariationalBound / corpusWords
+val perWordBound = -logLikelihood(documents) / corpusWords
 
 perWordBound
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/a8340fa7/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index 695ee3b..79d2a1c 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -210,16 +210,7 @@ class LDASuite extends SparkFunSuite with 
MLlibTestSparkContext {
   }
 
   test("OnlineLDAOptimizer with toy data") {
-def toydata: Array[(Long, Vector)] = Array(
-  Vectors.sparse(6, Array(0, 1), Array(1, 1)),
-  Vectors.sparse(6, Array(1, 2), Array(1, 1)),
-  Vectors.sparse(6, Array(0, 2), Array(1, 1)),
-  Vectors.sparse(6, Array(3, 4), Array(1, 1)),
-  Vectors.sparse(6, Array(3, 5), Array(1, 1)),
-  Vectors.sparse(6, Array(4, 5), Array(1, 1))
-).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, 
wordCounts) }
-
-val docs = sc.parallelize(toydata)
+val docs = sc.parallelize(toyData)
 val op = new 
OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51)
   .setGammaShape(1e10)
 val lda = new LDA().setK(2)
@@ -242,30 +233,45 @@ class LDASuite ex

spark git commit: [SPARK-9246] [MLLIB] DistributedLDAModel predict top docs per topic

2015-07-31 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master c0686668a -> 3c0d2e552


[SPARK-9246] [MLLIB] DistributedLDAModel predict top docs per topic

Add topDocumentsPerTopic to DistributedLDAModel.

Add ScalaDoc and unit tests.
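A brief usage sketch, assuming `distLDAModel` is a trained DistributedLDAModel (the variable name and the limit of 10 documents are illustrative):
```
// Array over topics; each element is (doc IDs, topic weights in those docs),
// sorted by decreasing topic weight.
val topDocs = distLDAModel.topDocumentsPerTopic(10)
topDocs.zipWithIndex.foreach { case ((docIds, weights), topic) =>
  println(s"topic $topic: top doc ${docIds.head} (weight ${weights.head})")
}
```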

Author: Meihua Wu 

Closes #7769 from rotationsymmetry/SPARK-9246 and squashes the following 
commits:

1029e79c [Meihua Wu] clean up code comments
a023b82 [Meihua Wu] Update tests to use Long for doc index.
91e5998 [Meihua Wu] Use Long for doc index.
b9f70cf [Meihua Wu] Revise topDocumentsPerTopic
26ff3f6 [Meihua Wu] Add topDocumentsPerTopic, scala doc and unit tests


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3c0d2e55
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3c0d2e55
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3c0d2e55

Branch: refs/heads/master
Commit: 3c0d2e55210735e0df2f8febb5f63c224af230e3
Parents: c068666
Author: Meihua Wu 
Authored: Fri Jul 31 13:01:10 2015 -0700
Committer: Joseph K. Bradley 
Committed: Fri Jul 31 13:01:10 2015 -0700

--
 .../spark/mllib/clustering/LDAModel.scala   | 37 
 .../spark/mllib/clustering/LDASuite.scala   | 22 
 2 files changed, 59 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/3c0d2e55/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index ff7035d..0cdac84 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -516,6 +516,43 @@ class DistributedLDAModel private[clustering] (
 }
   }
 
+  /**
+   * Return the top documents for each topic
+   *
+   * This is approximate; it may not return exactly the top-weighted documents 
for each topic.
+   * To get a more precise set of top documents, increase maxDocumentsPerTopic.
+   *
+   * @param maxDocumentsPerTopic  Maximum number of documents to collect for 
each topic.
+   * @return  Array over topics.  Each element represent as a pair of matching 
arrays:
+   *  (IDs for the documents, weights of the topic in these documents).
+   *  For each topic, documents are sorted in order of decreasing 
topic weights.
+   */
+  def topDocumentsPerTopic(maxDocumentsPerTopic: Int): Array[(Array[Long], 
Array[Double])] = {
+val numTopics = k
+val topicsInQueues: Array[BoundedPriorityQueue[(Double, Long)]] =
+  topicDistributions.mapPartitions { docVertices =>
+// For this partition, collect the most common docs for each topic in 
queues:
+//  queues(topic) = queue of (doc topic, doc ID).
+val queues =
+  Array.fill(numTopics)(new BoundedPriorityQueue[(Double, 
Long)](maxDocumentsPerTopic))
+for ((docId, docTopics) <- docVertices) {
+  var topic = 0
+  while (topic < numTopics) {
+queues(topic) += (docTopics(topic) -> docId)
+topic += 1
+  }
+}
+Iterator(queues)
+  }.treeReduce { (q1, q2) =>
+q1.zip(q2).foreach { case (a, b) => a ++= b }
+q1
+  }
+topicsInQueues.map { q =>
+  val (docTopics, docs) = q.toArray.sortBy(-_._1).unzip
+  (docs.toArray, docTopics.toArray)
+}
+  }
+
   // TODO
   // override def logLikelihood(documents: RDD[(Long, Vector)]): Double = ???
 

http://git-wip-us.apache.org/repos/asf/spark/blob/3c0d2e55/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
index 79d2a1c..f2b9470 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala
@@ -122,6 +122,28 @@ class LDASuite extends SparkFunSuite with 
MLlibTestSparkContext {
 // Check: log probabilities
 assert(model.logLikelihood < 0.0)
 assert(model.logPrior < 0.0)
+
+// Check: topDocumentsPerTopic
+// Compare it with top documents per topic derived from topicDistributions
+val topDocsByTopicDistributions = { n: Int =>
+  Range(0, k).map { topic =>
+val (doc, docWeights) = 
topicDistributions.sortBy(-_._2(topic)).take(n).unzip
+(doc.toArray, docWeights.map(_(topic)).toArray)
+  }.toArray
+}
+
+// Top 3 documents per topic
+model.topDocumentsPerTopic(3).zip(topDocsByTopicDistributions(3)).foreach 
{case (t1, t2) =>
+  assert(t1

spark git commit: [SPARK-9308] [ML] ml.NaiveBayesModel support predicting class probabilities

2015-07-31 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 060c79aab -> fbef566a1


[SPARK-9308] [ML] ml.NaiveBayesModel support predicting class probabilities

Make NaiveBayesModel support predicting class probabilities by inheriting from 
ProbabilisticClassificationModel.
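A minimal sketch of the resulting behavior, assuming `nbModel` is a fitted ml NaiveBayesModel and `test` is a DataFrame with a "features" column (illustrative names):
```
// The probability and rawPrediction columns are now produced alongside prediction.
nbModel.transform(test)
  .select("prediction", "rawPrediction", "probability")
  .show()
```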

Author: Yanbo Liang 

Closes #7672 from yanboliang/spark-9308 and squashes the following commits:

25e224c [Yanbo Liang] raw2probabilityInPlace should operate in-place
3ee56d6 [Yanbo Liang] change predictRaw and raw2probabilityInPlace
c07e7a2 [Yanbo Liang] ml.NaiveBayesModel support predicting class probabilities


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fbef566a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fbef566a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fbef566a

Branch: refs/heads/master
Commit: fbef566a107b47e5fddde0ea65b8587d5039062d
Parents: 060c79a
Author: Yanbo Liang 
Authored: Fri Jul 31 13:11:42 2015 -0700
Committer: Joseph K. Bradley 
Committed: Fri Jul 31 13:11:42 2015 -0700

--
 .../spark/ml/classification/NaiveBayes.scala| 65 +++-
 .../ml/classification/NaiveBayesSuite.scala | 54 +++-
 2 files changed, 101 insertions(+), 18 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/fbef566a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala 
b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
index 5be35fe..b46b676 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
@@ -69,7 +69,7 @@ private[ml] trait NaiveBayesParams extends PredictorParams {
  * The input feature values must be nonnegative.
  */
 class NaiveBayes(override val uid: String)
-  extends Predictor[Vector, NaiveBayes, NaiveBayesModel]
+  extends ProbabilisticClassifier[Vector, NaiveBayes, NaiveBayesModel]
   with NaiveBayesParams {
 
   def this() = this(Identifiable.randomUID("nb"))
@@ -106,7 +106,7 @@ class NaiveBayesModel private[ml] (
 override val uid: String,
 val pi: Vector,
 val theta: Matrix)
-  extends PredictionModel[Vector, NaiveBayesModel] with NaiveBayesParams {
+  extends ProbabilisticClassificationModel[Vector, NaiveBayesModel] with 
NaiveBayesParams {
 
   import OldNaiveBayes.{Bernoulli, Multinomial}
 
@@ -129,29 +129,62 @@ class NaiveBayesModel private[ml] (
   throw new UnknownError(s"Invalid modelType: ${$(modelType)}.")
   }
 
-  override protected def predict(features: Vector): Double = {
+  override val numClasses: Int = pi.size
+
+  private def multinomialCalculation(features: Vector) = {
+val prob = theta.multiply(features)
+BLAS.axpy(1.0, pi, prob)
+prob
+  }
+
+  private def bernoulliCalculation(features: Vector) = {
+features.foreachActive((_, value) =>
+  if (value != 0.0 && value != 1.0) {
+throw new SparkException(
+  s"Bernoulli naive Bayes requires 0 or 1 feature values but found 
$features.")
+  }
+)
+val prob = thetaMinusNegTheta.get.multiply(features)
+BLAS.axpy(1.0, pi, prob)
+BLAS.axpy(1.0, negThetaSum.get, prob)
+prob
+  }
+
+  override protected def predictRaw(features: Vector): Vector = {
 $(modelType) match {
   case Multinomial =>
-val prob = theta.multiply(features)
-BLAS.axpy(1.0, pi, prob)
-prob.argmax
+multinomialCalculation(features)
   case Bernoulli =>
-features.foreachActive{ (index, value) =>
-  if (value != 0.0 && value != 1.0) {
-throw new SparkException(
-  s"Bernoulli naive Bayes requires 0 or 1 feature values but found 
$features")
-  }
-}
-val prob = thetaMinusNegTheta.get.multiply(features)
-BLAS.axpy(1.0, pi, prob)
-BLAS.axpy(1.0, negThetaSum.get, prob)
-prob.argmax
+bernoulliCalculation(features)
   case _ =>
 // This should never happen.
 throw new UnknownError(s"Invalid modelType: ${$(modelType)}.")
 }
   }
 
+  override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector 
= {
+rawPrediction match {
+  case dv: DenseVector =>
+var i = 0
+val size = dv.size
+val maxLog = dv.values.max
+while (i < size) {
+  dv.values(i) = math.exp(dv.values(i) - maxLog)
+  i += 1
+}
+val probSum = dv.values.sum
+i = 0
+while (i < size) {
+  dv.values(i) = dv.values(i) / probSum
+  i += 1
+}
+dv
+  case sv: SparseVector =>
+throw new RuntimeException(

spark git commit: [SPARK-8936] [MLLIB] OnlineLDA document-topic Dirichlet hyperparameter optimization

2015-07-31 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 4d5a6e7b6 -> f51fd6fbb


[SPARK-8936] [MLLIB] OnlineLDA document-topic Dirichlet hyperparameter 
optimization

Adds `alpha` (document-topic Dirichlet parameter) hyperparameter optimization 
to `OnlineLDAOptimizer` following Huang: Maximum Likelihood Estimation of 
Dirichlet Distribution Parameters. Also introduces a private 
`setSampleWithReplacement` to `OnlineLDAOptimizer` for unit testing purposes.
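A sketch of enabling the new option (the setter name is taken from the diff below; the surrounding pipeline and `corpus` are illustrative assumptions):
```
// corpus: RDD[(Long, Vector)] of (doc ID, term counts), assumed to exist.
val optimizer = new OnlineLDAOptimizer()
  .setMiniBatchFraction(0.05)
  .setOptimzeAlpha(true)  // optimize the document-topic Dirichlet parameter during training
val model = new LDA().setK(10).setOptimizer(optimizer).run(corpus)
```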

Author: Feynman Liang 

Closes #7836 from feynmanliang/SPARK-8936-alpha-optimize and squashes the 
following commits:

4bef484 [Feynman Liang] Documentation improvements
c3c6c1d [Feynman Liang] Fix docs
151e859 [Feynman Liang] Fix style
fa77518 [Feynman Liang] Hyperparameter optimization


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f51fd6fb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f51fd6fb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f51fd6fb

Branch: refs/heads/master
Commit: f51fd6fbb4d9822502f98b312251e317d757bc3a
Parents: 4d5a6e7
Author: Feynman Liang 
Authored: Fri Jul 31 18:36:22 2015 -0700
Committer: Joseph K. Bradley 
Committed: Fri Jul 31 18:36:22 2015 -0700

--
 .../spark/mllib/clustering/LDAOptimizer.scala   | 75 +---
 .../spark/mllib/clustering/LDASuite.scala   | 34 +
 2 files changed, 99 insertions(+), 10 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/f51fd6fb/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
index d6f8b29..b0e14cb 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala
@@ -19,8 +19,8 @@ package org.apache.spark.mllib.clustering
 
 import java.util.Random
 
-import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, normalize, sum}
-import breeze.numerics.{abs, exp}
+import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, all, normalize, 
sum}
+import breeze.numerics.{trigamma, abs, exp}
 import breeze.stats.distributions.{Gamma, RandBasis}
 
 import org.apache.spark.annotation.DeveloperApi
@@ -239,22 +239,26 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
   /** alias for docConcentration */
   private var alpha: Vector = Vectors.dense(0)
 
-  /** (private[clustering] for debugging)  Get docConcentration */
+  /** (for debugging)  Get docConcentration */
   private[clustering] def getAlpha: Vector = alpha
 
   /** alias for topicConcentration */
   private var eta: Double = 0
 
-  /** (private[clustering] for debugging)  Get topicConcentration */
+  /** (for debugging)  Get topicConcentration */
   private[clustering] def getEta: Double = eta
 
   private var randomGenerator: java.util.Random = null
 
+  /** (for debugging) Whether to sample mini-batches with replacement. 
(default = true) */
+  private var sampleWithReplacement: Boolean = true
+
   // Online LDA specific parameters
   // Learning rate is: (tau0 + t)^{-kappa}
   private var tau0: Double = 1024
   private var kappa: Double = 0.51
   private var miniBatchFraction: Double = 0.05
+  private var optimizeAlpha: Boolean = false
 
   // internal data structure
   private var docs: RDD[(Long, Vector)] = null
@@ -262,7 +266,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
   /** Dirichlet parameter for the posterior over topics */
   private var lambda: BDM[Double] = null
 
-  /** (private[clustering] for debugging) Get parameter for topics */
+  /** (for debugging) Get parameter for topics */
   private[clustering] def getLambda: BDM[Double] = lambda
 
   /** Current iteration (count of invocations of [[next()]]) */
@@ -325,7 +329,22 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
   }
 
   /**
-   * (private[clustering])
+   * Optimize alpha, indicates whether alpha (Dirichlet parameter for 
document-topic distribution)
+   * will be optimized during training.
+   */
+  def getOptimzeAlpha: Boolean = this.optimizeAlpha
+
+  /**
+   * Sets whether to optimize alpha parameter during training.
+   *
+   * Default: false
+   */
+  def setOptimzeAlpha(optimizeAlpha: Boolean): this.type = {
+this.optimizeAlpha = optimizeAlpha
+this
+  }
+
+  /**
* Set the Dirichlet parameter for the posterior over topics.
* This is only used for testing now. In the future, it can help support 
training stop/resume.
*/
@@ -335,7 +354,6 @@ final class OnlineLDAOptimizer extends LDAOptimizer {
   }
 
   /**
-   * (private[clustering])
* Used for random initialization of the variationa

spark git commit: [SPARK-7446] [MLLIB] Add inverse transform for string indexer

2015-08-01 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 60ea7ab4b -> 65038973a


[SPARK-7446] [MLLIB] Add inverse transform for string indexer

It is useful to convert the encoded indices back to their string representation 
for result inspection. We can add a function which creates an inverse 
transformation.
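A hypothetical round-trip sketch: the exact `invert` signature is an assumption based on the commit notes below, and the DataFrame and column names are illustrative:
```
val indexer = new StringIndexer().setInputCol("category").setOutputCol("categoryIndex")
val model = indexer.fit(df)
val indexed = model.transform(df)
// Hypothetical: obtain a Transformer mapping indices back to the original string labels.
val inverse = model.invert("categoryIndex", "originalCategory")
inverse.transform(indexed).select("category", "originalCategory").show()
```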

Author: Holden Karau 

Closes #6339 from holdenk/SPARK-7446-inverse-transform-for-string-indexer and 
squashes the following commits:

7cdf915 [Holden Karau] scala style comment fix
b9cffb6 [Holden Karau] Update the labels param to have the metadata note
6a38edb [Holden Karau] Setting the default needs to come after the value gets 
defined
9e241d8 [Holden Karau] use Array.empty
21c8cfa [Holden Karau] Merge branch 'master' into 
SPARK-7446-inverse-transform-for-string-indexer
64dd3a3 [Holden Karau] Merge branch 'master' into 
SPARK-7446-inverse-transform-for-string-indexer
4f06c59 [Holden Karau] Fix comment styles, use empty array as the default, etc.
a60c0e3 [Holden Karau] CR feedback (remove old constructor, add a note about 
use of setLabels)
1987b95 [Holden Karau] Use default copy
71e8d66 [Holden Karau] Make labels a local param for StringIndexerInverse
8450d0b [Holden Karau] Use the labels param in StringIndexerInverse
7464019 [Holden Karau] Add a labels param
868b1a9 [Holden Karau] Update scaladoc since we don't have labelsCol anymore
5aa38bf [Holden Karau] Add an inverse test using only meta data, pass labels 
when calling inverse method
f3e0c64 [Holden Karau] CR feedback
ebed932 [Holden Karau] Add Experimental tag and some scaladocs. Also don't 
require that the inputCol has the metadata on it, instead have the labelsCol 
specified when creating the inverse.
03ebf95 [Holden Karau] Add explicit type for invert function
ecc65e0 [Holden Karau] Read the metadata correctly, use the array, pass the test
a42d773 [Holden Karau] Fix test to supply cols as per new invert method
16cc3c3 [Holden Karau] Add an invert method
d4bcb20 [Holden Karau] Make the inverse string indexer into a transformer 
(still needs test updates but compiles)
e8bf3ad [Holden Karau] Merge branch 'master' into 
SPARK-7446-inverse-transform-for-string-indexer
c3fdee1 [Holden Karau] Some WIP refactoring based on jkbradley's CR feedback. 
Definite work-in-progress
557bef8 [Holden Karau] Instead of using a private inverse transform, add an 
invert function so we can use it in a pipeline
88779c1 [Holden Karau] fix long line
78b28c1 [Holden Karau] Finish reverse part and add a test :)
bb16a6a [Holden Karau] Some progress


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/65038973
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/65038973
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/65038973

Branch: refs/heads/master
Commit: 65038973a17904e0e04d453799ec108af240fbab
Parents: 60ea7ab
Author: Holden Karau 
Authored: Sat Aug 1 01:09:38 2015 -0700
Committer: Joseph K. Bradley 
Committed: Sat Aug 1 01:09:38 2015 -0700

--
 .../apache/spark/ml/feature/StringIndexer.scala | 108 ++-
 .../spark/ml/feature/StringIndexerSuite.scala   |  13 +++
 2 files changed, 118 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/65038973/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
index bf7be36..ebfa972 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
@@ -20,13 +20,14 @@ package org.apache.spark.ml.feature
 import org.apache.spark.SparkException
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.ml.{Estimator, Model}
-import org.apache.spark.ml.attribute.NominalAttribute
+import org.apache.spark.ml.attribute.{Attribute, NominalAttribute}
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
-import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.ml.Transformer
+import org.apache.spark.ml.util.{Identifiable, MetadataUtils}
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.functions._
-import org.apache.spark.sql.types.{NumericType, StringType, StructType}
+import org.apache.spark.sql.types.{DoubleType, NumericType, StringType, 
StructType}
 import org.apache.spark.util.collection.OpenHashMap
 
 /**
@@ -151,4 +152,105 @@ class StringIndexerModel private[ml] (
 val copied = new StringIndexerModel(uid, labels)
 copyValues(copied, extra)
   }
+
+  /**
+   * Return a model to perform the inverse transformation.
+   * Note: By default we keep the original colum

spark git commit: [SPARK-9530] [MLLIB] ScalaDoc should not indicate LDAModel.describeTopics and DistributedLDAModel.topDocumentsPerTopic as approximate

2015-08-01 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 3d1535d48 -> 84a6982b3


[SPARK-9530] [MLLIB] ScalaDoc should not indicate LDAModel.describeTopics and 
DistributedLDAModel.topDocumentsPerTopic as approximate

Remove ScalaDoc that suggests describeTopics and topDocumentsPerTopic are 
approximate.

cc jkbradley

Author: Meihua Wu 

Closes #7858 from rotationsymmetry/SPARK-9530 and squashes the following 
commits:

b574923 [Meihua Wu] Remove ScalaDoc that suggests describeTopics and 
topDocumentsPerTopic are approximate.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/84a6982b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/84a6982b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/84a6982b

Branch: refs/heads/master
Commit: 84a6982b35d87483bdf70ef4423cc4c8e0c3feb1
Parents: 3d1535d
Author: Meihua Wu 
Authored: Sat Aug 1 17:13:28 2015 -0700
Committer: Joseph K. Bradley 
Committed: Sat Aug 1 17:13:28 2015 -0700

--
 .../scala/org/apache/spark/mllib/clustering/LDAModel.scala| 7 ---
 1 file changed, 7 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/84a6982b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 0cdac84..6af90d7 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -86,10 +86,6 @@ abstract class LDAModel private[clustering] extends Saveable 
{
   /**
* Return the topics described by weighted terms.
*
-   * This limits the number of terms per topic.
-   * This is approximate; it may not return exactly the top-weighted terms for 
each topic.
-   * To get a more precise set of top terms, increase maxTermsPerTopic.
-   *
* @param maxTermsPerTopic  Maximum number of terms to collect for each 
topic.
* @return  Array over topics.  Each topic is represented as a pair of 
matching arrays:
*  (term indices, term weights in topic).
@@ -519,9 +515,6 @@ class DistributedLDAModel private[clustering] (
   /**
* Return the top documents for each topic
*
-   * This is approximate; it may not return exactly the top-weighted documents 
for each topic.
-   * To get a more precise set of top documents, increase maxDocumentsPerTopic.
-   *
* @param maxDocumentsPerTopic  Maximum number of documents to collect for 
each topic.
* @return  Array over topics.  Each element represent as a pair of matching 
arrays:
*  (IDs for the documents, weights of the topic in these documents).





spark git commit: [SPARK-9536] [SPARK-9537] [SPARK-9538] [ML] [PYSPARK] ml.classification support raw and probability prediction for PySpark

2015-08-02 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 114ff926f -> 4cdd8ecd6


[SPARK-9536] [SPARK-9537] [SPARK-9538] [ML] [PYSPARK] ml.classification support 
raw and probability prediction for PySpark

Make the following ml.classification classes support raw and probability 
prediction for PySpark:
```scala
NaiveBayesModel
DecisionTreeClassifierModel
LogisticRegressionModel
```

Author: Yanbo Liang 

Closes #7866 from yanboliang/spark-9536-9537 and squashes the following commits:

2934dab [Yanbo Liang] ml.NaiveBayes, ml.DecisionTreeClassifier and 
ml.LogisticRegression support probability prediction


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4cdd8ecd
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4cdd8ecd
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4cdd8ecd

Branch: refs/heads/master
Commit: 4cdd8ecd66769316e8593da7790b84cd867968cd
Parents: 114ff92
Author: Yanbo Liang 
Authored: Sun Aug 2 22:19:27 2015 -0700
Committer: Joseph K. Bradley 
Committed: Sun Aug 2 22:19:27 2015 -0700

--
 python/pyspark/ml/classification.py | 61 ++--
 1 file changed, 43 insertions(+), 18 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4cdd8ecd/python/pyspark/ml/classification.py
--
diff --git a/python/pyspark/ml/classification.py 
b/python/pyspark/ml/classification.py
index 93ffcd4..b5814f7 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -31,7 +31,7 @@ __all__ = ['LogisticRegression', 'LogisticRegressionModel', 
'DecisionTreeClassif
 
 @inherit_doc
 class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, 
HasPredictionCol, HasMaxIter,
- HasRegParam, HasTol, HasProbabilityCol):
+ HasRegParam, HasTol, HasProbabilityCol, 
HasRawPredictionCol):
 """
 Logistic regression.
 
@@ -42,13 +42,18 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredicti
 ... Row(label=0.0, features=Vectors.sparse(1, [], []))]).toDF()
 >>> lr = LogisticRegression(maxIter=5, regParam=0.01)
 >>> model = lr.fit(df)
->>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0))]).toDF()
->>> model.transform(test0).head().prediction
-0.0
 >>> model.weights
 DenseVector([5.5...])
 >>> model.intercept
 -2.68...
+>>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0))]).toDF()
+>>> result = model.transform(test0).head()
+>>> result.prediction
+0.0
+>>> result.probability
+DenseVector([0.99..., 0.00...])
+>>> result.rawPrediction
+DenseVector([8.22..., -8.22...])
 >>> test1 = sc.parallelize([Row(features=Vectors.sparse(1, [0], 
[1.0]))]).toDF()
 >>> model.transform(test1).head().prediction
 1.0
@@ -70,11 +75,11 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredicti
 @keyword_only
 def __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction",
  maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, 
fitIntercept=True,
- threshold=0.5, probabilityCol="probability"):
+ threshold=0.5, probabilityCol="probability", 
rawPredictionCol="rawPrediction"):
 """
 __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction", \
  maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, 
fitIntercept=True, \
- threshold=0.5, probabilityCol="probability")
+ threshold=0.5, probabilityCol="probability", 
rawPredictionCol="rawPrediction")
 """
 super(LogisticRegression, self).__init__()
 self._java_obj = self._new_java_obj(
@@ -98,11 +103,11 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredicti
 @keyword_only
 def setParams(self, featuresCol="features", labelCol="label", 
predictionCol="prediction",
   maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, 
fitIntercept=True,
-  threshold=0.5, probabilityCol="probability"):
+  threshold=0.5, probabilityCol="probability", 
rawPredictionCol="rawPrediction"):
 """
 setParams(self, featuresCol="features", labelCol="label", 
predictionCol="prediction", \
   maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, 
fitIntercept=True, \
- threshold=0.5, probabilityCol="probability")
+ threshold=0.5, probabilityCol="probability", 
rawPredictionCol="rawPrediction")
 Sets params for logistic regression.
 """
 kwargs = self.setParams._input_kwargs
@@ -187,7 +192,8 @@

spark git commit: [SPARK-9528] [ML] Changed RandomForestClassifier to extend ProbabilisticClassifier

2015-08-03 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 8be198c86 -> 69f5a7c93


[SPARK-9528] [ML] Changed RandomForestClassifier to extend 
ProbabilisticClassifier

RandomForestClassifier now outputs rawPrediction based on tree probabilities, 
plus probability column computed from normalized rawPrediction.
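A minimal sketch of the new output columns, assuming `rfModel` is a fitted RandomForestClassificationModel and `test` has a "features" column (illustrative names):
```
// rawPrediction aggregates the per-tree class probabilities;
// probability is that vector normalized to sum to 1.
rfModel.transform(test)
  .select("rawPrediction", "probability", "prediction")
  .show()
```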

CC: holdenk

Author: Joseph K. Bradley 

Closes #7859 from jkbradley/rf-prob and squashes the following commits:

6c28f51 [Joseph K. Bradley] Changed RandomForestClassifier to extend 
ProbabilisticClassifier


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/69f5a7c9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/69f5a7c9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/69f5a7c9

Branch: refs/heads/master
Commit: 69f5a7c934ac553ed52c00679b800bcffe83c1d6
Parents: 8be198c
Author: Joseph K. Bradley 
Authored: Mon Aug 3 10:46:34 2015 -0700
Committer: Joseph K. Bradley 
Committed: Mon Aug 3 10:46:34 2015 -0700

--
 .../classification/DecisionTreeClassifier.scala |  8 +
 .../ProbabilisticClassifier.scala   | 27 +-
 .../classification/RandomForestClassifier.scala | 37 ++--
 .../RandomForestClassifierSuite.scala   | 36 ++-
 4 files changed, 81 insertions(+), 27 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/69f5a7c9/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
index f27cfd0..f2b992f 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
@@ -131,13 +131,7 @@ final class DecisionTreeClassificationModel private[ml] (
   override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector 
= {
 rawPrediction match {
   case dv: DenseVector =>
-var i = 0
-val size = dv.size
-val sum = dv.values.sum
-while (i < size) {
-  dv.values(i) = if (sum != 0) dv.values(i) / sum else 0.0
-  i += 1
-}
+ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(dv)
 dv
   case sv: SparseVector =>
 throw new RuntimeException("Unexpected error in 
DecisionTreeClassificationModel:" +

http://git-wip-us.apache.org/repos/asf/spark/blob/69f5a7c9/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
index dad4511..f9c9c23 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala
@@ -20,7 +20,7 @@ package org.apache.spark.ml.classification
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util.SchemaUtils
-import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
+import org.apache.spark.mllib.linalg.{SparseVector, DenseVector, Vector, 
VectorUDT}
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.types.{DoubleType, DataType, StructType}
@@ -175,3 +175,28 @@ private[spark] abstract class 
ProbabilisticClassificationModel[
*/
   protected def probability2prediction(probability: Vector): Double = 
probability.argmax
 }
+
+private[ml] object ProbabilisticClassificationModel {
+
+  /**
+   * Normalize a vector of raw predictions to be a multinomial probability 
vector, in place.
+   *
+   * The input raw predictions should be >= 0.
+   * The output vector sums to 1, unless the input vector is all-0 (in which 
case the output is
+   * all-0 too).
+   *
+   * NOTE: This is NOT applicable to all models, only ones which effectively 
use class
+   *   instance counts for raw predictions.
+   */
+  def normalizeToProbabilitiesInPlace(v: DenseVector): Unit = {
+val sum = v.values.sum
+if (sum != 0) {
+  var i = 0
+  val size = v.size
+  while (i < size) {
+v.values(i) /= sum
+i += 1
+  }
+}
+  }
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/69f5a7c9/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
---

spark git commit: [SPARK-5133] [ML] Added featureImportance to RandomForestClassifier and Regressor

2015-08-03 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 703e44bff -> ff9169a00


[SPARK-5133] [ML] Added featureImportance to RandomForestClassifier and 
Regressor

Added featureImportance to RandomForestClassifier and Regressor.

This follows the scikit-learn implementation here: 
[https://github.com/scikit-learn/scikit-learn/blob/a95203b249c1cf392f86d001ad999e29b2392739/sklearn/tree/_tree.pyx#L3341]
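A brief sketch; the accessor name and the fitted-model variable are assumptions for illustration (the commit notes indicate the importances come back as a Vector):
```
// Assumes `rfModel` is a fitted RandomForestClassificationModel.
val importances = rfModel.featureImportances  // assumed accessor; returns a Vector of length numFeatures
importances.toArray.zipWithIndex
  .sortBy { case (imp, _) => -imp }
  .take(5)
  .foreach { case (imp, idx) => println(s"feature $idx: importance $imp") }
```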

CC: yanboliang  Would you mind taking a look?  Thanks!

Author: Joseph K. Bradley 
Author: Feynman Liang 

Closes #7838 from jkbradley/dt-feature-importance and squashes the following 
commits:

72a167a [Joseph K. Bradley] fixed unit test
86cea5f [Joseph K. Bradley] Modified RF featuresImportances to return Vector 
instead of Map
5aa74f0 [Joseph K. Bradley] finally fixed unit test for real
33df5db [Joseph K. Bradley] fix unit test
42a2d3b [Joseph K. Bradley] fix unit test
fe94e72 [Joseph K. Bradley] modified feature importance unit tests
cc693ee [Feynman Liang] Add classifier tests
79a6f87 [Feynman Liang] Compare dense vectors in test
21d01fc [Feynman Liang] Added failing SKLearn test
ac0b254 [Joseph K. Bradley] Added featureImportance to 
RandomForestClassifier/Regressor.  Need to add unit tests


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ff9169a0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ff9169a0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ff9169a0

Branch: refs/heads/master
Commit: ff9169a002f1b75231fd25b7d04157a912503038
Parents: 703e44b
Author: Joseph K. Bradley 
Authored: Mon Aug 3 12:17:46 2015 -0700
Committer: Joseph K. Bradley 
Committed: Mon Aug 3 12:17:46 2015 -0700

--
 .../classification/RandomForestClassifier.scala |  30 +-
 .../ml/regression/RandomForestRegressor.scala   |  33 --
 .../scala/org/apache/spark/ml/tree/Node.scala   |  19 +++-
 .../spark/ml/tree/impl/RandomForest.scala   |  92 
 .../org/apache/spark/ml/tree/treeModels.scala   |   6 ++
 .../JavaRandomForestClassifierSuite.java|   2 +
 .../JavaRandomForestRegressorSuite.java |   2 +
 .../RandomForestClassifierSuite.scala   |  31 +-
 .../org/apache/spark/ml/impl/TreeTests.scala|  18 
 .../regression/RandomForestRegressorSuite.scala |  27 -
 .../spark/ml/tree/impl/RandomForestSuite.scala  | 107 +++
 11 files changed, 351 insertions(+), 16 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ff9169a0/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
index 56e80cc..b59826a 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
@@ -95,7 +95,8 @@ final class RandomForestClassifier(override val uid: String)
 val trees =
   RandomForest.run(oldDataset, strategy, getNumTrees, 
getFeatureSubsetStrategy, getSeed)
 .map(_.asInstanceOf[DecisionTreeClassificationModel])
-new RandomForestClassificationModel(trees, numClasses)
+val numFeatures = oldDataset.first().features.size
+new RandomForestClassificationModel(trees, numFeatures, numClasses)
   }
 
   override def copy(extra: ParamMap): RandomForestClassifier = 
defaultCopy(extra)
@@ -118,11 +119,13 @@ object RandomForestClassifier {
  * features.
  * @param _trees  Decision trees in the ensemble.
  *   Warning: These have null parents.
+ * @param numFeatures  Number of features used by this model
  */
 @Experimental
 final class RandomForestClassificationModel private[ml] (
 override val uid: String,
 private val _trees: Array[DecisionTreeClassificationModel],
+val numFeatures: Int,
 override val numClasses: Int)
   extends ProbabilisticClassificationModel[Vector, 
RandomForestClassificationModel]
   with TreeEnsembleModel with Serializable {
@@ -133,8 +136,8 @@ final class RandomForestClassificationModel private[ml] (
* Construct a random forest classification model, with all trees weighted 
equally.
* @param trees  Component trees
*/
-  def this(trees: Array[DecisionTreeClassificationModel], numClasses: Int) =
-this(Identifiable.randomUID("rfc"), trees, numClasses)
+  def this(trees: Array[DecisionTreeClassificationModel], numFeatures: Int, 
numClasses: Int) =
+this(Identifiable.randomUID("rfc"), trees, numFeatures, numClasses)
 
   override def trees: Array[DecisionTreeModel] = 
_trees.asInstanceOf[Array[DecisionTreeModel]]
 
@@ -182,13

spark git commit: [SPARK-5133] [ML] Added featureImportance to RandomForestClassifier and Regressor

2015-08-03 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 6d46e9b7c -> b3117d312


[SPARK-5133] [ML] Added featureImportance to RandomForestClassifier and 
Regressor

Added featureImportance to RandomForestClassifier and Regressor.

This follows the scikit-learn implementation here: 
[https://github.com/scikit-learn/scikit-learn/blob/a95203b249c1cf392f86d001ad999e29b2392739/sklearn/tree/_tree.pyx#L3341]

CC: yanboliang  Would you mind taking a look?  Thanks!

Author: Joseph K. Bradley 
Author: Feynman Liang 

Closes #7838 from jkbradley/dt-feature-importance and squashes the following 
commits:

72a167a [Joseph K. Bradley] fixed unit test
86cea5f [Joseph K. Bradley] Modified RF featuresImportances to return Vector 
instead of Map
5aa74f0 [Joseph K. Bradley] finally fixed unit test for real
33df5db [Joseph K. Bradley] fix unit test
42a2d3b [Joseph K. Bradley] fix unit test
fe94e72 [Joseph K. Bradley] modified feature importance unit tests
cc693ee [Feynman Liang] Add classifier tests
79a6f87 [Feynman Liang] Compare dense vectors in test
21d01fc [Feynman Liang] Added failing SKLearn test
ac0b254 [Joseph K. Bradley] Added featureImportance to 
RandomForestClassifier/Regressor.  Need to add unit tests

(cherry picked from commit ff9169a002f1b75231fd25b7d04157a912503038)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b3117d31
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b3117d31
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b3117d31

Branch: refs/heads/branch-1.5
Commit: b3117d312332af3b4bd416857f632cacb5230feb
Parents: 6d46e9b
Author: Joseph K. Bradley 
Authored: Mon Aug 3 12:17:46 2015 -0700
Committer: Joseph K. Bradley 
Committed: Mon Aug 3 12:17:56 2015 -0700

--
 .../classification/RandomForestClassifier.scala |  30 +-
 .../ml/regression/RandomForestRegressor.scala   |  33 --
 .../scala/org/apache/spark/ml/tree/Node.scala   |  19 +++-
 .../spark/ml/tree/impl/RandomForest.scala   |  92 
 .../org/apache/spark/ml/tree/treeModels.scala   |   6 ++
 .../JavaRandomForestClassifierSuite.java|   2 +
 .../JavaRandomForestRegressorSuite.java |   2 +
 .../RandomForestClassifierSuite.scala   |  31 +-
 .../org/apache/spark/ml/impl/TreeTests.scala|  18 
 .../regression/RandomForestRegressorSuite.scala |  27 -
 .../spark/ml/tree/impl/RandomForestSuite.scala  | 107 +++
 11 files changed, 351 insertions(+), 16 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/b3117d31/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
index 56e80cc..b59826a 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
@@ -95,7 +95,8 @@ final class RandomForestClassifier(override val uid: String)
 val trees =
   RandomForest.run(oldDataset, strategy, getNumTrees, 
getFeatureSubsetStrategy, getSeed)
 .map(_.asInstanceOf[DecisionTreeClassificationModel])
-new RandomForestClassificationModel(trees, numClasses)
+val numFeatures = oldDataset.first().features.size
+new RandomForestClassificationModel(trees, numFeatures, numClasses)
   }
 
   override def copy(extra: ParamMap): RandomForestClassifier = 
defaultCopy(extra)
@@ -118,11 +119,13 @@ object RandomForestClassifier {
  * features.
  * @param _trees  Decision trees in the ensemble.
  *   Warning: These have null parents.
+ * @param numFeatures  Number of features used by this model
  */
 @Experimental
 final class RandomForestClassificationModel private[ml] (
 override val uid: String,
 private val _trees: Array[DecisionTreeClassificationModel],
+val numFeatures: Int,
 override val numClasses: Int)
   extends ProbabilisticClassificationModel[Vector, 
RandomForestClassificationModel]
   with TreeEnsembleModel with Serializable {
@@ -133,8 +136,8 @@ final class RandomForestClassificationModel private[ml] (
* Construct a random forest classification model, with all trees weighted 
equally.
* @param trees  Component trees
*/
-  def this(trees: Array[DecisionTreeClassificationModel], numClasses: Int) =
-this(Identifiable.randomUID("rfc"), trees, numClasses)
+  def this(trees: Array[DecisionTreeClassificationModel], numFeatures: Int, 
numClasses: Int) =
+this(Identifiable.randomUID("rfc"), trees, numFeatures, numClasses)

spark git commit: [SPARK-9191] [ML] [Doc] Add ml.PCA user guide and code examples

2015-08-03 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master ba1c4e138 -> 8ca287ebb


[SPARK-9191] [ML] [Doc] Add ml.PCA user guide and code examples

Add ml.PCA user guide document and code examples for Scala/Java/Python.

Author: Yanbo Liang 

Closes #7522 from yanboliang/ml-pca-md and squashes the following commits:

60dec05 [Yanbo Liang] address comments
f992abe [Yanbo Liang] Add ml.PCA doc and examples


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8ca287eb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8ca287eb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8ca287eb

Branch: refs/heads/master
Commit: 8ca287ebbd58985a568341b08040d0efa9d3641a
Parents: ba1c4e1
Author: Yanbo Liang 
Authored: Mon Aug 3 13:58:00 2015 -0700
Committer: Joseph K. Bradley 
Committed: Mon Aug 3 13:58:00 2015 -0700

--
 docs/ml-features.md | 86 
 1 file changed, 86 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/8ca287eb/docs/ml-features.md
--
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 54068de..fa0ad1f 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -461,6 +461,92 @@ for binarized_feature, in binarizedFeatures.collect():
 
 
 
+## PCA
+
+[PCA](http://en.wikipedia.org/wiki/Principal_component_analysis) is a 
statistical procedure that uses an orthogonal transformation to convert a set 
of observations of possibly correlated variables into a set of values of 
linearly uncorrelated variables called principal components. A 
[PCA](api/scala/index.html#org.apache.spark.ml.feature.PCA) class trains a 
model to project vectors to a low-dimensional space using PCA. The example 
below shows how to project 5-dimensional feature vectors into 3-dimensional 
principal components.
+
+
+
+See the [Scala API 
documentation](api/scala/index.html#org.apache.spark.ml.feature.PCA) for API 
details.
+{% highlight scala %}
+import org.apache.spark.ml.feature.PCA
+import org.apache.spark.mllib.linalg.Vectors
+
+val data = Array(
+  Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
+  Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
+  Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
+)
+val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+val pca = new PCA()
+  .setInputCol("features")
+  .setOutputCol("pcaFeatures")
+  .setK(3)
+  .fit(df)
+val pcaDF = pca.transform(df)
+val result = pcaDF.select("pcaFeatures")
+result.show()
+{% endhighlight %}
+
+
+
+See the [Java API 
documentation](api/java/org/apache/spark/ml/feature/PCA.html) for API details.
+{% highlight java %}
+import com.google.common.collect.Lists;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.PCA
+import org.apache.spark.ml.feature.PCAModel
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+JavaSparkContext jsc = ...
+SQLContext jsql = ...
+JavaRDD data = jsc.parallelize(Lists.newArrayList(
+  RowFactory.create(Vectors.sparse(5, new int[]{1, 3}, new double[]{1.0, 
7.0})),
+  RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)),
+  RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))
+));
+StructType schema = new StructType(new StructField[] {
+  new StructField("features", new VectorUDT(), false, Metadata.empty()),
+});
+DataFrame df = jsql.createDataFrame(data, schema);
+PCAModel pca = new PCA()
+  .setInputCol("features")
+  .setOutputCol("pcaFeatures")
+  .setK(3)
+  .fit(df);
+DataFrame result = pca.transform(df).select("pcaFeatures");
+result.show();
+{% endhighlight %}
+
+
+
+See the [Python API 
documentation](api/python/pyspark.ml.html#pyspark.ml.feature.PCA) for API 
details.
+{% highlight python %}
+from pyspark.ml.feature import PCA
+from pyspark.mllib.linalg import Vectors
+
+data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
+  (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
+  (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
+df = sqlContext.createDataFrame(data,["features"])
+pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
+model = pca.fit(df)
+result = model.transform(df).select("pcaFeatures")
+result.show(truncate=False)
+{% endhighlight %}
+
+
+
 ## PolynomialExpansion
 
 [Polynomial expansion](http://en.wikipedia.org/wiki/Polynomial_expansion) is 
the process of expanding your features into a polynomial space, which is 
formulated by an n-degree 

spark git commit: [SPARK-9191] [ML] [Doc] Add ml.PCA user guide and code examples

2015-08-03 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 dc0c8c982 -> e7329ab31


[SPARK-9191] [ML] [Doc] Add ml.PCA user guide and code examples

Add ml.PCA user guide document and code examples for Scala/Java/Python.

Author: Yanbo Liang 

Closes #7522 from yanboliang/ml-pca-md and squashes the following commits:

60dec05 [Yanbo Liang] address comments
f992abe [Yanbo Liang] Add ml.PCA doc and examples

(cherry picked from commit 8ca287ebbd58985a568341b08040d0efa9d3641a)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e7329ab3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e7329ab3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e7329ab3

Branch: refs/heads/branch-1.5
Commit: e7329ab31323a89d1e07c808927e5543876e3ce3
Parents: dc0c8c9
Author: Yanbo Liang 
Authored: Mon Aug 3 13:58:00 2015 -0700
Committer: Joseph K. Bradley 
Committed: Mon Aug 3 14:01:18 2015 -0700

--
 docs/ml-features.md | 86 
 1 file changed, 86 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e7329ab3/docs/ml-features.md
--
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 54068de..fa0ad1f 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -461,6 +461,92 @@ for binarized_feature, in binarizedFeatures.collect():
 
 
 
+## PCA
+
+[PCA](http://en.wikipedia.org/wiki/Principal_component_analysis) is a 
statistical procedure that uses an orthogonal transformation to convert a set 
of observations of possibly correlated variables into a set of values of 
linearly uncorrelated variables called principal components. A 
[PCA](api/scala/index.html#org.apache.spark.ml.feature.PCA) class trains a 
model to project vectors to a low-dimensional space using PCA. The example 
below shows how to project 5-dimensional feature vectors into 3-dimensional 
principal components.
+
+
+
+See the [Scala API 
documentation](api/scala/index.html#org.apache.spark.ml.feature.PCA) for API 
details.
+{% highlight scala %}
+import org.apache.spark.ml.feature.PCA
+import org.apache.spark.mllib.linalg.Vectors
+
+val data = Array(
+  Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
+  Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
+  Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
+)
+val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+val pca = new PCA()
+  .setInputCol("features")
+  .setOutputCol("pcaFeatures")
+  .setK(3)
+  .fit(df)
+val pcaDF = pca.transform(df)
+val result = pcaDF.select("pcaFeatures")
+result.show()
+{% endhighlight %}
+
+
+
+See the [Java API 
documentation](api/java/org/apache/spark/ml/feature/PCA.html) for API details.
+{% highlight java %}
+import com.google.common.collect.Lists;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.PCA
+import org.apache.spark.ml.feature.PCAModel
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+JavaSparkContext jsc = ...
+SQLContext jsql = ...
+JavaRDD data = jsc.parallelize(Lists.newArrayList(
+  RowFactory.create(Vectors.sparse(5, new int[]{1, 3}, new double[]{1.0, 
7.0})),
+  RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)),
+  RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))
+));
+StructType schema = new StructType(new StructField[] {
+  new StructField("features", new VectorUDT(), false, Metadata.empty()),
+});
+DataFrame df = jsql.createDataFrame(data, schema);
+PCAModel pca = new PCA()
+  .setInputCol("features")
+  .setOutputCol("pcaFeatures")
+  .setK(3)
+  .fit(df);
+DataFrame result = pca.transform(df).select("pcaFeatures");
+result.show();
+{% endhighlight %}
+
+
+
+See the [Python API 
documentation](api/python/pyspark.ml.html#pyspark.ml.feature.PCA) for API 
details.
+{% highlight python %}
+from pyspark.ml.feature import PCA
+from pyspark.mllib.linalg import Vectors
+
+data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
+  (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
+  (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
+df = sqlContext.createDataFrame(data,["features"])
+pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
+model = pca.fit(df)
+result = model.transform(df).select("pcaFeatures")
+result.show(truncate=False)
+{% endhighlight %}
+
+
+
 ## PolynomialExpansion
 
 [Polynomial expansion](http://en.wikipedia.org/wiki/Polynomial_exp

spark git commit: [SPARK-8874] [ML] Add missing methods in Word2Vec

2015-08-03 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master a2409d1c8 -> 13675c742


[SPARK-8874] [ML] Add missing methods in Word2Vec

Add missing methods

1. getVectors
2. findSynonyms

to the Word2Vec Scala and Python APIs
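
Editorial sketch, not part of the commit: a rough illustration of the two new
accessors, assuming a spark-shell style sqlContext; the column name "text" and
the toy sentences are placeholders.

    import org.apache.spark.ml.feature.Word2Vec
    import sqlContext.implicits._

    // Tiny toy corpus: each row is a sequence of tokens.
    val df = Seq(
      "a b c".split(" ").toSeq,
      "a b b c a".split(" ").toSeq
    ).map(Tuple1.apply).toDF("text")

    val model = new Word2Vec()
      .setInputCol("text")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
      .fit(df)

    // New in this PR: DataFrame of (word, vector) rows for the vocabulary.
    model.getVectors.show()

    // New in this PR: the 2 words closest to "b", with cosine similarities.
    model.findSynonyms("b", 2).show()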

mengxr

Author: MechCoder 

Closes #7263 from MechCoder/missing_methods_w2vec and squashes the following 
commits:

149d5ca [MechCoder] minor doc
69d91b7 [MechCoder] [SPARK-8874] [ML] Add missing methods in Word2Vec


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/13675c74
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/13675c74
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/13675c74

Branch: refs/heads/master
Commit: 13675c742a71cbdc8324701c3694775ce1dd5c62
Parents: a2409d1
Author: MechCoder 
Authored: Mon Aug 3 16:44:25 2015 -0700
Committer: Joseph K. Bradley 
Committed: Mon Aug 3 16:44:25 2015 -0700

--
 .../org/apache/spark/ml/feature/Word2Vec.scala  | 38 +++-
 .../apache/spark/ml/feature/Word2VecSuite.scala | 62 
 2 files changed, 99 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/13675c74/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
index 6ea6590..b4f46ce 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
@@ -18,15 +18,17 @@
 package org.apache.spark.ml.feature
 
 import org.apache.spark.annotation.Experimental
+import org.apache.spark.SparkContext
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
 import org.apache.spark.mllib.feature
-import org.apache.spark.mllib.linalg.{VectorUDT, Vectors}
+import org.apache.spark.mllib.linalg.{VectorUDT, Vector, Vectors}
 import org.apache.spark.mllib.linalg.BLAS._
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.functions._
+import org.apache.spark.sql.SQLContext
 import org.apache.spark.sql.types._
 
 /**
@@ -146,6 +148,40 @@ class Word2VecModel private[ml] (
 wordVectors: feature.Word2VecModel)
   extends Model[Word2VecModel] with Word2VecBase {
 
+
+  /**
+   * Returns a dataframe with two fields, "word" and "vector", with "word" 
being a String and
+   * and the vector the DenseVector that it is mapped to.
+   */
+  val getVectors: DataFrame = {
+val sc = SparkContext.getOrCreate()
+val sqlContext = SQLContext.getOrCreate(sc)
+import sqlContext.implicits._
+val wordVec = wordVectors.getVectors.mapValues(vec => 
Vectors.dense(vec.map(_.toDouble)))
+sc.parallelize(wordVec.toSeq).toDF("word", "vector")
+  }
+
+  /**
+   * Find "num" number of words closest in similarity to the given word.
+   * Returns a dataframe with the words and the cosine similarities between the
+   * synonyms and the given word.
+   */
+  def findSynonyms(word: String, num: Int): DataFrame = {
+findSynonyms(wordVectors.transform(word), num)
+  }
+
+  /**
+   * Find "num" number of words closest to similarity to the given vector 
representation
+   * of the word. Returns a dataframe with the words and the cosine 
similarities between the
+   * synonyms and the given word vector.
+   */
+  def findSynonyms(word: Vector, num: Int): DataFrame = {
+val sc = SparkContext.getOrCreate()
+val sqlContext = SQLContext.getOrCreate(sc)
+import sqlContext.implicits._
+sc.parallelize(wordVectors.findSynonyms(word, num)).toDF("word", 
"similarity")
+  }
+
   /** @group setParam */
   def setInputCol(value: String): this.type = set(inputCol, value)
 

http://git-wip-us.apache.org/repos/asf/spark/blob/13675c74/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
index aa6ce53..adcda0e 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
@@ -67,5 +67,67 @@ class Word2VecSuite extends SparkFunSuite with 
MLlibTestSparkContext {
 assert(vector1 ~== vector2 absTol 1E-5, "Transformed vector is 
different with expected.")
 }
   }
+
+  test("getVectors") {
+
+val sqlContext = new SQLContext(sc)
+import sqlContext.implicits._
+
+val sentence = "a b " * 100 + "a c " * 10
+val doc = sc.parallelize(Seq(sentence, sentence)).map(line =>

spark git commit: [SPARK-8874] [ML] Add missing methods in Word2Vec

2015-08-03 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 73fab8849 -> acda9d954


[SPARK-8874] [ML] Add missing methods in Word2Vec

Add missing methods

1. getVectors
2. findSynonyms

to the Word2Vec Scala and Python APIs

mengxr

Author: MechCoder 

Closes #7263 from MechCoder/missing_methods_w2vec and squashes the following 
commits:

149d5ca [MechCoder] minor doc
69d91b7 [MechCoder] [SPARK-8874] [ML] Add missing methods in Word2Vec

(cherry picked from commit 13675c742a71cbdc8324701c3694775ce1dd5c62)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/acda9d95
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/acda9d95
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/acda9d95

Branch: refs/heads/branch-1.5
Commit: acda9d9546fa3f54676e48d76a2b66016d204074
Parents: 73fab88
Author: MechCoder 
Authored: Mon Aug 3 16:44:25 2015 -0700
Committer: Joseph K. Bradley 
Committed: Mon Aug 3 16:46:00 2015 -0700

--
 .../org/apache/spark/ml/feature/Word2Vec.scala  | 38 +++-
 .../apache/spark/ml/feature/Word2VecSuite.scala | 62 
 2 files changed, 99 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/acda9d95/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
index 6ea6590..b4f46ce 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
@@ -18,15 +18,17 @@
 package org.apache.spark.ml.feature
 
 import org.apache.spark.annotation.Experimental
+import org.apache.spark.SparkContext
 import org.apache.spark.ml.{Estimator, Model}
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared._
 import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
 import org.apache.spark.mllib.feature
-import org.apache.spark.mllib.linalg.{VectorUDT, Vectors}
+import org.apache.spark.mllib.linalg.{VectorUDT, Vector, Vectors}
 import org.apache.spark.mllib.linalg.BLAS._
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.functions._
+import org.apache.spark.sql.SQLContext
 import org.apache.spark.sql.types._
 
 /**
@@ -146,6 +148,40 @@ class Word2VecModel private[ml] (
 wordVectors: feature.Word2VecModel)
   extends Model[Word2VecModel] with Word2VecBase {
 
+
+  /**
+   * Returns a dataframe with two fields, "word" and "vector", with "word" 
being a String and
+   * and the vector the DenseVector that it is mapped to.
+   */
+  val getVectors: DataFrame = {
+val sc = SparkContext.getOrCreate()
+val sqlContext = SQLContext.getOrCreate(sc)
+import sqlContext.implicits._
+val wordVec = wordVectors.getVectors.mapValues(vec => 
Vectors.dense(vec.map(_.toDouble)))
+sc.parallelize(wordVec.toSeq).toDF("word", "vector")
+  }
+
+  /**
+   * Find "num" number of words closest in similarity to the given word.
+   * Returns a dataframe with the words and the cosine similarities between the
+   * synonyms and the given word.
+   */
+  def findSynonyms(word: String, num: Int): DataFrame = {
+findSynonyms(wordVectors.transform(word), num)
+  }
+
+  /**
+   * Find "num" number of words closest to similarity to the given vector 
representation
+   * of the word. Returns a dataframe with the words and the cosine 
similarities between the
+   * synonyms and the given word vector.
+   */
+  def findSynonyms(word: Vector, num: Int): DataFrame = {
+val sc = SparkContext.getOrCreate()
+val sqlContext = SQLContext.getOrCreate(sc)
+import sqlContext.implicits._
+sc.parallelize(wordVectors.findSynonyms(word, num)).toDF("word", 
"similarity")
+  }
+
   /** @group setParam */
   def setInputCol(value: String): this.type = set(inputCol, value)
 

http://git-wip-us.apache.org/repos/asf/spark/blob/acda9d95/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
index aa6ce53..adcda0e 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
@@ -67,5 +67,67 @@ class Word2VecSuite extends SparkFunSuite with 
MLlibTestSparkContext {
 assert(vector1 ~== vector2 absTol 1E-5, "Transformed vector is 
different with expected.")
 }
   }
+
+  test("getVectors") {
+
+val sqlContext = new SQLContext(sc)
+import sqlContext.implicits._
+
+

spark git commit: [SPARK-8069] [ML] Add multiclass thresholds for ProbabilisticClassifier

2015-08-04 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 a9277cd5a -> c5250ddc5


[SPARK-8069] [ML] Add multiclass thresholds for ProbabilisticClassifier

This PR replaces the old "threshold" with a generalized "thresholds" Param. We
keep getThreshold/setThreshold for backwards compatibility in binary
classification.
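
Editorial sketch, not part of the commit: roughly how the generalized Param is
meant to be used, assuming the final API exposes setThresholds(Array[Double])
on probabilistic classifiers and keeps setThreshold(Double) for the binary
case; the class with the largest probability(i) / thresholds(i) is predicted.

    import org.apache.spark.ml.classification.LogisticRegression

    val lr = new LogisticRegression()
    // Binary shortcut, kept for backwards compatibility:
    lr.setThreshold(0.6)
    // Generalized form: one positive entry per class (equivalent to the above):
    lr.setThresholds(Array(0.4, 0.6))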

Note that the primary author of this PR is holdenk

Author: Holden Karau 
Author: Joseph K. Bradley 

Closes #7909 from 
jkbradley/holdenk-SPARK-8069-add-cutoff-aka-threshold-to-random-forest and 
squashes the following commits:

3952977 [Joseph K. Bradley] fixed pyspark doc test
85febc8 [Joseph K. Bradley] made python unit tests a little more robust
7eb1d86 [Joseph K. Bradley] small cleanups
6cc2ed8 [Joseph K. Bradley] Fixed remaining merge issues.
0255e44 [Joseph K. Bradley] Many cleanups for thresholds, some more tests
7565a60 [Holden Karau] fix pep8 style checks, add a getThreshold method similar 
to our LogisticRegression.scala one for API compat
be87f26 [Holden Karau] Convert threshold to thresholds in the python code, add 
specialized support for Array[Double] to shared parems codegen, etc.
6747dad [Holden Karau] Override raw2prediction for ProbabilisticClassifier, fix 
some tests
25df168 [Holden Karau] Fix handling of thresholds in LogisticRegression
c02d6c0 [Holden Karau] No default for thresholds
5e43628 [Holden Karau] CR feedback and fixed the renamed test
f3fbbd1 [Holden Karau] revert the changes to random forest :(
51f581c [Holden Karau] Add explicit types to public methods, fix long line
f7032eb [Holden Karau] Fix a java test bug, remove some unecessary changes
adf15b4 [Holden Karau] rename the classifier suite test to 
ProbabilisticClassifierSuite now that we only have it in Probabilistic
398078a [Holden Karau] move the thresholding around a bunch based on the design 
doc
4893bdc [Holden Karau] Use numtrees of 3 since previous result was tied (one 
tree for each) and the switch from different max methods picked a different 
element (since they were equal I think this is ok)
638854c [Holden Karau] Add a scala RandomForestClassifierSuite test based on 
corresponding python test
e09919c [Holden Karau] Fix return type, I need more coffee
8d92cac [Holden Karau] Use ClassifierParams as the head
3456ed3 [Holden Karau] Add explicit return types even though just test
a0f3b0c [Holden Karau] scala style fixes
6f14314 [Holden Karau] Since hasthreshold/hasthresholds is in root classifier 
now
ffc8dab [Holden Karau] Update the sharedParams
0420290 [Holden Karau] Allow us to override the get methods selectively
978e77a [Holden Karau] Move HasThreshold into classifier params and start 
defining the overloaded getThreshold/getThresholds functions
1433e52 [Holden Karau] Revert "try and hide threshold but chainges the API so 
no dice there"
1f09a2e [Holden Karau] try and hide threshold but chainges the API so no dice 
there
efb9084 [Holden Karau] move setThresholds only to where its used
6b34809 [Holden Karau] Add a test with thresholding for the RFCS
74f54c3 [Holden Karau] Fix creation of vote array
1986fa8 [Holden Karau] Setting the thresholds only makes sense if the 
underlying class hasn't overridden predict, so lets push it down.
2f44b18 [Holden Karau] Add a global default of null for thresholds param
f338cfc [Holden Karau] Wait that wasn't a good idea, Revert "Some progress 
towards unifying threshold and thresholds"
634b06f [Holden Karau] Some progress towards unifying threshold and thresholds
85c9e01 [Holden Karau] Test passes again... little fnur
099c0f3 [Holden Karau] Move thresholds around some more (set on model not 
trainer)
0f46836 [Holden Karau] Start adding a classifiersuite
f70eb5e [Holden Karau] Fix test compile issues
a7d59c8 [Holden Karau] Move thresholding into Classifier trait
5d999d2 [Holden Karau] Some more progress, start adding a test (maybe try and 
see if we can find a better thing to use for the base of the test)
1fed644 [Holden Karau] Use thresholds to scale scores in random forest 
classifcation
31d6bf2 [Holden Karau] Start threading the threshold info through
0ef228c [Holden Karau] Add hasthresholds

(cherry picked from commit 5a23213c148bfe362514f9c71f5273ebda0a848a)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c5250ddc
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c5250ddc
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c5250ddc

Branch: refs/heads/branch-1.5
Commit: c5250ddc5242a071549e980f69fa8bd785168979
Parents: a9277cd
Author: Holden Karau 
Authored: Tue Aug 4 10:12:22 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 4 10:12:33 2015 -0700

--
 .../examples/ml/JavaSimpleParamsExample.java|  3 +-
 .../src/main/python/ml/simple_params_example.py |  2 +-
 .../spark/examples/ml/SimplePar

spark git commit: [SPARK-8069] [ML] Add multiclass thresholds for ProbabilisticClassifier

2015-08-04 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 34a0eb2e8 -> 5a23213c1


[SPARK-8069] [ML] Add multiclass thresholds for ProbabilisticClassifier

This PR replaces the old "threshold" with a generalized "thresholds" Param. We
keep getThreshold/setThreshold for backwards compatibility in binary
classification.

Note that the primary author of this PR is holdenk

Author: Holden Karau 
Author: Joseph K. Bradley 

Closes #7909 from 
jkbradley/holdenk-SPARK-8069-add-cutoff-aka-threshold-to-random-forest and 
squashes the following commits:

3952977 [Joseph K. Bradley] fixed pyspark doc test
85febc8 [Joseph K. Bradley] made python unit tests a little more robust
7eb1d86 [Joseph K. Bradley] small cleanups
6cc2ed8 [Joseph K. Bradley] Fixed remaining merge issues.
0255e44 [Joseph K. Bradley] Many cleanups for thresholds, some more tests
7565a60 [Holden Karau] fix pep8 style checks, add a getThreshold method similar 
to our LogisticRegression.scala one for API compat
be87f26 [Holden Karau] Convert threshold to thresholds in the python code, add 
specialized support for Array[Double] to shared parems codegen, etc.
6747dad [Holden Karau] Override raw2prediction for ProbabilisticClassifier, fix 
some tests
25df168 [Holden Karau] Fix handling of thresholds in LogisticRegression
c02d6c0 [Holden Karau] No default for thresholds
5e43628 [Holden Karau] CR feedback and fixed the renamed test
f3fbbd1 [Holden Karau] revert the changes to random forest :(
51f581c [Holden Karau] Add explicit types to public methods, fix long line
f7032eb [Holden Karau] Fix a java test bug, remove some unecessary changes
adf15b4 [Holden Karau] rename the classifier suite test to 
ProbabilisticClassifierSuite now that we only have it in Probabilistic
398078a [Holden Karau] move the thresholding around a bunch based on the design 
doc
4893bdc [Holden Karau] Use numtrees of 3 since previous result was tied (one 
tree for each) and the switch from different max methods picked a different 
element (since they were equal I think this is ok)
638854c [Holden Karau] Add a scala RandomForestClassifierSuite test based on 
corresponding python test
e09919c [Holden Karau] Fix return type, I need more coffee
8d92cac [Holden Karau] Use ClassifierParams as the head
3456ed3 [Holden Karau] Add explicit return types even though just test
a0f3b0c [Holden Karau] scala style fixes
6f14314 [Holden Karau] Since hasthreshold/hasthresholds is in root classifier 
now
ffc8dab [Holden Karau] Update the sharedParams
0420290 [Holden Karau] Allow us to override the get methods selectively
978e77a [Holden Karau] Move HasThreshold into classifier params and start 
defining the overloaded getThreshold/getThresholds functions
1433e52 [Holden Karau] Revert "try and hide threshold but chainges the API so 
no dice there"
1f09a2e [Holden Karau] try and hide threshold but chainges the API so no dice 
there
efb9084 [Holden Karau] move setThresholds only to where its used
6b34809 [Holden Karau] Add a test with thresholding for the RFCS
74f54c3 [Holden Karau] Fix creation of vote array
1986fa8 [Holden Karau] Setting the thresholds only makes sense if the 
underlying class hasn't overridden predict, so lets push it down.
2f44b18 [Holden Karau] Add a global default of null for thresholds param
f338cfc [Holden Karau] Wait that wasn't a good idea, Revert "Some progress 
towards unifying threshold and thresholds"
634b06f [Holden Karau] Some progress towards unifying threshold and thresholds
85c9e01 [Holden Karau] Test passes again... little fnur
099c0f3 [Holden Karau] Move thresholds around some more (set on model not 
trainer)
0f46836 [Holden Karau] Start adding a classifiersuite
f70eb5e [Holden Karau] Fix test compile issues
a7d59c8 [Holden Karau] Move thresholding into Classifier trait
5d999d2 [Holden Karau] Some more progress, start adding a test (maybe try and 
see if we can find a better thing to use for the base of the test)
1fed644 [Holden Karau] Use thresholds to scale scores in random forest 
classifcation
31d6bf2 [Holden Karau] Start threading the threshold info through
0ef228c [Holden Karau] Add hasthresholds


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5a23213c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5a23213c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5a23213c

Branch: refs/heads/master
Commit: 5a23213c148bfe362514f9c71f5273ebda0a848a
Parents: 34a0eb2
Author: Holden Karau 
Authored: Tue Aug 4 10:12:22 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 4 10:12:22 2015 -0700

--
 .../examples/ml/JavaSimpleParamsExample.java|  3 +-
 .../src/main/python/ml/simple_params_example.py |  2 +-
 .../spark/examples/ml/SimpleParamsExample.scala |  2 +-
 .../spark/ml/classification/Classifier.scala|  3 +-

spark git commit: [SPARK-9447] [ML] [PYTHON] Added HasRawPredictionCol, HasProbabilityCol to RandomForestClassifier

2015-08-04 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 560b2da78 -> e682ee254


[SPARK-9447] [ML] [PYTHON] Added HasRawPredictionCol, HasProbabilityCol to 
RandomForestClassifier

Added HasRawPredictionCol, HasProbabilityCol to RandomForestClassifier, plus 
doc tests for those columns.

CC: holdenk yanboliang

Author: Joseph K. Bradley 

Closes #7903 from jkbradley/rf-prob-python and squashes the following commits:

c62a83f [Joseph K. Bradley] made unit test more robust
14eeba2 [Joseph K. Bradley] added HasRawPredictionCol, HasProbabilityCol to 
RandomForestClassifier in PySpark

(cherry picked from commit e375456063617cd7000d796024f41e5927f21edd)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e682ee25
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e682ee25
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e682ee25

Branch: refs/heads/branch-1.5
Commit: e682ee25477374737f3b1dfc08c98829564b26d4
Parents: 560b2da
Author: Joseph K. Bradley 
Authored: Tue Aug 4 14:54:26 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 4 14:54:34 2015 -0700

--
 python/pyspark/ml/classification.py | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e682ee25/python/pyspark/ml/classification.py
--
diff --git a/python/pyspark/ml/classification.py 
b/python/pyspark/ml/classification.py
index 291320f..5978d8f 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -347,6 +347,7 @@ class DecisionTreeClassificationModel(DecisionTreeModel):
 
 @inherit_doc
 class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, 
HasPredictionCol, HasSeed,
+ HasRawPredictionCol, HasProbabilityCol,
  DecisionTreeParams, HasCheckpointInterval):
 """
 `http://en.wikipedia.org/wiki/Random_forest  Random Forest`
@@ -354,6 +355,7 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPred
 It supports both binary and multiclass labels, as well as both continuous 
and categorical
 features.
 
+>>> import numpy
 >>> from numpy import allclose
 >>> from pyspark.mllib.linalg import Vectors
 >>> from pyspark.ml.feature import StringIndexer
@@ -368,8 +370,13 @@ class RandomForestClassifier(JavaEstimator, 
HasFeaturesCol, HasLabelCol, HasPred
 >>> allclose(model.treeWeights, [1.0, 1.0, 1.0])
 True
 >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], 
["features"])
->>> model.transform(test0).head().prediction
+>>> result = model.transform(test0).head()
+>>> result.prediction
 0.0
+>>> numpy.argmax(result.probability)
+0
+>>> numpy.argmax(result.rawPrediction)
+0
 >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], 
["features"])
 >>> model.transform(test1).head().prediction
 1.0
@@ -390,11 +397,13 @@ class RandomForestClassifier(JavaEstimator, 
HasFeaturesCol, HasLabelCol, HasPred
 
 @keyword_only
 def __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction",
+ probabilityCol="probability", 
rawPredictionCol="rawPrediction",
  maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0,
  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, 
impurity="gini",
  numTrees=20, featureSubsetStrategy="auto", seed=None):
 """
 __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction", \
+ probabilityCol="probability", 
rawPredictionCol="rawPrediction", \
  maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0, \
  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, 
impurity="gini", \
  numTrees=20, featureSubsetStrategy="auto", seed=None)
@@ -427,11 +436,13 @@ class RandomForestClassifier(JavaEstimator, 
HasFeaturesCol, HasLabelCol, HasPred
 
 @keyword_only
 def setParams(self, featuresCol="features", labelCol="label", 
predictionCol="prediction",
+  probabilityCol="probability", 
rawPredictionCol="rawPrediction",
   maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInf

spark git commit: [SPARK-9447] [ML] [PYTHON] Added HasRawPredictionCol, HasProbabilityCol to RandomForestClassifier

2015-08-04 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 9d668b736 -> e37545606


[SPARK-9447] [ML] [PYTHON] Added HasRawPredictionCol, HasProbabilityCol to 
RandomForestClassifier

Added HasRawPredictionCol, HasProbabilityCol to RandomForestClassifier, plus 
doc tests for those columns.

CC: holdenk yanboliang

Author: Joseph K. Bradley 

Closes #7903 from jkbradley/rf-prob-python and squashes the following commits:

c62a83f [Joseph K. Bradley] made unit test more robust
14eeba2 [Joseph K. Bradley] added HasRawPredictionCol, HasProbabilityCol to 
RandomForestClassifier in PySpark


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e3754560
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e3754560
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e3754560

Branch: refs/heads/master
Commit: e375456063617cd7000d796024f41e5927f21edd
Parents: 9d668b7
Author: Joseph K. Bradley 
Authored: Tue Aug 4 14:54:26 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 4 14:54:26 2015 -0700

--
 python/pyspark/ml/classification.py | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e3754560/python/pyspark/ml/classification.py
--
diff --git a/python/pyspark/ml/classification.py 
b/python/pyspark/ml/classification.py
index 291320f..5978d8f 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -347,6 +347,7 @@ class DecisionTreeClassificationModel(DecisionTreeModel):
 
 @inherit_doc
 class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, 
HasPredictionCol, HasSeed,
+ HasRawPredictionCol, HasProbabilityCol,
  DecisionTreeParams, HasCheckpointInterval):
 """
 `http://en.wikipedia.org/wiki/Random_forest  Random Forest`
@@ -354,6 +355,7 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPred
 It supports both binary and multiclass labels, as well as both continuous 
and categorical
 features.
 
+>>> import numpy
 >>> from numpy import allclose
 >>> from pyspark.mllib.linalg import Vectors
 >>> from pyspark.ml.feature import StringIndexer
@@ -368,8 +370,13 @@ class RandomForestClassifier(JavaEstimator, 
HasFeaturesCol, HasLabelCol, HasPred
 >>> allclose(model.treeWeights, [1.0, 1.0, 1.0])
 True
 >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], 
["features"])
->>> model.transform(test0).head().prediction
+>>> result = model.transform(test0).head()
+>>> result.prediction
 0.0
+>>> numpy.argmax(result.probability)
+0
+>>> numpy.argmax(result.rawPrediction)
+0
 >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], 
["features"])
 >>> model.transform(test1).head().prediction
 1.0
@@ -390,11 +397,13 @@ class RandomForestClassifier(JavaEstimator, 
HasFeaturesCol, HasLabelCol, HasPred
 
 @keyword_only
 def __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction",
+ probabilityCol="probability", 
rawPredictionCol="rawPrediction",
  maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0,
  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, 
impurity="gini",
  numTrees=20, featureSubsetStrategy="auto", seed=None):
 """
 __init__(self, featuresCol="features", labelCol="label", 
predictionCol="prediction", \
+ probabilityCol="probability", 
rawPredictionCol="rawPrediction", \
  maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0, \
  maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, 
impurity="gini", \
  numTrees=20, featureSubsetStrategy="auto", seed=None)
@@ -427,11 +436,13 @@ class RandomForestClassifier(JavaEstimator, 
HasFeaturesCol, HasLabelCol, HasPred
 
 @keyword_only
 def setParams(self, featuresCol="features", labelCol="label", 
predictionCol="prediction",
+  probabilityCol="probability", 
rawPredictionCol="rawPrediction",
   maxDepth=5, maxBins=32, minInstancesPerNode=1, 
minInfoGain=0.0,
   maxMemoryInMB=256, cacheNodeIds=False, 
checkpointInterval=10, seed=None

spark git commit: [SPARK-9582] [ML] LDA cleanups

2015-08-04 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master e37545606 -> 1833d9c08


[SPARK-9582] [ML] LDA cleanups

Small cleanups to recent LDA additions and docs.

CC: feynmanliang

Author: Joseph K. Bradley 

Closes #7916 from jkbradley/lda-cleanups and squashes the following commits:

f7021d9 [Joseph K. Bradley] broadcasting large matrices for LDA in local model 
and online learning
97947aa [Joseph K. Bradley] a few more cleanups
5b03f88 [Joseph K. Bradley] reverted split of lda log likelihood
c566915 [Joseph K. Bradley] small edit to make review easier
63f6c7d [Joseph K. Bradley] clarified log likelihood for lda models


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1833d9c0
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1833d9c0
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1833d9c0

Branch: refs/heads/master
Commit: 1833d9c08f021d991334424d0a6d5ec21d1fccb2
Parents: e375456
Author: Joseph K. Bradley 
Authored: Tue Aug 4 15:43:13 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 4 15:43:13 2015 -0700

--
 .../spark/mllib/clustering/LDAModel.scala   | 82 +++-
 .../spark/mllib/clustering/LDAOptimizer.scala   | 19 +++--
 2 files changed, 58 insertions(+), 43 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/1833d9c0/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 6af90d7..33babda 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -27,6 +27,7 @@ import org.json4s.jackson.JsonMethods._
 import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.java.JavaPairRDD
+import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.graphx.{Edge, EdgeContext, Graph, VertexId}
 import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors}
 import org.apache.spark.mllib.util.{Loader, Saveable}
@@ -217,26 +218,28 @@ class LocalLDAModel private[clustering] (
   // TODO: declare in LDAModel and override once implemented in 
DistributedLDAModel
   /**
* Calculates a lower bound on the log likelihood of the entire corpus.
+   *
+   * See Equation (16) in original Online LDA paper.
+   *
* @param documents test corpus to use for calculating log likelihood
* @return variational lower bound on the log likelihood of the entire corpus
*/
-  def logLikelihood(documents: RDD[(Long, Vector)]): Double = bound(documents,
+  def logLikelihood(documents: RDD[(Long, Vector)]): Double = 
logLikelihoodBound(documents,
 docConcentration, topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, 
gammaShape, k,
 vocabSize)
 
   /**
-   * Calculate an upper bound bound on perplexity. See Equation (16) in 
original Online
-   * LDA paper.
+   * Calculate an upper bound bound on perplexity.  (Lower is better.)
+   * See Equation (16) in original Online LDA paper.
+   *
* @param documents test corpus to use for calculating perplexity
-   * @return variational upper bound on log perplexity per word
+   * @return Variational upper bound on log perplexity per token.
*/
   def logPerplexity(documents: RDD[(Long, Vector)]): Double = {
-val corpusWords = documents
+val corpusTokenCount = documents
   .map { case (_, termCounts) => termCounts.toArray.sum }
   .sum()
-val perWordBound = -logLikelihood(documents) / corpusWords
-
-perWordBound
+-logLikelihood(documents) / corpusTokenCount
   }
 
   /**
@@ -244,17 +247,20 @@ class LocalLDAModel private[clustering] (
*log p(documents) >= E_q[log p(documents)] - E_q[log q(documents)]
* This bound is derived by decomposing the LDA model to:
*log p(documents) = E_q[log p(documents)] - E_q[log q(documents)] + 
D(q|p)
-   * and noting that the KL-divergence D(q|p) >= 0. See Equation (16) in 
original Online LDA paper.
+   * and noting that the KL-divergence D(q|p) >= 0.
+   *
+   * See Equation (16) in original Online LDA paper, as well as Appendix A.3 
in the JMLR version of
+   * the original LDA paper.
* @param documents a subset of the test corpus
* @param alpha document-topic Dirichlet prior parameters
-   * @param eta topic-word Dirichlet prior parameters
+   * @param eta topic-word Dirichlet prior parameter
* @param lambda parameters for variational q(beta | lambda) topic-word 
distributions
* @param gammaShape shape parameter for random initialization of 
variational 
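
Editorial note, not part of the commit: restating the doc comments above in
compact form (no new behavior is implied).

    % Variational lower bound used by logLikelihood (Eq. 16, online LDA paper):
    \log p(\mathrm{documents}) \;\ge\; E_q[\log p(\mathrm{documents})] - E_q[\log q(\mathrm{documents})]
    % logPerplexity is the negated bound, normalized per token:
    \mathrm{logPerplexity} = \frac{-\,\mathrm{logLikelihood}(\mathrm{documents})}{\text{total token count}}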

spark git commit: [SPARK-9582] [ML] LDA cleanups

2015-08-04 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 e682ee254 -> fe4a4f41a


[SPARK-9582] [ML] LDA cleanups

Small cleanups to recent LDA additions and docs.

CC: feynmanliang

Author: Joseph K. Bradley 

Closes #7916 from jkbradley/lda-cleanups and squashes the following commits:

f7021d9 [Joseph K. Bradley] broadcasting large matrices for LDA in local model 
and online learning
97947aa [Joseph K. Bradley] a few more cleanups
5b03f88 [Joseph K. Bradley] reverted split of lda log likelihood
c566915 [Joseph K. Bradley] small edit to make review easier
63f6c7d [Joseph K. Bradley] clarified log likelihood for lda models

(cherry picked from commit 1833d9c08f021d991334424d0a6d5ec21d1fccb2)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fe4a4f41
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fe4a4f41
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fe4a4f41

Branch: refs/heads/branch-1.5
Commit: fe4a4f41ad8b686455d58fc2fda9494e8dba5636
Parents: e682ee2
Author: Joseph K. Bradley 
Authored: Tue Aug 4 15:43:13 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 4 15:43:20 2015 -0700

--
 .../spark/mllib/clustering/LDAModel.scala   | 82 +++-
 .../spark/mllib/clustering/LDAOptimizer.scala   | 19 +++--
 2 files changed, 58 insertions(+), 43 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/fe4a4f41/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 6af90d7..33babda 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -27,6 +27,7 @@ import org.json4s.jackson.JsonMethods._
 import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.java.JavaPairRDD
+import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.graphx.{Edge, EdgeContext, Graph, VertexId}
 import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors}
 import org.apache.spark.mllib.util.{Loader, Saveable}
@@ -217,26 +218,28 @@ class LocalLDAModel private[clustering] (
   // TODO: declare in LDAModel and override once implemented in 
DistributedLDAModel
   /**
* Calculates a lower bound on the log likelihood of the entire corpus.
+   *
+   * See Equation (16) in original Online LDA paper.
+   *
* @param documents test corpus to use for calculating log likelihood
* @return variational lower bound on the log likelihood of the entire corpus
*/
-  def logLikelihood(documents: RDD[(Long, Vector)]): Double = bound(documents,
+  def logLikelihood(documents: RDD[(Long, Vector)]): Double = 
logLikelihoodBound(documents,
 docConcentration, topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, 
gammaShape, k,
 vocabSize)
 
   /**
-   * Calculate an upper bound bound on perplexity. See Equation (16) in 
original Online
-   * LDA paper.
+   * Calculate an upper bound bound on perplexity.  (Lower is better.)
+   * See Equation (16) in original Online LDA paper.
+   *
* @param documents test corpus to use for calculating perplexity
-   * @return variational upper bound on log perplexity per word
+   * @return Variational upper bound on log perplexity per token.
*/
   def logPerplexity(documents: RDD[(Long, Vector)]): Double = {
-val corpusWords = documents
+val corpusTokenCount = documents
   .map { case (_, termCounts) => termCounts.toArray.sum }
   .sum()
-val perWordBound = -logLikelihood(documents) / corpusWords
-
-perWordBound
+-logLikelihood(documents) / corpusTokenCount
   }
 
   /**
@@ -244,17 +247,20 @@ class LocalLDAModel private[clustering] (
*log p(documents) >= E_q[log p(documents)] - E_q[log q(documents)]
* This bound is derived by decomposing the LDA model to:
*log p(documents) = E_q[log p(documents)] - E_q[log q(documents)] + 
D(q|p)
-   * and noting that the KL-divergence D(q|p) >= 0. See Equation (16) in 
original Online LDA paper.
+   * and noting that the KL-divergence D(q|p) >= 0.
+   *
+   * See Equation (16) in original Online LDA paper, as well as Appendix A.3 
in the JMLR version of
+   * the original LDA paper.
* @param documents a subset of the test corpus
* @param alpha document-topic Dirichlet prior parameters
-   * @param eta topic-word Dirichlet prior parameters
+   * @param eta topic-word Dirichlet prior parameter
* @param lambda parameters for variational q(beta | lambda) topic-word

spark git commit: [SPARK-9609] [MLLIB] Fix spelling of Strategy.defaultStrategy

2015-08-04 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 7c8fc1f7c -> 629e26f7e


[SPARK-9609] [MLLIB] Fix spelling of Strategy.defaultStrategy

jkbradley

Author: Feynman Liang 

Closes #7941 from feynmanliang/SPARK-9609-stategy-spelling and squashes the 
following commits:

d2aafb1 [Feynman Liang] Add deprecated backwards compatibility
aa090a8 [Feynman Liang] Fix spelling


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/629e26f7
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/629e26f7
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/629e26f7

Branch: refs/heads/master
Commit: 629e26f7ee916e70f59b017cb6083aa441b26b2c
Parents: 7c8fc1f
Author: Feynman Liang 
Authored: Tue Aug 4 18:13:18 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 4 18:13:18 2015 -0700

--
 .../src/main/scala/org/apache/spark/ml/tree/treeParams.scala | 2 +-
 .../spark/mllib/tree/configuration/BoostingStrategy.scala| 2 +-
 .../org/apache/spark/mllib/tree/configuration/Strategy.scala | 8 ++--
 3 files changed, 8 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/629e26f7/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala 
b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
index e817090..dbd8d31 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
@@ -163,7 +163,7 @@ private[ml] trait DecisionTreeParams extends 
PredictorParams {
   oldAlgo: OldAlgo.Algo,
   oldImpurity: OldImpurity,
   subsamplingRate: Double): OldStrategy = {
-val strategy = OldStrategy.defaultStategy(oldAlgo)
+val strategy = OldStrategy.defaultStrategy(oldAlgo)
 strategy.impurity = oldImpurity
 strategy.checkpointInterval = getCheckpointInterval
 strategy.maxBins = getMaxBins

http://git-wip-us.apache.org/repos/asf/spark/blob/629e26f7/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
index 9fd30c9..50fe2ac 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
@@ -90,7 +90,7 @@ object BoostingStrategy {
* @return Configuration for boosting algorithm
*/
   def defaultParams(algo: Algo): BoostingStrategy = {
-val treeStrategy = Strategy.defaultStategy(algo)
+val treeStrategy = Strategy.defaultStrategy(algo)
 treeStrategy.maxDepth = 3
 algo match {
   case Algo.Classification =>

http://git-wip-us.apache.org/repos/asf/spark/blob/629e26f7/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala
index ada227c..de2c784 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala
@@ -178,14 +178,14 @@ object Strategy {
* @param algo  "Classification" or "Regression"
*/
   def defaultStrategy(algo: String): Strategy = {
-defaultStategy(Algo.fromString(algo))
+defaultStrategy(Algo.fromString(algo))
   }
 
   /**
* Construct a default set of parameters for 
[[org.apache.spark.mllib.tree.DecisionTree]]
* @param algo Algo.Classification or Algo.Regression
*/
-  def defaultStategy(algo: Algo): Strategy = algo match {
+  def defaultStrategy(algo: Algo): Strategy = algo match {
 case Algo.Classification =>
   new Strategy(algo = Classification, impurity = Gini, maxDepth = 10,
 numClasses = 2)
@@ -193,4 +193,8 @@ object Strategy {
   new Strategy(algo = Regression, impurity = Variance, maxDepth = 10,
 numClasses = 0)
   }
+
+  @deprecated("Use Strategy.defaultStrategy instead.", "1.5.0")
+  def defaultStategy(algo: Algo): Strategy = defaultStrategy(algo)
+
 }
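
Editorial sketch, not part of the commit: with the deprecated alias above, both
spellings keep compiling; the misspelled one simply emits a deprecation warning.

    import org.apache.spark.mllib.tree.configuration.{Algo, Strategy}

    val s1 = Strategy.defaultStrategy(Algo.Classification)  // corrected spelling
    val s2 = Strategy.defaultStategy(Algo.Classification)   // deprecated alias, same defaults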





spark git commit: [SPARK-9609] [MLLIB] Fix spelling of Strategy.defaultStrategy

2015-08-04 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 1954a7bb1 -> 335097548


[SPARK-9609] [MLLIB] Fix spelling of Strategy.defaultStrategy

jkbradley

Author: Feynman Liang 

Closes #7941 from feynmanliang/SPARK-9609-stategy-spelling and squashes the 
following commits:

d2aafb1 [Feynman Liang] Add deprecated backwards compatibility
aa090a8 [Feynman Liang] Fix spelling

(cherry picked from commit 629e26f7ee916e70f59b017cb6083aa441b26b2c)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/33509754
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/33509754
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/33509754

Branch: refs/heads/branch-1.5
Commit: 33509754843fe8eba303c720e6c0f6853b861e7e
Parents: 1954a7b
Author: Feynman Liang 
Authored: Tue Aug 4 18:13:18 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 4 18:13:27 2015 -0700

--
 .../src/main/scala/org/apache/spark/ml/tree/treeParams.scala | 2 +-
 .../spark/mllib/tree/configuration/BoostingStrategy.scala| 2 +-
 .../org/apache/spark/mllib/tree/configuration/Strategy.scala | 8 ++--
 3 files changed, 8 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/33509754/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala 
b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
index e817090..dbd8d31 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala
@@ -163,7 +163,7 @@ private[ml] trait DecisionTreeParams extends 
PredictorParams {
   oldAlgo: OldAlgo.Algo,
   oldImpurity: OldImpurity,
   subsamplingRate: Double): OldStrategy = {
-val strategy = OldStrategy.defaultStategy(oldAlgo)
+val strategy = OldStrategy.defaultStrategy(oldAlgo)
 strategy.impurity = oldImpurity
 strategy.checkpointInterval = getCheckpointInterval
 strategy.maxBins = getMaxBins

http://git-wip-us.apache.org/repos/asf/spark/blob/33509754/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
index 9fd30c9..50fe2ac 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala
@@ -90,7 +90,7 @@ object BoostingStrategy {
* @return Configuration for boosting algorithm
*/
   def defaultParams(algo: Algo): BoostingStrategy = {
-val treeStrategy = Strategy.defaultStategy(algo)
+val treeStrategy = Strategy.defaultStrategy(algo)
 treeStrategy.maxDepth = 3
 algo match {
   case Algo.Classification =>

http://git-wip-us.apache.org/repos/asf/spark/blob/33509754/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala
index ada227c..de2c784 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala
@@ -178,14 +178,14 @@ object Strategy {
* @param algo  "Classification" or "Regression"
*/
   def defaultStrategy(algo: String): Strategy = {
-defaultStategy(Algo.fromString(algo))
+defaultStrategy(Algo.fromString(algo))
   }
 
   /**
* Construct a default set of parameters for 
[[org.apache.spark.mllib.tree.DecisionTree]]
* @param algo Algo.Classification or Algo.Regression
*/
-  def defaultStategy(algo: Algo): Strategy = algo match {
+  def defaultStrategy(algo: Algo): Strategy = algo match {
 case Algo.Classification =>
   new Strategy(algo = Classification, impurity = Gini, maxDepth = 10,
 numClasses = 2)
@@ -193,4 +193,8 @@ object Strategy {
   new Strategy(algo = Regression, impurity = Variance, maxDepth = 10,
 numClasses = 0)
   }
+
+  @deprecated("Use Strategy.defaultStrategy instead.", "1.5.0")
+  def defaultStategy(algo: Algo): Strategy = defaultStrategy(algo)
+
 }



spark git commit: [SPARK-8601] [ML] Add an option to disable standardization for linear regression

2015-08-04 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 629e26f7e -> d92fa1417


[SPARK-8601] [ML] Add an option to disable standardization for linear regression

All compressed sensing applications, and some regression use cases, produce
better results when feature scaling is turned off. However, implementing this
naively by training on the dataset without any standardization gives a poor
rate of convergence. Instead, we still standardize the training dataset but
penalize each component differently, which yields effectively the same
objective function as a better-conditioned numerical problem. As a result,
columns with high variance are penalized less, and vice versa. Without this,
all features are standardized and therefore penalized equally.

In R, there is an option for this.
standardize

Logical flag for x variable standardization, prior to fitting the model 
sequence. The coefficients are always returned on the original scale. Default 
is standardize=TRUE. If variables are in the same units already, you might not 
wish to standardize. See details below for y standardization with 
family="gaussian".
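
Editorial sketch, not part of the commit: what disabling standardization might
look like from user code, assuming the new Param is exposed as
setStandardization on LinearRegression (per the commit list below); `training`
is a placeholder DataFrame with "label" and "features" columns.

    import org.apache.spark.ml.regression.LinearRegression

    val lr = new LinearRegression()
      .setMaxIter(100)
      .setRegParam(0.3)
      .setStandardization(false)  // features are already on comparable scales
    val model = lr.fit(training)
    // Coefficients come back on the original feature scale either way.
    println(s"weights: ${model.weights}, intercept: ${model.intercept}")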

Note that the primary author for this PR is holdenk

Author: Holden Karau 
Author: DB Tsai 

Closes #7875 from dbtsai/SPARK-8522 and squashes the following commits:

e856036 [DB Tsai] scala doc
596e96c [DB Tsai] minor
bbff347 [DB Tsai] naming
baa0805 [DB Tsai] touch up
d6234ba [DB Tsai] Merge branch 'master' into 
SPARK-8522-Disable-Linear_featureScaling-Spark-8601-in-Linear_regression
6b1dc09 [Holden Karau] Merge branch 'master' into 
SPARK-8522-Disable-Linear_featureScaling-Spark-8601-in-Linear_regression
332f140 [Holden Karau] Merge in master
eebe10a [Holden Karau] Use same comparision operator throughout the test
3f92935 [Holden Karau] merge
b83a41e [Holden Karau] Expand the tests and make them similar to the other PR 
also providing an option to disable standardization (but for LoR).
0c334a2 [Holden Karau] Remove extra line
99ce053 [Holden Karau] merge in master
e54a8a9 [Holden Karau] Fix long line
e47c574 [Holden Karau] Add support for L2 without standardization.
55d3a66 [Holden Karau] Add standardization param for linear regression
00a1dc5 [Holden Karau] Add the param to the linearregression impl


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d92fa141
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d92fa141
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d92fa141

Branch: refs/heads/master
Commit: d92fa14179287c996407d9c7d249103109f9cdef
Parents: 629e26f
Author: Holden Karau 
Authored: Tue Aug 4 18:15:26 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 4 18:15:26 2015 -0700

--
 .../ml/classification/LogisticRegression.scala  |   6 +-
 .../spark/ml/regression/LinearRegression.scala  |  70 -
 .../ml/regression/LinearRegressionSuite.scala   | 278 ++-
 3 files changed, 268 insertions(+), 86 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/d92fa141/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index c937b960..0d07383 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -133,9 +133,9 @@ class LogisticRegression(override val uid: String)
   /**
* Whether to standardize the training features before fitting the model.
* The coefficients of models will be always returned on the original scale,
-   * so it will be transparent for users. Note that when no regularization,
-   * with or without standardization, the models should be always converged to
-   * the same solution.
+   * so it will be transparent for users. Note that with/without 
standardization,
+   * the models should be always converged to the same solution when no 
regularization
+   * is applied. In R's GLMNET package, the default behavior is true as well.
* Default is true.
* @group setParam
* */

http://git-wip-us.apache.org/repos/asf/spark/blob/d92fa141/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala 
b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
index 3b85ba0..92d819b 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
+++ b

spark git commit: [SPARK-8601] [ML] Add an option to disable standardization for linear regression

2015-08-04 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 335097548 -> 2237ddbe0


[SPARK-8601] [ML] Add an option to disable standardization for linear regression

All compressed sensing applications, and some regression use cases, get better 
results when feature scaling is turned off. However, if we implement this 
naively by training on the dataset without any standardization, the rate of 
convergence will be poor. Instead, we can still standardize the training 
dataset but penalize each component differently, which gives effectively the 
same objective function while keeping a better-conditioned numerical problem. 
As a result, columns with high variance are penalized less, and vice versa. 
Without this adjustment, all features are standardized and therefore penalized 
equally.

In R, there is an option for this.
standardize

Logical flag for x variable standardization, prior to fitting the model 
sequence. The coefficients are always returned on the original scale. Default 
is standardize=TRUE. If variables are in the same units already, you might not 
wish to standardize. See details below for y standardization with 
family="gaussian".

Note that the primary author for this PR is holdenk

Author: Holden Karau 
Author: DB Tsai 

Closes #7875 from dbtsai/SPARK-8522 and squashes the following commits:

e856036 [DB Tsai] scala doc
596e96c [DB Tsai] minor
bbff347 [DB Tsai] naming
baa0805 [DB Tsai] touch up
d6234ba [DB Tsai] Merge branch 'master' into 
SPARK-8522-Disable-Linear_featureScaling-Spark-8601-in-Linear_regression
6b1dc09 [Holden Karau] Merge branch 'master' into 
SPARK-8522-Disable-Linear_featureScaling-Spark-8601-in-Linear_regression
332f140 [Holden Karau] Merge in master
eebe10a [Holden Karau] Use same comparision operator throughout the test
3f92935 [Holden Karau] merge
b83a41e [Holden Karau] Expand the tests and make them similar to the other PR 
also providing an option to disable standardization (but for LoR).
0c334a2 [Holden Karau] Remove extra line
99ce053 [Holden Karau] merge in master
e54a8a9 [Holden Karau] Fix long line
e47c574 [Holden Karau] Add support for L2 without standardization.
55d3a66 [Holden Karau] Add standardization param for linear regression
00a1dc5 [Holden Karau] Add the param to the linearregression impl

(cherry picked from commit d92fa14179287c996407d9c7d249103109f9cdef)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2237ddbe
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2237ddbe
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2237ddbe

Branch: refs/heads/branch-1.5
Commit: 2237ddbe027be084afd85fc5b7a7c22270b6e7f6
Parents: 3350975
Author: Holden Karau 
Authored: Tue Aug 4 18:15:26 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 4 18:15:35 2015 -0700

--
 .../ml/classification/LogisticRegression.scala  |   6 +-
 .../spark/ml/regression/LinearRegression.scala  |  70 -
 .../ml/regression/LinearRegressionSuite.scala   | 278 ++-
 3 files changed, 268 insertions(+), 86 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/2237ddbe/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index c937b960..0d07383 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -133,9 +133,9 @@ class LogisticRegression(override val uid: String)
   /**
* Whether to standardize the training features before fitting the model.
* The coefficients of models will be always returned on the original scale,
-   * so it will be transparent for users. Note that when no regularization,
-   * with or without standardization, the models should be always converged to
-   * the same solution.
+   * so it will be transparent for users. Note that with/without 
standardization,
+   * the models should be always converged to the same solution when no 
regularization
+   * is applied. In R's GLMNET package, the default behavior is true as well.
* Default is true.
* @group setParam
* */

http://git-wip-us.apache.org/repos/asf/spark/blob/2237ddbe/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala 
b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala
inde

spark git commit: [SPARK-9112] [ML] Implement Stats for LogisticRegression

2015-08-06 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 9f94c85ff -> c5c6aded6


[SPARK-9112] [ML] Implement Stats for LogisticRegression

I have added support for stats in LogisticRegression. The API is similar to 
that of LinearRegression, with LogisticRegressionTrainingSummary and 
LogisticRegressionSummary.

I have some queries and asked them inline.
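
As a hedged illustration of the new API (the accessors shown -- hasSummary, 
summary, objectiveHistory, areaUnderROC -- are inferred from the class names in 
this patch, and `training` is an assumed DataFrame of labels and feature 
vectors):

import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression}

val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.01)
val model = lr.fit(training)   // `training` assumed: DataFrame with "label" and "features"

if (model.hasSummary) {
  val summary = model.summary
  // Objective (loss) value at each iteration, handy for eyeballing convergence.
  summary.objectiveHistory.foreach(println)

  // For binary problems the training summary also exposes classification metrics.
  summary match {
    case b: BinaryLogisticRegressionSummary => println(s"AUC = ${b.areaUnderROC}")
    case _ => // other summary types are not part of this change
  }
}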

Author: MechCoder 

Closes #7538 from MechCoder/log_reg_stats and squashes the following commits:

2e9f7c7 [MechCoder] Change defs into lazy vals
d775371 [MechCoder] Clean up class inheritance
9586125 [MechCoder] Add abstraction to handle Multiclass Metrics
40ad8ef [MechCoder] minor
640376a [MechCoder] remove unnecessary dataframe stuff and add docs
80d9954 [MechCoder] Added tests
fbed861 [MechCoder] DataFrame support for metrics
70a0fc4 [MechCoder] [SPARK-9112] [ML] Implement Stats for LogisticRegression


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c5c6aded
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c5c6aded
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c5c6aded

Branch: refs/heads/master
Commit: c5c6aded641048a3e66ac79d9e84d34e4b1abae7
Parents: 9f94c85
Author: MechCoder 
Authored: Thu Aug 6 10:08:33 2015 -0700
Committer: Joseph K. Bradley 
Committed: Thu Aug 6 10:08:33 2015 -0700

--
 .../ml/classification/LogisticRegression.scala  | 166 ++-
 .../JavaLogisticRegressionSuite.java|   9 +
 .../LogisticRegressionSuite.scala   |  37 -
 3 files changed, 209 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c5c6aded/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 0d07383..f55134d 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -30,10 +30,12 @@ import org.apache.spark.ml.util.Identifiable
 import org.apache.spark.mllib.linalg._
 import org.apache.spark.mllib.linalg.BLAS._
 import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
 import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.{DataFrame, Row, SQLContext}
+import org.apache.spark.sql.functions.{col, udf}
 import org.apache.spark.storage.StorageLevel
 
 /**
@@ -284,7 +286,13 @@ class LogisticRegression(override val uid: String)
 
 if (handlePersistence) instances.unpersist()
 
-copyValues(new LogisticRegressionModel(uid, weights, intercept))
+val model = copyValues(new LogisticRegressionModel(uid, weights, 
intercept))
+val logRegSummary = new BinaryLogisticRegressionTrainingSummary(
+  model.transform(dataset),
+  $(probabilityCol),
+  $(labelCol),
+  objectiveHistory)
+model.setSummary(logRegSummary)
   }
 
   override def copy(extra: ParamMap): LogisticRegression = defaultCopy(extra)
@@ -319,6 +327,38 @@ class LogisticRegressionModel private[ml] (
 
   override val numClasses: Int = 2
 
+  private var trainingSummary: Option[LogisticRegressionTrainingSummary] = None
+
+  /**
+   * Gets summary of model on training set. An exception is
+   * thrown if `trainingSummary == None`.
+   */
+  def summary: LogisticRegressionTrainingSummary = trainingSummary match {
+case Some(summ) => summ
+case None =>
+  throw new SparkException(
+"No training summary available for this LogisticRegressionModel",
+new NullPointerException())
+  }
+
+  private[classification] def setSummary(
+  summary: LogisticRegressionTrainingSummary): this.type = {
+this.trainingSummary = Some(summary)
+this
+  }
+
+  /** Indicates whether a training summary exists for this model instance. */
+  def hasSummary: Boolean = trainingSummary.isDefined
+
+  /**
+   * Evaluates the model on a testset.
+   * @param dataset Test dataset to evaluate model on.
+   */
+  // TODO: decide on a good name before exposing to public API
+  private[classification] def evaluate(dataset: DataFrame): 
LogisticRegressionSummary = {
+new BinaryLogisticRegressionSummary(this.transform(dataset), 
$(probabilityCol), $(labelCol))
+  }
+
   /**
* Predict label for the given feature vector.
* The behavior of this can be adjusted using [[thresholds]].
@@ -441,6 +481,128 @@ private[classification] class MultiClassSummarizer 
extends Serializ

spark git commit: [SPARK-9112] [ML] Implement Stats for LogisticRegression

2015-08-06 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 cc4c569a8 -> 70b9ed11d


[SPARK-9112] [ML] Implement Stats for LogisticRegression

I have added support for stats in LogisticRegression. The API is similar to 
that of LinearRegression, with LogisticRegressionTrainingSummary and 
LogisticRegressionSummary.

I have some queries and asked them inline.

Author: MechCoder 

Closes #7538 from MechCoder/log_reg_stats and squashes the following commits:

2e9f7c7 [MechCoder] Change defs into lazy vals
d775371 [MechCoder] Clean up class inheritance
9586125 [MechCoder] Add abstraction to handle Multiclass Metrics
40ad8ef [MechCoder] minor
640376a [MechCoder] remove unnecessary dataframe stuff and add docs
80d9954 [MechCoder] Added tests
fbed861 [MechCoder] DataFrame support for metrics
70a0fc4 [MechCoder] [SPARK-9112] [ML] Implement Stats for LogisticRegression

(cherry picked from commit c5c6aded641048a3e66ac79d9e84d34e4b1abae7)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/70b9ed11
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/70b9ed11
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/70b9ed11

Branch: refs/heads/branch-1.5
Commit: 70b9ed11d08014b96da9d5747c0cebb4927c0459
Parents: cc4c569
Author: MechCoder 
Authored: Thu Aug 6 10:08:33 2015 -0700
Committer: Joseph K. Bradley 
Committed: Thu Aug 6 10:08:43 2015 -0700

--
 .../ml/classification/LogisticRegression.scala  | 166 ++-
 .../JavaLogisticRegressionSuite.java|   9 +
 .../LogisticRegressionSuite.scala   |  37 -
 3 files changed, 209 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/70b9ed11/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index 0d07383..f55134d 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -30,10 +30,12 @@ import org.apache.spark.ml.util.Identifiable
 import org.apache.spark.mllib.linalg._
 import org.apache.spark.mllib.linalg.BLAS._
 import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
 import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.DataFrame
+import org.apache.spark.sql.{DataFrame, Row, SQLContext}
+import org.apache.spark.sql.functions.{col, udf}
 import org.apache.spark.storage.StorageLevel
 
 /**
@@ -284,7 +286,13 @@ class LogisticRegression(override val uid: String)
 
 if (handlePersistence) instances.unpersist()
 
-copyValues(new LogisticRegressionModel(uid, weights, intercept))
+val model = copyValues(new LogisticRegressionModel(uid, weights, 
intercept))
+val logRegSummary = new BinaryLogisticRegressionTrainingSummary(
+  model.transform(dataset),
+  $(probabilityCol),
+  $(labelCol),
+  objectiveHistory)
+model.setSummary(logRegSummary)
   }
 
   override def copy(extra: ParamMap): LogisticRegression = defaultCopy(extra)
@@ -319,6 +327,38 @@ class LogisticRegressionModel private[ml] (
 
   override val numClasses: Int = 2
 
+  private var trainingSummary: Option[LogisticRegressionTrainingSummary] = None
+
+  /**
+   * Gets summary of model on training set. An exception is
+   * thrown if `trainingSummary == None`.
+   */
+  def summary: LogisticRegressionTrainingSummary = trainingSummary match {
+case Some(summ) => summ
+case None =>
+  throw new SparkException(
+"No training summary available for this LogisticRegressionModel",
+new NullPointerException())
+  }
+
+  private[classification] def setSummary(
+  summary: LogisticRegressionTrainingSummary): this.type = {
+this.trainingSummary = Some(summary)
+this
+  }
+
+  /** Indicates whether a training summary exists for this model instance. */
+  def hasSummary: Boolean = trainingSummary.isDefined
+
+  /**
+   * Evaluates the model on a testset.
+   * @param dataset Test dataset to evaluate model on.
+   */
+  // TODO: decide on a good name before exposing to public API
+  private[classification] def evaluate(dataset: DataFrame): 
LogisticRegressionSummary = {
+new BinaryLogisticRegressionSummary(this.transform(dataset), 
$(probabilityCol), $(labelCol))
+  }
+
   /**
* Predict label for the given feature vector.
* The behavior of this can be adjusted 

spark git commit: [SPARK-9533] [PYSPARK] [ML] Add missing methods in Word2Vec ML

2015-08-06 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master c5c6aded6 -> 076ec0568


[SPARK-9533] [PYSPARK] [ML] Add missing methods in Word2Vec ML

After https://github.com/apache/spark/pull/7263 it is pretty straightforward to 
add the Python wrappers.

Author: MechCoder 

Closes #7930 from MechCoder/spark-9533 and squashes the following commits:

1bea394 [MechCoder] make getVectors a lazy val
5522756 [MechCoder] [SPARK-9533] [PySpark] [ML] Add missing methods in Word2Vec 
ML


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/076ec056
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/076ec056
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/076ec056

Branch: refs/heads/master
Commit: 076ec056818a65216eaf51aa5b3bd8f697c34748
Parents: c5c6ade
Author: MechCoder 
Authored: Thu Aug 6 10:09:58 2015 -0700
Committer: Joseph K. Bradley 
Committed: Thu Aug 6 10:09:58 2015 -0700

--
 .../org/apache/spark/ml/feature/Word2Vec.scala  |  2 +-
 python/pyspark/ml/feature.py| 40 
 2 files changed, 41 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/076ec056/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
index b4f46ce..29acc3e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
@@ -153,7 +153,7 @@ class Word2VecModel private[ml] (
* Returns a dataframe with two fields, "word" and "vector", with "word" 
being a String and
* and the vector the DenseVector that it is mapped to.
*/
-  val getVectors: DataFrame = {
+  @transient lazy val getVectors: DataFrame = {
 val sc = SparkContext.getOrCreate()
 val sqlContext = SQLContext.getOrCreate(sc)
 import sqlContext.implicits._

http://git-wip-us.apache.org/repos/asf/spark/blob/076ec056/python/pyspark/ml/feature.py
--
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 3f04c41..cb4dfa2 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -15,11 +15,16 @@
 # limitations under the License.
 #
 
+import sys
+if sys.version > '3':
+basestring = str
+
 from pyspark.rdd import ignore_unicode_prefix
 from pyspark.ml.param.shared import *
 from pyspark.ml.util import keyword_only
 from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer
 from pyspark.mllib.common import inherit_doc
+from pyspark.mllib.linalg import _convert_to_vector
 
 __all__ = ['Binarizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', 
'OneHotEncoder',
'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', 
'StandardScalerModel',
@@ -954,6 +959,23 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, 
HasSeed, HasInputCol, Has
 >>> sent = ("a b " * 100 + "a c " * 10).split(" ")
 >>> doc = sqlContext.createDataFrame([(sent,), (sent,)], ["sentence"])
 >>> model = Word2Vec(vectorSize=5, seed=42, inputCol="sentence", 
outputCol="model").fit(doc)
+>>> model.getVectors().show()
++++
+|word|  vector|
++++
+|   a|[-0.3511952459812...|
+|   b|[0.29077222943305...|
+|   c|[0.02315592765808...|
++++
+...
+>>> model.findSynonyms("a", 2).show()
+++---+
+|word| similarity|
+++---+
+|   b|0.29255685145799626|
+|   c|-0.5414068302988307|
+++---+
+...
 >>> model.transform(doc).head().model
 DenseVector([-0.0422, -0.5138, -0.2546, 0.6885, 0.276])
 """
@@ -1047,6 +1069,24 @@ class Word2VecModel(JavaModel):
 Model fitted by Word2Vec.
 """
 
+def getVectors(self):
+"""
+Returns the vector representation of the words as a dataframe
+with two fields, word and vector.
+"""
+return self._call_java("getVectors")
+
+def findSynonyms(self, word, num):
+"""
+Find "num" number of words closest in similarity to "word".
+word can be a string or vector representation.
+Returns a dataframe with two fields word and similarity (which
+gives the cosine similarity).
+"""
+if not isinstance(word, basestring):
+word = _convert_to_vector(word)
+return self._call_java("findSynonyms", word, num)
+
 
 @inherit_doc
 class PCA(JavaEstimator, HasInputCol, HasOutputCol):



spark git commit: [SPARK-9533] [PYSPARK] [ML] Add missing methods in Word2Vec ML

2015-08-06 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 70b9ed11d -> e24b97650


[SPARK-9533] [PYSPARK] [ML] Add missing methods in Word2Vec ML

After https://github.com/apache/spark/pull/7263 it is pretty straightforward to 
add the Python wrappers.

Author: MechCoder 

Closes #7930 from MechCoder/spark-9533 and squashes the following commits:

1bea394 [MechCoder] make getVectors a lazy val
5522756 [MechCoder] [SPARK-9533] [PySpark] [ML] Add missing methods in Word2Vec 
ML

(cherry picked from commit 076ec056818a65216eaf51aa5b3bd8f697c34748)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e24b9765
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e24b9765
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e24b9765

Branch: refs/heads/branch-1.5
Commit: e24b976506dd8563e4fe9cc295c756a1ce979e0d
Parents: 70b9ed1
Author: MechCoder 
Authored: Thu Aug 6 10:09:58 2015 -0700
Committer: Joseph K. Bradley 
Committed: Thu Aug 6 10:10:06 2015 -0700

--
 .../org/apache/spark/ml/feature/Word2Vec.scala  |  2 +-
 python/pyspark/ml/feature.py| 40 
 2 files changed, 41 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e24b9765/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
index b4f46ce..29acc3e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
@@ -153,7 +153,7 @@ class Word2VecModel private[ml] (
* Returns a dataframe with two fields, "word" and "vector", with "word" 
being a String and
* and the vector the DenseVector that it is mapped to.
*/
-  val getVectors: DataFrame = {
+  @transient lazy val getVectors: DataFrame = {
 val sc = SparkContext.getOrCreate()
 val sqlContext = SQLContext.getOrCreate(sc)
 import sqlContext.implicits._

http://git-wip-us.apache.org/repos/asf/spark/blob/e24b9765/python/pyspark/ml/feature.py
--
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 3f04c41..cb4dfa2 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -15,11 +15,16 @@
 # limitations under the License.
 #
 
+import sys
+if sys.version > '3':
+basestring = str
+
 from pyspark.rdd import ignore_unicode_prefix
 from pyspark.ml.param.shared import *
 from pyspark.ml.util import keyword_only
 from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer
 from pyspark.mllib.common import inherit_doc
+from pyspark.mllib.linalg import _convert_to_vector
 
 __all__ = ['Binarizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', 
'OneHotEncoder',
'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', 
'StandardScalerModel',
@@ -954,6 +959,23 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, 
HasSeed, HasInputCol, Has
 >>> sent = ("a b " * 100 + "a c " * 10).split(" ")
 >>> doc = sqlContext.createDataFrame([(sent,), (sent,)], ["sentence"])
 >>> model = Word2Vec(vectorSize=5, seed=42, inputCol="sentence", 
outputCol="model").fit(doc)
+>>> model.getVectors().show()
++++
+|word|  vector|
++++
+|   a|[-0.3511952459812...|
+|   b|[0.29077222943305...|
+|   c|[0.02315592765808...|
++++
+...
+>>> model.findSynonyms("a", 2).show()
+++---+
+|word| similarity|
+++---+
+|   b|0.29255685145799626|
+|   c|-0.5414068302988307|
+++---+
+...
 >>> model.transform(doc).head().model
 DenseVector([-0.0422, -0.5138, -0.2546, 0.6885, 0.276])
 """
@@ -1047,6 +1069,24 @@ class Word2VecModel(JavaModel):
 Model fitted by Word2Vec.
 """
 
+def getVectors(self):
+"""
+Returns the vector representation of the words as a dataframe
+with two fields, word and vector.
+"""
+return self._call_java("getVectors")
+
+def findSynonyms(self, word, num):
+"""
+Find "num" number of words closest in similarity to "word".
+word can be a string or vector representation.
+Returns a dataframe with two fields word and similarity (which
+gives the cosine similarity).
+"""
+if not isinstance(word, basestring):
+word = _convert_to_vector(word)
+return self._call_java("findSynonyms", word, num

spark git commit: [SPARK-9493] [ML] add featureIndex to handle vector features in IsotonicRegression

2015-08-06 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 1f62f104c -> 54c0789a0


[SPARK-9493] [ML] add featureIndex to handle vector features in 
IsotonicRegression

This PR contains the following changes:
* add `featureIndex` to handle vector features (in order to chain isotonic 
regression easily with output from logistic regression; see the sketch after 
this list)
* make getter/setter names consistent with params
* remove inheritance from Regressor because it is tricky to handle both 
`DoubleType` and `VectorType`
* simplify test data generation
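
A hedged sketch of the chaining use case mentioned above. The setter names 
(setFeatureIndex, setFeaturesCol, setPredictionCol) follow the params 
introduced or kept by this change, and `train` is an assumed DataFrame with 
"label" and "features" columns:

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.regression.IsotonicRegression

val lrModel = new LogisticRegression().fit(train)
val scored = lrModel.transform(train)   // adds a vector-valued "probability" column

// Calibrate the logistic scores with isotonic regression, reading only the
// positive-class entry out of the probability vector via featureIndex.
val ir = new IsotonicRegression()
  .setFeaturesCol("probability")
  .setFeatureIndex(1)
  .setLabelCol("label")
  .setPredictionCol("calibrated")
val irModel = ir.fit(scored)
irModel.transform(scored).select("probability", "calibrated").show()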

jkbradley zapletal-martin

Author: Xiangrui Meng 

Closes #7952 from mengxr/SPARK-9493 and squashes the following commits:

8818ac3 [Xiangrui Meng] address comments
05e2216 [Xiangrui Meng] address comments
8d08090 [Xiangrui Meng] add featureIndex to handle vector features make 
getter/setter names consistent with params remove inheritance from Regressor


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/54c0789a
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/54c0789a
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/54c0789a

Branch: refs/heads/master
Commit: 54c0789a05a783ce90e0e9848079be442a82966b
Parents: 1f62f10
Author: Xiangrui Meng 
Authored: Thu Aug 6 13:29:31 2015 -0700
Committer: Joseph K. Bradley 
Committed: Thu Aug 6 13:29:31 2015 -0700

--
 .../ml/regression/IsotonicRegression.scala  | 202 ++-
 .../ml/regression/IsotonicRegressionSuite.scala |  82 
 2 files changed, 194 insertions(+), 90 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/54c0789a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala 
b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
index 4ece8cf..f570590 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
@@ -17,44 +17,113 @@
 
 package org.apache.spark.ml.regression
 
+import org.apache.spark.Logging
 import org.apache.spark.annotation.Experimental
-import org.apache.spark.ml.PredictorParams
-import org.apache.spark.ml.param.{Param, ParamMap, BooleanParam}
-import org.apache.spark.ml.util.{SchemaUtils, Identifiable}
-import org.apache.spark.mllib.regression.{IsotonicRegression => 
MLlibIsotonicRegression}
-import org.apache.spark.mllib.regression.{IsotonicRegressionModel => 
MLlibIsotonicRegressionModel}
+import org.apache.spark.ml.{Estimator, Model}
+import org.apache.spark.ml.param._
+import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol, 
HasPredictionCol}
+import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
+import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors}
+import org.apache.spark.mllib.regression.{IsotonicRegression => 
MLlibIsotonicRegression, IsotonicRegressionModel => 
MLlibIsotonicRegressionModel}
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.types.{DoubleType, DataType}
-import org.apache.spark.sql.{Row, DataFrame}
+import org.apache.spark.sql.{DataFrame, Row}
+import org.apache.spark.sql.functions.{col, lit, udf}
+import org.apache.spark.sql.types.{DoubleType, StructType}
 import org.apache.spark.storage.StorageLevel
 
 /**
  * Params for isotonic regression.
  */
-private[regression] trait IsotonicRegressionParams extends PredictorParams {
+private[regression] trait IsotonicRegressionBase extends Params with 
HasFeaturesCol
+  with HasLabelCol with HasPredictionCol with Logging {
 
   /**
-   * Param for weight column name.
-   * TODO: Move weightCol to sharedParams.
-   *
+   * Param for weight column name (default: none).
* @group param
*/
+  // TODO: Move weightCol to sharedParams.
   final val weightCol: Param[String] =
-new Param[String](this, "weightCol", "weight column name")
+new Param[String](this, "weightCol",
+  "weight column name. If this is not set or empty, we treat all instance 
weights as 1.0.")
 
   /** @group getParam */
   final def getWeightCol: String = $(weightCol)
 
   /**
-   * Param for isotonic parameter.
-   * Isotonic (increasing) or antitonic (decreasing) sequence.
+   * Param for whether the output sequence should be isotonic/increasing 
(true) or
+   * antitonic/decreasing (false).
* @group param
*/
   final val isotonic: BooleanParam =
-new BooleanParam(this, "isotonic", "isotonic (increasing) or antitonic 
(decreasing) sequence")
+new BooleanParam(this, "isotonic",
+  "whether the output sequence should be isotonic/increasing (tru

spark git commit: [SPARK-9493] [ML] add featureIndex to handle vector features in IsotonicRegression

2015-08-06 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 92e8acc98 -> ee43d355b


[SPARK-9493] [ML] add featureIndex to handle vector features in 
IsotonicRegression

This PR contains the following changes:
* add `featureIndex` to handle vector features (in order to chain isotonic 
regression easily with output from logistic regression)
* make getter/setter names consistent with params
* remove inheritance from Regressor because it is tricky to handle both 
`DoubleType` and `VectorType`
* simplify test data generation

jkbradley zapletal-martin

Author: Xiangrui Meng 

Closes #7952 from mengxr/SPARK-9493 and squashes the following commits:

8818ac3 [Xiangrui Meng] address comments
05e2216 [Xiangrui Meng] address comments
8d08090 [Xiangrui Meng] add featureIndex to handle vector features make 
getter/setter names consistent with params remove inheritance from Regressor

(cherry picked from commit 54c0789a05a783ce90e0e9848079be442a82966b)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ee43d355
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ee43d355
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ee43d355

Branch: refs/heads/branch-1.5
Commit: ee43d355bcfc9c3f4f281f0c44e1b1f331c7bb97
Parents: 92e8acc
Author: Xiangrui Meng 
Authored: Thu Aug 6 13:29:31 2015 -0700
Committer: Joseph K. Bradley 
Committed: Thu Aug 6 13:29:38 2015 -0700

--
 .../ml/regression/IsotonicRegression.scala  | 202 ++-
 .../ml/regression/IsotonicRegressionSuite.scala |  82 
 2 files changed, 194 insertions(+), 90 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/ee43d355/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala 
b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
index 4ece8cf..f570590 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala
@@ -17,44 +17,113 @@
 
 package org.apache.spark.ml.regression
 
+import org.apache.spark.Logging
 import org.apache.spark.annotation.Experimental
-import org.apache.spark.ml.PredictorParams
-import org.apache.spark.ml.param.{Param, ParamMap, BooleanParam}
-import org.apache.spark.ml.util.{SchemaUtils, Identifiable}
-import org.apache.spark.mllib.regression.{IsotonicRegression => 
MLlibIsotonicRegression}
-import org.apache.spark.mllib.regression.{IsotonicRegressionModel => 
MLlibIsotonicRegressionModel}
+import org.apache.spark.ml.{Estimator, Model}
+import org.apache.spark.ml.param._
+import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol, 
HasPredictionCol}
+import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
+import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors}
+import org.apache.spark.mllib.regression.{IsotonicRegression => 
MLlibIsotonicRegression, IsotonicRegressionModel => 
MLlibIsotonicRegressionModel}
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.types.{DoubleType, DataType}
-import org.apache.spark.sql.{Row, DataFrame}
+import org.apache.spark.sql.{DataFrame, Row}
+import org.apache.spark.sql.functions.{col, lit, udf}
+import org.apache.spark.sql.types.{DoubleType, StructType}
 import org.apache.spark.storage.StorageLevel
 
 /**
  * Params for isotonic regression.
  */
-private[regression] trait IsotonicRegressionParams extends PredictorParams {
+private[regression] trait IsotonicRegressionBase extends Params with 
HasFeaturesCol
+  with HasLabelCol with HasPredictionCol with Logging {
 
   /**
-   * Param for weight column name.
-   * TODO: Move weightCol to sharedParams.
-   *
+   * Param for weight column name (default: none).
* @group param
*/
+  // TODO: Move weightCol to sharedParams.
   final val weightCol: Param[String] =
-new Param[String](this, "weightCol", "weight column name")
+new Param[String](this, "weightCol",
+  "weight column name. If this is not set or empty, we treat all instance 
weights as 1.0.")
 
   /** @group getParam */
   final def getWeightCol: String = $(weightCol)
 
   /**
-   * Param for isotonic parameter.
-   * Isotonic (increasing) or antitonic (decreasing) sequence.
+   * Param for whether the output sequence should be isotonic/increasing 
(true) or
+   * antitonic/decreasing (false).
* @group param
*/
   final val isotonic: BooleanParam =
-new BooleanParam(this, "isotonic", "isotonic (increasing) or antitonic 
(decreasing) sequence")
+new BooleanPar

spark git commit: Revert "[SPARK-8481] [MLLIB] GaussianMixtureModel.predict, GaussianMixtureModel.predictSoft variants for a single vector"

2015-08-07 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.4 e5a994f21 -> 4b5bbc589


Revert "[SPARK-8481] [MLLIB] GaussianMixtureModel.predict, 
GaussianMixtureModel.predictSoft variants for a single vector"

This reverts commit 07f778978d80f0af57d3dafda4c566a813ad2d09.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4b5bbc58
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4b5bbc58
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4b5bbc58

Branch: refs/heads/branch-1.4
Commit: 4b5bbc589e11d882c993a3e6daeb0cdad9789e76
Parents: e5a994f
Author: Joseph K. Bradley 
Authored: Fri Aug 7 13:42:20 2015 -0700
Committer: Joseph K. Bradley 
Committed: Fri Aug 7 13:42:20 2015 -0700

--
 .../spark/mllib/clustering/GaussianMixtureModel.scala  | 13 -
 .../spark/mllib/clustering/GaussianMixtureSuite.scala  | 10 --
 2 files changed, 23 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4b5bbc58/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index 76aeebd..cb807c8 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -66,12 +66,6 @@ class GaussianMixtureModel(
 responsibilityMatrix.map(r => r.indexOf(r.max))
   }
 
-  /** Maps given point to its cluster index. */
-  def predict(point: Vector): Int = {
-val r = computeSoftAssignments(point.toBreeze.toDenseVector, gaussians, 
weights, k)
-r.indexOf(r.max)
-  }
-
   /** Java-friendly version of [[predict()]] */
   def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] =
 predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]]
@@ -90,13 +84,6 @@ class GaussianMixtureModel(
   }
 
   /**
-   * Given the input vector, return the membership values to all mixture 
components.
-   */
-  def predictSoft(point: Vector): Array[Double] = {
-computeSoftAssignments(point.toBreeze.toDenseVector, gaussians, weights, k)
-  }
-
-  /**
* Compute the partial assignments for each vector
*/
   private def computeSoftAssignments(

http://git-wip-us.apache.org/repos/asf/spark/blob/4b5bbc58/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
index b636d02..b218d72 100644
--- 
a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
@@ -148,16 +148,6 @@ class GaussianMixtureSuite extends SparkFunSuite with 
MLlibTestSparkContext {
 }
   }
 
-  test("model prediction, parallel and local") {
-val data = sc.parallelize(GaussianTestData.data)
-val gmm = new GaussianMixture().setK(2).setSeed(0).run(data)
-
-val batchPredictions = gmm.predict(data)
-batchPredictions.zip(data).collect().foreach { case (batchPred, datum) =>
-  assert(batchPred === gmm.predict(datum))
-}
-  }
-
   object GaussianTestData {
 
 val data = Array(


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-8481] [MLLIB] GaussianMixtureModel predict accepting single vector

2015-08-07 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 881548ab2 -> e2fbbe731


[SPARK-8481] [MLLIB] GaussianMixtureModel predict accepting single vector

Resubmit of [https://github.com/apache/spark/pull/6906] for adding single-vec 
predict to GMMs

CC: dkobylarz  mengxr

To be merged with master and branch-1.5
Primary author: dkobylarz
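
For reference, a minimal sketch of the two single-vector overloads this patch 
adds (the toy data and the spark-shell `sc` handle are assumptions; the method 
signatures mirror the diff below):

import org.apache.spark.mllib.clustering.GaussianMixture
import org.apache.spark.mllib.linalg.Vectors

val data = sc.parallelize(Seq(
  Vectors.dense(-5.0, -4.8), Vectors.dense(-5.2, -5.1),
  Vectors.dense(4.9, 5.1), Vectors.dense(5.2, 4.8)
))
val gmm = new GaussianMixture().setK(2).setSeed(0L).run(data)

val point = Vectors.dense(5.0, 5.0)
val cluster = gmm.predict(point)          // new: cluster index for a single vector
val membership = gmm.predictSoft(point)   // new: per-component responsibilities
println(s"cluster=$cluster membership=${membership.mkString(", ")}")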

Author: Dariusz Kobylarz 

Closes #8039 from jkbradley/gmm-predict-vec and squashes the following commits:

bfbedc4 [Dariusz Kobylarz] [SPARK-8481] [MLlib] GaussianMixtureModel predict 
accepting single vector


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e2fbbe73
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e2fbbe73
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e2fbbe73

Branch: refs/heads/master
Commit: e2fbbe73111d4624390f596a19a1799c86a05f6c
Parents: 881548a
Author: Dariusz Kobylarz 
Authored: Fri Aug 7 14:51:03 2015 -0700
Committer: Joseph K. Bradley 
Committed: Fri Aug 7 14:51:03 2015 -0700

--
 .../spark/mllib/clustering/GaussianMixtureModel.scala  | 13 +
 .../spark/mllib/clustering/GaussianMixtureSuite.scala  | 10 ++
 2 files changed, 23 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e2fbbe73/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index cb807c8..76aeebd 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -66,6 +66,12 @@ class GaussianMixtureModel(
 responsibilityMatrix.map(r => r.indexOf(r.max))
   }
 
+  /** Maps given point to its cluster index. */
+  def predict(point: Vector): Int = {
+val r = computeSoftAssignments(point.toBreeze.toDenseVector, gaussians, 
weights, k)
+r.indexOf(r.max)
+  }
+
   /** Java-friendly version of [[predict()]] */
   def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] =
 predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]]
@@ -84,6 +90,13 @@ class GaussianMixtureModel(
   }
 
   /**
+   * Given the input vector, return the membership values to all mixture 
components.
+   */
+  def predictSoft(point: Vector): Array[Double] = {
+computeSoftAssignments(point.toBreeze.toDenseVector, gaussians, weights, k)
+  }
+
+  /**
* Compute the partial assignments for each vector
*/
   private def computeSoftAssignments(

http://git-wip-us.apache.org/repos/asf/spark/blob/e2fbbe73/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
index b218d72..b636d02 100644
--- 
a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
@@ -148,6 +148,16 @@ class GaussianMixtureSuite extends SparkFunSuite with 
MLlibTestSparkContext {
 }
   }
 
+  test("model prediction, parallel and local") {
+val data = sc.parallelize(GaussianTestData.data)
+val gmm = new GaussianMixture().setK(2).setSeed(0).run(data)
+
+val batchPredictions = gmm.predict(data)
+batchPredictions.zip(data).collect().foreach { case (batchPred, datum) =>
+  assert(batchPred === gmm.predict(datum))
+}
+  }
+
   object GaussianTestData {
 
 val data = Array(


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-8481] [MLLIB] GaussianMixtureModel predict accepting single vector

2015-08-07 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 547120287 -> 295266049


[SPARK-8481] [MLLIB] GaussianMixtureModel predict accepting single vector

Resubmit of [https://github.com/apache/spark/pull/6906] for adding single-vec 
predict to GMMs

CC: dkobylarz  mengxr

To be merged with master and branch-1.5
Primary author: dkobylarz

Author: Dariusz Kobylarz 

Closes #8039 from jkbradley/gmm-predict-vec and squashes the following commits:

bfbedc4 [Dariusz Kobylarz] [SPARK-8481] [MLlib] GaussianMixtureModel predict 
accepting single vector

(cherry picked from commit e2fbbe73111d4624390f596a19a1799c86a05f6c)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/29526604
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/29526604
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/29526604

Branch: refs/heads/branch-1.5
Commit: 29526604916a5e1dff12fcbc395f1039b3a69dcd
Parents: 5471202
Author: Dariusz Kobylarz 
Authored: Fri Aug 7 14:51:03 2015 -0700
Committer: Joseph K. Bradley 
Committed: Fri Aug 7 14:51:13 2015 -0700

--
 .../spark/mllib/clustering/GaussianMixtureModel.scala  | 13 +
 .../spark/mllib/clustering/GaussianMixtureSuite.scala  | 10 ++
 2 files changed, 23 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/29526604/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
index cb807c8..76aeebd 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala
@@ -66,6 +66,12 @@ class GaussianMixtureModel(
 responsibilityMatrix.map(r => r.indexOf(r.max))
   }
 
+  /** Maps given point to its cluster index. */
+  def predict(point: Vector): Int = {
+val r = computeSoftAssignments(point.toBreeze.toDenseVector, gaussians, 
weights, k)
+r.indexOf(r.max)
+  }
+
   /** Java-friendly version of [[predict()]] */
   def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] =
 predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]]
@@ -84,6 +90,13 @@ class GaussianMixtureModel(
   }
 
   /**
+   * Given the input vector, return the membership values to all mixture 
components.
+   */
+  def predictSoft(point: Vector): Array[Double] = {
+computeSoftAssignments(point.toBreeze.toDenseVector, gaussians, weights, k)
+  }
+
+  /**
* Compute the partial assignments for each vector
*/
   private def computeSoftAssignments(

http://git-wip-us.apache.org/repos/asf/spark/blob/29526604/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
index b218d72..b636d02 100644
--- 
a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala
@@ -148,6 +148,16 @@ class GaussianMixtureSuite extends SparkFunSuite with 
MLlibTestSparkContext {
 }
   }
 
+  test("model prediction, parallel and local") {
+val data = sc.parallelize(GaussianTestData.data)
+val gmm = new GaussianMixture().setK(2).setSeed(0).run(data)
+
+val batchPredictions = gmm.predict(data)
+batchPredictions.zip(data).collect().foreach { case (batchPred, datum) =>
+  assert(batchPred === gmm.predict(datum))
+}
+  }
+
   object GaussianTestData {
 
 val data = Array(


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-9748] [MLLIB] Centriod typo in KMeansModel

2015-08-07 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master e2fbbe731 -> 902334fd5


[SPARK-9748] [MLLIB] Centriod typo in KMeansModel

A minor typo (centriod -> centroid). Readable variable names help every user.

Author: Bertrand Dechoux 

Closes #8037 from BertrandDechoux/kmeans-typo and squashes the following 
commits:

47632fe [Bertrand Dechoux] centriod typo


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/902334fd
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/902334fd
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/902334fd

Branch: refs/heads/master
Commit: 902334fd55bbe40a57c1de2a9bdb25eddf1c8cf6
Parents: e2fbbe7
Author: Bertrand Dechoux 
Authored: Fri Aug 7 16:07:24 2015 -0700
Committer: Joseph K. Bradley 
Committed: Fri Aug 7 16:07:24 2015 -0700

--
 .../org/apache/spark/mllib/clustering/KMeansModel.scala   | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/902334fd/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
index 8ecb3df..9635902 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
@@ -120,11 +120,11 @@ object KMeansModel extends Loader[KMeansModel] {
   assert(className == thisClassName)
   assert(formatVersion == thisFormatVersion)
   val k = (metadata \ "k").extract[Int]
-  val centriods = sqlContext.read.parquet(Loader.dataPath(path))
-  Loader.checkSchema[Cluster](centriods.schema)
-  val localCentriods = centriods.map(Cluster.apply).collect()
-  assert(k == localCentriods.size)
-  new KMeansModel(localCentriods.sortBy(_.id).map(_.point))
+  val centroids = sqlContext.read.parquet(Loader.dataPath(path))
+  Loader.checkSchema[Cluster](centroids.schema)
+  val localCentroids = centroids.map(Cluster.apply).collect()
+  assert(k == localCentroids.size)
+  new KMeansModel(localCentroids.sortBy(_.id).map(_.point))
 }
   }
 }


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-9756] [ML] Make constructors in ML decision trees private

2015-08-07 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 49702bd73 -> cd540c1e5


[SPARK-9756] [ML] Make constructors in ML decision trees private

These constructors should be made private until there is a public way to 
provide the `rootNode: Node` they require.

jkbradley

Author: Feynman Liang 

Closes #8046 from feynmanliang/SPARK-9756 and squashes the following commits:

2cbdf08 [Feynman Liang] Make RFRegressionModel aux constructor private
a06f596 [Feynman Liang] Make constructors in ML decision trees private


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cd540c1e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cd540c1e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cd540c1e

Branch: refs/heads/master
Commit: cd540c1e59561ad1fdac59af6170944c60e685d8
Parents: 49702bd
Author: Feynman Liang 
Authored: Fri Aug 7 17:19:48 2015 -0700
Committer: Joseph K. Bradley 
Committed: Fri Aug 7 17:19:48 2015 -0700

--
 .../apache/spark/ml/classification/DecisionTreeClassifier.scala | 2 +-
 .../apache/spark/ml/classification/RandomForestClassifier.scala | 5 -
 .../org/apache/spark/ml/regression/DecisionTreeRegressor.scala  | 2 +-
 .../org/apache/spark/ml/regression/RandomForestRegressor.scala  | 2 +-
 4 files changed, 7 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/cd540c1e/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
index f2b992f..29598f3 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
@@ -117,7 +117,7 @@ final class DecisionTreeClassificationModel private[ml] (
* Construct a decision tree classification model.
* @param rootNode  Root node of tree, with other nodes attached.
*/
-  def this(rootNode: Node, numClasses: Int) =
+  private[ml] def this(rootNode: Node, numClasses: Int) =
 this(Identifiable.randomUID("dtc"), rootNode, numClasses)
 
   override protected def predict(features: Vector): Double = {

http://git-wip-us.apache.org/repos/asf/spark/blob/cd540c1e/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
index b59826a..156050a 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
@@ -136,7 +136,10 @@ final class RandomForestClassificationModel private[ml] (
* Construct a random forest classification model, with all trees weighted 
equally.
* @param trees  Component trees
*/
-  def this(trees: Array[DecisionTreeClassificationModel], numFeatures: Int, 
numClasses: Int) =
+  private[ml] def this(
+  trees: Array[DecisionTreeClassificationModel],
+  numFeatures: Int,
+  numClasses: Int) =
 this(Identifiable.randomUID("rfc"), trees, numFeatures, numClasses)
 
   override def trees: Array[DecisionTreeModel] = 
_trees.asInstanceOf[Array[DecisionTreeModel]]

http://git-wip-us.apache.org/repos/asf/spark/blob/cd540c1e/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
index 4d30e4b..dc94a14 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
@@ -107,7 +107,7 @@ final class DecisionTreeRegressionModel private[ml] (
* Construct a decision tree regression model.
* @param rootNode  Root node of tree, with other nodes attached.
*/
-  def this(rootNode: Node) = this(Identifiable.randomUID("dtr"), rootNode)
+  private[ml] def this(rootNode: Node) = this(Identifiable.randomUID("dtr"), 
rootNode)
 
   override protected def predict(features: Vector): Double = {
 rootNode.predictImpl(features).prediction

http://git-wip-us.apache.org/repos/asf/spark/blob/cd540c1e/mllib/src/main/scala/org/apache/spar

spark git commit: [SPARK-9756] [ML] Make constructors in ML decision trees private

2015-08-07 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 ea4dfb90a -> 2a179a94e


[SPARK-9756] [ML] Make constructors in ML decision trees private

These constructors should be made private until there is a public way to 
provide the `rootNode: Node` they require.

jkbradley

Author: Feynman Liang 

Closes #8046 from feynmanliang/SPARK-9756 and squashes the following commits:

2cbdf08 [Feynman Liang] Make RFRegressionModel aux constructor private
a06f596 [Feynman Liang] Make constructors in ML decision trees private

(cherry picked from commit cd540c1e59561ad1fdac59af6170944c60e685d8)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2a179a94
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2a179a94
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2a179a94

Branch: refs/heads/branch-1.5
Commit: 2a179a94e0717b8aa754732e43d2206c196a
Parents: ea4dfb9
Author: Feynman Liang 
Authored: Fri Aug 7 17:19:48 2015 -0700
Committer: Joseph K. Bradley 
Committed: Fri Aug 7 17:19:59 2015 -0700

--
 .../apache/spark/ml/classification/DecisionTreeClassifier.scala | 2 +-
 .../apache/spark/ml/classification/RandomForestClassifier.scala | 5 -
 .../org/apache/spark/ml/regression/DecisionTreeRegressor.scala  | 2 +-
 .../org/apache/spark/ml/regression/RandomForestRegressor.scala  | 2 +-
 4 files changed, 7 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/2a179a94/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
index f2b992f..29598f3 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala
@@ -117,7 +117,7 @@ final class DecisionTreeClassificationModel private[ml] (
* Construct a decision tree classification model.
* @param rootNode  Root node of tree, with other nodes attached.
*/
-  def this(rootNode: Node, numClasses: Int) =
+  private[ml] def this(rootNode: Node, numClasses: Int) =
 this(Identifiable.randomUID("dtc"), rootNode, numClasses)
 
   override protected def predict(features: Vector): Double = {

http://git-wip-us.apache.org/repos/asf/spark/blob/2a179a94/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
index b59826a..156050a 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala
@@ -136,7 +136,10 @@ final class RandomForestClassificationModel private[ml] (
* Construct a random forest classification model, with all trees weighted 
equally.
* @param trees  Component trees
*/
-  def this(trees: Array[DecisionTreeClassificationModel], numFeatures: Int, 
numClasses: Int) =
+  private[ml] def this(
+  trees: Array[DecisionTreeClassificationModel],
+  numFeatures: Int,
+  numClasses: Int) =
 this(Identifiable.randomUID("rfc"), trees, numFeatures, numClasses)
 
   override def trees: Array[DecisionTreeModel] = 
_trees.asInstanceOf[Array[DecisionTreeModel]]

http://git-wip-us.apache.org/repos/asf/spark/blob/2a179a94/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
index 4d30e4b..dc94a14 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala
@@ -107,7 +107,7 @@ final class DecisionTreeRegressionModel private[ml] (
* Construct a decision tree regression model.
* @param rootNode  Root node of tree, with other nodes attached.
*/
-  def this(rootNode: Node) = this(Identifiable.randomUID("dtr"), rootNode)
+  private[ml] def this(rootNode: Node) = this(Identifiable.randomUID("dtr"), 
rootNode)
 
   override protected def predict(features: Vector): Double = {
 rootNode.predi
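
Since these constructors are now private[ml], user code obtains tree models through an
estimator's fit() rather than by constructing them directly. A minimal sketch of the
intended usage, assuming a DataFrame named `training` with the usual label/features
columns (the data is hypothetical, not part of this commit):

import org.apache.spark.ml.classification.DecisionTreeClassifier

// The model constructor is private[ml]; fit() is the supported way to obtain a model.
val dt = new DecisionTreeClassifier()
  .setLabelCol("label")
  .setFeaturesCol("features")
  .setMaxDepth(5)

val model = dt.fit(training)  // `training` is an assumed DataFrame
println(s"Learned a tree with ${model.numNodes} nodes of depth ${model.depth}")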

spark git commit: [SPARK-9719] [ML] Clean up Naive Bayes doc

2015-08-07 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master cd540c1e5 -> 85be65b39


[SPARK-9719] [ML] Clean up Naive Bayes doc

Small documentation cleanups, including:
 * Adds documentation for `pi` and `theta`
 * setParam to `setModelType`

Author: Feynman Liang 

Closes #8047 from feynmanliang/SPARK-9719 and squashes the following commits:

b372438 [Feynman Liang] Clean up naive bayes doc


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/85be65b3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/85be65b3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/85be65b3

Branch: refs/heads/master
Commit: 85be65b39ce669f937a898195a844844d757666b
Parents: cd540c1
Author: Feynman Liang 
Authored: Fri Aug 7 17:21:12 2015 -0700
Committer: Joseph K. Bradley 
Committed: Fri Aug 7 17:21:12 2015 -0700

--
 .../scala/org/apache/spark/ml/classification/NaiveBayes.scala| 4 
 1 file changed, 4 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/85be65b3/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala 
b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
index b46b676..97cbaf1 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
@@ -86,6 +86,7 @@ class NaiveBayes(override val uid: String)
* Set the model type using a string (case-sensitive).
* Supported options: "multinomial" and "bernoulli".
* Default is "multinomial"
+   * @group setParam
*/
   def setModelType(value: String): this.type = set(modelType, value)
   setDefault(modelType -> OldNaiveBayes.Multinomial)
@@ -101,6 +102,9 @@ class NaiveBayes(override val uid: String)
 
 /**
  * Model produced by [[NaiveBayes]]
+ * @param pi log of class priors, whose dimension is C (number of classes)
+ * @param theta log of class conditional probabilities, whose dimension is C 
(number of classes)
+ *  by D (number of features)
  */
 class NaiveBayesModel private[ml] (
 override val uid: String,
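
For context, a small sketch of how the newly documented members are read after fitting,
assuming a DataFrame named `training` with label/features columns (hypothetical data,
not part of the commit):

import org.apache.spark.ml.classification.NaiveBayes

val nb = new NaiveBayes()
  .setModelType("multinomial")  // the setter documented above; "bernoulli" is also supported
  .setSmoothing(1.0)

val model = nb.fit(training)
// pi: log of class priors, length C; theta: log of class conditional probabilities, C x D
println(model.pi)
println(model.theta)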


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-9719] [ML] Clean up Naive Bayes doc

2015-08-07 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 2a179a94e -> c5d43d6c8


[SPARK-9719] [ML] Clean up Naive Bayes doc

Small documentation cleanups, including:
 * Adds documentation for `pi` and `theta`
 * Adds the `@group setParam` tag to `setModelType`

Author: Feynman Liang 

Closes #8047 from feynmanliang/SPARK-9719 and squashes the following commits:

b372438 [Feynman Liang] Clean up naive bayes doc

(cherry picked from commit 85be65b39ce669f937a898195a844844d757666b)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c5d43d6c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c5d43d6c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c5d43d6c

Branch: refs/heads/branch-1.5
Commit: c5d43d6c82c87b1b14f73bba917b835f4975fb5a
Parents: 2a179a9
Author: Feynman Liang 
Authored: Fri Aug 7 17:21:12 2015 -0700
Committer: Joseph K. Bradley 
Committed: Fri Aug 7 17:21:20 2015 -0700

--
 .../scala/org/apache/spark/ml/classification/NaiveBayes.scala| 4 
 1 file changed, 4 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c5d43d6c/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala 
b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
index b46b676..97cbaf1 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala
@@ -86,6 +86,7 @@ class NaiveBayes(override val uid: String)
* Set the model type using a string (case-sensitive).
* Supported options: "multinomial" and "bernoulli".
* Default is "multinomial"
+   * @group setParam
*/
   def setModelType(value: String): this.type = set(modelType, value)
   setDefault(modelType -> OldNaiveBayes.Multinomial)
@@ -101,6 +102,9 @@ class NaiveBayes(override val uid: String)
 
 /**
  * Model produced by [[NaiveBayes]]
+ * @param pi log of class priors, whose dimension is C (number of classes)
+ * @param theta log of class conditional probabilities, whose dimension is C 
(number of classes)
+ *  by D (number of features)
  */
 class NaiveBayesModel private[ml] (
 override val uid: String,


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-9755] [MLLIB] Add docs to MultivariateOnlineSummarizer methods

2015-08-10 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 0f3366a4c -> 00b655cce


[SPARK-9755] [MLLIB] Add docs to MultivariateOnlineSummarizer methods

Adds method documentation back to `MultivariateOnlineSummarizer`; these docs were 
present in 1.4 but disappeared somewhere along the way to 1.5.

jkbradley

Author: Feynman Liang 

Closes #8045 from feynmanliang/SPARK-9755 and squashes the following commits:

af67fde [Feynman Liang] Add MultivariateOnlineSummarizer docs


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/00b655cc
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/00b655cc
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/00b655cc

Branch: refs/heads/master
Commit: 00b655cced637e1c3b750c19266086b9dcd7c158
Parents: 0f3366a
Author: Feynman Liang 
Authored: Mon Aug 10 11:01:45 2015 -0700
Committer: Joseph K. Bradley 
Committed: Mon Aug 10 11:01:45 2015 -0700

--
 .../mllib/stat/MultivariateOnlineSummarizer.scala   | 16 
 1 file changed, 16 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/00b655cc/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
index 62da9f2..64e4be0 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
@@ -153,6 +153,8 @@ class MultivariateOnlineSummarizer extends 
MultivariateStatisticalSummary with S
   }
 
   /**
+   * Sample mean of each dimension.
+   *
* @since 1.1.0
*/
   override def mean: Vector = {
@@ -168,6 +170,8 @@ class MultivariateOnlineSummarizer extends 
MultivariateStatisticalSummary with S
   }
 
   /**
+   * Sample variance of each dimension.
+   *
* @since 1.1.0
*/
   override def variance: Vector = {
@@ -193,11 +197,15 @@ class MultivariateOnlineSummarizer extends 
MultivariateStatisticalSummary with S
   }
 
   /**
+   * Sample size.
+   *
* @since 1.1.0
*/
   override def count: Long = totalCnt
 
   /**
+   * Number of nonzero elements in each dimension.
+   *
* @since 1.1.0
*/
   override def numNonzeros: Vector = {
@@ -207,6 +215,8 @@ class MultivariateOnlineSummarizer extends 
MultivariateStatisticalSummary with S
   }
 
   /**
+   * Maximum value of each dimension.
+   *
* @since 1.1.0
*/
   override def max: Vector = {
@@ -221,6 +231,8 @@ class MultivariateOnlineSummarizer extends 
MultivariateStatisticalSummary with S
   }
 
   /**
+   * Minimum value of each dimension.
+   *
* @since 1.1.0
*/
   override def min: Vector = {
@@ -235,6 +247,8 @@ class MultivariateOnlineSummarizer extends 
MultivariateStatisticalSummary with S
   }
 
   /**
+   * L2 (Euclidean) norm of each dimension.
+   *
* @since 1.2.0
*/
   override def normL2: Vector = {
@@ -252,6 +266,8 @@ class MultivariateOnlineSummarizer extends 
MultivariateStatisticalSummary with S
   }
 
   /**
+   * L1 norm of each dimension.
+   *
* @since 1.2.0
*/
   override def normL1: Vector = {
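
A short sketch of how these documented methods are typically driven, folding a
summarizer over an RDD of vectors; `sc` is an assumed SparkContext and the data is
illustrative only:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer

val data = sc.parallelize(Seq(
  Vectors.dense(1.0, 10.0, 100.0),
  Vectors.dense(2.0, 20.0, 200.0),
  Vectors.dense(3.0, 30.0, 300.0)))

// Aggregate one summarizer per partition, then merge the partial results.
val summary = data.treeAggregate(new MultivariateOnlineSummarizer)(
  (s, v) => s.add(v),
  (s1, s2) => s1.merge(s2))

println(summary.mean)      // sample mean of each dimension
println(summary.variance)  // sample variance of each dimension
println(summary.count)     // sample size (3)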


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-9755] [MLLIB] Add docs to MultivariateOnlineSummarizer methods

2015-08-10 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 94b2f5b32 -> 3ee2c8d16


[SPARK-9755] [MLLIB] Add docs to MultivariateOnlineSummarizer methods

Adds method documentation back to `MultivariateOnlineSummarizer`; these docs were 
present in 1.4 but disappeared somewhere along the way to 1.5.

jkbradley

Author: Feynman Liang 

Closes #8045 from feynmanliang/SPARK-9755 and squashes the following commits:

af67fde [Feynman Liang] Add MultivariateOnlineSummarizer docs

(cherry picked from commit 00b655cced637e1c3b750c19266086b9dcd7c158)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3ee2c8d1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3ee2c8d1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3ee2c8d1

Branch: refs/heads/branch-1.5
Commit: 3ee2c8d169e48e0bca3fab702466e7a855f57f8e
Parents: 94b2f5b
Author: Feynman Liang 
Authored: Mon Aug 10 11:01:45 2015 -0700
Committer: Joseph K. Bradley 
Committed: Mon Aug 10 11:01:55 2015 -0700

--
 .../mllib/stat/MultivariateOnlineSummarizer.scala   | 16 
 1 file changed, 16 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/3ee2c8d1/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
index 62da9f2..64e4be0 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala
@@ -153,6 +153,8 @@ class MultivariateOnlineSummarizer extends 
MultivariateStatisticalSummary with S
   }
 
   /**
+   * Sample mean of each dimension.
+   *
* @since 1.1.0
*/
   override def mean: Vector = {
@@ -168,6 +170,8 @@ class MultivariateOnlineSummarizer extends 
MultivariateStatisticalSummary with S
   }
 
   /**
+   * Sample variance of each dimension.
+   *
* @since 1.1.0
*/
   override def variance: Vector = {
@@ -193,11 +197,15 @@ class MultivariateOnlineSummarizer extends 
MultivariateStatisticalSummary with S
   }
 
   /**
+   * Sample size.
+   *
* @since 1.1.0
*/
   override def count: Long = totalCnt
 
   /**
+   * Number of nonzero elements in each dimension.
+   *
* @since 1.1.0
*/
   override def numNonzeros: Vector = {
@@ -207,6 +215,8 @@ class MultivariateOnlineSummarizer extends 
MultivariateStatisticalSummary with S
   }
 
   /**
+   * Maximum value of each dimension.
+   *
* @since 1.1.0
*/
   override def max: Vector = {
@@ -221,6 +231,8 @@ class MultivariateOnlineSummarizer extends 
MultivariateStatisticalSummary with S
   }
 
   /**
+   * Minimum value of each dimension.
+   *
* @since 1.1.0
*/
   override def min: Vector = {
@@ -235,6 +247,8 @@ class MultivariateOnlineSummarizer extends 
MultivariateStatisticalSummary with S
   }
 
   /**
+   * L2 (Euclidean) norm of each dimension.
+   *
* @since 1.2.0
*/
   override def normL2: Vector = {
@@ -252,6 +266,8 @@ class MultivariateOnlineSummarizer extends 
MultivariateStatisticalSummary with S
   }
 
   /**
+   * L1 norm of each dimension.
+   *
* @since 1.2.0
*/
   override def normL1: Vector = {


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-8764] [ML] string indexer should take option to handle unseen values

2015-08-11 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 8cad854ef -> dbd778d84


[SPARK-8764] [ML] string indexer should take option to handle unseen values

As a precursor to adding a public constructor, this adds an option to handle unseen 
values by skipping them rather than throwing an exception (the default remains 
throwing an exception).

Author: Holden Karau 

Closes #7266 from 
holdenk/SPARK-8764-string-indexer-should-take-option-to-handle-unseen-values 
and squashes the following commits:

38a4de9 [Holden Karau] fix long line
045bf22 [Holden Karau] Add a second b entry so b gets 0 for sure
81dd312 [Holden Karau] Update the docs for handleInvalid param to be more 
descriptive
7f37f6e [Holden Karau] remove extra space (scala style)
414e249 [Holden Karau] And switch to using handleInvalid instead of skipInvalid
1e53f9b [Holden Karau] update the param (codegen side)
7a22215 [Holden Karau] fix typo
100a39b [Holden Karau] Merge in master
aa5b093 [Holden Karau] Since we filter we should never go down this code path 
if getSkipInvalid is true
75ffa69 [Holden Karau] Remove extra newline
d69ef5e [Holden Karau] Add a test
b5734be [Holden Karau] Add support for unseen labels
afecd4e [Holden Karau] Add a param to skip invalid entries.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dbd778d8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dbd778d8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dbd778d8

Branch: refs/heads/master
Commit: dbd778d84d094ca142bc08c351478595b280bc2a
Parents: 8cad854
Author: Holden Karau 
Authored: Tue Aug 11 11:33:36 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 11 11:33:36 2015 -0700

--
 .../apache/spark/ml/feature/StringIndexer.scala | 26 +---
 .../ml/param/shared/SharedParamsCodeGen.scala   |  4 +++
 .../spark/ml/param/shared/sharedParams.scala| 15 +
 .../spark/ml/feature/StringIndexerSuite.scala   | 32 
 4 files changed, 73 insertions(+), 4 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/dbd778d8/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
index ebfa972..e4485eb 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
@@ -33,7 +33,8 @@ import org.apache.spark.util.collection.OpenHashMap
 /**
  * Base trait for [[StringIndexer]] and [[StringIndexerModel]].
  */
-private[feature] trait StringIndexerBase extends Params with HasInputCol with 
HasOutputCol {
+private[feature] trait StringIndexerBase extends Params with HasInputCol with 
HasOutputCol
+with HasHandleInvalid {
 
   /** Validates and transforms the input schema. */
   protected def validateAndTransformSchema(schema: StructType): StructType = {
@@ -66,12 +67,15 @@ class StringIndexer(override val uid: String) extends 
Estimator[StringIndexerMod
   def this() = this(Identifiable.randomUID("strIdx"))
 
   /** @group setParam */
+  def setHandleInvalid(value: String): this.type = set(handleInvalid, value)
+  setDefault(handleInvalid, "error")
+
+  /** @group setParam */
   def setInputCol(value: String): this.type = set(inputCol, value)
 
   /** @group setParam */
   def setOutputCol(value: String): this.type = set(outputCol, value)
 
-  // TODO: handle unseen labels
 
   override def fit(dataset: DataFrame): StringIndexerModel = {
 val counts = dataset.select(col($(inputCol)).cast(StringType))
@@ -112,6 +116,10 @@ class StringIndexerModel private[ml] (
   }
 
   /** @group setParam */
+  def setHandleInvalid(value: String): this.type = set(handleInvalid, value)
+  setDefault(handleInvalid, "error")
+
+  /** @group setParam */
   def setInputCol(value: String): this.type = set(inputCol, value)
 
   /** @group setParam */
@@ -128,14 +136,24 @@ class StringIndexerModel private[ml] (
   if (labelToIndex.contains(label)) {
 labelToIndex(label)
   } else {
-// TODO: handle unseen labels
 throw new SparkException(s"Unseen label: $label.")
   }
 }
+
 val outputColName = $(outputCol)
 val metadata = NominalAttribute.defaultAttr
   .withName(outputColName).withValues(labels).toMetadata()
-dataset.select(col("*"),
+// If we are skipping invalid records, filter them out.
+val filteredDataset = (getHandleInvalid) match {
+  case "skip" => {
+val filterer = udf { label: String =>
+  labelToIndex.contains(label)
+}
+dataset.where(filterer(dataset($(inputCol
+  }
+  case _ => dataset
+}
+  
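
A minimal sketch of the new option from the caller's side, assuming DataFrames `train`
and `test` with a string column "category" (hypothetical names, not from the commit):

import org.apache.spark.ml.feature.StringIndexer

val indexer = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")
  .setHandleInvalid("skip")  // default remains "error", which throws on unseen labels

val model = indexer.fit(train)
// With "skip", rows of `test` whose category was never seen during fit are
// filtered out instead of raising a SparkException at transform time.
val indexed = model.transform(test)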

spark git commit: [SPARK-9750] [MLLIB] Improve equals on SparseMatrix and DenseMatrix

2015-08-11 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 5831294a7 -> 520ad44b1


[SPARK-9750] [MLLIB] Improve equals on SparseMatrix and DenseMatrix

Adds unit test for `equals` on `mllib.linalg.Matrix` class and `equals` to both 
`SparseMatrix` and `DenseMatrix`. Supports equality testing between 
`SparseMatrix` and `DenseMatrix`.

mengxr

Author: Feynman Liang 

Closes #8042 from feynmanliang/SPARK-9750 and squashes the following commits:

bb70d5e [Feynman Liang] Breeze compare for dense matrices as well, in case 
other is sparse
ab6f3c8 [Feynman Liang] Sparse matrix compare for equals
22782df [Feynman Liang] Add equality based on matrix semantics, not 
representation
78f9426 [Feynman Liang] Add casts
43d28fa [Feynman Liang] Fix failing test
6416fa0 [Feynman Liang] Add failing sparse matrix equals tests


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/520ad44b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/520ad44b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/520ad44b

Branch: refs/heads/master
Commit: 520ad44b17f72e6465bf990f64b4e289f8a83447
Parents: 5831294
Author: Feynman Liang 
Authored: Tue Aug 11 12:49:47 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 11 12:49:47 2015 -0700

--
 .../org/apache/spark/mllib/linalg/Matrices.scala  |  8 ++--
 .../apache/spark/mllib/linalg/MatricesSuite.scala | 18 ++
 2 files changed, 24 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/520ad44b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
index 1c85834..1139ce3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
@@ -257,8 +257,7 @@ class DenseMatrix(
 this(numRows, numCols, values, false)
 
   override def equals(o: Any): Boolean = o match {
-case m: DenseMatrix =>
-  m.numRows == numRows && m.numCols == numCols && Arrays.equals(toArray, 
m.toArray)
+case m: Matrix => toBreeze == m.toBreeze
 case _ => false
   }
 
@@ -519,6 +518,11 @@ class SparseMatrix(
   rowIndices: Array[Int],
   values: Array[Double]) = this(numRows, numCols, colPtrs, rowIndices, 
values, false)
 
+  override def equals(o: Any): Boolean = o match {
+case m: Matrix => toBreeze == m.toBreeze
+case _ => false
+  }
+
   private[mllib] def toBreeze: BM[Double] = {
  if (!isTransposed) {
new BSM[Double](values, numRows, numCols, colPtrs, rowIndices)

http://git-wip-us.apache.org/repos/asf/spark/blob/520ad44b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala
index a270ba2..bfd6d54 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala
@@ -74,6 +74,24 @@ class MatricesSuite extends SparkFunSuite {
 }
   }
 
+  test("equals") {
+val dm1 = Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0))
+assert(dm1 === dm1)
+assert(dm1 !== dm1.transpose)
+
+val dm2 = Matrices.dense(2, 2, Array(0.0, 2.0, 1.0, 3.0))
+assert(dm1 === dm2.transpose)
+
+val sm1 = dm1.asInstanceOf[DenseMatrix].toSparse
+assert(sm1 === sm1)
+assert(sm1 === dm1)
+assert(sm1 !== sm1.transpose)
+
+val sm2 = dm2.asInstanceOf[DenseMatrix].toSparse
+assert(sm1 === sm2.transpose)
+assert(sm1 === dm2.transpose)
+  }
+
   test("matrix copies are deep copies") {
 val m = 3
 val n = 2
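
In user terms, equality now follows matrix semantics rather than representation; a
quick sketch with illustrative values:

import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices}

val dense = Matrices.dense(2, 2, Array(1.0, 2.0, 3.0, 4.0))
val sparse = dense.asInstanceOf[DenseMatrix].toSparse

// The comparison goes through the Breeze representation, so a sparse and a dense
// matrix holding the same entries compare equal, while a transposed copy does not.
println(sparse == dense)           // true
println(dense == dense.transpose)  // false for this non-symmetric matrix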


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-9750] [MLLIB] Improve equals on SparseMatrix and DenseMatrix

2015-08-11 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 767ee1884 -> 811d23f1c


[SPARK-9750] [MLLIB] Improve equals on SparseMatrix and DenseMatrix

Adds unit test for `equals` on `mllib.linalg.Matrix` class and `equals` to both 
`SparseMatrix` and `DenseMatrix`. Supports equality testing between 
`SparseMatrix` and `DenseMatrix`.

mengxr

Author: Feynman Liang 

Closes #8042 from feynmanliang/SPARK-9750 and squashes the following commits:

bb70d5e [Feynman Liang] Breeze compare for dense matrices as well, in case 
other is sparse
ab6f3c8 [Feynman Liang] Sparse matrix compare for equals
22782df [Feynman Liang] Add equality based on matrix semantics, not 
representation
78f9426 [Feynman Liang] Add casts
43d28fa [Feynman Liang] Fix failing test
6416fa0 [Feynman Liang] Add failing sparse matrix equals tests

(cherry picked from commit 520ad44b17f72e6465bf990f64b4e289f8a83447)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/811d23f1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/811d23f1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/811d23f1

Branch: refs/heads/branch-1.5
Commit: 811d23f1c27e7f461f0d37d058c07885fb0e0750
Parents: 767ee18
Author: Feynman Liang 
Authored: Tue Aug 11 12:49:47 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 11 12:49:56 2015 -0700

--
 .../org/apache/spark/mllib/linalg/Matrices.scala  |  8 ++--
 .../apache/spark/mllib/linalg/MatricesSuite.scala | 18 ++
 2 files changed, 24 insertions(+), 2 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/811d23f1/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
index 1c85834..1139ce3 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala
@@ -257,8 +257,7 @@ class DenseMatrix(
 this(numRows, numCols, values, false)
 
   override def equals(o: Any): Boolean = o match {
-case m: DenseMatrix =>
-  m.numRows == numRows && m.numCols == numCols && Arrays.equals(toArray, 
m.toArray)
+case m: Matrix => toBreeze == m.toBreeze
 case _ => false
   }
 
@@ -519,6 +518,11 @@ class SparseMatrix(
   rowIndices: Array[Int],
   values: Array[Double]) = this(numRows, numCols, colPtrs, rowIndices, 
values, false)
 
+  override def equals(o: Any): Boolean = o match {
+case m: Matrix => toBreeze == m.toBreeze
+case _ => false
+  }
+
   private[mllib] def toBreeze: BM[Double] = {
  if (!isTransposed) {
new BSM[Double](values, numRows, numCols, colPtrs, rowIndices)

http://git-wip-us.apache.org/repos/asf/spark/blob/811d23f1/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala 
b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala
index a270ba2..bfd6d54 100644
--- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala
@@ -74,6 +74,24 @@ class MatricesSuite extends SparkFunSuite {
 }
   }
 
+  test("equals") {
+val dm1 = Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0))
+assert(dm1 === dm1)
+assert(dm1 !== dm1.transpose)
+
+val dm2 = Matrices.dense(2, 2, Array(0.0, 2.0, 1.0, 3.0))
+assert(dm1 === dm2.transpose)
+
+val sm1 = dm1.asInstanceOf[DenseMatrix].toSparse
+assert(sm1 === sm1)
+assert(sm1 === dm1)
+assert(sm1 !== sm1.transpose)
+
+val sm2 = dm2.asInstanceOf[DenseMatrix].toSparse
+assert(sm1 === sm2.transpose)
+assert(sm1 === dm2.transpose)
+  }
+
   test("matrix copies are deep copies") {
 val m = 3
 val n = 2


-
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org



spark git commit: [SPARK-9788] [MLLIB] Fix LDA Binary Compatibility

2015-08-11 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 cdf781db6 -> 2273e7432


[SPARK-9788] [MLLIB] Fix LDA Binary Compatibility

1. Adds `asymmetricDocConcentration` and reverts the docConcentration changes. If the 
(internal) doc concentration vector holds a single value, `getDocConcentration` 
returns it; if it is a constant vector, `getDocConcentration` returns the first 
element; otherwise it fails.
2. Gives `LDAModel.gammaShape` a default value in the `LDAModel` concrete class 
constructors.

jkbradley

Author: Feynman Liang 

Closes #8077 from feynmanliang/SPARK-9788 and squashes the following commits:

6b07bc8 [Feynman Liang] Code review changes
9d6a71e [Feynman Liang] Add asymmetricAlpha alias
bf4e685 [Feynman Liang] Asymmetric docConcentration
4cab972 [Feynman Liang] Default gammaShape

(cherry picked from commit be3e27164133025db860781bd5cdd3ca233edd21)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2273e743
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2273e743
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2273e743

Branch: refs/heads/branch-1.5
Commit: 2273e7432ec218ba163a94f86307ad11904a1dee
Parents: cdf781d
Author: Feynman Liang 
Authored: Tue Aug 11 14:21:53 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 11 14:22:02 2015 -0700

--
 .../org/apache/spark/mllib/clustering/LDA.scala | 27 ---
 .../spark/mllib/clustering/LDAModel.scala   | 11 
 .../spark/mllib/clustering/LDAOptimizer.scala   | 28 ++--
 .../spark/mllib/clustering/LDASuite.scala   |  4 +--
 4 files changed, 46 insertions(+), 24 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/2273e743/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
index ab124e6..0fc9b1a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
@@ -79,7 +79,24 @@ class LDA private (
*
* This is the parameter to a Dirichlet distribution.
*/
-  def getDocConcentration: Vector = this.docConcentration
+  def getAsymmetricDocConcentration: Vector = this.docConcentration
+
+  /**
+   * Concentration parameter (commonly named "alpha") for the prior placed on 
documents'
+   * distributions over topics ("theta").
+   *
+   * This method assumes the Dirichlet distribution is symmetric and can be 
described by a single
+   * [[Double]] parameter. It should fail if docConcentration is asymmetric.
+   */
+  def getDocConcentration: Double = {
+val parameter = docConcentration(0)
+if (docConcentration.size == 1) {
+  parameter
+} else {
+  require(docConcentration.toArray.forall(_ == parameter))
+  parameter
+}
+  }
 
   /**
* Concentration parameter (commonly named "alpha") for the prior placed on 
documents'
@@ -106,18 +123,22 @@ class LDA private (
*   [[https://github.com/Blei-Lab/onlineldavb]].
*/
   def setDocConcentration(docConcentration: Vector): this.type = {
+require(docConcentration.size > 0, "docConcentration must have > 0 
elements")
 this.docConcentration = docConcentration
 this
   }
 
-  /** Replicates Double to create a symmetric prior */
+  /** Replicates a [[Double]] docConcentration to create a symmetric prior. */
   def setDocConcentration(docConcentration: Double): this.type = {
 this.docConcentration = Vectors.dense(docConcentration)
 this
   }
 
+  /** Alias for [[getAsymmetricDocConcentration]] */
+  def getAsymmetricAlpha: Vector = getAsymmetricDocConcentration
+
   /** Alias for [[getDocConcentration]] */
-  def getAlpha: Vector = getDocConcentration
+  def getAlpha: Double = getDocConcentration
 
   /** Alias for [[setDocConcentration()]] */
   def setAlpha(alpha: Vector): this.type = setDocConcentration(alpha)

http://git-wip-us.apache.org/repos/asf/spark/blob/2273e743/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 33babda..5dc637e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -27,7 +27,6 @@ import org.json4s.jackson.JsonMethods._
 import org.apache.spark.Spark
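
A sketch of the resulting API from the caller's side, with illustrative values:

import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.Vectors

val lda = new LDA().setK(3)

// Symmetric prior: a single Double; getDocConcentration (alias getAlpha) returns a Double.
lda.setDocConcentration(1.1)
println(lda.getDocConcentration)           // 1.1

// Asymmetric prior: a Vector; read it back with the new asymmetric getters.
lda.setDocConcentration(Vectors.dense(0.5, 1.0, 1.5))
println(lda.getAsymmetricDocConcentration) // [0.5,1.0,1.5]
// getDocConcentration would now fail, because the prior is no longer symmetric.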

spark git commit: [SPARK-9788] [MLLIB] Fix LDA Binary Compatibility

2015-08-11 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 423cdfd83 -> be3e27164


[SPARK-9788] [MLLIB] Fix LDA Binary Compatibility

1. Adds `asymmetricDocConcentration` and reverts the docConcentration changes. If the 
(internal) doc concentration vector holds a single value, `getDocConcentration` 
returns it; if it is a constant vector, `getDocConcentration` returns the first 
element; otherwise it fails.
2. Gives `LDAModel.gammaShape` a default value in the `LDAModel` concrete class 
constructors.

jkbradley

Author: Feynman Liang 

Closes #8077 from feynmanliang/SPARK-9788 and squashes the following commits:

6b07bc8 [Feynman Liang] Code review changes
9d6a71e [Feynman Liang] Add asymmetricAlpha alias
bf4e685 [Feynman Liang] Asymmetric docConcentration
4cab972 [Feynman Liang] Default gammaShape


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/be3e2716
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/be3e2716
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/be3e2716

Branch: refs/heads/master
Commit: be3e27164133025db860781bd5cdd3ca233edd21
Parents: 423cdfd
Author: Feynman Liang 
Authored: Tue Aug 11 14:21:53 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 11 14:21:53 2015 -0700

--
 .../org/apache/spark/mllib/clustering/LDA.scala | 27 ---
 .../spark/mllib/clustering/LDAModel.scala   | 11 
 .../spark/mllib/clustering/LDAOptimizer.scala   | 28 ++--
 .../spark/mllib/clustering/LDASuite.scala   |  4 +--
 4 files changed, 46 insertions(+), 24 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/be3e2716/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
index ab124e6..0fc9b1a 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala
@@ -79,7 +79,24 @@ class LDA private (
*
* This is the parameter to a Dirichlet distribution.
*/
-  def getDocConcentration: Vector = this.docConcentration
+  def getAsymmetricDocConcentration: Vector = this.docConcentration
+
+  /**
+   * Concentration parameter (commonly named "alpha") for the prior placed on 
documents'
+   * distributions over topics ("theta").
+   *
+   * This method assumes the Dirichlet distribution is symmetric and can be 
described by a single
+   * [[Double]] parameter. It should fail if docConcentration is asymmetric.
+   */
+  def getDocConcentration: Double = {
+val parameter = docConcentration(0)
+if (docConcentration.size == 1) {
+  parameter
+} else {
+  require(docConcentration.toArray.forall(_ == parameter))
+  parameter
+}
+  }
 
   /**
* Concentration parameter (commonly named "alpha") for the prior placed on 
documents'
@@ -106,18 +123,22 @@ class LDA private (
*   [[https://github.com/Blei-Lab/onlineldavb]].
*/
   def setDocConcentration(docConcentration: Vector): this.type = {
+require(docConcentration.size > 0, "docConcentration must have > 0 
elements")
 this.docConcentration = docConcentration
 this
   }
 
-  /** Replicates Double to create a symmetric prior */
+  /** Replicates a [[Double]] docConcentration to create a symmetric prior. */
   def setDocConcentration(docConcentration: Double): this.type = {
 this.docConcentration = Vectors.dense(docConcentration)
 this
   }
 
+  /** Alias for [[getAsymmetricDocConcentration]] */
+  def getAsymmetricAlpha: Vector = getAsymmetricDocConcentration
+
   /** Alias for [[getDocConcentration]] */
-  def getAlpha: Vector = getDocConcentration
+  def getAlpha: Double = getDocConcentration
 
   /** Alias for [[setDocConcentration()]] */
   def setAlpha(alpha: Vector): this.type = setDocConcentration(alpha)

http://git-wip-us.apache.org/repos/asf/spark/blob/be3e2716/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 33babda..5dc637e 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -27,7 +27,6 @@ import org.json4s.jackson.JsonMethods._
 import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Experimental
 import org.apache.spark.api.java.JavaPairRDD
-import org.apache.spark.broadcast.Bro

spark git commit: [SPARK-9766] [ML] [PySpark] check and add miss docs for PySpark ML

2015-08-12 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 60103ecd3 -> 762bacc16


[SPARK-9766] [ML] [PySpark] check and add miss docs for PySpark ML

Checks and adds missing docs for PySpark ML (this issue only checks missing docs for 
o.a.s.ml, not o.a.s.mllib).

Author: Yanbo Liang 

Closes #8059 from yanboliang/SPARK-9766.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/762bacc1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/762bacc1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/762bacc1

Branch: refs/heads/master
Commit: 762bacc16ac5e74c8b05a7c1e3e367d1d1633cef
Parents: 60103ec
Author: Yanbo Liang 
Authored: Wed Aug 12 13:24:18 2015 -0700
Committer: Joseph K. Bradley 
Committed: Wed Aug 12 13:24:18 2015 -0700

--
 python/pyspark/ml/classification.py | 12 ++--
 python/pyspark/ml/clustering.py |  4 +++-
 python/pyspark/ml/evaluation.py |  3 ++-
 python/pyspark/ml/feature.py|  9 +
 4 files changed, 20 insertions(+), 8 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/762bacc1/python/pyspark/ml/classification.py
--
diff --git a/python/pyspark/ml/classification.py 
b/python/pyspark/ml/classification.py
index 5978d8f..6702dce 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -34,6 +34,7 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredicti
  HasRegParam, HasTol, HasProbabilityCol, 
HasRawPredictionCol):
 """
 Logistic regression.
+Currently, this class only supports binary classification.
 
 >>> from pyspark.sql import Row
 >>> from pyspark.mllib.linalg import Vectors
@@ -96,8 +97,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredicti
 #  is an L2 penalty. For alpha = 1, it is an L1 penalty.
 self.elasticNetParam = \
 Param(self, "elasticNetParam",
-  "the ElasticNet mixing parameter, in range [0, 1]. For alpha 
= 0, the penalty " +
-  "is an L2 penalty. For alpha = 1, it is an L1 penalty.")
+  "the ElasticNet mixing parameter, in range [0, 1]. For alpha 
= 0, " +
+  "the penalty is an L2 penalty. For alpha = 1, it is an L1 
penalty.")
 #: param for whether to fit an intercept term.
 self.fitIntercept = Param(self, "fitIntercept", "whether to fit an 
intercept term.")
 #: param for threshold in binary classification prediction, in range 
[0, 1].
@@ -656,6 +657,13 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredictionCol, H
  HasRawPredictionCol):
 """
 Naive Bayes Classifiers.
+It supports both Multinomial and Bernoulli NB. Multinomial NB
+
(`http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html`)
+can handle finitely supported discrete data. For example, by converting 
documents into
+TF-IDF vectors, it can be used for document classification. By making 
every vector a
+binary (0/1) data, it can also be used as Bernoulli NB
+
(`http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html`).
+The input feature values must be nonnegative.
 
 >>> from pyspark.sql import Row
 >>> from pyspark.mllib.linalg import Vectors

http://git-wip-us.apache.org/repos/asf/spark/blob/762bacc1/python/pyspark/ml/clustering.py
--
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index b5e9b65..4833871 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -37,7 +37,9 @@ class KMeansModel(JavaModel):
 @inherit_doc
 class KMeans(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed):
 """
-K-means Clustering
+K-means clustering with support for multiple parallel runs and a k-means++ 
like initialization
+mode (the k-means|| algorithm by Bahmani et al). When multiple concurrent 
runs are requested,
+they are executed together with joint passes over the data for efficiency.
 
 >>> from pyspark.mllib.linalg import Vectors
 >>> data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),

http://git-wip-us.apache.org/repos/asf/spark/blob/762bacc1/python/pyspark/ml/evaluation.py
--
diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
index 06e8093..2734092 100644
--- a/python/pyspark/ml/evaluation.py
+++ b/python/pyspark/ml/evaluation.py
@@ -23,7 +23,8 @@ from pyspark.ml.param.shared import HasLabelCol, 
HasPredictionCol, HasRawPredict
 from pyspark.ml.util import 

spark git commit: [SPARK-9766] [ML] [PySpark] check and add miss docs for PySpark ML

2015-08-12 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 8629c33b6 -> 65b5b2172


[SPARK-9766] [ML] [PySpark] check and add miss docs for PySpark ML

Checks and adds missing docs for PySpark ML (this issue only checks missing docs for 
o.a.s.ml, not o.a.s.mllib).

Author: Yanbo Liang 

Closes #8059 from yanboliang/SPARK-9766.

(cherry picked from commit 762bacc16ac5e74c8b05a7c1e3e367d1d1633cef)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/65b5b217
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/65b5b217
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/65b5b217

Branch: refs/heads/branch-1.5
Commit: 65b5b2172681285a027e865ec9a91779e902e85a
Parents: 8629c33
Author: Yanbo Liang 
Authored: Wed Aug 12 13:24:18 2015 -0700
Committer: Joseph K. Bradley 
Committed: Wed Aug 12 13:24:29 2015 -0700

--
 python/pyspark/ml/classification.py | 12 ++--
 python/pyspark/ml/clustering.py |  4 +++-
 python/pyspark/ml/evaluation.py |  3 ++-
 python/pyspark/ml/feature.py|  9 +
 4 files changed, 20 insertions(+), 8 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/65b5b217/python/pyspark/ml/classification.py
--
diff --git a/python/pyspark/ml/classification.py 
b/python/pyspark/ml/classification.py
index 5978d8f..6702dce 100644
--- a/python/pyspark/ml/classification.py
+++ b/python/pyspark/ml/classification.py
@@ -34,6 +34,7 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredicti
  HasRegParam, HasTol, HasProbabilityCol, 
HasRawPredictionCol):
 """
 Logistic regression.
+Currently, this class only supports binary classification.
 
 >>> from pyspark.sql import Row
 >>> from pyspark.mllib.linalg import Vectors
@@ -96,8 +97,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredicti
 #  is an L2 penalty. For alpha = 1, it is an L1 penalty.
 self.elasticNetParam = \
 Param(self, "elasticNetParam",
-  "the ElasticNet mixing parameter, in range [0, 1]. For alpha 
= 0, the penalty " +
-  "is an L2 penalty. For alpha = 1, it is an L1 penalty.")
+  "the ElasticNet mixing parameter, in range [0, 1]. For alpha 
= 0, " +
+  "the penalty is an L2 penalty. For alpha = 1, it is an L1 
penalty.")
 #: param for whether to fit an intercept term.
 self.fitIntercept = Param(self, "fitIntercept", "whether to fit an 
intercept term.")
 #: param for threshold in binary classification prediction, in range 
[0, 1].
@@ -656,6 +657,13 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, 
HasLabelCol, HasPredictionCol, H
  HasRawPredictionCol):
 """
 Naive Bayes Classifiers.
+It supports both Multinomial and Bernoulli NB. Multinomial NB
+
(`http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html`)
+can handle finitely supported discrete data. For example, by converting 
documents into
+TF-IDF vectors, it can be used for document classification. By making 
every vector a
+binary (0/1) data, it can also be used as Bernoulli NB
+
(`http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html`).
+The input feature values must be nonnegative.
 
 >>> from pyspark.sql import Row
 >>> from pyspark.mllib.linalg import Vectors

http://git-wip-us.apache.org/repos/asf/spark/blob/65b5b217/python/pyspark/ml/clustering.py
--
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index b5e9b65..4833871 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -37,7 +37,9 @@ class KMeansModel(JavaModel):
 @inherit_doc
 class KMeans(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed):
 """
-K-means Clustering
+K-means clustering with support for multiple parallel runs and a k-means++ 
like initialization
+mode (the k-means|| algorithm by Bahmani et al). When multiple concurrent 
runs are requested,
+they are executed together with joint passes over the data for efficiency.
 
 >>> from pyspark.mllib.linalg import Vectors
 >>> data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),

http://git-wip-us.apache.org/repos/asf/spark/blob/65b5b217/python/pyspark/ml/evaluation.py
--
diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py
index 06e8093..2734092 100644
--- a/python/pyspark/ml/evaluation.py
+++ b/python/pyspark/ml/evaluation.py
@@ -23,7 +23,8 @@

spark git commit: [SPARK-9789] [ML] Added logreg threshold param back

2015-08-12 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 65b5b2172 -> bdf8dc15d


[SPARK-9789] [ML] Added logreg threshold param back

Reinstated LogisticRegression.threshold Param for binary compatibility.  Param 
thresholds overrides threshold, if set.

CC: mengxr dbtsai feynmanliang

Author: Joseph K. Bradley 

Closes #8079 from jkbradley/logreg-reinstate-threshold.

(cherry picked from commit 551def5d6972440365bd7436d484a67138d9a8f3)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bdf8dc15
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bdf8dc15
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bdf8dc15

Branch: refs/heads/branch-1.5
Commit: bdf8dc15d3b310c8cd84c71999b1bca4d9bc825e
Parents: 65b5b21
Author: Joseph K. Bradley 
Authored: Wed Aug 12 14:27:13 2015 -0700
Committer: Joseph K. Bradley 
Committed: Wed Aug 12 14:27:21 2015 -0700

--
 .../ml/classification/LogisticRegression.scala  | 127 +++
 .../ml/param/shared/SharedParamsCodeGen.scala   |   4 +-
 .../spark/ml/param/shared/sharedParams.scala|   6 +-
 .../JavaLogisticRegressionSuite.java|   7 +-
 .../LogisticRegressionSuite.scala   |  33 +++--
 python/pyspark/ml/classification.py |  98 --
 6 files changed, 199 insertions(+), 76 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/bdf8dc15/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index f55134d..5bcd711 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -34,8 +34,7 @@ import 
org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
 import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, Row, SQLContext}
-import org.apache.spark.sql.functions.{col, udf}
+import org.apache.spark.sql.{DataFrame, Row}
 import org.apache.spark.storage.StorageLevel
 
 /**
@@ -43,44 +42,115 @@ import org.apache.spark.storage.StorageLevel
  */
 private[classification] trait LogisticRegressionParams extends 
ProbabilisticClassifierParams
   with HasRegParam with HasElasticNetParam with HasMaxIter with 
HasFitIntercept with HasTol
-  with HasStandardization {
+  with HasStandardization with HasThreshold {
 
   /**
-   * Version of setThresholds() for binary classification, available for 
backwards
-   * compatibility.
+   * Set threshold in binary classification, in range [0, 1].
*
-   * Calling this with threshold p will effectively call 
`setThresholds(Array(1-p, p))`.
+   * If the estimated probability of class label 1 is > threshold, then 
predict 1, else 0.
+   * A high threshold encourages the model to predict 0 more often;
+   * a low threshold encourages the model to predict 1 more often.
+   *
+   * Note: Calling this with threshold p is equivalent to calling 
`setThresholds(Array(1-p, p))`.
+   *   When [[setThreshold()]] is called, any user-set value for 
[[thresholds]] will be cleared.
+   *   If both [[threshold]] and [[thresholds]] are set in a ParamMap, 
then they must be
+   *   equivalent.
+   *
+   * Default is 0.5.
+   * @group setParam
+   */
+  def setThreshold(value: Double): this.type = {
+if (isSet(thresholds)) clear(thresholds)
+set(threshold, value)
+  }
+
+  /**
+   * Get threshold for binary classification.
+   *
+   * If [[threshold]] is set, returns that value.
+   * Otherwise, if [[thresholds]] is set with length 2 (i.e., binary 
classification),
+   * this returns the equivalent threshold: {{{1 / (1 + thresholds(0) / 
thresholds(1))}}}.
+   * Otherwise, returns [[threshold]] default value.
+   *
+   * @group getParam
+   * @throws IllegalArgumentException if [[thresholds]] is set to an array of 
length other than 2.
+   */
+  override def getThreshold: Double = {
+checkThresholdConsistency()
+if (isSet(thresholds)) {
+  val ts = $(thresholds)
+  require(ts.length == 2, "Logistic Regression getThreshold only applies 
to" +
+" binary classification, but thresholds has length != 2.  thresholds: 
" + ts.mkString(","))
+  1.0 / (1.0 + ts(0) / ts(1))
+} else {
+  $(threshold)
+}
+  }
+
+  /**
+   * Set thresholds in multiclass (or binary) classification to adjust the 
probability of
+   * predicting each class.
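
The restored param interacts with `thresholds` as described in the scaladoc above; a
small sketch of the equivalence, with illustrative values:

import org.apache.spark.ml.classification.LogisticRegression

val lr = new LogisticRegression()

// Setting the binary threshold p is equivalent to thresholds = Array(1 - p, p).
lr.setThreshold(0.8)
println(lr.getThreshold)   // 0.8

// Conversely, getThreshold recovers p from a length-2 thresholds array
// via 1 / (1 + thresholds(0) / thresholds(1)).
lr.setThresholds(Array(0.2, 0.8))
println(lr.getThreshold)   // 0.8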

spark git commit: [SPARK-9789] [ML] Added logreg threshold param back

2015-08-12 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 762bacc16 -> 551def5d6


[SPARK-9789] [ML] Added logreg threshold param back

Reinstated LogisticRegression.threshold Param for binary compatibility.  Param 
thresholds overrides threshold, if set.

CC: mengxr dbtsai feynmanliang

Author: Joseph K. Bradley 

Closes #8079 from jkbradley/logreg-reinstate-threshold.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/551def5d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/551def5d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/551def5d

Branch: refs/heads/master
Commit: 551def5d6972440365bd7436d484a67138d9a8f3
Parents: 762bacc
Author: Joseph K. Bradley 
Authored: Wed Aug 12 14:27:13 2015 -0700
Committer: Joseph K. Bradley 
Committed: Wed Aug 12 14:27:13 2015 -0700

--
 .../ml/classification/LogisticRegression.scala  | 127 +++
 .../ml/param/shared/SharedParamsCodeGen.scala   |   4 +-
 .../spark/ml/param/shared/sharedParams.scala|   6 +-
 .../JavaLogisticRegressionSuite.java|   7 +-
 .../LogisticRegressionSuite.scala   |  33 +++--
 python/pyspark/ml/classification.py |  98 --
 6 files changed, 199 insertions(+), 76 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/551def5d/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
index f55134d..5bcd711 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -34,8 +34,7 @@ import 
org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
 import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
 import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{DataFrame, Row, SQLContext}
-import org.apache.spark.sql.functions.{col, udf}
+import org.apache.spark.sql.{DataFrame, Row}
 import org.apache.spark.storage.StorageLevel
 
 /**
@@ -43,44 +42,115 @@ import org.apache.spark.storage.StorageLevel
  */
 private[classification] trait LogisticRegressionParams extends 
ProbabilisticClassifierParams
   with HasRegParam with HasElasticNetParam with HasMaxIter with 
HasFitIntercept with HasTol
-  with HasStandardization {
+  with HasStandardization with HasThreshold {
 
   /**
-   * Version of setThresholds() for binary classification, available for 
backwards
-   * compatibility.
+   * Set threshold in binary classification, in range [0, 1].
*
-   * Calling this with threshold p will effectively call 
`setThresholds(Array(1-p, p))`.
+   * If the estimated probability of class label 1 is > threshold, then 
predict 1, else 0.
+   * A high threshold encourages the model to predict 0 more often;
+   * a low threshold encourages the model to predict 1 more often.
+   *
+   * Note: Calling this with threshold p is equivalent to calling 
`setThresholds(Array(1-p, p))`.
+   *   When [[setThreshold()]] is called, any user-set value for 
[[thresholds]] will be cleared.
+   *   If both [[threshold]] and [[thresholds]] are set in a ParamMap, 
then they must be
+   *   equivalent.
+   *
+   * Default is 0.5.
+   * @group setParam
+   */
+  def setThreshold(value: Double): this.type = {
+if (isSet(thresholds)) clear(thresholds)
+set(threshold, value)
+  }
+
+  /**
+   * Get threshold for binary classification.
+   *
+   * If [[threshold]] is set, returns that value.
+   * Otherwise, if [[thresholds]] is set with length 2 (i.e., binary 
classification),
+   * this returns the equivalent threshold: {{{1 / (1 + thresholds(0) / 
thresholds(1))}}}.
+   * Otherwise, returns [[threshold]] default value.
+   *
+   * @group getParam
+   * @throws IllegalArgumentException if [[thresholds]] is set to an array of 
length other than 2.
+   */
+  override def getThreshold: Double = {
+checkThresholdConsistency()
+if (isSet(thresholds)) {
+  val ts = $(thresholds)
+  require(ts.length == 2, "Logistic Regression getThreshold only applies 
to" +
+" binary classification, but thresholds has length != 2.  thresholds: 
" + ts.mkString(","))
+  1.0 / (1.0 + ts(0) / ts(1))
+} else {
+  $(threshold)
+}
+  }
+
+  /**
+   * Set thresholds in multiclass (or binary) classification to adjust the 
probability of
+   * predicting each class. Array must have length equal to the number of 
classes, with values >= 0.
+   * The class with largest value p/

spark git commit: [SPARK-9073] [ML] spark.ml Models copy() should call setParent when there is a parent

2015-08-13 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 5592d162a -> fe05142f5


[SPARK-9073] [ML] spark.ml Models copy() should call setParent when there is a 
parent

Copied ML models must have the same parent as the original ones

Author: lewuathe 
Author: Lewuathe 

Closes #7447 from Lewuathe/SPARK-9073.

(cherry picked from commit 2932e25da4532de9e86b01d08bce0cb680874e70)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fe05142f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fe05142f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fe05142f

Branch: refs/heads/branch-1.5
Commit: fe05142f5bc6b11ba9d5d2d77f989610178fc7b5
Parents: 5592d16
Author: lewuathe 
Authored: Thu Aug 13 09:17:19 2015 -0700
Committer: Joseph K. Bradley 
Committed: Thu Aug 13 09:17:31 2015 -0700

--
 .../examples/ml/JavaDeveloperApiExample.java|  3 +-
 .../spark/examples/ml/DeveloperApiExample.scala |  2 +-
 .../scala/org/apache/spark/ml/Pipeline.scala|  2 +-
 .../classification/DecisionTreeClassifier.scala |  1 +
 .../spark/ml/classification/GBTClassifier.scala |  2 +-
 .../ml/classification/LogisticRegression.scala  |  2 +-
 .../spark/ml/classification/OneVsRest.scala |  2 +-
 .../classification/RandomForestClassifier.scala |  1 +
 .../apache/spark/ml/feature/Bucketizer.scala|  4 ++-
 .../scala/org/apache/spark/ml/feature/IDF.scala |  2 +-
 .../apache/spark/ml/feature/MinMaxScaler.scala  |  2 +-
 .../scala/org/apache/spark/ml/feature/PCA.scala |  2 +-
 .../spark/ml/feature/StandardScaler.scala   |  2 +-
 .../apache/spark/ml/feature/StringIndexer.scala |  2 +-
 .../apache/spark/ml/feature/VectorIndexer.scala |  2 +-
 .../org/apache/spark/ml/feature/Word2Vec.scala  |  2 +-
 .../apache/spark/ml/recommendation/ALS.scala|  2 +-
 .../ml/regression/DecisionTreeRegressor.scala   |  2 +-
 .../spark/ml/regression/GBTRegressor.scala  |  2 +-
 .../spark/ml/regression/LinearRegression.scala  |  2 +-
 .../ml/regression/RandomForestRegressor.scala   |  2 +-
 .../apache/spark/ml/tuning/CrossValidator.scala |  2 +-
 .../org/apache/spark/ml/PipelineSuite.scala |  3 ++
 .../DecisionTreeClassifierSuite.scala   |  4 +++
 .../ml/classification/GBTClassifierSuite.scala  |  4 +++
 .../LogisticRegressionSuite.scala   |  4 +++
 .../ml/classification/OneVsRestSuite.scala  |  6 +++-
 .../RandomForestClassifierSuite.scala   |  4 +++
 .../spark/ml/feature/BucketizerSuite.scala  |  1 +
 .../spark/ml/feature/MinMaxScalerSuite.scala|  4 +++
 .../org/apache/spark/ml/feature/PCASuite.scala  |  4 +++
 .../spark/ml/feature/StringIndexerSuite.scala   |  5 
 .../spark/ml/feature/VectorIndexerSuite.scala   |  5 
 .../apache/spark/ml/feature/Word2VecSuite.scala |  4 +++
 .../spark/ml/recommendation/ALSSuite.scala  |  4 +++
 .../regression/DecisionTreeRegressorSuite.scala | 11 +++
 .../spark/ml/regression/GBTRegressorSuite.scala |  5 
 .../ml/regression/LinearRegressionSuite.scala   |  5 
 .../regression/RandomForestRegressorSuite.scala |  7 -
 .../spark/ml/tuning/CrossValidatorSuite.scala   |  5 
 .../apache/spark/ml/util/MLTestingUtils.scala   | 30 
 41 files changed, 138 insertions(+), 22 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/fe05142f/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
index 9df26ff..3f1fe90 100644
--- 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
+++ 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
@@ -230,6 +230,7 @@ class MyJavaLogisticRegressionModel
*/
   @Override
   public MyJavaLogisticRegressionModel copy(ParamMap extra) {
-return copyValues(new MyJavaLogisticRegressionModel(uid(), weights_), 
extra);
+return copyValues(new MyJavaLogisticRegressionModel(uid(), weights_), 
extra)
+  .setParent(parent());
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/fe05142f/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
index 78f31b4..340c355 100644
--- 
a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
+++ 
b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExa

spark git commit: [SPARK-9073] [ML] spark.ml Models copy() should call setParent when there is a parent

2015-08-13 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 699303101 -> 2932e25da


[SPARK-9073] [ML] spark.ml Models copy() should call setParent when there is a 
parent

Copied ML models must have the same parent as the original ones

Author: lewuathe 
Author: Lewuathe 

Closes #7447 from Lewuathe/SPARK-9073.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2932e25d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2932e25d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2932e25d

Branch: refs/heads/master
Commit: 2932e25da4532de9e86b01d08bce0cb680874e70
Parents: 6993031
Author: lewuathe 
Authored: Thu Aug 13 09:17:19 2015 -0700
Committer: Joseph K. Bradley 
Committed: Thu Aug 13 09:17:19 2015 -0700

--
 .../examples/ml/JavaDeveloperApiExample.java|  3 +-
 .../spark/examples/ml/DeveloperApiExample.scala |  2 +-
 .../scala/org/apache/spark/ml/Pipeline.scala|  2 +-
 .../classification/DecisionTreeClassifier.scala |  1 +
 .../spark/ml/classification/GBTClassifier.scala |  2 +-
 .../ml/classification/LogisticRegression.scala  |  2 +-
 .../spark/ml/classification/OneVsRest.scala |  2 +-
 .../classification/RandomForestClassifier.scala |  1 +
 .../apache/spark/ml/feature/Bucketizer.scala|  4 ++-
 .../scala/org/apache/spark/ml/feature/IDF.scala |  2 +-
 .../apache/spark/ml/feature/MinMaxScaler.scala  |  2 +-
 .../scala/org/apache/spark/ml/feature/PCA.scala |  2 +-
 .../spark/ml/feature/StandardScaler.scala   |  2 +-
 .../apache/spark/ml/feature/StringIndexer.scala |  2 +-
 .../apache/spark/ml/feature/VectorIndexer.scala |  2 +-
 .../org/apache/spark/ml/feature/Word2Vec.scala  |  2 +-
 .../apache/spark/ml/recommendation/ALS.scala|  2 +-
 .../ml/regression/DecisionTreeRegressor.scala   |  2 +-
 .../spark/ml/regression/GBTRegressor.scala  |  2 +-
 .../spark/ml/regression/LinearRegression.scala  |  2 +-
 .../ml/regression/RandomForestRegressor.scala   |  2 +-
 .../apache/spark/ml/tuning/CrossValidator.scala |  2 +-
 .../org/apache/spark/ml/PipelineSuite.scala |  3 ++
 .../DecisionTreeClassifierSuite.scala   |  4 +++
 .../ml/classification/GBTClassifierSuite.scala  |  4 +++
 .../LogisticRegressionSuite.scala   |  4 +++
 .../ml/classification/OneVsRestSuite.scala  |  6 +++-
 .../RandomForestClassifierSuite.scala   |  4 +++
 .../spark/ml/feature/BucketizerSuite.scala  |  1 +
 .../spark/ml/feature/MinMaxScalerSuite.scala|  4 +++
 .../org/apache/spark/ml/feature/PCASuite.scala  |  4 +++
 .../spark/ml/feature/StringIndexerSuite.scala   |  5 
 .../spark/ml/feature/VectorIndexerSuite.scala   |  5 
 .../apache/spark/ml/feature/Word2VecSuite.scala |  4 +++
 .../spark/ml/recommendation/ALSSuite.scala  |  4 +++
 .../regression/DecisionTreeRegressorSuite.scala | 11 +++
 .../spark/ml/regression/GBTRegressorSuite.scala |  5 
 .../ml/regression/LinearRegressionSuite.scala   |  5 
 .../regression/RandomForestRegressorSuite.scala |  7 -
 .../spark/ml/tuning/CrossValidatorSuite.scala   |  5 
 .../apache/spark/ml/util/MLTestingUtils.scala   | 30 
 41 files changed, 138 insertions(+), 22 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/2932e25d/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
index 9df26ff..3f1fe90 100644
--- 
a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
+++ 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java
@@ -230,6 +230,7 @@ class MyJavaLogisticRegressionModel
*/
   @Override
   public MyJavaLogisticRegressionModel copy(ParamMap extra) {
-return copyValues(new MyJavaLogisticRegressionModel(uid(), weights_), 
extra);
+return copyValues(new MyJavaLogisticRegressionModel(uid(), weights_), 
extra)
+  .setParent(parent());
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/2932e25d/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
--
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
index 78f31b4..340c355 100644
--- 
a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
+++ 
b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala
@@ -179,7 +179,7 @@ private class MyLogisticRegressionModel(
* This is used for the default imple

spark git commit: [SPARK-8965] [DOCS] Add ml-guide Python Example: Estimator, Transformer, and Param

2015-08-13 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 2932e25da -> 7a539ef3b


[SPARK-8965] [DOCS] Add ml-guide Python Example: Estimator, Transformer, and 
Param

Added ml-guide Python Example: Estimator, Transformer, and Param
/docs/_site/ml-guide.html

Author: Rosstin 

Closes #8081 from Rosstin/SPARK-8965.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7a539ef3
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7a539ef3
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7a539ef3

Branch: refs/heads/master
Commit: 7a539ef3b1792764f866fa88c84c78ad59903f21
Parents: 2932e25
Author: Rosstin 
Authored: Thu Aug 13 09:18:39 2015 -0700
Committer: Joseph K. Bradley 
Committed: Thu Aug 13 09:18:39 2015 -0700

--
 docs/ml-guide.md | 68 +++
 1 file changed, 68 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/7a539ef3/docs/ml-guide.md
--
diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index b6ca50e..a03ab43 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -355,6 +355,74 @@ jsc.stop();
 {% endhighlight %}
 
 
+
+{% highlight python %}
+from pyspark import SparkContext
+from pyspark.mllib.regression import LabeledPoint
+from pyspark.ml.classification import LogisticRegression
+from pyspark.ml.param import Param, Params
+from pyspark.sql import Row, SQLContext
+
+sc = SparkContext(appName="SimpleParamsExample")
+sqlContext = SQLContext(sc)
+
+# Prepare training data.
+# We use LabeledPoint.
+# Spark SQL can convert RDDs of LabeledPoints into DataFrames.
+training = sc.parallelize([LabeledPoint(1.0, [0.0, 1.1,  0.1]),
+   LabeledPoint(0.0, [2.0, 1.0, -1.0]),
+   LabeledPoint(0.0, [2.0, 1.3,  1.0]),
+   LabeledPoint(1.0, [0.0, 1.2, -0.5])])
+
+# Create a LogisticRegression instance. This instance is an Estimator.
+lr = LogisticRegression(maxIter=10, regParam=0.01)
+# Print out the parameters, documentation, and any default values.
+print "LogisticRegression parameters:\n" + lr.explainParams() + "\n"
+
+# Learn a LogisticRegression model. This uses the parameters stored in lr.
+model1 = lr.fit(training.toDF())
+
+# Since model1 is a Model (i.e., a transformer produced by an Estimator),
+# we can view the parameters it used during fit().
+# This prints the parameter (name: value) pairs, where names are unique IDs 
for this
+# LogisticRegression instance.
+print "Model 1 was fit using parameters: "
+print model1.extractParamMap()
+
+# We may alternatively specify parameters using a Python dictionary as a 
paramMap
+paramMap = {lr.maxIter: 20}
+paramMap[lr.maxIter] = 30 # Specify 1 Param, overwriting the original maxIter.
+paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55}) # Specify multiple 
Params.
+
+# You can combine paramMaps, which are python dictionaries.
+paramMap2 = {lr.probabilityCol: "myProbability"} # Change output column name
+paramMapCombined = paramMap.copy()
+paramMapCombined.update(paramMap2)
+
+# Now learn a new model using the paramMapCombined parameters.
+# paramMapCombined overrides all parameters set earlier via lr.set* methods.
+model2 = lr.fit(training.toDF(), paramMapCombined)
+print "Model 2 was fit using parameters: "
+print model2.extractParamMap()
+
+# Prepare test data
+test = sc.parallelize([LabeledPoint(1.0, [-1.0, 1.5,  1.3]),
+   LabeledPoint(0.0, [ 3.0, 2.0, -0.1]),
+   LabeledPoint(1.0, [ 0.0, 2.2, -1.5])])
+
+# Make predictions on test data using the Transformer.transform() method.
+# LogisticRegression.transform will only use the 'features' column.
+# Note that model2.transform() outputs a "myProbability" column instead of the 
usual
+# 'probability' column since we renamed the lr.probabilityCol parameter 
previously.
+prediction = model2.transform(test.toDF())
+selected = prediction.select("features", "label", "myProbability", 
"prediction")
+for row in selected.collect():
+print row
+
+sc.stop()
+{% endhighlight %}
+
+
 
 
 ## Example: Pipeline





spark git commit: [SPARK-8965] [DOCS] Add ml-guide Python Example: Estimator, Transformer, and Param

2015-08-13 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 fe05142f5 -> 49085b56c


[SPARK-8965] [DOCS] Add ml-guide Python Example: Estimator, Transformer, and 
Param

Added ml-guide Python Example: Estimator, Transformer, and Param
/docs/_site/ml-guide.html

Author: Rosstin 

Closes #8081 from Rosstin/SPARK-8965.

(cherry picked from commit 7a539ef3b1792764f866fa88c84c78ad59903f21)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/49085b56
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/49085b56
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/49085b56

Branch: refs/heads/branch-1.5
Commit: 49085b56c10a2d05345b343277ddf19b502aee9c
Parents: fe05142
Author: Rosstin 
Authored: Thu Aug 13 09:18:39 2015 -0700
Committer: Joseph K. Bradley 
Committed: Thu Aug 13 09:18:50 2015 -0700

--
 docs/ml-guide.md | 68 +++
 1 file changed, 68 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/49085b56/docs/ml-guide.md
--
diff --git a/docs/ml-guide.md b/docs/ml-guide.md
index b6ca50e..a03ab43 100644
--- a/docs/ml-guide.md
+++ b/docs/ml-guide.md
@@ -355,6 +355,74 @@ jsc.stop();
 {% endhighlight %}
 
 
+
+{% highlight python %}
+from pyspark import SparkContext
+from pyspark.mllib.regression import LabeledPoint
+from pyspark.ml.classification import LogisticRegression
+from pyspark.ml.param import Param, Params
+from pyspark.sql import Row, SQLContext
+
+sc = SparkContext(appName="SimpleParamsExample")
+sqlContext = SQLContext(sc)
+
+# Prepare training data.
+# We use LabeledPoint.
+# Spark SQL can convert RDDs of LabeledPoints into DataFrames.
+training = sc.parallelize([LabeledPoint(1.0, [0.0, 1.1,  0.1]),
+   LabeledPoint(0.0, [2.0, 1.0, -1.0]),
+   LabeledPoint(0.0, [2.0, 1.3,  1.0]),
+   LabeledPoint(1.0, [0.0, 1.2, -0.5])])
+
+# Create a LogisticRegression instance. This instance is an Estimator.
+lr = LogisticRegression(maxIter=10, regParam=0.01)
+# Print out the parameters, documentation, and any default values.
+print "LogisticRegression parameters:\n" + lr.explainParams() + "\n"
+
+# Learn a LogisticRegression model. This uses the parameters stored in lr.
+model1 = lr.fit(training.toDF())
+
+# Since model1 is a Model (i.e., a transformer produced by an Estimator),
+# we can view the parameters it used during fit().
+# This prints the parameter (name: value) pairs, where names are unique IDs 
for this
+# LogisticRegression instance.
+print "Model 1 was fit using parameters: "
+print model1.extractParamMap()
+
+# We may alternatively specify parameters using a Python dictionary as a 
paramMap
+paramMap = {lr.maxIter: 20}
+paramMap[lr.maxIter] = 30 # Specify 1 Param, overwriting the original maxIter.
+paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55}) # Specify multiple 
Params.
+
+# You can combine paramMaps, which are python dictionaries.
+paramMap2 = {lr.probabilityCol: "myProbability"} # Change output column name
+paramMapCombined = paramMap.copy()
+paramMapCombined.update(paramMap2)
+
+# Now learn a new model using the paramMapCombined parameters.
+# paramMapCombined overrides all parameters set earlier via lr.set* methods.
+model2 = lr.fit(training.toDF(), paramMapCombined)
+print "Model 2 was fit using parameters: "
+print model2.extractParamMap()
+
+# Prepare test data
+test = sc.parallelize([LabeledPoint(1.0, [-1.0, 1.5,  1.3]),
+   LabeledPoint(0.0, [ 3.0, 2.0, -0.1]),
+   LabeledPoint(1.0, [ 0.0, 2.2, -1.5])])
+
+# Make predictions on test data using the Transformer.transform() method.
+# LogisticRegression.transform will only use the 'features' column.
+# Note that model2.transform() outputs a "myProbability" column instead of the 
usual
+# 'probability' column since we renamed the lr.probabilityCol parameter 
previously.
+prediction = model2.transform(test.toDF())
+selected = prediction.select("features", "label", "myProbability", 
"prediction")
+for row in selected.collect():
+print row
+
+sc.stop()
+{% endhighlight %}
+
+
 
 
 ## Example: Pipeline





spark git commit: [SPARK-9661] [MLLIB] [ML] Java compatibility

2015-08-13 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 30460206f -> 875ecc7f6


[SPARK-9661] [MLLIB] [ML] Java compatibility

I skimmed through the docs for various instances of Object and replaced them 
with Java-compatible versions of the same.

1. Some methods in LDAModel.
2. runMiniBatchSGD
3. kolmogorovSmirnovTest

Author: MechCoder 

Closes #8126 from MechCoder/java_incop.

(cherry picked from commit 864de8eaf4b6ad5c9099f6f29e251c56b029f631)
Signed-off-by: Joseph K. Bradley 
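
A minimal Scala sketch of the wrapper idiom this commit applies (JavaFriendlyExample is an illustrative name, not part of Spark; spark-core and spark-mllib 1.5 on the classpath are assumed): each Java-friendly overload accepts a JavaPairRDD of boxed longs and unwraps it to the underlying Scala RDD before delegating.

    // Sketch only: illustrative object and method names.
    import org.apache.spark.api.java.JavaPairRDD
    import org.apache.spark.mllib.linalg.Vector
    import org.apache.spark.rdd.RDD

    object JavaFriendlyExample {
      // Scala-facing API over (Long, Vector) document pairs.
      def totalTokenCount(documents: RDD[(Long, Vector)]): Double =
        documents.map(_._2.toArray.sum).sum()

      // Java-friendly overload: accept a JavaPairRDD of boxed longs and unwrap it
      // to the underlying Scala RDD, using the same cast as the LDAModel methods
      // in the diff below.
      def totalTokenCount(documents: JavaPairRDD[java.lang.Long, Vector]): Double =
        totalTokenCount(documents.rdd.asInstanceOf[RDD[(Long, Vector)]])
    }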


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/875ecc7f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/875ecc7f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/875ecc7f

Branch: refs/heads/branch-1.5
Commit: 875ecc7f61bf487ad8291e3c867a45f25c8852da
Parents: 3046020
Author: MechCoder 
Authored: Thu Aug 13 13:42:35 2015 -0700
Committer: Joseph K. Bradley 
Committed: Thu Aug 13 13:42:44 2015 -0700

--
 .../spark/mllib/clustering/LDAModel.scala   | 27 ++--
 .../apache/spark/mllib/stat/Statistics.scala| 16 +++-
 .../spark/mllib/clustering/JavaLDASuite.java| 24 +
 .../spark/mllib/stat/JavaStatisticsSuite.java   | 22 
 .../spark/mllib/clustering/LDASuite.scala   | 13 ++
 5 files changed, 99 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/875ecc7f/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 5dc637e..f31949f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -26,7 +26,7 @@ import org.json4s.jackson.JsonMethods._
 
 import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Experimental
-import org.apache.spark.api.java.JavaPairRDD
+import org.apache.spark.api.java.{JavaPairRDD, JavaRDD}
 import org.apache.spark.graphx.{Edge, EdgeContext, Graph, VertexId}
 import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors}
 import org.apache.spark.mllib.util.{Loader, Saveable}
@@ -228,6 +228,11 @@ class LocalLDAModel private[clustering] (
 docConcentration, topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, 
gammaShape, k,
 vocabSize)
 
+  /** Java-friendly version of [[logLikelihood]] */
+  def logLikelihood(documents: JavaPairRDD[java.lang.Long, Vector]): Double = {
+logLikelihood(documents.rdd.asInstanceOf[RDD[(Long, Vector)]])
+  }
+
   /**
* Calculate an upper bound bound on perplexity.  (Lower is better.)
* See Equation (16) in original Online LDA paper.
@@ -242,6 +247,11 @@ class LocalLDAModel private[clustering] (
 -logLikelihood(documents) / corpusTokenCount
   }
 
+  /** Java-friendly version of [[logPerplexity]] */
+  def logPerplexity(documents: JavaPairRDD[java.lang.Long, Vector]): Double = {
+logPerplexity(documents.rdd.asInstanceOf[RDD[(Long, Vector)]])
+  }
+
   /**
* Estimate the variational likelihood bound of from `documents`:
*log p(documents) >= E_q[log p(documents)] - E_q[log q(documents)]
@@ -341,8 +351,14 @@ class LocalLDAModel private[clustering] (
 }
   }
 
-}
+  /** Java-friendly version of [[topicDistributions]] */
+  def topicDistributions(
+  documents: JavaPairRDD[java.lang.Long, Vector]): 
JavaPairRDD[java.lang.Long, Vector] = {
+val distributions = 
topicDistributions(documents.rdd.asInstanceOf[RDD[(Long, Vector)]])
+JavaPairRDD.fromRDD(distributions.asInstanceOf[RDD[(java.lang.Long, 
Vector)]])
+  }
 
+}
 
 @Experimental
 object LocalLDAModel extends Loader[LocalLDAModel] {
@@ -657,6 +673,13 @@ class DistributedLDAModel private[clustering] (
 }
   }
 
+  /** Java-friendly version of [[topTopicsPerDocument]] */
+  def javaTopTopicsPerDocument(
+  k: Int): JavaRDD[(java.lang.Long, Array[Int], Array[java.lang.Double])] 
= {
+val topics = topTopicsPerDocument(k)
+topics.asInstanceOf[RDD[(java.lang.Long, Array[Int], 
Array[java.lang.Double])]].toJavaRDD()
+  }
+
   // TODO:
   // override def topicDistributions(documents: RDD[(Long, Vector)]): 
RDD[(Long, Vector)] = ???
 

http://git-wip-us.apache.org/repos/asf/spark/blob/875ecc7f/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
index f845029..24fe48c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
+

spark git commit: [SPARK-9661] [MLLIB] [ML] Java compatibility

2015-08-13 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 8815ba2f6 -> 864de8eaf


[SPARK-9661] [MLLIB] [ML] Java compatibility

I skimmed through the docs for various instances of Object and replaced them 
with Java-compatible versions of the same.

1. Some methods in LDAModel.
2. runMiniBatchSGD
3. kolmogorovSmirnovTest

Author: MechCoder 

Closes #8126 from MechCoder/java_incop.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/864de8ea
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/864de8ea
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/864de8ea

Branch: refs/heads/master
Commit: 864de8eaf4b6ad5c9099f6f29e251c56b029f631
Parents: 8815ba2
Author: MechCoder 
Authored: Thu Aug 13 13:42:35 2015 -0700
Committer: Joseph K. Bradley 
Committed: Thu Aug 13 13:42:35 2015 -0700

--
 .../spark/mllib/clustering/LDAModel.scala   | 27 ++--
 .../apache/spark/mllib/stat/Statistics.scala| 16 +++-
 .../spark/mllib/clustering/JavaLDASuite.java| 24 +
 .../spark/mllib/stat/JavaStatisticsSuite.java   | 22 
 .../spark/mllib/clustering/LDASuite.scala   | 13 ++
 5 files changed, 99 insertions(+), 3 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/864de8ea/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index 5dc637e..f31949f 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -26,7 +26,7 @@ import org.json4s.jackson.JsonMethods._
 
 import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Experimental
-import org.apache.spark.api.java.JavaPairRDD
+import org.apache.spark.api.java.{JavaPairRDD, JavaRDD}
 import org.apache.spark.graphx.{Edge, EdgeContext, Graph, VertexId}
 import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors}
 import org.apache.spark.mllib.util.{Loader, Saveable}
@@ -228,6 +228,11 @@ class LocalLDAModel private[clustering] (
 docConcentration, topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, 
gammaShape, k,
 vocabSize)
 
+  /** Java-friendly version of [[logLikelihood]] */
+  def logLikelihood(documents: JavaPairRDD[java.lang.Long, Vector]): Double = {
+logLikelihood(documents.rdd.asInstanceOf[RDD[(Long, Vector)]])
+  }
+
   /**
* Calculate an upper bound bound on perplexity.  (Lower is better.)
* See Equation (16) in original Online LDA paper.
@@ -242,6 +247,11 @@ class LocalLDAModel private[clustering] (
 -logLikelihood(documents) / corpusTokenCount
   }
 
+  /** Java-friendly version of [[logPerplexity]] */
+  def logPerplexity(documents: JavaPairRDD[java.lang.Long, Vector]): Double = {
+logPerplexity(documents.rdd.asInstanceOf[RDD[(Long, Vector)]])
+  }
+
   /**
* Estimate the variational likelihood bound of from `documents`:
*log p(documents) >= E_q[log p(documents)] - E_q[log q(documents)]
@@ -341,8 +351,14 @@ class LocalLDAModel private[clustering] (
 }
   }
 
-}
+  /** Java-friendly version of [[topicDistributions]] */
+  def topicDistributions(
+  documents: JavaPairRDD[java.lang.Long, Vector]): 
JavaPairRDD[java.lang.Long, Vector] = {
+val distributions = 
topicDistributions(documents.rdd.asInstanceOf[RDD[(Long, Vector)]])
+JavaPairRDD.fromRDD(distributions.asInstanceOf[RDD[(java.lang.Long, 
Vector)]])
+  }
 
+}
 
 @Experimental
 object LocalLDAModel extends Loader[LocalLDAModel] {
@@ -657,6 +673,13 @@ class DistributedLDAModel private[clustering] (
 }
   }
 
+  /** Java-friendly version of [[topTopicsPerDocument]] */
+  def javaTopTopicsPerDocument(
+  k: Int): JavaRDD[(java.lang.Long, Array[Int], Array[java.lang.Double])] 
= {
+val topics = topTopicsPerDocument(k)
+topics.asInstanceOf[RDD[(java.lang.Long, Array[Int], 
Array[java.lang.Double])]].toJavaRDD()
+  }
+
   // TODO:
   // override def topicDistributions(documents: RDD[(Long, Vector)]): 
RDD[(Long, Vector)] = ???
 

http://git-wip-us.apache.org/repos/asf/spark/blob/864de8ea/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
index f845029..24fe48c 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
@@ -20,7 +20,7 @@ package org.apache.spar

spark git commit: [SPARK-9661] [MLLIB] minor clean-up of SPARK-9661

2015-08-14 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 a0d52eb30 -> 4aa9238b9


[SPARK-9661] [MLLIB] minor clean-up of SPARK-9661

Some minor clean-ups after SPARK-9661. See my inline comments. MechCoder 
jkbradley

Author: Xiangrui Meng 

Closes #8190 from mengxr/SPARK-9661-fix.

(cherry picked from commit a0e1abbd010b9e73d472ce12ff1d987678005d32)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4aa9238b
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4aa9238b
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4aa9238b

Branch: refs/heads/branch-1.5
Commit: 4aa9238b9b9c85e882c867ab4322ce084743e66f
Parents: a0d52eb
Author: Xiangrui Meng 
Authored: Fri Aug 14 10:25:11 2015 -0700
Committer: Joseph K. Bradley 
Committed: Fri Aug 14 10:25:19 2015 -0700

--
 .../spark/mllib/clustering/LDAModel.scala   |  5 +--
 .../apache/spark/mllib/stat/Statistics.scala|  6 +--
 .../spark/mllib/clustering/JavaLDASuite.java| 40 +++-
 .../spark/mllib/clustering/LDASuite.scala   |  2 +-
 4 files changed, 28 insertions(+), 25 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4aa9238b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index f31949f..82f05e4 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -674,10 +674,9 @@ class DistributedLDAModel private[clustering] (
   }
 
   /** Java-friendly version of [[topTopicsPerDocument]] */
-  def javaTopTopicsPerDocument(
-  k: Int): JavaRDD[(java.lang.Long, Array[Int], Array[java.lang.Double])] 
= {
+  def javaTopTopicsPerDocument(k: Int): JavaRDD[(java.lang.Long, Array[Int], 
Array[Double])] = {
 val topics = topTopicsPerDocument(k)
-topics.asInstanceOf[RDD[(java.lang.Long, Array[Int], 
Array[java.lang.Double])]].toJavaRDD()
+topics.asInstanceOf[RDD[(java.lang.Long, Array[Int], 
Array[Double])]].toJavaRDD()
   }
 
   // TODO:

http://git-wip-us.apache.org/repos/asf/spark/blob/4aa9238b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
index 24fe48c..ef8d786 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
@@ -221,9 +221,7 @@ object Statistics {
   def kolmogorovSmirnovTest(
   data: JavaDoubleRDD,
   distName: String,
-  params: java.lang.Double*): KolmogorovSmirnovTestResult = {
-val javaParams = params.asInstanceOf[Seq[Double]]
-KolmogorovSmirnovTest.testOneSample(data.rdd.asInstanceOf[RDD[Double]],
-  distName, javaParams: _*)
+  params: Double*): KolmogorovSmirnovTestResult = {
+kolmogorovSmirnovTest(data.rdd.asInstanceOf[RDD[Double]], distName, 
params: _*)
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/4aa9238b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
--
diff --git 
a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java 
b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
index 427be94..6e91cde 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
@@ -22,12 +22,14 @@ import java.util.ArrayList;
 import java.util.Arrays;
 
 import scala.Tuple2;
+import scala.Tuple3;
 
 import org.junit.After;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertArrayEquals;
 import org.junit.Before;
 import org.junit.Test;
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 import org.apache.spark.api.java.function.Function;
 import org.apache.spark.api.java.JavaPairRDD;
@@ -44,9 +46,9 @@ public class JavaLDASuite implements Serializable {
   public void setUp() {
 sc = new JavaSparkContext("local", "JavaLDA");
ArrayList<Tuple2<Long, Vector>> tinyCorpus = new ArrayList<Tuple2<Long, Vector>>();
-for (int i = 0; i < LDASuite$.MODULE$.tinyCorpus().length; i++) {
-  tinyCorpus.add(new Tuple2((Long)LDASuite$.MODULE$.tinyCorpus()[i]._1(),
-  LDASuite$.MODULE$.tinyCorpus()[i

spark git commit: [SPARK-9661] [MLLIB] minor clean-up of SPARK-9661

2015-08-14 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master c8677d736 -> a0e1abbd0


[SPARK-9661] [MLLIB] minor clean-up of SPARK-9661

Some minor clean-ups after SPARK-9661. See my inline comments. MechCoder 
jkbradley

Author: Xiangrui Meng 

Closes #8190 from mengxr/SPARK-9661-fix.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a0e1abbd
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a0e1abbd
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a0e1abbd

Branch: refs/heads/master
Commit: a0e1abbd010b9e73d472ce12ff1d987678005d32
Parents: c8677d7
Author: Xiangrui Meng 
Authored: Fri Aug 14 10:25:11 2015 -0700
Committer: Joseph K. Bradley 
Committed: Fri Aug 14 10:25:11 2015 -0700

--
 .../spark/mllib/clustering/LDAModel.scala   |  5 +--
 .../apache/spark/mllib/stat/Statistics.scala|  6 +--
 .../spark/mllib/clustering/JavaLDASuite.java| 40 +++-
 .../spark/mllib/clustering/LDASuite.scala   |  2 +-
 4 files changed, 28 insertions(+), 25 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a0e1abbd/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
index f31949f..82f05e4 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala
@@ -674,10 +674,9 @@ class DistributedLDAModel private[clustering] (
   }
 
   /** Java-friendly version of [[topTopicsPerDocument]] */
-  def javaTopTopicsPerDocument(
-  k: Int): JavaRDD[(java.lang.Long, Array[Int], Array[java.lang.Double])] 
= {
+  def javaTopTopicsPerDocument(k: Int): JavaRDD[(java.lang.Long, Array[Int], 
Array[Double])] = {
 val topics = topTopicsPerDocument(k)
-topics.asInstanceOf[RDD[(java.lang.Long, Array[Int], 
Array[java.lang.Double])]].toJavaRDD()
+topics.asInstanceOf[RDD[(java.lang.Long, Array[Int], 
Array[Double])]].toJavaRDD()
   }
 
   // TODO:

http://git-wip-us.apache.org/repos/asf/spark/blob/a0e1abbd/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
--
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
index 24fe48c..ef8d786 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala
@@ -221,9 +221,7 @@ object Statistics {
   def kolmogorovSmirnovTest(
   data: JavaDoubleRDD,
   distName: String,
-  params: java.lang.Double*): KolmogorovSmirnovTestResult = {
-val javaParams = params.asInstanceOf[Seq[Double]]
-KolmogorovSmirnovTest.testOneSample(data.rdd.asInstanceOf[RDD[Double]],
-  distName, javaParams: _*)
+  params: Double*): KolmogorovSmirnovTestResult = {
+kolmogorovSmirnovTest(data.rdd.asInstanceOf[RDD[Double]], distName, 
params: _*)
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/a0e1abbd/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
--
diff --git 
a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java 
b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
index 427be94..6e91cde 100644
--- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
+++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java
@@ -22,12 +22,14 @@ import java.util.ArrayList;
 import java.util.Arrays;
 
 import scala.Tuple2;
+import scala.Tuple3;
 
 import org.junit.After;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertArrayEquals;
 import org.junit.Before;
 import org.junit.Test;
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
 
 import org.apache.spark.api.java.function.Function;
 import org.apache.spark.api.java.JavaPairRDD;
@@ -44,9 +46,9 @@ public class JavaLDASuite implements Serializable {
   public void setUp() {
 sc = new JavaSparkContext("local", "JavaLDA");
ArrayList<Tuple2<Long, Vector>> tinyCorpus = new ArrayList<Tuple2<Long, Vector>>();
-for (int i = 0; i < LDASuite$.MODULE$.tinyCorpus().length; i++) {
-  tinyCorpus.add(new Tuple2((Long)LDASuite$.MODULE$.tinyCorpus()[i]._1(),
-  LDASuite$.MODULE$.tinyCorpus()[i]._2()));
+for (int i = 0; i < LDASuite.tinyCorpus().length; i++) {
+  tinyCorpus.add(new Tuple

spark git commit: [SPARK-9956] [ML] Make trees work with one-category features

2015-08-14 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master a0e1abbd0 -> 7ecf0c469


[SPARK-9956] [ML] Make trees work with one-category features

This modifies DecisionTreeMetadata construction to treat 1-category features as 
continuous, so that trees do not fail with such features.  It is important for 
the pipelines API, where VectorIndexer can automatically categorize certain 
features as categorical.

As stated in the JIRA, this is a temp fix which we can improve upon later by 
automatically filtering out those features. That will take longer, though, 
since it will require careful indexing.

Targeted for 1.5 and master

CC: manishamde  mengxr yanboliang

Author: Joseph K. Bradley 

Closes #8187 from jkbradley/tree-1cat.
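
A minimal Scala sketch of the case this change addresses, using the RDD-based tree API served by the modified DecisionTreeMetadata (the tiny dataset, parameters, and object name are illustrative; spark-mllib 1.5 on the classpath is assumed): feature 0 is declared categorical with a single category, which metadata construction now treats as continuous instead of failing.

    // Sketch only: illustrative data and settings.
    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.mllib.tree.DecisionTree
    import org.apache.spark.{SparkConf, SparkContext}

    object OneCategoryFeatureSketch {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(
          new SparkConf().setAppName("OneCategoryFeatureSketch").setMaster("local[2]"))

        // Feature 0 is constant and declared categorical with a single category;
        // feature 1 is an ordinary continuous feature.
        val data = sc.parallelize(Seq(
          LabeledPoint(0.0, Vectors.dense(0.0, 1.0)),
          LabeledPoint(1.0, Vectors.dense(0.0, 2.0)),
          LabeledPoint(0.0, Vectors.dense(0.0, 1.2)),
          LabeledPoint(1.0, Vectors.dense(0.0, 2.3))))

        // categoricalFeaturesInfo maps featureIndex -> numCategories; the
        // 1-category entry is now treated as a continuous feature.
        val model = DecisionTree.trainClassifier(
          data,
          numClasses = 2,
          categoricalFeaturesInfo = Map(0 -> 1),
          impurity = "gini",
          maxDepth = 3,
          maxBins = 32)
        println(model.toDebugString)
        sc.stop()
      }
    }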


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7ecf0c46
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7ecf0c46
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7ecf0c46

Branch: refs/heads/master
Commit: 7ecf0c46990c39df8aeddbd64ca33d01824bcc0a
Parents: a0e1abb
Author: Joseph K. Bradley 
Authored: Fri Aug 14 10:48:02 2015 -0700
Committer: Joseph K. Bradley 
Committed: Fri Aug 14 10:48:02 2015 -0700

--
 .../mllib/tree/impl/DecisionTreeMetadata.scala  | 27 
 .../DecisionTreeClassifierSuite.scala   | 13 ++
 2 files changed, 30 insertions(+), 10 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/7ecf0c46/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala
index 9fe2646..21ee49c 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala
@@ -144,21 +144,28 @@ private[spark] object DecisionTreeMetadata extends 
Logging {
   val maxCategoriesForUnorderedFeature =
 ((math.log(maxPossibleBins / 2 + 1) / math.log(2.0)) + 1).floor.toInt
   strategy.categoricalFeaturesInfo.foreach { case (featureIndex, 
numCategories) =>
-// Decide if some categorical features should be treated as unordered 
features,
-//  which require 2 * ((1 << numCategories - 1) - 1) bins.
-// We do this check with log values to prevent overflows in case 
numCategories is large.
-// The next check is equivalent to: 2 * ((1 << numCategories - 1) - 1) 
<= maxBins
-if (numCategories <= maxCategoriesForUnorderedFeature) {
-  unorderedFeatures.add(featureIndex)
-  numBins(featureIndex) = numUnorderedBins(numCategories)
-} else {
-  numBins(featureIndex) = numCategories
+// Hack: If a categorical feature has only 1 category, we treat it as 
continuous.
+// TODO(SPARK-9957): Handle this properly by filtering out those 
features.
+if (numCategories > 1) {
+  // Decide if some categorical features should be treated as 
unordered features,
+  //  which require 2 * ((1 << numCategories - 1) - 1) bins.
+  // We do this check with log values to prevent overflows in case 
numCategories is large.
+  // The next check is equivalent to: 2 * ((1 << numCategories - 1) - 
1) <= maxBins
+  if (numCategories <= maxCategoriesForUnorderedFeature) {
+unorderedFeatures.add(featureIndex)
+numBins(featureIndex) = numUnorderedBins(numCategories)
+  } else {
+numBins(featureIndex) = numCategories
+  }
 }
   }
 } else {
   // Binary classification or regression
   strategy.categoricalFeaturesInfo.foreach { case (featureIndex, 
numCategories) =>
-numBins(featureIndex) = numCategories
+// If a categorical feature has only 1 category, we treat it as 
continuous: SPARK-9957
+if (numCategories > 1) {
+  numBins(featureIndex) = numCategories
+}
   }
 }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/7ecf0c46/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
index 4b7c5d3..f680d8d 100644
--- 
a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionT

spark git commit: [SPARK-9956] [ML] Make trees work with one-category features

2015-08-14 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 4aa9238b9 -> f5298da16


[SPARK-9956] [ML] Make trees work with one-category features

This modifies DecisionTreeMetadata construction to treat 1-category features as 
continuous, so that trees do not fail with such features.  It is important for 
the pipelines API, where VectorIndexer can automatically categorize certain 
features as categorical.

As stated in the JIRA, this is a temp fix which we can improve upon later by 
automatically filtering out those features. That will take longer, though, 
since it will require careful indexing.

Targeted for 1.5 and master

CC: manishamde  mengxr yanboliang

Author: Joseph K. Bradley 

Closes #8187 from jkbradley/tree-1cat.

(cherry picked from commit 7ecf0c46990c39df8aeddbd64ca33d01824bcc0a)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f5298da1
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f5298da1
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f5298da1

Branch: refs/heads/branch-1.5
Commit: f5298da16671496946a9f9ef614e5f4b9284b1d2
Parents: 4aa9238
Author: Joseph K. Bradley 
Authored: Fri Aug 14 10:48:02 2015 -0700
Committer: Joseph K. Bradley 
Committed: Fri Aug 14 10:48:19 2015 -0700

--
 .../mllib/tree/impl/DecisionTreeMetadata.scala  | 27 
 .../DecisionTreeClassifierSuite.scala   | 13 ++
 2 files changed, 30 insertions(+), 10 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/f5298da1/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala
 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala
index 9fe2646..21ee49c 100644
--- 
a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala
@@ -144,21 +144,28 @@ private[spark] object DecisionTreeMetadata extends 
Logging {
   val maxCategoriesForUnorderedFeature =
 ((math.log(maxPossibleBins / 2 + 1) / math.log(2.0)) + 1).floor.toInt
   strategy.categoricalFeaturesInfo.foreach { case (featureIndex, 
numCategories) =>
-// Decide if some categorical features should be treated as unordered 
features,
-//  which require 2 * ((1 << numCategories - 1) - 1) bins.
-// We do this check with log values to prevent overflows in case 
numCategories is large.
-// The next check is equivalent to: 2 * ((1 << numCategories - 1) - 1) 
<= maxBins
-if (numCategories <= maxCategoriesForUnorderedFeature) {
-  unorderedFeatures.add(featureIndex)
-  numBins(featureIndex) = numUnorderedBins(numCategories)
-} else {
-  numBins(featureIndex) = numCategories
+// Hack: If a categorical feature has only 1 category, we treat it as 
continuous.
+// TODO(SPARK-9957): Handle this properly by filtering out those 
features.
+if (numCategories > 1) {
+  // Decide if some categorical features should be treated as 
unordered features,
+  //  which require 2 * ((1 << numCategories - 1) - 1) bins.
+  // We do this check with log values to prevent overflows in case 
numCategories is large.
+  // The next check is equivalent to: 2 * ((1 << numCategories - 1) - 
1) <= maxBins
+  if (numCategories <= maxCategoriesForUnorderedFeature) {
+unorderedFeatures.add(featureIndex)
+numBins(featureIndex) = numUnorderedBins(numCategories)
+  } else {
+numBins(featureIndex) = numCategories
+  }
 }
   }
 } else {
   // Binary classification or regression
   strategy.categoricalFeaturesInfo.foreach { case (featureIndex, 
numCategories) =>
-numBins(featureIndex) = numCategories
+// If a categorical feature has only 1 category, we treat it as 
continuous: SPARK-9957
+if (numCategories > 1) {
+  numBins(featureIndex) = numCategories
+}
   }
 }
 

http://git-wip-us.apache.org/repos/asf/spark/blob/f5298da1/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala
index 4b7c5d3..f680d8d 100644
--- 
a/mllib/src/test/scala/org/apache/spark/ml/classification/Deci

spark git commit: [SPARK-8744] [ML] Add a public constructor to StringIndexer

2015-08-14 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 7ecf0c469 -> a7317ccdc


[SPARK-8744] [ML] Add a public constructor to StringIndexer

It would be helpful to allow users to pass a pre-computed index to create an 
indexer, rather than always going through StringIndexer to create the model.

Author: Holden Karau 

Closes #7267 from 
holdenk/SPARK-8744-StringIndexerModel-should-have-public-constructor.
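
A minimal spark-shell style Scala sketch of the new usage (Spark 1.5 assumed; the column names are illustrative, and the setInputCol/setOutputCol setters on the model are assumed to be available): build the model directly from a pre-computed label array, where a label's index is its position in the array.

    // Sketch only: illustrative column names and data.
    import org.apache.spark.ml.feature.StringIndexerModel

    // Pre-computed index: "a" -> 0.0, "b" -> 1.0, "c" -> 2.0.
    val model = new StringIndexerModel(Array("a", "b", "c"))
      .setInputCol("category")
      .setOutputCol("categoryIndex")

    val df = sqlContext.createDataFrame(Seq((0, "a"), (1, "c"), (2, "b")))
      .toDF("id", "category")
    model.transform(df).show()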


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a7317ccd
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a7317ccd
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a7317ccd

Branch: refs/heads/master
Commit: a7317ccdc20d001e5b7f5277b0535923468bfbc6
Parents: 7ecf0c4
Author: Holden Karau 
Authored: Fri Aug 14 11:22:10 2015 -0700
Committer: Joseph K. Bradley 
Committed: Fri Aug 14 11:22:10 2015 -0700

--
 .../main/scala/org/apache/spark/ml/feature/StringIndexer.scala   | 4 +++-
 .../scala/org/apache/spark/ml/feature/StringIndexerSuite.scala   | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a7317ccd/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
index 9f6e7b6..6347578 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
@@ -102,10 +102,12 @@ class StringIndexer(override val uid: String) extends 
Estimator[StringIndexerMod
  * This is a temporary fix for the case when target labels do not exist during 
prediction.
  */
 @Experimental
-class StringIndexerModel private[ml] (
+class StringIndexerModel (
 override val uid: String,
 labels: Array[String]) extends Model[StringIndexerModel] with 
StringIndexerBase {
 
+  def this(labels: Array[String]) = this(Identifiable.randomUID("strIdx"), 
labels)
+
   private val labelToIndex: OpenHashMap[String, Double] = {
 val n = labels.length
 val map = new OpenHashMap[String, Double](n)

http://git-wip-us.apache.org/repos/asf/spark/blob/a7317ccd/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
index fa918ce..0b4c8ba 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
@@ -30,7 +30,9 @@ class StringIndexerSuite extends SparkFunSuite with 
MLlibTestSparkContext {
   test("params") {
 ParamsSuite.checkParams(new StringIndexer)
 val model = new StringIndexerModel("indexer", Array("a", "b"))
+val modelWithoutUid = new StringIndexerModel(Array("a", "b"))
 ParamsSuite.checkParams(model)
+ParamsSuite.checkParams(modelWithoutUid)
   }
 
   test("StringIndexer") {





spark git commit: [SPARK-8744] [ML] Add a public constructor to StringIndexer

2015-08-14 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 f5298da16 -> e4ea2390a


[SPARK-8744] [ML] Add a public constructor to StringIndexer

It would be helpful to allow users to pass a pre-computed index to create an 
indexer, rather than always going through StringIndexer to create the model.

Author: Holden Karau 

Closes #7267 from 
holdenk/SPARK-8744-StringIndexerModel-should-have-public-constructor.

(cherry picked from commit a7317ccdc20d001e5b7f5277b0535923468bfbc6)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e4ea2390
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e4ea2390
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e4ea2390

Branch: refs/heads/branch-1.5
Commit: e4ea2390a5f64747dbc60febc4f3c29e1970e46d
Parents: f5298da
Author: Holden Karau 
Authored: Fri Aug 14 11:22:10 2015 -0700
Committer: Joseph K. Bradley 
Committed: Fri Aug 14 11:22:19 2015 -0700

--
 .../main/scala/org/apache/spark/ml/feature/StringIndexer.scala   | 4 +++-
 .../scala/org/apache/spark/ml/feature/StringIndexerSuite.scala   | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e4ea2390/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
index b87e154..f5dfba1 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala
@@ -98,10 +98,12 @@ class StringIndexer(override val uid: String) extends 
Estimator[StringIndexerMod
  * This is a temporary fix for the case when target labels do not exist during 
prediction.
  */
 @Experimental
-class StringIndexerModel private[ml] (
+class StringIndexerModel (
 override val uid: String,
 labels: Array[String]) extends Model[StringIndexerModel] with 
StringIndexerBase {
 
+  def this(labels: Array[String]) = this(Identifiable.randomUID("strIdx"), 
labels)
+
   private val labelToIndex: OpenHashMap[String, Double] = {
 val n = labels.length
 val map = new OpenHashMap[String, Double](n)

http://git-wip-us.apache.org/repos/asf/spark/blob/e4ea2390/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
--
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
index 4a12e0b..d960861 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala
@@ -30,7 +30,9 @@ class StringIndexerSuite extends SparkFunSuite with 
MLlibTestSparkContext {
   test("params") {
 ParamsSuite.checkParams(new StringIndexer)
 val model = new StringIndexerModel("indexer", Array("a", "b"))
+val modelWithoutUid = new StringIndexerModel(Array("a", "b"))
 ParamsSuite.checkParams(model)
+ParamsSuite.checkParams(modelWithoutUid)
   }
 
   test("StringIndexer") {





spark git commit: [SPARK-9805] [MLLIB] [PYTHON] [STREAMING] Added _eventually for ml streaming pyspark tests

2015-08-15 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 2fda1d842 -> 881baf100


[SPARK-9805] [MLLIB] [PYTHON] [STREAMING] Added _eventually for ml streaming 
pyspark tests

Recently, PySpark ML streaming tests have been flaky, most likely because of 
the batches not being processed in time.  Proposal: Replace the use of 
_ssc_wait (which waits for a fixed amount of time) with a method which waits 
for a fixed amount of time but can terminate early based on a termination 
condition method.  With this, we can extend the waiting period (to make tests 
less flaky) but also stop early when possible (making tests faster on average, 
which I verified locally).

CC: mengxr tdas freeman-lab

Author: Joseph K. Bradley 

Closes #8087 from jkbradley/streaming-ml-tests.

(cherry picked from commit 1db7179fae672fcec7b8de12c374dd384ce51c67)
Signed-off-by: Joseph K. Bradley 
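
For reference, a Scala sketch of the same wait-with-early-termination pattern; the actual helper added here is the Python _eventually shown in the diff below, and the object and method names in this sketch are illustrative only, not part of Spark.

    // Sketch only: poll a condition until it returns true or a timeout elapses.
    object EventuallySketch {
      def eventually(timeoutSec: Double = 30.0)(condition: () => Any): Unit = {
        val start = System.nanoTime()
        var last: Any = null
        while ((System.nanoTime() - start) / 1e9 < timeoutSec) {
          last = condition()
          if (last == true) return     // condition met: stop waiting early
          Thread.sleep(10)             // not met yet: poll again shortly
        }
        throw new AssertionError(
          s"Test failed due to timeout after $timeoutSec sec; last condition value: $last")
      }

      def main(args: Array[String]): Unit = {
        val deadline = System.currentTimeMillis() + 200
        // Wait (up to 30 seconds) until 200 ms have elapsed.
        eventually() { () => System.currentTimeMillis() >= deadline }
        println("condition met")
      }
    }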


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/881baf10
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/881baf10
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/881baf10

Branch: refs/heads/branch-1.5
Commit: 881baf100fa9d8135b16cd390c344e3a5995805e
Parents: 2fda1d8
Author: Joseph K. Bradley 
Authored: Sat Aug 15 18:48:20 2015 -0700
Committer: Joseph K. Bradley 
Committed: Sat Aug 15 18:48:29 2015 -0700

--
 python/pyspark/mllib/tests.py | 177 +++--
 1 file changed, 129 insertions(+), 48 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/881baf10/python/pyspark/mllib/tests.py
--
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 3f5a02a..5097c5e 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -32,6 +32,9 @@ from numpy import sum as array_sum
 
 from py4j.protocol import Py4JJavaError
 
+if sys.version > '3':
+basestring = str
+
 if sys.version_info[:2] <= (2, 6):
 try:
 import unittest2 as unittest
@@ -86,9 +89,42 @@ class MLLibStreamingTestCase(unittest.TestCase):
 self.ssc.stop(False)
 
 @staticmethod
-def _ssc_wait(start_time, end_time, sleep_time):
-while time() - start_time < end_time:
+def _eventually(condition, timeout=30.0, catch_assertions=False):
+"""
+Wait a given amount of time for a condition to pass, else fail with an 
error.
+This is a helper utility for streaming ML tests.
+:param condition: Function that checks for termination conditions.
+  condition() can return:
+   - True: Conditions met. Return without error.
+   - other value: Conditions not met yet. Continue. 
Upon timeout,
+  include last such value in error 
message.
+  Note that this method may be called at any time 
during
+  streaming execution (e.g., even before any results
+  have been created).
+:param timeout: Number of seconds to wait.  Default 30 seconds.
+:param catch_assertions: If False (default), do not catch 
AssertionErrors.
+ If True, catch AssertionErrors; continue, but 
save
+ error to throw upon timeout.
+"""
+start_time = time()
+lastValue = None
+while time() - start_time < timeout:
+if catch_assertions:
+try:
+lastValue = condition()
+except AssertionError as e:
+lastValue = e
+else:
+lastValue = condition()
+if lastValue is True:
+return
 sleep(0.01)
+if isinstance(lastValue, AssertionError):
+raise lastValue
+else:
+raise AssertionError(
+"Test failed due to timeout after %g sec, with last condition 
returning: %s"
+% (timeout, lastValue))
 
 
 def _squared_distance(a, b):
@@ -999,10 +1035,13 @@ class StreamingKMeansTest(MLLibStreamingTestCase):
 [self.sc.parallelize(batch, 1) for batch in batches])
 stkm.trainOn(input_stream)
 
-t = time()
 self.ssc.start()
-self._ssc_wait(t, 10.0, 0.01)
-self.assertEquals(stkm.latestModel().clusterWeights, [25.0])
+
+def condition():
+self.assertEquals(stkm.latestModel().clusterWeights, [25.0])
+return True
+self._eventually(condition, catch_assertions=True)
+
 realCenters = array_sum(array(centers), axis=0)
 for i in range(5):
 modelCente

spark git commit: [SPARK-9805] [MLLIB] [PYTHON] [STREAMING] Added _eventually for ml streaming pyspark tests

2015-08-15 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 570567258 -> 1db7179fa


[SPARK-9805] [MLLIB] [PYTHON] [STREAMING] Added _eventually for ml streaming 
pyspark tests

Recently, PySpark ML streaming tests have been flaky, most likely because of 
the batches not being processed in time.  Proposal: Replace the use of 
_ssc_wait (which waits for a fixed amount of time) with a method which waits 
for a fixed amount of time but can terminate early based on a termination 
condition method.  With this, we can extend the waiting period (to make tests 
less flaky) but also stop early when possible (making tests faster on average, 
which I verified locally).

CC: mengxr tdas freeman-lab

Author: Joseph K. Bradley 

Closes #8087 from jkbradley/streaming-ml-tests.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1db7179f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1db7179f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1db7179f

Branch: refs/heads/master
Commit: 1db7179fae672fcec7b8de12c374dd384ce51c67
Parents: 5705672
Author: Joseph K. Bradley 
Authored: Sat Aug 15 18:48:20 2015 -0700
Committer: Joseph K. Bradley 
Committed: Sat Aug 15 18:48:20 2015 -0700

--
 python/pyspark/mllib/tests.py | 177 +++--
 1 file changed, 129 insertions(+), 48 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/1db7179f/python/pyspark/mllib/tests.py
--
diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py
index 3f5a02a..5097c5e 100644
--- a/python/pyspark/mllib/tests.py
+++ b/python/pyspark/mllib/tests.py
@@ -32,6 +32,9 @@ from numpy import sum as array_sum
 
 from py4j.protocol import Py4JJavaError
 
+if sys.version > '3':
+basestring = str
+
 if sys.version_info[:2] <= (2, 6):
 try:
 import unittest2 as unittest
@@ -86,9 +89,42 @@ class MLLibStreamingTestCase(unittest.TestCase):
 self.ssc.stop(False)
 
 @staticmethod
-def _ssc_wait(start_time, end_time, sleep_time):
-while time() - start_time < end_time:
+def _eventually(condition, timeout=30.0, catch_assertions=False):
+"""
+Wait a given amount of time for a condition to pass, else fail with an 
error.
+This is a helper utility for streaming ML tests.
+:param condition: Function that checks for termination conditions.
+  condition() can return:
+   - True: Conditions met. Return without error.
+   - other value: Conditions not met yet. Continue. 
Upon timeout,
+  include last such value in error 
message.
+  Note that this method may be called at any time 
during
+  streaming execution (e.g., even before any results
+  have been created).
+:param timeout: Number of seconds to wait.  Default 30 seconds.
+:param catch_assertions: If False (default), do not catch 
AssertionErrors.
+ If True, catch AssertionErrors; continue, but 
save
+ error to throw upon timeout.
+"""
+start_time = time()
+lastValue = None
+while time() - start_time < timeout:
+if catch_assertions:
+try:
+lastValue = condition()
+except AssertionError as e:
+lastValue = e
+else:
+lastValue = condition()
+if lastValue is True:
+return
 sleep(0.01)
+if isinstance(lastValue, AssertionError):
+raise lastValue
+else:
+raise AssertionError(
+"Test failed due to timeout after %g sec, with last condition 
returning: %s"
+% (timeout, lastValue))
 
 
 def _squared_distance(a, b):
@@ -999,10 +1035,13 @@ class StreamingKMeansTest(MLLibStreamingTestCase):
 [self.sc.parallelize(batch, 1) for batch in batches])
 stkm.trainOn(input_stream)
 
-t = time()
 self.ssc.start()
-self._ssc_wait(t, 10.0, 0.01)
-self.assertEquals(stkm.latestModel().clusterWeights, [25.0])
+
+def condition():
+self.assertEquals(stkm.latestModel().clusterWeights, [25.0])
+return True
+self._eventually(condition, catch_assertions=True)
+
 realCenters = array_sum(array(centers), axis=0)
 for i in range(5):
 modelCenters = stkm.latestModel().centers[0][i]
@@ -1027,7 +1066,7 @@ class StreamingKMeansTest(MLLibStreamingTestCase):

spark git commit: [SPARK-9768] [PYSPARK] [ML] Add Python API and user guide for ml.feature.ElementwiseProduct

2015-08-17 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 52ae95257 -> 0076e8212


[SPARK-9768] [PYSPARK] [ML] Add Python API and user guide for 
ml.feature.ElementwiseProduct

Add Python API, user guide and example for ml.feature.ElementwiseProduct.

Author: Yanbo Liang 

Closes #8061 from yanboliang/SPARK-9768.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0076e821
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0076e821
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0076e821

Branch: refs/heads/master
Commit: 0076e8212334c613599dcbc2ac23f49e9e50cc44
Parents: 52ae952
Author: Yanbo Liang 
Authored: Mon Aug 17 17:25:41 2015 -0700
Committer: Joseph K. Bradley 
Committed: Mon Aug 17 17:25:41 2015 -0700

--
 docs/ml-features.md  | 23 +++---
 python/pyspark/ml/feature.py | 67 ---
 2 files changed, 81 insertions(+), 9 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/0076e821/docs/ml-features.md
--
diff --git a/docs/ml-features.md b/docs/ml-features.md
index cec2cbe..6b2e36b 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1212,7 +1212,7 @@ v_N
 This example below demonstrates how to transform vectors using a transforming 
vector value.
 
 
-
+
 {% highlight scala %}
 import org.apache.spark.ml.feature.ElementwiseProduct
 import org.apache.spark.mllib.linalg.Vectors
@@ -1229,12 +1229,12 @@ val transformer = new ElementwiseProduct()
   .setOutputCol("transformedVector")
 
 // Batch transform the vectors to create new column:
-val transformedData = transformer.transform(dataFrame)
+transformer.transform(dataFrame).show()
 
 {% endhighlight %}
 
 
-
+
 {% highlight java %}
 import com.google.common.collect.Lists;
 
@@ -1267,10 +1267,25 @@ ElementwiseProduct transformer = new 
ElementwiseProduct()
   .setInputCol("vector")
   .setOutputCol("transformedVector");
 // Batch transform the vectors to create new column:
-DataFrame transformedData = transformer.transform(dataFrame);
+transformer.transform(dataFrame).show();
 
 {% endhighlight %}
 
+
+
+{% highlight python %}
+from pyspark.ml.feature import ElementwiseProduct
+from pyspark.mllib.linalg import Vectors
+
+data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)]
+df = sqlContext.createDataFrame(data, ["vector"])
+transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]), 
+ inputCol="vector", 
outputCol="transformedVector")
+transformer.transform(df).show()
+
+{% endhighlight %}
+
+
 
 
 ## VectorAssembler

http://git-wip-us.apache.org/repos/asf/spark/blob/0076e821/python/pyspark/ml/feature.py
--
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 535d553..04b2b2c 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -26,11 +26,11 @@ from pyspark.ml.wrapper import JavaEstimator, JavaModel, 
JavaTransformer
 from pyspark.mllib.common import inherit_doc
 from pyspark.mllib.linalg import _convert_to_vector
 
-__all__ = ['Binarizer', 'Bucketizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 
'Normalizer',
-   'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 
'StandardScaler',
-   'StandardScalerModel', 'StringIndexer', 'StringIndexerModel', 
'Tokenizer',
-   'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel', 
'PCA',
-   'PCAModel', 'RFormula', 'RFormulaModel']
+__all__ = ['Binarizer', 'Bucketizer', 'ElementwiseProduct', 'HashingTF', 
'IDF', 'IDFModel',
+   'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 
'RegexTokenizer',
+   'StandardScaler', 'StandardScalerModel', 'StringIndexer', 
'StringIndexerModel',
+   'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 
'Word2VecModel',
+   'PCA', 'PCAModel', 'RFormula', 'RFormulaModel']
 
 
 @inherit_doc
@@ -167,6 +167,63 @@ class Bucketizer(JavaTransformer, HasInputCol, 
HasOutputCol):
 
 
 @inherit_doc
+class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol):
+"""
+Outputs the Hadamard product (i.e., the element-wise product) of each 
input vector
+with a provided "weight" vector. In other words, it scales each column of 
the dataset
+by a scalar multiplier.
+
+>>> from pyspark.mllib.linalg import Vectors
+>>> df = sqlContext.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),)], 
["values"])
+>>> ep = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]),
+... inputCol="values", outputCol="eprod")
+>>> ep.transform(df).head().eprod
+DenseVector([2.0, 2.0, 9.0])
+>>> ep.setParams(scalingVec

spark git commit: [SPARK-9768] [PYSPARK] [ML] Add Python API and user guide for ml.feature.ElementwiseProduct

2015-08-17 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 407175e82 -> eaeebb92f


[SPARK-9768] [PYSPARK] [ML] Add Python API and user guide for 
ml.feature.ElementwiseProduct

Add Python API, user guide and example for ml.feature.ElementwiseProduct.

Author: Yanbo Liang 

Closes #8061 from yanboliang/SPARK-9768.

(cherry picked from commit 0076e8212334c613599dcbc2ac23f49e9e50cc44)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eaeebb92
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eaeebb92
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eaeebb92

Branch: refs/heads/branch-1.5
Commit: eaeebb92f336d3862169c61e7dcc6afa2732084b
Parents: 407175e
Author: Yanbo Liang 
Authored: Mon Aug 17 17:25:41 2015 -0700
Committer: Joseph K. Bradley 
Committed: Mon Aug 17 17:25:50 2015 -0700

--
 docs/ml-features.md  | 23 +++---
 python/pyspark/ml/feature.py | 67 ---
 2 files changed, 81 insertions(+), 9 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/eaeebb92/docs/ml-features.md
--
diff --git a/docs/ml-features.md b/docs/ml-features.md
index cec2cbe..6b2e36b 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -1212,7 +1212,7 @@ v_N
 This example below demonstrates how to transform vectors using a transforming 
vector value.
 
 
-
+
 {% highlight scala %}
 import org.apache.spark.ml.feature.ElementwiseProduct
 import org.apache.spark.mllib.linalg.Vectors
@@ -1229,12 +1229,12 @@ val transformer = new ElementwiseProduct()
   .setOutputCol("transformedVector")
 
 // Batch transform the vectors to create new column:
-val transformedData = transformer.transform(dataFrame)
+transformer.transform(dataFrame).show()
 
 {% endhighlight %}
 
 
-
+
 {% highlight java %}
 import com.google.common.collect.Lists;
 
@@ -1267,10 +1267,25 @@ ElementwiseProduct transformer = new 
ElementwiseProduct()
   .setInputCol("vector")
   .setOutputCol("transformedVector");
 // Batch transform the vectors to create new column:
-DataFrame transformedData = transformer.transform(dataFrame);
+transformer.transform(dataFrame).show();
 
 {% endhighlight %}
 
+
+
+{% highlight python %}
+from pyspark.ml.feature import ElementwiseProduct
+from pyspark.mllib.linalg import Vectors
+
+data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)]
+df = sqlContext.createDataFrame(data, ["vector"])
+transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]), 
+ inputCol="vector", 
outputCol="transformedVector")
+transformer.transform(df).show()
+
+{% endhighlight %}
+
+
 
 
 ## VectorAssembler

http://git-wip-us.apache.org/repos/asf/spark/blob/eaeebb92/python/pyspark/ml/feature.py
--
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 535d553..04b2b2c 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -26,11 +26,11 @@ from pyspark.ml.wrapper import JavaEstimator, JavaModel, 
JavaTransformer
 from pyspark.mllib.common import inherit_doc
 from pyspark.mllib.linalg import _convert_to_vector
 
-__all__ = ['Binarizer', 'Bucketizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 
'Normalizer',
-   'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 
'StandardScaler',
-   'StandardScalerModel', 'StringIndexer', 'StringIndexerModel', 
'Tokenizer',
-   'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel', 
'PCA',
-   'PCAModel', 'RFormula', 'RFormulaModel']
+__all__ = ['Binarizer', 'Bucketizer', 'ElementwiseProduct', 'HashingTF', 
'IDF', 'IDFModel',
+   'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 
'RegexTokenizer',
+   'StandardScaler', 'StandardScalerModel', 'StringIndexer', 
'StringIndexerModel',
+   'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 
'Word2VecModel',
+   'PCA', 'PCAModel', 'RFormula', 'RFormulaModel']
 
 
 @inherit_doc
@@ -167,6 +167,63 @@ class Bucketizer(JavaTransformer, HasInputCol, 
HasOutputCol):
 
 
 @inherit_doc
+class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol):
+"""
+Outputs the Hadamard product (i.e., the element-wise product) of each 
input vector
+with a provided "weight" vector. In other words, it scales each column of 
the dataset
+by a scalar multiplier.
+
+>>> from pyspark.mllib.linalg import Vectors
+>>> df = sqlContext.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),)], 
["values"])
+>>> ep = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]),
+... inputCol="values", outputCol="e

spark git commit: [SPARK-9028] [ML] Add CountVectorizer as an estimator to generate CountVectorizerModel

2015-08-18 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 1968276af -> 354f4582b


[SPARK-9028] [ML] Add CountVectorizer as an estimator to generate 
CountVectorizerModel

jira: https://issues.apache.org/jira/browse/SPARK-9028

Add an estimator for CountVectorizerModel. The estimator will extract a 
vocabulary from document collections according to the term frequency.

I changed the meaning of minCount to act as a filter across the corpus. This 
aligns with Word2Vec and the similar parameter in scikit-learn.
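
Not part of this commit's diff: a minimal Scala usage sketch of the new estimator,
assuming an existing `sqlContext`; the toy corpus and parameter values below are
made up for illustration.

import org.apache.spark.ml.feature.CountVectorizer

// Hypothetical toy corpus: each row is a sequence of tokens.
val df = sqlContext.createDataFrame(Seq(
  (0, Array("a", "b", "c")),
  (1, Array("a", "b", "b", "c", "a"))
)).toDF("id", "words")

// Fit a CountVectorizerModel; vocabSize caps the vocabulary size and
// minDF filters out terms that appear in too few documents.
val cvModel = new CountVectorizer()
  .setInputCol("words")
  .setOutputCol("features")
  .setVocabSize(3)
  .setMinDF(2.0)
  .fit(df)

cvModel.transform(df).select("features").show()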

Author: Yuhao Yang 
Author: Joseph K. Bradley 

Closes #7388 from hhbyyh/cvEstimator.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/354f4582
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/354f4582
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/354f4582

Branch: refs/heads/master
Commit: 354f4582b637fa25d3892ec2b12869db50ed83c9
Parents: 1968276
Author: Yuhao Yang 
Authored: Tue Aug 18 11:00:09 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 18 11:00:09 2015 -0700

--
 .../spark/ml/feature/CountVectorizer.scala  | 235 +++
 .../spark/ml/feature/CountVectorizerModel.scala |  82 ---
 .../spark/ml/feature/CountVectorizerSuite.scala | 167 +
 .../spark/ml/feature/CountVectorizorSuite.scala |  73 --
 4 files changed, 402 insertions(+), 155 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/354f4582/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
new file mode 100644
index 000..49028e4
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.ml.feature
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.ml.param._
+import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
+import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
+import org.apache.spark.ml.{Estimator, Model}
+import org.apache.spark.mllib.linalg.{VectorUDT, Vectors}
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types._
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.util.collection.OpenHashMap
+
+/**
+ * Params for [[CountVectorizer]] and [[CountVectorizerModel]].
+ */
+private[feature] trait CountVectorizerParams extends Params with HasInputCol 
with HasOutputCol {
+
+  /**
+   * Max size of the vocabulary.
+   * CountVectorizer will build a vocabulary that only considers the top
+   * vocabSize terms ordered by term frequency across the corpus.
+   *
+   * Default: 2^18^
+   * @group param
+   */
+  val vocabSize: IntParam =
+new IntParam(this, "vocabSize", "max size of the vocabulary", 
ParamValidators.gt(0))
+
+  /** @group getParam */
+  def getVocabSize: Int = $(vocabSize)
+
+  /**
+   * Specifies the minimum number of different documents a term must appear in 
to be included
+   * in the vocabulary.
+   * If this is an integer >= 1, this specifies the number of documents the 
term must appear in;
+   * if this is a double in [0,1), then this specifies the fraction of 
documents.
+   *
+   * Default: 1
+   * @group param
+   */
+  val minDF: DoubleParam = new DoubleParam(this, "minDF", "Specifies the 
minimum number of" +
+" different documents a term must appear in to be included in the 
vocabulary." +
+" If this is an integer >= 1, this specifies the number of documents the 
term must" +
+" appear in; if this is a double in [0,1), then this specifies the 
fraction of documents.",
+ParamValidators.gtEq(0.0))
+
+  /** @group getParam */
+  def getMinDF: Double = $(minDF)
+
+  /** Validates and transforms the input schema. */
+  protected def validateAndTransf

spark git commit: [SPARK-9028] [ML] Add CountVectorizer as an estimator to generate CountVectorizerModel

2015-08-18 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 20a760a00 -> b86378cf2


[SPARK-9028] [ML] Add CountVectorizer as an estimator to generate 
CountVectorizerModel

jira: https://issues.apache.org/jira/browse/SPARK-9028

Add an estimator for CountVectorizerModel. The estimator will extract a 
vocabulary from document collections according to the term frequency.

I changed the meaning of minCount to act as a filter across the corpus. This 
aligns with Word2Vec and the similar parameter in scikit-learn.

Author: Yuhao Yang 
Author: Joseph K. Bradley 

Closes #7388 from hhbyyh/cvEstimator.

(cherry picked from commit 354f4582b637fa25d3892ec2b12869db50ed83c9)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b86378cf
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b86378cf
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b86378cf

Branch: refs/heads/branch-1.5
Commit: b86378cf29f8fdb70e41b2f04d831b8a15c1c859
Parents: 20a760a
Author: Yuhao Yang 
Authored: Tue Aug 18 11:00:09 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 18 11:00:22 2015 -0700

--
 .../spark/ml/feature/CountVectorizer.scala  | 235 +++
 .../spark/ml/feature/CountVectorizerModel.scala |  82 ---
 .../spark/ml/feature/CountVectorizerSuite.scala | 167 +
 .../spark/ml/feature/CountVectorizorSuite.scala |  73 --
 4 files changed, 402 insertions(+), 155 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/b86378cf/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
new file mode 100644
index 000..49028e4
--- /dev/null
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.ml.feature
+
+import org.apache.spark.annotation.Experimental
+import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.ml.param._
+import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
+import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
+import org.apache.spark.ml.{Estimator, Model}
+import org.apache.spark.mllib.linalg.{VectorUDT, Vectors}
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types._
+import org.apache.spark.sql.DataFrame
+import org.apache.spark.util.collection.OpenHashMap
+
+/**
+ * Params for [[CountVectorizer]] and [[CountVectorizerModel]].
+ */
+private[feature] trait CountVectorizerParams extends Params with HasInputCol 
with HasOutputCol {
+
+  /**
+   * Max size of the vocabulary.
+   * CountVectorizer will build a vocabulary that only considers the top
+   * vocabSize terms ordered by term frequency across the corpus.
+   *
+   * Default: 2^18^
+   * @group param
+   */
+  val vocabSize: IntParam =
+new IntParam(this, "vocabSize", "max size of the vocabulary", 
ParamValidators.gt(0))
+
+  /** @group getParam */
+  def getVocabSize: Int = $(vocabSize)
+
+  /**
+   * Specifies the minimum number of different documents a term must appear in 
to be included
+   * in the vocabulary.
+   * If this is an integer >= 1, this specifies the number of documents the 
term must appear in;
+   * if this is a double in [0,1), then this specifies the fraction of 
documents.
+   *
+   * Default: 1
+   * @group param
+   */
+  val minDF: DoubleParam = new DoubleParam(this, "minDF", "Specifies the 
minimum number of" +
+" different documents a term must appear in to be included in the 
vocabulary." +
+" If this is an integer >= 1, this specifies the number of documents the 
term must" +
+" appear in; if this is a double in [0,1), then this specifies the 
fraction of documents.",
+ParamValidators.gtEq(0.0))
+
+  /** @group getParam */
+  def getMi

spark git commit: [SPARK-10012] [ML] Missing test case for Params#arrayLengthGt

2015-08-18 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 56f4da263 -> fb207b245


[SPARK-10012] [ML] Missing test case for Params#arrayLengthGt

Currently there is no test case for `Params#arrayLengthGt`.

Author: lewuathe 

Closes #8223 from Lewuathe/SPARK-10012.

(cherry picked from commit c635a16f64c939182196b46725ef2d00ed107cca)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fb207b24
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fb207b24
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fb207b24

Branch: refs/heads/branch-1.5
Commit: fb207b245305b30b4fe47e08f98f2571a2d05249
Parents: 56f4da2
Author: lewuathe 
Authored: Tue Aug 18 15:30:23 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 18 15:30:34 2015 -0700

--
 mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala | 3 +++
 1 file changed, 3 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/fb207b24/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
--
diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
index be95638..2c878f8 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
@@ -199,6 +199,9 @@ class ParamsSuite extends SparkFunSuite {
 
 val inArray = ParamValidators.inArray[Int](Array(1, 2))
 assert(inArray(1) && inArray(2) && !inArray(0))
+
+val arrayLengthGt = ParamValidators.arrayLengthGt[Int](2.0)
+assert(arrayLengthGt(Array(0, 1, 2)) && !arrayLengthGt(Array(0, 1)))
   }
 
   test("Params.copyValues") {





spark git commit: [SPARK-10012] [ML] Missing test case for Params#arrayLengthGt

2015-08-18 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 1dbffba37 -> c635a16f6


[SPARK-10012] [ML] Missing test case for Params#arrayLengthGt

Currently there is no test case for `Params#arrayLengthGt`.

Author: lewuathe 

Closes #8223 from Lewuathe/SPARK-10012.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c635a16f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c635a16f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c635a16f

Branch: refs/heads/master
Commit: c635a16f64c939182196b46725ef2d00ed107cca
Parents: 1dbffba
Author: lewuathe 
Authored: Tue Aug 18 15:30:23 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 18 15:30:23 2015 -0700

--
 mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala | 3 +++
 1 file changed, 3 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c635a16f/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
--
diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
index be95638..2c878f8 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala
@@ -199,6 +199,9 @@ class ParamsSuite extends SparkFunSuite {
 
 val inArray = ParamValidators.inArray[Int](Array(1, 2))
 assert(inArray(1) && inArray(2) && !inArray(0))
+
+val arrayLengthGt = ParamValidators.arrayLengthGt[Int](2.0)
+assert(arrayLengthGt(Array(0, 1, 2)) && !arrayLengthGt(Array(0, 1)))
   }
 
   test("Params.copyValues") {





spark git commit: [SPARK-8473] [SPARK-9889] [ML] User guide and example code for DCT

2015-08-18 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 e1b50c7d2 -> 4ee225af8


[SPARK-8473] [SPARK-9889] [ML] User guide and example code for DCT

mengxr jkbradley

Author: Feynman Liang 

Closes #8184 from feynmanliang/SPARK-9889-DCT-docs.

(cherry picked from commit badf7fa650f9801c70515907fcc26b58d7ec3143)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4ee225af
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4ee225af
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4ee225af

Branch: refs/heads/branch-1.5
Commit: 4ee225af8ecb38fbcf8e43ac1c498a76f3590b98
Parents: e1b50c7
Author: Feynman Liang 
Authored: Tue Aug 18 17:54:49 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 18 17:54:58 2015 -0700

--
 docs/ml-features.md | 71 
 1 file changed, 71 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/4ee225af/docs/ml-features.md
--
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 6b2e36b..28a6193 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -649,6 +649,77 @@ for expanded in polyDF.select("polyFeatures").take(3):
 
 
 
+## Discrete Cosine Transform (DCT)
+
+The [Discrete Cosine
+Transform](https://en.wikipedia.org/wiki/Discrete_cosine_transform)
+transforms a length $N$ real-valued sequence in the time domain into
+another length $N$ real-valued sequence in the frequency domain. A
+[DCT](api/scala/index.html#org.apache.spark.ml.feature.DCT) class
+provides this functionality, implementing the
+[DCT-II](https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II)
+and scaling the result by $1/\sqrt{2}$ such that the representing matrix
+for the transform is unitary. No shift is applied to the transformed
+sequence (e.g. the $0$th element of the transformed sequence is the
+$0$th DCT coefficient and _not_ the $N/2$th).
+
+
+
+{% highlight scala %}
+import org.apache.spark.ml.feature.DCT
+import org.apache.spark.mllib.linalg.Vectors
+
+val data = Seq(
+  Vectors.dense(0.0, 1.0, -2.0, 3.0),
+  Vectors.dense(-1.0, 2.0, 4.0, -7.0),
+  Vectors.dense(14.0, -2.0, -5.0, 1.0))
+val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+val dct = new DCT()
+  .setInputCol("features")
+  .setOutputCol("featuresDCT")
+  .setInverse(false)
+val dctDf = dct.transform(df)
+dctDf.select("featuresDCT").show(3)
+{% endhighlight %}
+
+
+
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.DCT;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+JavaRDD data = jsc.parallelize(Arrays.asList(
+  RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)),
+  RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)),
+  RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0))
+));
+StructType schema = new StructType(new StructField[] {
+  new StructField("features", new VectorUDT(), false, Metadata.empty()),
+});
+DataFrame df = jsql.createDataFrame(data, schema);
+DCT dct = new DCT()
+  .setInputCol("features")
+  .setOutputCol("featuresDCT")
+  .setInverse(false);
+DataFrame dctDf = dct.transform(df);
+dctDf.select("featuresDCT").show(3);
+{% endhighlight %}
+
+
+
 ## StringIndexer
 
 `StringIndexer` encodes a string column of labels to a column of label indices.





spark git commit: [SPARK-8473] [SPARK-9889] [ML] User guide and example code for DCT

2015-08-18 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 9108eff74 -> badf7fa65


[SPARK-8473] [SPARK-9889] [ML] User guide and example code for DCT

mengxr jkbradley

Author: Feynman Liang 

Closes #8184 from feynmanliang/SPARK-9889-DCT-docs.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/badf7fa6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/badf7fa6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/badf7fa6

Branch: refs/heads/master
Commit: badf7fa650f9801c70515907fcc26b58d7ec3143
Parents: 9108eff
Author: Feynman Liang 
Authored: Tue Aug 18 17:54:49 2015 -0700
Committer: Joseph K. Bradley 
Committed: Tue Aug 18 17:54:49 2015 -0700

--
 docs/ml-features.md | 71 
 1 file changed, 71 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/badf7fa6/docs/ml-features.md
--
diff --git a/docs/ml-features.md b/docs/ml-features.md
index 6b2e36b..28a6193 100644
--- a/docs/ml-features.md
+++ b/docs/ml-features.md
@@ -649,6 +649,77 @@ for expanded in polyDF.select("polyFeatures").take(3):
 
 
 
+## Discrete Cosine Transform (DCT)
+
+The [Discrete Cosine
+Transform](https://en.wikipedia.org/wiki/Discrete_cosine_transform)
+transforms a length $N$ real-valued sequence in the time domain into
+another length $N$ real-valued sequence in the frequency domain. A
+[DCT](api/scala/index.html#org.apache.spark.ml.feature.DCT) class
+provides this functionality, implementing the
+[DCT-II](https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II)
+and scaling the result by $1/\sqrt{2}$ such that the representing matrix
+for the transform is unitary. No shift is applied to the transformed
+sequence (e.g. the $0$th element of the transformed sequence is the
+$0$th DCT coefficient and _not_ the $N/2$th).
+
+
+
+{% highlight scala %}
+import org.apache.spark.ml.feature.DCT
+import org.apache.spark.mllib.linalg.Vectors
+
+val data = Seq(
+  Vectors.dense(0.0, 1.0, -2.0, 3.0),
+  Vectors.dense(-1.0, 2.0, 4.0, -7.0),
+  Vectors.dense(14.0, -2.0, -5.0, 1.0))
+val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
+val dct = new DCT()
+  .setInputCol("features")
+  .setOutputCol("featuresDCT")
+  .setInverse(false)
+val dctDf = dct.transform(df)
+dctDf.select("featuresDCT").show(3)
+{% endhighlight %}
+
+
+
+{% highlight java %}
+import java.util.Arrays;
+
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.ml.feature.DCT;
+import org.apache.spark.mllib.linalg.Vector;
+import org.apache.spark.mllib.linalg.VectorUDT;
+import org.apache.spark.mllib.linalg.Vectors;
+import org.apache.spark.sql.DataFrame;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.RowFactory;
+import org.apache.spark.sql.SQLContext;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.sql.types.StructType;
+
+JavaRDD data = jsc.parallelize(Arrays.asList(
+  RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)),
+  RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)),
+  RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0))
+));
+StructType schema = new StructType(new StructField[] {
+  new StructField("features", new VectorUDT(), false, Metadata.empty()),
+});
+DataFrame df = jsql.createDataFrame(data, schema);
+DCT dct = new DCT()
+  .setInputCol("features")
+  .setOutputCol("featuresDCT")
+  .setInverse(false);
+DataFrame dctDf = dct.transform(df);
+dctDf.select("featuresDCT").show(3);
+{% endhighlight %}
+
+
+
 ## StringIndexer
 
 `StringIndexer` encodes a string column of labels to a column of label indices.





spark git commit: [SPARK-10097] Adds `shouldMaximize` flag to `ml.evaluation.Evaluator`

2015-08-19 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 5fd53c64b -> 28a98464e


[SPARK-10097] Adds `shouldMaximize` flag to `ml.evaluation.Evaluator`

Previously, users of an evaluator (`CrossValidator` and `TrainValidationSplit`) 
would always maximize its metric, leading to a hacky workaround that negated 
metrics meant to be minimized and caused erroneous negative values to be 
reported to the user.

This PR adds an `isLargerBetter` attribute to the `Evaluator` base class, 
telling users of `Evaluator` whether the chosen metric should be 
maximized or minimized.
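
As a rough sketch (not part of this patch) of the intended usage, a model
selector can branch on the new attribute instead of negating metrics; the
per-model scores below are made up.

import org.apache.spark.ml.evaluation.RegressionEvaluator

// RMSE should be minimized, so isLargerBetter is false for this metric.
val evaluator = new RegressionEvaluator().setMetricName("rmse")
val metrics = Array(0.82, 0.47, 0.95)  // hypothetical score per candidate model
val bestIndex =
  if (evaluator.isLargerBetter) metrics.indices.maxBy(i => metrics(i))
  else metrics.indices.minBy(i => metrics(i))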

CC jkbradley

Author: Feynman Liang 
Author: Joseph K. Bradley 

Closes #8290 from feynmanliang/SPARK-10097.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/28a98464
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/28a98464
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/28a98464

Branch: refs/heads/master
Commit: 28a98464ea65aa7b35e24fca5ddaa60c2e5d53ee
Parents: 5fd53c6
Author: Feynman Liang 
Authored: Wed Aug 19 11:35:05 2015 -0700
Committer: Joseph K. Bradley 
Committed: Wed Aug 19 11:35:05 2015 -0700

--
 .../BinaryClassificationEvaluator.scala | 20 
 .../apache/spark/ml/evaluation/Evaluator.scala  |  7 +++
 .../MulticlassClassificationEvaluator.scala |  8 
 .../ml/evaluation/RegressionEvaluator.scala | 19 +++
 .../apache/spark/ml/tuning/CrossValidator.scala |  4 +++-
 .../spark/ml/tuning/TrainValidationSplit.scala  |  4 +++-
 .../evaluation/RegressionEvaluatorSuite.scala   |  4 ++--
 .../spark/ml/tuning/CrossValidatorSuite.scala   |  2 ++
 .../ml/tuning/TrainValidationSplitSuite.scala   |  2 ++
 python/pyspark/ml/evaluation.py |  4 ++--
 10 files changed, 52 insertions(+), 22 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/28a98464/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
index 5d5cb7e..56419a0 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
@@ -40,8 +40,11 @@ class BinaryClassificationEvaluator(override val uid: String)
* param for metric name in evaluation
* @group param
*/
-  val metricName: Param[String] = new Param(this, "metricName",
-"metric name in evaluation (areaUnderROC|areaUnderPR)")
+  val metricName: Param[String] = {
+val allowedParams = ParamValidators.inArray(Array("areaUnderROC", 
"areaUnderPR"))
+new Param(
+  this, "metricName", "metric name in evaluation 
(areaUnderROC|areaUnderPR)", allowedParams)
+  }
 
   /** @group getParam */
   def getMetricName: String = $(metricName)
@@ -76,16 +79,17 @@ class BinaryClassificationEvaluator(override val uid: 
String)
   }
 val metrics = new BinaryClassificationMetrics(scoreAndLabels)
 val metric = $(metricName) match {
-  case "areaUnderROC" =>
-metrics.areaUnderROC()
-  case "areaUnderPR" =>
-metrics.areaUnderPR()
-  case other =>
-throw new IllegalArgumentException(s"Does not support metric $other.")
+  case "areaUnderROC" => metrics.areaUnderROC()
+  case "areaUnderPR" => metrics.areaUnderPR()
 }
 metrics.unpersist()
 metric
   }
 
+  override def isLargerBetter: Boolean = $(metricName) match {
+case "areaUnderROC" => true
+case "areaUnderPR" => true
+  }
+
   override def copy(extra: ParamMap): BinaryClassificationEvaluator = 
defaultCopy(extra)
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/28a98464/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala 
b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala
index e56c946..13bd330 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala
@@ -46,5 +46,12 @@ abstract class Evaluator extends Params {
*/
   def evaluate(dataset: DataFrame): Double
 
+  /**
+   * Indicates whether the metric returned by [[evaluate()]] should be 
maximized (true, default)
+   * or minimized (false).
+   * A given evaluator may supp

spark git commit: [SPARK-10097] Adds `shouldMaximize` flag to `ml.evaluation.Evaluator`

2015-08-19 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-1.5 a8e880818 -> f25c32475


[SPARK-10097] Adds `shouldMaximize` flag to `ml.evaluation.Evaluator`

Previously, users of an evaluator (`CrossValidator` and `TrainValidationSplit`) 
would always maximize its metric, leading to a hacky workaround that negated 
metrics meant to be minimized and caused erroneous negative values to be 
reported to the user.

This PR adds an `isLargerBetter` attribute to the `Evaluator` base class, 
telling users of `Evaluator` whether the chosen metric should be 
maximized or minimized.

CC jkbradley

Author: Feynman Liang 
Author: Joseph K. Bradley 

Closes #8290 from feynmanliang/SPARK-10097.

(cherry picked from commit 28a98464ea65aa7b35e24fca5ddaa60c2e5d53ee)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f25c3247
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f25c3247
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f25c3247

Branch: refs/heads/branch-1.5
Commit: f25c324758095ddf572157e64c8f1a93843f79c7
Parents: a8e8808
Author: Feynman Liang 
Authored: Wed Aug 19 11:35:05 2015 -0700
Committer: Joseph K. Bradley 
Committed: Wed Aug 19 11:35:17 2015 -0700

--
 .../BinaryClassificationEvaluator.scala | 20 
 .../apache/spark/ml/evaluation/Evaluator.scala  |  7 +++
 .../MulticlassClassificationEvaluator.scala |  8 
 .../ml/evaluation/RegressionEvaluator.scala | 19 +++
 .../apache/spark/ml/tuning/CrossValidator.scala |  4 +++-
 .../spark/ml/tuning/TrainValidationSplit.scala  |  4 +++-
 .../evaluation/RegressionEvaluatorSuite.scala   |  4 ++--
 .../spark/ml/tuning/CrossValidatorSuite.scala   |  2 ++
 .../ml/tuning/TrainValidationSplitSuite.scala   |  2 ++
 python/pyspark/ml/evaluation.py |  4 ++--
 10 files changed, 52 insertions(+), 22 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/f25c3247/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
 
b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
index 5d5cb7e..56419a0 100644
--- 
a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
+++ 
b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala
@@ -40,8 +40,11 @@ class BinaryClassificationEvaluator(override val uid: String)
* param for metric name in evaluation
* @group param
*/
-  val metricName: Param[String] = new Param(this, "metricName",
-"metric name in evaluation (areaUnderROC|areaUnderPR)")
+  val metricName: Param[String] = {
+val allowedParams = ParamValidators.inArray(Array("areaUnderROC", 
"areaUnderPR"))
+new Param(
+  this, "metricName", "metric name in evaluation 
(areaUnderROC|areaUnderPR)", allowedParams)
+  }
 
   /** @group getParam */
   def getMetricName: String = $(metricName)
@@ -76,16 +79,17 @@ class BinaryClassificationEvaluator(override val uid: 
String)
   }
 val metrics = new BinaryClassificationMetrics(scoreAndLabels)
 val metric = $(metricName) match {
-  case "areaUnderROC" =>
-metrics.areaUnderROC()
-  case "areaUnderPR" =>
-metrics.areaUnderPR()
-  case other =>
-throw new IllegalArgumentException(s"Does not support metric $other.")
+  case "areaUnderROC" => metrics.areaUnderROC()
+  case "areaUnderPR" => metrics.areaUnderPR()
 }
 metrics.unpersist()
 metric
   }
 
+  override def isLargerBetter: Boolean = $(metricName) match {
+case "areaUnderROC" => true
+case "areaUnderPR" => true
+  }
+
   override def copy(extra: ParamMap): BinaryClassificationEvaluator = 
defaultCopy(extra)
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/f25c3247/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala
--
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala 
b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala
index e56c946..13bd330 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala
@@ -46,5 +46,12 @@ abstract class Evaluator extends Params {
*/
   def evaluate(dataset: DataFrame): Double
 
+  /**
+   * Indicates whether the metric returned by [

spark git commit: [SPARK-15502][DOC][ML][PYSPARK] add guide note that ALS only supports integer ids

2016-05-24 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 2574abea0 -> 31fb5fa40


[SPARK-15502][DOC][ML][PYSPARK] add guide note that ALS only supports integer 
ids

This PR adds a note to clarify that the ML API for ALS only supports integers 
for user/item ids, and that other types for these columns can be used but the 
ids must fall within integer range.

(Refer [SPARK-14891](https://issues.apache.org/jira/browse/SPARK-14891)).

Also cleaned up a reference to `mllib` in the ML doc.

## How was this patch tested?
Built and viewed User Guide doc locally.

Author: Nick Pentreath 

Closes #13278 from MLnick/SPARK-15502-als-int-id-doc-note.

(cherry picked from commit 20900e5feced76e87f0a12823d0e3f07e082105f)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/31fb5fa4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/31fb5fa4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/31fb5fa4

Branch: refs/heads/branch-2.0
Commit: 31fb5fa4042eb46c541e5726a3f14da6f9f6bc2d
Parents: 2574abe
Author: Nick Pentreath 
Authored: Tue May 24 11:34:06 2016 -0700
Committer: Joseph K. Bradley 
Committed: Tue May 24 11:34:15 2016 -0700

--
 docs/ml-collaborative-filtering.md | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/31fb5fa4/docs/ml-collaborative-filtering.md
--
diff --git a/docs/ml-collaborative-filtering.md 
b/docs/ml-collaborative-filtering.md
index bd3d527..8bd75f3 100644
--- a/docs/ml-collaborative-filtering.md
+++ b/docs/ml-collaborative-filtering.md
@@ -29,6 +29,10 @@ following parameters:
   *baseline* confidence in preference observations (defaults to 1.0).
 * *nonnegative* specifies whether or not to use nonnegative constraints for 
least squares (defaults to `false`).
 
+**Note:** The DataFrame-based API for ALS currently only supports integers for 
user and item ids.
+Other numeric types are supported for the user and item id columns, 
+but the ids must be within the integer value range. 
+
 ### Explicit vs. implicit feedback
 
 The standard approach to matrix factorization based collaborative filtering 
treats 
@@ -36,7 +40,7 @@ the entries in the user-item matrix as *explicit* preferences 
given by the user
 for example, users giving ratings to movies.
 
 It is common in many real-world use cases to only have access to *implicit 
feedback* (e.g. views,
-clicks, purchases, likes, shares etc.). The approach used in `spark.mllib` to 
deal with such data is taken
+clicks, purchases, likes, shares etc.). The approach used in `spark.ml` to 
deal with such data is taken
 from [Collaborative Filtering for Implicit Feedback 
Datasets](http://dx.doi.org/10.1109/ICDM.2008.22).
 Essentially, instead of trying to model the matrix of ratings directly, this 
approach treats the data
 as numbers representing the *strength* in observations of user actions (such 
as the number of clicks,





spark git commit: [SPARK-15502][DOC][ML][PYSPARK] add guide note that ALS only supports integer ids

2016-05-24 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master be99a99fe -> 20900e5fe


[SPARK-15502][DOC][ML][PYSPARK] add guide note that ALS only supports integer 
ids

This PR adds a note to clarify that the ML API for ALS only supports integers 
for user/item ids, and that other types for these columns can be used but the 
ids must fall within integer range.

(Refer [SPARK-14891](https://issues.apache.org/jira/browse/SPARK-14891)).

Also cleaned up a reference to `mllib` in the ML doc.
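
For users hitting this limitation, a minimal sketch of the workaround the note
implies: cast the id columns down to integer before fitting. The `ratings`
DataFrame and its column names below are hypothetical.

import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.sql.functions.col

// Assumes a hypothetical DataFrame ratings(userId: Long, movieId: Long, rating: Float)
// whose ids are known to fit within the integer range.
val ratingsInt = ratings
  .withColumn("userId", col("userId").cast("int"))
  .withColumn("movieId", col("movieId").cast("int"))

val als = new ALS()
  .setUserCol("userId")
  .setItemCol("movieId")
  .setRatingCol("rating")
val model = als.fit(ratingsInt)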

## How was this patch tested?
Built and viewed User Guide doc locally.

Author: Nick Pentreath 

Closes #13278 from MLnick/SPARK-15502-als-int-id-doc-note.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/20900e5f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/20900e5f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/20900e5f

Branch: refs/heads/master
Commit: 20900e5feced76e87f0a12823d0e3f07e082105f
Parents: be99a99
Author: Nick Pentreath 
Authored: Tue May 24 11:34:06 2016 -0700
Committer: Joseph K. Bradley 
Committed: Tue May 24 11:34:06 2016 -0700

--
 docs/ml-collaborative-filtering.md | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/20900e5f/docs/ml-collaborative-filtering.md
--
diff --git a/docs/ml-collaborative-filtering.md 
b/docs/ml-collaborative-filtering.md
index bd3d527..8bd75f3 100644
--- a/docs/ml-collaborative-filtering.md
+++ b/docs/ml-collaborative-filtering.md
@@ -29,6 +29,10 @@ following parameters:
   *baseline* confidence in preference observations (defaults to 1.0).
 * *nonnegative* specifies whether or not to use nonnegative constraints for 
least squares (defaults to `false`).
 
+**Note:** The DataFrame-based API for ALS currently only supports integers for 
user and item ids.
+Other numeric types are supported for the user and item id columns, 
+but the ids must be within the integer value range. 
+
 ### Explicit vs. implicit feedback
 
 The standard approach to matrix factorization based collaborative filtering 
treats 
@@ -36,7 +40,7 @@ the entries in the user-item matrix as *explicit* preferences 
given by the user
 for example, users giving ratings to movies.
 
 It is common in many real-world use cases to only have access to *implicit 
feedback* (e.g. views,
-clicks, purchases, likes, shares etc.). The approach used in `spark.mllib` to 
deal with such data is taken
+clicks, purchases, likes, shares etc.). The approach used in `spark.ml` to 
deal with such data is taken
 from [Collaborative Filtering for Implicit Feedback 
Datasets](http://dx.doi.org/10.1109/ICDM.2008.22).
 Essentially, instead of trying to model the matrix of ratings directly, this 
approach treats the data
 as numbers representing the *strength* in observations of user actions (such 
as the number of clicks,





spark git commit: [SPARK-15457][MLLIB][ML] Eliminate some warnings from MLlib about deprecations

2016-05-26 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 0f61d6efb -> b0a03feef


[SPARK-15457][MLLIB][ML] Eliminate some warnings from MLlib about deprecations

## What changes were proposed in this pull request?

Several classes and methods have been deprecated and are creating lots of build 
warnings in branch-2.0. This issue is to identify and fix those items:
* WithSGD classes: Change to make class not deprecated, object deprecated, and 
public class constructor deprecated. Any public use will require a deprecated 
API. We need to keep a non-deprecated private API since we cannot eliminate 
certain uses: Python API, streaming algs, and examples.
  * Use in PythonMLlibAPI: Change to using private constructors
  * Streaming algs: No warnings after we un-deprecate the classes
  * Examples: Deprecate or change ones which use deprecated APIs
* MulticlassMetrics fields (precision, etc.), illustrated in the sketch below
* LinearRegressionSummary.model field
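
To illustrate the MulticlassMetrics item above (a sketch, not taken from this
patch): the non-deprecated replacement for the overall precision field is
`accuracy`.

import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.rdd.RDD

// predictionAndLabels is a hypothetical RDD of (prediction, label) pairs.
def printAccuracy(predictionAndLabels: RDD[(Double, Double)]): Unit = {
  val metrics = new MulticlassMetrics(predictionAndLabels)
  println(s"Accuracy = ${metrics.accuracy}")  // instead of the deprecated metrics.precision
}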

## How was this patch tested?

Existing tests.  Checked for warnings manually.

Author: Sean Owen 
Author: Joseph K. Bradley 

Closes #13314 from jkbradley/warning-cleanups.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b0a03fee
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b0a03fee
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b0a03fee

Branch: refs/heads/master
Commit: b0a03feef2cf4daa7642ec7f4dc479dbd473b581
Parents: 0f61d6e
Author: Sean Owen 
Authored: Thu May 26 14:25:28 2016 -0700
Committer: Joseph K. Bradley 
Committed: Thu May 26 14:25:28 2016 -0700

--
 .../JavaLogisticRegressionWithLBFGSExample.java |  4 +-
 ...aMulticlassClassificationMetricsExample.java |  4 +-
 .../spark/examples/ml/DecisionTreeExample.scala |  2 +-
 .../examples/mllib/DecisionTreeRunner.scala | 10 ++--
 .../mllib/GradientBoostedTreesRunner.scala  |  5 +-
 .../spark/examples/mllib/LinearRegression.scala |  1 +
 .../mllib/LinearRegressionWithSGDExample.scala  |  1 +
 .../LogisticRegressionWithLBFGSExample.scala|  4 +-
 .../mllib/MulticlassMetricsExample.scala|  8 +--
 .../spark/examples/mllib/PCAExample.scala   |  1 +
 .../mllib/RegressionMetricsExample.scala|  2 +
 .../MulticlassClassificationEvaluator.scala |  4 +-
 .../spark/ml/regression/LinearRegression.scala  | 53 +++-
 .../spark/mllib/api/python/PythonMLLibAPI.scala |  8 +--
 .../classification/LogisticRegression.scala |  4 +-
 .../apache/spark/mllib/regression/Lasso.scala   |  6 +--
 .../mllib/regression/LinearRegression.scala |  2 +-
 .../mllib/regression/RidgeRegression.scala  |  6 +--
 18 files changed, 63 insertions(+), 62 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/b0a03fee/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java
 
b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java
index 9d8e4a9..7fc371e 100644
--- 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java
+++ 
b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java
@@ -65,8 +65,8 @@ public class JavaLogisticRegressionWithLBFGSExample {
 
 // Get evaluation metrics.
 MulticlassMetrics metrics = new 
MulticlassMetrics(predictionAndLabels.rdd());
-double precision = metrics.precision();
-System.out.println("Precision = " + precision);
+double accuracy = metrics.accuracy();
+System.out.println("Accuracy = " + accuracy);
 
 // Save and load model
 model.save(sc, "target/tmp/javaLogisticRegressionWithLBFGSModel");

http://git-wip-us.apache.org/repos/asf/spark/blob/b0a03fee/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java
 
b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java
index 5247c9c..e84a3a7 100644
--- 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java
+++ 
b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java
@@ -68,9 +68,7 @@ public class JavaMulticlassClassificationMetricsExample {
 System.out.println("Confusion matrix: \n" + confusion);
 
 // Overall statistics
-System.out.println("Precision = " + 

spark git commit: [SPARK-15457][MLLIB][ML] Eliminate some warnings from MLlib about deprecations

2016-05-26 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 6eea33ec3 -> 216e39505


[SPARK-15457][MLLIB][ML] Eliminate some warnings from MLlib about deprecations

## What changes were proposed in this pull request?

Several classes and methods have been deprecated and are creating lots of build 
warnings in branch-2.0. This issue is to identify and fix those items:
* WithSGD classes: Change to make class not deprecated, object deprecated, and 
public class constructor deprecated. Any public use will require a deprecated 
API. We need to keep a non-deprecated private API since we cannot eliminate 
certain uses: Python API, streaming algs, and examples.
  * Use in PythonMLlibAPI: Change to using private constructors
  * Streaming algs: No warnings after we un-deprecate the classes
  * Examples: Deprecate or change ones which use deprecated APIs
* MulticlassMetrics fields (precision, etc.)
* LinearRegressionSummary.model field

## How was this patch tested?

Existing tests.  Checked for warnings manually.

Author: Sean Owen 
Author: Joseph K. Bradley 

Closes #13314 from jkbradley/warning-cleanups.

(cherry picked from commit b0a03feef2cf4daa7642ec7f4dc479dbd473b581)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/216e3950
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/216e3950
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/216e3950

Branch: refs/heads/branch-2.0
Commit: 216e39505ef8861d12e31d5117fad90e57bed885
Parents: 6eea33e
Author: Sean Owen 
Authored: Thu May 26 14:25:28 2016 -0700
Committer: Joseph K. Bradley 
Committed: Thu May 26 14:25:39 2016 -0700

--
 .../JavaLogisticRegressionWithLBFGSExample.java |  4 +-
 ...aMulticlassClassificationMetricsExample.java |  4 +-
 .../spark/examples/ml/DecisionTreeExample.scala |  2 +-
 .../examples/mllib/DecisionTreeRunner.scala | 10 ++--
 .../mllib/GradientBoostedTreesRunner.scala  |  5 +-
 .../spark/examples/mllib/LinearRegression.scala |  1 +
 .../mllib/LinearRegressionWithSGDExample.scala  |  1 +
 .../LogisticRegressionWithLBFGSExample.scala|  4 +-
 .../mllib/MulticlassMetricsExample.scala|  8 +--
 .../spark/examples/mllib/PCAExample.scala   |  1 +
 .../mllib/RegressionMetricsExample.scala|  2 +
 .../MulticlassClassificationEvaluator.scala |  4 +-
 .../spark/ml/regression/LinearRegression.scala  | 53 +++-
 .../spark/mllib/api/python/PythonMLLibAPI.scala |  8 +--
 .../classification/LogisticRegression.scala |  4 +-
 .../apache/spark/mllib/regression/Lasso.scala   |  6 +--
 .../mllib/regression/LinearRegression.scala |  2 +-
 .../mllib/regression/RidgeRegression.scala  |  6 +--
 18 files changed, 63 insertions(+), 62 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/216e3950/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java
 
b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java
index 9d8e4a9..7fc371e 100644
--- 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java
+++ 
b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java
@@ -65,8 +65,8 @@ public class JavaLogisticRegressionWithLBFGSExample {
 
 // Get evaluation metrics.
 MulticlassMetrics metrics = new 
MulticlassMetrics(predictionAndLabels.rdd());
-double precision = metrics.precision();
-System.out.println("Precision = " + precision);
+double accuracy = metrics.accuracy();
+System.out.println("Accuracy = " + accuracy);
 
 // Save and load model
 model.save(sc, "target/tmp/javaLogisticRegressionWithLBFGSModel");

http://git-wip-us.apache.org/repos/asf/spark/blob/216e3950/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java
--
diff --git 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java
 
b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java
index 5247c9c..e84a3a7 100644
--- 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java
+++ 
b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java
@@ -68,9 +68,7 @@ public class JavaMulticlassClassificationMetricsExample {
 System.out.println("Confusion matrix: \n"

spark git commit: [SPARK-15186][ML][DOCS] Add user guide for generalized linear regression

2016-05-27 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master a96e4151a -> c96244f5a


[SPARK-15186][ML][DOCS] Add user guide for generalized linear regression

## What changes were proposed in this pull request?

This patch adds a user guide section for generalized linear regression and 
includes the examples from [#12754](https://github.com/apache/spark/pull/12754).

## How was this patch tested?

Documentation only, no tests required.

## Approach

In general, it is a bit unclear what level of detail ought to be included in 
the user guide since there is a lot of variability within the current user 
guide. I tried to give a fairly brief mathematical introduction to GLMs, and 
cover what types of problems they could be used for. Additionally, I included a 
brief blurb on the IRLS solver. The input/output columns are given in a table, 
as is done elsewhere in the docs (though, again, such tables appear rather 
intermittently in the current docs), along with a table of the supported 
families and their link functions.
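
For context, a minimal Scala sketch of the interface the new guide section
documents; the `training` DataFrame (with "label" and "features" columns) and
the parameter values are illustrative, not taken from this patch.

import org.apache.spark.ml.regression.GeneralizedLinearRegression

// Poisson regression with the canonical log link; `training` is hypothetical.
val glr = new GeneralizedLinearRegression()
  .setFamily("poisson")
  .setLink("log")
  .setMaxIter(10)
  .setRegParam(0.3)
val model = glr.fit(training)
println(s"Coefficients: ${model.coefficients}  Intercept: ${model.intercept}")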

Author: sethah 

Closes #13139 from sethah/SPARK-15186.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c96244f5
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c96244f5
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c96244f5

Branch: refs/heads/master
Commit: c96244f5acd8b335e34694c171bab32d92e6e0fb
Parents: a96e415
Author: sethah 
Authored: Fri May 27 12:55:48 2016 -0700
Committer: Joseph K. Bradley 
Committed: Fri May 27 12:55:48 2016 -0700

--
 docs/ml-classification-regression.md | 132 ++
 1 file changed, 132 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/c96244f5/docs/ml-classification-regression.md
--
diff --git a/docs/ml-classification-regression.md 
b/docs/ml-classification-regression.md
index f1a21f4..ff8dec6 100644
--- a/docs/ml-classification-regression.md
+++ b/docs/ml-classification-regression.md
@@ -374,6 +374,138 @@ regression model and extracting model summary statistics.
 
 
 
+## Generalized linear regression
+
+Contrasted with linear regression where the output is assumed to follow a 
Gaussian
+distribution, [generalized linear 
models](https://en.wikipedia.org/wiki/Generalized_linear_model) (GLMs) are 
specifications of linear models where the response variable $Y_i$ follows some
+distribution from the [exponential family of 
distributions](https://en.wikipedia.org/wiki/Exponential_family).
+Spark's `GeneralizedLinearRegression` interface
+allows for flexible specification of GLMs which can be used for various types 
of
+prediction problems including linear regression, Poisson regression, logistic 
regression, and others.
+Currently in `spark.ml`, only a subset of the exponential family distributions 
are supported and they are listed
+[below](#available-families).
+
+**NOTE**: Spark currently only supports up to 4096 features through its 
`GeneralizedLinearRegression`
+interface, and will throw an exception if this constraint is exceeded. See the 
[advanced section](ml-advanced) for more details.
+ Still, for linear and logistic regression, models with an increased number of 
features can be trained 
+ using the `LinearRegression` and `LogisticRegression` estimators.
+
+GLMs require exponential family distributions that can be written in their 
"canonical" or "natural" form, aka
+[natural exponential family 
distributions](https://en.wikipedia.org/wiki/Natural_exponential_family). The 
form of a natural exponential family distribution is given as:
+
+$$
+f_Y(y|\theta, \tau) = h(y, \tau)\exp{\left( \frac{\theta \cdot y - 
A(\theta)}{d(\tau)} \right)}
+$$
+
+where $\theta$ is the parameter of interest and $\tau$ is a dispersion 
parameter. In a GLM the response variable $Y_i$ is assumed to be drawn from a 
natural exponential family distribution:
+
+$$
+Y_i \sim f\left(\cdot|\theta_i, \tau \right)
+$$
+
+where the parameter of interest $\theta_i$ is related to the expected value of 
the response variable $\mu_i$ by
+
+$$
+\mu_i = A'(\theta_i)
+$$
+
+Here, $A'(\theta_i)$ is defined by the form of the distribution selected. GLMs 
also allow specification
+of a link function, which defines the relationship between the expected value 
of the response variable $\mu_i$
+and the so called _linear predictor_ $\eta_i$:
+
+$$
+g(\mu_i) = \eta_i = \vec{x_i}^T \cdot \vec{\beta}
+$$
+
+Often, the link function is chosen such that $A' = g^{-1}$, which yields a 
simplified relationship
+between the parameter of interest $\theta$ and the linear predictor $\eta$. In 
this case, the link
+function $g(\mu)$ is said to be the "canonical" link function.
+
+$$
+\theta_i = A'^{-1}(\mu_i) = g(g^{-1}(\eta_i)) = \eta_i
+$$
+
+A GLM finds the regression coefficients
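
For a quick sense of the estimator being documented, here is a minimal PySpark 
sketch; it assumes the sample LIBSVM file 
`data/mllib/sample_linear_regression_data.txt` from the Spark source tree, the 
parameter values are illustrative only, and the snippet is not part of the 
committed guide:

```python
from pyspark.sql import SparkSession
from pyspark.ml.regression import GeneralizedLinearRegression

spark = SparkSession.builder.appName("GLRExample").getOrCreate()

# Any DataFrame with "label" and "features" columns works; the LIBSVM file
# below is assumed to be the sample shipped in the Spark source tree.
training = spark.read.format("libsvm") \
    .load("data/mllib/sample_linear_regression_data.txt")

# Gaussian family with the identity link is ordinary linear regression;
# e.g. family="poisson", link="log" would give Poisson regression instead.
glr = GeneralizedLinearRegression(family="gaussian", link="identity",
                                  maxIter=10, regParam=0.3)
model = glr.fit(training)

print("Coefficients:", model.coefficients)
print("Intercept:", model.intercept)

spark.stop()
```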

spark git commit: [SPARK-15186][ML][DOCS] Add user guide for generalized linear regression

2016-05-27 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 d76e066d3 -> 5dd1423f4


[SPARK-15186][ML][DOCS] Add user guide for generalized linear regression

## What changes were proposed in this pull request?

This patch adds a user guide section for generalized linear regression and 
includes the examples from [#12754](https://github.com/apache/spark/pull/12754).

## How was this patch tested?

Documentation only, no tests required.

## Approach

In general, it is a bit unclear what level of detail ought to be included in 
the user guide since there is a lot of variability within the current user 
guide. I tried to give a fairly brief mathematical introduction to GLMs and to 
cover what types of problems they can be used for. Additionally, I included a 
brief blurb on the IRLS solver. The input/output columns are given in a table, 
as is done elsewhere in the docs (though, again, such tables appear rather 
intermittently in the current docs), along with a table listing the supported 
families and their link functions.

Author: sethah 

Closes #13139 from sethah/SPARK-15186.

(cherry picked from commit c96244f5acd8b335e34694c171bab32d92e6e0fb)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5dd1423f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5dd1423f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5dd1423f

Branch: refs/heads/branch-2.0
Commit: 5dd1423f462f03b7ae625a93cdaf9d882969afb6
Parents: d76e066
Author: sethah 
Authored: Fri May 27 12:55:48 2016 -0700
Committer: Joseph K. Bradley 
Committed: Fri May 27 12:56:00 2016 -0700

--
 docs/ml-classification-regression.md | 132 ++
 1 file changed, 132 insertions(+)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/5dd1423f/docs/ml-classification-regression.md
--
diff --git a/docs/ml-classification-regression.md 
b/docs/ml-classification-regression.md
index f1a21f4..ff8dec6 100644
--- a/docs/ml-classification-regression.md
+++ b/docs/ml-classification-regression.md
@@ -374,6 +374,138 @@ regression model and extracting model summary statistics.
 
 
 
+## Generalized linear regression
+
+Contrasted with linear regression where the output is assumed to follow a 
Gaussian
+distribution, [generalized linear 
models](https://en.wikipedia.org/wiki/Generalized_linear_model) (GLMs) are 
specifications of linear models where the response variable $Y_i$ follows some
+distribution from the [exponential family of 
distributions](https://en.wikipedia.org/wiki/Exponential_family).
+Spark's `GeneralizedLinearRegression` interface
+allows for flexible specification of GLMs which can be used for various types 
of
+prediction problems including linear regression, Poisson regression, logistic 
regression, and others.
+Currently in `spark.ml`, only a subset of the exponential family distributions 
are supported and they are listed
+[below](#available-families).
+
+**NOTE**: Spark currently only supports up to 4096 features through its 
`GeneralizedLinearRegression`
+interface, and will throw an exception if this constraint is exceeded. See the 
[advanced section](ml-advanced) for more details.
+ Still, for linear and logistic regression, models with an increased number of 
features can be trained 
+ using the `LinearRegression` and `LogisticRegression` estimators.
+
+GLMs require exponential family distributions that can be written in their 
"canonical" or "natural" form, aka
+[natural exponential family 
distributions](https://en.wikipedia.org/wiki/Natural_exponential_family). The 
form of a natural exponential family distribution is given as:
+
+$$
+f_Y(y|\theta, \tau) = h(y, \tau)\exp{\left( \frac{\theta \cdot y - 
A(\theta)}{d(\tau)} \right)}
+$$
+
+where $\theta$ is the parameter of interest and $\tau$ is a dispersion 
parameter. In a GLM the response variable $Y_i$ is assumed to be drawn from a 
natural exponential family distribution:
+
+$$
+Y_i \sim f\left(\cdot|\theta_i, \tau \right)
+$$
+
+where the parameter of interest $\theta_i$ is related to the expected value of 
the response variable $\mu_i$ by
+
+$$
+\mu_i = A'(\theta_i)
+$$
+
+Here, $A'(\theta_i)$ is defined by the form of the distribution selected. GLMs 
also allow specification
+of a link function, which defines the relationship between the expected value 
of the response variable $\mu_i$
+and the so called _linear predictor_ $\eta_i$:
+
+$$
+g(\mu_i) = \eta_i = \vec{x_i}^T \cdot \vec{\beta}
+$$
+
+Often, the link function is chosen such that $A' = g^{-1}$, which yields a 
simplified relationship
+between the parameter of interest $\theta$ and the linear predictor $\eta$. In 
this case, the link
+function $g(\mu)$ is said to be the "canonical" link functi
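
To make the canonical-link relationship above concrete, consider the Poisson 
family (a worked example, not part of the committed guide). With unit 
dispersion the density can be written in natural exponential form as

$$
f_Y(y|\theta) = \frac{1}{y!}\exp{\left( \theta \cdot y - e^{\theta} \right)},
\qquad A(\theta) = e^{\theta},
$$

so that $\mu = A'(\theta) = e^{\theta}$ and hence $\theta = \log\mu$. The 
canonical link is therefore $g(\mu) = \log\mu$, and the model becomes 
$\log\mu_i = \vec{x_i}^T \cdot \vec{\beta}$, i.e. Poisson regression with a 
log link.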

spark git commit: [SPARK-11959][SPARK-15484][DOC][ML] Document WLS and IRLS

2016-05-27 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master c96244f5a -> a3550e374


[SPARK-11959][SPARK-15484][DOC][ML] Document WLS and IRLS

## What changes were proposed in this pull request?
* Document ```WeightedLeastSquares``` (normal equation) and 
```IterativelyReweightedLeastSquares```.
* Copy the ```L-BFGS``` documentation from ```spark.mllib``` to ```spark.ml```.

Since the ```Optimization of linear methods``` section is aimed at developers, 
I think we should provide a brief introduction to each optimization method, the 
necessary references, and how it is implemented in Spark. It isn't necessary to 
paste every mathematical formula and derivation here; developers/users who want 
to learn more can follow the references.

## How was this patch tested?
Document update, no tests.

Author: Yanbo Liang 

Closes #13262 from yanboliang/spark-15484.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a3550e37
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a3550e37
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a3550e37

Branch: refs/heads/master
Commit: a3550e3747e21c79a5110132dc127ee83879062a
Parents: c96244f
Author: Yanbo Liang 
Authored: Fri May 27 13:16:22 2016 -0700
Committer: Joseph K. Bradley 
Committed: Fri May 27 13:16:22 2016 -0700

--
 docs/ml-advanced.md | 85 ++--
 .../IterativelyReweightedLeastSquares.scala |  2 +-
 2 files changed, 81 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a3550e37/docs/ml-advanced.md
--
diff --git a/docs/ml-advanced.md b/docs/ml-advanced.md
index 91731d7..1c5f844 100644
--- a/docs/ml-advanced.md
+++ b/docs/ml-advanced.md
@@ -4,10 +4,85 @@ title: Advanced topics - spark.ml
 displayTitle: Advanced topics - spark.ml
 ---
 
-# Optimization of linear methods
+* Table of contents
+{:toc}
+
+`\[
+\newcommand{\R}{\mathbb{R}}
+\newcommand{\E}{\mathbb{E}} 
+\newcommand{\x}{\mathbf{x}}
+\newcommand{\y}{\mathbf{y}}
+\newcommand{\wv}{\mathbf{w}}
+\newcommand{\av}{\mathbf{\alpha}}
+\newcommand{\bv}{\mathbf{b}}
+\newcommand{\N}{\mathbb{N}}
+\newcommand{\id}{\mathbf{I}} 
+\newcommand{\ind}{\mathbf{1}} 
+\newcommand{\0}{\mathbf{0}} 
+\newcommand{\unit}{\mathbf{e}} 
+\newcommand{\one}{\mathbf{1}} 
+\newcommand{\zero}{\mathbf{0}}
+\]`
+
+# Optimization of linear methods (developer)
+
+## Limited-memory BFGS (L-BFGS)
+[L-BFGS](http://en.wikipedia.org/wiki/Limited-memory_BFGS) is an optimization 
+algorithm in the family of quasi-Newton methods to solve the optimization 
problems of the form 
+`$\min_{\wv \in\R^d} \; f(\wv)$`. The L-BFGS method approximates the objective 
function locally as a 
+quadratic without evaluating the second partial derivatives of the objective 
function to construct the 
+Hessian matrix. The Hessian matrix is approximated by previous gradient 
evaluations, so there is no 
+vertical scalability issue (the number of training features) unlike computing 
the Hessian matrix 
+explicitly in Newton's method. As a result, L-BFGS often achieves faster 
convergence compared with 
+other first-order optimizations.
 
-The optimization algorithm underlying the implementation is called
 [Orthant-Wise Limited-memory
-QuasiNewton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf)
-(OWL-QN). It is an extension of L-BFGS that can effectively handle L1
-regularization and elastic net.
+Quasi-Newton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf)
+(OWL-QN) is an extension of L-BFGS that can effectively handle L1 and elastic 
net regularization.
+
+L-BFGS is used as a solver for 
[LinearRegression](api/scala/index.html#org.apache.spark.ml.regression.LinearRegression),
+[LogisticRegression](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegression),
+[AFTSurvivalRegression](api/scala/index.html#org.apache.spark.ml.regression.AFTSurvivalRegression)
+and 
[MultilayerPerceptronClassifier](api/scala/index.html#org.apache.spark.ml.classification.MultilayerPerceptronClassifier).
+
+MLlib L-BFGS solver calls the corresponding implementation in 
[breeze](https://github.com/scalanlp/breeze/blob/master/math/src/main/scala/breeze/optimize/LBFGS.scala).
+
+## Normal equation solver for weighted least squares
+
+MLlib implements normal equation solver for [weighted least 
squares](https://en.wikipedia.org/wiki/Least_squares#Weighted_least_squares) by 
[WeightedLeastSquares](https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala).
+
+Given $n$ weighted observations $(w_i, a_i, b_i)$:
+
+* $w_i$ the weight of i-th observation
+* $a_i$ the features vector of i-th observation
+* $b_i$ the label of i-th
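
To connect the two solvers described above to the user-facing API, here is a 
small sketch of selecting them on `LinearRegression`, assuming a DataFrame 
`training` with "label" and "features" columns; the parameter values are 
illustrative and the snippet is not part of the committed doc:

```python
from pyspark.ml.regression import LinearRegression

# "normal" routes the fit through the weighted-least-squares normal-equation
# solver (L2 regularization only); "l-bfgs" uses the quasi-Newton solver
# described above, with OWL-QN handling the L1 part when elastic net is used.
normal_lr = LinearRegression(solver="normal", regParam=0.1)
lbfgs_lr = LinearRegression(solver="l-bfgs", maxIter=100,
                            regParam=0.1, elasticNetParam=0.5)

normal_model = normal_lr.fit(training)   # `training`: assumed DataFrame
lbfgs_model = lbfgs_lr.fit(training)
```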

spark git commit: [SPARK-11959][SPARK-15484][DOC][ML] Document WLS and IRLS

2016-05-27 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 5dd1423f4 -> e6e2f293d


[SPARK-11959][SPARK-15484][DOC][ML] Document WLS and IRLS

## What changes were proposed in this pull request?
* Document ```WeightedLeastSquares``` (normal equation) and 
```IterativelyReweightedLeastSquares```.
* Copy the ```L-BFGS``` documentation from ```spark.mllib``` to ```spark.ml```.

Since the ```Optimization of linear methods``` section is aimed at developers, 
I think we should provide a brief introduction to each optimization method, the 
necessary references, and how it is implemented in Spark. It isn't necessary to 
paste every mathematical formula and derivation here; developers/users who want 
to learn more can follow the references.

## How was this patch tested?
Document update, no tests.

Author: Yanbo Liang 

Closes #13262 from yanboliang/spark-15484.

(cherry picked from commit a3550e3747e21c79a5110132dc127ee83879062a)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e6e2f293
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e6e2f293
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e6e2f293

Branch: refs/heads/branch-2.0
Commit: e6e2f293d6830ce118050e789773a09b3888fd30
Parents: 5dd1423
Author: Yanbo Liang 
Authored: Fri May 27 13:16:22 2016 -0700
Committer: Joseph K. Bradley 
Committed: Fri May 27 13:16:37 2016 -0700

--
 docs/ml-advanced.md | 85 ++--
 .../IterativelyReweightedLeastSquares.scala |  2 +-
 2 files changed, 81 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e6e2f293/docs/ml-advanced.md
--
diff --git a/docs/ml-advanced.md b/docs/ml-advanced.md
index 91731d7..1c5f844 100644
--- a/docs/ml-advanced.md
+++ b/docs/ml-advanced.md
@@ -4,10 +4,85 @@ title: Advanced topics - spark.ml
 displayTitle: Advanced topics - spark.ml
 ---
 
-# Optimization of linear methods
+* Table of contents
+{:toc}
+
+`\[
+\newcommand{\R}{\mathbb{R}}
+\newcommand{\E}{\mathbb{E}} 
+\newcommand{\x}{\mathbf{x}}
+\newcommand{\y}{\mathbf{y}}
+\newcommand{\wv}{\mathbf{w}}
+\newcommand{\av}{\mathbf{\alpha}}
+\newcommand{\bv}{\mathbf{b}}
+\newcommand{\N}{\mathbb{N}}
+\newcommand{\id}{\mathbf{I}} 
+\newcommand{\ind}{\mathbf{1}} 
+\newcommand{\0}{\mathbf{0}} 
+\newcommand{\unit}{\mathbf{e}} 
+\newcommand{\one}{\mathbf{1}} 
+\newcommand{\zero}{\mathbf{0}}
+\]`
+
+# Optimization of linear methods (developer)
+
+## Limited-memory BFGS (L-BFGS)
+[L-BFGS](http://en.wikipedia.org/wiki/Limited-memory_BFGS) is an optimization 
+algorithm in the family of quasi-Newton methods to solve the optimization 
problems of the form 
+`$\min_{\wv \in\R^d} \; f(\wv)$`. The L-BFGS method approximates the objective 
function locally as a 
+quadratic without evaluating the second partial derivatives of the objective 
function to construct the 
+Hessian matrix. The Hessian matrix is approximated by previous gradient 
evaluations, so there is no 
+vertical scalability issue (the number of training features) unlike computing 
the Hessian matrix 
+explicitly in Newton's method. As a result, L-BFGS often achieves faster 
convergence compared with 
+other first-order optimizations.
 
-The optimization algorithm underlying the implementation is called
 [Orthant-Wise Limited-memory
-QuasiNewton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf)
-(OWL-QN). It is an extension of L-BFGS that can effectively handle L1
-regularization and elastic net.
+Quasi-Newton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf)
+(OWL-QN) is an extension of L-BFGS that can effectively handle L1 and elastic 
net regularization.
+
+L-BFGS is used as a solver for 
[LinearRegression](api/scala/index.html#org.apache.spark.ml.regression.LinearRegression),
+[LogisticRegression](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegression),
+[AFTSurvivalRegression](api/scala/index.html#org.apache.spark.ml.regression.AFTSurvivalRegression)
+and 
[MultilayerPerceptronClassifier](api/scala/index.html#org.apache.spark.ml.classification.MultilayerPerceptronClassifier).
+
+MLlib L-BFGS solver calls the corresponding implementation in 
[breeze](https://github.com/scalanlp/breeze/blob/master/math/src/main/scala/breeze/optimize/LBFGS.scala).
+
+## Normal equation solver for weighted least squares
+
+MLlib implements normal equation solver for [weighted least 
squares](https://en.wikipedia.org/wiki/Least_squares#Weighted_least_squares) by 
[WeightedLeastSquares](https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala).
+
+Given $n$ weighted observations $(w_i, a_i, b_i)$:
+
+* 
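
For reference, the unregularized problem defined by such weighted observations, 
together with the normal equations that a normal-equation solver builds on, can 
be written as follows (a standard formulation, not quoted from the committed 
doc):

$$
\min_{\vec{\beta}} \sum_{i=1}^{n} w_i \left( \vec{a_i}^T \vec{\beta} - b_i \right)^2
\quad\Longleftrightarrow\quad
\left( A^T W A \right) \vec{\beta} = A^T W \vec{b},
$$

where $A$ stacks the feature vectors $\vec{a_i}$ as rows, 
$W = \mathrm{diag}(w_1, \dots, w_n)$, and $\vec{b}$ is the label vector; the 
actual implementation additionally handles regularization and standardization, 
which are omitted here.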

spark git commit: [SPARK-15008][ML][PYSPARK] Add integration test for OneVsRest

2016-05-27 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master a3550e374 -> 130b8d07b


[SPARK-15008][ML][PYSPARK] Add integration test for OneVsRest

## What changes were proposed in this pull request?

1. Add `_transfer_param_map_to/from_java` for OneVsRest;

2. Add `_compare_params` in ml/tests.py to help compare params.

3. Add `test_onevsrest` as the integration test for OneVsRest.

## How was this patch tested?

Python unit test.

Author: yinxusen 

Closes #12875 from yinxusen/SPARK-15008.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/130b8d07
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/130b8d07
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/130b8d07

Branch: refs/heads/master
Commit: 130b8d07b8eb08f2ad522081a95032b90247094d
Parents: a3550e3
Author: yinxusen 
Authored: Fri May 27 13:18:29 2016 -0700
Committer: Joseph K. Bradley 
Committed: Fri May 27 13:18:29 2016 -0700

--
 python/pyspark/ml/tests.py | 69 +++--
 1 file changed, 46 insertions(+), 23 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/130b8d07/python/pyspark/ml/tests.py
--
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index a7c93ac..4358175 100755
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -747,12 +747,32 @@ class PersistenceTest(SparkSessionTestCase):
 except OSError:
 pass
 
+def _compare_params(self, m1, m2, param):
+"""
+Compare 2 ML Params instances for the given param, and assert both 
have the same param value
+and parent. The param must be a parameter of m1.
+"""
+# Prevent key not found error in case of some param in neither 
paramMap nor defaultParamMap.
+if m1.isDefined(param):
+paramValue1 = m1.getOrDefault(param)
+paramValue2 = m2.getOrDefault(m2.getParam(param.name))
+if isinstance(paramValue1, Params):
+self._compare_pipelines(paramValue1, paramValue2)
+else:
+self.assertEqual(paramValue1, paramValue2)  # for general 
types param
+# Assert parents are equal
+self.assertEqual(param.parent, m2.getParam(param.name).parent)
+else:
+# If m1 is not defined param, then m2 should not, too. See 
SPARK-14931.
+self.assertFalse(m2.isDefined(m2.getParam(param.name)))
+
 def _compare_pipelines(self, m1, m2):
 """
 Compare 2 ML types, asserting that they are equivalent.
 This currently supports:
  - basic types
  - Pipeline, PipelineModel
+ - OneVsRest, OneVsRestModel
 This checks:
  - uid
  - type
@@ -763,8 +783,7 @@ class PersistenceTest(SparkSessionTestCase):
 if isinstance(m1, JavaParams):
 self.assertEqual(len(m1.params), len(m2.params))
 for p in m1.params:
-self.assertEqual(m1.getOrDefault(p), m2.getOrDefault(p))
-self.assertEqual(p.parent, m2.getParam(p.name).parent)
+self._compare_params(m1, m2, p)
 elif isinstance(m1, Pipeline):
 self.assertEqual(len(m1.getStages()), len(m2.getStages()))
 for s1, s2 in zip(m1.getStages(), m2.getStages()):
@@ -773,6 +792,13 @@ class PersistenceTest(SparkSessionTestCase):
 self.assertEqual(len(m1.stages), len(m2.stages))
 for s1, s2 in zip(m1.stages, m2.stages):
 self._compare_pipelines(s1, s2)
+elif isinstance(m1, OneVsRest) or isinstance(m1, OneVsRestModel):
+for p in m1.params:
+self._compare_params(m1, m2, p)
+if isinstance(m1, OneVsRestModel):
+self.assertEqual(len(m1.models), len(m2.models))
+for x, y in zip(m1.models, m2.models):
+self._compare_pipelines(x, y)
 else:
 raise RuntimeError("_compare_pipelines does not yet support type: 
%s" % type(m1))
 
@@ -833,6 +859,24 @@ class PersistenceTest(SparkSessionTestCase):
 except OSError:
 pass
 
+def test_onevsrest(self):
+temp_path = tempfile.mkdtemp()
+df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
+ (1.0, Vectors.sparse(2, [], [])),
+ (2.0, Vectors.dense(0.5, 0.5))] * 10,
+["label", "features"])
+lr = LogisticRegression(maxIter=5, regParam=0.01)
+ovr = OneVsRest(classifier=lr)
+model = ovr.fit(df)
+ovrPath = temp_path + "/ovr"
+ovr.save(ovrPath)
+loadedOvr = OneVsRest.load(ovr
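
For context on the estimator these tests exercise, here is a minimal, 
self-contained usage sketch modeled on the toy data in `test_onevsrest` above; 
the data and parameter values are illustrative only:

```python
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.appName("OneVsRestSketch").getOrCreate()

# Tiny three-class toy dataset, mirroring the one used in test_onevsrest.
df = spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                            (1.0, Vectors.sparse(2, [], [])),
                            (2.0, Vectors.dense(0.5, 0.5))] * 10,
                           ["label", "features"])

# One binary LogisticRegression model is trained per class.
lr = LogisticRegression(maxIter=5, regParam=0.01)
ovr = OneVsRest(classifier=lr)
model = ovr.fit(df)

model.transform(df).select("label", "prediction").show(5)

spark.stop()
```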

spark git commit: [SPARK-15008][ML][PYSPARK] Add integration test for OneVsRest

2016-05-27 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 e6e2f293d -> a778d3c90


[SPARK-15008][ML][PYSPARK] Add integration test for OneVsRest

## What changes were proposed in this pull request?

1. Add `_transfer_param_map_to/from_java` for OneVsRest;

2. Add `_compare_params` in ml/tests.py to help compare params.

3. Add `test_onevsrest` as the integration test for OneVsRest.

## How was this patch tested?

Python unit test.

Author: yinxusen 

Closes #12875 from yinxusen/SPARK-15008.

(cherry picked from commit 130b8d07b8eb08f2ad522081a95032b90247094d)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a778d3c9
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a778d3c9
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a778d3c9

Branch: refs/heads/branch-2.0
Commit: a778d3c90599eb76e6bca87b7aa3c0f9910f24c5
Parents: e6e2f29
Author: yinxusen 
Authored: Fri May 27 13:18:29 2016 -0700
Committer: Joseph K. Bradley 
Committed: Fri May 27 13:18:36 2016 -0700

--
 python/pyspark/ml/tests.py | 69 +++--
 1 file changed, 46 insertions(+), 23 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/a778d3c9/python/pyspark/ml/tests.py
--
diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
index a7c93ac..4358175 100755
--- a/python/pyspark/ml/tests.py
+++ b/python/pyspark/ml/tests.py
@@ -747,12 +747,32 @@ class PersistenceTest(SparkSessionTestCase):
 except OSError:
 pass
 
+def _compare_params(self, m1, m2, param):
+"""
+Compare 2 ML Params instances for the given param, and assert both 
have the same param value
+and parent. The param must be a parameter of m1.
+"""
+# Prevent key not found error in case of some param in neither 
paramMap nor defaultParamMap.
+if m1.isDefined(param):
+paramValue1 = m1.getOrDefault(param)
+paramValue2 = m2.getOrDefault(m2.getParam(param.name))
+if isinstance(paramValue1, Params):
+self._compare_pipelines(paramValue1, paramValue2)
+else:
+self.assertEqual(paramValue1, paramValue2)  # for general 
types param
+# Assert parents are equal
+self.assertEqual(param.parent, m2.getParam(param.name).parent)
+else:
+# If m1 is not defined param, then m2 should not, too. See 
SPARK-14931.
+self.assertFalse(m2.isDefined(m2.getParam(param.name)))
+
 def _compare_pipelines(self, m1, m2):
 """
 Compare 2 ML types, asserting that they are equivalent.
 This currently supports:
  - basic types
  - Pipeline, PipelineModel
+ - OneVsRest, OneVsRestModel
 This checks:
  - uid
  - type
@@ -763,8 +783,7 @@ class PersistenceTest(SparkSessionTestCase):
 if isinstance(m1, JavaParams):
 self.assertEqual(len(m1.params), len(m2.params))
 for p in m1.params:
-self.assertEqual(m1.getOrDefault(p), m2.getOrDefault(p))
-self.assertEqual(p.parent, m2.getParam(p.name).parent)
+self._compare_params(m1, m2, p)
 elif isinstance(m1, Pipeline):
 self.assertEqual(len(m1.getStages()), len(m2.getStages()))
 for s1, s2 in zip(m1.getStages(), m2.getStages()):
@@ -773,6 +792,13 @@ class PersistenceTest(SparkSessionTestCase):
 self.assertEqual(len(m1.stages), len(m2.stages))
 for s1, s2 in zip(m1.stages, m2.stages):
 self._compare_pipelines(s1, s2)
+elif isinstance(m1, OneVsRest) or isinstance(m1, OneVsRestModel):
+for p in m1.params:
+self._compare_params(m1, m2, p)
+if isinstance(m1, OneVsRestModel):
+self.assertEqual(len(m1.models), len(m2.models))
+for x, y in zip(m1.models, m2.models):
+self._compare_pipelines(x, y)
 else:
 raise RuntimeError("_compare_pipelines does not yet support type: 
%s" % type(m1))
 
@@ -833,6 +859,24 @@ class PersistenceTest(SparkSessionTestCase):
 except OSError:
 pass
 
+def test_onevsrest(self):
+temp_path = tempfile.mkdtemp()
+df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
+ (1.0, Vectors.sparse(2, [], [])),
+ (2.0, Vectors.dense(0.5, 0.5))] * 10,
+["label", "features"])
+lr = LogisticRegression(maxIter=5, regParam=0.01)
+ovr = OneVsRest(classifier=lr)
+model = ovr.

spark git commit: [SPARK-15413][ML][MLLIB] Change `toBreeze` to `asBreeze` in Vector and Matrix

2016-05-27 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/master 130b8d07b -> 21b2605dc


[SPARK-15413][ML][MLLIB] Change `toBreeze` to `asBreeze` in Vector and Matrix

## What changes were proposed in this pull request?

We're now using `asML` to convert the mllib vector/matrix to the ml 
vector/matrix. Using `as` is more accurate given that this conversion actually 
shares the same underlying data structure. Accordingly, in this PR, `toBreeze` 
is renamed to `asBreeze`. Since this is a private API, the change will not 
affect any user's application.
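
As a rough analogy for the `as` vs. `to` naming (NumPy rather than Spark, 
purely illustrative): `asarray` returns the same underlying buffer while 
`array` copies, which matches the intuition that an `as` conversion shares the 
underlying data:

```python
import numpy as np

x = np.arange(5.0)

# "as"-style conversion: returns the same underlying buffer, no copy.
view = np.asarray(x)
assert view is x

# "to"-style conversion: materializes an independent copy.
copy = np.array(x)        # copy=True by default
copy[0] = 42.0
assert x[0] == 0.0        # the original is untouched
```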

## How was this patch tested?

unit tests

Author: DB Tsai 

Closes #13198 from dbtsai/minor.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/21b2605d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/21b2605d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/21b2605d

Branch: refs/heads/master
Commit: 21b2605dc4900894ea7a911e039781ecc2a18c14
Parents: 130b8d0
Author: DB Tsai 
Authored: Fri May 27 14:02:39 2016 -0700
Committer: Joseph K. Bradley 
Committed: Fri May 27 14:02:39 2016 -0700

--
 .../org/apache/spark/ml/linalg/Matrices.scala   | 16 ++--
 .../org/apache/spark/ml/linalg/Vectors.scala|  8 +++---
 .../distribution/MultivariateGaussian.scala |  8 +++---
 .../ml/linalg/BreezeMatrixConversionSuite.scala |  4 +--
 .../ml/linalg/BreezeVectorConversionSuite.scala |  4 +--
 .../apache/spark/ml/linalg/MatricesSuite.scala  | 14 +--
 .../apache/spark/ml/linalg/VectorsSuite.scala   |  2 +-
 .../scala/org/apache/spark/ml/ann/Layer.scala   |  8 +++---
 .../ml/classification/LogisticRegression.scala  |  2 +-
 .../spark/ml/clustering/GaussianMixture.scala   |  2 +-
 .../apache/spark/ml/feature/MaxAbsScaler.scala  |  2 +-
 .../apache/spark/ml/feature/MinMaxScaler.scala  |  2 +-
 .../ml/regression/AFTSurvivalRegression.scala   |  2 +-
 .../spark/ml/regression/LinearRegression.scala  |  2 +-
 .../apache/spark/mllib/classification/SVM.scala |  2 +-
 .../mllib/clustering/GaussianMixture.scala  |  2 +-
 .../mllib/clustering/GaussianMixtureModel.scala |  4 +--
 .../spark/mllib/clustering/LDAModel.scala   | 26 ++--
 .../spark/mllib/clustering/LDAOptimizer.scala   |  6 ++---
 .../mllib/clustering/StreamingKMeans.scala  |  4 +--
 .../apache/spark/mllib/linalg/Matrices.scala| 16 ++--
 .../org/apache/spark/mllib/linalg/Vectors.scala |  8 +++---
 .../mllib/linalg/distributed/BlockMatrix.scala  |  8 +++---
 .../mllib/linalg/distributed/RowMatrix.scala| 16 ++--
 .../mllib/optimization/GradientDescent.scala|  4 +--
 .../apache/spark/mllib/optimization/LBFGS.scala |  4 +--
 .../spark/mllib/optimization/Updater.scala  | 14 +--
 .../apache/spark/mllib/regression/Lasso.scala   |  2 +-
 .../mllib/regression/LinearRegression.scala |  2 +-
 .../mllib/regression/RidgeRegression.scala  |  2 +-
 .../stat/correlation/PearsonCorrelation.scala   |  2 +-
 .../distribution/MultivariateGaussian.scala |  8 +++---
 .../spark/mllib/stat/test/ChiSqTest.scala   |  2 +-
 .../ml/classification/NaiveBayesSuite.scala |  6 ++---
 .../LogisticRegressionSuite.scala   |  4 +--
 .../mllib/classification/NaiveBayesSuite.scala  |  4 +--
 .../spark/mllib/clustering/LDASuite.scala   |  4 +--
 .../mllib/clustering/StreamingKMeansSuite.scala |  2 +-
 .../spark/mllib/feature/NormalizerSuite.scala   | 16 ++--
 .../linalg/BreezeMatrixConversionSuite.scala|  4 +--
 .../linalg/BreezeVectorConversionSuite.scala|  4 +--
 .../spark/mllib/linalg/MatricesSuite.scala  | 14 +--
 .../spark/mllib/linalg/VectorsSuite.scala   |  2 +-
 .../linalg/distributed/BlockMatrixSuite.scala   |  2 +-
 .../distributed/IndexedRowMatrixSuite.scala | 10 
 .../linalg/distributed/RowMatrixSuite.scala | 14 +--
 .../spark/mllib/stat/CorrelationSuite.scala |  6 ++---
 .../apache/spark/mllib/util/MLUtilsSuite.scala  |  6 ++---
 project/MimaExcludes.scala  |  3 +++
 49 files changed, 156 insertions(+), 153 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/21b2605d/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala
--
diff --git 
a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala 
b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala
index a47526d..0ea687b 100644
--- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala
+++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala
@@ -69,7 +69,7 @@ sealed trait Matrix extends Serializable {
   def rowIter: Iterator[Vector] = this.transpose.colIter
 
   /** Converts to a breeze matrix. */
-  private[ml] def toBreeze: BM[Double]

spark git commit: [SPARK-15413][ML][MLLIB] Change `toBreeze` to `asBreeze` in Vector and Matrix

2016-05-27 Thread jkbradley
Repository: spark
Updated Branches:
  refs/heads/branch-2.0 a778d3c90 -> dcf498e8a


[SPARK-15413][ML][MLLIB] Change `toBreeze` to `asBreeze` in Vector and Matrix

## What changes were proposed in this pull request?

We're now using `asML` to convert the mllib vector/matrix to the ml 
vector/matrix. Using `as` is more accurate given that this conversion actually 
shares the same underlying data structure. Accordingly, in this PR, `toBreeze` 
is renamed to `asBreeze`. Since this is a private API, the change will not 
affect any user's application.

## How was this patch tested?

unit tests

Author: DB Tsai 

Closes #13198 from dbtsai/minor.

(cherry picked from commit 21b2605dc4900894ea7a911e039781ecc2a18c14)
Signed-off-by: Joseph K. Bradley 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dcf498e8
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dcf498e8
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dcf498e8

Branch: refs/heads/branch-2.0
Commit: dcf498e8aafd2b53c5680cf7f3ada31829686b62
Parents: a778d3c
Author: DB Tsai 
Authored: Fri May 27 14:02:39 2016 -0700
Committer: Joseph K. Bradley 
Committed: Fri May 27 14:02:51 2016 -0700

--
 .../org/apache/spark/ml/linalg/Matrices.scala   | 16 ++--
 .../org/apache/spark/ml/linalg/Vectors.scala|  8 +++---
 .../distribution/MultivariateGaussian.scala |  8 +++---
 .../ml/linalg/BreezeMatrixConversionSuite.scala |  4 +--
 .../ml/linalg/BreezeVectorConversionSuite.scala |  4 +--
 .../apache/spark/ml/linalg/MatricesSuite.scala  | 14 +--
 .../apache/spark/ml/linalg/VectorsSuite.scala   |  2 +-
 .../scala/org/apache/spark/ml/ann/Layer.scala   |  8 +++---
 .../ml/classification/LogisticRegression.scala  |  2 +-
 .../spark/ml/clustering/GaussianMixture.scala   |  2 +-
 .../apache/spark/ml/feature/MaxAbsScaler.scala  |  2 +-
 .../apache/spark/ml/feature/MinMaxScaler.scala  |  2 +-
 .../ml/regression/AFTSurvivalRegression.scala   |  2 +-
 .../spark/ml/regression/LinearRegression.scala  |  2 +-
 .../apache/spark/mllib/classification/SVM.scala |  2 +-
 .../mllib/clustering/GaussianMixture.scala  |  2 +-
 .../mllib/clustering/GaussianMixtureModel.scala |  4 +--
 .../spark/mllib/clustering/LDAModel.scala   | 26 ++--
 .../spark/mllib/clustering/LDAOptimizer.scala   |  6 ++---
 .../mllib/clustering/StreamingKMeans.scala  |  4 +--
 .../apache/spark/mllib/linalg/Matrices.scala| 16 ++--
 .../org/apache/spark/mllib/linalg/Vectors.scala |  8 +++---
 .../mllib/linalg/distributed/BlockMatrix.scala  |  8 +++---
 .../mllib/linalg/distributed/RowMatrix.scala| 16 ++--
 .../mllib/optimization/GradientDescent.scala|  4 +--
 .../apache/spark/mllib/optimization/LBFGS.scala |  4 +--
 .../spark/mllib/optimization/Updater.scala  | 14 +--
 .../apache/spark/mllib/regression/Lasso.scala   |  2 +-
 .../mllib/regression/LinearRegression.scala |  2 +-
 .../mllib/regression/RidgeRegression.scala  |  2 +-
 .../stat/correlation/PearsonCorrelation.scala   |  2 +-
 .../distribution/MultivariateGaussian.scala |  8 +++---
 .../spark/mllib/stat/test/ChiSqTest.scala   |  2 +-
 .../ml/classification/NaiveBayesSuite.scala |  6 ++---
 .../LogisticRegressionSuite.scala   |  4 +--
 .../mllib/classification/NaiveBayesSuite.scala  |  4 +--
 .../spark/mllib/clustering/LDASuite.scala   |  4 +--
 .../mllib/clustering/StreamingKMeansSuite.scala |  2 +-
 .../spark/mllib/feature/NormalizerSuite.scala   | 16 ++--
 .../linalg/BreezeMatrixConversionSuite.scala|  4 +--
 .../linalg/BreezeVectorConversionSuite.scala|  4 +--
 .../spark/mllib/linalg/MatricesSuite.scala  | 14 +--
 .../spark/mllib/linalg/VectorsSuite.scala   |  2 +-
 .../linalg/distributed/BlockMatrixSuite.scala   |  2 +-
 .../distributed/IndexedRowMatrixSuite.scala | 10 
 .../linalg/distributed/RowMatrixSuite.scala | 14 +--
 .../spark/mllib/stat/CorrelationSuite.scala |  6 ++---
 .../apache/spark/mllib/util/MLUtilsSuite.scala  |  6 ++---
 project/MimaExcludes.scala  |  3 +++
 49 files changed, 156 insertions(+), 153 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/dcf498e8/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala
--
diff --git 
a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala 
b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala
index a47526d..0ea687b 100644
--- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala
+++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala
@@ -69,7 +69,7 @@ sealed trait Matrix extends Serializable {
   def rowIter: Iterator[V
