spark git commit: [SPARK-9122] [MLLIB] [PySpark] spark.mllib regression support batch predict
Repository: spark Updated Branches: refs/heads/master 8a94eb23d -> 52de3acca [SPARK-9122] [MLLIB] [PySpark] spark.mllib regression support batch predict spark.mllib support batch predict for LinearRegressionModel, RidgeRegressionModel and LassoModel. Author: Yanbo Liang Closes #7614 from yanboliang/spark-9122 and squashes the following commits: 4e610c0 [Yanbo Liang] spark.mllib regression support batch predict Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/52de3acc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/52de3acc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/52de3acc Branch: refs/heads/master Commit: 52de3acca4ce8c36fd4c9ce162473a091701bbc7 Parents: 8a94eb2 Author: Yanbo Liang Authored: Thu Jul 23 18:53:07 2015 -0700 Committer: Joseph K. Bradley Committed: Thu Jul 23 18:53:07 2015 -0700 -- python/pyspark/mllib/regression.py | 12 ++-- 1 file changed, 10 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/52de3acc/python/pyspark/mllib/regression.py -- diff --git a/python/pyspark/mllib/regression.py b/python/pyspark/mllib/regression.py index 8e90ade..5b7afc1 100644 --- a/python/pyspark/mllib/regression.py +++ b/python/pyspark/mllib/regression.py @@ -97,9 +97,11 @@ class LinearRegressionModelBase(LinearModel): def predict(self, x): """ -Predict the value of the dependent variable given a vector x -containing values for the independent variables. +Predict the value of the dependent variable given a vector or +an RDD of vectors containing values for the independent variables. """ +if isinstance(x, RDD): +return x.map(self.predict) x = _convert_to_vector(x) return self.weights.dot(x) + self.intercept @@ -124,6 +126,8 @@ class LinearRegressionModel(LinearRegressionModelBase): True >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 True +>>> abs(lrm.predict(sc.parallelize([[1.0]])).collect()[0] - 1) < 0.5 +True >>> import os, tempfile >>> path = tempfile.mkdtemp() >>> lrm.save(sc, path) @@ -267,6 +271,8 @@ class LassoModel(LinearRegressionModelBase): True >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 True +>>> abs(lrm.predict(sc.parallelize([[1.0]])).collect()[0] - 1) < 0.5 +True >>> import os, tempfile >>> path = tempfile.mkdtemp() >>> lrm.save(sc, path) @@ -382,6 +388,8 @@ class RidgeRegressionModel(LinearRegressionModelBase): True >>> abs(lrm.predict(SparseVector(1, {0: 1.0})) - 1) < 0.5 True +>>> abs(lrm.predict(sc.parallelize([[1.0]])).collect()[0] - 1) < 0.5 +True >>> import os, tempfile >>> path = tempfile.mkdtemp() >>> lrm.save(sc, path) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
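For context, a minimal Scala sketch of the equivalent spark.mllib workflow. The commit itself only touches the Python wrappers, which now mirror the RDD-based predict that the Scala models already expose; `sc` is an existing SparkContext and the toy data is hypothetical.

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}

// Hypothetical training data: y = 3 * x
val data = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(0.0)),
  LabeledPoint(3.0, Vectors.dense(1.0)),
  LabeledPoint(6.0, Vectors.dense(2.0))))
val model = LinearRegressionWithSGD.train(data, 100)

// Single-vector prediction, already supported in both APIs
val one = model.predict(Vectors.dense(1.5))

// Batch prediction over an RDD of feature vectors; the PySpark models gain
// this in the commit above by mapping self.predict over the RDD
val batch = model.predict(data.map(_.features))
batch.collect().foreach(println)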
spark git commit: [SPARK-8092] [ML] Allow OneVsRest Classifier feature and label column names to be configurable.
Repository: spark Updated Branches: refs/heads/master d249636e5 -> d4d762f27 [SPARK-8092] [ML] Allow OneVsRest Classifier feature and label column names to be configurable. The base classifier input and output columns are ignored in favor of the ones specified in OneVsRest. Author: Ram Sriharsha Closes #6631 from harsha2010/SPARK-8092 and squashes the following commits: 6591dc6 [Ram Sriharsha] add documentation for params b7024b1 [Ram Sriharsha] cleanup f0e2bfb [Ram Sriharsha] merge with master 108d3d7 [Ram Sriharsha] merge with master 4f74126 [Ram Sriharsha] Allow label/ features columns to be configurable Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d4d762f2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d4d762f2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d4d762f2 Branch: refs/heads/master Commit: d4d762f275749a923356cd84de549b14c22cc3eb Parents: d249636 Author: Ram Sriharsha Authored: Thu Jul 23 22:35:41 2015 -0700 Committer: Joseph K. Bradley Committed: Thu Jul 23 22:35:41 2015 -0700 -- .../spark/ml/classification/OneVsRest.scala | 17 +- .../ml/classification/OneVsRestSuite.scala | 24 2 files changed, 40 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d4d762f2/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala index ea757c5..1741f19 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala @@ -47,6 +47,8 @@ private[ml] trait OneVsRestParams extends PredictorParams { /** * param for the base binary classifier that we reduce multiclass classification into. + * The base classifier input and output columns are ignored in favor of + * the ones specified in [[OneVsRest]]. 
* @group param */ val classifier: Param[ClassifierType] = new Param(this, "classifier", "base binary classifier") @@ -160,6 +162,15 @@ final class OneVsRest(override val uid: String) set(classifier, value.asInstanceOf[ClassifierType]) } + /** @group setParam */ + def setLabelCol(value: String): this.type = set(labelCol, value) + + /** @group setParam */ + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + /** @group setParam */ + def setPredictionCol(value: String): this.type = set(predictionCol, value) + override def transformSchema(schema: StructType): StructType = { validateAndTransformSchema(schema, fitting = true, getClassifier.featuresDataType) } @@ -195,7 +206,11 @@ final class OneVsRest(override val uid: String) val labelUDFWithNewMeta = labelUDF(col($(labelCol))).as(labelColName, newLabelMeta) val trainingDataset = multiclassLabeled.withColumn(labelColName, labelUDFWithNewMeta) val classifier = getClassifier - classifier.fit(trainingDataset, classifier.labelCol -> labelColName) + val paramMap = new ParamMap() + paramMap.put(classifier.labelCol -> labelColName) + paramMap.put(classifier.featuresCol -> getFeaturesCol) + paramMap.put(classifier.predictionCol -> getPredictionCol) + classifier.fit(trainingDataset, paramMap) }.toArray[ClassificationModel[_, _]] if (handlePersistence) { http://git-wip-us.apache.org/repos/asf/spark/blob/d4d762f2/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala index 75cf5bd..3775292 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala @@ -19,6 +19,7 @@ package org.apache.spark.ml.classification import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.NominalAttribute +import org.apache.spark.ml.feature.StringIndexer import org.apache.spark.ml.param.{ParamMap, ParamsSuite} import org.apache.spark.ml.util.MetadataUtils import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS @@ -104,6 +105,29 @@ class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext { ova.fit(datasetWithLabelMetadata) } + test("SPARK-8092: ensure label features and prediction cols are configurable") { +val labelIndexer = new StringIndexer() + .setInputCol("label")
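For illustration, a hedged sketch of how the new setters might be used from user code; `training` is an assumed DataFrame whose label and feature columns carry the non-default names shown.

import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest}

// "indexedLabel" and "indexedFeatures" are hypothetical column names
val ovr = new OneVsRest()
  .setClassifier(new LogisticRegression().setMaxIter(10))
  .setLabelCol("indexedLabel")        // new setter from this commit
  .setFeaturesCol("indexedFeatures")  // new setter from this commit
  .setPredictionCol("ovrPrediction")  // new setter from this commit

// The base classifier's own column params are ignored: OneVsRest overrides
// them through a ParamMap when fitting each one-vs-rest binary model.
val ovrModel = ovr.fit(training)
val predicted = ovrModel.transform(training)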
spark git commit: [SPARK-9222] [MLlib] Make class instantiation variables in DistributedLDAModel private[clustering]
Repository: spark Updated Branches: refs/heads/master c2b50d693 -> e25312451 [SPARK-9222] [MLlib] Make class instantiation variables in DistributedLDAModel private[clustering] This makes it easier to test all the class variables of the DistributedLDAmodel. Author: MechCoder Closes #7573 from MechCoder/lda_test and squashes the following commits: 2f1a293 [MechCoder] [SPARK-9222] [MLlib] Make class instantiation variables in DistributedLDAModel private[clustering] Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e2531245 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e2531245 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e2531245 Branch: refs/heads/master Commit: e25312451322969ad716dddf8248b8c17f68323b Parents: c2b50d6 Author: MechCoder Authored: Fri Jul 24 10:56:48 2015 -0700 Committer: Joseph K. Bradley Committed: Fri Jul 24 10:56:48 2015 -0700 -- .../org/apache/spark/mllib/clustering/LDAModel.scala | 8 .../org/apache/spark/mllib/clustering/LDASuite.scala | 15 +++ 2 files changed, 19 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e2531245/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 920b577..31c1d52 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -283,12 +283,12 @@ object LocalLDAModel extends Loader[LocalLDAModel] { */ @Experimental class DistributedLDAModel private ( -private val graph: Graph[LDA.TopicCounts, LDA.TokenCount], -private val globalTopicTotals: LDA.TopicCounts, +private[clustering] val graph: Graph[LDA.TopicCounts, LDA.TokenCount], +private[clustering] val globalTopicTotals: LDA.TopicCounts, val k: Int, val vocabSize: Int, -private val docConcentration: Double, -private val topicConcentration: Double, +private[clustering] val docConcentration: Double, +private[clustering] val topicConcentration: Double, private[spark] val iterationTimes: Array[Double]) extends LDAModel { import LDA._ http://git-wip-us.apache.org/repos/asf/spark/blob/e2531245/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala index da70d9b..376a87f 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala @@ -20,6 +20,7 @@ package org.apache.spark.mllib.clustering import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.SparkFunSuite +import org.apache.spark.graphx.Edge import org.apache.spark.mllib.linalg.{DenseMatrix, Matrix, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ @@ -318,6 +319,20 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { assert(distributedModel.k === sameDistributedModel.k) assert(distributedModel.vocabSize === sameDistributedModel.vocabSize) assert(distributedModel.iterationTimes === sameDistributedModel.iterationTimes) + assert(distributedModel.docConcentration === sameDistributedModel.docConcentration) + assert(distributedModel.topicConcentration === sameDistributedModel.topicConcentration) 
+ assert(distributedModel.globalTopicTotals === sameDistributedModel.globalTopicTotals) + + val graph = distributedModel.graph + val sameGraph = sameDistributedModel.graph + assert(graph.vertices.sortByKey().collect() === sameGraph.vertices.sortByKey().collect()) + val edge = graph.edges.map { +case Edge(sid: Long, did: Long, nos: Double) => (sid, did, nos) + }.sortBy(x => (x._1, x._2)).collect() + val sameEdge = sameGraph.edges.map { +case Edge(sid: Long, did: Long, nos: Double) => (sid, did, nos) + }.sortBy(x => (x._1, x._2)).collect() + assert(edge === sameEdge) } finally { Utils.deleteRecursively(tempDir1) Utils.deleteRecursively(tempDir2) - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
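For reference, a rough sketch of the save/load round trip this change makes easier to test; the corpus, path and parameter values are hypothetical, and the loosened fields remain invisible outside org.apache.spark.mllib.clustering.

import org.apache.spark.mllib.clustering.{DistributedLDAModel, LDA}
import org.apache.spark.mllib.linalg.Vectors

// Tiny hypothetical corpus of (docId, termCounts)
val corpus = sc.parallelize(Seq(
  (0L, Vectors.dense(1.0, 2.0, 0.0)),
  (1L, Vectors.dense(0.0, 1.0, 3.0))))

// The default EM optimizer produces a DistributedLDAModel
val model = new LDA().setK(2).setMaxIterations(10).run(corpus)
  .asInstanceOf[DistributedLDAModel]

model.save(sc, "/tmp/distributed-lda")                  // hypothetical path
val sameModel = DistributedLDAModel.load(sc, "/tmp/distributed-lda")

// Public fields can be checked anywhere; graph, globalTopicTotals,
// docConcentration and topicConcentration are additionally reachable from
// LDASuite now that they are private[clustering] instead of private.
assert(model.k == sameModel.k && model.vocabSize == sameModel.vocabSize)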
spark git commit: [SPARK-7045] [MLLIB] Avoid intermediate representation when creating model
Repository: spark Updated Branches: refs/heads/master 64135cbb3 -> a400ab516 [SPARK-7045] [MLLIB] Avoid intermediate representation when creating model Word2Vec used to convert from an Array[Float] representation to a Map[String, Array[Float]] and then back to an Array[Float] through Word2VecModel. This prevents this conversion while still supporting the older method of supplying a Map. Author: MechCoder Closes #5748 from MechCoder/spark-7045 and squashes the following commits: e308913 [MechCoder] move docs 5703116 [MechCoder] minor fa04313 [MechCoder] style fixes b1d61c4 [MechCoder] better errors and tests 3b32c8c [MechCoder] [SPARK-7045] Avoid intermediate representation when creating model Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a400ab51 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a400ab51 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a400ab51 Branch: refs/heads/master Commit: a400ab516fa93185aa683a596f9d7c6c1a02f330 Parents: 64135cb Author: MechCoder Authored: Fri Jul 24 14:58:07 2015 -0700 Committer: Joseph K. Bradley Committed: Fri Jul 24 14:58:07 2015 -0700 -- .../apache/spark/mllib/feature/Word2Vec.scala | 85 +++- .../spark/mllib/feature/Word2VecSuite.scala | 6 ++ 2 files changed, 55 insertions(+), 36 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a400ab51/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala index f087d06..cbbd2b0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/feature/Word2Vec.scala @@ -403,17 +403,8 @@ class Word2Vec extends Serializable with Logging { } newSentences.unpersist() -val word2VecMap = mutable.HashMap.empty[String, Array[Float]] -var i = 0 -while (i < vocabSize) { - val word = bcVocab.value(i).word - val vector = new Array[Float](vectorSize) - Array.copy(syn0Global, i * vectorSize, vector, 0, vectorSize) - word2VecMap += word -> vector - i += 1 -} - -new Word2VecModel(word2VecMap.toMap) +val wordArray = vocab.map(_.word) +new Word2VecModel(wordArray.zipWithIndex.toMap, syn0Global) } /** @@ -429,38 +420,42 @@ class Word2Vec extends Serializable with Logging { /** * :: Experimental :: * Word2Vec model + * @param wordIndex maps each word to an index, which can retrieve the corresponding + * vector from wordVectors + * @param wordVectors array of length numWords * vectorSize, vector corresponding + *to the word mapped with index i can be retrieved by the slice + *(i * vectorSize, i * vectorSize + vectorSize) */ @Experimental -class Word2VecModel private[spark] ( -model: Map[String, Array[Float]]) extends Serializable with Saveable { - - // wordList: Ordered list of words obtained from model. - private val wordList: Array[String] = model.keys.toArray - - // wordIndex: Maps each word to an index, which can retrieve the corresponding - //vector from wordVectors (see below). - private val wordIndex: Map[String, Int] = wordList.zip(0 until model.size).toMap +class Word2VecModel private[mllib] ( +private val wordIndex: Map[String, Int], +private val wordVectors: Array[Float]) extends Serializable with Saveable { - // vectorSize: Dimension of each word's vector. - private val vectorSize = model.head._2.size private val numWords = wordIndex.size + // vectorSize: Dimension of each word's vector. 
+ private val vectorSize = wordVectors.length / numWords + + // wordList: Ordered list of words obtained from wordIndex. + private val wordList: Array[String] = { +val (wl, _) = wordIndex.toSeq.sortBy(_._2).unzip +wl.toArray + } - // wordVectors: Array of length numWords * vectorSize, vector corresponding to the word - // mapped with index i can be retrieved by the slice - // (ind * vectorSize, ind * vectorSize + vectorSize) // wordVecNorms: Array of length numWords, each value being the Euclidean norm // of the wordVector. - private val (wordVectors: Array[Float], wordVecNorms: Array[Double]) = { -val wordVectors = new Array[Float](vectorSize * numWords) + private val wordVecNorms: Array[Double] = { val wordVecNorms = new Array[Double](numWords) var i = 0 while (i < numWords) { - val vec = model.get(wordList(i)).get - Array.copy(vec, 0, wordVectors, i * vectorSize, vectorSize) + val vec = wordVectors.slic
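A short, hedged usage sketch: training is unchanged for callers, since dropping the intermediate Map[String, Array[Float]] only affects how Word2VecModel is built internally. The toy sentences and parameters are hypothetical.

import org.apache.spark.mllib.feature.Word2Vec

val sentences = sc.parallelize(Seq(
  "spark mllib trains word embeddings".split(" ").toSeq,
  "word2vec maps words to vectors".split(" ").toSeq))

val w2vModel = new Word2Vec().setVectorSize(10).setMinCount(1).fit(sentences)

// The model is now backed directly by the flat syn0 array produced during
// training (wordIndex plus wordVectors), so no per-word copies are made here
val vec = w2vModel.transform("word2vec")
val nearest = w2vModel.findSynonyms("word2vec", 2)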
spark git commit: [SPARK-6793] [MLLIB] OnlineLDAOptimizer LDA perplexity
Repository: spark Updated Branches: refs/heads/master 1b0099fc6 -> 2cc212d56 [SPARK-6793] [MLLIB] OnlineLDAOptimizer LDA perplexity Implements `logPerplexity` in `OnlineLDAOptimizer`. Also refactors inference code into companion object to enable future reuse (e.g. `predict` method). Author: Feynman Liang Closes #7705 from feynmanliang/SPARK-6793-perplexity and squashes the following commits: 6da2c99 [Feynman Liang] Remove get* from LDAModel public API 8381da6 [Feynman Liang] Code review comments 17f7000 [Feynman Liang] Documentation typo fixes 2f452a4 [Feynman Liang] Remove auxillary DistributedLDAModel constructor a275914 [Feynman Liang] Prevent empty counts calls to variationalInference 06d02d9 [Feynman Liang] Remove deprecated LocalLDAModel constructor afecb46 [Feynman Liang] Fix regression bug in sstats accumulator 5a327a0 [Feynman Liang] Code review quick fixes 998c03e [Feynman Liang] Fix style 1cbb67d [Feynman Liang] Fix access modifier bug 4362daa [Feynman Liang] Organize imports 4f171f7 [Feynman Liang] Fix indendation 2f049ce [Feynman Liang] Fix failing save/load tests 7415e96 [Feynman Liang] Pick changes from big PR 11e7c33 [Feynman Liang] Merge remote-tracking branch 'apache/master' into SPARK-6793-perplexity f8adc48 [Feynman Liang] Add logPerplexity, refactor variationalBound into a method cd521d6 [Feynman Liang] Refactor methods into companion class 7f62a55 [Feynman Liang] --amend c62cb1e [Feynman Liang] Outer product for stats, revert Range slicing aead650 [Feynman Liang] Range slice, in-place update, reduce transposes Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2cc212d5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2cc212d5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2cc212d5 Branch: refs/heads/master Commit: 2cc212d56a1d50fe68d5816f71b27803de1f6389 Parents: 1b0099f Author: Feynman Liang Authored: Wed Jul 29 16:20:20 2015 -0700 Committer: Joseph K. 
Bradley Committed: Wed Jul 29 16:20:20 2015 -0700 -- .../spark/mllib/clustering/LDAModel.scala | 200 +++ .../spark/mllib/clustering/LDAOptimizer.scala | 138 +++-- .../spark/mllib/clustering/LDAUtils.scala | 55 + .../spark/mllib/clustering/JavaLDASuite.java| 6 +- .../spark/mllib/clustering/LDASuite.scala | 53 - 5 files changed, 348 insertions(+), 104 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2cc212d5/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 31c1d52..059b52e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -17,10 +17,9 @@ package org.apache.spark.mllib.clustering -import breeze.linalg.{DenseMatrix => BDM, normalize, sum => brzSum, DenseVector => BDV} - +import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, normalize, sum} +import breeze.numerics.{exp, lgamma} import org.apache.hadoop.fs.Path - import org.json4s.DefaultFormats import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ @@ -28,14 +27,13 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.annotation.Experimental import org.apache.spark.api.java.JavaPairRDD -import org.apache.spark.graphx.{VertexId, Edge, EdgeContext, Graph} -import org.apache.spark.mllib.linalg.{Vectors, Vector, Matrices, Matrix, DenseVector} -import org.apache.spark.mllib.util.{Saveable, Loader} +import org.apache.spark.graphx.{Edge, EdgeContext, Graph, VertexId} +import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors} +import org.apache.spark.mllib.util.{Loader, Saveable} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{SQLContext, Row} +import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.util.BoundedPriorityQueue - /** * :: Experimental :: * @@ -54,6 +52,31 @@ abstract class LDAModel private[clustering] extends Saveable { def vocabSize: Int /** + * Concentration parameter (commonly named "alpha") for the prior placed on documents' + * distributions over topics ("theta"). + * + * This is the parameter to a Dirichlet distribution. + */ + def docConcentration: Vector + + /** + * Concentration parameter (commonly named "beta" or "eta") for the prior placed on topics' + * distributions over terms. + * + * This is the parameter to a symmetric Dirichlet distribution. + * + * Note: The topics' distributions over terms are called "beta" in the orig
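A hedged sketch of calling the new method, assuming it takes an RDD of (doc ID, term-count vector) pairs as in the released API; the corpus and optimizer settings are only illustrative.

import org.apache.spark.mllib.clustering.{LDA, LocalLDAModel, OnlineLDAOptimizer}
import org.apache.spark.mllib.linalg.Vectors

val docs = sc.parallelize(Seq(
  (0L, Vectors.dense(1.0, 2.0, 0.0)),
  (1L, Vectors.dense(0.0, 1.0, 3.0))))

// The online optimizer produces a LocalLDAModel
val onlineModel = new LDA()
  .setK(2)
  .setMaxIterations(20)
  .setOptimizer(new OnlineLDAOptimizer().setMiniBatchFraction(1.0))
  .run(docs)
  .asInstanceOf[LocalLDAModel]

// New here: an upper bound on perplexity for the given documents, derived
// from the variational lower bound on log likelihood (Eq. 16 of the online
// LDA paper); lower values indicate a better fit
val bound = onlineModel.logPerplexity(docs)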
spark git commit: [SPARK-9016] [ML] make random forest classifiers implement classification trait
Repository: spark Updated Branches: refs/heads/master 103d8cce7 -> 37c2d1927 [SPARK-9016] [ML] make random forest classifiers implement classification trait Implement the classification trait for RandomForestClassifiers. The plan is to use this in the future to providing thresholding for RandomForestClassifiers (as well as other classifiers that implement that trait). Author: Holden Karau Closes #7432 from holdenk/SPARK-9016-make-random-forest-classifiers-implement-classification-trait and squashes the following commits: bf22fa6 [Holden Karau] Add missing imports for testing suite e948f0d [Holden Karau] Check the prediction generation from rawprediciton 25320c3 [Holden Karau] Don't supply numClasses when not needed, assert model classes are as expected 1a67e04 [Holden Karau] Use old decission tree stuff instead 673e0c3 [Holden Karau] Merge branch 'master' into SPARK-9016-make-random-forest-classifiers-implement-classification-trait 0d15b96 [Holden Karau] FIx typo 5eafad4 [Holden Karau] add a constructor for rootnode + num classes fc6156f [Holden Karau] scala style fix 2597915 [Holden Karau] take num classes in constructor 3ccfe4a [Holden Karau] Merge in master, make pass numClasses through randomforest for training 222a10b [Holden Karau] Increase numtrees to 3 in the python test since before the two were equal and the argmax was selecting the last one 16aea1c [Holden Karau] Make tests match the new models b454a02 [Holden Karau] Make the Tree classifiers extends the Classifier base class 77b4114 [Holden Karau] Import vectors lib Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/37c2d192 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/37c2d192 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/37c2d192 Branch: refs/heads/master Commit: 37c2d1927cebdd19a14c054f670cb0fb9a263586 Parents: 103d8cc Author: Holden Karau Authored: Wed Jul 29 18:18:29 2015 -0700 Committer: Joseph K. 
Bradley Committed: Wed Jul 29 18:18:29 2015 -0700 -- .../classification/RandomForestClassifier.scala | 30 +++- .../RandomForestClassifierSuite.scala | 18 +--- python/pyspark/ml/classification.py | 4 +-- 3 files changed, 32 insertions(+), 20 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/37c2d192/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala index fc0693f..bc19bd6 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala @@ -25,7 +25,7 @@ import org.apache.spark.ml.{PredictionModel, Predictor} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.tree.{DecisionTreeModel, RandomForestParams, TreeClassifierParams, TreeEnsembleModel} import org.apache.spark.ml.util.{Identifiable, MetadataUtils} -import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo} import org.apache.spark.mllib.tree.model.{RandomForestModel => OldRandomForestModel} @@ -43,7 +43,7 @@ import org.apache.spark.sql.types.DoubleType */ @Experimental final class RandomForestClassifier(override val uid: String) - extends Predictor[Vector, RandomForestClassifier, RandomForestClassificationModel] + extends Classifier[Vector, RandomForestClassifier, RandomForestClassificationModel] with RandomForestParams with TreeClassifierParams { def this() = this(Identifiable.randomUID("rfc")) @@ -98,7 +98,7 @@ final class RandomForestClassifier(override val uid: String) val trees = RandomForest.run(oldDataset, strategy, getNumTrees, getFeatureSubsetStrategy, getSeed) .map(_.asInstanceOf[DecisionTreeClassificationModel]) -new RandomForestClassificationModel(trees) +new RandomForestClassificationModel(trees, numClasses) } override def copy(extra: ParamMap): RandomForestClassifier = defaultCopy(extra) @@ -125,8 +125,9 @@ object RandomForestClassifier { @Experimental final class RandomForestClassificationModel private[ml] ( override val uid: String, -private val _trees: Array[DecisionTreeClassificationModel]) - extends PredictionModel[Vector, RandomForestClassificationModel] +private val _trees: Array[DecisionTreeClassificationModel], +override val numClasses: Int) + extends ClassificationModel[Vector, RandomFo
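To illustrate what the trait change buys at the API level, a hedged sketch; `training` is an assumed DataFrame of (label, features) rows.

import org.apache.spark.ml.classification.RandomForestClassifier

val rf = new RandomForestClassifier().setNumTrees(3).setMaxDepth(4)
val rfModel = rf.fit(training)

// As a ClassificationModel the fitted model now carries numClasses and emits
// a rawPrediction column (per-class vote counts) from which the prediction
// column is derived by argmax
println(rfModel.numClasses)
rfModel.transform(training).select("rawPrediction", "prediction").show()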
spark git commit: [SPARK-9440] [MLLIB] Add hyperparameters to LocalLDAModel save/load
Repository: spark Updated Branches: refs/heads/master 2a9fe4a4e -> a200e6456 [SPARK-9440] [MLLIB] Add hyperparameters to LocalLDAModel save/load jkbradley MechCoder Resolves blocking issue for SPARK-6793. Please review after #7705 is merged. Author: Feynman Liang Closes #7757 from feynmanliang/SPARK-9940-localSaveLoad and squashes the following commits: d0d8cf4 [Feynman Liang] Fix thisClassName 0f30109 [Feynman Liang] Fix tests after changing LDAModel public API dc61981 [Feynman Liang] Add hyperparams to LocalLDAModel save/load Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a200e645 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a200e645 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a200e645 Branch: refs/heads/master Commit: a200e64561c8803731578267df16906f6773cbea Parents: 2a9fe4a Author: Feynman Liang Authored: Wed Jul 29 19:02:15 2015 -0700 Committer: Joseph K. Bradley Committed: Wed Jul 29 19:02:15 2015 -0700 -- .../spark/mllib/clustering/LDAModel.scala | 40 ++-- .../spark/mllib/clustering/LDASuite.scala | 6 ++- 2 files changed, 33 insertions(+), 13 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a200e645/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 059b52e..ece2884 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -215,7 +215,8 @@ class LocalLDAModel private[clustering] ( override protected def formatVersion = "1.0" override def save(sc: SparkContext, path: String): Unit = { -LocalLDAModel.SaveLoadV1_0.save(sc, path, topicsMatrix) +LocalLDAModel.SaveLoadV1_0.save(sc, path, topicsMatrix, docConcentration, topicConcentration, + gammaShape) } // TODO // override def logLikelihood(documents: RDD[(Long, Vector)]): Double = ??? @@ -312,16 +313,23 @@ object LocalLDAModel extends Loader[LocalLDAModel] { // as a Row in data. 
case class Data(topic: Vector, index: Int) -// TODO: explicitly save docConcentration, topicConcentration, and gammaShape for use in -// model.predict() -def save(sc: SparkContext, path: String, topicsMatrix: Matrix): Unit = { +def save( +sc: SparkContext, +path: String, +topicsMatrix: Matrix, +docConcentration: Vector, +topicConcentration: Double, +gammaShape: Double): Unit = { val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ val k = topicsMatrix.numCols val metadata = compact(render (("class" -> thisClassName) ~ ("version" -> thisFormatVersion) ~ - ("k" -> k) ~ ("vocabSize" -> topicsMatrix.numRows))) + ("k" -> k) ~ ("vocabSize" -> topicsMatrix.numRows) ~ + ("docConcentration" -> docConcentration.toArray.toSeq) ~ + ("topicConcentration" -> topicConcentration) ~ + ("gammaShape" -> gammaShape))) sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path)) val topicsDenseMatrix = topicsMatrix.toBreeze.toDenseMatrix @@ -331,7 +339,12 @@ object LocalLDAModel extends Loader[LocalLDAModel] { sc.parallelize(topics, 1).toDF().write.parquet(Loader.dataPath(path)) } -def load(sc: SparkContext, path: String): LocalLDAModel = { +def load( +sc: SparkContext, +path: String, +docConcentration: Vector, +topicConcentration: Double, +gammaShape: Double): LocalLDAModel = { val dataPath = Loader.dataPath(path) val sqlContext = SQLContext.getOrCreate(sc) val dataFrame = sqlContext.read.parquet(dataPath) @@ -348,8 +361,7 @@ object LocalLDAModel extends Loader[LocalLDAModel] { val topicsMat = Matrices.fromBreeze(brzTopics) // TODO: initialize with docConcentration, topicConcentration, and gammaShape after SPARK-9940 - new LocalLDAModel(topicsMat, -Vectors.dense(Array.fill(topicsMat.numRows)(1.0 / topicsMat.numRows)), 1D, 100D) + new LocalLDAModel(topicsMat, docConcentration, topicConcentration, gammaShape) } } @@ -358,11 +370,15 @@ object LocalLDAModel extends Loader[LocalLDAModel] { implicit val formats = DefaultFormats val expectedK = (metadata \ "k").extract[Int] val expectedVocabSize = (metadata \ "vocabSize").ex
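A rough sketch of the round trip that now preserves the hyperparameters; the corpus (`docs`, an RDD[(Long, Vector)] of term counts) and the path are hypothetical.

import org.apache.spark.mllib.clustering.{LDA, LocalLDAModel}

val localModel = new LDA().setK(3).setOptimizer("online").run(docs)
  .asInstanceOf[LocalLDAModel]

localModel.save(sc, "/tmp/local-lda")            // hypothetical path
val reloaded = LocalLDAModel.load(sc, "/tmp/local-lda")

// docConcentration, topicConcentration and gammaShape are now written to the
// metadata, so the reloaded model matches the original instead of being
// rebuilt with hard-coded defaults
assert(reloaded.docConcentration == localModel.docConcentration)
assert(reloaded.topicConcentration == localModel.topicConcentration)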
spark git commit: [SPARK-5567] [MLLIB] Add predict method to LocalLDAModel
Repository: spark Updated Branches: refs/heads/master a20e743fb -> d8cfd531c [SPARK-5567] [MLLIB] Add predict method to LocalLDAModel jkbradley hhbyyh Adds `topicDistributions` to LocalLDAModel. Please review after #7757 is merged. Author: Feynman Liang Closes #7760 from feynmanliang/SPARK-5567-predict-in-LDA and squashes the following commits: 0ad1134 [Feynman Liang] Remove println 27b3877 [Feynman Liang] Code review fixes 6bfb87c [Feynman Liang] Remove extra newline 476f788 [Feynman Liang] Fix checks and doc for variationalInference 061780c [Feynman Liang] Code review cleanup 3be2947 [Feynman Liang] Rename topicDistribution -> topicDistributions 2a821a6 [Feynman Liang] Add predict methods to LocalLDAModel Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d8cfd531 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d8cfd531 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d8cfd531 Branch: refs/heads/master Commit: d8cfd531c7c50c9b00ab546be458f44f84c386ae Parents: a20e743 Author: Feynman Liang Authored: Thu Jul 30 13:17:54 2015 -0700 Committer: Joseph K. Bradley Committed: Thu Jul 30 13:17:54 2015 -0700 -- .../spark/mllib/clustering/LDAModel.scala | 42 +++-- .../spark/mllib/clustering/LDAOptimizer.scala | 5 +- .../spark/mllib/clustering/LDASuite.scala | 63 3 files changed, 102 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d8cfd531/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index ece2884..6cfad3f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -186,7 +186,6 @@ abstract class LDAModel private[clustering] extends Saveable { * This model stores only the inferred topics. * It may be used for computing topics for new documents, but it may give less accurate answers * than the [[DistributedLDAModel]]. - * * @param topics Inferred topics (vocabSize x k matrix). */ @Experimental @@ -221,9 +220,6 @@ class LocalLDAModel private[clustering] ( // TODO // override def logLikelihood(documents: RDD[(Long, Vector)]): Double = ??? - // TODO: - // override def topicDistributions(documents: RDD[(Long, Vector)]): RDD[(Long, Vector)] = ??? - /** * Calculate the log variational bound on perplexity. See Equation (16) in original Online * LDA paper. 
@@ -269,7 +265,7 @@ class LocalLDAModel private[clustering] ( // by topic (columns of lambda) val Elogbeta = LDAUtils.dirichletExpectation(lambda.t).t -var score = documents.filter(_._2.numActives > 0).map { case (id: Long, termCounts: Vector) => +var score = documents.filter(_._2.numNonzeros > 0).map { case (id: Long, termCounts: Vector) => var docScore = 0.0D val (gammad: BDV[Double], _) = OnlineLDAOptimizer.variationalTopicInference( termCounts, exp(Elogbeta), brzAlpha, gammaShape, k) @@ -277,7 +273,7 @@ class LocalLDAModel private[clustering] ( // E[log p(doc | theta, beta)] termCounts.foreachActive { case (idx, count) => -docScore += LDAUtils.logSumExp(Elogthetad + Elogbeta(idx, ::).t) +docScore += count * LDAUtils.logSumExp(Elogthetad + Elogbeta(idx, ::).t) } // E[log p(theta | alpha) - log q(theta | gamma)]; assumes alpha is a vector docScore += sum((brzAlpha - gammad) :* Elogthetad) @@ -297,6 +293,40 @@ class LocalLDAModel private[clustering] ( score } + /** + * Predicts the topic mixture distribution for each document (often called "theta" in the + * literature). Returns a vector of zeros for an empty document. + * + * This uses a variational approximation following Hoffman et al. (2010), where the approximate + * distribution is called "gamma." Technically, this method returns this approximation "gamma" + * for each document. + * @param documents documents to predict topic mixture distributions for + * @return An RDD of (document ID, topic mixture distribution for document) + */ + // TODO: declare in LDAModel and override once implemented in DistributedLDAModel + def topicDistributions(documents: RDD[(Long, Vector)]): RDD[(Long, Vector)] = { +// Double transpose because dirichletExpectation normalizes by row and we need to normalize +// by topic (columns of lambda) +val expElogbeta = exp(LDAUtils.dirichletExpectation(topicsMatrix.toBree
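A minimal usage sketch for the new method; `localModel` (a LocalLDAModel) and `newDocs` (an RDD[(Long, Vector)] of term counts) are assumed to exist.

// Infer per-document topic mixtures ("theta") for unseen documents using the
// same variational approximation ("gamma") as online LDA training; empty
// documents get a vector of zeros
val theta = localModel.topicDistributions(newDocs)
theta.take(3).foreach { case (docId, dist) =>
  println(s"doc $docId -> ${dist.toArray.mkString(", ")}")
}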
spark git commit: [SPARK-9454] Change LDASuite tests to use vector comparisons
Repository: spark Updated Branches: refs/heads/master 1abf7dc16 -> 89cda69ec [SPARK-9454] Change LDASuite tests to use vector comparisons jkbradley Changes the current hacky string-comparison for vector compares. Author: Feynman Liang Closes #7775 from feynmanliang/SPARK-9454-ldasuite-vector-compare and squashes the following commits: bd91a82 [Feynman Liang] Remove println 905c76e [Feynman Liang] Fix string compare in distributed EM 2f24c13 [Feynman Liang] Improve LDASuite tests Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/89cda69e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/89cda69e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/89cda69e Branch: refs/heads/master Commit: 89cda69ecd5ef942a68ad13fc4e1f4184010f087 Parents: 1abf7dc Author: Feynman Liang Authored: Thu Jul 30 14:08:59 2015 -0700 Committer: Joseph K. Bradley Committed: Thu Jul 30 14:08:59 2015 -0700 -- .../spark/mllib/clustering/LDASuite.scala | 33 +--- 1 file changed, 14 insertions(+), 19 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/89cda69e/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala index d74482d..c43e1e5 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala @@ -83,21 +83,14 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { assert(model.topicsMatrix === localModel.topicsMatrix) // Check: topic summaries -// The odd decimal formatting and sorting is a hack to do a robust comparison. 
-val roundedTopicSummary = model.describeTopics().map { case (terms, termWeights) => - // cut values to 3 digits after the decimal place - terms.zip(termWeights).map { case (term, weight) => -("%.3f".format(weight).toDouble, term.toInt) - } -}.sortBy(_.mkString("")) -val roundedLocalTopicSummary = localModel.describeTopics().map { case (terms, termWeights) => - // cut values to 3 digits after the decimal place - terms.zip(termWeights).map { case (term, weight) => -("%.3f".format(weight).toDouble, term.toInt) - } -}.sortBy(_.mkString("")) -roundedTopicSummary.zip(roundedLocalTopicSummary).foreach { case (t1, t2) => - assert(t1 === t2) +val topicSummary = model.describeTopics().map { case (terms, termWeights) => + Vectors.sparse(tinyVocabSize, terms, termWeights) +}.sortBy(_.toString) +val localTopicSummary = localModel.describeTopics().map { case (terms, termWeights) => + Vectors.sparse(tinyVocabSize, terms, termWeights) +}.sortBy(_.toString) +topicSummary.zip(localTopicSummary).foreach { case (topics, topicsLocal) => + assert(topics ~== topicsLocal absTol 0.01) } // Check: per-doc topic distributions @@ -197,10 +190,12 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { // verify the result, Note this generate the identical result as // [[https://github.com/Blei-Lab/onlineldavb]] -val topic1 = op.getLambda(0, ::).inner.toArray.map("%.4f".format(_)).mkString(", ") -val topic2 = op.getLambda(1, ::).inner.toArray.map("%.4f".format(_)).mkString(", ") -assert("1.1101, 1.2076, 1.3050, 0.8899, 0.7924, 0.6950" == topic1) -assert("0.8899, 0.7924, 0.6950, 1.1101, 1.2076, 1.3050" == topic2) +val topic1: Vector = Vectors.fromBreeze(op.getLambda(0, ::).t) +val topic2: Vector = Vectors.fromBreeze(op.getLambda(1, ::).t) +val expectedTopic1 = Vectors.dense(1.1101, 1.2076, 1.3050, 0.8899, 0.7924, 0.6950) +val expectedTopic2 = Vectors.dense(0.8899, 0.7924, 0.6950, 1.1101, 1.2076, 1.3050) +assert(topic1 ~== expectedTopic1 absTol 0.01) +assert(topic2 ~== expectedTopic2 absTol 0.01) } test("OnlineLDAOptimizer with toy data") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
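For readers unfamiliar with the helper, a small sketch of the comparison idiom the suite switches to; TestingUtils is a test-scope utility and the vectors here are hypothetical.

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.util.TestingUtils._

// Element-wise comparison within an absolute tolerance, replacing the old
// "%.3f"-formatted string comparison
val expected = Vectors.dense(1.1101, 1.2076, 1.3050)
val actual = Vectors.dense(1.1102, 1.2077, 1.3049)
assert(actual ~== expected absTol 0.01)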
spark git commit: [SPARK-6684] [MLLIB] [ML] Add checkpointing to GBTs
Repository: spark Updated Branches: refs/heads/master 7f7a319c4 -> be7be6d4c [SPARK-6684] [MLLIB] [ML] Add checkpointing to GBTs Add checkpointing to GradientBoostedTrees, GBTClassifier, GBTRegressor CC: mengxr Author: Joseph K. Bradley Closes #7804 from jkbradley/gbt-checkpoint3 and squashes the following commits: 3fbd7ba [Joseph K. Bradley] tiny fix b3e160c [Joseph K. Bradley] unset checkpoint dir after test 9cc3a04 [Joseph K. Bradley] added checkpointing to GBTs Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/be7be6d4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/be7be6d4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/be7be6d4 Branch: refs/heads/master Commit: be7be6d4c7d978c20e601d1f5f56ecb3479814cb Parents: 7f7a319 Author: Joseph K. Bradley Authored: Thu Jul 30 16:04:23 2015 -0700 Committer: Joseph K. Bradley Committed: Thu Jul 30 16:04:23 2015 -0700 -- .../spark/mllib/clustering/LDAOptimizer.scala | 1 + .../spark/mllib/tree/GradientBoostedTrees.scala | 48 ++-- .../tree/configuration/BoostingStrategy.scala | 3 +- .../ml/classification/GBTClassifierSuite.scala | 20 + .../spark/ml/regression/GBTRegressorSuite.scala | 20 - .../mllib/tree/GradientBoostedTreesSuite.scala | 79 6 files changed, 114 insertions(+), 57 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/be7be6d4/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index 9dbec41..d6f8b29 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -144,6 +144,7 @@ final class EMLDAOptimizer extends LDAOptimizer { this.checkpointInterval = lda.getCheckpointInterval this.graphCheckpointer = new PeriodicGraphCheckpointer[TopicCounts, TokenCount]( checkpointInterval, graph.vertices.sparkContext) +this.graphCheckpointer.update(this.graph) this.globalTopicTotals = computeGlobalTopicTotals() this } http://git-wip-us.apache.org/repos/asf/spark/blob/be7be6d4/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala index a835f96..9ce6faa 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/GradientBoostedTrees.scala @@ -20,6 +20,7 @@ package org.apache.spark.mllib.tree import org.apache.spark.Logging import org.apache.spark.annotation.Experimental import org.apache.spark.api.java.JavaRDD +import org.apache.spark.mllib.impl.PeriodicRDDCheckpointer import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.configuration.BoostingStrategy import org.apache.spark.mllib.tree.configuration.Algo._ @@ -184,22 +185,28 @@ object GradientBoostedTrees extends Logging { false } +// Prepare periodic checkpointers +val predErrorCheckpointer = new PeriodicRDDCheckpointer[(Double, Double)]( + treeStrategy.getCheckpointInterval, input.sparkContext) +val validatePredErrorCheckpointer = new PeriodicRDDCheckpointer[(Double, Double)]( + treeStrategy.getCheckpointInterval, input.sparkContext) + timer.stop("init") logDebug("##") logDebug("Building tree 0") 
logDebug("##") -var data = input // Initialize tree timer.start("building tree 0") -val firstTreeModel = new DecisionTree(treeStrategy).run(data) +val firstTreeModel = new DecisionTree(treeStrategy).run(input) val firstTreeWeight = 1.0 baseLearners(0) = firstTreeModel baseLearnerWeights(0) = firstTreeWeight var predError: RDD[(Double, Double)] = GradientBoostedTreesModel. computeInitialPredictionAndError(input, firstTreeWeight, firstTreeModel, loss) +predErrorCheckpointer.update(predError) logDebug("error of gbt = " + predError.values.mean()) // Note: A model of type regression is used since we require raw prediction @@ -207,35 +214,34 @@ object GradientBoostedTrees extends Logging { var validatePredError: RDD[(Double, Double)] = GradientBoostedTreesModel. computeInitialPredictionAndError(validationI
spark git commit: [SPARK-9077] [MLLIB] Improve error message for decision trees when numExamples < maxCategoriesPerFeature
Repository: spark Updated Branches: refs/heads/master 351eda0e2 -> 65fa4181c [SPARK-9077] [MLLIB] Improve error message for decision trees when numExamples < maxCategoriesPerFeature Improve error message when number of examples is less than arity of high-arity categorical feature CC jkbradley is this about what you had in mind? I know it's a starter, but was on my list to close out in the short term. Author: Sean Owen Closes #7800 from srowen/SPARK-9077 and squashes the following commits: b8f6cdb [Sean Owen] Improve error message when number of examples is less than arity of high-arity categorical feature Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/65fa4181 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/65fa4181 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/65fa4181 Branch: refs/heads/master Commit: 65fa4181c35135080870c1e4c1f904ada3a8cf59 Parents: 351eda0 Author: Sean Owen Authored: Thu Jul 30 17:26:18 2015 -0700 Committer: Joseph K. Bradley Committed: Thu Jul 30 17:26:18 2015 -0700 -- .../apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/65fa4181/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala index 380291a..9fe2646 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala @@ -128,9 +128,13 @@ private[spark] object DecisionTreeMetadata extends Logging { // based on the number of training examples. if (strategy.categoricalFeaturesInfo.nonEmpty) { val maxCategoriesPerFeature = strategy.categoricalFeaturesInfo.values.max + val maxCategory = +strategy.categoricalFeaturesInfo.find(_._2 == maxCategoriesPerFeature).get._1 require(maxCategoriesPerFeature <= maxPossibleBins, -s"DecisionTree requires maxBins (= $maxPossibleBins) >= max categories " + - s"in categorical features (= $maxCategoriesPerFeature)") +s"DecisionTree requires maxBins (= $maxPossibleBins) to be at least as large as the " + +s"number of values in each categorical feature, but categorical feature $maxCategory " + +s"has $maxCategoriesPerFeature values. Considering remove this and other categorical " + +"features with a large number of values, or add more training examples.") } val unorderedFeatures = new mutable.HashSet[Int]() - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
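For context, a small hedged sketch of the configuration this message guards; `data` (an RDD[LabeledPoint]) and the arity of feature 0 are hypothetical.

import org.apache.spark.mllib.tree.DecisionTree

// Feature 0 is categorical with 40 distinct values (assumed); maxBins, capped
// by the number of training examples, must be at least 40, otherwise training
// now fails with the clearer message above that names the offending feature
val categoricalFeaturesInfo = Map(0 -> 40)
val dtModel = DecisionTree.trainClassifier(
  data, 2, categoricalFeaturesInfo, "gini", 5, 64)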
spark git commit: [SPARK-7690] [ML] Multiclass classification Evaluator
Repository: spark Updated Branches: refs/heads/master 83670fc9e -> 4e5919bfb [SPARK-7690] [ML] Multiclass classification Evaluator Multiclass Classification Evaluator for ML Pipelines. F1 score, precision, recall, weighted precision and weighted recall are supported as available metrics. Author: Ram Sriharsha Closes #7475 from harsha2010/SPARK-7690 and squashes the following commits: 9bf4ec7 [Ram Sriharsha] fix indentation 3f09a85 [Ram Sriharsha] cleanup doc 16115ae [Ram Sriharsha] code review fixes 032d2a3 [Ram Sriharsha] fix test eec9865 [Ram Sriharsha] Fix Python Indentation 1dbeffd [Ram Sriharsha] Merge branch 'master' into SPARK-7690 68cea85 [Ram Sriharsha] Merge branch 'master' into SPARK-7690 54c03de [Ram Sriharsha] [SPARK-7690][ml][WIP] Multiclass Evaluator for ML Pipeline Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4e5919bf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4e5919bf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4e5919bf Branch: refs/heads/master Commit: 4e5919bfb47a58bcbda90ae01c1bed2128ded983 Parents: 83670fc Author: Ram Sriharsha Authored: Thu Jul 30 23:02:11 2015 -0700 Committer: Joseph K. Bradley Committed: Thu Jul 30 23:02:11 2015 -0700 -- .../MulticlassClassificationEvaluator.scala | 85 ...MulticlassClassificationEvaluatorSuite.scala | 28 +++ python/pyspark/ml/evaluation.py | 66 +++ 3 files changed, 179 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4e5919bf/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala new file mode 100644 index 000..44f779c --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.scala @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.ml.evaluation + +import org.apache.spark.annotation.Experimental +import org.apache.spark.ml.param.{ParamMap, ParamValidators, Param} +import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} +import org.apache.spark.ml.util.{SchemaUtils, Identifiable} +import org.apache.spark.mllib.evaluation.MulticlassMetrics +import org.apache.spark.sql.{Row, DataFrame} +import org.apache.spark.sql.types.DoubleType + +/** + * :: Experimental :: + * Evaluator for multiclass classification, which expects two input columns: score and label. 
+ */ +@Experimental +class MulticlassClassificationEvaluator (override val uid: String) + extends Evaluator with HasPredictionCol with HasLabelCol { + + def this() = this(Identifiable.randomUID("mcEval")) + + /** + * param for metric name in evaluation (supports `"f1"` (default), `"precision"`, `"recall"`, + * `"weightedPrecision"`, `"weightedRecall"`) + * @group param + */ + val metricName: Param[String] = { +val allowedParams = ParamValidators.inArray(Array("f1", "precision", + "recall", "weightedPrecision", "weightedRecall")) +new Param(this, "metricName", "metric name in evaluation " + + "(f1|precision|recall|weightedPrecision|weightedRecall)", allowedParams) + } + + /** @group getParam */ + def getMetricName: String = $(metricName) + + /** @group setParam */ + def setMetricName(value: String): this.type = set(metricName, value) + + /** @group setParam */ + def setPredictionCol(value: String): this.type = set(predictionCol, value) + + /** @group setParam */ + def setLabelCol(value: String): this.type = set(labelCol, value) + + setDefault(metricName -> "f1") + + override def evaluate(dataset: DataFrame): Double = { +val schema = dataset.schema +SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) +SchemaUtils.checkColumnType(schema, $(labe
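A minimal usage sketch; `predictions` is an assumed DataFrame with "prediction" and "label" columns, e.g. the output of a fitted classifier's transform().

import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

val evaluator = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("weightedRecall")  // f1 (default), precision, recall,
                                    // weightedPrecision or weightedRecall
val metric = evaluator.evaluate(predictions)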
spark git commit: [SPARK-9214] [ML] [PySpark] support ml.NaiveBayes for Python
Repository: spark Updated Branches: refs/heads/master 4e5919bfb -> 69b62f76f [SPARK-9214] [ML] [PySpark] support ml.NaiveBayes for Python support ml.NaiveBayes for Python Author: Yanbo Liang Closes #7568 from yanboliang/spark-9214 and squashes the following commits: 5ee3fd6 [Yanbo Liang] fix typos 3ecd046 [Yanbo Liang] fix typos f9c94d1 [Yanbo Liang] change lambda_ to smoothing and fix other issues 180452a [Yanbo Liang] fix typos 7dda1f4 [Yanbo Liang] support ml.NaiveBayes for Python Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/69b62f76 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/69b62f76 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/69b62f76 Branch: refs/heads/master Commit: 69b62f76fced18efa35a107c9be4bc22eba72878 Parents: 4e5919b Author: Yanbo Liang Authored: Thu Jul 30 23:03:48 2015 -0700 Committer: Joseph K. Bradley Committed: Thu Jul 30 23:03:48 2015 -0700 -- .../spark/ml/classification/NaiveBayes.scala| 10 +- .../ml/classification/JavaNaiveBayesSuite.java | 4 +- .../ml/classification/NaiveBayesSuite.scala | 6 +- python/pyspark/ml/classification.py | 116 ++- 4 files changed, 125 insertions(+), 11 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/69b62f76/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index 1f547e4..5be35fe 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala @@ -38,11 +38,11 @@ private[ml] trait NaiveBayesParams extends PredictorParams { * (default = 1.0). * @group param */ - final val lambda: DoubleParam = new DoubleParam(this, "lambda", "The smoothing parameter.", + final val smoothing: DoubleParam = new DoubleParam(this, "smoothing", "The smoothing parameter.", ParamValidators.gtEq(0)) /** @group getParam */ - final def getLambda: Double = $(lambda) + final def getSmoothing: Double = $(smoothing) /** * The model type which is a string (case-sensitive). @@ -79,8 +79,8 @@ class NaiveBayes(override val uid: String) * Default is 1.0. * @group setParam */ - def setLambda(value: Double): this.type = set(lambda, value) - setDefault(lambda -> 1.0) + def setSmoothing(value: Double): this.type = set(smoothing, value) + setDefault(smoothing -> 1.0) /** * Set the model type using a string (case-sensitive). 
@@ -92,7 +92,7 @@ class NaiveBayes(override val uid: String) override protected def train(dataset: DataFrame): NaiveBayesModel = { val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset) -val oldModel = OldNaiveBayes.train(oldDataset, $(lambda), $(modelType)) +val oldModel = OldNaiveBayes.train(oldDataset, $(smoothing), $(modelType)) NaiveBayesModel.fromOld(oldModel, this) } http://git-wip-us.apache.org/repos/asf/spark/blob/69b62f76/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java -- diff --git a/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java b/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java index 09a9fba..a700c9c 100644 --- a/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java +++ b/mllib/src/test/java/org/apache/spark/ml/classification/JavaNaiveBayesSuite.java @@ -68,7 +68,7 @@ public class JavaNaiveBayesSuite implements Serializable { assert(nb.getLabelCol() == "label"); assert(nb.getFeaturesCol() == "features"); assert(nb.getPredictionCol() == "prediction"); -assert(nb.getLambda() == 1.0); +assert(nb.getSmoothing() == 1.0); assert(nb.getModelType() == "multinomial"); } @@ -89,7 +89,7 @@ public class JavaNaiveBayesSuite implements Serializable { }); DataFrame dataset = jsql.createDataFrame(jrdd, schema); -NaiveBayes nb = new NaiveBayes().setLambda(0.5).setModelType("multinomial"); +NaiveBayes nb = new NaiveBayes().setSmoothing(0.5).setModelType("multinomial"); NaiveBayesModel model = nb.fit(dataset); DataFrame predictionAndLabels = model.transform(dataset).select("prediction", "label"); http://git-wip-us.apache.org/repos/asf/spark/blob/69b62f76/mllib/src/test/scala/org/apache/spark/ml/classification/NaiveBayesSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/classificati
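On the Scala side the visible change is the renamed smoothing parameter; a short sketch, with `training` an assumed DataFrame of (label, features) rows.

import org.apache.spark.ml.classification.NaiveBayes

// setSmoothing replaces the old setLambda so the Scala and new Python APIs
// share one parameter name ("lambda" is reserved in Python)
val nb = new NaiveBayes()
  .setSmoothing(0.5)
  .setModelType("multinomial")
val nbModel = nb.fit(training)
val scored = nbModel.transform(training)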
spark git commit: [SPARK-9231] [MLLIB] DistributedLDAModel method for top topics per document
Repository: spark Updated Branches: refs/heads/master 6add4eddb -> 4011a9471 [SPARK-9231] [MLLIB] DistributedLDAModel method for top topics per document jira: https://issues.apache.org/jira/browse/SPARK-9231 Helper method in DistributedLDAModel of this form: ``` /** * For each document, return the top k weighted topics for that document. * return RDD of (doc ID, topic indices, topic weights) */ def topTopicsPerDocument(k: Int): RDD[(Long, Array[Int], Array[Double])] ``` Author: Yuhao Yang Closes #7785 from hhbyyh/topTopicsPerdoc and squashes the following commits: 30ad153 [Yuhao Yang] small fix fd24580 [Yuhao Yang] add topTopics per document to DistributedLDAModel Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4011a947 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4011a947 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4011a947 Branch: refs/heads/master Commit: 4011a947154d97a9ffb5a71f077481a12534d36b Parents: 6add4ed Author: Yuhao Yang Authored: Fri Jul 31 11:50:15 2015 -0700 Committer: Joseph K. Bradley Committed: Fri Jul 31 11:50:15 2015 -0700 -- .../apache/spark/mllib/clustering/LDAModel.scala | 19 ++- .../apache/spark/mllib/clustering/LDASuite.scala | 13 - 2 files changed, 30 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4011a947/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 6cfad3f..82281a0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.clustering -import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, normalize, sum} +import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, argtopk, normalize, sum} import breeze.numerics.{exp, lgamma} import org.apache.hadoop.fs.Path import org.json4s.DefaultFormats @@ -591,6 +591,23 @@ class DistributedLDAModel private[clustering] ( JavaPairRDD.fromRDD(topicDistributions.asInstanceOf[RDD[(java.lang.Long, Vector)]]) } + /** + * For each document, return the top k weighted topics for that document and their weights. + * @return RDD of (doc ID, topic indices, topic weights) + */ + def topTopicsPerDocument(k: Int): RDD[(Long, Array[Int], Array[Double])] = { +graph.vertices.filter(LDA.isDocumentVertex).map { case (docID, topicCounts) => + val topIndices = argtopk(topicCounts, k) + val sumCounts = sum(topicCounts) + val weights = if (sumCounts != 0) { +topicCounts(topIndices) / sumCounts + } else { +topicCounts(topIndices) + } + (docID.toLong, topIndices.toArray, weights.toArray) +} + } + // TODO: // override def topicDistributions(documents: RDD[(Long, Vector)]): RDD[(Long, Vector)] = ??? 
http://git-wip-us.apache.org/repos/asf/spark/blob/4011a947/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala index c43e1e5..695ee3b 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.clustering -import breeze.linalg.{DenseMatrix => BDM, max, argmax} +import breeze.linalg.{DenseMatrix => BDM, argtopk, max, argmax} import org.apache.spark.SparkFunSuite import org.apache.spark.graphx.Edge @@ -108,6 +108,17 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { assert(topicDistribution.toArray.sum ~== 1.0 absTol 1e-5) } +val top2TopicsPerDoc = model.topTopicsPerDocument(2).map(t => (t._1, (t._2, t._3))) +model.topicDistributions.join(top2TopicsPerDoc).collect().foreach { + case (docId, (topicDistribution, (indices, weights))) => +assert(indices.length == 2) +assert(weights.length == 2) +val bdvTopicDist = topicDistribution.toBreeze +val top2Indices = argtopk(bdvTopicDist, 2) +assert(top2Indices.toArray === indices) +assert(bdvTopicDist(top2Indices).toArray === weights) +} + // Check: log probabilities assert(model.logLikelihood < 0.0) assert(model.logPrior < 0.0) - To unsubscribe,
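A hedged usage sketch for the new helper, assuming `distLDAModel` is a DistributedLDAModel produced by the default EM optimizer (only `topTopicsPerDocument` itself comes from this patch):

```
import org.apache.spark.rdd.RDD

// `distLDAModel` is an assumed DistributedLDAModel obtained from LDA.run(corpus).
val top3: RDD[(Long, Array[Int], Array[Double])] = distLDAModel.topTopicsPerDocument(3)
top3.take(5).foreach { case (docId, topicIndices, topicWeights) =>
  println(s"doc $docId -> topics ${topicIndices.mkString(", ")} " +
    s"with weights ${topicWeights.mkString(", ")}")
}
```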
spark git commit: [SPARK-6885] [ML] decision tree support predict class probabilities
Repository: spark Updated Branches: refs/heads/master 4011a9471 -> e8bdcdeab [SPARK-6885] [ML] decision tree support predict class probabilities Decision tree support predict class probabilities. Implement the prediction probabilities function referred the old DecisionTree API and the [sklean API](https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/tree/tree.py#L593). I make the DecisionTreeClassificationModel inherit from ProbabilisticClassificationModel, make the predictRaw to return the raw counts vector and make raw2probabilityInPlace/predictProbability return the probabilities for each prediction. Author: Yanbo Liang Closes #7694 from yanboliang/spark-6885 and squashes the following commits: 08d5b7f [Yanbo Liang] fix ImpurityStats null parameters and raw2probabilityInPlace sum = 0 issue 2174278 [Yanbo Liang] solve merge conflicts 7e90ba8 [Yanbo Liang] fix typos 33ae183 [Yanbo Liang] fix annotation ff043d3 [Yanbo Liang] raw2probabilityInPlace should operate in-place c32d6ce [Yanbo Liang] optimize calculateImpurityStats function again 6167fb0 [Yanbo Liang] optimize calculateImpurityStats function fbbe2ec [Yanbo Liang] eliminate duplicated struct and code beb1634 [Yanbo Liang] try to eliminate impurityStats for each LearningNode 99e8943 [Yanbo Liang] code optimization 5ec3323 [Yanbo Liang] implement InformationGainAndImpurityStats 227c91b [Yanbo Liang] refactor LearningNode to store ImpurityCalculator d746ffc [Yanbo Liang] decision tree support predict class probabilities Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e8bdcdea Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e8bdcdea Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e8bdcdea Branch: refs/heads/master Commit: e8bdcdeabb2df139a656f86686cdb53c891b1f4b Parents: 4011a94 Author: Yanbo Liang Authored: Fri Jul 31 11:56:52 2015 -0700 Committer: Joseph K. 
Bradley Committed: Fri Jul 31 11:56:52 2015 -0700 -- .../classification/DecisionTreeClassifier.scala | 40 -- .../spark/ml/classification/GBTClassifier.scala | 2 +- .../classification/RandomForestClassifier.scala | 2 +- .../ml/regression/DecisionTreeRegressor.scala | 2 +- .../spark/ml/regression/GBTRegressor.scala | 2 +- .../ml/regression/RandomForestRegressor.scala | 2 +- .../scala/org/apache/spark/ml/tree/Node.scala | 80 ++-- .../spark/ml/tree/impl/RandomForest.scala | 126 --- .../spark/mllib/tree/impurity/Entropy.scala | 2 +- .../apache/spark/mllib/tree/impurity/Gini.scala | 2 +- .../spark/mllib/tree/impurity/Impurity.scala| 2 +- .../spark/mllib/tree/impurity/Variance.scala| 2 +- .../mllib/tree/model/InformationGainStats.scala | 61 - .../DecisionTreeClassifierSuite.scala | 30 - .../ml/classification/GBTClassifierSuite.scala | 2 +- .../RandomForestClassifierSuite.scala | 2 +- 16 files changed, 229 insertions(+), 130 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e8bdcdea/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index 36fe1bd..f27cfd0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -18,12 +18,11 @@ package org.apache.spark.ml.classification import org.apache.spark.annotation.Experimental -import org.apache.spark.ml.{PredictionModel, Predictor} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.tree.{DecisionTreeModel, DecisionTreeParams, Node, TreeClassifierParams} import org.apache.spark.ml.tree.impl.RandomForest import org.apache.spark.ml.util.{Identifiable, MetadataUtils} -import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo, Strategy => OldStrategy} import org.apache.spark.mllib.tree.model.{DecisionTreeModel => OldDecisionTreeModel} @@ -39,7 +38,7 @@ import org.apache.spark.sql.DataFrame */ @Experimental final class DecisionTreeClassifier(override val uid: String) - extends Predictor[Vector, DecisionTreeClassifier, DecisionTreeClassificationModel] + extends ProbabilisticClassifier[Vector, DecisionTreeClassifier, DecisionTreeClassificationModel] with DecisionTreeParams with TreeClassifierParams { def this() = this
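A short sketch of what the change enables from the DataFrame side. The `train`/`test` DataFrames are assumptions, and the label column is assumed to have been indexed (e.g. by StringIndexer) so the class-count metadata the tree classifiers expect is present:

```
import org.apache.spark.ml.classification.DecisionTreeClassifier

// `train` and `test` are assumed DataFrames with "features" and an indexed "label" column.
val dtModel = new DecisionTreeClassifier().setMaxDepth(3).fit(train)

// Since the model now extends ProbabilisticClassificationModel, transform() also emits
// "rawPrediction" (raw per-class counts) and "probability" (normalized counts) columns.
dtModel.transform(test)
  .select("rawPrediction", "probability", "prediction")
  .show()
```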
spark git commit: [SPARK-9481] Add logLikelihood to LocalLDAModel
Repository: spark Updated Branches: refs/heads/master d04634701 -> a8340fa7d [SPARK-9481] Add logLikelihood to LocalLDAModel jkbradley Exposes `bound` (variational log likelihood bound) through public API as `logLikelihood`. Also adds unit tests, some DRYing of `LDASuite`, and includes unit tests mentioned in #7760 Author: Feynman Liang Closes #7801 from feynmanliang/SPARK-9481-logLikelihood and squashes the following commits: 6d1b2c9 [Feynman Liang] Negate perplexity definition 5f62b20 [Feynman Liang] Add logLikelihood Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a8340fa7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a8340fa7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a8340fa7 Branch: refs/heads/master Commit: a8340fa7df17e3f0a3658f8b8045ab840845a72a Parents: d046347 Author: Feynman Liang Authored: Fri Jul 31 12:12:22 2015 -0700 Committer: Joseph K. Bradley Committed: Fri Jul 31 12:12:22 2015 -0700 -- .../spark/mllib/clustering/LDAModel.scala | 20 ++- .../spark/mllib/clustering/LDASuite.scala | 129 ++- 2 files changed, 78 insertions(+), 71 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a8340fa7/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 82281a0..ff7035d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -217,22 +217,28 @@ class LocalLDAModel private[clustering] ( LocalLDAModel.SaveLoadV1_0.save(sc, path, topicsMatrix, docConcentration, topicConcentration, gammaShape) } - // TODO - // override def logLikelihood(documents: RDD[(Long, Vector)]): Double = ??? + + // TODO: declare in LDAModel and override once implemented in DistributedLDAModel + /** + * Calculates a lower bound on the log likelihood of the entire corpus. + * @param documents test corpus to use for calculating log likelihood + * @return variational lower bound on the log likelihood of the entire corpus + */ + def logLikelihood(documents: RDD[(Long, Vector)]): Double = bound(documents, +docConcentration, topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k, +vocabSize) /** - * Calculate the log variational bound on perplexity. See Equation (16) in original Online + * Calculate an upper bound bound on perplexity. See Equation (16) in original Online * LDA paper. 
* @param documents test corpus to use for calculating perplexity - * @return the log perplexity per word + * @return variational upper bound on log perplexity per word */ def logPerplexity(documents: RDD[(Long, Vector)]): Double = { val corpusWords = documents .map { case (_, termCounts) => termCounts.toArray.sum } .sum() -val batchVariationalBound = bound(documents, docConcentration, - topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k, vocabSize) -val perWordBound = batchVariationalBound / corpusWords +val perWordBound = -logLikelihood(documents) / corpusWords perWordBound } http://git-wip-us.apache.org/repos/asf/spark/blob/a8340fa7/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala index 695ee3b..79d2a1c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala @@ -210,16 +210,7 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { } test("OnlineLDAOptimizer with toy data") { -def toydata: Array[(Long, Vector)] = Array( - Vectors.sparse(6, Array(0, 1), Array(1, 1)), - Vectors.sparse(6, Array(1, 2), Array(1, 1)), - Vectors.sparse(6, Array(0, 2), Array(1, 1)), - Vectors.sparse(6, Array(3, 4), Array(1, 1)), - Vectors.sparse(6, Array(3, 5), Array(1, 1)), - Vectors.sparse(6, Array(4, 5), Array(1, 1)) -).zipWithIndex.map { case (wordCounts, docId) => (docId.toLong, wordCounts) } - -val docs = sc.parallelize(toydata) +val docs = sc.parallelize(toyData) val op = new OnlineLDAOptimizer().setMiniBatchFraction(1).setTau0(1024).setKappa(0.51) .setGammaShape(1e10) val lda = new LDA().setK(2) @@ -242,30 +233,45 @@ class LDASuite ex
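A minimal sketch of the new public API, assuming `localModel` is a LocalLDAModel (e.g. from the online optimizer or DistributedLDAModel.toLocal) and `docs` is a held-out RDD[(Long, Vector)] corpus:

```
// `localModel` and `docs` are assumptions; only logLikelihood/logPerplexity come from this patch.
val ll = localModel.logLikelihood(docs)   // variational lower bound on the corpus log likelihood
val lp = localModel.logPerplexity(docs)   // variational upper bound on per-word log perplexity
println(s"logLikelihood (lower bound) = $ll, logPerplexity (upper bound) = $lp")
```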
spark git commit: [SPARK-9246] [MLLIB] DistributedLDAModel predict top docs per topic
Repository: spark Updated Branches: refs/heads/master c0686668a -> 3c0d2e552 [SPARK-9246] [MLLIB] DistributedLDAModel predict top docs per topic Add topDocumentsPerTopic to DistributedLDAModel. Add ScalaDoc and unit tests. Author: Meihua Wu Closes #7769 from rotationsymmetry/SPARK-9246 and squashes the following commits: 1029e79c [Meihua Wu] clean up code comments a023b82 [Meihua Wu] Update tests to use Long for doc index. 91e5998 [Meihua Wu] Use Long for doc index. b9f70cf [Meihua Wu] Revise topDocumentsPerTopic 26ff3f6 [Meihua Wu] Add topDocumentsPerTopic, scala doc and unit tests Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3c0d2e55 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3c0d2e55 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3c0d2e55 Branch: refs/heads/master Commit: 3c0d2e55210735e0df2f8febb5f63c224af230e3 Parents: c068666 Author: Meihua Wu Authored: Fri Jul 31 13:01:10 2015 -0700 Committer: Joseph K. Bradley Committed: Fri Jul 31 13:01:10 2015 -0700 -- .../spark/mllib/clustering/LDAModel.scala | 37 .../spark/mllib/clustering/LDASuite.scala | 22 2 files changed, 59 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3c0d2e55/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index ff7035d..0cdac84 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -516,6 +516,43 @@ class DistributedLDAModel private[clustering] ( } } + /** + * Return the top documents for each topic + * + * This is approximate; it may not return exactly the top-weighted documents for each topic. + * To get a more precise set of top documents, increase maxDocumentsPerTopic. + * + * @param maxDocumentsPerTopic Maximum number of documents to collect for each topic. + * @return Array over topics. Each element represent as a pair of matching arrays: + * (IDs for the documents, weights of the topic in these documents). + * For each topic, documents are sorted in order of decreasing topic weights. + */ + def topDocumentsPerTopic(maxDocumentsPerTopic: Int): Array[(Array[Long], Array[Double])] = { +val numTopics = k +val topicsInQueues: Array[BoundedPriorityQueue[(Double, Long)]] = + topicDistributions.mapPartitions { docVertices => +// For this partition, collect the most common docs for each topic in queues: +// queues(topic) = queue of (doc topic, doc ID). +val queues = + Array.fill(numTopics)(new BoundedPriorityQueue[(Double, Long)](maxDocumentsPerTopic)) +for ((docId, docTopics) <- docVertices) { + var topic = 0 + while (topic < numTopics) { +queues(topic) += (docTopics(topic) -> docId) +topic += 1 + } +} +Iterator(queues) + }.treeReduce { (q1, q2) => +q1.zip(q2).foreach { case (a, b) => a ++= b } +q1 + } +topicsInQueues.map { q => + val (docTopics, docs) = q.toArray.sortBy(-_._1).unzip + (docs.toArray, docTopics.toArray) +} + } + // TODO // override def logLikelihood(documents: RDD[(Long, Vector)]): Double = ??? 
http://git-wip-us.apache.org/repos/asf/spark/blob/3c0d2e55/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala index 79d2a1c..f2b9470 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/LDASuite.scala @@ -122,6 +122,28 @@ class LDASuite extends SparkFunSuite with MLlibTestSparkContext { // Check: log probabilities assert(model.logLikelihood < 0.0) assert(model.logPrior < 0.0) + +// Check: topDocumentsPerTopic +// Compare it with top documents per topic derived from topicDistributions +val topDocsByTopicDistributions = { n: Int => + Range(0, k).map { topic => +val (doc, docWeights) = topicDistributions.sortBy(-_._2(topic)).take(n).unzip +(doc.toArray, docWeights.map(_(topic)).toArray) + }.toArray +} + +// Top 3 documents per topic +model.topDocumentsPerTopic(3).zip(topDocsByTopicDistributions(3)).foreach {case (t1, t2) => + assert(t1
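A hedged usage sketch, again assuming `distLDAModel` is a DistributedLDAModel trained with the EM optimizer:

```
// Only topDocumentsPerTopic comes from this patch; `distLDAModel` is an assumption.
val topDocs: Array[(Array[Long], Array[Double])] = distLDAModel.topDocumentsPerTopic(3)
topDocs.zipWithIndex.foreach { case ((docIds, weights), topic) =>
  println(s"topic $topic: docs = ${docIds.mkString(", ")}, weights = ${weights.mkString(", ")}")
}
```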
spark git commit: [SPARK-9308] [ML] ml.NaiveBayesModel support predicting class probabilities
Repository: spark Updated Branches: refs/heads/master 060c79aab -> fbef566a1 [SPARK-9308] [ML] ml.NaiveBayesModel support predicting class probabilities Make NaiveBayesModel support predicting class probabilities, inherit from ProbabilisticClassificationModel. Author: Yanbo Liang Closes #7672 from yanboliang/spark-9308 and squashes the following commits: 25e224c [Yanbo Liang] raw2probabilityInPlace should operate in-place 3ee56d6 [Yanbo Liang] change predictRaw and raw2probabilityInPlace c07e7a2 [Yanbo Liang] ml.NaiveBayesModel support predicting class probabilities Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fbef566a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fbef566a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fbef566a Branch: refs/heads/master Commit: fbef566a107b47e5fddde0ea65b8587d5039062d Parents: 060c79a Author: Yanbo Liang Authored: Fri Jul 31 13:11:42 2015 -0700 Committer: Joseph K. Bradley Committed: Fri Jul 31 13:11:42 2015 -0700 -- .../spark/ml/classification/NaiveBayes.scala| 65 +++- .../ml/classification/NaiveBayesSuite.scala | 54 +++- 2 files changed, 101 insertions(+), 18 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fbef566a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index 5be35fe..b46b676 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala @@ -69,7 +69,7 @@ private[ml] trait NaiveBayesParams extends PredictorParams { * The input feature values must be nonnegative. 
*/ class NaiveBayes(override val uid: String) - extends Predictor[Vector, NaiveBayes, NaiveBayesModel] + extends ProbabilisticClassifier[Vector, NaiveBayes, NaiveBayesModel] with NaiveBayesParams { def this() = this(Identifiable.randomUID("nb")) @@ -106,7 +106,7 @@ class NaiveBayesModel private[ml] ( override val uid: String, val pi: Vector, val theta: Matrix) - extends PredictionModel[Vector, NaiveBayesModel] with NaiveBayesParams { + extends ProbabilisticClassificationModel[Vector, NaiveBayesModel] with NaiveBayesParams { import OldNaiveBayes.{Bernoulli, Multinomial} @@ -129,29 +129,62 @@ class NaiveBayesModel private[ml] ( throw new UnknownError(s"Invalid modelType: ${$(modelType)}.") } - override protected def predict(features: Vector): Double = { + override val numClasses: Int = pi.size + + private def multinomialCalculation(features: Vector) = { +val prob = theta.multiply(features) +BLAS.axpy(1.0, pi, prob) +prob + } + + private def bernoulliCalculation(features: Vector) = { +features.foreachActive((_, value) => + if (value != 0.0 && value != 1.0) { +throw new SparkException( + s"Bernoulli naive Bayes requires 0 or 1 feature values but found $features.") + } +) +val prob = thetaMinusNegTheta.get.multiply(features) +BLAS.axpy(1.0, pi, prob) +BLAS.axpy(1.0, negThetaSum.get, prob) +prob + } + + override protected def predictRaw(features: Vector): Vector = { $(modelType) match { case Multinomial => -val prob = theta.multiply(features) -BLAS.axpy(1.0, pi, prob) -prob.argmax +multinomialCalculation(features) case Bernoulli => -features.foreachActive{ (index, value) => - if (value != 0.0 && value != 1.0) { -throw new SparkException( - s"Bernoulli naive Bayes requires 0 or 1 feature values but found $features") - } -} -val prob = thetaMinusNegTheta.get.multiply(features) -BLAS.axpy(1.0, pi, prob) -BLAS.axpy(1.0, negThetaSum.get, prob) -prob.argmax +bernoulliCalculation(features) case _ => // This should never happen. throw new UnknownError(s"Invalid modelType: ${$(modelType)}.") } } + override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = { +rawPrediction match { + case dv: DenseVector => +var i = 0 +val size = dv.size +val maxLog = dv.values.max +while (i < size) { + dv.values(i) = math.exp(dv.values(i) - maxLog) + i += 1 +} +val probSum = dv.values.sum +i = 0 +while (i < size) { + dv.values(i) = dv.values(i) / probSum + i += 1 +} +dv + case sv: SparseVector => +throw new RuntimeException(
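A small sketch of the user-facing effect, with `train` and `test` as assumed DataFrames holding "label" and nonnegative "features" columns:

```
import org.apache.spark.ml.classification.NaiveBayes

// `train` and `test` are assumed DataFrames with "label" and nonnegative "features" columns.
val nbModel = new NaiveBayes().setSmoothing(1.0).fit(train)

// Because the model now extends ProbabilisticClassificationModel, transform() adds
// "rawPrediction" (unnormalized log class scores) and "probability" columns.
nbModel.transform(test)
  .select("rawPrediction", "probability", "prediction")
  .show()
```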
spark git commit: [SPARK-8936] [MLLIB] OnlineLDA document-topic Dirichlet hyperparameter optimization
Repository: spark Updated Branches: refs/heads/master 4d5a6e7b6 -> f51fd6fbb [SPARK-8936] [MLLIB] OnlineLDA document-topic Dirichlet hyperparameter optimization Adds `alpha` (document-topic Dirichlet parameter) hyperparameter optimization to `OnlineLDAOptimizer` following Huang: Maximum Likelihood Estimation of Dirichlet Distribution Parameters. Also introduces a private `setSampleWithReplacement` to `OnlineLDAOptimizer` for unit testing purposes. Author: Feynman Liang Closes #7836 from feynmanliang/SPARK-8936-alpha-optimize and squashes the following commits: 4bef484 [Feynman Liang] Documentation improvements c3c6c1d [Feynman Liang] Fix docs 151e859 [Feynman Liang] Fix style fa77518 [Feynman Liang] Hyperparameter optimization Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f51fd6fb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f51fd6fb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f51fd6fb Branch: refs/heads/master Commit: f51fd6fbb4d9822502f98b312251e317d757bc3a Parents: 4d5a6e7 Author: Feynman Liang Authored: Fri Jul 31 18:36:22 2015 -0700 Committer: Joseph K. Bradley Committed: Fri Jul 31 18:36:22 2015 -0700 -- .../spark/mllib/clustering/LDAOptimizer.scala | 75 +--- .../spark/mllib/clustering/LDASuite.scala | 34 + 2 files changed, 99 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f51fd6fb/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala index d6f8b29..b0e14cb 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala @@ -19,8 +19,8 @@ package org.apache.spark.mllib.clustering import java.util.Random -import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, normalize, sum} -import breeze.numerics.{abs, exp} +import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, all, normalize, sum} +import breeze.numerics.{trigamma, abs, exp} import breeze.stats.distributions.{Gamma, RandBasis} import org.apache.spark.annotation.DeveloperApi @@ -239,22 +239,26 @@ final class OnlineLDAOptimizer extends LDAOptimizer { /** alias for docConcentration */ private var alpha: Vector = Vectors.dense(0) - /** (private[clustering] for debugging) Get docConcentration */ + /** (for debugging) Get docConcentration */ private[clustering] def getAlpha: Vector = alpha /** alias for topicConcentration */ private var eta: Double = 0 - /** (private[clustering] for debugging) Get topicConcentration */ + /** (for debugging) Get topicConcentration */ private[clustering] def getEta: Double = eta private var randomGenerator: java.util.Random = null + /** (for debugging) Whether to sample mini-batches with replacement. 
(default = true) */ + private var sampleWithReplacement: Boolean = true + // Online LDA specific parameters // Learning rate is: (tau0 + t)^{-kappa} private var tau0: Double = 1024 private var kappa: Double = 0.51 private var miniBatchFraction: Double = 0.05 + private var optimizeAlpha: Boolean = false // internal data structure private var docs: RDD[(Long, Vector)] = null @@ -262,7 +266,7 @@ final class OnlineLDAOptimizer extends LDAOptimizer { /** Dirichlet parameter for the posterior over topics */ private var lambda: BDM[Double] = null - /** (private[clustering] for debugging) Get parameter for topics */ + /** (for debugging) Get parameter for topics */ private[clustering] def getLambda: BDM[Double] = lambda /** Current iteration (count of invocations of [[next()]]) */ @@ -325,7 +329,22 @@ final class OnlineLDAOptimizer extends LDAOptimizer { } /** - * (private[clustering]) + * Optimize alpha, indicates whether alpha (Dirichlet parameter for document-topic distribution) + * will be optimized during training. + */ + def getOptimzeAlpha: Boolean = this.optimizeAlpha + + /** + * Sets whether to optimize alpha parameter during training. + * + * Default: false + */ + def setOptimzeAlpha(optimizeAlpha: Boolean): this.type = { +this.optimizeAlpha = optimizeAlpha +this + } + + /** * Set the Dirichlet parameter for the posterior over topics. * This is only used for testing now. In the future, it can help support training stop/resume. */ @@ -335,7 +354,6 @@ final class OnlineLDAOptimizer extends LDAOptimizer { } /** - * (private[clustering]) * Used for random initialization of the variationa
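A sketch of turning on the new hyperparameter optimization, using the setter name exactly as it is spelled in this patch; `corpus` is an assumed RDD[(Long, Vector)] of (doc ID, term-count vector) pairs:

```
import org.apache.spark.mllib.clustering.{LDA, OnlineLDAOptimizer}

// `corpus` is an assumed RDD[(Long, Vector)] of (doc ID, term counts).
val optimizer = new OnlineLDAOptimizer()
  .setMiniBatchFraction(0.05)
  .setOptimzeAlpha(true)   // setter name as defined in this patch; defaults to false
val ldaModel = new LDA()
  .setK(10)
  .setOptimizer(optimizer)
  .run(corpus)
```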
spark git commit: [SPARK-7446] [MLLIB] Add inverse transform for string indexer
Repository: spark Updated Branches: refs/heads/master 60ea7ab4b -> 65038973a [SPARK-7446] [MLLIB] Add inverse transform for string indexer It is useful to convert the encoded indices back to their string representation for result inspection. We can add a function which creates an inverse transformation. Author: Holden Karau Closes #6339 from holdenk/SPARK-7446-inverse-transform-for-string-indexer and squashes the following commits: 7cdf915 [Holden Karau] scala style comment fix b9cffb6 [Holden Karau] Update the labels param to have the metadata note 6a38edb [Holden Karau] Setting the default needs to come after the value gets defined 9e241d8 [Holden Karau] use Array.empty 21c8cfa [Holden Karau] Merge branch 'master' into SPARK-7446-inverse-transform-for-string-indexer 64dd3a3 [Holden Karau] Merge branch 'master' into SPARK-7446-inverse-transform-for-string-indexer 4f06c59 [Holden Karau] Fix comment styles, use empty array as the default, etc. a60c0e3 [Holden Karau] CR feedback (remove old constructor, add a note about use of setLabels) 1987b95 [Holden Karau] Use default copy 71e8d66 [Holden Karau] Make labels a local param for StringIndexerInverse 8450d0b [Holden Karau] Use the labels param in StringIndexerInverse 7464019 [Holden Karau] Add a labels param 868b1a9 [Holden Karau] Update scaladoc since we don't have labelsCol anymore 5aa38bf [Holden Karau] Add an inverse test using only meta data, pass labels when calling inverse method f3e0c64 [Holden Karau] CR feedback ebed932 [Holden Karau] Add Experimental tag and some scaladocs. Also don't require that the inputCol has the metadata on it, instead have the labelsCol specified when creating the inverse. 03ebf95 [Holden Karau] Add explicit type for invert function ecc65e0 [Holden Karau] Read the metadata correctly, use the array, pass the test a42d773 [Holden Karau] Fix test to supply cols as per new invert method 16cc3c3 [Holden Karau] Add an invert method d4bcb20 [Holden Karau] Make the inverse string indexer into a transformer (still needs test updates but compiles) e8bf3ad [Holden Karau] Merge branch 'master' into SPARK-7446-inverse-transform-for-string-indexer c3fdee1 [Holden Karau] Some WIP refactoring based on jkbradley's CR feedback. Definite work-in-progress 557bef8 [Holden Karau] Instead of using a private inverse transform, add an invert function so we can use it in a pipeline 88779c1 [Holden Karau] fix long line 78b28c1 [Holden Karau] Finish reverse part and add a test :) bb16a6a [Holden Karau] Some progress Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/65038973 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/65038973 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/65038973 Branch: refs/heads/master Commit: 65038973a17904e0e04d453799ec108af240fbab Parents: 60ea7ab Author: Holden Karau Authored: Sat Aug 1 01:09:38 2015 -0700 Committer: Joseph K. 
Bradley Committed: Sat Aug 1 01:09:38 2015 -0700 -- .../apache/spark/ml/feature/StringIndexer.scala | 108 ++- .../spark/ml/feature/StringIndexerSuite.scala | 13 +++ 2 files changed, 118 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/65038973/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index bf7be36..ebfa972 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -20,13 +20,14 @@ package org.apache.spark.ml.feature import org.apache.spark.SparkException import org.apache.spark.annotation.Experimental import org.apache.spark.ml.{Estimator, Model} -import org.apache.spark.ml.attribute.NominalAttribute +import org.apache.spark.ml.attribute.{Attribute, NominalAttribute} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ -import org.apache.spark.ml.util.Identifiable +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.util.{Identifiable, MetadataUtils} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ -import org.apache.spark.sql.types.{NumericType, StringType, StructType} +import org.apache.spark.sql.types.{DoubleType, NumericType, StringType, StructType} import org.apache.spark.util.collection.OpenHashMap /** @@ -151,4 +152,105 @@ class StringIndexerModel private[ml] ( val copied = new StringIndexerModel(uid, labels) copyValues(copied, extra) } + + /** + * Return a model to perform the inverse transformation. + * Note: By default we keep the original colum
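A hypothetical usage sketch. The exact signature of the new invert method is truncated in the diff above, so the (inputCol, outputCol) form below is an assumption based on the commit notes ("supply cols as per new invert method"); `df` with a string column "category" is also assumed:

```
import org.apache.spark.ml.feature.StringIndexer

// `df` is an assumed DataFrame with a string column "category".
val indexerModel = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")
  .fit(df)
val indexed = indexerModel.transform(df)

// Hypothetical call: maps the encoded indices back to their original string labels.
val inverter = indexerModel.invert("categoryIndex", "originalCategory")
inverter.transform(indexed).show()
```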
spark git commit: [SPARK-9530] [MLLIB] ScalaDoc should not indicate LDAModel.describeTopics and DistributedLDAModel.topDocumentsPerTopic as approximate
Repository: spark Updated Branches: refs/heads/master 3d1535d48 -> 84a6982b3 [SPARK-9530] [MLLIB] ScalaDoc should not indicate LDAModel.describeTopics and DistributedLDAModel.topDocumentsPerTopic as approximate Remove ScalaDoc that suggests describeTopics and topDocumentsPerTopic are approximate. cc jkbradley Author: Meihua Wu Closes #7858 from rotationsymmetry/SPARK-9530 and squashes the following commits: b574923 [Meihua Wu] Remove ScalaDoc that suggests describeTopics and topDocumentsPerTopic are approximate. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/84a6982b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/84a6982b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/84a6982b Branch: refs/heads/master Commit: 84a6982b35d87483bdf70ef4423cc4c8e0c3feb1 Parents: 3d1535d Author: Meihua Wu Authored: Sat Aug 1 17:13:28 2015 -0700 Committer: Joseph K. Bradley Committed: Sat Aug 1 17:13:28 2015 -0700 -- .../scala/org/apache/spark/mllib/clustering/LDAModel.scala| 7 --- 1 file changed, 7 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/84a6982b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 0cdac84..6af90d7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -86,10 +86,6 @@ abstract class LDAModel private[clustering] extends Saveable { /** * Return the topics described by weighted terms. * - * This limits the number of terms per topic. - * This is approximate; it may not return exactly the top-weighted terms for each topic. - * To get a more precise set of top terms, increase maxTermsPerTopic. - * * @param maxTermsPerTopic Maximum number of terms to collect for each topic. * @return Array over topics. Each topic is represented as a pair of matching arrays: * (term indices, term weights in topic). @@ -519,9 +515,6 @@ class DistributedLDAModel private[clustering] ( /** * Return the top documents for each topic * - * This is approximate; it may not return exactly the top-weighted documents for each topic. - * To get a more precise set of top documents, increase maxDocumentsPerTopic. - * * @param maxDocumentsPerTopic Maximum number of documents to collect for each topic. * @return Array over topics. Each element represent as a pair of matching arrays: * (IDs for the documents, weights of the topic in these documents). - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9536] [SPARK-9537] [SPARK-9538] [ML] [PYSPARK] ml.classification support raw and probability prediction for PySpark
Repository: spark Updated Branches: refs/heads/master 114ff926f -> 4cdd8ecd6 [SPARK-9536] [SPARK-9537] [SPARK-9538] [ML] [PYSPARK] ml.classification support raw and probability prediction for PySpark Make the following ml.classification class support raw and probability prediction for PySpark: ```scala NaiveBayesModel DecisionTreeClassifierModel LogisticRegressionModel ``` Author: Yanbo Liang Closes #7866 from yanboliang/spark-9536-9537 and squashes the following commits: 2934dab [Yanbo Liang] ml.NaiveBayes, ml.DecisionTreeClassifier and ml.LogisticRegression support probability prediction Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4cdd8ecd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4cdd8ecd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4cdd8ecd Branch: refs/heads/master Commit: 4cdd8ecd66769316e8593da7790b84cd867968cd Parents: 114ff92 Author: Yanbo Liang Authored: Sun Aug 2 22:19:27 2015 -0700 Committer: Joseph K. Bradley Committed: Sun Aug 2 22:19:27 2015 -0700 -- python/pyspark/ml/classification.py | 61 ++-- 1 file changed, 43 insertions(+), 18 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4cdd8ecd/python/pyspark/ml/classification.py -- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 93ffcd4..b5814f7 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -31,7 +31,7 @@ __all__ = ['LogisticRegression', 'LogisticRegressionModel', 'DecisionTreeClassif @inherit_doc class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasMaxIter, - HasRegParam, HasTol, HasProbabilityCol): + HasRegParam, HasTol, HasProbabilityCol, HasRawPredictionCol): """ Logistic regression. @@ -42,13 +42,18 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti ... Row(label=0.0, features=Vectors.sparse(1, [], []))]).toDF() >>> lr = LogisticRegression(maxIter=5, regParam=0.01) >>> model = lr.fit(df) ->>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0))]).toDF() ->>> model.transform(test0).head().prediction -0.0 >>> model.weights DenseVector([5.5...]) >>> model.intercept -2.68... 
+>>> test0 = sc.parallelize([Row(features=Vectors.dense(-1.0))]).toDF() +>>> result = model.transform(test0).head() +>>> result.prediction +0.0 +>>> result.probability +DenseVector([0.99..., 0.00...]) +>>> result.rawPrediction +DenseVector([8.22..., -8.22...]) >>> test1 = sc.parallelize([Row(features=Vectors.sparse(1, [0], [1.0]))]).toDF() >>> model.transform(test1).head().prediction 1.0 @@ -70,11 +75,11 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, - threshold=0.5, probabilityCol="probability"): + threshold=0.5, probabilityCol="probability", rawPredictionCol="rawPrediction"): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ - threshold=0.5, probabilityCol="probability") + threshold=0.5, probabilityCol="probability", rawPredictionCol="rawPrediction") """ super(LogisticRegression, self).__init__() self._java_obj = self._new_java_obj( @@ -98,11 +103,11 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti @keyword_only def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, - threshold=0.5, probabilityCol="probability"): + threshold=0.5, probabilityCol="probability", rawPredictionCol="rawPrediction"): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, regParam=0.1, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ - threshold=0.5, probabilityCol="probability") + threshold=0.5, probabilityCol="probability", rawPredictionCol="rawPrediction") Sets params for logistic regression. """ kwargs = self.setParams._input_kwargs @@ -187,7 +192,8 @@
spark git commit: [SPARK-9528] [ML] Changed RandomForestClassifier to extend ProbabilisticClassifier
Repository: spark Updated Branches: refs/heads/master 8be198c86 -> 69f5a7c93 [SPARK-9528] [ML] Changed RandomForestClassifier to extend ProbabilisticClassifier RandomForestClassifier now outputs rawPrediction based on tree probabilities, plus probability column computed from normalized rawPrediction. CC: holdenk Author: Joseph K. Bradley Closes #7859 from jkbradley/rf-prob and squashes the following commits: 6c28f51 [Joseph K. Bradley] Changed RandomForestClassifier to extend ProbabilisticClassifier Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/69f5a7c9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/69f5a7c9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/69f5a7c9 Branch: refs/heads/master Commit: 69f5a7c934ac553ed52c00679b800bcffe83c1d6 Parents: 8be198c Author: Joseph K. Bradley Authored: Mon Aug 3 10:46:34 2015 -0700 Committer: Joseph K. Bradley Committed: Mon Aug 3 10:46:34 2015 -0700 -- .../classification/DecisionTreeClassifier.scala | 8 + .../ProbabilisticClassifier.scala | 27 +- .../classification/RandomForestClassifier.scala | 37 ++-- .../RandomForestClassifierSuite.scala | 36 ++- 4 files changed, 81 insertions(+), 27 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/69f5a7c9/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index f27cfd0..f2b992f 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -131,13 +131,7 @@ final class DecisionTreeClassificationModel private[ml] ( override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = { rawPrediction match { case dv: DenseVector => -var i = 0 -val size = dv.size -val sum = dv.values.sum -while (i < size) { - dv.values(i) = if (sum != 0) dv.values(i) / sum else 0.0 - i += 1 -} +ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(dv) dv case sv: SparseVector => throw new RuntimeException("Unexpected error in DecisionTreeClassificationModel:" + http://git-wip-us.apache.org/repos/asf/spark/blob/69f5a7c9/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala index dad4511..f9c9c23 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/ProbabilisticClassifier.scala @@ -20,7 +20,7 @@ package org.apache.spark.ml.classification import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.SchemaUtils -import org.apache.spark.mllib.linalg.{Vector, VectorUDT} +import org.apache.spark.mllib.linalg.{SparseVector, DenseVector, Vector, VectorUDT} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, DataType, StructType} @@ -175,3 +175,28 @@ private[spark] abstract class ProbabilisticClassificationModel[ */ protected def probability2prediction(probability: Vector): Double = probability.argmax } + +private[ml] 
object ProbabilisticClassificationModel { + + /** + * Normalize a vector of raw predictions to be a multinomial probability vector, in place. + * + * The input raw predictions should be >= 0. + * The output vector sums to 1, unless the input vector is all-0 (in which case the output is + * all-0 too). + * + * NOTE: This is NOT applicable to all models, only ones which effectively use class + * instance counts for raw predictions. + */ + def normalizeToProbabilitiesInPlace(v: DenseVector): Unit = { +val sum = v.values.sum +if (sum != 0) { + var i = 0 + val size = v.size + while (i < size) { +v.values(i) /= sum +i += 1 + } +} + } +} http://git-wip-us.apache.org/repos/asf/spark/blob/69f5a7c9/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala ---
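A short sketch of the behavior change from the user's side. The `train`/`test` DataFrames are assumptions, with the label column assumed indexed (e.g. via StringIndexer) so class-count metadata is available:

```
import org.apache.spark.ml.classification.RandomForestClassifier

// `train` and `test` are assumed DataFrames with "features" and an indexed "label" column.
val rfModel = new RandomForestClassifier().setNumTrees(20).fit(train)

// "rawPrediction" now aggregates the trees' class probabilities, and "probability"
// is the normalized version of that vector, as described above.
rfModel.transform(test)
  .select("rawPrediction", "probability", "prediction")
  .show()
```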
spark git commit: [SPARK-5133] [ML] Added featureImportance to RandomForestClassifier and Regressor
Repository: spark Updated Branches: refs/heads/master 703e44bff -> ff9169a00 [SPARK-5133] [ML] Added featureImportance to RandomForestClassifier and Regressor Added featureImportance to RandomForestClassifier and Regressor. This follows the scikit-learn implementation here: [https://github.com/scikit-learn/scikit-learn/blob/a95203b249c1cf392f86d001ad999e29b2392739/sklearn/tree/_tree.pyx#L3341] CC: yanboliang Would you mind taking a look? Thanks! Author: Joseph K. Bradley Author: Feynman Liang Closes #7838 from jkbradley/dt-feature-importance and squashes the following commits: 72a167a [Joseph K. Bradley] fixed unit test 86cea5f [Joseph K. Bradley] Modified RF featuresImportances to return Vector instead of Map 5aa74f0 [Joseph K. Bradley] finally fixed unit test for real 33df5db [Joseph K. Bradley] fix unit test 42a2d3b [Joseph K. Bradley] fix unit test fe94e72 [Joseph K. Bradley] modified feature importance unit tests cc693ee [Feynman Liang] Add classifier tests 79a6f87 [Feynman Liang] Compare dense vectors in test 21d01fc [Feynman Liang] Added failing SKLearn test ac0b254 [Joseph K. Bradley] Added featureImportance to RandomForestClassifier/Regressor. Need to add unit tests Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ff9169a0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ff9169a0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ff9169a0 Branch: refs/heads/master Commit: ff9169a002f1b75231fd25b7d04157a912503038 Parents: 703e44b Author: Joseph K. Bradley Authored: Mon Aug 3 12:17:46 2015 -0700 Committer: Joseph K. Bradley Committed: Mon Aug 3 12:17:46 2015 -0700 -- .../classification/RandomForestClassifier.scala | 30 +- .../ml/regression/RandomForestRegressor.scala | 33 -- .../scala/org/apache/spark/ml/tree/Node.scala | 19 +++- .../spark/ml/tree/impl/RandomForest.scala | 92 .../org/apache/spark/ml/tree/treeModels.scala | 6 ++ .../JavaRandomForestClassifierSuite.java| 2 + .../JavaRandomForestRegressorSuite.java | 2 + .../RandomForestClassifierSuite.scala | 31 +- .../org/apache/spark/ml/impl/TreeTests.scala| 18 .../regression/RandomForestRegressorSuite.scala | 27 - .../spark/ml/tree/impl/RandomForestSuite.scala | 107 +++ 11 files changed, 351 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ff9169a0/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala index 56e80cc..b59826a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala @@ -95,7 +95,8 @@ final class RandomForestClassifier(override val uid: String) val trees = RandomForest.run(oldDataset, strategy, getNumTrees, getFeatureSubsetStrategy, getSeed) .map(_.asInstanceOf[DecisionTreeClassificationModel]) -new RandomForestClassificationModel(trees, numClasses) +val numFeatures = oldDataset.first().features.size +new RandomForestClassificationModel(trees, numFeatures, numClasses) } override def copy(extra: ParamMap): RandomForestClassifier = defaultCopy(extra) @@ -118,11 +119,13 @@ object RandomForestClassifier { * features. * @param _trees Decision trees in the ensemble. * Warning: These have null parents. 
+ * @param numFeatures Number of features used by this model */ @Experimental final class RandomForestClassificationModel private[ml] ( override val uid: String, private val _trees: Array[DecisionTreeClassificationModel], +val numFeatures: Int, override val numClasses: Int) extends ProbabilisticClassificationModel[Vector, RandomForestClassificationModel] with TreeEnsembleModel with Serializable { @@ -133,8 +136,8 @@ final class RandomForestClassificationModel private[ml] ( * Construct a random forest classification model, with all trees weighted equally. * @param trees Component trees */ - def this(trees: Array[DecisionTreeClassificationModel], numClasses: Int) = -this(Identifiable.randomUID("rfc"), trees, numClasses) + def this(trees: Array[DecisionTreeClassificationModel], numFeatures: Int, numClasses: Int) = +this(Identifiable.randomUID("rfc"), trees, numFeatures, numClasses) override def trees: Array[DecisionTreeModel] = _trees.asInstanceOf[Array[DecisionTreeModel]] @@ -182,13
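A hedged sketch of reading the new importances, assuming the member is exposed as `featureImportances` (the squash notes spell it slightly differently) and that `train` is a DataFrame with "features" and an indexed "label" column:

```
import org.apache.spark.ml.classification.RandomForestClassifier

// `train` is an assumed DataFrame with "features" and an indexed "label" column.
val rfModel = new RandomForestClassifier().setNumTrees(20).fit(train)

// featureImportances is assumed to return a Vector of length rfModel.numFeatures,
// following the scikit-learn-style importance referenced above.
println(s"numFeatures = ${rfModel.numFeatures}")
println(s"importances = ${rfModel.featureImportances}")
```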
spark git commit: [SPARK-5133] [ML] Added featureImportance to RandomForestClassifier and Regressor
Repository: spark Updated Branches: refs/heads/branch-1.5 6d46e9b7c -> b3117d312 [SPARK-5133] [ML] Added featureImportance to RandomForestClassifier and Regressor Added featureImportance to RandomForestClassifier and Regressor. This follows the scikit-learn implementation here: [https://github.com/scikit-learn/scikit-learn/blob/a95203b249c1cf392f86d001ad999e29b2392739/sklearn/tree/_tree.pyx#L3341] CC: yanboliang Would you mind taking a look? Thanks! Author: Joseph K. Bradley Author: Feynman Liang Closes #7838 from jkbradley/dt-feature-importance and squashes the following commits: 72a167a [Joseph K. Bradley] fixed unit test 86cea5f [Joseph K. Bradley] Modified RF featuresImportances to return Vector instead of Map 5aa74f0 [Joseph K. Bradley] finally fixed unit test for real 33df5db [Joseph K. Bradley] fix unit test 42a2d3b [Joseph K. Bradley] fix unit test fe94e72 [Joseph K. Bradley] modified feature importance unit tests cc693ee [Feynman Liang] Add classifier tests 79a6f87 [Feynman Liang] Compare dense vectors in test 21d01fc [Feynman Liang] Added failing SKLearn test ac0b254 [Joseph K. Bradley] Added featureImportance to RandomForestClassifier/Regressor. Need to add unit tests (cherry picked from commit ff9169a002f1b75231fd25b7d04157a912503038) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b3117d31 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b3117d31 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b3117d31 Branch: refs/heads/branch-1.5 Commit: b3117d312332af3b4bd416857f632cacb5230feb Parents: 6d46e9b Author: Joseph K. Bradley Authored: Mon Aug 3 12:17:46 2015 -0700 Committer: Joseph K. Bradley Committed: Mon Aug 3 12:17:56 2015 -0700 -- .../classification/RandomForestClassifier.scala | 30 +- .../ml/regression/RandomForestRegressor.scala | 33 -- .../scala/org/apache/spark/ml/tree/Node.scala | 19 +++- .../spark/ml/tree/impl/RandomForest.scala | 92 .../org/apache/spark/ml/tree/treeModels.scala | 6 ++ .../JavaRandomForestClassifierSuite.java| 2 + .../JavaRandomForestRegressorSuite.java | 2 + .../RandomForestClassifierSuite.scala | 31 +- .../org/apache/spark/ml/impl/TreeTests.scala| 18 .../regression/RandomForestRegressorSuite.scala | 27 - .../spark/ml/tree/impl/RandomForestSuite.scala | 107 +++ 11 files changed, 351 insertions(+), 16 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b3117d31/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala index 56e80cc..b59826a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala @@ -95,7 +95,8 @@ final class RandomForestClassifier(override val uid: String) val trees = RandomForest.run(oldDataset, strategy, getNumTrees, getFeatureSubsetStrategy, getSeed) .map(_.asInstanceOf[DecisionTreeClassificationModel]) -new RandomForestClassificationModel(trees, numClasses) +val numFeatures = oldDataset.first().features.size +new RandomForestClassificationModel(trees, numFeatures, numClasses) } override def copy(extra: ParamMap): RandomForestClassifier = defaultCopy(extra) @@ -118,11 +119,13 @@ object RandomForestClassifier { * features. 
* @param _trees Decision trees in the ensemble. * Warning: These have null parents. + * @param numFeatures Number of features used by this model */ @Experimental final class RandomForestClassificationModel private[ml] ( override val uid: String, private val _trees: Array[DecisionTreeClassificationModel], +val numFeatures: Int, override val numClasses: Int) extends ProbabilisticClassificationModel[Vector, RandomForestClassificationModel] with TreeEnsembleModel with Serializable { @@ -133,8 +136,8 @@ final class RandomForestClassificationModel private[ml] ( * Construct a random forest classification model, with all trees weighted equally. * @param trees Component trees */ - def this(trees: Array[DecisionTreeClassificationModel], numClasses: Int) = -this(Identifiable.randomUID("rfc"), trees, numClasses) + def this(trees: Array[DecisionTreeClassificationModel], numFeatures: Int, numClasses: Int) = +this(Identifiable.randomUID("rfc"), trees, numFeatures, numClasses)
spark git commit: [SPARK-9191] [ML] [Doc] Add ml.PCA user guide and code examples
Repository: spark Updated Branches: refs/heads/master ba1c4e138 -> 8ca287ebb [SPARK-9191] [ML] [Doc] Add ml.PCA user guide and code examples Add ml.PCA user guide document and code examples for Scala/Java/Python. Author: Yanbo Liang Closes #7522 from yanboliang/ml-pca-md and squashes the following commits: 60dec05 [Yanbo Liang] address comments f992abe [Yanbo Liang] Add ml.PCA doc and examples Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8ca287eb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8ca287eb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8ca287eb Branch: refs/heads/master Commit: 8ca287ebbd58985a568341b08040d0efa9d3641a Parents: ba1c4e1 Author: Yanbo Liang Authored: Mon Aug 3 13:58:00 2015 -0700 Committer: Joseph K. Bradley Committed: Mon Aug 3 13:58:00 2015 -0700 -- docs/ml-features.md | 86 1 file changed, 86 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8ca287eb/docs/ml-features.md -- diff --git a/docs/ml-features.md b/docs/ml-features.md index 54068de..fa0ad1f 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -461,6 +461,92 @@ for binarized_feature, in binarizedFeatures.collect(): +## PCA + +[PCA](http://en.wikipedia.org/wiki/Principal_component_analysis) is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables into a set of values of linearly uncorrelated variables called principal components. A [PCA](api/scala/index.html#org.apache.spark.ml.feature.PCA) class trains a model to project vectors to a low-dimensional space using PCA. The example below shows how to project 5-dimensional feature vectors into 3-dimensional principal components. + + + +See the [Scala API documentation](api/scala/index.html#org.apache.spark.ml.feature.PCA) for API details. +{% highlight scala %} +import org.apache.spark.ml.feature.PCA +import org.apache.spark.mllib.linalg.Vectors + +val data = Array( + Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), + Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), + Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) +) +val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features") +val pca = new PCA() + .setInputCol("features") + .setOutputCol("pcaFeatures") + .setK(3) + .fit(df) +val pcaDF = pca.transform(df) +val result = pcaDF.select("pcaFeatures") +result.show() +{% endhighlight %} + + + +See the [Java API documentation](api/java/org/apache/spark/ml/feature/PCA.html) for API details. +{% highlight java %} +import com.google.common.collect.Lists; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.ml.feature.PCA +import org.apache.spark.ml.feature.PCAModel +import org.apache.spark.mllib.linalg.VectorUDT; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + +JavaSparkContext jsc = ... +SQLContext jsql = ... 
+JavaRDD data = jsc.parallelize(Lists.newArrayList( + RowFactory.create(Vectors.sparse(5, new int[]{1, 3}, new double[]{1.0, 7.0})), + RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)), + RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) +)); +StructType schema = new StructType(new StructField[] { + new StructField("features", new VectorUDT(), false, Metadata.empty()), +}); +DataFrame df = jsql.createDataFrame(data, schema); +PCAModel pca = new PCA() + .setInputCol("features") + .setOutputCol("pcaFeatures") + .setK(3) + .fit(df); +DataFrame result = pca.transform(df).select("pcaFeatures"); +result.show(); +{% endhighlight %} + + + +See the [Python API documentation](api/python/pyspark.ml.html#pyspark.ml.feature.PCA) for API details. +{% highlight python %} +from pyspark.ml.feature import PCA +from pyspark.mllib.linalg import Vectors + +data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),), + (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),), + (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)] +df = sqlContext.createDataFrame(data,["features"]) +pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures") +model = pca.fit(df) +result = model.transform(df).select("pcaFeatures") +result.show(truncate=False) +{% endhighlight %} + + + ## PolynomialExpansion [Polynomial expansion](http://en.wikipedia.org/wiki/Polynomial_expansion) is the process of expanding your features into a polynomial space, which is formulated by an n-degree
spark git commit: [SPARK-9191] [ML] [Doc] Add ml.PCA user guide and code examples
Repository: spark Updated Branches: refs/heads/branch-1.5 dc0c8c982 -> e7329ab31 [SPARK-9191] [ML] [Doc] Add ml.PCA user guide and code examples Add ml.PCA user guide document and code examples for Scala/Java/Python. Author: Yanbo Liang Closes #7522 from yanboliang/ml-pca-md and squashes the following commits: 60dec05 [Yanbo Liang] address comments f992abe [Yanbo Liang] Add ml.PCA doc and examples (cherry picked from commit 8ca287ebbd58985a568341b08040d0efa9d3641a) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e7329ab3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e7329ab3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e7329ab3 Branch: refs/heads/branch-1.5 Commit: e7329ab31323a89d1e07c808927e5543876e3ce3 Parents: dc0c8c9 Author: Yanbo Liang Authored: Mon Aug 3 13:58:00 2015 -0700 Committer: Joseph K. Bradley Committed: Mon Aug 3 14:01:18 2015 -0700 -- docs/ml-features.md | 86 1 file changed, 86 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e7329ab3/docs/ml-features.md -- diff --git a/docs/ml-features.md b/docs/ml-features.md index 54068de..fa0ad1f 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -461,6 +461,92 @@ for binarized_feature, in binarizedFeatures.collect(): +## PCA + +[PCA](http://en.wikipedia.org/wiki/Principal_component_analysis) is a statistical procedure that uses an orthogonal transformation to convert a set of observations of possibly correlated variables into a set of values of linearly uncorrelated variables called principal components. A [PCA](api/scala/index.html#org.apache.spark.ml.feature.PCA) class trains a model to project vectors to a low-dimensional space using PCA. The example below shows how to project 5-dimensional feature vectors into 3-dimensional principal components. + + + +See the [Scala API documentation](api/scala/index.html#org.apache.spark.ml.feature.PCA) for API details. +{% highlight scala %} +import org.apache.spark.ml.feature.PCA +import org.apache.spark.mllib.linalg.Vectors + +val data = Array( + Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), + Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), + Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) +) +val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features") +val pca = new PCA() + .setInputCol("features") + .setOutputCol("pcaFeatures") + .setK(3) + .fit(df) +val pcaDF = pca.transform(df) +val result = pcaDF.select("pcaFeatures") +result.show() +{% endhighlight %} + + + +See the [Java API documentation](api/java/org/apache/spark/ml/feature/PCA.html) for API details. +{% highlight java %} +import com.google.common.collect.Lists; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.ml.feature.PCA +import org.apache.spark.ml.feature.PCAModel +import org.apache.spark.mllib.linalg.VectorUDT; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + +JavaSparkContext jsc = ... +SQLContext jsql = ... 
+JavaRDD<Row> data = jsc.parallelize(Lists.newArrayList( + RowFactory.create(Vectors.sparse(5, new int[]{1, 3}, new double[]{1.0, 7.0})), + RowFactory.create(Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0)), + RowFactory.create(Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) +)); +StructType schema = new StructType(new StructField[] { + new StructField("features", new VectorUDT(), false, Metadata.empty()), +}); +DataFrame df = jsql.createDataFrame(data, schema); +PCAModel pca = new PCA() + .setInputCol("features") + .setOutputCol("pcaFeatures") + .setK(3) + .fit(df); +DataFrame result = pca.transform(df).select("pcaFeatures"); +result.show(); +{% endhighlight %} + + + +See the [Python API documentation](api/python/pyspark.ml.html#pyspark.ml.feature.PCA) for API details. +{% highlight python %} +from pyspark.ml.feature import PCA +from pyspark.mllib.linalg import Vectors + +data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),), + (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),), + (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)] +df = sqlContext.createDataFrame(data,["features"]) +pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures") +model = pca.fit(df) +result = model.transform(df).select("pcaFeatures") +result.show(truncate=False) +{% endhighlight %} + + + ## PolynomialExpansion [Polynomial expansion](http://en.wikipedia.org/wiki/Polynomial_exp
spark git commit: [SPARK-8874] [ML] Add missing methods in Word2Vec
Repository: spark Updated Branches: refs/heads/master a2409d1c8 -> 13675c742 [SPARK-8874] [ML] Add missing methods in Word2Vec Add missing methods 1. getVectors 2. findSynonyms to W2Vec scala and python API mengxr Author: MechCoder Closes #7263 from MechCoder/missing_methods_w2vec and squashes the following commits: 149d5ca [MechCoder] minor doc 69d91b7 [MechCoder] [SPARK-8874] [ML] Add missing methods in Word2Vec Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/13675c74 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/13675c74 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/13675c74 Branch: refs/heads/master Commit: 13675c742a71cbdc8324701c3694775ce1dd5c62 Parents: a2409d1 Author: MechCoder Authored: Mon Aug 3 16:44:25 2015 -0700 Committer: Joseph K. Bradley Committed: Mon Aug 3 16:44:25 2015 -0700 -- .../org/apache/spark/ml/feature/Word2Vec.scala | 38 +++- .../apache/spark/ml/feature/Word2VecSuite.scala | 62 2 files changed, 99 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/13675c74/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala index 6ea6590..b4f46ce 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala @@ -18,15 +18,17 @@ package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental +import org.apache.spark.SparkContext import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.mllib.feature -import org.apache.spark.mllib.linalg.{VectorUDT, Vectors} +import org.apache.spark.mllib.linalg.{VectorUDT, Vector, Vectors} import org.apache.spark.mllib.linalg.BLAS._ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ +import org.apache.spark.sql.SQLContext import org.apache.spark.sql.types._ /** @@ -146,6 +148,40 @@ class Word2VecModel private[ml] ( wordVectors: feature.Word2VecModel) extends Model[Word2VecModel] with Word2VecBase { + + /** + * Returns a dataframe with two fields, "word" and "vector", with "word" being a String and + * and the vector the DenseVector that it is mapped to. + */ + val getVectors: DataFrame = { +val sc = SparkContext.getOrCreate() +val sqlContext = SQLContext.getOrCreate(sc) +import sqlContext.implicits._ +val wordVec = wordVectors.getVectors.mapValues(vec => Vectors.dense(vec.map(_.toDouble))) +sc.parallelize(wordVec.toSeq).toDF("word", "vector") + } + + /** + * Find "num" number of words closest in similarity to the given word. + * Returns a dataframe with the words and the cosine similarities between the + * synonyms and the given word. + */ + def findSynonyms(word: String, num: Int): DataFrame = { +findSynonyms(wordVectors.transform(word), num) + } + + /** + * Find "num" number of words closest to similarity to the given vector representation + * of the word. Returns a dataframe with the words and the cosine similarities between the + * synonyms and the given word vector. 
+ */ + def findSynonyms(word: Vector, num: Int): DataFrame = { +val sc = SparkContext.getOrCreate() +val sqlContext = SQLContext.getOrCreate(sc) +import sqlContext.implicits._ +sc.parallelize(wordVectors.findSynonyms(word, num)).toDF("word", "similarity") + } + /** @group setParam */ def setInputCol(value: String): this.type = set(inputCol, value) http://git-wip-us.apache.org/repos/asf/spark/blob/13675c74/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala index aa6ce53..adcda0e 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala @@ -67,5 +67,67 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext { assert(vector1 ~== vector2 absTol 1E-5, "Transformed vector is different with expected.") } } + + test("getVectors") { + +val sqlContext = new SQLContext(sc) +import sqlContext.implicits._ + +val sentence = "a b " * 100 + "a c " * 10 +val doc = sc.parallelize(Seq(sentence, sentence)).map(line =>
spark git commit: [SPARK-8874] [ML] Add missing methods in Word2Vec
Repository: spark Updated Branches: refs/heads/branch-1.5 73fab8849 -> acda9d954 [SPARK-8874] [ML] Add missing methods in Word2Vec Add missing methods 1. getVectors 2. findSynonyms to W2Vec scala and python API mengxr Author: MechCoder Closes #7263 from MechCoder/missing_methods_w2vec and squashes the following commits: 149d5ca [MechCoder] minor doc 69d91b7 [MechCoder] [SPARK-8874] [ML] Add missing methods in Word2Vec (cherry picked from commit 13675c742a71cbdc8324701c3694775ce1dd5c62) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/acda9d95 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/acda9d95 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/acda9d95 Branch: refs/heads/branch-1.5 Commit: acda9d9546fa3f54676e48d76a2b66016d204074 Parents: 73fab88 Author: MechCoder Authored: Mon Aug 3 16:44:25 2015 -0700 Committer: Joseph K. Bradley Committed: Mon Aug 3 16:46:00 2015 -0700 -- .../org/apache/spark/ml/feature/Word2Vec.scala | 38 +++- .../apache/spark/ml/feature/Word2VecSuite.scala | 62 2 files changed, 99 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/acda9d95/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala index 6ea6590..b4f46ce 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala @@ -18,15 +18,17 @@ package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental +import org.apache.spark.SparkContext import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.mllib.feature -import org.apache.spark.mllib.linalg.{VectorUDT, Vectors} +import org.apache.spark.mllib.linalg.{VectorUDT, Vector, Vectors} import org.apache.spark.mllib.linalg.BLAS._ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ +import org.apache.spark.sql.SQLContext import org.apache.spark.sql.types._ /** @@ -146,6 +148,40 @@ class Word2VecModel private[ml] ( wordVectors: feature.Word2VecModel) extends Model[Word2VecModel] with Word2VecBase { + + /** + * Returns a dataframe with two fields, "word" and "vector", with "word" being a String and + * and the vector the DenseVector that it is mapped to. + */ + val getVectors: DataFrame = { +val sc = SparkContext.getOrCreate() +val sqlContext = SQLContext.getOrCreate(sc) +import sqlContext.implicits._ +val wordVec = wordVectors.getVectors.mapValues(vec => Vectors.dense(vec.map(_.toDouble))) +sc.parallelize(wordVec.toSeq).toDF("word", "vector") + } + + /** + * Find "num" number of words closest in similarity to the given word. + * Returns a dataframe with the words and the cosine similarities between the + * synonyms and the given word. + */ + def findSynonyms(word: String, num: Int): DataFrame = { +findSynonyms(wordVectors.transform(word), num) + } + + /** + * Find "num" number of words closest to similarity to the given vector representation + * of the word. Returns a dataframe with the words and the cosine similarities between the + * synonyms and the given word vector. 
+ */ + def findSynonyms(word: Vector, num: Int): DataFrame = { +val sc = SparkContext.getOrCreate() +val sqlContext = SQLContext.getOrCreate(sc) +import sqlContext.implicits._ +sc.parallelize(wordVectors.findSynonyms(word, num)).toDF("word", "similarity") + } + /** @group setParam */ def setInputCol(value: String): this.type = set(inputCol, value) http://git-wip-us.apache.org/repos/asf/spark/blob/acda9d95/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala index aa6ce53..adcda0e 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala @@ -67,5 +67,67 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext { assert(vector1 ~== vector2 absTol 1E-5, "Transformed vector is different with expected.") } } + + test("getVectors") { + +val sqlContext = new SQLContext(sc) +import sqlContext.implicits._ + +
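For reference, a minimal sketch of how the two new Word2VecModel members could be used from Scala. The toy corpus, the SparkContext `sc`, and the SQLContext here are assumptions for illustration only; the method signatures and the column names ("word", "vector", "similarity") come from the diff above.

{% highlight scala %}
import org.apache.spark.ml.feature.Word2Vec
import org.apache.spark.sql.SQLContext

val sqlContext = new SQLContext(sc)  // assumes an existing SparkContext `sc`

// Toy corpus: each row is one document, already tokenized into words.
val docDF = sqlContext.createDataFrame(Seq(
  "spark mllib supports word2vec".split(" "),
  "word2vec maps words to vectors".split(" ")
).map(Tuple1.apply)).toDF("text")

val model = new Word2Vec()
  .setInputCol("text")
  .setOutputCol("result")
  .setVectorSize(3)
  .setMinCount(0)
  .fit(docDF)

// New in this patch: inspect the learned word -> vector mapping ...
model.getVectors.show()
// ... and look up the 2 words closest in cosine similarity to "spark".
model.findSynonyms("spark", 2).show()
{% endhighlight %}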
spark git commit: [SPARK-8069] [ML] Add multiclass thresholds for ProbabilisticClassifier
Repository: spark Updated Branches: refs/heads/branch-1.5 a9277cd5a -> c5250ddc5 [SPARK-8069] [ML] Add multiclass thresholds for ProbabilisticClassifier This PR replaces the old "threshold" with a generalized "thresholds" Param. We keep getThreshold,setThreshold for backwards compatibility for binary classification. Note that the primary author of this PR is holdenk Author: Holden Karau Author: Joseph K. Bradley Closes #7909 from jkbradley/holdenk-SPARK-8069-add-cutoff-aka-threshold-to-random-forest and squashes the following commits: 3952977 [Joseph K. Bradley] fixed pyspark doc test 85febc8 [Joseph K. Bradley] made python unit tests a little more robust 7eb1d86 [Joseph K. Bradley] small cleanups 6cc2ed8 [Joseph K. Bradley] Fixed remaining merge issues. 0255e44 [Joseph K. Bradley] Many cleanups for thresholds, some more tests 7565a60 [Holden Karau] fix pep8 style checks, add a getThreshold method similar to our LogisticRegression.scala one for API compat be87f26 [Holden Karau] Convert threshold to thresholds in the python code, add specialized support for Array[Double] to shared parems codegen, etc. 6747dad [Holden Karau] Override raw2prediction for ProbabilisticClassifier, fix some tests 25df168 [Holden Karau] Fix handling of thresholds in LogisticRegression c02d6c0 [Holden Karau] No default for thresholds 5e43628 [Holden Karau] CR feedback and fixed the renamed test f3fbbd1 [Holden Karau] revert the changes to random forest :( 51f581c [Holden Karau] Add explicit types to public methods, fix long line f7032eb [Holden Karau] Fix a java test bug, remove some unecessary changes adf15b4 [Holden Karau] rename the classifier suite test to ProbabilisticClassifierSuite now that we only have it in Probabilistic 398078a [Holden Karau] move the thresholding around a bunch based on the design doc 4893bdc [Holden Karau] Use numtrees of 3 since previous result was tied (one tree for each) and the switch from different max methods picked a different element (since they were equal I think this is ok) 638854c [Holden Karau] Add a scala RandomForestClassifierSuite test based on corresponding python test e09919c [Holden Karau] Fix return type, I need more coffee 8d92cac [Holden Karau] Use ClassifierParams as the head 3456ed3 [Holden Karau] Add explicit return types even though just test a0f3b0c [Holden Karau] scala style fixes 6f14314 [Holden Karau] Since hasthreshold/hasthresholds is in root classifier now ffc8dab [Holden Karau] Update the sharedParams 0420290 [Holden Karau] Allow us to override the get methods selectively 978e77a [Holden Karau] Move HasThreshold into classifier params and start defining the overloaded getThreshold/getThresholds functions 1433e52 [Holden Karau] Revert "try and hide threshold but chainges the API so no dice there" 1f09a2e [Holden Karau] try and hide threshold but chainges the API so no dice there efb9084 [Holden Karau] move setThresholds only to where its used 6b34809 [Holden Karau] Add a test with thresholding for the RFCS 74f54c3 [Holden Karau] Fix creation of vote array 1986fa8 [Holden Karau] Setting the thresholds only makes sense if the underlying class hasn't overridden predict, so lets push it down. 2f44b18 [Holden Karau] Add a global default of null for thresholds param f338cfc [Holden Karau] Wait that wasn't a good idea, Revert "Some progress towards unifying threshold and thresholds" 634b06f [Holden Karau] Some progress towards unifying threshold and thresholds 85c9e01 [Holden Karau] Test passes again... 
little fnur 099c0f3 [Holden Karau] Move thresholds around some more (set on model not trainer) 0f46836 [Holden Karau] Start adding a classifiersuite f70eb5e [Holden Karau] Fix test compile issues a7d59c8 [Holden Karau] Move thresholding into Classifier trait 5d999d2 [Holden Karau] Some more progress, start adding a test (maybe try and see if we can find a better thing to use for the base of the test) 1fed644 [Holden Karau] Use thresholds to scale scores in random forest classifcation 31d6bf2 [Holden Karau] Start threading the threshold info through 0ef228c [Holden Karau] Add hasthresholds (cherry picked from commit 5a23213c148bfe362514f9c71f5273ebda0a848a) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c5250ddc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c5250ddc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c5250ddc Branch: refs/heads/branch-1.5 Commit: c5250ddc5242a071549e980f69fa8bd785168979 Parents: a9277cd Author: Holden Karau Authored: Tue Aug 4 10:12:22 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 4 10:12:33 2015 -0700 -- .../examples/ml/JavaSimpleParamsExample.java| 3 +- .../src/main/python/ml/simple_params_example.py | 2 +- .../spark/examples/ml/SimplePar
spark git commit: [SPARK-8069] [ML] Add multiclass thresholds for ProbabilisticClassifier
Repository: spark Updated Branches: refs/heads/master 34a0eb2e8 -> 5a23213c1 [SPARK-8069] [ML] Add multiclass thresholds for ProbabilisticClassifier This PR replaces the old "threshold" with a generalized "thresholds" Param. We keep getThreshold,setThreshold for backwards compatibility for binary classification. Note that the primary author of this PR is holdenk Author: Holden Karau Author: Joseph K. Bradley Closes #7909 from jkbradley/holdenk-SPARK-8069-add-cutoff-aka-threshold-to-random-forest and squashes the following commits: 3952977 [Joseph K. Bradley] fixed pyspark doc test 85febc8 [Joseph K. Bradley] made python unit tests a little more robust 7eb1d86 [Joseph K. Bradley] small cleanups 6cc2ed8 [Joseph K. Bradley] Fixed remaining merge issues. 0255e44 [Joseph K. Bradley] Many cleanups for thresholds, some more tests 7565a60 [Holden Karau] fix pep8 style checks, add a getThreshold method similar to our LogisticRegression.scala one for API compat be87f26 [Holden Karau] Convert threshold to thresholds in the python code, add specialized support for Array[Double] to shared parems codegen, etc. 6747dad [Holden Karau] Override raw2prediction for ProbabilisticClassifier, fix some tests 25df168 [Holden Karau] Fix handling of thresholds in LogisticRegression c02d6c0 [Holden Karau] No default for thresholds 5e43628 [Holden Karau] CR feedback and fixed the renamed test f3fbbd1 [Holden Karau] revert the changes to random forest :( 51f581c [Holden Karau] Add explicit types to public methods, fix long line f7032eb [Holden Karau] Fix a java test bug, remove some unecessary changes adf15b4 [Holden Karau] rename the classifier suite test to ProbabilisticClassifierSuite now that we only have it in Probabilistic 398078a [Holden Karau] move the thresholding around a bunch based on the design doc 4893bdc [Holden Karau] Use numtrees of 3 since previous result was tied (one tree for each) and the switch from different max methods picked a different element (since they were equal I think this is ok) 638854c [Holden Karau] Add a scala RandomForestClassifierSuite test based on corresponding python test e09919c [Holden Karau] Fix return type, I need more coffee 8d92cac [Holden Karau] Use ClassifierParams as the head 3456ed3 [Holden Karau] Add explicit return types even though just test a0f3b0c [Holden Karau] scala style fixes 6f14314 [Holden Karau] Since hasthreshold/hasthresholds is in root classifier now ffc8dab [Holden Karau] Update the sharedParams 0420290 [Holden Karau] Allow us to override the get methods selectively 978e77a [Holden Karau] Move HasThreshold into classifier params and start defining the overloaded getThreshold/getThresholds functions 1433e52 [Holden Karau] Revert "try and hide threshold but chainges the API so no dice there" 1f09a2e [Holden Karau] try and hide threshold but chainges the API so no dice there efb9084 [Holden Karau] move setThresholds only to where its used 6b34809 [Holden Karau] Add a test with thresholding for the RFCS 74f54c3 [Holden Karau] Fix creation of vote array 1986fa8 [Holden Karau] Setting the thresholds only makes sense if the underlying class hasn't overridden predict, so lets push it down. 2f44b18 [Holden Karau] Add a global default of null for thresholds param f338cfc [Holden Karau] Wait that wasn't a good idea, Revert "Some progress towards unifying threshold and thresholds" 634b06f [Holden Karau] Some progress towards unifying threshold and thresholds 85c9e01 [Holden Karau] Test passes again... 
little fnur 099c0f3 [Holden Karau] Move thresholds around some more (set on model not trainer) 0f46836 [Holden Karau] Start adding a classifiersuite f70eb5e [Holden Karau] Fix test compile issues a7d59c8 [Holden Karau] Move thresholding into Classifier trait 5d999d2 [Holden Karau] Some more progress, start adding a test (maybe try and see if we can find a better thing to use for the base of the test) 1fed644 [Holden Karau] Use thresholds to scale scores in random forest classifcation 31d6bf2 [Holden Karau] Start threading the threshold info through 0ef228c [Holden Karau] Add hasthresholds Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5a23213c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5a23213c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5a23213c Branch: refs/heads/master Commit: 5a23213c148bfe362514f9c71f5273ebda0a848a Parents: 34a0eb2 Author: Holden Karau Authored: Tue Aug 4 10:12:22 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 4 10:12:22 2015 -0700 -- .../examples/ml/JavaSimpleParamsExample.java| 3 +- .../src/main/python/ml/simple_params_example.py | 2 +- .../spark/examples/ml/SimpleParamsExample.scala | 2 +- .../spark/ml/classification/Classifier.scala| 3 +-
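As a quick illustration of the generalized Param (a sketch, not part of the patch): per-class thresholds can now be set on a probabilistic classifier such as LogisticRegression, while the binary-only setThreshold is kept for compatibility. The DataFrame `training` with "label" and "features" columns is an assumption here.

{% highlight scala %}
import org.apache.spark.ml.classification.LogisticRegression

val lr = new LogisticRegression().setMaxIter(10)

// One threshold per class; the predicted class is the one with the largest
// value of (class probability) / (class threshold).
lr.setThresholds(Array(0.4, 0.6))

// The old binary-only setter remains available for backwards compatibility:
// lr.setThreshold(0.6)

val model = lr.fit(training)  // `training` is an assumed DataFrame
{% endhighlight %}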
spark git commit: [SPARK-9447] [ML] [PYTHON] Added HasRawPredictionCol, HasProbabilityCol to RandomForestClassifier
Repository: spark Updated Branches: refs/heads/branch-1.5 560b2da78 -> e682ee254 [SPARK-9447] [ML] [PYTHON] Added HasRawPredictionCol, HasProbabilityCol to RandomForestClassifier Added HasRawPredictionCol, HasProbabilityCol to RandomForestClassifier, plus doc tests for those columns. CC: holdenk yanboliang Author: Joseph K. Bradley Closes #7903 from jkbradley/rf-prob-python and squashes the following commits: c62a83f [Joseph K. Bradley] made unit test more robust 14eeba2 [Joseph K. Bradley] added HasRawPredictionCol, HasProbabilityCol to RandomForestClassifier in PySpark (cherry picked from commit e375456063617cd7000d796024f41e5927f21edd) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e682ee25 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e682ee25 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e682ee25 Branch: refs/heads/branch-1.5 Commit: e682ee25477374737f3b1dfc08c98829564b26d4 Parents: 560b2da Author: Joseph K. Bradley Authored: Tue Aug 4 14:54:26 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 4 14:54:34 2015 -0700 -- python/pyspark/ml/classification.py | 13 - 1 file changed, 12 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e682ee25/python/pyspark/ml/classification.py -- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 291320f..5978d8f 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -347,6 +347,7 @@ class DecisionTreeClassificationModel(DecisionTreeModel): @inherit_doc class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasSeed, + HasRawPredictionCol, HasProbabilityCol, DecisionTreeParams, HasCheckpointInterval): """ `http://en.wikipedia.org/wiki/Random_forest Random Forest` @@ -354,6 +355,7 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred It supports both binary and multiclass labels, as well as both continuous and categorical features. 
+>>> import numpy >>> from numpy import allclose >>> from pyspark.mllib.linalg import Vectors >>> from pyspark.ml.feature import StringIndexer @@ -368,8 +370,13 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred >>> allclose(model.treeWeights, [1.0, 1.0, 1.0]) True >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) ->>> model.transform(test0).head().prediction +>>> result = model.transform(test0).head() +>>> result.prediction 0.0 +>>> numpy.argmax(result.probability) +0 +>>> numpy.argmax(result.rawPrediction) +0 >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) >>> model.transform(test1).head().prediction 1.0 @@ -390,11 +397,13 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", + probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", numTrees=20, featureSubsetStrategy="auto", seed=None): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ + probabilityCol="probability", rawPredictionCol="rawPrediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \ numTrees=20, featureSubsetStrategy="auto", seed=None) @@ -427,11 +436,13 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred @keyword_only def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", + probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInf
spark git commit: [SPARK-9447] [ML] [PYTHON] Added HasRawPredictionCol, HasProbabilityCol to RandomForestClassifier
Repository: spark Updated Branches: refs/heads/master 9d668b736 -> e37545606 [SPARK-9447] [ML] [PYTHON] Added HasRawPredictionCol, HasProbabilityCol to RandomForestClassifier Added HasRawPredictionCol, HasProbabilityCol to RandomForestClassifier, plus doc tests for those columns. CC: holdenk yanboliang Author: Joseph K. Bradley Closes #7903 from jkbradley/rf-prob-python and squashes the following commits: c62a83f [Joseph K. Bradley] made unit test more robust 14eeba2 [Joseph K. Bradley] added HasRawPredictionCol, HasProbabilityCol to RandomForestClassifier in PySpark Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e3754560 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e3754560 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e3754560 Branch: refs/heads/master Commit: e375456063617cd7000d796024f41e5927f21edd Parents: 9d668b7 Author: Joseph K. Bradley Authored: Tue Aug 4 14:54:26 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 4 14:54:26 2015 -0700 -- python/pyspark/ml/classification.py | 13 - 1 file changed, 12 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e3754560/python/pyspark/ml/classification.py -- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 291320f..5978d8f 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -347,6 +347,7 @@ class DecisionTreeClassificationModel(DecisionTreeModel): @inherit_doc class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasSeed, + HasRawPredictionCol, HasProbabilityCol, DecisionTreeParams, HasCheckpointInterval): """ `http://en.wikipedia.org/wiki/Random_forest Random Forest` @@ -354,6 +355,7 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred It supports both binary and multiclass labels, as well as both continuous and categorical features. 
+>>> import numpy >>> from numpy import allclose >>> from pyspark.mllib.linalg import Vectors >>> from pyspark.ml.feature import StringIndexer @@ -368,8 +370,13 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred >>> allclose(model.treeWeights, [1.0, 1.0, 1.0]) True >>> test0 = sqlContext.createDataFrame([(Vectors.dense(-1.0),)], ["features"]) ->>> model.transform(test0).head().prediction +>>> result = model.transform(test0).head() +>>> result.prediction 0.0 +>>> numpy.argmax(result.probability) +0 +>>> numpy.argmax(result.rawPrediction) +0 >>> test1 = sqlContext.createDataFrame([(Vectors.sparse(1, [0], [1.0]),)], ["features"]) >>> model.transform(test1).head().prediction 1.0 @@ -390,11 +397,13 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", + probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", numTrees=20, featureSubsetStrategy="auto", seed=None): """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ + probabilityCol="probability", rawPredictionCol="rawPrediction", \ maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, \ maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, impurity="gini", \ numTrees=20, featureSubsetStrategy="auto", seed=None) @@ -427,11 +436,13 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred @keyword_only def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", + probabilityCol="probability", rawPredictionCol="rawPrediction", maxDepth=5, maxBins=32, minInstancesPerNode=1, minInfoGain=0.0, maxMemoryInMB=256, cacheNodeIds=False, checkpointInterval=10, seed=None
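The doctest above covers the PySpark wrapper; for comparison, a sketch of the Scala estimator it delegates to, which already emits the same columns. The DataFrame `training` with "indexedLabel" and "indexedFeatures" columns is assumed.

{% highlight scala %}
import org.apache.spark.ml.classification.RandomForestClassifier

val rf = new RandomForestClassifier()
  .setLabelCol("indexedLabel")
  .setFeaturesCol("indexedFeatures")
  .setNumTrees(3)

val model = rf.fit(training)  // `training` is an assumed DataFrame

// The three output columns the Python wrapper now exposes as well.
model.transform(training)
  .select("prediction", "rawPrediction", "probability")
  .show()
{% endhighlight %}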
spark git commit: [SPARK-9582] [ML] LDA cleanups
Repository: spark Updated Branches: refs/heads/master e37545606 -> 1833d9c08 [SPARK-9582] [ML] LDA cleanups Small cleanups to recent LDA additions and docs. CC: feynmanliang Author: Joseph K. Bradley Closes #7916 from jkbradley/lda-cleanups and squashes the following commits: f7021d9 [Joseph K. Bradley] broadcasting large matrices for LDA in local model and online learning 97947aa [Joseph K. Bradley] a few more cleanups 5b03f88 [Joseph K. Bradley] reverted split of lda log likelihood c566915 [Joseph K. Bradley] small edit to make review easier 63f6c7d [Joseph K. Bradley] clarified log likelihood for lda models Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1833d9c0 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1833d9c0 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1833d9c0 Branch: refs/heads/master Commit: 1833d9c08f021d991334424d0a6d5ec21d1fccb2 Parents: e375456 Author: Joseph K. Bradley Authored: Tue Aug 4 15:43:13 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 4 15:43:13 2015 -0700 -- .../spark/mllib/clustering/LDAModel.scala | 82 +++- .../spark/mllib/clustering/LDAOptimizer.scala | 19 +++-- 2 files changed, 58 insertions(+), 43 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1833d9c0/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 6af90d7..33babda 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -27,6 +27,7 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.annotation.Experimental import org.apache.spark.api.java.JavaPairRDD +import org.apache.spark.broadcast.Broadcast import org.apache.spark.graphx.{Edge, EdgeContext, Graph, VertexId} import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors} import org.apache.spark.mllib.util.{Loader, Saveable} @@ -217,26 +218,28 @@ class LocalLDAModel private[clustering] ( // TODO: declare in LDAModel and override once implemented in DistributedLDAModel /** * Calculates a lower bound on the log likelihood of the entire corpus. + * + * See Equation (16) in original Online LDA paper. + * * @param documents test corpus to use for calculating log likelihood * @return variational lower bound on the log likelihood of the entire corpus */ - def logLikelihood(documents: RDD[(Long, Vector)]): Double = bound(documents, + def logLikelihood(documents: RDD[(Long, Vector)]): Double = logLikelihoodBound(documents, docConcentration, topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k, vocabSize) /** - * Calculate an upper bound bound on perplexity. See Equation (16) in original Online - * LDA paper. + * Calculate an upper bound bound on perplexity. (Lower is better.) + * See Equation (16) in original Online LDA paper. + * * @param documents test corpus to use for calculating perplexity - * @return variational upper bound on log perplexity per word + * @return Variational upper bound on log perplexity per token. 
*/ def logPerplexity(documents: RDD[(Long, Vector)]): Double = { -val corpusWords = documents +val corpusTokenCount = documents .map { case (_, termCounts) => termCounts.toArray.sum } .sum() -val perWordBound = -logLikelihood(documents) / corpusWords - -perWordBound +-logLikelihood(documents) / corpusTokenCount } /** @@ -244,17 +247,20 @@ class LocalLDAModel private[clustering] ( *log p(documents) >= E_q[log p(documents)] - E_q[log q(documents)] * This bound is derived by decomposing the LDA model to: *log p(documents) = E_q[log p(documents)] - E_q[log q(documents)] + D(q|p) - * and noting that the KL-divergence D(q|p) >= 0. See Equation (16) in original Online LDA paper. + * and noting that the KL-divergence D(q|p) >= 0. + * + * See Equation (16) in original Online LDA paper, as well as Appendix A.3 in the JMLR version of + * the original LDA paper. * @param documents a subset of the test corpus * @param alpha document-topic Dirichlet prior parameters - * @param eta topic-word Dirichlet prior parameters + * @param eta topic-word Dirichlet prior parameter * @param lambda parameters for variational q(beta | lambda) topic-word distributions * @param gammaShape shape parameter for random initialization of variational
spark git commit: [SPARK-9582] [ML] LDA cleanups
Repository: spark Updated Branches: refs/heads/branch-1.5 e682ee254 -> fe4a4f41a [SPARK-9582] [ML] LDA cleanups Small cleanups to recent LDA additions and docs. CC: feynmanliang Author: Joseph K. Bradley Closes #7916 from jkbradley/lda-cleanups and squashes the following commits: f7021d9 [Joseph K. Bradley] broadcasting large matrices for LDA in local model and online learning 97947aa [Joseph K. Bradley] a few more cleanups 5b03f88 [Joseph K. Bradley] reverted split of lda log likelihood c566915 [Joseph K. Bradley] small edit to make review easier 63f6c7d [Joseph K. Bradley] clarified log likelihood for lda models (cherry picked from commit 1833d9c08f021d991334424d0a6d5ec21d1fccb2) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fe4a4f41 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fe4a4f41 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fe4a4f41 Branch: refs/heads/branch-1.5 Commit: fe4a4f41ad8b686455d58fc2fda9494e8dba5636 Parents: e682ee2 Author: Joseph K. Bradley Authored: Tue Aug 4 15:43:13 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 4 15:43:20 2015 -0700 -- .../spark/mllib/clustering/LDAModel.scala | 82 +++- .../spark/mllib/clustering/LDAOptimizer.scala | 19 +++-- 2 files changed, 58 insertions(+), 43 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fe4a4f41/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 6af90d7..33babda 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -27,6 +27,7 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.annotation.Experimental import org.apache.spark.api.java.JavaPairRDD +import org.apache.spark.broadcast.Broadcast import org.apache.spark.graphx.{Edge, EdgeContext, Graph, VertexId} import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors} import org.apache.spark.mllib.util.{Loader, Saveable} @@ -217,26 +218,28 @@ class LocalLDAModel private[clustering] ( // TODO: declare in LDAModel and override once implemented in DistributedLDAModel /** * Calculates a lower bound on the log likelihood of the entire corpus. + * + * See Equation (16) in original Online LDA paper. + * * @param documents test corpus to use for calculating log likelihood * @return variational lower bound on the log likelihood of the entire corpus */ - def logLikelihood(documents: RDD[(Long, Vector)]): Double = bound(documents, + def logLikelihood(documents: RDD[(Long, Vector)]): Double = logLikelihoodBound(documents, docConcentration, topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k, vocabSize) /** - * Calculate an upper bound bound on perplexity. See Equation (16) in original Online - * LDA paper. + * Calculate an upper bound bound on perplexity. (Lower is better.) + * See Equation (16) in original Online LDA paper. + * * @param documents test corpus to use for calculating perplexity - * @return variational upper bound on log perplexity per word + * @return Variational upper bound on log perplexity per token. 
*/ def logPerplexity(documents: RDD[(Long, Vector)]): Double = { -val corpusWords = documents +val corpusTokenCount = documents .map { case (_, termCounts) => termCounts.toArray.sum } .sum() -val perWordBound = -logLikelihood(documents) / corpusWords - -perWordBound +-logLikelihood(documents) / corpusTokenCount } /** @@ -244,17 +247,20 @@ class LocalLDAModel private[clustering] ( *log p(documents) >= E_q[log p(documents)] - E_q[log q(documents)] * This bound is derived by decomposing the LDA model to: *log p(documents) = E_q[log p(documents)] - E_q[log q(documents)] + D(q|p) - * and noting that the KL-divergence D(q|p) >= 0. See Equation (16) in original Online LDA paper. + * and noting that the KL-divergence D(q|p) >= 0. + * + * See Equation (16) in original Online LDA paper, as well as Appendix A.3 in the JMLR version of + * the original LDA paper. * @param documents a subset of the test corpus * @param alpha document-topic Dirichlet prior parameters - * @param eta topic-word Dirichlet prior parameters + * @param eta topic-word Dirichlet prior parameter * @param lambda parameters for variational q(beta | lambda) topic-word
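To make the clarified evaluation methods concrete, a small sketch of scoring a corpus with a LocalLDAModel. The toy bag-of-words corpus and the SparkContext `sc` are assumptions; only the online optimizer returns a LocalLDAModel directly, hence the cast.

{% highlight scala %}
import org.apache.spark.mllib.clustering.{LDA, LocalLDAModel}
import org.apache.spark.mllib.linalg.Vectors

// Toy corpus: (document id, term-count vector over a 5-term vocabulary).
val corpus = sc.parallelize(Seq(
  (0L, Vectors.dense(1.0, 2.0, 0.0, 0.0, 1.0)),
  (1L, Vectors.dense(0.0, 1.0, 3.0, 1.0, 0.0)),
  (2L, Vectors.dense(2.0, 0.0, 1.0, 4.0, 0.0))
))

val ldaModel = new LDA()
  .setK(2)
  .setOptimizer("online")
  .run(corpus)
  .asInstanceOf[LocalLDAModel]

// Variational lower bound on log p(corpus); higher (closer to 0) is better.
val ll = ldaModel.logLikelihood(corpus)
// Upper bound on log perplexity per token; lower is better.
val lp = ldaModel.logPerplexity(corpus)
{% endhighlight %}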
spark git commit: [SPARK-9609] [MLLIB] Fix spelling of Strategy.defaultStrategy
Repository: spark Updated Branches: refs/heads/master 7c8fc1f7c -> 629e26f7e [SPARK-9609] [MLLIB] Fix spelling of Strategy.defaultStrategy jkbradley Author: Feynman Liang Closes #7941 from feynmanliang/SPARK-9609-stategy-spelling and squashes the following commits: d2aafb1 [Feynman Liang] Add deprecated backwards compatibility aa090a8 [Feynman Liang] Fix spelling Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/629e26f7 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/629e26f7 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/629e26f7 Branch: refs/heads/master Commit: 629e26f7ee916e70f59b017cb6083aa441b26b2c Parents: 7c8fc1f Author: Feynman Liang Authored: Tue Aug 4 18:13:18 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 4 18:13:18 2015 -0700 -- .../src/main/scala/org/apache/spark/ml/tree/treeParams.scala | 2 +- .../spark/mllib/tree/configuration/BoostingStrategy.scala| 2 +- .../org/apache/spark/mllib/tree/configuration/Strategy.scala | 8 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/629e26f7/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala index e817090..dbd8d31 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala @@ -163,7 +163,7 @@ private[ml] trait DecisionTreeParams extends PredictorParams { oldAlgo: OldAlgo.Algo, oldImpurity: OldImpurity, subsamplingRate: Double): OldStrategy = { -val strategy = OldStrategy.defaultStategy(oldAlgo) +val strategy = OldStrategy.defaultStrategy(oldAlgo) strategy.impurity = oldImpurity strategy.checkpointInterval = getCheckpointInterval strategy.maxBins = getMaxBins http://git-wip-us.apache.org/repos/asf/spark/blob/629e26f7/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala index 9fd30c9..50fe2ac 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala @@ -90,7 +90,7 @@ object BoostingStrategy { * @return Configuration for boosting algorithm */ def defaultParams(algo: Algo): BoostingStrategy = { -val treeStrategy = Strategy.defaultStategy(algo) +val treeStrategy = Strategy.defaultStrategy(algo) treeStrategy.maxDepth = 3 algo match { case Algo.Classification => http://git-wip-us.apache.org/repos/asf/spark/blob/629e26f7/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala index ada227c..de2c784 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala @@ -178,14 +178,14 @@ object Strategy { * @param algo "Classification" or "Regression" */ def defaultStrategy(algo: String): Strategy = { -defaultStategy(Algo.fromString(algo)) +defaultStrategy(Algo.fromString(algo)) } /** * Construct a default set 
of parameters for [[org.apache.spark.mllib.tree.DecisionTree]] * @param algo Algo.Classification or Algo.Regression */ - def defaultStategy(algo: Algo): Strategy = algo match { + def defaultStrategy(algo: Algo): Strategy = algo match { case Algo.Classification => new Strategy(algo = Classification, impurity = Gini, maxDepth = 10, numClasses = 2) @@ -193,4 +193,8 @@ object Strategy { new Strategy(algo = Regression, impurity = Variance, maxDepth = 10, numClasses = 0) } + + @deprecated("Use Strategy.defaultStrategy instead.", "1.5.0") + def defaultStategy(algo: Algo): Strategy = defaultStrategy(algo) + }
spark git commit: [SPARK-9609] [MLLIB] Fix spelling of Strategy.defaultStrategy
Repository: spark Updated Branches: refs/heads/branch-1.5 1954a7bb1 -> 335097548 [SPARK-9609] [MLLIB] Fix spelling of Strategy.defaultStrategy jkbradley Author: Feynman Liang Closes #7941 from feynmanliang/SPARK-9609-stategy-spelling and squashes the following commits: d2aafb1 [Feynman Liang] Add deprecated backwards compatibility aa090a8 [Feynman Liang] Fix spelling (cherry picked from commit 629e26f7ee916e70f59b017cb6083aa441b26b2c) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/33509754 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/33509754 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/33509754 Branch: refs/heads/branch-1.5 Commit: 33509754843fe8eba303c720e6c0f6853b861e7e Parents: 1954a7b Author: Feynman Liang Authored: Tue Aug 4 18:13:18 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 4 18:13:27 2015 -0700 -- .../src/main/scala/org/apache/spark/ml/tree/treeParams.scala | 2 +- .../spark/mllib/tree/configuration/BoostingStrategy.scala| 2 +- .../org/apache/spark/mllib/tree/configuration/Strategy.scala | 8 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/33509754/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala index e817090..dbd8d31 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/tree/treeParams.scala @@ -163,7 +163,7 @@ private[ml] trait DecisionTreeParams extends PredictorParams { oldAlgo: OldAlgo.Algo, oldImpurity: OldImpurity, subsamplingRate: Double): OldStrategy = { -val strategy = OldStrategy.defaultStategy(oldAlgo) +val strategy = OldStrategy.defaultStrategy(oldAlgo) strategy.impurity = oldImpurity strategy.checkpointInterval = getCheckpointInterval strategy.maxBins = getMaxBins http://git-wip-us.apache.org/repos/asf/spark/blob/33509754/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala index 9fd30c9..50fe2ac 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.scala @@ -90,7 +90,7 @@ object BoostingStrategy { * @return Configuration for boosting algorithm */ def defaultParams(algo: Algo): BoostingStrategy = { -val treeStrategy = Strategy.defaultStategy(algo) +val treeStrategy = Strategy.defaultStrategy(algo) treeStrategy.maxDepth = 3 algo match { case Algo.Classification => http://git-wip-us.apache.org/repos/asf/spark/blob/33509754/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala index ada227c..de2c784 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/configuration/Strategy.scala @@ -178,14 +178,14 @@ object Strategy { * @param algo "Classification" or "Regression" */ def defaultStrategy(algo: String): Strategy = { 
-defaultStategy(Algo.fromString(algo)) +defaultStrategy(Algo.fromString(algo)) } /** * Construct a default set of parameters for [[org.apache.spark.mllib.tree.DecisionTree]] * @param algo Algo.Classification or Algo.Regression */ - def defaultStategy(algo: Algo): Strategy = algo match { + def defaultStrategy(algo: Algo): Strategy = algo match { case Algo.Classification => new Strategy(algo = Classification, impurity = Gini, maxDepth = 10, numClasses = 2) @@ -193,4 +193,8 @@ object Strategy { new Strategy(algo = Regression, impurity = Variance, maxDepth = 10, numClasses = 0) } + + @deprecated("Use Strategy.defaultStrategy instead.", "1.5.0") + def defaultStategy(algo: Algo): Strategy = defaultStrategy(algo) + }
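A one-line sketch of the corrected call site; the misspelled defaultStategy still compiles via the deprecated shim above, but new code should use the fixed name.

{% highlight scala %}
import org.apache.spark.mllib.tree.configuration.{Algo, Strategy}

// Correct spelling going forward; Strategy.defaultStategy is deprecated as of 1.5.0.
val strategy = Strategy.defaultStrategy(Algo.Classification)
strategy.maxDepth = 5
{% endhighlight %}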
spark git commit: [SPARK-8601] [ML] Add an option to disable standardization for linear regression
Repository: spark Updated Branches: refs/heads/master 629e26f7e -> d92fa1417 [SPARK-8601] [ML] Add an option to disable standardization for linear regression All compressed sensing applications, and some of the regression use-cases will have better result by turning the feature scaling off. However, if we implement this naively by training the dataset without doing any standardization, the rate of convergency will not be good. This can be implemented by still standardizing the training dataset but we penalize each component differently to get effectively the same objective function but a better numerical problem. As a result, for those columns with high variances, they will be penalized less, and vice versa. Without this, since all the features are standardized, so they will be penalized the same. In R, there is an option for this. standardize Logical flag for x variable standardization, prior to fitting the model sequence. The coefficients are always returned on the original scale. Default is standardize=TRUE. If variables are in the same units already, you might not wish to standardize. See details below for y standardization with family="gaussian". Note that the primary author for this PR is holdenk Author: Holden Karau Author: DB Tsai Closes #7875 from dbtsai/SPARK-8522 and squashes the following commits: e856036 [DB Tsai] scala doc 596e96c [DB Tsai] minor bbff347 [DB Tsai] naming baa0805 [DB Tsai] touch up d6234ba [DB Tsai] Merge branch 'master' into SPARK-8522-Disable-Linear_featureScaling-Spark-8601-in-Linear_regression 6b1dc09 [Holden Karau] Merge branch 'master' into SPARK-8522-Disable-Linear_featureScaling-Spark-8601-in-Linear_regression 332f140 [Holden Karau] Merge in master eebe10a [Holden Karau] Use same comparision operator throughout the test 3f92935 [Holden Karau] merge b83a41e [Holden Karau] Expand the tests and make them similar to the other PR also providing an option to disable standardization (but for LoR). 0c334a2 [Holden Karau] Remove extra line 99ce053 [Holden Karau] merge in master e54a8a9 [Holden Karau] Fix long line e47c574 [Holden Karau] Add support for L2 without standardization. 55d3a66 [Holden Karau] Add standardization param for linear regression 00a1dc5 [Holden Karau] Add the param to the linearregression impl Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d92fa141 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d92fa141 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d92fa141 Branch: refs/heads/master Commit: d92fa14179287c996407d9c7d249103109f9cdef Parents: 629e26f Author: Holden Karau Authored: Tue Aug 4 18:15:26 2015 -0700 Committer: Joseph K. 
Bradley Committed: Tue Aug 4 18:15:26 2015 -0700 -- .../ml/classification/LogisticRegression.scala | 6 +- .../spark/ml/regression/LinearRegression.scala | 70 - .../ml/regression/LinearRegressionSuite.scala | 278 ++- 3 files changed, 268 insertions(+), 86 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d92fa141/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index c937b960..0d07383 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -133,9 +133,9 @@ class LogisticRegression(override val uid: String) /** * Whether to standardize the training features before fitting the model. * The coefficients of models will be always returned on the original scale, - * so it will be transparent for users. Note that when no regularization, - * with or without standardization, the models should be always converged to - * the same solution. + * so it will be transparent for users. Note that with/without standardization, + * the models should be always converged to the same solution when no regularization + * is applied. In R's GLMNET package, the default behavior is true as well. * Default is true. * @group setParam * */ http://git-wip-us.apache.org/repos/asf/spark/blob/d92fa141/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala index 3b85ba0..92d819b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala +++ b
spark git commit: [SPARK-8601] [ML] Add an option to disable standardization for linear regression
Repository: spark Updated Branches: refs/heads/branch-1.5 335097548 -> 2237ddbe0 [SPARK-8601] [ML] Add an option to disable standardization for linear regression All compressed sensing applications, and some of the regression use-cases will have better result by turning the feature scaling off. However, if we implement this naively by training the dataset without doing any standardization, the rate of convergency will not be good. This can be implemented by still standardizing the training dataset but we penalize each component differently to get effectively the same objective function but a better numerical problem. As a result, for those columns with high variances, they will be penalized less, and vice versa. Without this, since all the features are standardized, so they will be penalized the same. In R, there is an option for this. standardize Logical flag for x variable standardization, prior to fitting the model sequence. The coefficients are always returned on the original scale. Default is standardize=TRUE. If variables are in the same units already, you might not wish to standardize. See details below for y standardization with family="gaussian". Note that the primary author for this PR is holdenk Author: Holden Karau Author: DB Tsai Closes #7875 from dbtsai/SPARK-8522 and squashes the following commits: e856036 [DB Tsai] scala doc 596e96c [DB Tsai] minor bbff347 [DB Tsai] naming baa0805 [DB Tsai] touch up d6234ba [DB Tsai] Merge branch 'master' into SPARK-8522-Disable-Linear_featureScaling-Spark-8601-in-Linear_regression 6b1dc09 [Holden Karau] Merge branch 'master' into SPARK-8522-Disable-Linear_featureScaling-Spark-8601-in-Linear_regression 332f140 [Holden Karau] Merge in master eebe10a [Holden Karau] Use same comparision operator throughout the test 3f92935 [Holden Karau] merge b83a41e [Holden Karau] Expand the tests and make them similar to the other PR also providing an option to disable standardization (but for LoR). 0c334a2 [Holden Karau] Remove extra line 99ce053 [Holden Karau] merge in master e54a8a9 [Holden Karau] Fix long line e47c574 [Holden Karau] Add support for L2 without standardization. 55d3a66 [Holden Karau] Add standardization param for linear regression 00a1dc5 [Holden Karau] Add the param to the linearregression impl (cherry picked from commit d92fa14179287c996407d9c7d249103109f9cdef) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2237ddbe Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2237ddbe Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2237ddbe Branch: refs/heads/branch-1.5 Commit: 2237ddbe027be084afd85fc5b7a7c22270b6e7f6 Parents: 3350975 Author: Holden Karau Authored: Tue Aug 4 18:15:26 2015 -0700 Committer: Joseph K. 
Bradley Committed: Tue Aug 4 18:15:35 2015 -0700 -- .../ml/classification/LogisticRegression.scala | 6 +- .../spark/ml/regression/LinearRegression.scala | 70 - .../ml/regression/LinearRegressionSuite.scala | 278 ++- 3 files changed, 268 insertions(+), 86 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2237ddbe/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index c937b960..0d07383 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -133,9 +133,9 @@ class LogisticRegression(override val uid: String) /** * Whether to standardize the training features before fitting the model. * The coefficients of models will be always returned on the original scale, - * so it will be transparent for users. Note that when no regularization, - * with or without standardization, the models should be always converged to - * the same solution. + * so it will be transparent for users. Note that with/without standardization, + * the models should be always converged to the same solution when no regularization + * is applied. In R's GLMNET package, the default behavior is true as well. * Default is true. * @group setParam * */ http://git-wip-us.apache.org/repos/asf/spark/blob/2237ddbe/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala inde
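A short sketch of the new flag on the ML LinearRegression estimator; the DataFrame `training` with "label" and "features" columns is an assumption.

{% highlight scala %}
import org.apache.spark.ml.regression.LinearRegression

val lr = new LinearRegression()
  .setRegParam(0.1)
  .setElasticNetParam(0.0)    // pure L2 penalty
  .setStandardization(false)  // new: penalize features on their original scale

val model = lr.fit(training)  // `training` is an assumed DataFrame
println(s"weights: ${model.weights} intercept: ${model.intercept}")
{% endhighlight %}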
spark git commit: [SPARK-9112] [ML] Implement Stats for LogisticRegression
Repository: spark Updated Branches: refs/heads/master 9f94c85ff -> c5c6aded6 [SPARK-9112] [ML] Implement Stats for LogisticRegression I have added support for stats in LogisticRegression. The API is similar to that of LinearRegression with LogisticRegressionTrainingSummary and LogisticRegressionSummary I have some queries and asked them inline. Author: MechCoder Closes #7538 from MechCoder/log_reg_stats and squashes the following commits: 2e9f7c7 [MechCoder] Change defs into lazy vals d775371 [MechCoder] Clean up class inheritance 9586125 [MechCoder] Add abstraction to handle Multiclass Metrics 40ad8ef [MechCoder] minor 640376a [MechCoder] remove unnecessary dataframe stuff and add docs 80d9954 [MechCoder] Added tests fbed861 [MechCoder] DataFrame support for metrics 70a0fc4 [MechCoder] [SPARK-9112] [ML] Implement Stats for LogisticRegression Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c5c6aded Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c5c6aded Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c5c6aded Branch: refs/heads/master Commit: c5c6aded641048a3e66ac79d9e84d34e4b1abae7 Parents: 9f94c85 Author: MechCoder Authored: Thu Aug 6 10:08:33 2015 -0700 Committer: Joseph K. Bradley Committed: Thu Aug 6 10:08:33 2015 -0700 -- .../ml/classification/LogisticRegression.scala | 166 ++- .../JavaLogisticRegressionSuite.java| 9 + .../LogisticRegressionSuite.scala | 37 - 3 files changed, 209 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c5c6aded/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 0d07383..f55134d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -30,10 +30,12 @@ import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.linalg.BLAS._ import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD -import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.{DataFrame, Row, SQLContext} +import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.storage.StorageLevel /** @@ -284,7 +286,13 @@ class LogisticRegression(override val uid: String) if (handlePersistence) instances.unpersist() -copyValues(new LogisticRegressionModel(uid, weights, intercept)) +val model = copyValues(new LogisticRegressionModel(uid, weights, intercept)) +val logRegSummary = new BinaryLogisticRegressionTrainingSummary( + model.transform(dataset), + $(probabilityCol), + $(labelCol), + objectiveHistory) +model.setSummary(logRegSummary) } override def copy(extra: ParamMap): LogisticRegression = defaultCopy(extra) @@ -319,6 +327,38 @@ class LogisticRegressionModel private[ml] ( override val numClasses: Int = 2 + private var trainingSummary: Option[LogisticRegressionTrainingSummary] = None + + /** + * Gets summary of model on training set. An exception is + * thrown if `trainingSummary == None`. 
+ */ + def summary: LogisticRegressionTrainingSummary = trainingSummary match { +case Some(summ) => summ +case None => + throw new SparkException( +"No training summary available for this LogisticRegressionModel", +new NullPointerException()) + } + + private[classification] def setSummary( + summary: LogisticRegressionTrainingSummary): this.type = { +this.trainingSummary = Some(summary) +this + } + + /** Indicates whether a training summary exists for this model instance. */ + def hasSummary: Boolean = trainingSummary.isDefined + + /** + * Evaluates the model on a testset. + * @param dataset Test dataset to evaluate model on. + */ + // TODO: decide on a good name before exposing to public API + private[classification] def evaluate(dataset: DataFrame): LogisticRegressionSummary = { +new BinaryLogisticRegressionSummary(this.transform(dataset), $(probabilityCol), $(labelCol)) + } + /** * Predict label for the given feature vector. * The behavior of this can be adjusted using [[thresholds]]. @@ -441,6 +481,128 @@ private[classification] class MultiClassSummarizer extends Serializ
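For illustration, a minimal Scala sketch of how the summary API introduced above could be used after fitting. The toy data is hypothetical, and the areaUnderROC metric on the binary summary is assumed from the BinaryClassificationMetrics-backed summary rather than shown in this excerpt.

import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.SQLContext

val sqlContext = new SQLContext(sc)  // assumes an existing SparkContext `sc`
import sqlContext.implicits._

// Hypothetical toy data.
val training = sc.parallelize(Seq(
  LabeledPoint(1.0, Vectors.dense(2.0, 1.0)),
  LabeledPoint(0.0, Vectors.dense(-1.0, 0.5)),
  LabeledPoint(1.0, Vectors.dense(1.5, -0.5)),
  LabeledPoint(0.0, Vectors.dense(-2.0, 1.0))
)).toDF()

val model = new LogisticRegression().setMaxIter(10).fit(training)

// The summary is recorded on the training set during fit().
if (model.hasSummary) {
  val summary = model.summary
  println(summary.objectiveHistory.mkString(", "))  // objective value at each iteration
  summary match {
    case binary: BinaryLogisticRegressionSummary =>
      println(binary.areaUnderROC)  // assumed metric on the binary summary
    case _ =>
  }
}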
spark git commit: [SPARK-9112] [ML] Implement Stats for LogisticRegression
Repository: spark Updated Branches: refs/heads/branch-1.5 cc4c569a8 -> 70b9ed11d [SPARK-9112] [ML] Implement Stats for LogisticRegression I have added support for stats in LogisticRegression. The API is similar to that of LinearRegression with LogisticRegressionTrainingSummary and LogisticRegressionSummary I have some queries and asked them inline. Author: MechCoder Closes #7538 from MechCoder/log_reg_stats and squashes the following commits: 2e9f7c7 [MechCoder] Change defs into lazy vals d775371 [MechCoder] Clean up class inheritance 9586125 [MechCoder] Add abstraction to handle Multiclass Metrics 40ad8ef [MechCoder] minor 640376a [MechCoder] remove unnecessary dataframe stuff and add docs 80d9954 [MechCoder] Added tests fbed861 [MechCoder] DataFrame support for metrics 70a0fc4 [MechCoder] [SPARK-9112] [ML] Implement Stats for LogisticRegression (cherry picked from commit c5c6aded641048a3e66ac79d9e84d34e4b1abae7) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/70b9ed11 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/70b9ed11 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/70b9ed11 Branch: refs/heads/branch-1.5 Commit: 70b9ed11d08014b96da9d5747c0cebb4927c0459 Parents: cc4c569 Author: MechCoder Authored: Thu Aug 6 10:08:33 2015 -0700 Committer: Joseph K. Bradley Committed: Thu Aug 6 10:08:43 2015 -0700 -- .../ml/classification/LogisticRegression.scala | 166 ++- .../JavaLogisticRegressionSuite.java| 9 + .../LogisticRegressionSuite.scala | 37 - 3 files changed, 209 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/70b9ed11/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index 0d07383..f55134d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -30,10 +30,12 @@ import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.linalg._ import org.apache.spark.mllib.linalg.BLAS._ import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD -import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.{DataFrame, Row, SQLContext} +import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.storage.StorageLevel /** @@ -284,7 +286,13 @@ class LogisticRegression(override val uid: String) if (handlePersistence) instances.unpersist() -copyValues(new LogisticRegressionModel(uid, weights, intercept)) +val model = copyValues(new LogisticRegressionModel(uid, weights, intercept)) +val logRegSummary = new BinaryLogisticRegressionTrainingSummary( + model.transform(dataset), + $(probabilityCol), + $(labelCol), + objectiveHistory) +model.setSummary(logRegSummary) } override def copy(extra: ParamMap): LogisticRegression = defaultCopy(extra) @@ -319,6 +327,38 @@ class LogisticRegressionModel private[ml] ( override val numClasses: Int = 2 + private var trainingSummary: Option[LogisticRegressionTrainingSummary] = None + + /** + * Gets summary of model on training set. 
An exception is + * thrown if `trainingSummary == None`. + */ + def summary: LogisticRegressionTrainingSummary = trainingSummary match { +case Some(summ) => summ +case None => + throw new SparkException( +"No training summary available for this LogisticRegressionModel", +new NullPointerException()) + } + + private[classification] def setSummary( + summary: LogisticRegressionTrainingSummary): this.type = { +this.trainingSummary = Some(summary) +this + } + + /** Indicates whether a training summary exists for this model instance. */ + def hasSummary: Boolean = trainingSummary.isDefined + + /** + * Evaluates the model on a testset. + * @param dataset Test dataset to evaluate model on. + */ + // TODO: decide on a good name before exposing to public API + private[classification] def evaluate(dataset: DataFrame): LogisticRegressionSummary = { +new BinaryLogisticRegressionSummary(this.transform(dataset), $(probabilityCol), $(labelCol)) + } + /** * Predict label for the given feature vector. * The behavior of this can be adjusted
spark git commit: [SPARK-9533] [PYSPARK] [ML] Add missing methods in Word2Vec ML
Repository: spark Updated Branches: refs/heads/master c5c6aded6 -> 076ec0568 [SPARK-9533] [PYSPARK] [ML] Add missing methods in Word2Vec ML After https://github.com/apache/spark/pull/7263 it is pretty straightforward to Python wrappers. Author: MechCoder Closes #7930 from MechCoder/spark-9533 and squashes the following commits: 1bea394 [MechCoder] make getVectors a lazy val 5522756 [MechCoder] [SPARK-9533] [PySpark] [ML] Add missing methods in Word2Vec ML Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/076ec056 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/076ec056 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/076ec056 Branch: refs/heads/master Commit: 076ec056818a65216eaf51aa5b3bd8f697c34748 Parents: c5c6ade Author: MechCoder Authored: Thu Aug 6 10:09:58 2015 -0700 Committer: Joseph K. Bradley Committed: Thu Aug 6 10:09:58 2015 -0700 -- .../org/apache/spark/ml/feature/Word2Vec.scala | 2 +- python/pyspark/ml/feature.py| 40 2 files changed, 41 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/076ec056/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala index b4f46ce..29acc3e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala @@ -153,7 +153,7 @@ class Word2VecModel private[ml] ( * Returns a dataframe with two fields, "word" and "vector", with "word" being a String and * and the vector the DenseVector that it is mapped to. */ - val getVectors: DataFrame = { + @transient lazy val getVectors: DataFrame = { val sc = SparkContext.getOrCreate() val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ http://git-wip-us.apache.org/repos/asf/spark/blob/076ec056/python/pyspark/ml/feature.py -- diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 3f04c41..cb4dfa2 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -15,11 +15,16 @@ # limitations under the License. # +import sys +if sys.version > '3': +basestring = str + from pyspark.rdd import ignore_unicode_prefix from pyspark.ml.param.shared import * from pyspark.ml.util import keyword_only from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer from pyspark.mllib.common import inherit_doc +from pyspark.mllib.linalg import _convert_to_vector __all__ = ['Binarizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', 'StandardScalerModel', @@ -954,6 +959,23 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has >>> sent = ("a b " * 100 + "a c " * 10).split(" ") >>> doc = sqlContext.createDataFrame([(sent,), (sent,)], ["sentence"]) >>> model = Word2Vec(vectorSize=5, seed=42, inputCol="sentence", outputCol="model").fit(doc) +>>> model.getVectors().show() ++++ +|word| vector| ++++ +| a|[-0.3511952459812...| +| b|[0.29077222943305...| +| c|[0.02315592765808...| ++++ +... +>>> model.findSynonyms("a", 2).show() +++---+ +|word| similarity| +++---+ +| b|0.29255685145799626| +| c|-0.5414068302988307| +++---+ +... >>> model.transform(doc).head().model DenseVector([-0.0422, -0.5138, -0.2546, 0.6885, 0.276]) """ @@ -1047,6 +1069,24 @@ class Word2VecModel(JavaModel): Model fitted by Word2Vec. 
""" +def getVectors(self): +""" +Returns the vector representation of the words as a dataframe +with two fields, word and vector. +""" +return self._call_java("getVectors") + +def findSynonyms(self, word, num): +""" +Find "num" number of words closest in similarity to "word". +word can be a string or vector representation. +Returns a dataframe with two fields word and similarity (which +gives the cosine similarity). +""" +if not isinstance(word, basestring): +word = _convert_to_vector(word) +return self._call_java("findSynonyms", word, num) + @inherit_doc class PCA(JavaEstimator, HasInputCol, HasOutputCol):
spark git commit: [SPARK-9533] [PYSPARK] [ML] Add missing methods in Word2Vec ML
Repository: spark Updated Branches: refs/heads/branch-1.5 70b9ed11d -> e24b97650 [SPARK-9533] [PYSPARK] [ML] Add missing methods in Word2Vec ML After https://github.com/apache/spark/pull/7263 it is pretty straightforward to Python wrappers. Author: MechCoder Closes #7930 from MechCoder/spark-9533 and squashes the following commits: 1bea394 [MechCoder] make getVectors a lazy val 5522756 [MechCoder] [SPARK-9533] [PySpark] [ML] Add missing methods in Word2Vec ML (cherry picked from commit 076ec056818a65216eaf51aa5b3bd8f697c34748) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e24b9765 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e24b9765 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e24b9765 Branch: refs/heads/branch-1.5 Commit: e24b976506dd8563e4fe9cc295c756a1ce979e0d Parents: 70b9ed1 Author: MechCoder Authored: Thu Aug 6 10:09:58 2015 -0700 Committer: Joseph K. Bradley Committed: Thu Aug 6 10:10:06 2015 -0700 -- .../org/apache/spark/ml/feature/Word2Vec.scala | 2 +- python/pyspark/ml/feature.py| 40 2 files changed, 41 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e24b9765/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala index b4f46ce..29acc3e 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala @@ -153,7 +153,7 @@ class Word2VecModel private[ml] ( * Returns a dataframe with two fields, "word" and "vector", with "word" being a String and * and the vector the DenseVector that it is mapped to. */ - val getVectors: DataFrame = { + @transient lazy val getVectors: DataFrame = { val sc = SparkContext.getOrCreate() val sqlContext = SQLContext.getOrCreate(sc) import sqlContext.implicits._ http://git-wip-us.apache.org/repos/asf/spark/blob/e24b9765/python/pyspark/ml/feature.py -- diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 3f04c41..cb4dfa2 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -15,11 +15,16 @@ # limitations under the License. # +import sys +if sys.version > '3': +basestring = str + from pyspark.rdd import ignore_unicode_prefix from pyspark.ml.param.shared import * from pyspark.ml.util import keyword_only from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer from pyspark.mllib.common import inherit_doc +from pyspark.mllib.linalg import _convert_to_vector __all__ = ['Binarizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', 'StandardScalerModel', @@ -954,6 +959,23 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has >>> sent = ("a b " * 100 + "a c " * 10).split(" ") >>> doc = sqlContext.createDataFrame([(sent,), (sent,)], ["sentence"]) >>> model = Word2Vec(vectorSize=5, seed=42, inputCol="sentence", outputCol="model").fit(doc) +>>> model.getVectors().show() ++++ +|word| vector| ++++ +| a|[-0.3511952459812...| +| b|[0.29077222943305...| +| c|[0.02315592765808...| ++++ +... +>>> model.findSynonyms("a", 2).show() +++---+ +|word| similarity| +++---+ +| b|0.29255685145799626| +| c|-0.5414068302988307| +++---+ +... 
>>> model.transform(doc).head().model DenseVector([-0.0422, -0.5138, -0.2546, 0.6885, 0.276]) """ @@ -1047,6 +1069,24 @@ class Word2VecModel(JavaModel): Model fitted by Word2Vec. """ +def getVectors(self): +""" +Returns the vector representation of the words as a dataframe +with two fields, word and vector. +""" +return self._call_java("getVectors") + +def findSynonyms(self, word, num): +""" +Find "num" number of words closest in similarity to "word". +word can be a string or vector representation. +Returns a dataframe with two fields word and similarity (which +gives the cosine similarity). +""" +if not isinstance(word, basestring): +word = _convert_to_vector(word) +return self._call_java("findSynonyms", word, num
spark git commit: [SPARK-9493] [ML] add featureIndex to handle vector features in IsotonicRegression
Repository: spark Updated Branches: refs/heads/master 1f62f104c -> 54c0789a0 [SPARK-9493] [ML] add featureIndex to handle vector features in IsotonicRegression This PR contains the following changes: * add `featureIndex` to handle vector features (in order to chain isotonic regression easily with output from logistic regression * make getter/setter names consistent with params * remove inheritance from Regressor because it is tricky to handle both `DoubleType` and `VectorType` * simplify test data generation jkbradley zapletal-martin Author: Xiangrui Meng Closes #7952 from mengxr/SPARK-9493 and squashes the following commits: 8818ac3 [Xiangrui Meng] address comments 05e2216 [Xiangrui Meng] address comments 8d08090 [Xiangrui Meng] add featureIndex to handle vector features make getter/setter names consistent with params remove inheritance from Regressor Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/54c0789a Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/54c0789a Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/54c0789a Branch: refs/heads/master Commit: 54c0789a05a783ce90e0e9848079be442a82966b Parents: 1f62f10 Author: Xiangrui Meng Authored: Thu Aug 6 13:29:31 2015 -0700 Committer: Joseph K. Bradley Committed: Thu Aug 6 13:29:31 2015 -0700 -- .../ml/regression/IsotonicRegression.scala | 202 ++- .../ml/regression/IsotonicRegressionSuite.scala | 82 2 files changed, 194 insertions(+), 90 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/54c0789a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala index 4ece8cf..f570590 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala @@ -17,44 +17,113 @@ package org.apache.spark.ml.regression +import org.apache.spark.Logging import org.apache.spark.annotation.Experimental -import org.apache.spark.ml.PredictorParams -import org.apache.spark.ml.param.{Param, ParamMap, BooleanParam} -import org.apache.spark.ml.util.{SchemaUtils, Identifiable} -import org.apache.spark.mllib.regression.{IsotonicRegression => MLlibIsotonicRegression} -import org.apache.spark.mllib.regression.{IsotonicRegressionModel => MLlibIsotonicRegressionModel} +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.ml.param._ +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol, HasPredictionCol} +import org.apache.spark.ml.util.{Identifiable, SchemaUtils} +import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors} +import org.apache.spark.mllib.regression.{IsotonicRegression => MLlibIsotonicRegression, IsotonicRegressionModel => MLlibIsotonicRegressionModel} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.types.{DoubleType, DataType} -import org.apache.spark.sql.{Row, DataFrame} +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.functions.{col, lit, udf} +import org.apache.spark.sql.types.{DoubleType, StructType} import org.apache.spark.storage.StorageLevel /** * Params for isotonic regression. 
*/ -private[regression] trait IsotonicRegressionParams extends PredictorParams { +private[regression] trait IsotonicRegressionBase extends Params with HasFeaturesCol + with HasLabelCol with HasPredictionCol with Logging { /** - * Param for weight column name. - * TODO: Move weightCol to sharedParams. - * + * Param for weight column name (default: none). * @group param */ + // TODO: Move weightCol to sharedParams. final val weightCol: Param[String] = -new Param[String](this, "weightCol", "weight column name") +new Param[String](this, "weightCol", + "weight column name. If this is not set or empty, we treat all instance weights as 1.0.") /** @group getParam */ final def getWeightCol: String = $(weightCol) /** - * Param for isotonic parameter. - * Isotonic (increasing) or antitonic (decreasing) sequence. + * Param for whether the output sequence should be isotonic/increasing (true) or + * antitonic/decreasing (false). * @group param */ final val isotonic: BooleanParam = -new BooleanParam(this, "isotonic", "isotonic (increasing) or antitonic (decreasing) sequence") +new BooleanParam(this, "isotonic", + "whether the output sequence should be isotonic/increasing (tru
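For illustration, a minimal Scala sketch of chaining isotonic regression onto vector features via the new featureIndex param. The toy data is hypothetical, and the setter names (setIsotonic, setFeatureIndex) are assumed to follow the params as described above.

import org.apache.spark.ml.regression.IsotonicRegression
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.SQLContext

val sqlContext = new SQLContext(sc)  // assumes an existing SparkContext `sc`

// Hypothetical data: the feature of interest sits at index 1 of the vector,
// e.g. a probability produced by an upstream logistic regression.
val data = sqlContext.createDataFrame(Seq(
  (1.0, Vectors.dense(0.0, 0.1)),
  (1.5, Vectors.dense(0.0, 0.4)),
  (2.0, Vectors.dense(0.0, 0.6)),
  (3.0, Vectors.dense(0.0, 0.9))
)).toDF("label", "features")

val ir = new IsotonicRegression()
  .setIsotonic(true)   // fit an increasing sequence
  .setFeatureIndex(1)  // use element 1 of the feature vector
val model = ir.fit(data)
model.transform(data).show()  // adds a prediction column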
spark git commit: [SPARK-9493] [ML] add featureIndex to handle vector features in IsotonicRegression
Repository: spark Updated Branches: refs/heads/branch-1.5 92e8acc98 -> ee43d355b [SPARK-9493] [ML] add featureIndex to handle vector features in IsotonicRegression This PR contains the following changes: * add `featureIndex` to handle vector features (in order to chain isotonic regression easily with output from logistic regression * make getter/setter names consistent with params * remove inheritance from Regressor because it is tricky to handle both `DoubleType` and `VectorType` * simplify test data generation jkbradley zapletal-martin Author: Xiangrui Meng Closes #7952 from mengxr/SPARK-9493 and squashes the following commits: 8818ac3 [Xiangrui Meng] address comments 05e2216 [Xiangrui Meng] address comments 8d08090 [Xiangrui Meng] add featureIndex to handle vector features make getter/setter names consistent with params remove inheritance from Regressor (cherry picked from commit 54c0789a05a783ce90e0e9848079be442a82966b) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ee43d355 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ee43d355 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ee43d355 Branch: refs/heads/branch-1.5 Commit: ee43d355bcfc9c3f4f281f0c44e1b1f331c7bb97 Parents: 92e8acc Author: Xiangrui Meng Authored: Thu Aug 6 13:29:31 2015 -0700 Committer: Joseph K. Bradley Committed: Thu Aug 6 13:29:38 2015 -0700 -- .../ml/regression/IsotonicRegression.scala | 202 ++- .../ml/regression/IsotonicRegressionSuite.scala | 82 2 files changed, 194 insertions(+), 90 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/ee43d355/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala index 4ece8cf..f570590 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala @@ -17,44 +17,113 @@ package org.apache.spark.ml.regression +import org.apache.spark.Logging import org.apache.spark.annotation.Experimental -import org.apache.spark.ml.PredictorParams -import org.apache.spark.ml.param.{Param, ParamMap, BooleanParam} -import org.apache.spark.ml.util.{SchemaUtils, Identifiable} -import org.apache.spark.mllib.regression.{IsotonicRegression => MLlibIsotonicRegression} -import org.apache.spark.mllib.regression.{IsotonicRegressionModel => MLlibIsotonicRegressionModel} +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.ml.param._ +import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol, HasPredictionCol} +import org.apache.spark.ml.util.{Identifiable, SchemaUtils} +import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors} +import org.apache.spark.mllib.regression.{IsotonicRegression => MLlibIsotonicRegression, IsotonicRegressionModel => MLlibIsotonicRegressionModel} import org.apache.spark.rdd.RDD -import org.apache.spark.sql.types.{DoubleType, DataType} -import org.apache.spark.sql.{Row, DataFrame} +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.functions.{col, lit, udf} +import org.apache.spark.sql.types.{DoubleType, StructType} import org.apache.spark.storage.StorageLevel /** * Params for isotonic regression. 
*/ -private[regression] trait IsotonicRegressionParams extends PredictorParams { +private[regression] trait IsotonicRegressionBase extends Params with HasFeaturesCol + with HasLabelCol with HasPredictionCol with Logging { /** - * Param for weight column name. - * TODO: Move weightCol to sharedParams. - * + * Param for weight column name (default: none). * @group param */ + // TODO: Move weightCol to sharedParams. final val weightCol: Param[String] = -new Param[String](this, "weightCol", "weight column name") +new Param[String](this, "weightCol", + "weight column name. If this is not set or empty, we treat all instance weights as 1.0.") /** @group getParam */ final def getWeightCol: String = $(weightCol) /** - * Param for isotonic parameter. - * Isotonic (increasing) or antitonic (decreasing) sequence. + * Param for whether the output sequence should be isotonic/increasing (true) or + * antitonic/decreasing (false). * @group param */ final val isotonic: BooleanParam = -new BooleanParam(this, "isotonic", "isotonic (increasing) or antitonic (decreasing) sequence") +new BooleanPar
spark git commit: Revert "[SPARK-8481] [MLLIB] GaussianMixtureModel.predict, GaussianMixtureModel.predictSoft variants for a single vector"
Repository: spark Updated Branches: refs/heads/branch-1.4 e5a994f21 -> 4b5bbc589 Revert "[SPARK-8481] [MLLIB] GaussianMixtureModel.predict, GaussianMixtureModel.predictSoft variants for a single vector" This reverts commit 07f778978d80f0af57d3dafda4c566a813ad2d09. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4b5bbc58 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4b5bbc58 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4b5bbc58 Branch: refs/heads/branch-1.4 Commit: 4b5bbc589e11d882c993a3e6daeb0cdad9789e76 Parents: e5a994f Author: Joseph K. Bradley Authored: Fri Aug 7 13:42:20 2015 -0700 Committer: Joseph K. Bradley Committed: Fri Aug 7 13:42:20 2015 -0700 -- .../spark/mllib/clustering/GaussianMixtureModel.scala | 13 - .../spark/mllib/clustering/GaussianMixtureSuite.scala | 10 -- 2 files changed, 23 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4b5bbc58/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala index 76aeebd..cb807c8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala @@ -66,12 +66,6 @@ class GaussianMixtureModel( responsibilityMatrix.map(r => r.indexOf(r.max)) } - /** Maps given point to its cluster index. */ - def predict(point: Vector): Int = { -val r = computeSoftAssignments(point.toBreeze.toDenseVector, gaussians, weights, k) -r.indexOf(r.max) - } - /** Java-friendly version of [[predict()]] */ def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]] @@ -90,13 +84,6 @@ class GaussianMixtureModel( } /** - * Given the input vector, return the membership values to all mixture components. - */ - def predictSoft(point: Vector): Array[Double] = { -computeSoftAssignments(point.toBreeze.toDenseVector, gaussians, weights, k) - } - - /** * Compute the partial assignments for each vector */ private def computeSoftAssignments( http://git-wip-us.apache.org/repos/asf/spark/blob/4b5bbc58/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala index b636d02..b218d72 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala @@ -148,16 +148,6 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext { } } - test("model prediction, parallel and local") { -val data = sc.parallelize(GaussianTestData.data) -val gmm = new GaussianMixture().setK(2).setSeed(0).run(data) - -val batchPredictions = gmm.predict(data) -batchPredictions.zip(data).collect().foreach { case (batchPred, datum) => - assert(batchPred === gmm.predict(datum)) -} - } - object GaussianTestData { val data = Array( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8481] [MLLIB] GaussianMixtureModel predict accepting single vector
Repository: spark Updated Branches: refs/heads/master 881548ab2 -> e2fbbe731 [SPARK-8481] [MLLIB] GaussianMixtureModel predict accepting single vector Resubmit of [https://github.com/apache/spark/pull/6906] for adding single-vec predict to GMMs CC: dkobylarz mengxr To be merged with master and branch-1.5 Primary author: dkobylarz Author: Dariusz Kobylarz Closes #8039 from jkbradley/gmm-predict-vec and squashes the following commits: bfbedc4 [Dariusz Kobylarz] [SPARK-8481] [MLlib] GaussianMixtureModel predict accepting single vector Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e2fbbe73 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e2fbbe73 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e2fbbe73 Branch: refs/heads/master Commit: e2fbbe73111d4624390f596a19a1799c86a05f6c Parents: 881548a Author: Dariusz Kobylarz Authored: Fri Aug 7 14:51:03 2015 -0700 Committer: Joseph K. Bradley Committed: Fri Aug 7 14:51:03 2015 -0700 -- .../spark/mllib/clustering/GaussianMixtureModel.scala | 13 + .../spark/mllib/clustering/GaussianMixtureSuite.scala | 10 ++ 2 files changed, 23 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e2fbbe73/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala index cb807c8..76aeebd 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala @@ -66,6 +66,12 @@ class GaussianMixtureModel( responsibilityMatrix.map(r => r.indexOf(r.max)) } + /** Maps given point to its cluster index. */ + def predict(point: Vector): Int = { +val r = computeSoftAssignments(point.toBreeze.toDenseVector, gaussians, weights, k) +r.indexOf(r.max) + } + /** Java-friendly version of [[predict()]] */ def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]] @@ -84,6 +90,13 @@ class GaussianMixtureModel( } /** + * Given the input vector, return the membership values to all mixture components. 
+ */ + def predictSoft(point: Vector): Array[Double] = { +computeSoftAssignments(point.toBreeze.toDenseVector, gaussians, weights, k) + } + + /** * Compute the partial assignments for each vector */ private def computeSoftAssignments( http://git-wip-us.apache.org/repos/asf/spark/blob/e2fbbe73/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala index b218d72..b636d02 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala @@ -148,6 +148,16 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext { } } + test("model prediction, parallel and local") { +val data = sc.parallelize(GaussianTestData.data) +val gmm = new GaussianMixture().setK(2).setSeed(0).run(data) + +val batchPredictions = gmm.predict(data) +batchPredictions.zip(data).collect().foreach { case (batchPred, datum) => + assert(batchPred === gmm.predict(datum)) +} + } + object GaussianTestData { val data = Array( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
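A minimal Scala sketch of the new single-vector variants next to the existing batch API, on hypothetical toy data (assumes an existing SparkContext `sc`):

import org.apache.spark.mllib.clustering.GaussianMixture
import org.apache.spark.mllib.linalg.Vectors

val data = sc.parallelize(Seq(
  Vectors.dense(-5.0, -5.0), Vectors.dense(-5.2, -4.8),
  Vectors.dense(5.0, 5.0), Vectors.dense(4.8, 5.3)
))
val gmm = new GaussianMixture().setK(2).setSeed(0L).run(data)

val batch = gmm.predict(data).collect()              // existing RDD-based API
val cluster = gmm.predict(Vectors.dense(5.1, 4.9))   // new: cluster index for one vector
val soft = gmm.predictSoft(Vectors.dense(5.1, 4.9))  // new: membership values per component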
spark git commit: [SPARK-8481] [MLLIB] GaussianMixtureModel predict accepting single vector
Repository: spark Updated Branches: refs/heads/branch-1.5 547120287 -> 295266049 [SPARK-8481] [MLLIB] GaussianMixtureModel predict accepting single vector Resubmit of [https://github.com/apache/spark/pull/6906] for adding single-vec predict to GMMs CC: dkobylarz mengxr To be merged with master and branch-1.5 Primary author: dkobylarz Author: Dariusz Kobylarz Closes #8039 from jkbradley/gmm-predict-vec and squashes the following commits: bfbedc4 [Dariusz Kobylarz] [SPARK-8481] [MLlib] GaussianMixtureModel predict accepting single vector (cherry picked from commit e2fbbe73111d4624390f596a19a1799c86a05f6c) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/29526604 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/29526604 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/29526604 Branch: refs/heads/branch-1.5 Commit: 29526604916a5e1dff12fcbc395f1039b3a69dcd Parents: 5471202 Author: Dariusz Kobylarz Authored: Fri Aug 7 14:51:03 2015 -0700 Committer: Joseph K. Bradley Committed: Fri Aug 7 14:51:13 2015 -0700 -- .../spark/mllib/clustering/GaussianMixtureModel.scala | 13 + .../spark/mllib/clustering/GaussianMixtureSuite.scala | 10 ++ 2 files changed, 23 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/29526604/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala index cb807c8..76aeebd 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/GaussianMixtureModel.scala @@ -66,6 +66,12 @@ class GaussianMixtureModel( responsibilityMatrix.map(r => r.indexOf(r.max)) } + /** Maps given point to its cluster index. */ + def predict(point: Vector): Int = { +val r = computeSoftAssignments(point.toBreeze.toDenseVector, gaussians, weights, k) +r.indexOf(r.max) + } + /** Java-friendly version of [[predict()]] */ def predict(points: JavaRDD[Vector]): JavaRDD[java.lang.Integer] = predict(points.rdd).toJavaRDD().asInstanceOf[JavaRDD[java.lang.Integer]] @@ -84,6 +90,13 @@ class GaussianMixtureModel( } /** + * Given the input vector, return the membership values to all mixture components. 
+ */ + def predictSoft(point: Vector): Array[Double] = { +computeSoftAssignments(point.toBreeze.toDenseVector, gaussians, weights, k) + } + + /** * Compute the partial assignments for each vector */ private def computeSoftAssignments( http://git-wip-us.apache.org/repos/asf/spark/blob/29526604/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala index b218d72..b636d02 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/clustering/GaussianMixtureSuite.scala @@ -148,6 +148,16 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext { } } + test("model prediction, parallel and local") { +val data = sc.parallelize(GaussianTestData.data) +val gmm = new GaussianMixture().setK(2).setSeed(0).run(data) + +val batchPredictions = gmm.predict(data) +batchPredictions.zip(data).collect().foreach { case (batchPred, datum) => + assert(batchPred === gmm.predict(datum)) +} + } + object GaussianTestData { val data = Array( - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9748] [MLLIB] Centriod typo in KMeansModel
Repository: spark Updated Branches: refs/heads/master e2fbbe731 -> 902334fd5 [SPARK-9748] [MLLIB] Centriod typo in KMeansModel A minor typo (centriod -> centroid). Readable variable names help every users. Author: Bertrand Dechoux Closes #8037 from BertrandDechoux/kmeans-typo and squashes the following commits: 47632fe [Bertrand Dechoux] centriod typo Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/902334fd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/902334fd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/902334fd Branch: refs/heads/master Commit: 902334fd55bbe40a57c1de2a9bdb25eddf1c8cf6 Parents: e2fbbe7 Author: Bertrand Dechoux Authored: Fri Aug 7 16:07:24 2015 -0700 Committer: Joseph K. Bradley Committed: Fri Aug 7 16:07:24 2015 -0700 -- .../org/apache/spark/mllib/clustering/KMeansModel.scala | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/902334fd/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala index 8ecb3df..9635902 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala @@ -120,11 +120,11 @@ object KMeansModel extends Loader[KMeansModel] { assert(className == thisClassName) assert(formatVersion == thisFormatVersion) val k = (metadata \ "k").extract[Int] - val centriods = sqlContext.read.parquet(Loader.dataPath(path)) - Loader.checkSchema[Cluster](centriods.schema) - val localCentriods = centriods.map(Cluster.apply).collect() - assert(k == localCentriods.size) - new KMeansModel(localCentriods.sortBy(_.id).map(_.point)) + val centroids = sqlContext.read.parquet(Loader.dataPath(path)) + Loader.checkSchema[Cluster](centroids.schema) + val localCentroids = centroids.map(Cluster.apply).collect() + assert(k == localCentroids.size) + new KMeansModel(localCentroids.sortBy(_.id).map(_.point)) } } } - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9756] [ML] Make constructors in ML decision trees private
Repository: spark Updated Branches: refs/heads/master 49702bd73 -> cd540c1e5 [SPARK-9756] [ML] Make constructors in ML decision trees private These should be made private until there is a public constructor for providing `rootNode: Node` to use these constructors. jkbradley Author: Feynman Liang Closes #8046 from feynmanliang/SPARK-9756 and squashes the following commits: 2cbdf08 [Feynman Liang] Make RFRegressionModel aux constructor private a06f596 [Feynman Liang] Make constructors in ML decision trees private Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cd540c1e Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cd540c1e Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cd540c1e Branch: refs/heads/master Commit: cd540c1e59561ad1fdac59af6170944c60e685d8 Parents: 49702bd Author: Feynman Liang Authored: Fri Aug 7 17:19:48 2015 -0700 Committer: Joseph K. Bradley Committed: Fri Aug 7 17:19:48 2015 -0700 -- .../apache/spark/ml/classification/DecisionTreeClassifier.scala | 2 +- .../apache/spark/ml/classification/RandomForestClassifier.scala | 5 - .../org/apache/spark/ml/regression/DecisionTreeRegressor.scala | 2 +- .../org/apache/spark/ml/regression/RandomForestRegressor.scala | 2 +- 4 files changed, 7 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/cd540c1e/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index f2b992f..29598f3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -117,7 +117,7 @@ final class DecisionTreeClassificationModel private[ml] ( * Construct a decision tree classification model. * @param rootNode Root node of tree, with other nodes attached. */ - def this(rootNode: Node, numClasses: Int) = + private[ml] def this(rootNode: Node, numClasses: Int) = this(Identifiable.randomUID("dtc"), rootNode, numClasses) override protected def predict(features: Vector): Double = { http://git-wip-us.apache.org/repos/asf/spark/blob/cd540c1e/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala index b59826a..156050a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala @@ -136,7 +136,10 @@ final class RandomForestClassificationModel private[ml] ( * Construct a random forest classification model, with all trees weighted equally. 
* @param trees Component trees */ - def this(trees: Array[DecisionTreeClassificationModel], numFeatures: Int, numClasses: Int) = + private[ml] def this( + trees: Array[DecisionTreeClassificationModel], + numFeatures: Int, + numClasses: Int) = this(Identifiable.randomUID("rfc"), trees, numFeatures, numClasses) override def trees: Array[DecisionTreeModel] = _trees.asInstanceOf[Array[DecisionTreeModel]] http://git-wip-us.apache.org/repos/asf/spark/blob/cd540c1e/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala index 4d30e4b..dc94a14 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala @@ -107,7 +107,7 @@ final class DecisionTreeRegressionModel private[ml] ( * Construct a decision tree regression model. * @param rootNode Root node of tree, with other nodes attached. */ - def this(rootNode: Node) = this(Identifiable.randomUID("dtr"), rootNode) + private[ml] def this(rootNode: Node) = this(Identifiable.randomUID("dtr"), rootNode) override protected def predict(features: Vector): Double = { rootNode.predictImpl(features).prediction http://git-wip-us.apache.org/repos/asf/spark/blob/cd540c1e/mllib/src/main/scala/org/apache/spar
spark git commit: [SPARK-9756] [ML] Make constructors in ML decision trees private
Repository: spark Updated Branches: refs/heads/branch-1.5 ea4dfb90a -> 2a179a94e [SPARK-9756] [ML] Make constructors in ML decision trees private These should be made private until there is a public constructor for providing `rootNode: Node` to use these constructors. jkbradley Author: Feynman Liang Closes #8046 from feynmanliang/SPARK-9756 and squashes the following commits: 2cbdf08 [Feynman Liang] Make RFRegressionModel aux constructor private a06f596 [Feynman Liang] Make constructors in ML decision trees private (cherry picked from commit cd540c1e59561ad1fdac59af6170944c60e685d8) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2a179a94 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2a179a94 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2a179a94 Branch: refs/heads/branch-1.5 Commit: 2a179a94e0717b8aa754732e43d2206c196a Parents: ea4dfb9 Author: Feynman Liang Authored: Fri Aug 7 17:19:48 2015 -0700 Committer: Joseph K. Bradley Committed: Fri Aug 7 17:19:59 2015 -0700 -- .../apache/spark/ml/classification/DecisionTreeClassifier.scala | 2 +- .../apache/spark/ml/classification/RandomForestClassifier.scala | 5 - .../org/apache/spark/ml/regression/DecisionTreeRegressor.scala | 2 +- .../org/apache/spark/ml/regression/RandomForestRegressor.scala | 2 +- 4 files changed, 7 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2a179a94/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala index f2b992f..29598f3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/DecisionTreeClassifier.scala @@ -117,7 +117,7 @@ final class DecisionTreeClassificationModel private[ml] ( * Construct a decision tree classification model. * @param rootNode Root node of tree, with other nodes attached. */ - def this(rootNode: Node, numClasses: Int) = + private[ml] def this(rootNode: Node, numClasses: Int) = this(Identifiable.randomUID("dtc"), rootNode, numClasses) override protected def predict(features: Vector): Double = { http://git-wip-us.apache.org/repos/asf/spark/blob/2a179a94/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala index b59826a..156050a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/RandomForestClassifier.scala @@ -136,7 +136,10 @@ final class RandomForestClassificationModel private[ml] ( * Construct a random forest classification model, with all trees weighted equally. 
* @param trees Component trees */ - def this(trees: Array[DecisionTreeClassificationModel], numFeatures: Int, numClasses: Int) = + private[ml] def this( + trees: Array[DecisionTreeClassificationModel], + numFeatures: Int, + numClasses: Int) = this(Identifiable.randomUID("rfc"), trees, numFeatures, numClasses) override def trees: Array[DecisionTreeModel] = _trees.asInstanceOf[Array[DecisionTreeModel]] http://git-wip-us.apache.org/repos/asf/spark/blob/2a179a94/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala index 4d30e4b..dc94a14 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/regression/DecisionTreeRegressor.scala @@ -107,7 +107,7 @@ final class DecisionTreeRegressionModel private[ml] ( * Construct a decision tree regression model. * @param rootNode Root node of tree, with other nodes attached. */ - def this(rootNode: Node) = this(Identifiable.randomUID("dtr"), rootNode) + private[ml] def this(rootNode: Node) = this(Identifiable.randomUID("dtr"), rootNode) override protected def predict(features: Vector): Double = { rootNode.predi
spark git commit: [SPARK-9719] [ML] Clean up Naive Bayes doc
Repository: spark Updated Branches: refs/heads/master cd540c1e5 -> 85be65b39 [SPARK-9719] [ML] Clean up Naive Bayes doc Small documentation cleanups, including: * Adds documentation for `pi` and `theta` * setParam to `setModelType` Author: Feynman Liang Closes #8047 from feynmanliang/SPARK-9719 and squashes the following commits: b372438 [Feynman Liang] Clean up naive bayes doc Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/85be65b3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/85be65b3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/85be65b3 Branch: refs/heads/master Commit: 85be65b39ce669f937a898195a844844d757666b Parents: cd540c1 Author: Feynman Liang Authored: Fri Aug 7 17:21:12 2015 -0700 Committer: Joseph K. Bradley Committed: Fri Aug 7 17:21:12 2015 -0700 -- .../scala/org/apache/spark/ml/classification/NaiveBayes.scala| 4 1 file changed, 4 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/85be65b3/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index b46b676..97cbaf1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala @@ -86,6 +86,7 @@ class NaiveBayes(override val uid: String) * Set the model type using a string (case-sensitive). * Supported options: "multinomial" and "bernoulli". * Default is "multinomial" + * @group setParam */ def setModelType(value: String): this.type = set(modelType, value) setDefault(modelType -> OldNaiveBayes.Multinomial) @@ -101,6 +102,9 @@ class NaiveBayes(override val uid: String) /** * Model produced by [[NaiveBayes]] + * @param pi log of class priors, whose dimension is C (number of classes) + * @param theta log of class conditional probabilities, whose dimension is C (number of classes) + * by D (number of features) */ class NaiveBayesModel private[ml] ( override val uid: String, - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
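For illustration, a minimal Scala sketch using the params documented above; the toy term-count data is hypothetical.

import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.SQLContext

val sqlContext = new SQLContext(sc)  // assumes an existing SparkContext `sc`
import sqlContext.implicits._

// Hypothetical term-count features for C = 3 classes and D = 3 features.
val training = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(1.0, 0.0, 0.0)),
  LabeledPoint(1.0, Vectors.dense(0.0, 2.0, 0.0)),
  LabeledPoint(2.0, Vectors.dense(0.0, 0.0, 3.0))
)).toDF()

val model = new NaiveBayes()
  .setModelType("multinomial")  // case-sensitive; "bernoulli" is the other option
  .fit(training)

println(model.pi)     // log of class priors, length C
println(model.theta)  // log of class conditional probabilities, C x D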
spark git commit: [SPARK-9719] [ML] Clean up Naive Bayes doc
Repository: spark Updated Branches: refs/heads/branch-1.5 2a179a94e -> c5d43d6c8 [SPARK-9719] [ML] Clean up Naive Bayes doc Small documentation cleanups, including: * Adds documentation for `pi` and `theta` * setParam to `setModelType` Author: Feynman Liang Closes #8047 from feynmanliang/SPARK-9719 and squashes the following commits: b372438 [Feynman Liang] Clean up naive bayes doc (cherry picked from commit 85be65b39ce669f937a898195a844844d757666b) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c5d43d6c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c5d43d6c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c5d43d6c Branch: refs/heads/branch-1.5 Commit: c5d43d6c82c87b1b14f73bba917b835f4975fb5a Parents: 2a179a9 Author: Feynman Liang Authored: Fri Aug 7 17:21:12 2015 -0700 Committer: Joseph K. Bradley Committed: Fri Aug 7 17:21:20 2015 -0700 -- .../scala/org/apache/spark/ml/classification/NaiveBayes.scala| 4 1 file changed, 4 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c5d43d6c/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala index b46b676..97cbaf1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala @@ -86,6 +86,7 @@ class NaiveBayes(override val uid: String) * Set the model type using a string (case-sensitive). * Supported options: "multinomial" and "bernoulli". * Default is "multinomial" + * @group setParam */ def setModelType(value: String): this.type = set(modelType, value) setDefault(modelType -> OldNaiveBayes.Multinomial) @@ -101,6 +102,9 @@ class NaiveBayes(override val uid: String) /** * Model produced by [[NaiveBayes]] + * @param pi log of class priors, whose dimension is C (number of classes) + * @param theta log of class conditional probabilities, whose dimension is C (number of classes) + * by D (number of features) */ class NaiveBayesModel private[ml] ( override val uid: String, - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9755] [MLLIB] Add docs to MultivariateOnlineSummarizer methods
Repository: spark Updated Branches: refs/heads/master 0f3366a4c -> 00b655cce [SPARK-9755] [MLLIB] Add docs to MultivariateOnlineSummarizer methods Adds method documentations back to `MultivariateOnlineSummarizer`, which were present in 1.4 but disappeared somewhere along the way to 1.5. jkbradley Author: Feynman Liang Closes #8045 from feynmanliang/SPARK-9755 and squashes the following commits: af67fde [Feynman Liang] Add MultivariateOnlineSummarizer docs Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/00b655cc Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/00b655cc Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/00b655cc Branch: refs/heads/master Commit: 00b655cced637e1c3b750c19266086b9dcd7c158 Parents: 0f3366a Author: Feynman Liang Authored: Mon Aug 10 11:01:45 2015 -0700 Committer: Joseph K. Bradley Committed: Mon Aug 10 11:01:45 2015 -0700 -- .../mllib/stat/MultivariateOnlineSummarizer.scala | 16 1 file changed, 16 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/00b655cc/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala index 62da9f2..64e4be0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala @@ -153,6 +153,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S } /** + * Sample mean of each dimension. + * * @since 1.1.0 */ override def mean: Vector = { @@ -168,6 +170,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S } /** + * Sample variance of each dimension. + * * @since 1.1.0 */ override def variance: Vector = { @@ -193,11 +197,15 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S } /** + * Sample size. + * * @since 1.1.0 */ override def count: Long = totalCnt /** + * Number of nonzero elements in each dimension. + * * @since 1.1.0 */ override def numNonzeros: Vector = { @@ -207,6 +215,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S } /** + * Maximum value of each dimension. + * * @since 1.1.0 */ override def max: Vector = { @@ -221,6 +231,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S } /** + * Minimum value of each dimension. + * * @since 1.1.0 */ override def min: Vector = { @@ -235,6 +247,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S } /** + * L2 (Euclidian) norm of each dimension. + * * @since 1.2.0 */ override def normL2: Vector = { @@ -252,6 +266,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S } /** + * L1 norm of each dimension. + * * @since 1.2.0 */ override def normL1: Vector = { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
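For reference, a minimal Scala sketch of the summarizer whose methods are documented above, aggregated over a hypothetical RDD (assumes an existing SparkContext `sc`):

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer

val data = sc.parallelize(Seq(
  Vectors.dense(1.0, 10.0, 100.0),
  Vectors.dense(2.0, 20.0, 200.0),
  Vectors.dense(3.0, 30.0, 300.0)
))

// Add each sample to a per-partition summarizer, then merge the partial results.
val summary = data.treeAggregate(new MultivariateOnlineSummarizer)(
  (summarizer, v) => summarizer.add(v),
  (s1, s2) => s1.merge(s2)
)

println(summary.mean)         // sample mean of each dimension
println(summary.variance)     // sample variance of each dimension
println(summary.numNonzeros)  // number of nonzero elements in each dimension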
spark git commit: [SPARK-9755] [MLLIB] Add docs to MultivariateOnlineSummarizer methods
Repository: spark Updated Branches: refs/heads/branch-1.5 94b2f5b32 -> 3ee2c8d16 [SPARK-9755] [MLLIB] Add docs to MultivariateOnlineSummarizer methods Adds method documentations back to `MultivariateOnlineSummarizer`, which were present in 1.4 but disappeared somewhere along the way to 1.5. jkbradley Author: Feynman Liang Closes #8045 from feynmanliang/SPARK-9755 and squashes the following commits: af67fde [Feynman Liang] Add MultivariateOnlineSummarizer docs (cherry picked from commit 00b655cced637e1c3b750c19266086b9dcd7c158) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3ee2c8d1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3ee2c8d1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3ee2c8d1 Branch: refs/heads/branch-1.5 Commit: 3ee2c8d169e48e0bca3fab702466e7a855f57f8e Parents: 94b2f5b Author: Feynman Liang Authored: Mon Aug 10 11:01:45 2015 -0700 Committer: Joseph K. Bradley Committed: Mon Aug 10 11:01:55 2015 -0700 -- .../mllib/stat/MultivariateOnlineSummarizer.scala | 16 1 file changed, 16 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/3ee2c8d1/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala index 62da9f2..64e4be0 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateOnlineSummarizer.scala @@ -153,6 +153,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S } /** + * Sample mean of each dimension. + * * @since 1.1.0 */ override def mean: Vector = { @@ -168,6 +170,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S } /** + * Sample variance of each dimension. + * * @since 1.1.0 */ override def variance: Vector = { @@ -193,11 +197,15 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S } /** + * Sample size. + * * @since 1.1.0 */ override def count: Long = totalCnt /** + * Number of nonzero elements in each dimension. + * * @since 1.1.0 */ override def numNonzeros: Vector = { @@ -207,6 +215,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S } /** + * Maximum value of each dimension. + * * @since 1.1.0 */ override def max: Vector = { @@ -221,6 +231,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S } /** + * Minimum value of each dimension. + * * @since 1.1.0 */ override def min: Vector = { @@ -235,6 +247,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S } /** + * L2 (Euclidian) norm of each dimension. + * * @since 1.2.0 */ override def normL2: Vector = { @@ -252,6 +266,8 @@ class MultivariateOnlineSummarizer extends MultivariateStatisticalSummary with S } /** + * L1 norm of each dimension. + * * @since 1.2.0 */ override def normL1: Vector = { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8764] [ML] string indexer should take option to handle unseen values
Repository: spark Updated Branches: refs/heads/master 8cad854ef -> dbd778d84 [SPARK-8764] [ML] string indexer should take option to handle unseen values As a precursor to adding a public constructor add an option to handle unseen values by skipping rather than throwing an exception (default remains throwing an exception), Author: Holden Karau Closes #7266 from holdenk/SPARK-8764-string-indexer-should-take-option-to-handle-unseen-values and squashes the following commits: 38a4de9 [Holden Karau] fix long line 045bf22 [Holden Karau] Add a second b entry so b gets 0 for sure 81dd312 [Holden Karau] Update the docs for handleInvalid param to be more descriptive 7f37f6e [Holden Karau] remove extra space (scala style) 414e249 [Holden Karau] And switch to using handleInvalid instead of skipInvalid 1e53f9b [Holden Karau] update the param (codegen side) 7a22215 [Holden Karau] fix typo 100a39b [Holden Karau] Merge in master aa5b093 [Holden Karau] Since we filter we should never go down this code path if getSkipInvalid is true 75ffa69 [Holden Karau] Remove extra newline d69ef5e [Holden Karau] Add a test b5734be [Holden Karau] Add support for unseen labels afecd4e [Holden Karau] Add a param to skip invalid entries. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dbd778d8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dbd778d8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dbd778d8 Branch: refs/heads/master Commit: dbd778d84d094ca142bc08c351478595b280bc2a Parents: 8cad854 Author: Holden Karau Authored: Tue Aug 11 11:33:36 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 11 11:33:36 2015 -0700 -- .../apache/spark/ml/feature/StringIndexer.scala | 26 +--- .../ml/param/shared/SharedParamsCodeGen.scala | 4 +++ .../spark/ml/param/shared/sharedParams.scala| 15 + .../spark/ml/feature/StringIndexerSuite.scala | 32 4 files changed, 73 insertions(+), 4 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/dbd778d8/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index ebfa972..e4485eb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -33,7 +33,8 @@ import org.apache.spark.util.collection.OpenHashMap /** * Base trait for [[StringIndexer]] and [[StringIndexerModel]]. */ -private[feature] trait StringIndexerBase extends Params with HasInputCol with HasOutputCol { +private[feature] trait StringIndexerBase extends Params with HasInputCol with HasOutputCol +with HasHandleInvalid { /** Validates and transforms the input schema. 
*/ protected def validateAndTransformSchema(schema: StructType): StructType = { @@ -66,12 +67,15 @@ class StringIndexer(override val uid: String) extends Estimator[StringIndexerMod def this() = this(Identifiable.randomUID("strIdx")) /** @group setParam */ + def setHandleInvalid(value: String): this.type = set(handleInvalid, value) + setDefault(handleInvalid, "error") + + /** @group setParam */ def setInputCol(value: String): this.type = set(inputCol, value) /** @group setParam */ def setOutputCol(value: String): this.type = set(outputCol, value) - // TODO: handle unseen labels override def fit(dataset: DataFrame): StringIndexerModel = { val counts = dataset.select(col($(inputCol)).cast(StringType)) @@ -112,6 +116,10 @@ class StringIndexerModel private[ml] ( } /** @group setParam */ + def setHandleInvalid(value: String): this.type = set(handleInvalid, value) + setDefault(handleInvalid, "error") + + /** @group setParam */ def setInputCol(value: String): this.type = set(inputCol, value) /** @group setParam */ @@ -128,14 +136,24 @@ class StringIndexerModel private[ml] ( if (labelToIndex.contains(label)) { labelToIndex(label) } else { -// TODO: handle unseen labels throw new SparkException(s"Unseen label: $label.") } } + val outputColName = $(outputCol) val metadata = NominalAttribute.defaultAttr .withName(outputColName).withValues(labels).toMetadata() -dataset.select(col("*"), +// If we are skipping invalid records, filter them out. +val filteredDataset = (getHandleInvalid) match { + case "skip" => { +val filterer = udf { label: String => + labelToIndex.contains(label) +} +dataset.where(filterer(dataset($(inputCol + } + case _ => dataset +} +
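A short sketch of the new option in use; trainingDF and testDF are assumed DataFrames with a string column named "category" (both names are illustrative, not part of the patch):

import org.apache.spark.ml.feature.StringIndexer

val indexer = new StringIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")
  .setHandleInvalid("skip")  // default is "error", which throws on labels unseen during fit

val model = indexer.fit(trainingDF)
// With "skip", rows of testDF whose label was not seen during fit are filtered out
// instead of triggering a SparkException.
val indexed = model.transform(testDF)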
spark git commit: [SPARK-9750] [MLLIB] Improve equals on SparseMatrix and DenseMatrix
Repository: spark Updated Branches: refs/heads/master 5831294a7 -> 520ad44b1 [SPARK-9750] [MLLIB] Improve equals on SparseMatrix and DenseMatrix Adds unit test for `equals` on `mllib.linalg.Matrix` class and `equals` to both `SparseMatrix` and `DenseMatrix`. Supports equality testing between `SparseMatrix` and `DenseMatrix`. mengxr Author: Feynman Liang Closes #8042 from feynmanliang/SPARK-9750 and squashes the following commits: bb70d5e [Feynman Liang] Breeze compare for dense matrices as well, in case other is sparse ab6f3c8 [Feynman Liang] Sparse matrix compare for equals 22782df [Feynman Liang] Add equality based on matrix semantics, not representation 78f9426 [Feynman Liang] Add casts 43d28fa [Feynman Liang] Fix failing test 6416fa0 [Feynman Liang] Add failing sparse matrix equals tests Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/520ad44b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/520ad44b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/520ad44b Branch: refs/heads/master Commit: 520ad44b17f72e6465bf990f64b4e289f8a83447 Parents: 5831294 Author: Feynman Liang Authored: Tue Aug 11 12:49:47 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 11 12:49:47 2015 -0700 -- .../org/apache/spark/mllib/linalg/Matrices.scala | 8 ++-- .../apache/spark/mllib/linalg/MatricesSuite.scala | 18 ++ 2 files changed, 24 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/520ad44b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index 1c85834..1139ce3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -257,8 +257,7 @@ class DenseMatrix( this(numRows, numCols, values, false) override def equals(o: Any): Boolean = o match { -case m: DenseMatrix => - m.numRows == numRows && m.numCols == numCols && Arrays.equals(toArray, m.toArray) +case m: Matrix => toBreeze == m.toBreeze case _ => false } @@ -519,6 +518,11 @@ class SparseMatrix( rowIndices: Array[Int], values: Array[Double]) = this(numRows, numCols, colPtrs, rowIndices, values, false) + override def equals(o: Any): Boolean = o match { +case m: Matrix => toBreeze == m.toBreeze +case _ => false + } + private[mllib] def toBreeze: BM[Double] = { if (!isTransposed) { new BSM[Double](values, numRows, numCols, colPtrs, rowIndices) http://git-wip-us.apache.org/repos/asf/spark/blob/520ad44b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala index a270ba2..bfd6d54 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala @@ -74,6 +74,24 @@ class MatricesSuite extends SparkFunSuite { } } + test("equals") { +val dm1 = Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)) +assert(dm1 === dm1) +assert(dm1 !== dm1.transpose) + +val dm2 = Matrices.dense(2, 2, Array(0.0, 2.0, 1.0, 3.0)) +assert(dm1 === dm2.transpose) + +val sm1 = dm1.asInstanceOf[DenseMatrix].toSparse +assert(sm1 === sm1) +assert(sm1 === dm1) +assert(sm1 !== sm1.transpose) + +val sm2 = dm2.asInstanceOf[DenseMatrix].toSparse +assert(sm1 === 
sm2.transpose) +assert(sm1 === dm2.transpose) + } + test("matrix copies are deep copies") { val m = 3 val n = 2 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9750] [MLLIB] Improve equals on SparseMatrix and DenseMatrix
Repository: spark Updated Branches: refs/heads/branch-1.5 767ee1884 -> 811d23f1c [SPARK-9750] [MLLIB] Improve equals on SparseMatrix and DenseMatrix Adds unit test for `equals` on `mllib.linalg.Matrix` class and `equals` to both `SparseMatrix` and `DenseMatrix`. Supports equality testing between `SparseMatrix` and `DenseMatrix`. mengxr Author: Feynman Liang Closes #8042 from feynmanliang/SPARK-9750 and squashes the following commits: bb70d5e [Feynman Liang] Breeze compare for dense matrices as well, in case other is sparse ab6f3c8 [Feynman Liang] Sparse matrix compare for equals 22782df [Feynman Liang] Add equality based on matrix semantics, not representation 78f9426 [Feynman Liang] Add casts 43d28fa [Feynman Liang] Fix failing test 6416fa0 [Feynman Liang] Add failing sparse matrix equals tests (cherry picked from commit 520ad44b17f72e6465bf990f64b4e289f8a83447) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/811d23f1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/811d23f1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/811d23f1 Branch: refs/heads/branch-1.5 Commit: 811d23f1c27e7f461f0d37d058c07885fb0e0750 Parents: 767ee18 Author: Feynman Liang Authored: Tue Aug 11 12:49:47 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 11 12:49:56 2015 -0700 -- .../org/apache/spark/mllib/linalg/Matrices.scala | 8 ++-- .../apache/spark/mllib/linalg/MatricesSuite.scala | 18 ++ 2 files changed, 24 insertions(+), 2 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/811d23f1/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala index 1c85834..1139ce3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/Matrices.scala @@ -257,8 +257,7 @@ class DenseMatrix( this(numRows, numCols, values, false) override def equals(o: Any): Boolean = o match { -case m: DenseMatrix => - m.numRows == numRows && m.numCols == numCols && Arrays.equals(toArray, m.toArray) +case m: Matrix => toBreeze == m.toBreeze case _ => false } @@ -519,6 +518,11 @@ class SparseMatrix( rowIndices: Array[Int], values: Array[Double]) = this(numRows, numCols, colPtrs, rowIndices, values, false) + override def equals(o: Any): Boolean = o match { +case m: Matrix => toBreeze == m.toBreeze +case _ => false + } + private[mllib] def toBreeze: BM[Double] = { if (!isTransposed) { new BSM[Double](values, numRows, numCols, colPtrs, rowIndices) http://git-wip-us.apache.org/repos/asf/spark/blob/811d23f1/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala index a270ba2..bfd6d54 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/MatricesSuite.scala @@ -74,6 +74,24 @@ class MatricesSuite extends SparkFunSuite { } } + test("equals") { +val dm1 = Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0)) +assert(dm1 === dm1) +assert(dm1 !== dm1.transpose) + +val dm2 = Matrices.dense(2, 2, Array(0.0, 2.0, 1.0, 3.0)) +assert(dm1 === dm2.transpose) + +val sm1 = dm1.asInstanceOf[DenseMatrix].toSparse +assert(sm1 === sm1) +assert(sm1 
=== dm1) +assert(sm1 !== sm1.transpose) + +val sm2 = dm2.asInstanceOf[DenseMatrix].toSparse +assert(sm1 === sm2.transpose) +assert(sm1 === dm2.transpose) + } + test("matrix copies are deep copies") { val m = 3 val n = 2 - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
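A short sketch of what the semantics-based equality added above gives you:

import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices}

// Column-major values, so this is the 2x2 matrix [[0.0, 2.0], [1.0, 3.0]].
val dense  = Matrices.dense(2, 2, Array(0.0, 1.0, 2.0, 3.0))
val sparse = dense.asInstanceOf[DenseMatrix].toSparse

println(sparse == dense)           // true: equal as matrices, despite different representations
println(dense == dense.transpose)  // false: same entries, but a different matrix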
spark git commit: [SPARK-9788] [MLLIB] Fix LDA Binary Compatibility
Repository: spark Updated Branches: refs/heads/branch-1.5 cdf781db6 -> 2273e7432 [SPARK-9788] [MLLIB] Fix LDA Binary Compatibility 1. Add âasymmetricDocConcentrationâ and revert docConcentration changes. If the (internal) doc concentration vector is a single value, âgetDocConcentration" returns it. If it is a constant vector, getDocConcentration returns the first item, and fails otherwise. 2. Give `LDAModel.gammaShape` a default value in `LDAModel` concrete class constructors. jkbradley Author: Feynman Liang Closes #8077 from feynmanliang/SPARK-9788 and squashes the following commits: 6b07bc8 [Feynman Liang] Code review changes 9d6a71e [Feynman Liang] Add asymmetricAlpha alias bf4e685 [Feynman Liang] Asymmetric docConcentration 4cab972 [Feynman Liang] Default gammaShape (cherry picked from commit be3e27164133025db860781bd5cdd3ca233edd21) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2273e743 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2273e743 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2273e743 Branch: refs/heads/branch-1.5 Commit: 2273e7432ec218ba163a94f86307ad11904a1dee Parents: cdf781d Author: Feynman Liang Authored: Tue Aug 11 14:21:53 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 11 14:22:02 2015 -0700 -- .../org/apache/spark/mllib/clustering/LDA.scala | 27 --- .../spark/mllib/clustering/LDAModel.scala | 11 .../spark/mllib/clustering/LDAOptimizer.scala | 28 ++-- .../spark/mllib/clustering/LDASuite.scala | 4 +-- 4 files changed, 46 insertions(+), 24 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2273e743/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala index ab124e6..0fc9b1a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala @@ -79,7 +79,24 @@ class LDA private ( * * This is the parameter to a Dirichlet distribution. */ - def getDocConcentration: Vector = this.docConcentration + def getAsymmetricDocConcentration: Vector = this.docConcentration + + /** + * Concentration parameter (commonly named "alpha") for the prior placed on documents' + * distributions over topics ("theta"). + * + * This method assumes the Dirichlet distribution is symmetric and can be described by a single + * [[Double]] parameter. It should fail if docConcentration is asymmetric. + */ + def getDocConcentration: Double = { +val parameter = docConcentration(0) +if (docConcentration.size == 1) { + parameter +} else { + require(docConcentration.toArray.forall(_ == parameter)) + parameter +} + } /** * Concentration parameter (commonly named "alpha") for the prior placed on documents' @@ -106,18 +123,22 @@ class LDA private ( * [[https://github.com/Blei-Lab/onlineldavb]]. */ def setDocConcentration(docConcentration: Vector): this.type = { +require(docConcentration.size > 0, "docConcentration must have > 0 elements") this.docConcentration = docConcentration this } - /** Replicates Double to create a symmetric prior */ + /** Replicates a [[Double]] docConcentration to create a symmetric prior. 
*/ def setDocConcentration(docConcentration: Double): this.type = { this.docConcentration = Vectors.dense(docConcentration) this } + /** Alias for [[getAsymmetricDocConcentration]] */ + def getAsymmetricAlpha: Vector = getAsymmetricDocConcentration + /** Alias for [[getDocConcentration]] */ - def getAlpha: Vector = getDocConcentration + def getAlpha: Double = getDocConcentration /** Alias for [[setDocConcentration()]] */ def setAlpha(alpha: Vector): this.type = setDocConcentration(alpha) http://git-wip-us.apache.org/repos/asf/spark/blob/2273e743/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 33babda..5dc637e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -27,7 +27,6 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.Spark
spark git commit: [SPARK-9788] [MLLIB] Fix LDA Binary Compatibility
Repository: spark Updated Branches: refs/heads/master 423cdfd83 -> be3e27164 [SPARK-9788] [MLLIB] Fix LDA Binary Compatibility 1. Add âasymmetricDocConcentrationâ and revert docConcentration changes. If the (internal) doc concentration vector is a single value, âgetDocConcentration" returns it. If it is a constant vector, getDocConcentration returns the first item, and fails otherwise. 2. Give `LDAModel.gammaShape` a default value in `LDAModel` concrete class constructors. jkbradley Author: Feynman Liang Closes #8077 from feynmanliang/SPARK-9788 and squashes the following commits: 6b07bc8 [Feynman Liang] Code review changes 9d6a71e [Feynman Liang] Add asymmetricAlpha alias bf4e685 [Feynman Liang] Asymmetric docConcentration 4cab972 [Feynman Liang] Default gammaShape Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/be3e2716 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/be3e2716 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/be3e2716 Branch: refs/heads/master Commit: be3e27164133025db860781bd5cdd3ca233edd21 Parents: 423cdfd Author: Feynman Liang Authored: Tue Aug 11 14:21:53 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 11 14:21:53 2015 -0700 -- .../org/apache/spark/mllib/clustering/LDA.scala | 27 --- .../spark/mllib/clustering/LDAModel.scala | 11 .../spark/mllib/clustering/LDAOptimizer.scala | 28 ++-- .../spark/mllib/clustering/LDASuite.scala | 4 +-- 4 files changed, 46 insertions(+), 24 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/be3e2716/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala index ab124e6..0fc9b1a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDA.scala @@ -79,7 +79,24 @@ class LDA private ( * * This is the parameter to a Dirichlet distribution. */ - def getDocConcentration: Vector = this.docConcentration + def getAsymmetricDocConcentration: Vector = this.docConcentration + + /** + * Concentration parameter (commonly named "alpha") for the prior placed on documents' + * distributions over topics ("theta"). + * + * This method assumes the Dirichlet distribution is symmetric and can be described by a single + * [[Double]] parameter. It should fail if docConcentration is asymmetric. + */ + def getDocConcentration: Double = { +val parameter = docConcentration(0) +if (docConcentration.size == 1) { + parameter +} else { + require(docConcentration.toArray.forall(_ == parameter)) + parameter +} + } /** * Concentration parameter (commonly named "alpha") for the prior placed on documents' @@ -106,18 +123,22 @@ class LDA private ( * [[https://github.com/Blei-Lab/onlineldavb]]. */ def setDocConcentration(docConcentration: Vector): this.type = { +require(docConcentration.size > 0, "docConcentration must have > 0 elements") this.docConcentration = docConcentration this } - /** Replicates Double to create a symmetric prior */ + /** Replicates a [[Double]] docConcentration to create a symmetric prior. 
*/ def setDocConcentration(docConcentration: Double): this.type = { this.docConcentration = Vectors.dense(docConcentration) this } + /** Alias for [[getAsymmetricDocConcentration]] */ + def getAsymmetricAlpha: Vector = getAsymmetricDocConcentration + /** Alias for [[getDocConcentration]] */ - def getAlpha: Vector = getDocConcentration + def getAlpha: Double = getDocConcentration /** Alias for [[setDocConcentration()]] */ def setAlpha(alpha: Vector): this.type = setDocConcentration(alpha) http://git-wip-us.apache.org/repos/asf/spark/blob/be3e2716/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 33babda..5dc637e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -27,7 +27,6 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.annotation.Experimental import org.apache.spark.api.java.JavaPairRDD -import org.apache.spark.broadcast.Bro
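A sketch of how the restored accessors behave (the concrete values are illustrative):

import org.apache.spark.mllib.clustering.LDA
import org.apache.spark.mllib.linalg.Vectors

val lda = new LDA().setK(3)

// A single Double is replicated into a symmetric Dirichlet prior.
lda.setDocConcentration(1.1)
println(lda.getDocConcentration)            // 1.1
println(lda.getAsymmetricDocConcentration)  // [1.1]

// For a genuinely asymmetric prior, getDocConcentration fails its constant-value check,
// so read it back through getAsymmetricDocConcentration (alias getAsymmetricAlpha).
lda.setDocConcentration(Vectors.dense(1.0, 2.0, 3.0))
println(lda.getAsymmetricDocConcentration)  // [1.0,2.0,3.0]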
spark git commit: [SPARK-9766] [ML] [PySpark] check and add miss docs for PySpark ML
Repository: spark Updated Branches: refs/heads/master 60103ecd3 -> 762bacc16 [SPARK-9766] [ML] [PySpark] check and add miss docs for PySpark ML Check and add miss docs for PySpark ML (this issue only check miss docs for o.a.s.ml not o.a.s.mllib). Author: Yanbo Liang Closes #8059 from yanboliang/SPARK-9766. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/762bacc1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/762bacc1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/762bacc1 Branch: refs/heads/master Commit: 762bacc16ac5e74c8b05a7c1e3e367d1d1633cef Parents: 60103ec Author: Yanbo Liang Authored: Wed Aug 12 13:24:18 2015 -0700 Committer: Joseph K. Bradley Committed: Wed Aug 12 13:24:18 2015 -0700 -- python/pyspark/ml/classification.py | 12 ++-- python/pyspark/ml/clustering.py | 4 +++- python/pyspark/ml/evaluation.py | 3 ++- python/pyspark/ml/feature.py| 9 + 4 files changed, 20 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/762bacc1/python/pyspark/ml/classification.py -- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 5978d8f..6702dce 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -34,6 +34,7 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti HasRegParam, HasTol, HasProbabilityCol, HasRawPredictionCol): """ Logistic regression. +Currently, this class only supports binary classification. >>> from pyspark.sql import Row >>> from pyspark.mllib.linalg import Vectors @@ -96,8 +97,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti # is an L2 penalty. For alpha = 1, it is an L1 penalty. self.elasticNetParam = \ Param(self, "elasticNetParam", - "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty " + - "is an L2 penalty. For alpha = 1, it is an L1 penalty.") + "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " + + "the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.") #: param for whether to fit an intercept term. self.fitIntercept = Param(self, "fitIntercept", "whether to fit an intercept term.") #: param for threshold in binary classification prediction, in range [0, 1]. @@ -656,6 +657,13 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H HasRawPredictionCol): """ Naive Bayes Classifiers. +It supports both Multinomial and Bernoulli NB. Multinomial NB + (`http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html`) +can handle finitely supported discrete data. For example, by converting documents into +TF-IDF vectors, it can be used for document classification. By making every vector a +binary (0/1) data, it can also be used as Bernoulli NB + (`http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html`). +The input feature values must be nonnegative. 
>>> from pyspark.sql import Row >>> from pyspark.mllib.linalg import Vectors http://git-wip-us.apache.org/repos/asf/spark/blob/762bacc1/python/pyspark/ml/clustering.py -- diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index b5e9b65..4833871 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -37,7 +37,9 @@ class KMeansModel(JavaModel): @inherit_doc class KMeans(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed): """ -K-means Clustering +K-means clustering with support for multiple parallel runs and a k-means++ like initialization +mode (the k-means|| algorithm by Bahmani et al). When multiple concurrent runs are requested, +they are executed together with joint passes over the data for efficiency. >>> from pyspark.mllib.linalg import Vectors >>> data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),), http://git-wip-us.apache.org/repos/asf/spark/blob/762bacc1/python/pyspark/ml/evaluation.py -- diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py index 06e8093..2734092 100644 --- a/python/pyspark/ml/evaluation.py +++ b/python/pyspark/ml/evaluation.py @@ -23,7 +23,8 @@ from pyspark.ml.param.shared import HasLabelCol, HasPredictionCol, HasRawPredict from pyspark.ml.util import
spark git commit: [SPARK-9766] [ML] [PySpark] check and add miss docs for PySpark ML
Repository: spark Updated Branches: refs/heads/branch-1.5 8629c33b6 -> 65b5b2172 [SPARK-9766] [ML] [PySpark] check and add miss docs for PySpark ML Check and add miss docs for PySpark ML (this issue only check miss docs for o.a.s.ml not o.a.s.mllib). Author: Yanbo Liang Closes #8059 from yanboliang/SPARK-9766. (cherry picked from commit 762bacc16ac5e74c8b05a7c1e3e367d1d1633cef) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/65b5b217 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/65b5b217 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/65b5b217 Branch: refs/heads/branch-1.5 Commit: 65b5b2172681285a027e865ec9a91779e902e85a Parents: 8629c33 Author: Yanbo Liang Authored: Wed Aug 12 13:24:18 2015 -0700 Committer: Joseph K. Bradley Committed: Wed Aug 12 13:24:29 2015 -0700 -- python/pyspark/ml/classification.py | 12 ++-- python/pyspark/ml/clustering.py | 4 +++- python/pyspark/ml/evaluation.py | 3 ++- python/pyspark/ml/feature.py| 9 + 4 files changed, 20 insertions(+), 8 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/65b5b217/python/pyspark/ml/classification.py -- diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 5978d8f..6702dce 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -34,6 +34,7 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti HasRegParam, HasTol, HasProbabilityCol, HasRawPredictionCol): """ Logistic regression. +Currently, this class only supports binary classification. >>> from pyspark.sql import Row >>> from pyspark.mllib.linalg import Vectors @@ -96,8 +97,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti # is an L2 penalty. For alpha = 1, it is an L1 penalty. self.elasticNetParam = \ Param(self, "elasticNetParam", - "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty " + - "is an L2 penalty. For alpha = 1, it is an L1 penalty.") + "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, " + + "the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.") #: param for whether to fit an intercept term. self.fitIntercept = Param(self, "fitIntercept", "whether to fit an intercept term.") #: param for threshold in binary classification prediction, in range [0, 1]. @@ -656,6 +657,13 @@ class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, H HasRawPredictionCol): """ Naive Bayes Classifiers. +It supports both Multinomial and Bernoulli NB. Multinomial NB + (`http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html`) +can handle finitely supported discrete data. For example, by converting documents into +TF-IDF vectors, it can be used for document classification. By making every vector a +binary (0/1) data, it can also be used as Bernoulli NB + (`http://nlp.stanford.edu/IR-book/html/htmledition/the-bernoulli-model-1.html`). +The input feature values must be nonnegative. 
>>> from pyspark.sql import Row >>> from pyspark.mllib.linalg import Vectors http://git-wip-us.apache.org/repos/asf/spark/blob/65b5b217/python/pyspark/ml/clustering.py -- diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index b5e9b65..4833871 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -37,7 +37,9 @@ class KMeansModel(JavaModel): @inherit_doc class KMeans(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed): """ -K-means Clustering +K-means clustering with support for multiple parallel runs and a k-means++ like initialization +mode (the k-means|| algorithm by Bahmani et al). When multiple concurrent runs are requested, +they are executed together with joint passes over the data for efficiency. >>> from pyspark.mllib.linalg import Vectors >>> data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),), http://git-wip-us.apache.org/repos/asf/spark/blob/65b5b217/python/pyspark/ml/evaluation.py -- diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py index 06e8093..2734092 100644 --- a/python/pyspark/ml/evaluation.py +++ b/python/pyspark/ml/evaluation.py @@ -23,7 +23,8 @@
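The Naive Bayes docstring above summarizes behavior shared with the Scala API; a minimal spark.mllib sketch of the two model types, where termFreqData and binaryData are assumed RDD[LabeledPoint] inputs (illustrative names only):

import org.apache.spark.mllib.classification.NaiveBayes

// Multinomial NB: nonnegative count-like features, e.g. TF-IDF vectors of documents.
val multinomialModel = NaiveBayes.train(termFreqData, lambda = 1.0, modelType = "multinomial")

// Bernoulli NB: the same documents recoded as 0/1 presence features.
val bernoulliModel = NaiveBayes.train(binaryData, lambda = 1.0, modelType = "bernoulli")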
spark git commit: [SPARK-9789] [ML] Added logreg threshold param back
Repository: spark Updated Branches: refs/heads/branch-1.5 65b5b2172 -> bdf8dc15d [SPARK-9789] [ML] Added logreg threshold param back Reinstated LogisticRegression.threshold Param for binary compatibility. Param thresholds overrides threshold, if set. CC: mengxr dbtsai feynmanliang Author: Joseph K. Bradley Closes #8079 from jkbradley/logreg-reinstate-threshold. (cherry picked from commit 551def5d6972440365bd7436d484a67138d9a8f3) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bdf8dc15 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bdf8dc15 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bdf8dc15 Branch: refs/heads/branch-1.5 Commit: bdf8dc15d3b310c8cd84c71999b1bca4d9bc825e Parents: 65b5b21 Author: Joseph K. Bradley Authored: Wed Aug 12 14:27:13 2015 -0700 Committer: Joseph K. Bradley Committed: Wed Aug 12 14:27:21 2015 -0700 -- .../ml/classification/LogisticRegression.scala | 127 +++ .../ml/param/shared/SharedParamsCodeGen.scala | 4 +- .../spark/ml/param/shared/sharedParams.scala| 6 +- .../JavaLogisticRegressionSuite.java| 7 +- .../LogisticRegressionSuite.scala | 33 +++-- python/pyspark/ml/classification.py | 98 -- 6 files changed, 199 insertions(+), 76 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bdf8dc15/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index f55134d..5bcd711 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -34,8 +34,7 @@ import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.apache.spark.sql.functions.{col, udf} +import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.storage.StorageLevel /** @@ -43,44 +42,115 @@ import org.apache.spark.storage.StorageLevel */ private[classification] trait LogisticRegressionParams extends ProbabilisticClassifierParams with HasRegParam with HasElasticNetParam with HasMaxIter with HasFitIntercept with HasTol - with HasStandardization { + with HasStandardization with HasThreshold { /** - * Version of setThresholds() for binary classification, available for backwards - * compatibility. + * Set threshold in binary classification, in range [0, 1]. * - * Calling this with threshold p will effectively call `setThresholds(Array(1-p, p))`. + * If the estimated probability of class label 1 is > threshold, then predict 1, else 0. + * A high threshold encourages the model to predict 0 more often; + * a low threshold encourages the model to predict 1 more often. + * + * Note: Calling this with threshold p is equivalent to calling `setThresholds(Array(1-p, p))`. + * When [[setThreshold()]] is called, any user-set value for [[thresholds]] will be cleared. + * If both [[threshold]] and [[thresholds]] are set in a ParamMap, then they must be + * equivalent. + * + * Default is 0.5. 
+ * @group setParam + */ + def setThreshold(value: Double): this.type = { +if (isSet(thresholds)) clear(thresholds) +set(threshold, value) + } + + /** + * Get threshold for binary classification. + * + * If [[threshold]] is set, returns that value. + * Otherwise, if [[thresholds]] is set with length 2 (i.e., binary classification), + * this returns the equivalent threshold: {{{1 / (1 + thresholds(0) / thresholds(1))}}}. + * Otherwise, returns [[threshold]] default value. + * + * @group getParam + * @throws IllegalArgumentException if [[thresholds]] is set to an array of length other than 2. + */ + override def getThreshold: Double = { +checkThresholdConsistency() +if (isSet(thresholds)) { + val ts = $(thresholds) + require(ts.length == 2, "Logistic Regression getThreshold only applies to" + +" binary classification, but thresholds has length != 2. thresholds: " + ts.mkString(",")) + 1.0 / (1.0 + ts(0) / ts(1)) +} else { + $(threshold) +} + } + + /** + * Set thresholds in multiclass (or binary) classification to adjust the probability of + * predicting each class.
spark git commit: [SPARK-9789] [ML] Added logreg threshold param back
Repository: spark Updated Branches: refs/heads/master 762bacc16 -> 551def5d6 [SPARK-9789] [ML] Added logreg threshold param back Reinstated LogisticRegression.threshold Param for binary compatibility. Param thresholds overrides threshold, if set. CC: mengxr dbtsai feynmanliang Author: Joseph K. Bradley Closes #8079 from jkbradley/logreg-reinstate-threshold. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/551def5d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/551def5d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/551def5d Branch: refs/heads/master Commit: 551def5d6972440365bd7436d484a67138d9a8f3 Parents: 762bacc Author: Joseph K. Bradley Authored: Wed Aug 12 14:27:13 2015 -0700 Committer: Joseph K. Bradley Committed: Wed Aug 12 14:27:13 2015 -0700 -- .../ml/classification/LogisticRegression.scala | 127 +++ .../ml/param/shared/SharedParamsCodeGen.scala | 4 +- .../spark/ml/param/shared/sharedParams.scala| 6 +- .../JavaLogisticRegressionSuite.java| 7 +- .../LogisticRegressionSuite.scala | 33 +++-- python/pyspark/ml/classification.py | 98 -- 6 files changed, 199 insertions(+), 76 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/551def5d/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala index f55134d..5bcd711 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala @@ -34,8 +34,7 @@ import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{DataFrame, Row, SQLContext} -import org.apache.spark.sql.functions.{col, udf} +import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.storage.StorageLevel /** @@ -43,44 +42,115 @@ import org.apache.spark.storage.StorageLevel */ private[classification] trait LogisticRegressionParams extends ProbabilisticClassifierParams with HasRegParam with HasElasticNetParam with HasMaxIter with HasFitIntercept with HasTol - with HasStandardization { + with HasStandardization with HasThreshold { /** - * Version of setThresholds() for binary classification, available for backwards - * compatibility. + * Set threshold in binary classification, in range [0, 1]. * - * Calling this with threshold p will effectively call `setThresholds(Array(1-p, p))`. + * If the estimated probability of class label 1 is > threshold, then predict 1, else 0. + * A high threshold encourages the model to predict 0 more often; + * a low threshold encourages the model to predict 1 more often. + * + * Note: Calling this with threshold p is equivalent to calling `setThresholds(Array(1-p, p))`. + * When [[setThreshold()]] is called, any user-set value for [[thresholds]] will be cleared. + * If both [[threshold]] and [[thresholds]] are set in a ParamMap, then they must be + * equivalent. + * + * Default is 0.5. + * @group setParam + */ + def setThreshold(value: Double): this.type = { +if (isSet(thresholds)) clear(thresholds) +set(threshold, value) + } + + /** + * Get threshold for binary classification. + * + * If [[threshold]] is set, returns that value. 
+ * Otherwise, if [[thresholds]] is set with length 2 (i.e., binary classification), + * this returns the equivalent threshold: {{{1 / (1 + thresholds(0) / thresholds(1))}}}. + * Otherwise, returns [[threshold]] default value. + * + * @group getParam + * @throws IllegalArgumentException if [[thresholds]] is set to an array of length other than 2. + */ + override def getThreshold: Double = { +checkThresholdConsistency() +if (isSet(thresholds)) { + val ts = $(thresholds) + require(ts.length == 2, "Logistic Regression getThreshold only applies to" + +" binary classification, but thresholds has length != 2. thresholds: " + ts.mkString(",")) + 1.0 / (1.0 + ts(0) / ts(1)) +} else { + $(threshold) +} + } + + /** + * Set thresholds in multiclass (or binary) classification to adjust the probability of + * predicting each class. Array must have length equal to the number of classes, with values >= 0. + * The class with largest value p/
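A sketch of the threshold/thresholds equivalence spelled out in the new docs (numbers are illustrative):

import org.apache.spark.ml.classification.LogisticRegression

val lr = new LogisticRegression()

// Predict class 1 only when the estimated P(label = 1) exceeds 0.6.
lr.setThreshold(0.6)

// Equivalent in the binary case: thresholds = Array(1 - p, p).
lr.setThresholds(Array(0.4, 0.6))

// getThreshold recovers 1 / (1 + thresholds(0) / thresholds(1)), i.e. 0.6 here.
println(lr.getThreshold)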
spark git commit: [SPARK-9073] [ML] spark.ml Models copy() should call setParent when there is a parent
Repository: spark Updated Branches: refs/heads/branch-1.5 5592d162a -> fe05142f5 [SPARK-9073] [ML] spark.ml Models copy() should call setParent when there is a parent Copied ML models must have the same parent of original ones Author: lewuathe Author: Lewuathe Closes #7447 from Lewuathe/SPARK-9073. (cherry picked from commit 2932e25da4532de9e86b01d08bce0cb680874e70) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fe05142f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fe05142f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fe05142f Branch: refs/heads/branch-1.5 Commit: fe05142f5bc6b11ba9d5d2d77f989610178fc7b5 Parents: 5592d16 Author: lewuathe Authored: Thu Aug 13 09:17:19 2015 -0700 Committer: Joseph K. Bradley Committed: Thu Aug 13 09:17:31 2015 -0700 -- .../examples/ml/JavaDeveloperApiExample.java| 3 +- .../spark/examples/ml/DeveloperApiExample.scala | 2 +- .../scala/org/apache/spark/ml/Pipeline.scala| 2 +- .../classification/DecisionTreeClassifier.scala | 1 + .../spark/ml/classification/GBTClassifier.scala | 2 +- .../ml/classification/LogisticRegression.scala | 2 +- .../spark/ml/classification/OneVsRest.scala | 2 +- .../classification/RandomForestClassifier.scala | 1 + .../apache/spark/ml/feature/Bucketizer.scala| 4 ++- .../scala/org/apache/spark/ml/feature/IDF.scala | 2 +- .../apache/spark/ml/feature/MinMaxScaler.scala | 2 +- .../scala/org/apache/spark/ml/feature/PCA.scala | 2 +- .../spark/ml/feature/StandardScaler.scala | 2 +- .../apache/spark/ml/feature/StringIndexer.scala | 2 +- .../apache/spark/ml/feature/VectorIndexer.scala | 2 +- .../org/apache/spark/ml/feature/Word2Vec.scala | 2 +- .../apache/spark/ml/recommendation/ALS.scala| 2 +- .../ml/regression/DecisionTreeRegressor.scala | 2 +- .../spark/ml/regression/GBTRegressor.scala | 2 +- .../spark/ml/regression/LinearRegression.scala | 2 +- .../ml/regression/RandomForestRegressor.scala | 2 +- .../apache/spark/ml/tuning/CrossValidator.scala | 2 +- .../org/apache/spark/ml/PipelineSuite.scala | 3 ++ .../DecisionTreeClassifierSuite.scala | 4 +++ .../ml/classification/GBTClassifierSuite.scala | 4 +++ .../LogisticRegressionSuite.scala | 4 +++ .../ml/classification/OneVsRestSuite.scala | 6 +++- .../RandomForestClassifierSuite.scala | 4 +++ .../spark/ml/feature/BucketizerSuite.scala | 1 + .../spark/ml/feature/MinMaxScalerSuite.scala| 4 +++ .../org/apache/spark/ml/feature/PCASuite.scala | 4 +++ .../spark/ml/feature/StringIndexerSuite.scala | 5 .../spark/ml/feature/VectorIndexerSuite.scala | 5 .../apache/spark/ml/feature/Word2VecSuite.scala | 4 +++ .../spark/ml/recommendation/ALSSuite.scala | 4 +++ .../regression/DecisionTreeRegressorSuite.scala | 11 +++ .../spark/ml/regression/GBTRegressorSuite.scala | 5 .../ml/regression/LinearRegressionSuite.scala | 5 .../regression/RandomForestRegressorSuite.scala | 7 - .../spark/ml/tuning/CrossValidatorSuite.scala | 5 .../apache/spark/ml/util/MLTestingUtils.scala | 30 41 files changed, 138 insertions(+), 22 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fe05142f/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java index 9df26ff..3f1fe90 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java +++ 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java @@ -230,6 +230,7 @@ class MyJavaLogisticRegressionModel */ @Override public MyJavaLogisticRegressionModel copy(ParamMap extra) { -return copyValues(new MyJavaLogisticRegressionModel(uid(), weights_), extra); +return copyValues(new MyJavaLogisticRegressionModel(uid(), weights_), extra) + .setParent(parent()); } } http://git-wip-us.apache.org/repos/asf/spark/blob/fe05142f/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala index 78f31b4..340c355 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExa
spark git commit: [SPARK-9073] [ML] spark.ml Models copy() should call setParent when there is a parent
Repository: spark Updated Branches: refs/heads/master 699303101 -> 2932e25da [SPARK-9073] [ML] spark.ml Models copy() should call setParent when there is a parent Copied ML models must have the same parent of original ones Author: lewuathe Author: Lewuathe Closes #7447 from Lewuathe/SPARK-9073. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2932e25d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2932e25d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2932e25d Branch: refs/heads/master Commit: 2932e25da4532de9e86b01d08bce0cb680874e70 Parents: 6993031 Author: lewuathe Authored: Thu Aug 13 09:17:19 2015 -0700 Committer: Joseph K. Bradley Committed: Thu Aug 13 09:17:19 2015 -0700 -- .../examples/ml/JavaDeveloperApiExample.java| 3 +- .../spark/examples/ml/DeveloperApiExample.scala | 2 +- .../scala/org/apache/spark/ml/Pipeline.scala| 2 +- .../classification/DecisionTreeClassifier.scala | 1 + .../spark/ml/classification/GBTClassifier.scala | 2 +- .../ml/classification/LogisticRegression.scala | 2 +- .../spark/ml/classification/OneVsRest.scala | 2 +- .../classification/RandomForestClassifier.scala | 1 + .../apache/spark/ml/feature/Bucketizer.scala| 4 ++- .../scala/org/apache/spark/ml/feature/IDF.scala | 2 +- .../apache/spark/ml/feature/MinMaxScaler.scala | 2 +- .../scala/org/apache/spark/ml/feature/PCA.scala | 2 +- .../spark/ml/feature/StandardScaler.scala | 2 +- .../apache/spark/ml/feature/StringIndexer.scala | 2 +- .../apache/spark/ml/feature/VectorIndexer.scala | 2 +- .../org/apache/spark/ml/feature/Word2Vec.scala | 2 +- .../apache/spark/ml/recommendation/ALS.scala| 2 +- .../ml/regression/DecisionTreeRegressor.scala | 2 +- .../spark/ml/regression/GBTRegressor.scala | 2 +- .../spark/ml/regression/LinearRegression.scala | 2 +- .../ml/regression/RandomForestRegressor.scala | 2 +- .../apache/spark/ml/tuning/CrossValidator.scala | 2 +- .../org/apache/spark/ml/PipelineSuite.scala | 3 ++ .../DecisionTreeClassifierSuite.scala | 4 +++ .../ml/classification/GBTClassifierSuite.scala | 4 +++ .../LogisticRegressionSuite.scala | 4 +++ .../ml/classification/OneVsRestSuite.scala | 6 +++- .../RandomForestClassifierSuite.scala | 4 +++ .../spark/ml/feature/BucketizerSuite.scala | 1 + .../spark/ml/feature/MinMaxScalerSuite.scala| 4 +++ .../org/apache/spark/ml/feature/PCASuite.scala | 4 +++ .../spark/ml/feature/StringIndexerSuite.scala | 5 .../spark/ml/feature/VectorIndexerSuite.scala | 5 .../apache/spark/ml/feature/Word2VecSuite.scala | 4 +++ .../spark/ml/recommendation/ALSSuite.scala | 4 +++ .../regression/DecisionTreeRegressorSuite.scala | 11 +++ .../spark/ml/regression/GBTRegressorSuite.scala | 5 .../ml/regression/LinearRegressionSuite.scala | 5 .../regression/RandomForestRegressorSuite.scala | 7 - .../spark/ml/tuning/CrossValidatorSuite.scala | 5 .../apache/spark/ml/util/MLTestingUtils.scala | 30 41 files changed, 138 insertions(+), 22 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/2932e25d/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java index 9df26ff..3f1fe90 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java +++ b/examples/src/main/java/org/apache/spark/examples/ml/JavaDeveloperApiExample.java @@ -230,6 +230,7 @@ class MyJavaLogisticRegressionModel 
*/ @Override public MyJavaLogisticRegressionModel copy(ParamMap extra) { -return copyValues(new MyJavaLogisticRegressionModel(uid(), weights_), extra); +return copyValues(new MyJavaLogisticRegressionModel(uid(), weights_), extra) + .setParent(parent()); } } http://git-wip-us.apache.org/repos/asf/spark/blob/2932e25d/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala -- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala index 78f31b4..340c355 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/DeveloperApiExample.scala @@ -179,7 +179,7 @@ private class MyLogisticRegressionModel( * This is used for the default imple
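For a custom Scala model built on the developer API, the corresponding fix is a one-line change inside the model class's copy() (a fragment sketch; MyLogisticRegressionModel, uid and weights stand in for the model's own members):

override def copy(extra: ParamMap): MyLogisticRegressionModel = {
  // Propagate the parent Estimator so the copy behaves like the original fitted model.
  copyValues(new MyLogisticRegressionModel(uid, weights), extra).setParent(parent)
}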
spark git commit: [SPARK-8965] [DOCS] Add ml-guide Python Example: Estimator, Transformer, and Param
Repository: spark Updated Branches: refs/heads/master 2932e25da -> 7a539ef3b [SPARK-8965] [DOCS] Add ml-guide Python Example: Estimator, Transformer, and Param Added ml-guide Python Example: Estimator, Transformer, and Param /docs/_site/ml-guide.html Author: Rosstin Closes #8081 from Rosstin/SPARK-8965. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7a539ef3 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7a539ef3 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7a539ef3 Branch: refs/heads/master Commit: 7a539ef3b1792764f866fa88c84c78ad59903f21 Parents: 2932e25 Author: Rosstin Authored: Thu Aug 13 09:18:39 2015 -0700 Committer: Joseph K. Bradley Committed: Thu Aug 13 09:18:39 2015 -0700 -- docs/ml-guide.md | 68 +++ 1 file changed, 68 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7a539ef3/docs/ml-guide.md -- diff --git a/docs/ml-guide.md b/docs/ml-guide.md index b6ca50e..a03ab43 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -355,6 +355,74 @@ jsc.stop(); {% endhighlight %} + +{% highlight python %} +from pyspark import SparkContext +from pyspark.mllib.regression import LabeledPoint +from pyspark.ml.classification import LogisticRegression +from pyspark.ml.param import Param, Params +from pyspark.sql import Row, SQLContext + +sc = SparkContext(appName="SimpleParamsExample") +sqlContext = SQLContext(sc) + +# Prepare training data. +# We use LabeledPoint. +# Spark SQL can convert RDDs of LabeledPoints into DataFrames. +training = sc.parallelize([LabeledPoint(1.0, [0.0, 1.1, 0.1]), + LabeledPoint(0.0, [2.0, 1.0, -1.0]), + LabeledPoint(0.0, [2.0, 1.3, 1.0]), + LabeledPoint(1.0, [0.0, 1.2, -0.5])]) + +# Create a LogisticRegression instance. This instance is an Estimator. +lr = LogisticRegression(maxIter=10, regParam=0.01) +# Print out the parameters, documentation, and any default values. +print "LogisticRegression parameters:\n" + lr.explainParams() + "\n" + +# Learn a LogisticRegression model. This uses the parameters stored in lr. +model1 = lr.fit(training.toDF()) + +# Since model1 is a Model (i.e., a transformer produced by an Estimator), +# we can view the parameters it used during fit(). +# This prints the parameter (name: value) pairs, where names are unique IDs for this +# LogisticRegression instance. +print "Model 1 was fit using parameters: " +print model1.extractParamMap() + +# We may alternatively specify parameters using a Python dictionary as a paramMap +paramMap = {lr.maxIter: 20} +paramMap[lr.maxIter] = 30 # Specify 1 Param, overwriting the original maxIter. +paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55}) # Specify multiple Params. + +# You can combine paramMaps, which are python dictionaries. +paramMap2 = {lr.probabilityCol: "myProbability"} # Change output column name +paramMapCombined = paramMap.copy() +paramMapCombined.update(paramMap2) + +# Now learn a new model using the paramMapCombined parameters. +# paramMapCombined overrides all parameters set earlier via lr.set* methods. +model2 = lr.fit(training.toDF(), paramMapCombined) +print "Model 2 was fit using parameters: " +print model2.extractParamMap() + +# Prepare test data +test = sc.parallelize([LabeledPoint(1.0, [-1.0, 1.5, 1.3]), + LabeledPoint(0.0, [ 3.0, 2.0, -0.1]), + LabeledPoint(1.0, [ 0.0, 2.2, -1.5])]) + +# Make predictions on test data using the Transformer.transform() method. +# LogisticRegression.transform will only use the 'features' column. 
+# Note that model2.transform() outputs a "myProbability" column instead of the usual +# 'probability' column since we renamed the lr.probabilityCol parameter previously. +prediction = model2.transform(test.toDF()) +selected = prediction.select("features", "label", "myProbability", "prediction") +for row in selected.collect(): +print row + +sc.stop() +{% endhighlight %} + + ## Example: Pipeline - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8965] [DOCS] Add ml-guide Python Example: Estimator, Transformer, and Param
Repository: spark Updated Branches: refs/heads/branch-1.5 fe05142f5 -> 49085b56c [SPARK-8965] [DOCS] Add ml-guide Python Example: Estimator, Transformer, and Param Added ml-guide Python Example: Estimator, Transformer, and Param /docs/_site/ml-guide.html Author: Rosstin Closes #8081 from Rosstin/SPARK-8965. (cherry picked from commit 7a539ef3b1792764f866fa88c84c78ad59903f21) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/49085b56 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/49085b56 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/49085b56 Branch: refs/heads/branch-1.5 Commit: 49085b56c10a2d05345b343277ddf19b502aee9c Parents: fe05142 Author: Rosstin Authored: Thu Aug 13 09:18:39 2015 -0700 Committer: Joseph K. Bradley Committed: Thu Aug 13 09:18:50 2015 -0700 -- docs/ml-guide.md | 68 +++ 1 file changed, 68 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/49085b56/docs/ml-guide.md -- diff --git a/docs/ml-guide.md b/docs/ml-guide.md index b6ca50e..a03ab43 100644 --- a/docs/ml-guide.md +++ b/docs/ml-guide.md @@ -355,6 +355,74 @@ jsc.stop(); {% endhighlight %} + +{% highlight python %} +from pyspark import SparkContext +from pyspark.mllib.regression import LabeledPoint +from pyspark.ml.classification import LogisticRegression +from pyspark.ml.param import Param, Params +from pyspark.sql import Row, SQLContext + +sc = SparkContext(appName="SimpleParamsExample") +sqlContext = SQLContext(sc) + +# Prepare training data. +# We use LabeledPoint. +# Spark SQL can convert RDDs of LabeledPoints into DataFrames. +training = sc.parallelize([LabeledPoint(1.0, [0.0, 1.1, 0.1]), + LabeledPoint(0.0, [2.0, 1.0, -1.0]), + LabeledPoint(0.0, [2.0, 1.3, 1.0]), + LabeledPoint(1.0, [0.0, 1.2, -0.5])]) + +# Create a LogisticRegression instance. This instance is an Estimator. +lr = LogisticRegression(maxIter=10, regParam=0.01) +# Print out the parameters, documentation, and any default values. +print "LogisticRegression parameters:\n" + lr.explainParams() + "\n" + +# Learn a LogisticRegression model. This uses the parameters stored in lr. +model1 = lr.fit(training.toDF()) + +# Since model1 is a Model (i.e., a transformer produced by an Estimator), +# we can view the parameters it used during fit(). +# This prints the parameter (name: value) pairs, where names are unique IDs for this +# LogisticRegression instance. +print "Model 1 was fit using parameters: " +print model1.extractParamMap() + +# We may alternatively specify parameters using a Python dictionary as a paramMap +paramMap = {lr.maxIter: 20} +paramMap[lr.maxIter] = 30 # Specify 1 Param, overwriting the original maxIter. +paramMap.update({lr.regParam: 0.1, lr.threshold: 0.55}) # Specify multiple Params. + +# You can combine paramMaps, which are python dictionaries. +paramMap2 = {lr.probabilityCol: "myProbability"} # Change output column name +paramMapCombined = paramMap.copy() +paramMapCombined.update(paramMap2) + +# Now learn a new model using the paramMapCombined parameters. +# paramMapCombined overrides all parameters set earlier via lr.set* methods. +model2 = lr.fit(training.toDF(), paramMapCombined) +print "Model 2 was fit using parameters: " +print model2.extractParamMap() + +# Prepare test data +test = sc.parallelize([LabeledPoint(1.0, [-1.0, 1.5, 1.3]), + LabeledPoint(0.0, [ 3.0, 2.0, -0.1]), + LabeledPoint(1.0, [ 0.0, 2.2, -1.5])]) + +# Make predictions on test data using the Transformer.transform() method. 
+# LogisticRegression.transform will only use the 'features' column. +# Note that model2.transform() outputs a "myProbability" column instead of the usual +# 'probability' column since we renamed the lr.probabilityCol parameter previously. +prediction = model2.transform(test.toDF()) +selected = prediction.select("features", "label", "myProbability", "prediction") +for row in selected.collect(): +print row + +sc.stop() +{% endhighlight %} + + ## Example: Pipeline - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-9661] [MLLIB] [ML] Java compatibility
Repository: spark Updated Branches: refs/heads/branch-1.5 30460206f -> 875ecc7f6 [SPARK-9661] [MLLIB] [ML] Java compatibility I skimmed through the docs for various instance of Object and replaced them with Java compaible versions of the same. 1. Some methods in LDAModel. 2. runMiniBatchSGD 3. kolmogorovSmirnovTest Author: MechCoder Closes #8126 from MechCoder/java_incop. (cherry picked from commit 864de8eaf4b6ad5c9099f6f29e251c56b029f631) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/875ecc7f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/875ecc7f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/875ecc7f Branch: refs/heads/branch-1.5 Commit: 875ecc7f61bf487ad8291e3c867a45f25c8852da Parents: 3046020 Author: MechCoder Authored: Thu Aug 13 13:42:35 2015 -0700 Committer: Joseph K. Bradley Committed: Thu Aug 13 13:42:44 2015 -0700 -- .../spark/mllib/clustering/LDAModel.scala | 27 ++-- .../apache/spark/mllib/stat/Statistics.scala| 16 +++- .../spark/mllib/clustering/JavaLDASuite.java| 24 + .../spark/mllib/stat/JavaStatisticsSuite.java | 22 .../spark/mllib/clustering/LDASuite.scala | 13 ++ 5 files changed, 99 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/875ecc7f/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 5dc637e..f31949f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -26,7 +26,7 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.annotation.Experimental -import org.apache.spark.api.java.JavaPairRDD +import org.apache.spark.api.java.{JavaPairRDD, JavaRDD} import org.apache.spark.graphx.{Edge, EdgeContext, Graph, VertexId} import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors} import org.apache.spark.mllib.util.{Loader, Saveable} @@ -228,6 +228,11 @@ class LocalLDAModel private[clustering] ( docConcentration, topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k, vocabSize) + /** Java-friendly version of [[logLikelihood]] */ + def logLikelihood(documents: JavaPairRDD[java.lang.Long, Vector]): Double = { +logLikelihood(documents.rdd.asInstanceOf[RDD[(Long, Vector)]]) + } + /** * Calculate an upper bound bound on perplexity. (Lower is better.) * See Equation (16) in original Online LDA paper. 
@@ -242,6 +247,11 @@ class LocalLDAModel private[clustering] ( -logLikelihood(documents) / corpusTokenCount } + /** Java-friendly version of [[logPerplexity]] */ + def logPerplexity(documents: JavaPairRDD[java.lang.Long, Vector]): Double = { +logPerplexity(documents.rdd.asInstanceOf[RDD[(Long, Vector)]]) + } + /** * Estimate the variational likelihood bound of from `documents`: *log p(documents) >= E_q[log p(documents)] - E_q[log q(documents)] @@ -341,8 +351,14 @@ class LocalLDAModel private[clustering] ( } } -} + /** Java-friendly version of [[topicDistributions]] */ + def topicDistributions( + documents: JavaPairRDD[java.lang.Long, Vector]): JavaPairRDD[java.lang.Long, Vector] = { +val distributions = topicDistributions(documents.rdd.asInstanceOf[RDD[(Long, Vector)]]) +JavaPairRDD.fromRDD(distributions.asInstanceOf[RDD[(java.lang.Long, Vector)]]) + } +} @Experimental object LocalLDAModel extends Loader[LocalLDAModel] { @@ -657,6 +673,13 @@ class DistributedLDAModel private[clustering] ( } } + /** Java-friendly version of [[topTopicsPerDocument]] */ + def javaTopTopicsPerDocument( + k: Int): JavaRDD[(java.lang.Long, Array[Int], Array[java.lang.Double])] = { +val topics = topTopicsPerDocument(k) +topics.asInstanceOf[RDD[(java.lang.Long, Array[Int], Array[java.lang.Double])]].toJavaRDD() + } + // TODO: // override def topicDistributions(documents: RDD[(Long, Vector)]): RDD[(Long, Vector)] = ??? http://git-wip-us.apache.org/repos/asf/spark/blob/875ecc7f/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala index f845029..24fe48c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala +
spark git commit: [SPARK-9661] [MLLIB] [ML] Java compatibility
Repository: spark Updated Branches: refs/heads/master 8815ba2f6 -> 864de8eaf [SPARK-9661] [MLLIB] [ML] Java compatibility I skimmed through the docs for various instances of Object and replaced them with Java compatible versions of the same. 1. Some methods in LDAModel. 2. runMiniBatchSGD 3. kolmogorovSmirnovTest Author: MechCoder Closes #8126 from MechCoder/java_incop. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/864de8ea Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/864de8ea Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/864de8ea Branch: refs/heads/master Commit: 864de8eaf4b6ad5c9099f6f29e251c56b029f631 Parents: 8815ba2 Author: MechCoder Authored: Thu Aug 13 13:42:35 2015 -0700 Committer: Joseph K. Bradley Committed: Thu Aug 13 13:42:35 2015 -0700 -- .../spark/mllib/clustering/LDAModel.scala | 27 ++-- .../apache/spark/mllib/stat/Statistics.scala| 16 +++- .../spark/mllib/clustering/JavaLDASuite.java| 24 + .../spark/mllib/stat/JavaStatisticsSuite.java | 22 .../spark/mllib/clustering/LDASuite.scala | 13 ++ 5 files changed, 99 insertions(+), 3 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/864de8ea/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index 5dc637e..f31949f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -26,7 +26,7 @@ import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkContext import org.apache.spark.annotation.Experimental -import org.apache.spark.api.java.JavaPairRDD +import org.apache.spark.api.java.{JavaPairRDD, JavaRDD} import org.apache.spark.graphx.{Edge, EdgeContext, Graph, VertexId} import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors} import org.apache.spark.mllib.util.{Loader, Saveable} @@ -228,6 +228,11 @@ class LocalLDAModel private[clustering] ( docConcentration, topicConcentration, topicsMatrix.toBreeze.toDenseMatrix, gammaShape, k, vocabSize) + /** Java-friendly version of [[logLikelihood]] */ + def logLikelihood(documents: JavaPairRDD[java.lang.Long, Vector]): Double = { +logLikelihood(documents.rdd.asInstanceOf[RDD[(Long, Vector)]]) + } + /** * Calculate an upper bound bound on perplexity. (Lower is better.) * See Equation (16) in original Online LDA paper.
@@ -242,6 +247,11 @@ class LocalLDAModel private[clustering] ( -logLikelihood(documents) / corpusTokenCount } + /** Java-friendly version of [[logPerplexity]] */ + def logPerplexity(documents: JavaPairRDD[java.lang.Long, Vector]): Double = { +logPerplexity(documents.rdd.asInstanceOf[RDD[(Long, Vector)]]) + } + /** * Estimate the variational likelihood bound of from `documents`: *log p(documents) >= E_q[log p(documents)] - E_q[log q(documents)] @@ -341,8 +351,14 @@ class LocalLDAModel private[clustering] ( } } -} + /** Java-friendly version of [[topicDistributions]] */ + def topicDistributions( + documents: JavaPairRDD[java.lang.Long, Vector]): JavaPairRDD[java.lang.Long, Vector] = { +val distributions = topicDistributions(documents.rdd.asInstanceOf[RDD[(Long, Vector)]]) +JavaPairRDD.fromRDD(distributions.asInstanceOf[RDD[(java.lang.Long, Vector)]]) + } +} @Experimental object LocalLDAModel extends Loader[LocalLDAModel] { @@ -657,6 +673,13 @@ class DistributedLDAModel private[clustering] ( } } + /** Java-friendly version of [[topTopicsPerDocument]] */ + def javaTopTopicsPerDocument( + k: Int): JavaRDD[(java.lang.Long, Array[Int], Array[java.lang.Double])] = { +val topics = topTopicsPerDocument(k) +topics.asInstanceOf[RDD[(java.lang.Long, Array[Int], Array[java.lang.Double])]].toJavaRDD() + } + // TODO: // override def topicDistributions(documents: RDD[(Long, Vector)]): RDD[(Long, Vector)] = ??? http://git-wip-us.apache.org/repos/asf/spark/blob/864de8ea/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala index f845029..24fe48c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala @@ -20,7 +20,7 @@ package org.apache.spar
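For orientation, here is a minimal Scala sketch of the RDD-based LocalLDAModel methods that these new JavaPairRDD overloads forward to. The toy corpus and the existing SparkContext `sc` are assumptions for illustration only, not part of the patch.

{% highlight scala %}
import org.apache.spark.mllib.clustering.{LDA, LocalLDAModel}
import org.apache.spark.mllib.linalg.Vectors

// Toy corpus of (document id, term-count vector) pairs; `sc` is an existing SparkContext.
val corpus = sc.parallelize(Seq(
  (0L, Vectors.dense(1.0, 2.0, 0.0)),
  (1L, Vectors.dense(0.0, 1.0, 3.0)),
  (2L, Vectors.dense(2.0, 0.0, 1.0))))

// The online optimizer produces a LocalLDAModel, which exposes the wrapped methods.
val model = new LDA()
  .setK(2)
  .setOptimizer("online")
  .setMaxIterations(10)
  .run(corpus)
  .asInstanceOf[LocalLDAModel]

// These RDD-based calls are what the new JavaPairRDD overloads delegate to.
println(model.logLikelihood(corpus))   // variational lower bound on log p(corpus)
println(model.logPerplexity(corpus))   // upper bound on perplexity (lower is better)
model.topicDistributions(corpus).take(1).foreach(println)  // per-document topic mixtures
{% endhighlight %}

Java callers now get the same entry points on JavaPairRDD without converting to Scala RDDs by hand.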
spark git commit: [SPARK-9661] [MLLIB] minor clean-up of SPARK-9661
Repository: spark Updated Branches: refs/heads/branch-1.5 a0d52eb30 -> 4aa9238b9 [SPARK-9661] [MLLIB] minor clean-up of SPARK-9661 Some minor clean-ups after SPARK-9661. See my inline comments. MechCoder jkbradley Author: Xiangrui Meng Closes #8190 from mengxr/SPARK-9661-fix. (cherry picked from commit a0e1abbd010b9e73d472ce12ff1d987678005d32) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4aa9238b Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4aa9238b Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4aa9238b Branch: refs/heads/branch-1.5 Commit: 4aa9238b9b9c85e882c867ab4322ce084743e66f Parents: a0d52eb Author: Xiangrui Meng Authored: Fri Aug 14 10:25:11 2015 -0700 Committer: Joseph K. Bradley Committed: Fri Aug 14 10:25:19 2015 -0700 -- .../spark/mllib/clustering/LDAModel.scala | 5 +-- .../apache/spark/mllib/stat/Statistics.scala| 6 +-- .../spark/mllib/clustering/JavaLDASuite.java| 40 +++- .../spark/mllib/clustering/LDASuite.scala | 2 +- 4 files changed, 28 insertions(+), 25 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4aa9238b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index f31949f..82f05e4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -674,10 +674,9 @@ class DistributedLDAModel private[clustering] ( } /** Java-friendly version of [[topTopicsPerDocument]] */ - def javaTopTopicsPerDocument( - k: Int): JavaRDD[(java.lang.Long, Array[Int], Array[java.lang.Double])] = { + def javaTopTopicsPerDocument(k: Int): JavaRDD[(java.lang.Long, Array[Int], Array[Double])] = { val topics = topTopicsPerDocument(k) -topics.asInstanceOf[RDD[(java.lang.Long, Array[Int], Array[java.lang.Double])]].toJavaRDD() +topics.asInstanceOf[RDD[(java.lang.Long, Array[Int], Array[Double])]].toJavaRDD() } // TODO: http://git-wip-us.apache.org/repos/asf/spark/blob/4aa9238b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala index 24fe48c..ef8d786 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala @@ -221,9 +221,7 @@ object Statistics { def kolmogorovSmirnovTest( data: JavaDoubleRDD, distName: String, - params: java.lang.Double*): KolmogorovSmirnovTestResult = { -val javaParams = params.asInstanceOf[Seq[Double]] -KolmogorovSmirnovTest.testOneSample(data.rdd.asInstanceOf[RDD[Double]], - distName, javaParams: _*) + params: Double*): KolmogorovSmirnovTestResult = { +kolmogorovSmirnovTest(data.rdd.asInstanceOf[RDD[Double]], distName, params: _*) } } http://git-wip-us.apache.org/repos/asf/spark/blob/4aa9238b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java -- diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java index 427be94..6e91cde 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java @@ -22,12 
+22,14 @@ import java.util.ArrayList; import java.util.Arrays; import scala.Tuple2; +import scala.Tuple3; import org.junit.After; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertArrayEquals; import org.junit.Before; import org.junit.Test; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.JavaPairRDD; @@ -44,9 +46,9 @@ public class JavaLDASuite implements Serializable { public void setUp() { sc = new JavaSparkContext("local", "JavaLDA"); ArrayList> tinyCorpus = new ArrayList>(); -for (int i = 0; i < LDASuite$.MODULE$.tinyCorpus().length; i++) { - tinyCorpus.add(new Tuple2((Long)LDASuite$.MODULE$.tinyCorpus()[i]._1(), - LDASuite$.MODULE$.tinyCorpus()[i
spark git commit: [SPARK-9661] [MLLIB] minor clean-up of SPARK-9661
Repository: spark Updated Branches: refs/heads/master c8677d736 -> a0e1abbd0 [SPARK-9661] [MLLIB] minor clean-up of SPARK-9661 Some minor clean-ups after SPARK-9661. See my inline comments. MechCoder jkbradley Author: Xiangrui Meng Closes #8190 from mengxr/SPARK-9661-fix. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a0e1abbd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a0e1abbd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a0e1abbd Branch: refs/heads/master Commit: a0e1abbd010b9e73d472ce12ff1d987678005d32 Parents: c8677d7 Author: Xiangrui Meng Authored: Fri Aug 14 10:25:11 2015 -0700 Committer: Joseph K. Bradley Committed: Fri Aug 14 10:25:11 2015 -0700 -- .../spark/mllib/clustering/LDAModel.scala | 5 +-- .../apache/spark/mllib/stat/Statistics.scala| 6 +-- .../spark/mllib/clustering/JavaLDASuite.java| 40 +++- .../spark/mllib/clustering/LDASuite.scala | 2 +- 4 files changed, 28 insertions(+), 25 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a0e1abbd/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala index f31949f..82f05e4 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAModel.scala @@ -674,10 +674,9 @@ class DistributedLDAModel private[clustering] ( } /** Java-friendly version of [[topTopicsPerDocument]] */ - def javaTopTopicsPerDocument( - k: Int): JavaRDD[(java.lang.Long, Array[Int], Array[java.lang.Double])] = { + def javaTopTopicsPerDocument(k: Int): JavaRDD[(java.lang.Long, Array[Int], Array[Double])] = { val topics = topTopicsPerDocument(k) -topics.asInstanceOf[RDD[(java.lang.Long, Array[Int], Array[java.lang.Double])]].toJavaRDD() +topics.asInstanceOf[RDD[(java.lang.Long, Array[Int], Array[Double])]].toJavaRDD() } // TODO: http://git-wip-us.apache.org/repos/asf/spark/blob/a0e1abbd/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala index 24fe48c..ef8d786 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/Statistics.scala @@ -221,9 +221,7 @@ object Statistics { def kolmogorovSmirnovTest( data: JavaDoubleRDD, distName: String, - params: java.lang.Double*): KolmogorovSmirnovTestResult = { -val javaParams = params.asInstanceOf[Seq[Double]] -KolmogorovSmirnovTest.testOneSample(data.rdd.asInstanceOf[RDD[Double]], - distName, javaParams: _*) + params: Double*): KolmogorovSmirnovTestResult = { +kolmogorovSmirnovTest(data.rdd.asInstanceOf[RDD[Double]], distName, params: _*) } } http://git-wip-us.apache.org/repos/asf/spark/blob/a0e1abbd/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java -- diff --git a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java index 427be94..6e91cde 100644 --- a/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java +++ b/mllib/src/test/java/org/apache/spark/mllib/clustering/JavaLDASuite.java @@ -22,12 +22,14 @@ import java.util.ArrayList; import java.util.Arrays; import scala.Tuple2; +import scala.Tuple3; import 
org.junit.After; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertArrayEquals; import org.junit.Before; import org.junit.Test; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; import org.apache.spark.api.java.function.Function; import org.apache.spark.api.java.JavaPairRDD; @@ -44,9 +46,9 @@ public class JavaLDASuite implements Serializable { public void setUp() { sc = new JavaSparkContext("local", "JavaLDA"); ArrayList> tinyCorpus = new ArrayList>(); -for (int i = 0; i < LDASuite$.MODULE$.tinyCorpus().length; i++) { - tinyCorpus.add(new Tuple2((Long)LDASuite$.MODULE$.tinyCorpus()[i]._1(), - LDASuite$.MODULE$.tinyCorpus()[i]._2())); +for (int i = 0; i < LDASuite.tinyCorpus().length; i++) { + tinyCorpus.add(new Tuple
spark git commit: [SPARK-9956] [ML] Make trees work with one-category features
Repository: spark Updated Branches: refs/heads/master a0e1abbd0 -> 7ecf0c469 [SPARK-9956] [ML] Make trees work with one-category features This modifies DecisionTreeMetadata construction to treat 1-category features as continuous, so that trees do not fail with such features. It is important for the pipelines API, where VectorIndexer can automatically categorize certain features as categorical. As stated in the JIRA, this is a temp fix which we can improve upon later by automatically filtering out those features. That will take longer, though, since it will require careful indexing. Targeted for 1.5 and master CC: manishamde mengxr yanboliang Author: Joseph K. Bradley Closes #8187 from jkbradley/tree-1cat. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7ecf0c46 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7ecf0c46 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7ecf0c46 Branch: refs/heads/master Commit: 7ecf0c46990c39df8aeddbd64ca33d01824bcc0a Parents: a0e1abb Author: Joseph K. Bradley Authored: Fri Aug 14 10:48:02 2015 -0700 Committer: Joseph K. Bradley Committed: Fri Aug 14 10:48:02 2015 -0700 -- .../mllib/tree/impl/DecisionTreeMetadata.scala | 27 .../DecisionTreeClassifierSuite.scala | 13 ++ 2 files changed, 30 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/7ecf0c46/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala index 9fe2646..21ee49c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala @@ -144,21 +144,28 @@ private[spark] object DecisionTreeMetadata extends Logging { val maxCategoriesForUnorderedFeature = ((math.log(maxPossibleBins / 2 + 1) / math.log(2.0)) + 1).floor.toInt strategy.categoricalFeaturesInfo.foreach { case (featureIndex, numCategories) => -// Decide if some categorical features should be treated as unordered features, -// which require 2 * ((1 << numCategories - 1) - 1) bins. -// We do this check with log values to prevent overflows in case numCategories is large. -// The next check is equivalent to: 2 * ((1 << numCategories - 1) - 1) <= maxBins -if (numCategories <= maxCategoriesForUnorderedFeature) { - unorderedFeatures.add(featureIndex) - numBins(featureIndex) = numUnorderedBins(numCategories) -} else { - numBins(featureIndex) = numCategories +// Hack: If a categorical feature has only 1 category, we treat it as continuous. +// TODO(SPARK-9957): Handle this properly by filtering out those features. +if (numCategories > 1) { + // Decide if some categorical features should be treated as unordered features, + // which require 2 * ((1 << numCategories - 1) - 1) bins. + // We do this check with log values to prevent overflows in case numCategories is large. 
+ // The next check is equivalent to: 2 * ((1 << numCategories - 1) - 1) <= maxBins + if (numCategories <= maxCategoriesForUnorderedFeature) { +unorderedFeatures.add(featureIndex) +numBins(featureIndex) = numUnorderedBins(numCategories) + } else { +numBins(featureIndex) = numCategories + } } } } else { // Binary classification or regression strategy.categoricalFeaturesInfo.foreach { case (featureIndex, numCategories) => -numBins(featureIndex) = numCategories +// If a categorical feature has only 1 category, we treat it as continuous: SPARK-9957 +if (numCategories > 1) { + numBins(featureIndex) = numCategories +} } } http://git-wip-us.apache.org/repos/asf/spark/blob/7ecf0c46/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala index 4b7c5d3..f680d8d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionT
spark git commit: [SPARK-9956] [ML] Make trees work with one-category features
Repository: spark Updated Branches: refs/heads/branch-1.5 4aa9238b9 -> f5298da16 [SPARK-9956] [ML] Make trees work with one-category features This modifies DecisionTreeMetadata construction to treat 1-category features as continuous, so that trees do not fail with such features. It is important for the pipelines API, where VectorIndexer can automatically categorize certain features as categorical. As stated in the JIRA, this is a temp fix which we can improve upon later by automatically filtering out those features. That will take longer, though, since it will require careful indexing. Targeted for 1.5 and master CC: manishamde mengxr yanboliang Author: Joseph K. Bradley Closes #8187 from jkbradley/tree-1cat. (cherry picked from commit 7ecf0c46990c39df8aeddbd64ca33d01824bcc0a) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f5298da1 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f5298da1 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f5298da1 Branch: refs/heads/branch-1.5 Commit: f5298da16671496946a9f9ef614e5f4b9284b1d2 Parents: 4aa9238 Author: Joseph K. Bradley Authored: Fri Aug 14 10:48:02 2015 -0700 Committer: Joseph K. Bradley Committed: Fri Aug 14 10:48:19 2015 -0700 -- .../mllib/tree/impl/DecisionTreeMetadata.scala | 27 .../DecisionTreeClassifierSuite.scala | 13 ++ 2 files changed, 30 insertions(+), 10 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f5298da1/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala index 9fe2646..21ee49c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/tree/impl/DecisionTreeMetadata.scala @@ -144,21 +144,28 @@ private[spark] object DecisionTreeMetadata extends Logging { val maxCategoriesForUnorderedFeature = ((math.log(maxPossibleBins / 2 + 1) / math.log(2.0)) + 1).floor.toInt strategy.categoricalFeaturesInfo.foreach { case (featureIndex, numCategories) => -// Decide if some categorical features should be treated as unordered features, -// which require 2 * ((1 << numCategories - 1) - 1) bins. -// We do this check with log values to prevent overflows in case numCategories is large. -// The next check is equivalent to: 2 * ((1 << numCategories - 1) - 1) <= maxBins -if (numCategories <= maxCategoriesForUnorderedFeature) { - unorderedFeatures.add(featureIndex) - numBins(featureIndex) = numUnorderedBins(numCategories) -} else { - numBins(featureIndex) = numCategories +// Hack: If a categorical feature has only 1 category, we treat it as continuous. +// TODO(SPARK-9957): Handle this properly by filtering out those features. +if (numCategories > 1) { + // Decide if some categorical features should be treated as unordered features, + // which require 2 * ((1 << numCategories - 1) - 1) bins. + // We do this check with log values to prevent overflows in case numCategories is large. 
+ // The next check is equivalent to: 2 * ((1 << numCategories - 1) - 1) <= maxBins + if (numCategories <= maxCategoriesForUnorderedFeature) { +unorderedFeatures.add(featureIndex) +numBins(featureIndex) = numUnorderedBins(numCategories) + } else { +numBins(featureIndex) = numCategories + } } } } else { // Binary classification or regression strategy.categoricalFeaturesInfo.foreach { case (featureIndex, numCategories) => -numBins(featureIndex) = numCategories +// If a categorical feature has only 1 category, we treat it as continuous: SPARK-9957 +if (numCategories > 1) { + numBins(featureIndex) = numCategories +} } } http://git-wip-us.apache.org/repos/asf/spark/blob/f5298da1/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/DecisionTreeClassifierSuite.scala index 4b7c5d3..f680d8d 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/classification/Deci
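To make the behavior change concrete, a hypothetical sketch using the lower-level MLlib tree API, which goes through the same DecisionTreeMetadata path (the toy data and the SparkContext `sc` are assumptions): declaring a feature with a single category in categoricalFeaturesInfo no longer breaks metadata construction; the feature is simply treated as continuous.

{% highlight scala %}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree

// Feature 0 always takes the single category 0.0; feature 1 is continuous.
val data = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(0.0, 1.0)),
  LabeledPoint(0.0, Vectors.dense(0.0, 1.2)),
  LabeledPoint(1.0, Vectors.dense(0.0, 4.7)),
  LabeledPoint(1.0, Vectors.dense(0.0, 5.1))))

// Declaring feature 0 as categorical with one category is now tolerated:
// DecisionTreeMetadata treats it as continuous instead of failing.
val model = DecisionTree.trainClassifier(
  data,
  numClasses = 2,
  categoricalFeaturesInfo = Map(0 -> 1),
  impurity = "gini",
  maxDepth = 2,
  maxBins = 4)

println(model.toDebugString)
{% endhighlight %}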
spark git commit: [SPARK-8744] [ML] Add a public constructor to StringIndexer
Repository: spark Updated Branches: refs/heads/master 7ecf0c469 -> a7317ccdc [SPARK-8744] [ML] Add a public constructor to StringIndexer It would be helpful to allow users to pass a pre-computed index to create an indexer, rather than always going through StringIndexer to create the model. Author: Holden Karau Closes #7267 from holdenk/SPARK-8744-StringIndexerModel-should-have-public-constructor. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a7317ccd Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a7317ccd Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a7317ccd Branch: refs/heads/master Commit: a7317ccdc20d001e5b7f5277b0535923468bfbc6 Parents: 7ecf0c4 Author: Holden Karau Authored: Fri Aug 14 11:22:10 2015 -0700 Committer: Joseph K. Bradley Committed: Fri Aug 14 11:22:10 2015 -0700 -- .../main/scala/org/apache/spark/ml/feature/StringIndexer.scala | 4 +++- .../scala/org/apache/spark/ml/feature/StringIndexerSuite.scala | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a7317ccd/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index 9f6e7b6..6347578 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -102,10 +102,12 @@ class StringIndexer(override val uid: String) extends Estimator[StringIndexerMod * This is a temporary fix for the case when target labels do not exist during prediction. */ @Experimental -class StringIndexerModel private[ml] ( +class StringIndexerModel ( override val uid: String, labels: Array[String]) extends Model[StringIndexerModel] with StringIndexerBase { + def this(labels: Array[String]) = this(Identifiable.randomUID("strIdx"), labels) + private val labelToIndex: OpenHashMap[String, Double] = { val n = labels.length val map = new OpenHashMap[String, Double](n) http://git-wip-us.apache.org/repos/asf/spark/blob/a7317ccd/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala index fa918ce..0b4c8ba 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala @@ -30,7 +30,9 @@ class StringIndexerSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") { ParamsSuite.checkParams(new StringIndexer) val model = new StringIndexerModel("indexer", Array("a", "b")) +val modelWithoutUid = new StringIndexerModel(Array("a", "b")) ParamsSuite.checkParams(model) +ParamsSuite.checkParams(modelWithoutUid) } test("StringIndexer") {
spark git commit: [SPARK-8744] [ML] Add a public constructor to StringIndexer
Repository: spark Updated Branches: refs/heads/branch-1.5 f5298da16 -> e4ea2390a [SPARK-8744] [ML] Add a public constructor to StringIndexer It would be helpful to allow users to pass a pre-computed index to create an indexer, rather than always going through StringIndexer to create the model. Author: Holden Karau Closes #7267 from holdenk/SPARK-8744-StringIndexerModel-should-have-public-constructor. (cherry picked from commit a7317ccdc20d001e5b7f5277b0535923468bfbc6) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e4ea2390 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e4ea2390 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e4ea2390 Branch: refs/heads/branch-1.5 Commit: e4ea2390a5f64747dbc60febc4f3c29e1970e46d Parents: f5298da Author: Holden Karau Authored: Fri Aug 14 11:22:10 2015 -0700 Committer: Joseph K. Bradley Committed: Fri Aug 14 11:22:19 2015 -0700 -- .../main/scala/org/apache/spark/ml/feature/StringIndexer.scala | 4 +++- .../scala/org/apache/spark/ml/feature/StringIndexerSuite.scala | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e4ea2390/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala index b87e154..f5dfba1 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StringIndexer.scala @@ -98,10 +98,12 @@ class StringIndexer(override val uid: String) extends Estimator[StringIndexerMod * This is a temporary fix for the case when target labels do not exist during prediction. */ @Experimental -class StringIndexerModel private[ml] ( +class StringIndexerModel ( override val uid: String, labels: Array[String]) extends Model[StringIndexerModel] with StringIndexerBase { + def this(labels: Array[String]) = this(Identifiable.randomUID("strIdx"), labels) + private val labelToIndex: OpenHashMap[String, Double] = { val n = labels.length val map = new OpenHashMap[String, Double](n) http://git-wip-us.apache.org/repos/asf/spark/blob/e4ea2390/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala index 4a12e0b..d960861 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StringIndexerSuite.scala @@ -30,7 +30,9 @@ class StringIndexerSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") { ParamsSuite.checkParams(new StringIndexer) val model = new StringIndexerModel("indexer", Array("a", "b")) +val modelWithoutUid = new StringIndexerModel(Array("a", "b")) ParamsSuite.checkParams(model) +ParamsSuite.checkParams(modelWithoutUid) } test("StringIndexer") {
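A small usage sketch of what the public constructor enables. This is illustrative only: the DataFrame `df` with a string column named "category" is assumed, and the input/output column setters are the usual ones inherited from the indexer params.

{% highlight scala %}
import org.apache.spark.ml.feature.StringIndexerModel

// Build the model directly from a precomputed label list,
// instead of fitting a StringIndexer on a dataset first.
val indexerModel = new StringIndexerModel(Array("a", "b", "c"))
  .setInputCol("category")
  .setOutputCol("categoryIndex")

// `df` stands for an existing DataFrame with a string column named "category".
indexerModel.transform(df).show()
{% endhighlight %}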
spark git commit: [SPARK-9805] [MLLIB] [PYTHON] [STREAMING] Added _eventually for ml streaming pyspark tests
Repository: spark Updated Branches: refs/heads/branch-1.5 2fda1d842 -> 881baf100 [SPARK-9805] [MLLIB] [PYTHON] [STREAMING] Added _eventually for ml streaming pyspark tests Recently, PySpark ML streaming tests have been flaky, most likely because of the batches not being processed in time. Proposal: Replace the use of _ssc_wait (which waits for a fixed amount of time) with a method which waits for a fixed amount of time but can terminate early based on a termination condition method. With this, we can extend the waiting period (to make tests less flaky) but also stop early when possible (making tests faster on average, which I verified locally). CC: mengxr tdas freeman-lab Author: Joseph K. Bradley Closes #8087 from jkbradley/streaming-ml-tests. (cherry picked from commit 1db7179fae672fcec7b8de12c374dd384ce51c67) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/881baf10 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/881baf10 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/881baf10 Branch: refs/heads/branch-1.5 Commit: 881baf100fa9d8135b16cd390c344e3a5995805e Parents: 2fda1d8 Author: Joseph K. Bradley Authored: Sat Aug 15 18:48:20 2015 -0700 Committer: Joseph K. Bradley Committed: Sat Aug 15 18:48:29 2015 -0700 -- python/pyspark/mllib/tests.py | 177 +++-- 1 file changed, 129 insertions(+), 48 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/881baf10/python/pyspark/mllib/tests.py -- diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 3f5a02a..5097c5e 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -32,6 +32,9 @@ from numpy import sum as array_sum from py4j.protocol import Py4JJavaError +if sys.version > '3': +basestring = str + if sys.version_info[:2] <= (2, 6): try: import unittest2 as unittest @@ -86,9 +89,42 @@ class MLLibStreamingTestCase(unittest.TestCase): self.ssc.stop(False) @staticmethod -def _ssc_wait(start_time, end_time, sleep_time): -while time() - start_time < end_time: +def _eventually(condition, timeout=30.0, catch_assertions=False): +""" +Wait a given amount of time for a condition to pass, else fail with an error. +This is a helper utility for streaming ML tests. +:param condition: Function that checks for termination conditions. + condition() can return: + - True: Conditions met. Return without error. + - other value: Conditions not met yet. Continue. Upon timeout, + include last such value in error message. + Note that this method may be called at any time during + streaming execution (e.g., even before any results + have been created). +:param timeout: Number of seconds to wait. Default 30 seconds. +:param catch_assertions: If False (default), do not catch AssertionErrors. + If True, catch AssertionErrors; continue, but save + error to throw upon timeout. 
+""" +start_time = time() +lastValue = None +while time() - start_time < timeout: +if catch_assertions: +try: +lastValue = condition() +except AssertionError as e: +lastValue = e +else: +lastValue = condition() +if lastValue is True: +return sleep(0.01) +if isinstance(lastValue, AssertionError): +raise lastValue +else: +raise AssertionError( +"Test failed due to timeout after %g sec, with last condition returning: %s" +% (timeout, lastValue)) def _squared_distance(a, b): @@ -999,10 +1035,13 @@ class StreamingKMeansTest(MLLibStreamingTestCase): [self.sc.parallelize(batch, 1) for batch in batches]) stkm.trainOn(input_stream) -t = time() self.ssc.start() -self._ssc_wait(t, 10.0, 0.01) -self.assertEquals(stkm.latestModel().clusterWeights, [25.0]) + +def condition(): +self.assertEquals(stkm.latestModel().clusterWeights, [25.0]) +return True +self._eventually(condition, catch_assertions=True) + realCenters = array_sum(array(centers), axis=0) for i in range(5): modelCente
spark git commit: [SPARK-9805] [MLLIB] [PYTHON] [STREAMING] Added _eventually for ml streaming pyspark tests
Repository: spark Updated Branches: refs/heads/master 570567258 -> 1db7179fa [SPARK-9805] [MLLIB] [PYTHON] [STREAMING] Added _eventually for ml streaming pyspark tests Recently, PySpark ML streaming tests have been flaky, most likely because of the batches not being processed in time. Proposal: Replace the use of _ssc_wait (which waits for a fixed amount of time) with a method which waits for a fixed amount of time but can terminate early based on a termination condition method. With this, we can extend the waiting period (to make tests less flaky) but also stop early when possible (making tests faster on average, which I verified locally). CC: mengxr tdas freeman-lab Author: Joseph K. Bradley Closes #8087 from jkbradley/streaming-ml-tests. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1db7179f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1db7179f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1db7179f Branch: refs/heads/master Commit: 1db7179fae672fcec7b8de12c374dd384ce51c67 Parents: 5705672 Author: Joseph K. Bradley Authored: Sat Aug 15 18:48:20 2015 -0700 Committer: Joseph K. Bradley Committed: Sat Aug 15 18:48:20 2015 -0700 -- python/pyspark/mllib/tests.py | 177 +++-- 1 file changed, 129 insertions(+), 48 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/1db7179f/python/pyspark/mllib/tests.py -- diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index 3f5a02a..5097c5e 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -32,6 +32,9 @@ from numpy import sum as array_sum from py4j.protocol import Py4JJavaError +if sys.version > '3': +basestring = str + if sys.version_info[:2] <= (2, 6): try: import unittest2 as unittest @@ -86,9 +89,42 @@ class MLLibStreamingTestCase(unittest.TestCase): self.ssc.stop(False) @staticmethod -def _ssc_wait(start_time, end_time, sleep_time): -while time() - start_time < end_time: +def _eventually(condition, timeout=30.0, catch_assertions=False): +""" +Wait a given amount of time for a condition to pass, else fail with an error. +This is a helper utility for streaming ML tests. +:param condition: Function that checks for termination conditions. + condition() can return: + - True: Conditions met. Return without error. + - other value: Conditions not met yet. Continue. Upon timeout, + include last such value in error message. + Note that this method may be called at any time during + streaming execution (e.g., even before any results + have been created). +:param timeout: Number of seconds to wait. Default 30 seconds. +:param catch_assertions: If False (default), do not catch AssertionErrors. + If True, catch AssertionErrors; continue, but save + error to throw upon timeout. 
+""" +start_time = time() +lastValue = None +while time() - start_time < timeout: +if catch_assertions: +try: +lastValue = condition() +except AssertionError as e: +lastValue = e +else: +lastValue = condition() +if lastValue is True: +return sleep(0.01) +if isinstance(lastValue, AssertionError): +raise lastValue +else: +raise AssertionError( +"Test failed due to timeout after %g sec, with last condition returning: %s" +% (timeout, lastValue)) def _squared_distance(a, b): @@ -999,10 +1035,13 @@ class StreamingKMeansTest(MLLibStreamingTestCase): [self.sc.parallelize(batch, 1) for batch in batches]) stkm.trainOn(input_stream) -t = time() self.ssc.start() -self._ssc_wait(t, 10.0, 0.01) -self.assertEquals(stkm.latestModel().clusterWeights, [25.0]) + +def condition(): +self.assertEquals(stkm.latestModel().clusterWeights, [25.0]) +return True +self._eventually(condition, catch_assertions=True) + realCenters = array_sum(array(centers), axis=0) for i in range(5): modelCenters = stkm.latestModel().centers[0][i] @@ -1027,7 +1066,7 @@ class StreamingKMeansTest(MLLibStreamingTestCase):
spark git commit: [SPARK-9768] [PYSPARK] [ML] Add Python API and user guide for ml.feature.ElementwiseProduct
Repository: spark Updated Branches: refs/heads/master 52ae95257 -> 0076e8212 [SPARK-9768] [PYSPARK] [ML] Add Python API and user guide for ml.feature.ElementwiseProduct Add Python API, user guide and example for ml.feature.ElementwiseProduct. Author: Yanbo Liang Closes #8061 from yanboliang/SPARK-9768. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/0076e821 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/0076e821 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/0076e821 Branch: refs/heads/master Commit: 0076e8212334c613599dcbc2ac23f49e9e50cc44 Parents: 52ae952 Author: Yanbo Liang Authored: Mon Aug 17 17:25:41 2015 -0700 Committer: Joseph K. Bradley Committed: Mon Aug 17 17:25:41 2015 -0700 -- docs/ml-features.md | 23 +++--- python/pyspark/ml/feature.py | 67 --- 2 files changed, 81 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/0076e821/docs/ml-features.md -- diff --git a/docs/ml-features.md b/docs/ml-features.md index cec2cbe..6b2e36b 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1212,7 +1212,7 @@ v_N This example below demonstrates how to transform vectors using a transforming vector value. - + {% highlight scala %} import org.apache.spark.ml.feature.ElementwiseProduct import org.apache.spark.mllib.linalg.Vectors @@ -1229,12 +1229,12 @@ val transformer = new ElementwiseProduct() .setOutputCol("transformedVector") // Batch transform the vectors to create new column: -val transformedData = transformer.transform(dataFrame) +transformer.transform(dataFrame).show() {% endhighlight %} - + {% highlight java %} import com.google.common.collect.Lists; @@ -1267,10 +1267,25 @@ ElementwiseProduct transformer = new ElementwiseProduct() .setInputCol("vector") .setOutputCol("transformedVector"); // Batch transform the vectors to create new column: -DataFrame transformedData = transformer.transform(dataFrame); +transformer.transform(dataFrame).show(); {% endhighlight %} + + +{% highlight python %} +from pyspark.ml.feature import ElementwiseProduct +from pyspark.mllib.linalg import Vectors + +data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)] +df = sqlContext.createDataFrame(data, ["vector"]) +transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]), + inputCol="vector", outputCol="transformedVector") +transformer.transform(df).show() + +{% endhighlight %} + + ## VectorAssembler http://git-wip-us.apache.org/repos/asf/spark/blob/0076e821/python/pyspark/ml/feature.py -- diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 535d553..04b2b2c 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -26,11 +26,11 @@ from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer from pyspark.mllib.common import inherit_doc from pyspark.mllib.linalg import _convert_to_vector -__all__ = ['Binarizer', 'Bucketizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', - 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', - 'StandardScalerModel', 'StringIndexer', 'StringIndexerModel', 'Tokenizer', - 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel', 'PCA', - 'PCAModel', 'RFormula', 'RFormulaModel'] +__all__ = ['Binarizer', 'Bucketizer', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel', + 'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', + 'StandardScaler', 'StandardScalerModel', 'StringIndexer', 
'StringIndexerModel', + 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel', + 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel'] @inherit_doc @@ -167,6 +167,63 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol): @inherit_doc +class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol): +""" +Outputs the Hadamard product (i.e., the element-wise product) of each input vector +with a provided "weight" vector. In other words, it scales each column of the dataset +by a scalar multiplier. + +>>> from pyspark.mllib.linalg import Vectors +>>> df = sqlContext.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),)], ["values"]) +>>> ep = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]), +... inputCol="values", outputCol="eprod") +>>> ep.transform(df).head().eprod +DenseVector([2.0, 2.0, 9.0]) +>>> ep.setParams(scalingVec
spark git commit: [SPARK-9768] [PYSPARK] [ML] Add Python API and user guide for ml.feature.ElementwiseProduct
Repository: spark Updated Branches: refs/heads/branch-1.5 407175e82 -> eaeebb92f [SPARK-9768] [PYSPARK] [ML] Add Python API and user guide for ml.feature.ElementwiseProduct Add Python API, user guide and example for ml.feature.ElementwiseProduct. Author: Yanbo Liang Closes #8061 from yanboliang/SPARK-9768. (cherry picked from commit 0076e8212334c613599dcbc2ac23f49e9e50cc44) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/eaeebb92 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/eaeebb92 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/eaeebb92 Branch: refs/heads/branch-1.5 Commit: eaeebb92f336d3862169c61e7dcc6afa2732084b Parents: 407175e Author: Yanbo Liang Authored: Mon Aug 17 17:25:41 2015 -0700 Committer: Joseph K. Bradley Committed: Mon Aug 17 17:25:50 2015 -0700 -- docs/ml-features.md | 23 +++--- python/pyspark/ml/feature.py | 67 --- 2 files changed, 81 insertions(+), 9 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/eaeebb92/docs/ml-features.md -- diff --git a/docs/ml-features.md b/docs/ml-features.md index cec2cbe..6b2e36b 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -1212,7 +1212,7 @@ v_N This example below demonstrates how to transform vectors using a transforming vector value. - + {% highlight scala %} import org.apache.spark.ml.feature.ElementwiseProduct import org.apache.spark.mllib.linalg.Vectors @@ -1229,12 +1229,12 @@ val transformer = new ElementwiseProduct() .setOutputCol("transformedVector") // Batch transform the vectors to create new column: -val transformedData = transformer.transform(dataFrame) +transformer.transform(dataFrame).show() {% endhighlight %} - + {% highlight java %} import com.google.common.collect.Lists; @@ -1267,10 +1267,25 @@ ElementwiseProduct transformer = new ElementwiseProduct() .setInputCol("vector") .setOutputCol("transformedVector"); // Batch transform the vectors to create new column: -DataFrame transformedData = transformer.transform(dataFrame); +transformer.transform(dataFrame).show(); {% endhighlight %} + + +{% highlight python %} +from pyspark.ml.feature import ElementwiseProduct +from pyspark.mllib.linalg import Vectors + +data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)] +df = sqlContext.createDataFrame(data, ["vector"]) +transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]), + inputCol="vector", outputCol="transformedVector") +transformer.transform(df).show() + +{% endhighlight %} + + ## VectorAssembler http://git-wip-us.apache.org/repos/asf/spark/blob/eaeebb92/python/pyspark/ml/feature.py -- diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 535d553..04b2b2c 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -26,11 +26,11 @@ from pyspark.ml.wrapper import JavaEstimator, JavaModel, JavaTransformer from pyspark.mllib.common import inherit_doc from pyspark.mllib.linalg import _convert_to_vector -__all__ = ['Binarizer', 'Bucketizer', 'HashingTF', 'IDF', 'IDFModel', 'NGram', 'Normalizer', - 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer', 'StandardScaler', - 'StandardScalerModel', 'StringIndexer', 'StringIndexerModel', 'Tokenizer', - 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel', 'PCA', - 'PCAModel', 'RFormula', 'RFormulaModel'] +__all__ = ['Binarizer', 'Bucketizer', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel', + 'NGram', 'Normalizer', 'OneHotEncoder', 
'PolynomialExpansion', 'RegexTokenizer', + 'StandardScaler', 'StandardScalerModel', 'StringIndexer', 'StringIndexerModel', + 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel', + 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel'] @inherit_doc @@ -167,6 +167,63 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol): @inherit_doc +class ElementwiseProduct(JavaTransformer, HasInputCol, HasOutputCol): +""" +Outputs the Hadamard product (i.e., the element-wise product) of each input vector +with a provided "weight" vector. In other words, it scales each column of the dataset +by a scalar multiplier. + +>>> from pyspark.mllib.linalg import Vectors +>>> df = sqlContext.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),)], ["values"]) +>>> ep = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]), +... inputCol="values", outputCol="e
spark git commit: [SPARK-9028] [ML] Add CountVectorizer as an estimator to generate CountVectorizerModel
Repository: spark Updated Branches: refs/heads/master 1968276af -> 354f4582b [SPARK-9028] [ML] Add CountVectorizer as an estimator to generate CountVectorizerModel jira: https://issues.apache.org/jira/browse/SPARK-9028 Add an estimator for CountVectorizerModel. The estimator will extract a vocabulary from document collections according to the term frequency. I changed the meaning of minCount as a filter across the corpus. This aligns with Word2Vec and the similar parameter in SKlearn. Author: Yuhao Yang Author: Joseph K. Bradley Closes #7388 from hhbyyh/cvEstimator. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/354f4582 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/354f4582 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/354f4582 Branch: refs/heads/master Commit: 354f4582b637fa25d3892ec2b12869db50ed83c9 Parents: 1968276 Author: Yuhao Yang Authored: Tue Aug 18 11:00:09 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 18 11:00:09 2015 -0700 -- .../spark/ml/feature/CountVectorizer.scala | 235 +++ .../spark/ml/feature/CountVectorizerModel.scala | 82 --- .../spark/ml/feature/CountVectorizerSuite.scala | 167 + .../spark/ml/feature/CountVectorizorSuite.scala | 73 -- 4 files changed, 402 insertions(+), 155 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/354f4582/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala new file mode 100644 index 000..49028e4 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.ml.feature + +import org.apache.spark.annotation.Experimental +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.param._ +import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} +import org.apache.spark.ml.util.{Identifiable, SchemaUtils} +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.mllib.linalg.{VectorUDT, Vectors} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +import org.apache.spark.sql.DataFrame +import org.apache.spark.util.collection.OpenHashMap + +/** + * Params for [[CountVectorizer]] and [[CountVectorizerModel]]. + */ +private[feature] trait CountVectorizerParams extends Params with HasInputCol with HasOutputCol { + + /** + * Max size of the vocabulary. + * CountVectorizer will build a vocabulary that only considers the top + * vocabSize terms ordered by term frequency across the corpus. 
+ * + * Default: 2^18^ + * @group param + */ + val vocabSize: IntParam = +new IntParam(this, "vocabSize", "max size of the vocabulary", ParamValidators.gt(0)) + + /** @group getParam */ + def getVocabSize: Int = $(vocabSize) + + /** + * Specifies the minimum number of different documents a term must appear in to be included + * in the vocabulary. + * If this is an integer >= 1, this specifies the number of documents the term must appear in; + * if this is a double in [0,1), then this specifies the fraction of documents. + * + * Default: 1 + * @group param + */ + val minDF: DoubleParam = new DoubleParam(this, "minDF", "Specifies the minimum number of" + +" different documents a term must appear in to be included in the vocabulary." + +" If this is an integer >= 1, this specifies the number of documents the term must" + +" appear in; if this is a double in [0,1), then this specifies the fraction of documents.", +ParamValidators.gtEq(0.0)) + + /** @group getParam */ + def getMinDF: Double = $(minDF) + + /** Validates and transforms the input schema. */ + protected def validateAndTransf
spark git commit: [SPARK-9028] [ML] Add CountVectorizer as an estimator to generate CountVectorizerModel
Repository: spark Updated Branches: refs/heads/branch-1.5 20a760a00 -> b86378cf2 [SPARK-9028] [ML] Add CountVectorizer as an estimator to generate CountVectorizerModel jira: https://issues.apache.org/jira/browse/SPARK-9028 Add an estimator for CountVectorizerModel. The estimator will extract a vocabulary from document collections according to the term frequency. I changed the meaning of minCount as a filter across the corpus. This aligns with Word2Vec and the similar parameter in SKlearn. Author: Yuhao Yang Author: Joseph K. Bradley Closes #7388 from hhbyyh/cvEstimator. (cherry picked from commit 354f4582b637fa25d3892ec2b12869db50ed83c9) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b86378cf Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b86378cf Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b86378cf Branch: refs/heads/branch-1.5 Commit: b86378cf29f8fdb70e41b2f04d831b8a15c1c859 Parents: 20a760a Author: Yuhao Yang Authored: Tue Aug 18 11:00:09 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 18 11:00:22 2015 -0700 -- .../spark/ml/feature/CountVectorizer.scala | 235 +++ .../spark/ml/feature/CountVectorizerModel.scala | 82 --- .../spark/ml/feature/CountVectorizerSuite.scala | 167 + .../spark/ml/feature/CountVectorizorSuite.scala | 73 -- 4 files changed, 402 insertions(+), 155 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b86378cf/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala new file mode 100644 index 000..49028e4 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala @@ -0,0 +1,235 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.ml.feature + +import org.apache.spark.annotation.Experimental +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.param._ +import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} +import org.apache.spark.ml.util.{Identifiable, SchemaUtils} +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.mllib.linalg.{VectorUDT, Vectors} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +import org.apache.spark.sql.DataFrame +import org.apache.spark.util.collection.OpenHashMap + +/** + * Params for [[CountVectorizer]] and [[CountVectorizerModel]]. + */ +private[feature] trait CountVectorizerParams extends Params with HasInputCol with HasOutputCol { + + /** + * Max size of the vocabulary. 
+ * CountVectorizer will build a vocabulary that only considers the top + * vocabSize terms ordered by term frequency across the corpus. + * + * Default: 2^18^ + * @group param + */ + val vocabSize: IntParam = +new IntParam(this, "vocabSize", "max size of the vocabulary", ParamValidators.gt(0)) + + /** @group getParam */ + def getVocabSize: Int = $(vocabSize) + + /** + * Specifies the minimum number of different documents a term must appear in to be included + * in the vocabulary. + * If this is an integer >= 1, this specifies the number of documents the term must appear in; + * if this is a double in [0,1), then this specifies the fraction of documents. + * + * Default: 1 + * @group param + */ + val minDF: DoubleParam = new DoubleParam(this, "minDF", "Specifies the minimum number of" + +" different documents a term must appear in to be included in the vocabulary." + +" If this is an integer >= 1, this specifies the number of documents the term must" + +" appear in; if this is a double in [0,1), then this specifies the fraction of documents.", +ParamValidators.gtEq(0.0)) + + /** @group getParam */ + def getMi
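A minimal usage sketch of the new estimator, assuming an existing SQLContext `sqlContext`; the data and column names are illustrative only.

{% highlight scala %}
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}

// `sqlContext` is an existing SQLContext; the column names are illustrative.
val df = sqlContext.createDataFrame(Seq(
  (0, Array("a", "b", "c")),
  (1, Array("a", "b", "b", "c", "a")))).toDF("id", "words")

// Fit a vocabulary from the corpus: keep at most 3 terms, and only terms that
// appear in at least 2 documents (minDF acts as a corpus-wide filter).
val cvModel: CountVectorizerModel = new CountVectorizer()
  .setInputCol("words")
  .setOutputCol("features")
  .setVocabSize(3)
  .setMinDF(2)
  .fit(df)

println(cvModel.vocabulary.mkString(", "))
cvModel.transform(df).select("features").show()
{% endhighlight %}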
spark git commit: [SPARK-10012] [ML] Missing test case for Params#arrayLengthGt
Repository: spark Updated Branches: refs/heads/branch-1.5 56f4da263 -> fb207b245 [SPARK-10012] [ML] Missing test case for Params#arrayLengthGt Currently there is no test case for `Params#arrayLengthGt`. Author: lewuathe Closes #8223 from Lewuathe/SPARK-10012. (cherry picked from commit c635a16f64c939182196b46725ef2d00ed107cca) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fb207b24 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fb207b24 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fb207b24 Branch: refs/heads/branch-1.5 Commit: fb207b245305b30b4fe47e08f98f2571a2d05249 Parents: 56f4da2 Author: lewuathe Authored: Tue Aug 18 15:30:23 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 18 15:30:34 2015 -0700 -- mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala | 3 +++ 1 file changed, 3 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fb207b24/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala index be95638..2c878f8 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala @@ -199,6 +199,9 @@ class ParamsSuite extends SparkFunSuite { val inArray = ParamValidators.inArray[Int](Array(1, 2)) assert(inArray(1) && inArray(2) && !inArray(0)) + +val arrayLengthGt = ParamValidators.arrayLengthGt[Int](2.0) +assert(arrayLengthGt(Array(0, 1, 2)) && !arrayLengthGt(Array(0, 1))) } test("Params.copyValues") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
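The validator exercised by the new test is normally wired in as a param's `isValid` function. A hypothetical sketch (the trait and param names are made up for illustration):

{% highlight scala %}
import org.apache.spark.ml.param.{IntArrayParam, Params, ParamValidators}

// Hypothetical Params trait: an array-valued param that must contain more
// than two elements, enforced by ParamValidators.arrayLengthGt.
trait HasLayerSizes extends Params {
  final val layerSizes: IntArrayParam = new IntArrayParam(this, "layerSizes",
    "sizes of the layers, including input and output; length must be > 2",
    ParamValidators.arrayLengthGt(2.0))

  def getLayerSizes: Array[Int] = $(layerSizes)
}
{% endhighlight %}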
spark git commit: [SPARK-10012] [ML] Missing test case for Params#arrayLengthGt
Repository: spark Updated Branches: refs/heads/master 1dbffba37 -> c635a16f6 [SPARK-10012] [ML] Missing test case for Params#arrayLengthGt Currently there is no test case for `Params#arrayLengthGt`. Author: lewuathe Closes #8223 from Lewuathe/SPARK-10012. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c635a16f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c635a16f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c635a16f Branch: refs/heads/master Commit: c635a16f64c939182196b46725ef2d00ed107cca Parents: 1dbffba Author: lewuathe Authored: Tue Aug 18 15:30:23 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 18 15:30:23 2015 -0700 -- mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala | 3 +++ 1 file changed, 3 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c635a16f/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala -- diff --git a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala index be95638..2c878f8 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/param/ParamsSuite.scala @@ -199,6 +199,9 @@ class ParamsSuite extends SparkFunSuite { val inArray = ParamValidators.inArray[Int](Array(1, 2)) assert(inArray(1) && inArray(2) && !inArray(0)) + +val arrayLengthGt = ParamValidators.arrayLengthGt[Int](2.0) +assert(arrayLengthGt(Array(0, 1, 2)) && !arrayLengthGt(Array(0, 1))) } test("Params.copyValues") { - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-8473] [SPARK-9889] [ML] User guide and example code for DCT
Repository: spark Updated Branches: refs/heads/branch-1.5 e1b50c7d2 -> 4ee225af8 [SPARK-8473] [SPARK-9889] [ML] User guide and example code for DCT mengxr jkbradley Author: Feynman Liang Closes #8184 from feynmanliang/SPARK-9889-DCT-docs. (cherry picked from commit badf7fa650f9801c70515907fcc26b58d7ec3143) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4ee225af Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4ee225af Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4ee225af Branch: refs/heads/branch-1.5 Commit: 4ee225af8ecb38fbcf8e43ac1c498a76f3590b98 Parents: e1b50c7 Author: Feynman Liang Authored: Tue Aug 18 17:54:49 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 18 17:54:58 2015 -0700 -- docs/ml-features.md | 71 1 file changed, 71 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/4ee225af/docs/ml-features.md -- diff --git a/docs/ml-features.md b/docs/ml-features.md index 6b2e36b..28a6193 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -649,6 +649,77 @@ for expanded in polyDF.select("polyFeatures").take(3): +## Discrete Cosine Transform (DCT) + +The [Discrete Cosine +Transform](https://en.wikipedia.org/wiki/Discrete_cosine_transform) +transforms a length $N$ real-valued sequence in the time domain into +another length $N$ real-valued sequence in the frequency domain. A +[DCT](api/scala/index.html#org.apache.spark.ml.feature.DCT) class +provides this functionality, implementing the +[DCT-II](https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II) +and scaling the result by $1/\sqrt{2}$ such that the representing matrix +for the transform is unitary. No shift is applied to the transformed +sequence (e.g. the $0$th element of the transformed sequence is the +$0$th DCT coefficient and _not_ the $N/2$th). 
+ + + +{% highlight scala %} +import org.apache.spark.ml.feature.DCT +import org.apache.spark.mllib.linalg.Vectors + +val data = Seq( + Vectors.dense(0.0, 1.0, -2.0, 3.0), + Vectors.dense(-1.0, 2.0, 4.0, -7.0), + Vectors.dense(14.0, -2.0, -5.0, 1.0)) +val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features") +val dct = new DCT() + .setInputCol("features") + .setOutputCol("featuresDCT") + .setInverse(false) +val dctDf = dct.transform(df) +dctDf.select("featuresDCT").show(3) +{% endhighlight %} + + + +{% highlight java %} +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.ml.feature.DCT; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.VectorUDT; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + +JavaRDD data = jsc.parallelize(Arrays.asList( + RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)), + RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)), + RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0)) +)); +StructType schema = new StructType(new StructField[] { + new StructField("features", new VectorUDT(), false, Metadata.empty()), +}); +DataFrame df = jsql.createDataFrame(data, schema); +DCT dct = new DCT() + .setInputCol("features") + .setOutputCol("featuresDCT") + .setInverse(false); +DataFrame dctDf = dct.transform(df); +dctDf.select("featuresDCT").show(3); +{% endhighlight %} + + + ## StringIndexer `StringIndexer` encodes a string column of labels to a column of label indices. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
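For concreteness, the unitary DCT-II that this guide section refers to is commonly written (standard textbook convention) as

$$
X_k = \sqrt{\frac{2}{N}} \, c_k \sum_{n=0}^{N-1} x_n \cos\left[\frac{\pi}{N}\left(n + \frac{1}{2}\right) k\right],
\qquad c_0 = \frac{1}{\sqrt{2}}, \quad c_k = 1 \text{ for } k \ge 1,
$$

for $k = 0, 1, \ldots, N-1$; with this normalization the $N \times N$ transform matrix is unitary, which is consistent with the $1/\sqrt{2}$ scaling described in the guide text. The Scala and Java snippets above apply one such transform per input row.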
spark git commit: [SPARK-8473] [SPARK-9889] [ML] User guide and example code for DCT
Repository: spark Updated Branches: refs/heads/master 9108eff74 -> badf7fa65 [SPARK-8473] [SPARK-9889] [ML] User guide and example code for DCT mengxr jkbradley Author: Feynman Liang Closes #8184 from feynmanliang/SPARK-9889-DCT-docs. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/badf7fa6 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/badf7fa6 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/badf7fa6 Branch: refs/heads/master Commit: badf7fa650f9801c70515907fcc26b58d7ec3143 Parents: 9108eff Author: Feynman Liang Authored: Tue Aug 18 17:54:49 2015 -0700 Committer: Joseph K. Bradley Committed: Tue Aug 18 17:54:49 2015 -0700 -- docs/ml-features.md | 71 1 file changed, 71 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/badf7fa6/docs/ml-features.md -- diff --git a/docs/ml-features.md b/docs/ml-features.md index 6b2e36b..28a6193 100644 --- a/docs/ml-features.md +++ b/docs/ml-features.md @@ -649,6 +649,77 @@ for expanded in polyDF.select("polyFeatures").take(3): +## Discrete Cosine Transform (DCT) + +The [Discrete Cosine +Transform](https://en.wikipedia.org/wiki/Discrete_cosine_transform) +transforms a length $N$ real-valued sequence in the time domain into +another length $N$ real-valued sequence in the frequency domain. A +[DCT](api/scala/index.html#org.apache.spark.ml.feature.DCT) class +provides this functionality, implementing the +[DCT-II](https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II) +and scaling the result by $1/\sqrt{2}$ such that the representing matrix +for the transform is unitary. No shift is applied to the transformed +sequence (e.g. the $0$th element of the transformed sequence is the +$0$th DCT coefficient and _not_ the $N/2$th). 
+ + + +{% highlight scala %} +import org.apache.spark.ml.feature.DCT +import org.apache.spark.mllib.linalg.Vectors + +val data = Seq( + Vectors.dense(0.0, 1.0, -2.0, 3.0), + Vectors.dense(-1.0, 2.0, 4.0, -7.0), + Vectors.dense(14.0, -2.0, -5.0, 1.0)) +val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features") +val dct = new DCT() + .setInputCol("features") + .setOutputCol("featuresDCT") + .setInverse(false) +val dctDf = dct.transform(df) +dctDf.select("featuresDCT").show(3) +{% endhighlight %} + + + +{% highlight java %} +import java.util.Arrays; + +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.ml.feature.DCT; +import org.apache.spark.mllib.linalg.Vector; +import org.apache.spark.mllib.linalg.VectorUDT; +import org.apache.spark.mllib.linalg.Vectors; +import org.apache.spark.sql.DataFrame; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.RowFactory; +import org.apache.spark.sql.SQLContext; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.sql.types.StructType; + +JavaRDD data = jsc.parallelize(Arrays.asList( + RowFactory.create(Vectors.dense(0.0, 1.0, -2.0, 3.0)), + RowFactory.create(Vectors.dense(-1.0, 2.0, 4.0, -7.0)), + RowFactory.create(Vectors.dense(14.0, -2.0, -5.0, 1.0)) +)); +StructType schema = new StructType(new StructField[] { + new StructField("features", new VectorUDT(), false, Metadata.empty()), +}); +DataFrame df = jsql.createDataFrame(data, schema); +DCT dct = new DCT() + .setInputCol("features") + .setOutputCol("featuresDCT") + .setInverse(false); +DataFrame dctDf = dct.transform(df); +dctDf.select("featuresDCT").show(3); +{% endhighlight %} + + + ## StringIndexer `StringIndexer` encodes a string column of labels to a column of label indices. - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-10097] Adds `shouldMaximize` flag to `ml.evaluation.Evaluator`
Repository: spark Updated Branches: refs/heads/master 5fd53c64b -> 28a98464e [SPARK-10097] Adds `shouldMaximize` flag to `ml.evaluation.Evaluator` Previously, users of evaluator (`CrossValidator` and `TrainValidationSplit`) would only maximize the metric in evaluator, leading to a hacky solution which negated metrics to be minimized and caused erroneous negative values to be reported to the user. This PR adds a `isLargerBetter` attribute to the `Evaluator` base class, instructing users of `Evaluator` on whether the chosen metric should be maximized or minimized. CC jkbradley Author: Feynman Liang Author: Joseph K. Bradley Closes #8290 from feynmanliang/SPARK-10097. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/28a98464 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/28a98464 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/28a98464 Branch: refs/heads/master Commit: 28a98464ea65aa7b35e24fca5ddaa60c2e5d53ee Parents: 5fd53c6 Author: Feynman Liang Authored: Wed Aug 19 11:35:05 2015 -0700 Committer: Joseph K. Bradley Committed: Wed Aug 19 11:35:05 2015 -0700 -- .../BinaryClassificationEvaluator.scala | 20 .../apache/spark/ml/evaluation/Evaluator.scala | 7 +++ .../MulticlassClassificationEvaluator.scala | 8 .../ml/evaluation/RegressionEvaluator.scala | 19 +++ .../apache/spark/ml/tuning/CrossValidator.scala | 4 +++- .../spark/ml/tuning/TrainValidationSplit.scala | 4 +++- .../evaluation/RegressionEvaluatorSuite.scala | 4 ++-- .../spark/ml/tuning/CrossValidatorSuite.scala | 2 ++ .../ml/tuning/TrainValidationSplitSuite.scala | 2 ++ python/pyspark/ml/evaluation.py | 4 ++-- 10 files changed, 52 insertions(+), 22 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/28a98464/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala index 5d5cb7e..56419a0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala @@ -40,8 +40,11 @@ class BinaryClassificationEvaluator(override val uid: String) * param for metric name in evaluation * @group param */ - val metricName: Param[String] = new Param(this, "metricName", -"metric name in evaluation (areaUnderROC|areaUnderPR)") + val metricName: Param[String] = { +val allowedParams = ParamValidators.inArray(Array("areaUnderROC", "areaUnderPR")) +new Param( + this, "metricName", "metric name in evaluation (areaUnderROC|areaUnderPR)", allowedParams) + } /** @group getParam */ def getMetricName: String = $(metricName) @@ -76,16 +79,17 @@ class BinaryClassificationEvaluator(override val uid: String) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { - case "areaUnderROC" => -metrics.areaUnderROC() - case "areaUnderPR" => -metrics.areaUnderPR() - case other => -throw new IllegalArgumentException(s"Does not support metric $other.") + case "areaUnderROC" => metrics.areaUnderROC() + case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } + override def isLargerBetter: Boolean = $(metricName) match { +case "areaUnderROC" => true +case "areaUnderPR" => true + } + override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) } 
http://git-wip-us.apache.org/repos/asf/spark/blob/28a98464/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala index e56c946..13bd330 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala @@ -46,5 +46,12 @@ abstract class Evaluator extends Params { */ def evaluate(dataset: DataFrame): Double + /** + * Indicates whether the metric returned by [[evaluate()]] should be maximized (true, default) + * or minimized (false). + * A given evaluator may supp
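A minimal sketch of how a tuning loop uses the new flag instead of negating metrics (the metric values here are made up):

{% highlight scala %}
import org.apache.spark.ml.evaluation.RegressionEvaluator

// RMSE should be minimized, so isLargerBetter is false for this evaluator.
val evaluator = new RegressionEvaluator().setMetricName("rmse")

// Hypothetical per-candidate metrics collected by CrossValidator or
// TrainValidationSplit while sweeping a param grid.
val metrics = Array(1.2, 0.9, 1.5)

// Select the best candidate in the direction the evaluator declares,
// so no metric ever has to be reported with a flipped sign.
val bestIndex =
  if (evaluator.isLargerBetter) metrics.indices.maxBy(metrics(_))
  else metrics.indices.minBy(metrics(_))
{% endhighlight %}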
spark git commit: [SPARK-10097] Adds `shouldMaximize` flag to `ml.evaluation.Evaluator`
Repository: spark Updated Branches: refs/heads/branch-1.5 a8e880818 -> f25c32475 [SPARK-10097] Adds `shouldMaximize` flag to `ml.evaluation.Evaluator` Previously, users of evaluator (`CrossValidator` and `TrainValidationSplit`) would only maximize the metric in evaluator, leading to a hacky solution which negated metrics to be minimized and caused erroneous negative values to be reported to the user. This PR adds a `isLargerBetter` attribute to the `Evaluator` base class, instructing users of `Evaluator` on whether the chosen metric should be maximized or minimized. CC jkbradley Author: Feynman Liang Author: Joseph K. Bradley Closes #8290 from feynmanliang/SPARK-10097. (cherry picked from commit 28a98464ea65aa7b35e24fca5ddaa60c2e5d53ee) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f25c3247 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f25c3247 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f25c3247 Branch: refs/heads/branch-1.5 Commit: f25c324758095ddf572157e64c8f1a93843f79c7 Parents: a8e8808 Author: Feynman Liang Authored: Wed Aug 19 11:35:05 2015 -0700 Committer: Joseph K. Bradley Committed: Wed Aug 19 11:35:17 2015 -0700 -- .../BinaryClassificationEvaluator.scala | 20 .../apache/spark/ml/evaluation/Evaluator.scala | 7 +++ .../MulticlassClassificationEvaluator.scala | 8 .../ml/evaluation/RegressionEvaluator.scala | 19 +++ .../apache/spark/ml/tuning/CrossValidator.scala | 4 +++- .../spark/ml/tuning/TrainValidationSplit.scala | 4 +++- .../evaluation/RegressionEvaluatorSuite.scala | 4 ++-- .../spark/ml/tuning/CrossValidatorSuite.scala | 2 ++ .../ml/tuning/TrainValidationSplitSuite.scala | 2 ++ python/pyspark/ml/evaluation.py | 4 ++-- 10 files changed, 52 insertions(+), 22 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/f25c3247/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala index 5d5cb7e..56419a0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.scala @@ -40,8 +40,11 @@ class BinaryClassificationEvaluator(override val uid: String) * param for metric name in evaluation * @group param */ - val metricName: Param[String] = new Param(this, "metricName", -"metric name in evaluation (areaUnderROC|areaUnderPR)") + val metricName: Param[String] = { +val allowedParams = ParamValidators.inArray(Array("areaUnderROC", "areaUnderPR")) +new Param( + this, "metricName", "metric name in evaluation (areaUnderROC|areaUnderPR)", allowedParams) + } /** @group getParam */ def getMetricName: String = $(metricName) @@ -76,16 +79,17 @@ class BinaryClassificationEvaluator(override val uid: String) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { - case "areaUnderROC" => -metrics.areaUnderROC() - case "areaUnderPR" => -metrics.areaUnderPR() - case other => -throw new IllegalArgumentException(s"Does not support metric $other.") + case "areaUnderROC" => metrics.areaUnderROC() + case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } + override def isLargerBetter: Boolean = $(metricName) match { +case "areaUnderROC" => true +case "areaUnderPR" 
=> true + } + override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) } http://git-wip-us.apache.org/repos/asf/spark/blob/f25c3247/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala -- diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala index e56c946..13bd330 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/Evaluator.scala @@ -46,5 +46,12 @@ abstract class Evaluator extends Params { */ def evaluate(dataset: DataFrame): Double + /** + * Indicates whether the metric returned by [
spark git commit: [SPARK-15502][DOC][ML][PYSPARK] add guide note that ALS only supports integer ids
Repository: spark Updated Branches: refs/heads/branch-2.0 2574abea0 -> 31fb5fa40 [SPARK-15502][DOC][ML][PYSPARK] add guide note that ALS only supports integer ids This PR adds a note to clarify that the ML API for ALS only supports integers for user/item ids, and that other types for these columns can be used but the ids must fall within integer range. (Refer [SPARK-14891](https://issues.apache.org/jira/browse/SPARK-14891)). Also cleaned up a reference to `mllib` in the ML doc. ## How was this patch tested? Built and viewed User Guide doc locally. Author: Nick Pentreath Closes #13278 from MLnick/SPARK-15502-als-int-id-doc-note. (cherry picked from commit 20900e5feced76e87f0a12823d0e3f07e082105f) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/31fb5fa4 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/31fb5fa4 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/31fb5fa4 Branch: refs/heads/branch-2.0 Commit: 31fb5fa4042eb46c541e5726a3f14da6f9f6bc2d Parents: 2574abe Author: Nick Pentreath Authored: Tue May 24 11:34:06 2016 -0700 Committer: Joseph K. Bradley Committed: Tue May 24 11:34:15 2016 -0700 -- docs/ml-collaborative-filtering.md | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/31fb5fa4/docs/ml-collaborative-filtering.md -- diff --git a/docs/ml-collaborative-filtering.md b/docs/ml-collaborative-filtering.md index bd3d527..8bd75f3 100644 --- a/docs/ml-collaborative-filtering.md +++ b/docs/ml-collaborative-filtering.md @@ -29,6 +29,10 @@ following parameters: *baseline* confidence in preference observations (defaults to 1.0). * *nonnegative* specifies whether or not to use nonnegative constraints for least squares (defaults to `false`). +**Note:** The DataFrame-based API for ALS currently only supports integers for user and item ids. +Other numeric types are supported for the user and item id columns, +but the ids must be within the integer value range. + ### Explicit vs. implicit feedback The standard approach to matrix factorization based collaborative filtering treats @@ -36,7 +40,7 @@ the entries in the user-item matrix as *explicit* preferences given by the user for example, users giving ratings to movies. It is common in many real-world use cases to only have access to *implicit feedback* (e.g. views, -clicks, purchases, likes, shares etc.). The approach used in `spark.mllib` to deal with such data is taken +clicks, purchases, likes, shares etc.). The approach used in `spark.ml` to deal with such data is taken from [Collaborative Filtering for Implicit Feedback Datasets](http://dx.doi.org/10.1109/ICDM.2008.22). Essentially, instead of trying to model the matrix of ratings directly, this approach treats the data as numbers representing the *strength* in observations of user actions (such as the number of clicks, - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
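A minimal sketch of the id-column requirement described in the note (assumes the shell's `spark` session; the ratings data is made up):

{% highlight scala %}
import org.apache.spark.ml.recommendation.ALS
import org.apache.spark.sql.functions.col

// Hypothetical ratings whose Long ids happen to fit in the Int value range.
val ratings = spark.createDataFrame(Seq(
  (1L, 10L, 4.0f),
  (1L, 20L, 2.5f),
  (2L, 10L, 5.0f)
)).toDF("userId", "itemId", "rating")

// Non-integer numeric id columns are accepted, but their values must stay
// within the integer range; casting up front makes that contract explicit.
val prepared = ratings
  .withColumn("userId", col("userId").cast("int"))
  .withColumn("itemId", col("itemId").cast("int"))

val model = new ALS()
  .setUserCol("userId")
  .setItemCol("itemId")
  .setRatingCol("rating")
  .fit(prepared)
{% endhighlight %}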
spark git commit: [SPARK-15502][DOC][ML][PYSPARK] add guide note that ALS only supports integer ids
Repository: spark Updated Branches: refs/heads/master be99a99fe -> 20900e5fe [SPARK-15502][DOC][ML][PYSPARK] add guide note that ALS only supports integer ids This PR adds a note to clarify that the ML API for ALS only supports integers for user/item ids, and that other types for these columns can be used but the ids must fall within integer range. (Refer [SPARK-14891](https://issues.apache.org/jira/browse/SPARK-14891)). Also cleaned up a reference to `mllib` in the ML doc. ## How was this patch tested? Built and viewed User Guide doc locally. Author: Nick Pentreath Closes #13278 from MLnick/SPARK-15502-als-int-id-doc-note. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/20900e5f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/20900e5f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/20900e5f Branch: refs/heads/master Commit: 20900e5feced76e87f0a12823d0e3f07e082105f Parents: be99a99 Author: Nick Pentreath Authored: Tue May 24 11:34:06 2016 -0700 Committer: Joseph K. Bradley Committed: Tue May 24 11:34:06 2016 -0700 -- docs/ml-collaborative-filtering.md | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/20900e5f/docs/ml-collaborative-filtering.md -- diff --git a/docs/ml-collaborative-filtering.md b/docs/ml-collaborative-filtering.md index bd3d527..8bd75f3 100644 --- a/docs/ml-collaborative-filtering.md +++ b/docs/ml-collaborative-filtering.md @@ -29,6 +29,10 @@ following parameters: *baseline* confidence in preference observations (defaults to 1.0). * *nonnegative* specifies whether or not to use nonnegative constraints for least squares (defaults to `false`). +**Note:** The DataFrame-based API for ALS currently only supports integers for user and item ids. +Other numeric types are supported for the user and item id columns, +but the ids must be within the integer value range. + ### Explicit vs. implicit feedback The standard approach to matrix factorization based collaborative filtering treats @@ -36,7 +40,7 @@ the entries in the user-item matrix as *explicit* preferences given by the user for example, users giving ratings to movies. It is common in many real-world use cases to only have access to *implicit feedback* (e.g. views, -clicks, purchases, likes, shares etc.). The approach used in `spark.mllib` to deal with such data is taken +clicks, purchases, likes, shares etc.). The approach used in `spark.ml` to deal with such data is taken from [Collaborative Filtering for Implicit Feedback Datasets](http://dx.doi.org/10.1109/ICDM.2008.22). Essentially, instead of trying to model the matrix of ratings directly, this approach treats the data as numbers representing the *strength* in observations of user actions (such as the number of clicks, - To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org
spark git commit: [SPARK-15457][MLLIB][ML] Eliminate some warnings from MLlib about deprecations
Repository: spark Updated Branches: refs/heads/master 0f61d6efb -> b0a03feef [SPARK-15457][MLLIB][ML] Eliminate some warnings from MLlib about deprecations ## What changes were proposed in this pull request? Several classes and methods have been deprecated and are creating lots of build warnings in branch-2.0. This issue is to identify and fix those items: * WithSGD classes: Change to make class not deprecated, object deprecated, and public class constructor deprecated. Any public use will require a deprecated API. We need to keep a non-deprecated private API since we cannot eliminate certain uses: Python API, streaming algs, and examples. * Use in PythonMLlibAPI: Change to using private constructors * Streaming algs: No warnings after we un-deprecate the classes * Examples: Deprecate or change ones which use deprecated APIs * MulticlassMetrics fields (precision, etc.) * LinearRegressionSummary.model field ## How was this patch tested? Existing tests. Checked for warnings manually. Author: Sean Owen Author: Joseph K. Bradley Closes #13314 from jkbradley/warning-cleanups. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b0a03fee Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b0a03fee Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b0a03fee Branch: refs/heads/master Commit: b0a03feef2cf4daa7642ec7f4dc479dbd473b581 Parents: 0f61d6e Author: Sean Owen Authored: Thu May 26 14:25:28 2016 -0700 Committer: Joseph K. Bradley Committed: Thu May 26 14:25:28 2016 -0700 -- .../JavaLogisticRegressionWithLBFGSExample.java | 4 +- ...aMulticlassClassificationMetricsExample.java | 4 +- .../spark/examples/ml/DecisionTreeExample.scala | 2 +- .../examples/mllib/DecisionTreeRunner.scala | 10 ++-- .../mllib/GradientBoostedTreesRunner.scala | 5 +- .../spark/examples/mllib/LinearRegression.scala | 1 + .../mllib/LinearRegressionWithSGDExample.scala | 1 + .../LogisticRegressionWithLBFGSExample.scala| 4 +- .../mllib/MulticlassMetricsExample.scala| 8 +-- .../spark/examples/mllib/PCAExample.scala | 1 + .../mllib/RegressionMetricsExample.scala| 2 + .../MulticlassClassificationEvaluator.scala | 4 +- .../spark/ml/regression/LinearRegression.scala | 53 +++- .../spark/mllib/api/python/PythonMLLibAPI.scala | 8 +-- .../classification/LogisticRegression.scala | 4 +- .../apache/spark/mllib/regression/Lasso.scala | 6 +-- .../mllib/regression/LinearRegression.scala | 2 +- .../mllib/regression/RidgeRegression.scala | 6 +-- 18 files changed, 63 insertions(+), 62 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/b0a03fee/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java index 9d8e4a9..7fc371e 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java @@ -65,8 +65,8 @@ public class JavaLogisticRegressionWithLBFGSExample { // Get evaluation metrics. 
MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd()); -double precision = metrics.precision(); -System.out.println("Precision = " + precision); +double accuracy = metrics.accuracy(); +System.out.println("Accuracy = " + accuracy); // Save and load model model.save(sc, "target/tmp/javaLogisticRegressionWithLBFGSModel"); http://git-wip-us.apache.org/repos/asf/spark/blob/b0a03fee/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java index 5247c9c..e84a3a7 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java @@ -68,9 +68,7 @@ public class JavaMulticlassClassificationMetricsExample { System.out.println("Confusion matrix: \n" + confusion); // Overall statistics -System.out.println("Precision = " +
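The same replacement in Scala, for reference (assumes the shell's `sc`; the prediction/label pairs are made up, whereas in the examples above they come from a trained model):

{% highlight scala %}
import org.apache.spark.mllib.evaluation.MulticlassMetrics

// Hypothetical (prediction, label) pairs.
val predictionAndLabels = sc.parallelize(Seq(
  (0.0, 0.0), (1.0, 1.0), (1.0, 0.0), (2.0, 2.0)
))

val metrics = new MulticlassMetrics(predictionAndLabels)

// The no-argument precision/recall are deprecated; accuracy is the
// replacement that the updated examples now report.
println(s"Accuracy = ${metrics.accuracy}")
{% endhighlight %}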
spark git commit: [SPARK-15457][MLLIB][ML] Eliminate some warnings from MLlib about deprecations
Repository: spark Updated Branches: refs/heads/branch-2.0 6eea33ec3 -> 216e39505 [SPARK-15457][MLLIB][ML] Eliminate some warnings from MLlib about deprecations ## What changes were proposed in this pull request? Several classes and methods have been deprecated and are creating lots of build warnings in branch-2.0. This issue is to identify and fix those items: * WithSGD classes: Change to make class not deprecated, object deprecated, and public class constructor deprecated. Any public use will require a deprecated API. We need to keep a non-deprecated private API since we cannot eliminate certain uses: Python API, streaming algs, and examples. * Use in PythonMLlibAPI: Change to using private constructors * Streaming algs: No warnings after we un-deprecate the classes * Examples: Deprecate or change ones which use deprecated APIs * MulticlassMetrics fields (precision, etc.) * LinearRegressionSummary.model field ## How was this patch tested? Existing tests. Checked for warnings manually. Author: Sean Owen Author: Joseph K. Bradley Closes #13314 from jkbradley/warning-cleanups. (cherry picked from commit b0a03feef2cf4daa7642ec7f4dc479dbd473b581) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/216e3950 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/216e3950 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/216e3950 Branch: refs/heads/branch-2.0 Commit: 216e39505ef8861d12e31d5117fad90e57bed885 Parents: 6eea33e Author: Sean Owen Authored: Thu May 26 14:25:28 2016 -0700 Committer: Joseph K. Bradley Committed: Thu May 26 14:25:39 2016 -0700 -- .../JavaLogisticRegressionWithLBFGSExample.java | 4 +- ...aMulticlassClassificationMetricsExample.java | 4 +- .../spark/examples/ml/DecisionTreeExample.scala | 2 +- .../examples/mllib/DecisionTreeRunner.scala | 10 ++-- .../mllib/GradientBoostedTreesRunner.scala | 5 +- .../spark/examples/mllib/LinearRegression.scala | 1 + .../mllib/LinearRegressionWithSGDExample.scala | 1 + .../LogisticRegressionWithLBFGSExample.scala| 4 +- .../mllib/MulticlassMetricsExample.scala| 8 +-- .../spark/examples/mllib/PCAExample.scala | 1 + .../mllib/RegressionMetricsExample.scala| 2 + .../MulticlassClassificationEvaluator.scala | 4 +- .../spark/ml/regression/LinearRegression.scala | 53 +++- .../spark/mllib/api/python/PythonMLLibAPI.scala | 8 +-- .../classification/LogisticRegression.scala | 4 +- .../apache/spark/mllib/regression/Lasso.scala | 6 +-- .../mllib/regression/LinearRegression.scala | 2 +- .../mllib/regression/RidgeRegression.scala | 6 +-- 18 files changed, 63 insertions(+), 62 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/216e3950/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java index 9d8e4a9..7fc371e 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java @@ -65,8 +65,8 @@ public class JavaLogisticRegressionWithLBFGSExample { // Get evaluation metrics. 
MulticlassMetrics metrics = new MulticlassMetrics(predictionAndLabels.rdd()); -double precision = metrics.precision(); -System.out.println("Precision = " + precision); +double accuracy = metrics.accuracy(); +System.out.println("Accuracy = " + accuracy); // Save and load model model.save(sc, "target/tmp/javaLogisticRegressionWithLBFGSModel"); http://git-wip-us.apache.org/repos/asf/spark/blob/216e3950/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java -- diff --git a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java index 5247c9c..e84a3a7 100644 --- a/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java +++ b/examples/src/main/java/org/apache/spark/examples/mllib/JavaMulticlassClassificationMetricsExample.java @@ -68,9 +68,7 @@ public class JavaMulticlassClassificationMetricsExample { System.out.println("Confusion matrix: \n"
spark git commit: [SPARK-15186][ML][DOCS] Add user guide for generalized linear regression
Repository: spark Updated Branches: refs/heads/master a96e4151a -> c96244f5a [SPARK-15186][ML][DOCS] Add user guide for generalized linear regression ## What changes were proposed in this pull request? This patch adds a user guide section for generalized linear regression and includes the examples from [#12754](https://github.com/apache/spark/pull/12754). ## How was this patch tested? Documentation only, no tests required. ## Approach In general, it is a bit unclear what level of detail ought to be included in the user guide since there is a lot of variability within the current user guide. I tried to give a fairly brief mathematical introduction to GLMs, and cover what types of problems they could be used for. Additionally, I included a brief blurb on the IRLS solver. The input/output columns are given in a table as is found elsewhere in the docs (though, again, these appear rather intermittently in the current docs), as well as a table providing the supported families and their link functions. Author: sethah Closes #13139 from sethah/SPARK-15186. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c96244f5 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c96244f5 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c96244f5 Branch: refs/heads/master Commit: c96244f5acd8b335e34694c171bab32d92e6e0fb Parents: a96e415 Author: sethah Authored: Fri May 27 12:55:48 2016 -0700 Committer: Joseph K. Bradley Committed: Fri May 27 12:55:48 2016 -0700 -- docs/ml-classification-regression.md | 132 ++ 1 file changed, 132 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/c96244f5/docs/ml-classification-regression.md -- diff --git a/docs/ml-classification-regression.md b/docs/ml-classification-regression.md index f1a21f4..ff8dec6 100644 --- a/docs/ml-classification-regression.md +++ b/docs/ml-classification-regression.md @@ -374,6 +374,138 @@ regression model and extracting model summary statistics. +## Generalized linear regression + +Contrasted with linear regression where the output is assumed to follow a Gaussian +distribution, [generalized linear models](https://en.wikipedia.org/wiki/Generalized_linear_model) (GLMs) are specifications of linear models where the response variable $Y_i$ follows some +distribution from the [exponential family of distributions](https://en.wikipedia.org/wiki/Exponential_family). +Spark's `GeneralizedLinearRegression` interface +allows for flexible specification of GLMs which can be used for various types of +prediction problems including linear regression, Poisson regression, logistic regression, and others. +Currently in `spark.ml`, only a subset of the exponential family distributions are supported and they are listed +[below](#available-families). + +**NOTE**: Spark currently only supports up to 4096 features through its `GeneralizedLinearRegression` +interface, and will throw an exception if this constraint is exceeded. See the [advanced section](ml-advanced) for more details. + Still, for linear and logistic regression, models with an increased number of features can be trained + using the `LinearRegression` and `LogisticRegression` estimators. + +GLMs require exponential family distributions that can be written in their "canonical" or "natural" form, aka +[natural exponential family distributions](https://en.wikipedia.org/wiki/Natural_exponential_family). 
The form of a natural exponential family distribution is given as: + +$$ +f_Y(y|\theta, \tau) = h(y, \tau)\exp{\left( \frac{\theta \cdot y - A(\theta)}{d(\tau)} \right)} +$$ + +where $\theta$ is the parameter of interest and $\tau$ is a dispersion parameter. In a GLM the response variable $Y_i$ is assumed to be drawn from a natural exponential family distribution: + +$$ +Y_i \sim f\left(\cdot|\theta_i, \tau \right) +$$ + +where the parameter of interest $\theta_i$ is related to the expected value of the response variable $\mu_i$ by + +$$ +\mu_i = A'(\theta_i) +$$ + +Here, $A'(\theta_i)$ is defined by the form of the distribution selected. GLMs also allow specification +of a link function, which defines the relationship between the expected value of the response variable $\mu_i$ +and the so called _linear predictor_ $\eta_i$: + +$$ +g(\mu_i) = \eta_i = \vec{x_i}^T \cdot \vec{\beta} +$$ + +Often, the link function is chosen such that $A' = g^{-1}$, which yields a simplified relationship +between the parameter of interest $\theta$ and the linear predictor $\eta$. In this case, the link +function $g(\mu)$ is said to be the "canonical" link function. + +$$ +\theta_i = A'^{-1}(\mu_i) = g(g^{-1}(\eta_i)) = \eta_i +$$ + +A GLM finds the regression coefficients
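A minimal usage sketch of the estimator this guide section documents (tiny made-up data and the shell's `spark` session; the shipped example reads a libsvm file instead):

{% highlight scala %}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.GeneralizedLinearRegression

// Tiny hypothetical training set.
val dataset = spark.createDataFrame(Seq(
  (1.0, Vectors.dense(0.0, 1.1)),
  (2.0, Vectors.dense(2.0, 1.0)),
  (4.0, Vectors.dense(3.0, 3.1))
)).toDF("label", "features")

// Gaussian family with the identity link is ordinary linear regression;
// other supported families (binomial, poisson, gamma) and their links are
// selected the same way via setFamily/setLink.
val glr = new GeneralizedLinearRegression()
  .setFamily("gaussian")
  .setLink("identity")
  .setMaxIter(10)
  .setRegParam(0.3)

val model = glr.fit(dataset)
println(s"Coefficients: ${model.coefficients}  Intercept: ${model.intercept}")
{% endhighlight %}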
spark git commit: [SPARK-15186][ML][DOCS] Add user guide for generalized linear regression
Repository: spark Updated Branches: refs/heads/branch-2.0 d76e066d3 -> 5dd1423f4 [SPARK-15186][ML][DOCS] Add user guide for generalized linear regression ## What changes were proposed in this pull request? This patch adds a user guide section for generalized linear regression and includes the examples from [#12754](https://github.com/apache/spark/pull/12754). ## How was this patch tested? Documentation only, no tests required. ## Approach In general, it is a bit unclear what level of detail ought to be included in the user guide since there is a lot of variability within the current user guide. I tried to give a fairly brief mathematical introduction to GLMs, and cover what types of problems they could be used for. Additionally, I included a brief blurb on the IRLS solver. The input/output columns are given in a table as is found elsewhere in the docs (though, again, these appear rather intermittently in the current docs), as well as a table providing the supported families and their link functions. Author: sethah Closes #13139 from sethah/SPARK-15186. (cherry picked from commit c96244f5acd8b335e34694c171bab32d92e6e0fb) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5dd1423f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5dd1423f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5dd1423f Branch: refs/heads/branch-2.0 Commit: 5dd1423f462f03b7ae625a93cdaf9d882969afb6 Parents: d76e066 Author: sethah Authored: Fri May 27 12:55:48 2016 -0700 Committer: Joseph K. Bradley Committed: Fri May 27 12:56:00 2016 -0700 -- docs/ml-classification-regression.md | 132 ++ 1 file changed, 132 insertions(+) -- http://git-wip-us.apache.org/repos/asf/spark/blob/5dd1423f/docs/ml-classification-regression.md -- diff --git a/docs/ml-classification-regression.md b/docs/ml-classification-regression.md index f1a21f4..ff8dec6 100644 --- a/docs/ml-classification-regression.md +++ b/docs/ml-classification-regression.md @@ -374,6 +374,138 @@ regression model and extracting model summary statistics. +## Generalized linear regression + +Contrasted with linear regression where the output is assumed to follow a Gaussian +distribution, [generalized linear models](https://en.wikipedia.org/wiki/Generalized_linear_model) (GLMs) are specifications of linear models where the response variable $Y_i$ follows some +distribution from the [exponential family of distributions](https://en.wikipedia.org/wiki/Exponential_family). +Spark's `GeneralizedLinearRegression` interface +allows for flexible specification of GLMs which can be used for various types of +prediction problems including linear regression, Poisson regression, logistic regression, and others. +Currently in `spark.ml`, only a subset of the exponential family distributions are supported and they are listed +[below](#available-families). + +**NOTE**: Spark currently only supports up to 4096 features through its `GeneralizedLinearRegression` +interface, and will throw an exception if this constraint is exceeded. See the [advanced section](ml-advanced) for more details. + Still, for linear and logistic regression, models with an increased number of features can be trained + using the `LinearRegression` and `LogisticRegression` estimators. + +GLMs require exponential family distributions that can be written in their "canonical" or "natural" form, aka +[natural exponential family distributions](https://en.wikipedia.org/wiki/Natural_exponential_family). 
The form of a natural exponential family distribution is given as: + +$$ +f_Y(y|\theta, \tau) = h(y, \tau)\exp{\left( \frac{\theta \cdot y - A(\theta)}{d(\tau)} \right)} +$$ + +where $\theta$ is the parameter of interest and $\tau$ is a dispersion parameter. In a GLM the response variable $Y_i$ is assumed to be drawn from a natural exponential family distribution: + +$$ +Y_i \sim f\left(\cdot|\theta_i, \tau \right) +$$ + +where the parameter of interest $\theta_i$ is related to the expected value of the response variable $\mu_i$ by + +$$ +\mu_i = A'(\theta_i) +$$ + +Here, $A'(\theta_i)$ is defined by the form of the distribution selected. GLMs also allow specification +of a link function, which defines the relationship between the expected value of the response variable $\mu_i$ +and the so called _linear predictor_ $\eta_i$: + +$$ +g(\mu_i) = \eta_i = \vec{x_i}^T \cdot \vec{\beta} +$$ + +Often, the link function is chosen such that $A' = g^{-1}$, which yields a simplified relationship +between the parameter of interest $\theta$ and the linear predictor $\eta$. In this case, the link +function $g(\mu)$ is said to be the "canonical" link functi
spark git commit: [SPARK-11959][SPARK-15484][DOC][ML] Document WLS and IRLS
Repository: spark Updated Branches: refs/heads/master c96244f5a -> a3550e374 [SPARK-11959][SPARK-15484][DOC][ML] Document WLS and IRLS ## What changes were proposed in this pull request? * Document ```WeightedLeastSquares```(normal equation) and ```IterativelyReweightedLeastSquares```. * Copy ```L-BFGS``` documents from ```spark.mllib``` to ```spark.ml```. Due to the session ```Optimization of linear methods``` is used for developers, I think we should provide the brief introduction of the optimization method, necessary references and how it implements in Spark. It's not necessary to paste all mathematical formula and derivation here. If developers/users want to learn more, they can track reference. ## How was this patch tested? Document update, no tests. Author: Yanbo Liang Closes #13262 from yanboliang/spark-15484. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a3550e37 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a3550e37 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a3550e37 Branch: refs/heads/master Commit: a3550e3747e21c79a5110132dc127ee83879062a Parents: c96244f Author: Yanbo Liang Authored: Fri May 27 13:16:22 2016 -0700 Committer: Joseph K. Bradley Committed: Fri May 27 13:16:22 2016 -0700 -- docs/ml-advanced.md | 85 ++-- .../IterativelyReweightedLeastSquares.scala | 2 +- 2 files changed, 81 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a3550e37/docs/ml-advanced.md -- diff --git a/docs/ml-advanced.md b/docs/ml-advanced.md index 91731d7..1c5f844 100644 --- a/docs/ml-advanced.md +++ b/docs/ml-advanced.md @@ -4,10 +4,85 @@ title: Advanced topics - spark.ml displayTitle: Advanced topics - spark.ml --- -# Optimization of linear methods +* Table of contents +{:toc} + +`\[ +\newcommand{\R}{\mathbb{R}} +\newcommand{\E}{\mathbb{E}} +\newcommand{\x}{\mathbf{x}} +\newcommand{\y}{\mathbf{y}} +\newcommand{\wv}{\mathbf{w}} +\newcommand{\av}{\mathbf{\alpha}} +\newcommand{\bv}{\mathbf{b}} +\newcommand{\N}{\mathbb{N}} +\newcommand{\id}{\mathbf{I}} +\newcommand{\ind}{\mathbf{1}} +\newcommand{\0}{\mathbf{0}} +\newcommand{\unit}{\mathbf{e}} +\newcommand{\one}{\mathbf{1}} +\newcommand{\zero}{\mathbf{0}} +\]` + +# Optimization of linear methods (developer) + +## Limited-memory BFGS (L-BFGS) +[L-BFGS](http://en.wikipedia.org/wiki/Limited-memory_BFGS) is an optimization +algorithm in the family of quasi-Newton methods to solve the optimization problems of the form +`$\min_{\wv \in\R^d} \; f(\wv)$`. The L-BFGS method approximates the objective function locally as a +quadratic without evaluating the second partial derivatives of the objective function to construct the +Hessian matrix. The Hessian matrix is approximated by previous gradient evaluations, so there is no +vertical scalability issue (the number of training features) unlike computing the Hessian matrix +explicitly in Newton's method. As a result, L-BFGS often achieves faster convergence compared with +other first-order optimizations. -The optimization algorithm underlying the implementation is called [Orthant-Wise Limited-memory -QuasiNewton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf) -(OWL-QN). It is an extension of L-BFGS that can effectively handle L1 -regularization and elastic net. +Quasi-Newton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf) +(OWL-QN) is an extension of L-BFGS that can effectively handle L1 and elastic net regularization. 
+ +L-BFGS is used as a solver for [LinearRegression](api/scala/index.html#org.apache.spark.ml.regression.LinearRegression), +[LogisticRegression](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegression), +[AFTSurvivalRegression](api/scala/index.html#org.apache.spark.ml.regression.AFTSurvivalRegression) +and [MultilayerPerceptronClassifier](api/scala/index.html#org.apache.spark.ml.classification.MultilayerPerceptronClassifier). + +MLlib L-BFGS solver calls the corresponding implementation in [breeze](https://github.com/scalanlp/breeze/blob/master/math/src/main/scala/breeze/optimize/LBFGS.scala). + +## Normal equation solver for weighted least squares + +MLlib implements normal equation solver for [weighted least squares](https://en.wikipedia.org/wiki/Least_squares#Weighted_least_squares) by [WeightedLeastSquares](https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala). + +Given $n$ weighted observations $(w_i, a_i, b_i)$: + +* $w_i$ the weight of i-th observation +* $a_i$ the features vector of i-th observation +* $b_i$ the label of i-th
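For reference, the unregularized weighted least squares problem behind this solver (the regularized, standardized form is covered in the guide text being added) is

$$
\min_{\mathbf{x} \in \mathbb{R}^m} \; \frac{1}{2} \sum_{i=1}^{n} w_i \left(\mathbf{a}_i^{T} \mathbf{x} - b_i\right)^2,
$$

whose normal equations are

$$
\left(A^T W A\right) \mathbf{x} = A^T W \mathbf{b},
$$

where $A$ is the $n \times m$ matrix with rows $\mathbf{a}_i^T$, $W = \mathrm{diag}(w_1, \ldots, w_n)$, and $\mathbf{b} = (b_1, \ldots, b_n)^T$. The $m \times m$ Gram matrix $A^T W A$ can be accumulated in a single distributed pass over the data, which is why the normal-equation solver is attractive when the number of features $m$ is modest.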
spark git commit: [SPARK-11959][SPARK-15484][DOC][ML] Document WLS and IRLS
Repository: spark Updated Branches: refs/heads/branch-2.0 5dd1423f4 -> e6e2f293d [SPARK-11959][SPARK-15484][DOC][ML] Document WLS and IRLS ## What changes were proposed in this pull request? * Document ```WeightedLeastSquares```(normal equation) and ```IterativelyReweightedLeastSquares```. * Copy ```L-BFGS``` documents from ```spark.mllib``` to ```spark.ml```. Due to the session ```Optimization of linear methods``` is used for developers, I think we should provide the brief introduction of the optimization method, necessary references and how it implements in Spark. It's not necessary to paste all mathematical formula and derivation here. If developers/users want to learn more, they can track reference. ## How was this patch tested? Document update, no tests. Author: Yanbo Liang Closes #13262 from yanboliang/spark-15484. (cherry picked from commit a3550e3747e21c79a5110132dc127ee83879062a) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e6e2f293 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e6e2f293 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e6e2f293 Branch: refs/heads/branch-2.0 Commit: e6e2f293d6830ce118050e789773a09b3888fd30 Parents: 5dd1423 Author: Yanbo Liang Authored: Fri May 27 13:16:22 2016 -0700 Committer: Joseph K. Bradley Committed: Fri May 27 13:16:37 2016 -0700 -- docs/ml-advanced.md | 85 ++-- .../IterativelyReweightedLeastSquares.scala | 2 +- 2 files changed, 81 insertions(+), 6 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/e6e2f293/docs/ml-advanced.md -- diff --git a/docs/ml-advanced.md b/docs/ml-advanced.md index 91731d7..1c5f844 100644 --- a/docs/ml-advanced.md +++ b/docs/ml-advanced.md @@ -4,10 +4,85 @@ title: Advanced topics - spark.ml displayTitle: Advanced topics - spark.ml --- -# Optimization of linear methods +* Table of contents +{:toc} + +`\[ +\newcommand{\R}{\mathbb{R}} +\newcommand{\E}{\mathbb{E}} +\newcommand{\x}{\mathbf{x}} +\newcommand{\y}{\mathbf{y}} +\newcommand{\wv}{\mathbf{w}} +\newcommand{\av}{\mathbf{\alpha}} +\newcommand{\bv}{\mathbf{b}} +\newcommand{\N}{\mathbb{N}} +\newcommand{\id}{\mathbf{I}} +\newcommand{\ind}{\mathbf{1}} +\newcommand{\0}{\mathbf{0}} +\newcommand{\unit}{\mathbf{e}} +\newcommand{\one}{\mathbf{1}} +\newcommand{\zero}{\mathbf{0}} +\]` + +# Optimization of linear methods (developer) + +## Limited-memory BFGS (L-BFGS) +[L-BFGS](http://en.wikipedia.org/wiki/Limited-memory_BFGS) is an optimization +algorithm in the family of quasi-Newton methods to solve the optimization problems of the form +`$\min_{\wv \in\R^d} \; f(\wv)$`. The L-BFGS method approximates the objective function locally as a +quadratic without evaluating the second partial derivatives of the objective function to construct the +Hessian matrix. The Hessian matrix is approximated by previous gradient evaluations, so there is no +vertical scalability issue (the number of training features) unlike computing the Hessian matrix +explicitly in Newton's method. As a result, L-BFGS often achieves faster convergence compared with +other first-order optimizations. -The optimization algorithm underlying the implementation is called [Orthant-Wise Limited-memory -QuasiNewton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf) -(OWL-QN). It is an extension of L-BFGS that can effectively handle L1 -regularization and elastic net. 
+Quasi-Newton](http://research-srv.microsoft.com/en-us/um/people/jfgao/paper/icml07scalable.pdf) +(OWL-QN) is an extension of L-BFGS that can effectively handle L1 and elastic net regularization. + +L-BFGS is used as a solver for [LinearRegression](api/scala/index.html#org.apache.spark.ml.regression.LinearRegression), +[LogisticRegression](api/scala/index.html#org.apache.spark.ml.classification.LogisticRegression), +[AFTSurvivalRegression](api/scala/index.html#org.apache.spark.ml.regression.AFTSurvivalRegression) +and [MultilayerPerceptronClassifier](api/scala/index.html#org.apache.spark.ml.classification.MultilayerPerceptronClassifier). + +MLlib L-BFGS solver calls the corresponding implementation in [breeze](https://github.com/scalanlp/breeze/blob/master/math/src/main/scala/breeze/optimize/LBFGS.scala). + +## Normal equation solver for weighted least squares + +MLlib implements normal equation solver for [weighted least squares](https://en.wikipedia.org/wiki/Least_squares#Weighted_least_squares) by [WeightedLeastSquares](https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/optim/WeightedLeastSquares.scala). + +Given $n$ weighted observations $(w_i, a_i, b_i)$: + +*
spark git commit: [SPARK-15008][ML][PYSPARK] Add integration test for OneVsRest
Repository: spark Updated Branches: refs/heads/master a3550e374 -> 130b8d07b [SPARK-15008][ML][PYSPARK] Add integration test for OneVsRest ## What changes were proposed in this pull request? 1. Add `_transfer_param_map_to/from_java` for OneVsRest; 2. Add `_compare_params` in ml/tests.py to help compare params. 3. Add `test_onevsrest` as the integration test for OneVsRest. ## How was this patch tested? Python unit test. Author: yinxusen Closes #12875 from yinxusen/SPARK-15008. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/130b8d07 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/130b8d07 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/130b8d07 Branch: refs/heads/master Commit: 130b8d07b8eb08f2ad522081a95032b90247094d Parents: a3550e3 Author: yinxusen Authored: Fri May 27 13:18:29 2016 -0700 Committer: Joseph K. Bradley Committed: Fri May 27 13:18:29 2016 -0700 -- python/pyspark/ml/tests.py | 69 +++-- 1 file changed, 46 insertions(+), 23 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/130b8d07/python/pyspark/ml/tests.py -- diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index a7c93ac..4358175 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -747,12 +747,32 @@ class PersistenceTest(SparkSessionTestCase): except OSError: pass +def _compare_params(self, m1, m2, param): +""" +Compare 2 ML Params instances for the given param, and assert both have the same param value +and parent. The param must be a parameter of m1. +""" +# Prevent key not found error in case of some param in neither paramMap nor defaultParamMap. +if m1.isDefined(param): +paramValue1 = m1.getOrDefault(param) +paramValue2 = m2.getOrDefault(m2.getParam(param.name)) +if isinstance(paramValue1, Params): +self._compare_pipelines(paramValue1, paramValue2) +else: +self.assertEqual(paramValue1, paramValue2) # for general types param +# Assert parents are equal +self.assertEqual(param.parent, m2.getParam(param.name).parent) +else: +# If m1 is not defined param, then m2 should not, too. See SPARK-14931. +self.assertFalse(m2.isDefined(m2.getParam(param.name))) + def _compare_pipelines(self, m1, m2): """ Compare 2 ML types, asserting that they are equivalent. 
This currently supports: - basic types - Pipeline, PipelineModel + - OneVsRest, OneVsRestModel This checks: - uid - type @@ -763,8 +783,7 @@ class PersistenceTest(SparkSessionTestCase): if isinstance(m1, JavaParams): self.assertEqual(len(m1.params), len(m2.params)) for p in m1.params: -self.assertEqual(m1.getOrDefault(p), m2.getOrDefault(p)) -self.assertEqual(p.parent, m2.getParam(p.name).parent) +self._compare_params(m1, m2, p) elif isinstance(m1, Pipeline): self.assertEqual(len(m1.getStages()), len(m2.getStages())) for s1, s2 in zip(m1.getStages(), m2.getStages()): @@ -773,6 +792,13 @@ class PersistenceTest(SparkSessionTestCase): self.assertEqual(len(m1.stages), len(m2.stages)) for s1, s2 in zip(m1.stages, m2.stages): self._compare_pipelines(s1, s2) +elif isinstance(m1, OneVsRest) or isinstance(m1, OneVsRestModel): +for p in m1.params: +self._compare_params(m1, m2, p) +if isinstance(m1, OneVsRestModel): +self.assertEqual(len(m1.models), len(m2.models)) +for x, y in zip(m1.models, m2.models): +self._compare_pipelines(x, y) else: raise RuntimeError("_compare_pipelines does not yet support type: %s" % type(m1)) @@ -833,6 +859,24 @@ class PersistenceTest(SparkSessionTestCase): except OSError: pass +def test_onevsrest(self): +temp_path = tempfile.mkdtemp() +df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)), + (1.0, Vectors.sparse(2, [], [])), + (2.0, Vectors.dense(0.5, 0.5))] * 10, +["label", "features"]) +lr = LogisticRegression(maxIter=5, regParam=0.01) +ovr = OneVsRest(classifier=lr) +model = ovr.fit(df) +ovrPath = temp_path + "/ovr" +ovr.save(ovrPath) +loadedOvr = OneVsRest.load(ovr
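For readers following the diff above (which the archive truncates mid-test), here is a condensed, hedged sketch of the save/load round trip that `test_onevsrest` exercises. It is not the test code itself; the temp-path handling, model path name, and assertions are illustrative assumptions for Spark 2.0+:

```python
# Hedged sketch of the OneVsRest persistence round trip: fit, save, reload,
# then check that the reloaded model reproduces the original's sub-models
# and predictions. Not the actual test in python/pyspark/ml/tests.py.
import tempfile
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression, OneVsRest, OneVsRestModel

spark = SparkSession.builder.appName("ovr-roundtrip").getOrCreate()

df = spark.createDataFrame(
    [(0.0, Vectors.dense(1.0, 0.8)),
     (1.0, Vectors.sparse(2, [], [])),
     (2.0, Vectors.dense(0.5, 0.5))] * 10,
    ["label", "features"])

ovr = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=0.01))
model = ovr.fit(df)

model_path = tempfile.mkdtemp() + "/ovrModel"   # hypothetical path
model.save(model_path)
loaded = OneVsRestModel.load(model_path)

# One binary sub-model per class, and identical predictions after reload.
assert len(loaded.models) == len(model.models)
orig_preds = [r.prediction for r in model.transform(df).select("prediction").collect()]
load_preds = [r.prediction for r in loaded.transform(df).select("prediction").collect()]
assert orig_preds == load_preds
```

The `_compare_params` helper in the diff goes further than these assertions: when a param value is itself a `Params` instance (such as the wrapped `classifier`), it recurses via `_compare_pipelines`, which is what makes the comparison work for meta-estimators like OneVsRest.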
spark git commit: [SPARK-15008][ML][PYSPARK] Add integration test for OneVsRest
Repository: spark Updated Branches: refs/heads/branch-2.0 e6e2f293d -> a778d3c90 [SPARK-15008][ML][PYSPARK] Add integration test for OneVsRest ## What changes were proposed in this pull request? 1. Add `_transfer_param_map_to/from_java` for OneVsRest; 2. Add `_compare_params` in ml/tests.py to help compare params. 3. Add `test_onevsrest` as the integration test for OneVsRest. ## How was this patch tested? Python unit test. Author: yinxusen Closes #12875 from yinxusen/SPARK-15008. (cherry picked from commit 130b8d07b8eb08f2ad522081a95032b90247094d) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a778d3c9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a778d3c9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a778d3c9 Branch: refs/heads/branch-2.0 Commit: a778d3c90599eb76e6bca87b7aa3c0f9910f24c5 Parents: e6e2f29 Author: yinxusen Authored: Fri May 27 13:18:29 2016 -0700 Committer: Joseph K. Bradley Committed: Fri May 27 13:18:36 2016 -0700 -- python/pyspark/ml/tests.py | 69 +++-- 1 file changed, 46 insertions(+), 23 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/a778d3c9/python/pyspark/ml/tests.py -- diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index a7c93ac..4358175 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -747,12 +747,32 @@ class PersistenceTest(SparkSessionTestCase): except OSError: pass +def _compare_params(self, m1, m2, param): +""" +Compare 2 ML Params instances for the given param, and assert both have the same param value +and parent. The param must be a parameter of m1. +""" +# Prevent key not found error in case of some param in neither paramMap nor defaultParamMap. +if m1.isDefined(param): +paramValue1 = m1.getOrDefault(param) +paramValue2 = m2.getOrDefault(m2.getParam(param.name)) +if isinstance(paramValue1, Params): +self._compare_pipelines(paramValue1, paramValue2) +else: +self.assertEqual(paramValue1, paramValue2) # for general types param +# Assert parents are equal +self.assertEqual(param.parent, m2.getParam(param.name).parent) +else: +# If m1 is not defined param, then m2 should not, too. See SPARK-14931. +self.assertFalse(m2.isDefined(m2.getParam(param.name))) + def _compare_pipelines(self, m1, m2): """ Compare 2 ML types, asserting that they are equivalent. 
This currently supports: - basic types - Pipeline, PipelineModel + - OneVsRest, OneVsRestModel This checks: - uid - type @@ -763,8 +783,7 @@ class PersistenceTest(SparkSessionTestCase): if isinstance(m1, JavaParams): self.assertEqual(len(m1.params), len(m2.params)) for p in m1.params: -self.assertEqual(m1.getOrDefault(p), m2.getOrDefault(p)) -self.assertEqual(p.parent, m2.getParam(p.name).parent) +self._compare_params(m1, m2, p) elif isinstance(m1, Pipeline): self.assertEqual(len(m1.getStages()), len(m2.getStages())) for s1, s2 in zip(m1.getStages(), m2.getStages()): @@ -773,6 +792,13 @@ class PersistenceTest(SparkSessionTestCase): self.assertEqual(len(m1.stages), len(m2.stages)) for s1, s2 in zip(m1.stages, m2.stages): self._compare_pipelines(s1, s2) +elif isinstance(m1, OneVsRest) or isinstance(m1, OneVsRestModel): +for p in m1.params: +self._compare_params(m1, m2, p) +if isinstance(m1, OneVsRestModel): +self.assertEqual(len(m1.models), len(m2.models)) +for x, y in zip(m1.models, m2.models): +self._compare_pipelines(x, y) else: raise RuntimeError("_compare_pipelines does not yet support type: %s" % type(m1)) @@ -833,6 +859,24 @@ class PersistenceTest(SparkSessionTestCase): except OSError: pass +def test_onevsrest(self): +temp_path = tempfile.mkdtemp() +df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)), + (1.0, Vectors.sparse(2, [], [])), + (2.0, Vectors.dense(0.5, 0.5))] * 10, +["label", "features"]) +lr = LogisticRegression(maxIter=5, regParam=0.01) +ovr = OneVsRest(classifier=lr) +model = ovr.
spark git commit: [SPARK-15413][ML][MLLIB] Change `toBreeze` to `asBreeze` in Vector and Matrix
Repository: spark Updated Branches: refs/heads/master 130b8d07b -> 21b2605dc [SPARK-15413][ML][MLLIB] Change `toBreeze` to `asBreeze` in Vector and Matrix ## What changes were proposed in this pull request? We're using `asML` to convert the mllib vector/matrix to ml vector/matrix now. Using `as` is more correct given that this conversion actually shares the same underline data structure. As a result, in this PR, `toBreeze` will be changed to `asBreeze`. This is a private API, as a result, it will not affect any user's application. ## How was this patch tested? unit tests Author: DB Tsai Closes #13198 from dbtsai/minor. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/21b2605d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/21b2605d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/21b2605d Branch: refs/heads/master Commit: 21b2605dc4900894ea7a911e039781ecc2a18c14 Parents: 130b8d0 Author: DB Tsai Authored: Fri May 27 14:02:39 2016 -0700 Committer: Joseph K. Bradley Committed: Fri May 27 14:02:39 2016 -0700 -- .../org/apache/spark/ml/linalg/Matrices.scala | 16 ++-- .../org/apache/spark/ml/linalg/Vectors.scala| 8 +++--- .../distribution/MultivariateGaussian.scala | 8 +++--- .../ml/linalg/BreezeMatrixConversionSuite.scala | 4 +-- .../ml/linalg/BreezeVectorConversionSuite.scala | 4 +-- .../apache/spark/ml/linalg/MatricesSuite.scala | 14 +-- .../apache/spark/ml/linalg/VectorsSuite.scala | 2 +- .../scala/org/apache/spark/ml/ann/Layer.scala | 8 +++--- .../ml/classification/LogisticRegression.scala | 2 +- .../spark/ml/clustering/GaussianMixture.scala | 2 +- .../apache/spark/ml/feature/MaxAbsScaler.scala | 2 +- .../apache/spark/ml/feature/MinMaxScaler.scala | 2 +- .../ml/regression/AFTSurvivalRegression.scala | 2 +- .../spark/ml/regression/LinearRegression.scala | 2 +- .../apache/spark/mllib/classification/SVM.scala | 2 +- .../mllib/clustering/GaussianMixture.scala | 2 +- .../mllib/clustering/GaussianMixtureModel.scala | 4 +-- .../spark/mllib/clustering/LDAModel.scala | 26 ++-- .../spark/mllib/clustering/LDAOptimizer.scala | 6 ++--- .../mllib/clustering/StreamingKMeans.scala | 4 +-- .../apache/spark/mllib/linalg/Matrices.scala| 16 ++-- .../org/apache/spark/mllib/linalg/Vectors.scala | 8 +++--- .../mllib/linalg/distributed/BlockMatrix.scala | 8 +++--- .../mllib/linalg/distributed/RowMatrix.scala| 16 ++-- .../mllib/optimization/GradientDescent.scala| 4 +-- .../apache/spark/mllib/optimization/LBFGS.scala | 4 +-- .../spark/mllib/optimization/Updater.scala | 14 +-- .../apache/spark/mllib/regression/Lasso.scala | 2 +- .../mllib/regression/LinearRegression.scala | 2 +- .../mllib/regression/RidgeRegression.scala | 2 +- .../stat/correlation/PearsonCorrelation.scala | 2 +- .../distribution/MultivariateGaussian.scala | 8 +++--- .../spark/mllib/stat/test/ChiSqTest.scala | 2 +- .../ml/classification/NaiveBayesSuite.scala | 6 ++--- .../LogisticRegressionSuite.scala | 4 +-- .../mllib/classification/NaiveBayesSuite.scala | 4 +-- .../spark/mllib/clustering/LDASuite.scala | 4 +-- .../mllib/clustering/StreamingKMeansSuite.scala | 2 +- .../spark/mllib/feature/NormalizerSuite.scala | 16 ++-- .../linalg/BreezeMatrixConversionSuite.scala| 4 +-- .../linalg/BreezeVectorConversionSuite.scala| 4 +-- .../spark/mllib/linalg/MatricesSuite.scala | 14 +-- .../spark/mllib/linalg/VectorsSuite.scala | 2 +- .../linalg/distributed/BlockMatrixSuite.scala | 2 +- .../distributed/IndexedRowMatrixSuite.scala | 10 .../linalg/distributed/RowMatrixSuite.scala | 
14 +-- .../spark/mllib/stat/CorrelationSuite.scala | 6 ++--- .../apache/spark/mllib/util/MLUtilsSuite.scala | 6 ++--- project/MimaExcludes.scala | 3 +++ 49 files changed, 156 insertions(+), 153 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/21b2605d/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala -- diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index a47526d..0ea687b 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -69,7 +69,7 @@ sealed trait Matrix extends Serializable { def rowIter: Iterator[Vector] = this.transpose.colIter /** Converts to a breeze matrix. */ - private[ml] def toBreeze: BM[Double]
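The rename above is about a naming convention rather than behavior: the `as` prefix signals that the Breeze view shares the same underlying data structure, in the same spirit as the public `asML` conversion mentioned in the commit message. Since `asBreeze` remains a private API, a user-facing illustration of the convention has to go through `asML`; the following is a minimal PySpark sketch (assuming Spark 2.0+, where the `pyspark.mllib.linalg` types expose `asML()`), not code from this commit:

```python
# Hedged sketch: converting old mllib local linalg types to their spark.ml
# counterparts with asML(). Values are illustrative; no SparkContext needed.
from pyspark.mllib.linalg import Vectors as OldVectors, Matrices as OldMatrices

old_vec = OldVectors.dense(1.0, 2.0, 3.0)
new_vec = old_vec.asML()                      # pyspark.ml.linalg.DenseVector

old_mat = OldMatrices.dense(2, 2, [1.0, 2.0, 3.0, 4.0])
new_mat = old_mat.asML()                      # pyspark.ml.linalg.DenseMatrix

# Same values, new namespace; the "as" naming conveys a cheap conversion over
# the same underlying data rather than a deep copy (the rationale the commit
# gives for preferring asBreeze over toBreeze).
assert list(new_vec.toArray()) == [1.0, 2.0, 3.0]
print(type(new_vec).__module__, type(new_mat).__module__)
```

Because `asBreeze` is private to MLlib, the rename needed only a MimaExcludes entry (visible in the diffstat above) and touched no user-facing API, which matches the commit's note that applications are unaffected.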
spark git commit: [SPARK-15413][ML][MLLIB] Change `toBreeze` to `asBreeze` in Vector and Matrix
Repository: spark Updated Branches: refs/heads/branch-2.0 a778d3c90 -> dcf498e8a [SPARK-15413][ML][MLLIB] Change `toBreeze` to `asBreeze` in Vector and Matrix ## What changes were proposed in this pull request? We're using `asML` to convert the mllib vector/matrix to ml vector/matrix now. Using `as` is more correct given that this conversion actually shares the same underline data structure. As a result, in this PR, `toBreeze` will be changed to `asBreeze`. This is a private API, as a result, it will not affect any user's application. ## How was this patch tested? unit tests Author: DB Tsai Closes #13198 from dbtsai/minor. (cherry picked from commit 21b2605dc4900894ea7a911e039781ecc2a18c14) Signed-off-by: Joseph K. Bradley Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dcf498e8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dcf498e8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dcf498e8 Branch: refs/heads/branch-2.0 Commit: dcf498e8aafd2b53c5680cf7f3ada31829686b62 Parents: a778d3c Author: DB Tsai Authored: Fri May 27 14:02:39 2016 -0700 Committer: Joseph K. Bradley Committed: Fri May 27 14:02:51 2016 -0700 -- .../org/apache/spark/ml/linalg/Matrices.scala | 16 ++-- .../org/apache/spark/ml/linalg/Vectors.scala| 8 +++--- .../distribution/MultivariateGaussian.scala | 8 +++--- .../ml/linalg/BreezeMatrixConversionSuite.scala | 4 +-- .../ml/linalg/BreezeVectorConversionSuite.scala | 4 +-- .../apache/spark/ml/linalg/MatricesSuite.scala | 14 +-- .../apache/spark/ml/linalg/VectorsSuite.scala | 2 +- .../scala/org/apache/spark/ml/ann/Layer.scala | 8 +++--- .../ml/classification/LogisticRegression.scala | 2 +- .../spark/ml/clustering/GaussianMixture.scala | 2 +- .../apache/spark/ml/feature/MaxAbsScaler.scala | 2 +- .../apache/spark/ml/feature/MinMaxScaler.scala | 2 +- .../ml/regression/AFTSurvivalRegression.scala | 2 +- .../spark/ml/regression/LinearRegression.scala | 2 +- .../apache/spark/mllib/classification/SVM.scala | 2 +- .../mllib/clustering/GaussianMixture.scala | 2 +- .../mllib/clustering/GaussianMixtureModel.scala | 4 +-- .../spark/mllib/clustering/LDAModel.scala | 26 ++-- .../spark/mllib/clustering/LDAOptimizer.scala | 6 ++--- .../mllib/clustering/StreamingKMeans.scala | 4 +-- .../apache/spark/mllib/linalg/Matrices.scala| 16 ++-- .../org/apache/spark/mllib/linalg/Vectors.scala | 8 +++--- .../mllib/linalg/distributed/BlockMatrix.scala | 8 +++--- .../mllib/linalg/distributed/RowMatrix.scala| 16 ++-- .../mllib/optimization/GradientDescent.scala| 4 +-- .../apache/spark/mllib/optimization/LBFGS.scala | 4 +-- .../spark/mllib/optimization/Updater.scala | 14 +-- .../apache/spark/mllib/regression/Lasso.scala | 2 +- .../mllib/regression/LinearRegression.scala | 2 +- .../mllib/regression/RidgeRegression.scala | 2 +- .../stat/correlation/PearsonCorrelation.scala | 2 +- .../distribution/MultivariateGaussian.scala | 8 +++--- .../spark/mllib/stat/test/ChiSqTest.scala | 2 +- .../ml/classification/NaiveBayesSuite.scala | 6 ++--- .../LogisticRegressionSuite.scala | 4 +-- .../mllib/classification/NaiveBayesSuite.scala | 4 +-- .../spark/mllib/clustering/LDASuite.scala | 4 +-- .../mllib/clustering/StreamingKMeansSuite.scala | 2 +- .../spark/mllib/feature/NormalizerSuite.scala | 16 ++-- .../linalg/BreezeMatrixConversionSuite.scala| 4 +-- .../linalg/BreezeVectorConversionSuite.scala| 4 +-- .../spark/mllib/linalg/MatricesSuite.scala | 14 +-- .../spark/mllib/linalg/VectorsSuite.scala | 2 +- 
.../linalg/distributed/BlockMatrixSuite.scala | 2 +- .../distributed/IndexedRowMatrixSuite.scala | 10 .../linalg/distributed/RowMatrixSuite.scala | 14 +-- .../spark/mllib/stat/CorrelationSuite.scala | 6 ++--- .../apache/spark/mllib/util/MLUtilsSuite.scala | 6 ++--- project/MimaExcludes.scala | 3 +++ 49 files changed, 156 insertions(+), 153 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/dcf498e8/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala -- diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala index a47526d..0ea687b 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Matrices.scala @@ -69,7 +69,7 @@ sealed trait Matrix extends Serializable { def rowIter: Iterator[V