spark git commit: [SPARK-15449][MLLIB][EXAMPLE] Wrong Data Format - Documentation Issue

srowen Fri, 27 May 2016 19:00:07 -0700

Repository: spark
Updated Branches:
  refs/heads/branch-2.0 80a40e8e2 -> 8467e2102



[SPARK-15449][MLLIB][EXAMPLE] Wrong Data Format - Documentation Issue

## What changes were proposed in this pull request?

(Please fill in changes proposed in this fix)
In the MLlib naive Bayes example, the Scala and Python examples do not use 
libsvm data, but the Java example does.

I changed the Scala and Python examples to use libsvm data, the same as the 
Java example.

## How was this patch tested?

Manual tests

Author: [email protected] <[email protected]>

Closes #13301 from wangmiao1981/example.

(cherry picked from commit 5d4dafe8fdea49dcbd6b0e4c23e3791fa30c8911)
Signed-off-by: Sean Owen <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8467e210
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8467e210
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8467e210

Branch: refs/heads/branch-2.0
Commit: 8467e2102886da1cefb43f2aaa69864375fe91bc
Parents: 80a40e8
Author: [email protected] <[email protected]>
Authored: Fri May 27 20:59:24 2016 -0500
Committer: Sean Owen <[email protected]>
Committed: Fri May 27 20:59:34 2016 -0500

----------------------------------------------------------------------
 data/mllib/sample_naive_bayes_data.txt                | 12 ------------
 .../spark/examples/mllib/JavaNaiveBayesExample.java   |  4 ++--
 examples/src/main/python/mllib/naive_bayes_example.py | 13 ++++---------
 .../spark/examples/mllib/NaiveBayesExample.scala      | 14 ++++----------
 4 files changed, 10 insertions(+), 33 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/8467e210/data/mllib/sample_naive_bayes_data.txt
----------------------------------------------------------------------
diff --git a/data/mllib/sample_naive_bayes_data.txt 
b/data/mllib/sample_naive_bayes_data.txt
deleted file mode 100644
index bd22bea..0000000
--- a/data/mllib/sample_naive_bayes_data.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-0,1 0 0
-0,2 0 0
-0,3 0 0
-0,4 0 0
-1,0 1 0
-1,0 2 0
-1,0 3 0
-1,0 4 0
-2,0 0 1
-2,0 0 2
-2,0 0 3
-2,0 0 4
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/spark/blob/8467e210/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
----------------------------------------------------------------------
diff --git 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
 
b/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
index 2b17dbb..f4ec04b 100644
--- 
a/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
+++ 
b/examples/src/main/java/org/apache/spark/examples/mllib/JavaNaiveBayesExample.java
@@ -36,9 +36,9 @@ public class JavaNaiveBayesExample {
     SparkConf sparkConf = new SparkConf().setAppName("JavaNaiveBayesExample");
     JavaSparkContext jsc = new JavaSparkContext(sparkConf);
     // $example on$
-    String path = "data/mllib/sample_naive_bayes_data.txt";
+    String path = "data/mllib/sample_libsvm_data.txt";
     JavaRDD<LabeledPoint> inputData = MLUtils.loadLibSVMFile(jsc.sc(), path).toJavaRDD();
-    JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4}, 12345);
+    JavaRDD<LabeledPoint>[] tmp = inputData.randomSplit(new double[]{0.6, 0.4});
     JavaRDD<LabeledPoint> training = tmp[0]; // training set
     JavaRDD<LabeledPoint> test = tmp[1]; // test set
     final NaiveBayesModel model = NaiveBayes.train(training.rdd(), 1.0);

http://git-wip-us.apache.org/repos/asf/spark/blob/8467e210/examples/src/main/python/mllib/naive_bayes_example.py
----------------------------------------------------------------------
diff --git a/examples/src/main/python/mllib/naive_bayes_example.py 
b/examples/src/main/python/mllib/naive_bayes_example.py
index 35724f7..749353b 100644
--- a/examples/src/main/python/mllib/naive_bayes_example.py
+++ b/examples/src/main/python/mllib/naive_bayes_example.py
@@ -29,15 +29,9 @@ import shutil
 from pyspark import SparkContext
 # $example on$
 from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
-from pyspark.mllib.linalg import Vectors
-from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.util import MLUtils
 
 
-def parseLine(line):
-    parts = line.split(',')
-    label = float(parts[0])
-    features = Vectors.dense([float(x) for x in parts[1].split(' ')])
-    return LabeledPoint(label, features)
 # $example off$
 
 if __name__ == "__main__":
@@ -45,10 +39,11 @@ if __name__ == "__main__":
     sc = SparkContext(appName="PythonNaiveBayesExample")
 
     # $example on$
-    data = sc.textFile('data/mllib/sample_naive_bayes_data.txt').map(parseLine)
+    # Load and parse the data file.
+    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
 
     # Split data approximately into training (60%) and test (40%)
-    training, test = data.randomSplit([0.6, 0.4], seed=0)
+    training, test = data.randomSplit([0.6, 0.4])
 
     # Train a naive Bayes model.
     model = NaiveBayes.train(training, 1.0)

http://git-wip-us.apache.org/repos/asf/spark/blob/8467e210/examples/src/main/scala/org/apache/spark/examples/mllib/NaiveBayesExample.scala
----------------------------------------------------------------------
diff --git 
a/examples/src/main/scala/org/apache/spark/examples/mllib/NaiveBayesExample.scala
 
b/examples/src/main/scala/org/apache/spark/examples/mllib/NaiveBayesExample.scala
index 0187ad6..b321d8e 100644
--- 
a/examples/src/main/scala/org/apache/spark/examples/mllib/NaiveBayesExample.scala
+++ 
b/examples/src/main/scala/org/apache/spark/examples/mllib/NaiveBayesExample.scala
@@ -21,8 +21,7 @@ package org.apache.spark.examples.mllib
 import org.apache.spark.{SparkConf, SparkContext}
 // $example on$
 import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.mllib.regression.LabeledPoint
+import org.apache.spark.mllib.util.MLUtils
 // $example off$
 
 object NaiveBayesExample {
@@ -31,16 +30,11 @@ object NaiveBayesExample {
     val conf = new SparkConf().setAppName("NaiveBayesExample")
     val sc = new SparkContext(conf)
     // $example on$
-    val data = sc.textFile("data/mllib/sample_naive_bayes_data.txt")
-    val parsedData = data.map { line =>
-      val parts = line.split(',')
-      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
-    }
+    // Load and parse the data file.
+    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
 
     // Split data into training (60%) and test (40%).
-    val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L)
-    val training = splits(0)
-    val test = splits(1)
+    val Array(training, test) = data.randomSplit(Array(0.6, 0.4))
 
     val model = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial")
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

spark git commit: [SPARK-15449][MLLIB][EXAMPLE] Wrong Data Format - Documentation Issue

Reply via email to