Repository: spark
Updated Branches:
  refs/heads/master ae6c677c8 -> 3f06eb72c


[SPARK-13613][ML] Provide ignored tests to export test dataset into CSV format

## What changes were proposed in this pull request?
Provide ignored test cases to export the test dataset into CSV format in 
```LinearRegressionSuite```, ```LogisticRegressionSuite```, 
```AFTSurvivalRegressionSuite``` and ```GeneralizedLinearRegressionSuite```, so 
users can validate the training accuracy against R's glm function and the 
glmnet and survival packages.
cc mengxr
## How was this patch tested?
The new test cases are ignored by default, but I have enabled all of them 
offline and they work as expected.

Author: Yanbo Liang <[email protected]>

Closes #11463 from yanboliang/spark-13613.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3f06eb72
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3f06eb72
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3f06eb72

Branch: refs/heads/master
Commit: 3f06eb72ca0c3e5779a702c7c677229e0c480751
Parents: ae6c677
Author: Yanbo Liang <[email protected]>
Authored: Wed Mar 16 14:14:15 2016 -0700
Committer: Xiangrui Meng <[email protected]>
Committed: Wed Mar 16 14:14:15 2016 -0700

----------------------------------------------------------------------
 .../LogisticRegressionSuite.scala               | 29 ++++++------
 .../regression/AFTSurvivalRegressionSuite.scala | 13 ++++++
 .../GeneralizedLinearRegressionSuite.scala      | 49 +++++++++++++++++++-
 .../ml/regression/LinearRegressionSuite.scala   | 39 +++++++++-------
 4 files changed, 97 insertions(+), 33 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/3f06eb72/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
index cfb9bbf..afeeaf7 100644
--- 
a/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/ml/classification/LogisticRegressionSuite.scala
@@ -23,7 +23,7 @@ import scala.util.Random
 import org.apache.spark.SparkFunSuite
 import org.apache.spark.ml.feature.Instance
 import org.apache.spark.ml.param.ParamsSuite
-import org.apache.spark.ml.util.{DefaultReadWriteTest, Identifiable, 
MLTestingUtils}
+import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
 import org.apache.spark.mllib.classification.LogisticRegressionSuite._
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
@@ -44,20 +44,6 @@ class LogisticRegressionSuite
 
     dataset = sqlContext.createDataFrame(generateLogisticInput(1.0, 1.0, 
nPoints = 100, seed = 42))
 
-    /*
-       Here is the instruction describing how to export the test data into CSV 
format
-       so we can validate the training accuracy compared with R's glmnet 
package.
-
-       import org.apache.spark.mllib.classification.LogisticRegressionSuite
-       val nPoints = 10000
-       val coefficients = Array(-0.57997, 0.912083, -0.371077, -0.819866, 
2.688191)
-       val xMean = Array(5.843, 3.057, 3.758, 1.199)
-       val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)
-       val data = 
sc.parallelize(LogisticRegressionSuite.generateMultinomialLogisticInput(
-         coefficients, xMean, xVariance, true, nPoints, 42), 1)
-       data.map(x=> x.label + ", " + x.features(0) + ", " + x.features(1) + ", 
"
-         + x.features(2) + ", " + x.features(3)).saveAsTextFile("path")
-     */
     binaryDataset = {
       val nPoints = 10000
       val coefficients = Array(-0.57997, 0.912083, -0.371077, -0.819866, 
2.688191)
@@ -65,12 +51,23 @@ class LogisticRegressionSuite
       val xVariance = Array(0.6856, 0.1899, 3.116, 0.581)
 
       val testData =
-        generateMultinomialLogisticInput(coefficients, xMean, xVariance, true, 
nPoints, 42)
+        generateMultinomialLogisticInput(coefficients, xMean, xVariance,
+          addIntercept = true, nPoints, 42)
 
       sqlContext.createDataFrame(sc.parallelize(testData, 4))
     }
   }
 
+  /**
+   * Enable the ignored test to export the dataset into CSV format,
+   * so we can validate the training accuracy compared with R's glmnet package.
+   */
+  ignore("export test data into CSV format") {
+    binaryDataset.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    
}.repartition(1).saveAsTextFile("target/tmp/LogisticRegressionSuite/binaryDataset")
+  }
+
   test("params") {
     ParamsSuite.checkParams(new LogisticRegression)
     val model = new LogisticRegressionModel("logReg", Vectors.dense(0.0), 0.0)

http://git-wip-us.apache.org/repos/asf/spark/blob/3f06eb72/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala
index d718ef6..dbd752d 100644
--- 
a/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/ml/regression/AFTSurvivalRegressionSuite.scala
@@ -44,6 +44,19 @@ class AFTSurvivalRegressionSuite
         2, Array(0.9, -1.3), Array(0.7, 1.2), 1000, 42, 1.5, 2.5, 2.0)))
   }
 
+  /**
+   * Enable the ignored test to export the dataset into CSV format,
+   * so we can validate the training accuracy compared with R's survival 
package.
+   */
+  ignore("export test data into CSV format") {
+    datasetUnivariate.rdd.map { case Row(features: Vector, label: Double, 
censor: Double) =>
+      features.toArray.mkString(",") + "," + censor + "," + label
+    
}.repartition(1).saveAsTextFile("target/tmp/AFTSurvivalRegressionSuite/datasetUnivariate")
+    datasetMultivariate.rdd.map { case Row(features: Vector, label: Double, 
censor: Double) =>
+      features.toArray.mkString(",") + "," + censor + "," + label
+    
}.repartition(1).saveAsTextFile("target/tmp/AFTSurvivalRegressionSuite/datasetMultivariate")
+  }
+
   test("params") {
     ParamsSuite.checkParams(new AFTSurvivalRegression)
     val model = new AFTSurvivalRegressionModel("aftSurvReg", 
Vectors.dense(0.0), 0.0, 0.0)

http://git-wip-us.apache.org/repos/asf/spark/blob/3f06eb72/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
index 6d570f7..4ebdbf2 100644
--- 
a/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/ml/regression/GeneralizedLinearRegressionSuite.scala
@@ -24,7 +24,7 @@ import org.apache.spark.ml.feature.Instance
 import org.apache.spark.ml.param.ParamsSuite
 import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
 import org.apache.spark.mllib.classification.LogisticRegressionSuite._
-import org.apache.spark.mllib.linalg.{BLAS, DenseVector, Vectors}
+import org.apache.spark.mllib.linalg.{BLAS, DenseVector, Vector, Vectors}
 import org.apache.spark.mllib.random._
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.util.MLlibTestSparkContext
@@ -120,6 +120,53 @@ class GeneralizedLinearRegressionSuite
         family = "gamma", link = "log"), 2))
   }
 
+  /**
+   * Enable the ignored test to export the dataset into CSV format,
+   * so we can validate the training accuracy compared with R's glm and glmnet 
package.
+   */
+  ignore("export test data into CSV format") {
+    datasetGaussianIdentity.rdd.map { case Row(label: Double, features: 
Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetGaussianIdentity")
+    datasetGaussianLog.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetGaussianLog")
+    datasetGaussianInverse.rdd.map { case Row(label: Double, features: Vector) 
=>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetGaussianInverse")
+    datasetBinomial.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetBinomial")
+    datasetPoissonLog.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetPoissonLog")
+    datasetPoissonIdentity.rdd.map { case Row(label: Double, features: Vector) 
=>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetPoissonIdentity")
+    datasetPoissonSqrt.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetPoissonSqrt")
+    datasetGammaInverse.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetGammaInverse")
+    datasetGammaIdentity.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetGammaIdentity")
+    datasetGammaLog.rdd.map { case Row(label: Double, features: Vector) =>
+      label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      "target/tmp/GeneralizedLinearRegressionSuite/datasetGammaLog")
+  }
+
   test("params") {
     ParamsSuite.checkParams(new GeneralizedLinearRegression)
     val model = new GeneralizedLinearRegressionModel("genLinReg", 
Vectors.dense(0.0), 0.0)

http://git-wip-us.apache.org/repos/asf/spark/blob/3f06eb72/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
 
b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
index 9dee04c..bd45d21 100644
--- 
a/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
+++ 
b/mllib/src/test/scala/org/apache/spark/ml/regression/LinearRegressionSuite.scala
@@ -40,19 +40,6 @@ class LinearRegressionSuite
   @transient var datasetWithWeightConstantLabel: DataFrame = _
   @transient var datasetWithWeightZeroLabel: DataFrame = _
 
-  /*
-     In `LinearRegressionSuite`, we will make sure that the model trained by 
SparkML
-     is the same as the one trained by R's glmnet package. The following 
instruction
-     describes how to reproduce the data in R.
-     In a spark-shell, use the following code:
-
-     import org.apache.spark.mllib.util.LinearDataGenerator
-     val data =
-       sc.parallelize(LinearDataGenerator.generateLinearInput(6.3, Array(4.7, 
7.2),
-         Array(0.9, -1.3), Array(0.7, 1.2), 10000, 42, 0.1), 2)
-     data.map(x=> x.label + ", " + x.features(0) + ", " + 
x.features(1)).coalesce(1)
-       .saveAsTextFile("path")
-   */
   override def beforeAll(): Unit = {
     super.beforeAll()
     datasetWithDenseFeature = sqlContext.createDataFrame(
@@ -60,8 +47,8 @@ class LinearRegressionSuite
         intercept = 6.3, weights = Array(4.7, 7.2), xMean = Array(0.9, -1.3),
         xVariance = Array(0.7, 1.2), nPoints = 10000, seed, eps = 0.1), 2))
     /*
-       datasetWithoutIntercept is not needed for correctness testing but is 
useful for illustrating
-       training model without intercept
+       datasetWithDenseFeatureWithoutIntercept is not needed for correctness 
testing
+       but is useful for illustrating training model without intercept
      */
     datasetWithDenseFeatureWithoutIntercept = sqlContext.createDataFrame(
       sc.parallelize(LinearDataGenerator.generateLinearInput(
@@ -119,6 +106,26 @@ class LinearRegressionSuite
       ), 2))
   }
 
+  /**
+   * Enable the ignored test to export the dataset into CSV format,
+   * so we can validate the training accuracy compared with R's glmnet package.
+   */
+  ignore("export test data into CSV format") {
+    datasetWithDenseFeature.rdd.map { case Row(label: Double, features: 
Vector) =>
+      label + "," + features.toArray.mkString(",")
+    
}.repartition(1).saveAsTextFile("target/tmp/LinearRegressionSuite/datasetWithDenseFeature")
+
+    datasetWithDenseFeatureWithoutIntercept.rdd.map {
+      case Row(label: Double, features: Vector) =>
+        label + "," + features.toArray.mkString(",")
+    }.repartition(1).saveAsTextFile(
+      
"target/tmp/LinearRegressionSuite/datasetWithDenseFeatureWithoutIntercept")
+
+    datasetWithSparseFeature.rdd.map { case Row(label: Double, features: 
Vector) =>
+      label + "," + features.toArray.mkString(",")
+    
}.repartition(1).saveAsTextFile("target/tmp/LinearRegressionSuite/datasetWithSparseFeature")
+  }
+
   test("params") {
     ParamsSuite.checkParams(new LinearRegression)
     val model = new LinearRegressionModel("linearReg", Vectors.dense(0.0), 0.0)
@@ -222,7 +229,7 @@ class LinearRegressionSuite
 
       /*
          Then again with the data with no intercept:
-         > coefficientsWithourIntercept
+         > coefficientsWithoutIntercept
           3 x 1 sparse Matrix of class "dgCMatrix"
                                    s0
          (Intercept)           .


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to