spark git commit: [SPARK-6841] [SPARKR] add support for mean, median, stdev etc.

rxin Tue, 05 May 2015 20:40:20 -0700

Repository: spark
Updated Branches:
  refs/heads/master 51b3d41e1 -> a46694439



[SPARK-6841] [SPARKR] add support for mean, median, stdev etc.

Moving here from https://github.com/amplab-extras/SparkR-pkg/pull/241
sum() has been implemented. 
(https://github.com/amplab-extras/SparkR-pkg/pull/242)

Now Phase 1: mean, sd, var have been implemented, but some things still need to 
be improved with the suggestions in 
https://issues.apache.org/jira/browse/SPARK-6841

Author: qhuang <[email protected]>

Closes #5446 from hqzizania/R and squashes the following commits:

f283572 [qhuang] add test unit for describe()
2e74d5a [qhuang] add describe() DataFrame API


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a4669443
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a4669443
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a4669443

Branch: refs/heads/master
Commit: a4669443999dc13a1bb34509c827d8b9096ea84f
Parents: 51b3d41
Author: qhuang <[email protected]>
Authored: Tue May 5 20:39:56 2015 -0700
Committer: Reynold Xin <[email protected]>
Committed: Tue May 5 20:39:56 2015 -0700

----------------------------------------------------------------------
 R/pkg/NAMESPACE                  |  1 +
 R/pkg/R/DataFrame.R              | 37 +++++++++++++++++++++++++++++++++++
 R/pkg/R/generics.R               |  4 ++++
 R/pkg/inst/tests/test_sparkSQL.R | 11 +++++++++++
 4 files changed, 53 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/a4669443/R/pkg/NAMESPACE
----------------------------------------------------------------------
diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE
index 1fb3311..528e660 100644
--- a/R/pkg/NAMESPACE
+++ b/R/pkg/NAMESPACE
@@ -13,6 +13,7 @@ exportMethods("cache",
               "collect",
               "columns",
               "count",
+              "describe",
               "distinct",
               "dtypes",
               "except",

http://git-wip-us.apache.org/repos/asf/spark/blob/a4669443/R/pkg/R/DataFrame.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R
index 841e77e..56c305d 100644
--- a/R/pkg/R/DataFrame.R
+++ b/R/pkg/R/DataFrame.R
@@ -1276,3 +1276,40 @@ setMethod("saveAsTable",
             callJMethod(df@sdf, "saveAsTable", tableName, source, jmode, 
options)
           })
 
+#' describe
+#'
+#' Computes statistics for numeric columns.
+#' If no columns are given, this function computes statistics for all 
numerical columns.
+#'
+#' @param x A DataFrame to be computed.
+#' @param col A string of name
+#' @param ... Additional expressions
+#' @return A DataFrame
+#' @rdname describe 
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlCtx <- sparkRSQL.init(sc)
+#' path <- "path/to/file.json"
+#' df <- jsonFile(sqlCtx, path)
+#' describe(df)
+#' describe(df, "col1")
+#' describe(df, "col1", "col2")
+#' }
+setMethod("describe",
+          signature(x = "DataFrame", col = "character"),
+          function(x, col, ...) {
+            colList <- list(col, ...)
+            sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
+            dataFrame(sdf)
+          })
+
+#' @rdname describe
+setMethod("describe",
+          signature(x = "DataFrame"),
+          function(x) {
+            colList <- as.list(c(columns(x)))
+            sdf <- callJMethod(x@sdf, "describe", listToSeq(colList))
+            dataFrame(sdf)
+          })

http://git-wip-us.apache.org/repos/asf/spark/blob/a4669443/R/pkg/R/generics.R
----------------------------------------------------------------------
diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R
index e887293..5838955 100644
--- a/R/pkg/R/generics.R
+++ b/R/pkg/R/generics.R
@@ -384,6 +384,10 @@ setGeneric("value", function(bcast) { 
standardGeneric("value") })
 #' @export
 setGeneric("columns", function(x) {standardGeneric("columns") })
 
+#' @rdname describe
+#' @export
+setGeneric("describe", function(x, col, ...) { standardGeneric("describe") })
+
 #' @rdname schema
 #' @export
 setGeneric("dtypes", function(x) { standardGeneric("dtypes") })

http://git-wip-us.apache.org/repos/asf/spark/blob/a4669443/R/pkg/inst/tests/test_sparkSQL.R
----------------------------------------------------------------------
diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R
index f82e56f..7a42e28 100644
--- a/R/pkg/inst/tests/test_sparkSQL.R
+++ b/R/pkg/inst/tests/test_sparkSQL.R
@@ -705,5 +705,16 @@ test_that("parquetFile works with multiple input paths", {
   expect_true(count(parquetDF) == count(df)*2)
 })
 
+test_that("describe() on a DataFrame", {
+  df <- jsonFile(sqlCtx, jsonPath)
+  stats <- describe(df, "age")
+  expect_true(collect(stats)[1, "summary"] == "count")
+  expect_true(collect(stats)[2, "age"] == 24.5)
+  expect_true(collect(stats)[3, "age"] == 5.5)
+  stats <- describe(df)
+  expect_true(collect(stats)[4, "name"] == "Andy")
+  expect_true(collect(stats)[5, "age"] == 30.0)
+})
+
 unlink(parquetPath)
 unlink(jsonPath)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

spark git commit: [SPARK-6841] [SPARKR] add support for mean, median, stdev etc.

Reply via email to