Repository: spark Updated Branches: refs/heads/master 51b3d41e1 -> a46694439
[SPARK-6841] [SPARKR] add support for mean, median, stdev etc. Moving here from https://github.com/amplab-extras/SparkR-pkg/pull/241 sum() has been implemented. (https://github.com/amplab-extras/SparkR-pkg/pull/242) Now Phase 1: mean, sd, var have been implemented, but some things still need to be improved with the suggestions in https://issues.apache.org/jira/browse/SPARK-6841 Author: qhuang <[email protected]> Closes #5446 from hqzizania/R and squashes the following commits: f283572 [qhuang] add test unit for describe() 2e74d5a [qhuang] add describe() DataFrame API Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a4669443 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a4669443 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a4669443 Branch: refs/heads/master Commit: a4669443999dc13a1bb34509c827d8b9096ea84f Parents: 51b3d41 Author: qhuang <[email protected]> Authored: Tue May 5 20:39:56 2015 -0700 Committer: Reynold Xin <[email protected]> Committed: Tue May 5 20:39:56 2015 -0700 ---------------------------------------------------------------------- R/pkg/NAMESPACE | 1 + R/pkg/R/DataFrame.R | 37 +++++++++++++++++++++++++++++++++++ R/pkg/R/generics.R | 4 ++++ R/pkg/inst/tests/test_sparkSQL.R | 11 +++++++++++ 4 files changed, 53 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/a4669443/R/pkg/NAMESPACE ---------------------------------------------------------------------- diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 1fb3311..528e660 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -13,6 +13,7 @@ exportMethods("cache", "collect", "columns", "count", + "describe", "distinct", "dtypes", "except", http://git-wip-us.apache.org/repos/asf/spark/blob/a4669443/R/pkg/R/DataFrame.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 841e77e..56c305d 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -1276,3 +1276,40 @@ setMethod("saveAsTable", callJMethod(df@sdf, "saveAsTable", tableName, source, jmode, options) }) +#' describe +#' +#' Computes statistics for numeric columns. +#' If no columns are given, this function computes statistics for all numerical columns. +#' +#' @param x A DataFrame to be computed. +#' @param col A string of name +#' @param ... Additional expressions +#' @return A DataFrame +#' @rdname describe +#' @export +#' @examples +#'\dontrun{ +#' sc <- sparkR.init() +#' sqlCtx <- sparkRSQL.init(sc) +#' path <- "path/to/file.json" +#' df <- jsonFile(sqlCtx, path) +#' describe(df) +#' describe(df, "col1") +#' describe(df, "col1", "col2") +#' } +setMethod("describe", + signature(x = "DataFrame", col = "character"), + function(x, col, ...) { + colList <- list(col, ...) + sdf <- callJMethod(x@sdf, "describe", listToSeq(colList)) + dataFrame(sdf) + }) + +#' @rdname describe +setMethod("describe", + signature(x = "DataFrame"), + function(x) { + colList <- as.list(c(columns(x))) + sdf <- callJMethod(x@sdf, "describe", listToSeq(colList)) + dataFrame(sdf) + }) http://git-wip-us.apache.org/repos/asf/spark/blob/a4669443/R/pkg/R/generics.R ---------------------------------------------------------------------- diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index e887293..5838955 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -384,6 +384,10 @@ setGeneric("value", function(bcast) { standardGeneric("value") }) #' @export setGeneric("columns", function(x) {standardGeneric("columns") }) +#' @rdname describe +#' @export +setGeneric("describe", function(x, col, ...) { standardGeneric("describe") }) + #' @rdname schema #' @export setGeneric("dtypes", function(x) { standardGeneric("dtypes") }) http://git-wip-us.apache.org/repos/asf/spark/blob/a4669443/R/pkg/inst/tests/test_sparkSQL.R ---------------------------------------------------------------------- diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index f82e56f..7a42e28 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -705,5 +705,16 @@ test_that("parquetFile works with multiple input paths", { expect_true(count(parquetDF) == count(df)*2) }) +test_that("describe() on a DataFrame", { + df <- jsonFile(sqlCtx, jsonPath) + stats <- describe(df, "age") + expect_true(collect(stats)[1, "summary"] == "count") + expect_true(collect(stats)[2, "age"] == 24.5) + expect_true(collect(stats)[3, "age"] == 5.5) + stats <- describe(df) + expect_true(collect(stats)[4, "name"] == "Andy") + expect_true(collect(stats)[5, "age"] == 30.0) +}) + unlink(parquetPath) unlink(jsonPath) --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
