Repository: spark Updated Branches: refs/heads/master 0c1e72e7f -> 6f6bb0e89
[SPARK-12011][SQL] Stddev/Variance etc should support columnName as arguments Spark SQL aggregate function: ```Java stddev stddev_pop stddev_samp variance var_pop var_samp skewness kurtosis collect_list collect_set ``` should support ```columnName``` as arguments like other aggregate function(max/min/count/sum). Author: Yanbo Liang <[email protected]> Closes #9994 from yanboliang/SPARK-12011. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/6f6bb0e8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/6f6bb0e8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/6f6bb0e8 Branch: refs/heads/master Commit: 6f6bb0e893c8370cbab4d63a56d74e00cb7f3cf6 Parents: 0c1e72e Author: Yanbo Liang <[email protected]> Authored: Thu Nov 26 19:00:36 2015 -0800 Committer: Reynold Xin <[email protected]> Committed: Thu Nov 26 19:00:36 2015 -0800 ---------------------------------------------------------------------- .../scala/org/apache/spark/sql/functions.scala | 86 ++++++++++++++++++++ .../spark/sql/DataFrameAggregateSuite.scala | 3 + 2 files changed, 89 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/6f6bb0e8/sql/core/src/main/scala/org/apache/spark/sql/functions.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 276c5df..e79defb 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -215,6 +215,16 @@ object functions extends LegacyFunctions { def collect_list(e: Column): Column = callUDF("collect_list", e) /** + * Aggregate function: returns a list of objects with duplicates. + * + * For now this is an alias for the collect_list Hive UDAF. + * + * @group agg_funcs + * @since 1.6.0 + */ + def collect_list(columnName: String): Column = collect_list(Column(columnName)) + + /** * Aggregate function: returns a set of objects with duplicate elements eliminated. * * For now this is an alias for the collect_set Hive UDAF. @@ -225,6 +235,16 @@ object functions extends LegacyFunctions { def collect_set(e: Column): Column = callUDF("collect_set", e) /** + * Aggregate function: returns a set of objects with duplicate elements eliminated. + * + * For now this is an alias for the collect_set Hive UDAF. + * + * @group agg_funcs + * @since 1.6.0 + */ + def collect_set(columnName: String): Column = collect_set(Column(columnName)) + + /** * Aggregate function: returns the Pearson Correlation Coefficient for two columns. * * @group agg_funcs @@ -313,6 +333,14 @@ object functions extends LegacyFunctions { def kurtosis(e: Column): Column = withAggregateFunction { Kurtosis(e.expr) } /** + * Aggregate function: returns the kurtosis of the values in a group. + * + * @group agg_funcs + * @since 1.6.0 + */ + def kurtosis(columnName: String): Column = kurtosis(Column(columnName)) + + /** * Aggregate function: returns the last value in a group. * * @group agg_funcs @@ -387,6 +415,14 @@ object functions extends LegacyFunctions { def skewness(e: Column): Column = withAggregateFunction { Skewness(e.expr) } /** + * Aggregate function: returns the skewness of the values in a group. + * + * @group agg_funcs + * @since 1.6.0 + */ + def skewness(columnName: String): Column = skewness(Column(columnName)) + + /** * Aggregate function: alias for [[stddev_samp]]. * * @group agg_funcs @@ -395,6 +431,14 @@ object functions extends LegacyFunctions { def stddev(e: Column): Column = withAggregateFunction { StddevSamp(e.expr) } /** + * Aggregate function: alias for [[stddev_samp]]. + * + * @group agg_funcs + * @since 1.6.0 + */ + def stddev(columnName: String): Column = stddev(Column(columnName)) + + /** * Aggregate function: returns the sample standard deviation of * the expression in a group. * @@ -404,6 +448,15 @@ object functions extends LegacyFunctions { def stddev_samp(e: Column): Column = withAggregateFunction { StddevSamp(e.expr) } /** + * Aggregate function: returns the sample standard deviation of + * the expression in a group. + * + * @group agg_funcs + * @since 1.6.0 + */ + def stddev_samp(columnName: String): Column = stddev_samp(Column(columnName)) + + /** * Aggregate function: returns the population standard deviation of * the expression in a group. * @@ -413,6 +466,15 @@ object functions extends LegacyFunctions { def stddev_pop(e: Column): Column = withAggregateFunction { StddevPop(e.expr) } /** + * Aggregate function: returns the population standard deviation of + * the expression in a group. + * + * @group agg_funcs + * @since 1.6.0 + */ + def stddev_pop(columnName: String): Column = stddev_pop(Column(columnName)) + + /** * Aggregate function: returns the sum of all values in the expression. * * @group agg_funcs @@ -453,6 +515,14 @@ object functions extends LegacyFunctions { def variance(e: Column): Column = withAggregateFunction { VarianceSamp(e.expr) } /** + * Aggregate function: alias for [[var_samp]]. + * + * @group agg_funcs + * @since 1.6.0 + */ + def variance(columnName: String): Column = variance(Column(columnName)) + + /** * Aggregate function: returns the unbiased variance of the values in a group. * * @group agg_funcs @@ -461,6 +531,14 @@ object functions extends LegacyFunctions { def var_samp(e: Column): Column = withAggregateFunction { VarianceSamp(e.expr) } /** + * Aggregate function: returns the unbiased variance of the values in a group. + * + * @group agg_funcs + * @since 1.6.0 + */ + def var_samp(columnName: String): Column = var_samp(Column(columnName)) + + /** * Aggregate function: returns the population variance of the values in a group. * * @group agg_funcs @@ -468,6 +546,14 @@ object functions extends LegacyFunctions { */ def var_pop(e: Column): Column = withAggregateFunction { VariancePop(e.expr) } + /** + * Aggregate function: returns the population variance of the values in a group. + * + * @group agg_funcs + * @since 1.6.0 + */ + def var_pop(columnName: String): Column = var_pop(Column(columnName)) + ////////////////////////////////////////////////////////////////////////////////////////////// // Window functions ////////////////////////////////////////////////////////////////////////////////////////////// http://git-wip-us.apache.org/repos/asf/spark/blob/6f6bb0e8/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala index 9c42f65..b5c636d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DataFrameAggregateSuite.scala @@ -261,6 +261,9 @@ class DataFrameAggregateSuite extends QueryTest with SharedSQLContext { checkAnswer( testData2.agg(stddev('a), stddev_pop('a), stddev_samp('a)), Row(testData2ADev, math.sqrt(4 / 6.0), testData2ADev)) + checkAnswer( + testData2.agg(stddev("a"), stddev_pop("a"), stddev_samp("a")), + Row(testData2ADev, math.sqrt(4 / 6.0), testData2ADev)) } test("zero stddev") { --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
