This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/spark-connect-swift.git
The following commit(s) were added to refs/heads/main by this push: new 5684325 [SPARK-51996] Support `describe` and `summary` in `DataFrame` 5684325 is described below commit 568432552a8f5ff71dfc9c66911992023d3ed0ca Author: Dongjoon Hyun <dongj...@apache.org> AuthorDate: Sun May 4 06:10:00 2025 -0700 [SPARK-51996] Support `describe` and `summary` in `DataFrame` ### What changes were proposed in this pull request? This PR aims to support `describe` and `summary` API of `DataFrame`. ### Why are the changes needed? For feature parity. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Pass the CIs. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #112 from dongjoon-hyun/SPARK-51996. Authored-by: Dongjoon Hyun <dongj...@apache.org> Signed-off-by: Dongjoon Hyun <dongj...@apache.org> --- Sources/SparkConnect/DataFrame.swift | 23 +++++++++++++++++++++++ Sources/SparkConnect/SparkConnectClient.swift | 22 ++++++++++++++++++++++ Tests/SparkConnectTests/DataFrameTests.swift | 22 ++++++++++++++++++++++ 3 files changed, 67 insertions(+) diff --git a/Sources/SparkConnect/DataFrame.swift b/Sources/SparkConnect/DataFrame.swift index 481b215..266963f 100644 --- a/Sources/SparkConnect/DataFrame.swift +++ b/Sources/SparkConnect/DataFrame.swift @@ -164,6 +164,10 @@ import Synchronization /// - ``sample(_:_:)`` /// - ``sample(_:)`` /// +/// ### Statistics +/// - ``describe(_:)`` +/// - ``summary(_:)`` +/// /// ### Utility Methods /// - ``isEmpty()`` /// - ``isLocal()`` @@ -495,6 +499,25 @@ public actor DataFrame: Sendable { return DataFrame(spark: self.spark, plan: plan) } + /// Computes basic statistics for numeric and string columns, including count, mean, stddev, min, + /// and max. If no columns are given, this function computes statistics for all numerical or + /// string columns. + /// - Parameter cols: Column names. + /// - Returns: A ``DataFrame`` containing basic statistics. 
+ public func describe(_ cols: String...) -> DataFrame { + return DataFrame(spark: self.spark, plan: SparkConnectClient.getDescribe(self.plan.root, cols)) + } + + /// Computes specified statistics for numeric and string columns. Available statistics are: + /// count, mean, stddev, min, max, arbitrary approximate percentiles specified as a percentage (e.g. 75%), + /// count_distinct, approx_count_distinct. If no statistics are given, this function computes count, mean, + /// stddev, min, approximate quartiles (percentiles at 25%, 50%, and 75%), and max. + /// - Parameter statistics: Statistics names. + /// - Returns: A ``DataFrame`` containing specified statistics. + public func summary(_ statistics: String...) -> DataFrame { + return DataFrame(spark: self.spark, plan: SparkConnectClient.getSummary(self.plan.root, statistics)) + } + + /// Returns a new Dataset with a column renamed. This is a no-op if schema doesn't contain existingName. + /// - Parameters: + /// - existingName: An existing column name to be renamed. 
diff --git a/Sources/SparkConnect/SparkConnectClient.swift b/Sources/SparkConnect/SparkConnectClient.swift index 4b55584..82f350d 100644 --- a/Sources/SparkConnect/SparkConnectClient.swift +++ b/Sources/SparkConnect/SparkConnectClient.swift @@ -474,6 +474,28 @@ public actor SparkConnectClient { return plan } + static func getDescribe(_ child: Relation, _ cols: [String]) -> Plan { + var describe = Spark_Connect_StatDescribe() + describe.input = child + describe.cols = cols + var relation = Relation() + relation.describe = describe + var plan = Plan() + plan.opType = .root(relation) + return plan + } + + static func getSummary(_ child: Relation, _ statistics: [String]) -> Plan { + var summary = Spark_Connect_StatSummary() + summary.input = child + summary.statistics = statistics + var relation = Relation() + relation.summary = summary + var plan = Plan() + plan.opType = .root(relation) + return plan + } + static func getSort(_ child: Relation, _ cols: [String]) -> Plan { var sort = Sort() sort.input = child diff --git a/Tests/SparkConnectTests/DataFrameTests.swift b/Tests/SparkConnectTests/DataFrameTests.swift index da53fe4..1d5a785 100644 --- a/Tests/SparkConnectTests/DataFrameTests.swift +++ b/Tests/SparkConnectTests/DataFrameTests.swift @@ -682,6 +682,28 @@ struct DataFrameTests { await spark.stop() } + @Test + func describe() async throws { + let spark = try await SparkSession.builder.getOrCreate() + let df = try await spark.range(10) + let expected = [Row("10"), Row("4.5"), Row("3.0276503540974917"), Row("0"), Row("9")] + #expect(try await df.describe().select("id").collect() == expected) + #expect(try await df.describe("id").select("id").collect() == expected) + await spark.stop() + } + + @Test + func summary() async throws { + let spark = try await SparkSession.builder.getOrCreate() + let expected = [ + Row("10"), Row("4.5"), Row("3.0276503540974917"), + Row("0"), Row("2"), Row("4"), Row("7"), Row("9") + ] + #expect(try await 
spark.range(10).summary().select("id").collect() == expected) + #expect(try await spark.range(10).summary("min", "max").select("id").collect() == [Row("0"), Row("9")]) + await spark.stop() + } + @Test func groupBy() async throws { let spark = try await SparkSession.builder.getOrCreate() --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org