This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/spark-connect-swift.git


The following commit(s) were added to refs/heads/main by this push:
     new 5684325  [SPARK-51996] Support `describe` and `summary` in `DataFrame`
5684325 is described below

commit 568432552a8f5ff71dfc9c66911992023d3ed0ca
Author: Dongjoon Hyun <dongj...@apache.org>
AuthorDate: Sun May 4 06:10:00 2025 -0700

    [SPARK-51996] Support `describe` and `summary` in `DataFrame`
    
    ### What changes were proposed in this pull request?
    
    This PR aims to support `describe` and `summary` API of `DataFrame`.
    
    ### Why are the changes needed?
    
    For feature parity.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Pass the CIs.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #112 from dongjoon-hyun/SPARK-51996.
    
    Authored-by: Dongjoon Hyun <dongj...@apache.org>
    Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
---
 Sources/SparkConnect/DataFrame.swift          | 23 +++++++++++++++++++++++
 Sources/SparkConnect/SparkConnectClient.swift | 22 ++++++++++++++++++++++
 Tests/SparkConnectTests/DataFrameTests.swift  | 22 ++++++++++++++++++++++
 3 files changed, 67 insertions(+)

diff --git a/Sources/SparkConnect/DataFrame.swift b/Sources/SparkConnect/DataFrame.swift
index 481b215..266963f 100644
--- a/Sources/SparkConnect/DataFrame.swift
+++ b/Sources/SparkConnect/DataFrame.swift
@@ -164,6 +164,10 @@ import Synchronization
 /// - ``sample(_:_:)``
 /// - ``sample(_:)``
 ///
+/// ### Statistics
+/// - ``describe(_:)``
+/// - ``summary(_:)``
+///
 /// ### Utility Methods
 /// - ``isEmpty()``
 /// - ``isLocal()``
@@ -495,6 +499,25 @@ public actor DataFrame: Sendable {
     return DataFrame(spark: self.spark, plan: plan)
   }
 
+  /// Computes basic statistics for numeric and string columns, including count, mean, stddev, min,
+  /// and max. If no columns are given, this function computes statistics for all numerical or
+  /// string columns.
+  /// - Parameter cols: Column names.
+  /// - Returns: A ``DataFrame`` containing basic statistics.
+  public func describe(_ cols: String...) -> DataFrame {
+    return DataFrame(spark: self.spark, plan: SparkConnectClient.getDescribe(self.plan.root, cols))
+  }
+
+  /// Computes specified statistics for numeric and string columns. Available statistics are:
+  /// count, mean, stddev, min, max, arbitrary approximate percentiles specified as a percentage (e.g. 75%)
+  /// count_distinct, approx_count_distinct . If no statistics are given, this function computes count, mean,
+  /// stddev, min, approximate quartiles (percentiles at 25%, 50%, and 75%), and max.
+  /// - Parameter statistics: Statistics names.
+  /// - Returns: A ``DataFrame`` containing specified statistics.
+  public func summary(_ statistics: String...) -> DataFrame {
+    return DataFrame(spark: self.spark, plan: SparkConnectClient.getSummary(self.plan.root, statistics))
+  }
+
   /// Returns a new Dataset with a column renamed. This is a no-op if schema doesn't contain existingName.
   /// - Parameters:
   ///   - existingName: A existing column name to be renamed.
diff --git a/Sources/SparkConnect/SparkConnectClient.swift b/Sources/SparkConnect/SparkConnectClient.swift
index 4b55584..82f350d 100644
--- a/Sources/SparkConnect/SparkConnectClient.swift
+++ b/Sources/SparkConnect/SparkConnectClient.swift
@@ -474,6 +474,28 @@ public actor SparkConnectClient {
     return plan
   }
 
+  static func getDescribe(_ child: Relation, _ cols: [String]) -> Plan {
+    var describe = Spark_Connect_StatDescribe()
+    describe.input = child
+    describe.cols = cols
+    var relation = Relation()
+    relation.describe = describe
+    var plan = Plan()
+    plan.opType = .root(relation)
+    return plan
+  }
+
+  static func getSummary(_ child: Relation, _ statistics: [String]) -> Plan {
+    var summary = Spark_Connect_StatSummary()
+    summary.input = child
+    summary.statistics = statistics
+    var relation = Relation()
+    relation.summary = summary
+    var plan = Plan()
+    plan.opType = .root(relation)
+    return plan
+  }
+
   static func getSort(_ child: Relation, _ cols: [String]) -> Plan {
     var sort = Sort()
     sort.input = child
diff --git a/Tests/SparkConnectTests/DataFrameTests.swift b/Tests/SparkConnectTests/DataFrameTests.swift
index da53fe4..1d5a785 100644
--- a/Tests/SparkConnectTests/DataFrameTests.swift
+++ b/Tests/SparkConnectTests/DataFrameTests.swift
@@ -682,6 +682,28 @@ struct DataFrameTests {
     await spark.stop()
   }
 
+  @Test
+  func describe() async throws {
+    let spark = try await SparkSession.builder.getOrCreate()
+    let df = try await spark.range(10)
+    let expected = [Row("10"), Row("4.5"), Row("3.0276503540974917"), 
Row("0"), Row("9")]
+    #expect(try await df.describe().select("id").collect() == expected)
+    #expect(try await df.describe("id").select("id").collect() == expected)
+    await spark.stop()
+  }
+
+  @Test
+  func summary() async throws {
+    let spark = try await SparkSession.builder.getOrCreate()
+    let expected = [
+      Row("10"), Row("4.5"), Row("3.0276503540974917"),
+      Row("0"), Row("2"), Row("4"), Row("7"), Row("9")
+    ]
+    #expect(try await spark.range(10).summary().select("id").collect() == expected)
+    #expect(try await spark.range(10).summary("min", "max").select("id").collect() == [Row("0"), Row("9")])
+    await spark.stop()
+  }
+
   @Test
   func groupBy() async throws {
     let spark = try await SparkSession.builder.getOrCreate()


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to