(spark-connect-swift) branch main updated: [SPARK-51912] Support `semanticHash` and `sameSemantics` in `DataFrame`

dongjoon Fri, 25 Apr 2025 00:03:06 -0700

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/spark-connect-swift.git



The following commit(s) were added to refs/heads/main by this push:
     new d069034  [SPARK-51912] Support `semanticHash` and `sameSemantics` in `DataFrame`
d069034 is described below

commit d069034e1dd5ba8761b2c22bcd6588ed49856b23
Author: Dongjoon Hyun <dongj...@apache.org>
AuthorDate: Fri Apr 25 15:55:17 2025 +0900

    [SPARK-51912] Support `semanticHash` and `sameSemantics` in `DataFrame`
    
    ### What changes were proposed in this pull request?
    
    This PR aims to support `semanticHash` and `sameSemantics` in `DataFrame`.
    
    ### Why are the changes needed?
    
    For feature parity.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    Pass the CIs.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #89 from dongjoon-hyun/SPARK-51912.
    
    Authored-by: Dongjoon Hyun <dongj...@apache.org>
    Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
---
 Sources/SparkConnect/DataFrame.swift          | 14 ++++++++++++++
 Sources/SparkConnect/SparkConnectClient.swift | 27 +++++++++++++++++++++++++++
 Sources/SparkConnect/SparkSession.swift       |  8 ++++++++
 Tests/SparkConnectTests/DataFrameTests.swift  | 13 +++++++++++++
 4 files changed, 62 insertions(+)

diff --git a/Sources/SparkConnect/DataFrame.swift b/Sources/SparkConnect/DataFrame.swift
index 8679362..3e3b484 100644
--- a/Sources/SparkConnect/DataFrame.swift
+++ b/Sources/SparkConnect/DataFrame.swift
@@ -468,6 +468,20 @@ public actor DataFrame: Sendable {
     }
   }
 
+  /// Returns a `hashCode` of the logical query plan against this ``DataFrame``.
+  /// - Returns: A hashcode value.
+  public func semanticHash() async throws -> Int32 {
+    return try await self.spark.semanticHash(self.plan)
+  }
+
+  /// Returns `true` when the logical query plans inside both ``Dataset``s are equal and therefore
+  /// return same results.
+  /// - Parameter other: A ``DataFrame`` to compare.
+  /// - Returns: Whether the both logical plans are equal.
+  public func sameSemantics(other: DataFrame) async throws -> Bool {
+    return try await self.spark.sameSemantics(self.plan, other.getPlan() as! Plan)
+  }
+
   /// Prints the physical plan to the console for debugging purposes.
   public func explain() async throws {
     try await explain("simple")
diff --git a/Sources/SparkConnect/SparkConnectClient.swift b/Sources/SparkConnect/SparkConnectClient.swift
index 38d7df2..2d4e2a9 100644
--- a/Sources/SparkConnect/SparkConnectClient.swift
+++ b/Sources/SparkConnect/SparkConnectClient.swift
@@ -568,6 +568,33 @@ public actor SparkConnectClient {
     }
   }
 
+  func sameSemantics(_ plan: Plan, _ otherPlan: Plan) async throws -> Bool {
+    try await withGPRC { client in
+      let service = SparkConnectService.Client(wrapping: client)
+      let request = analyze(self.sessionID!, {
+        var sameSemantics = AnalyzePlanRequest.SameSemantics()
+        sameSemantics.targetPlan = plan
+        sameSemantics.otherPlan = otherPlan
+        return OneOf_Analyze.sameSemantics(sameSemantics)
+      })
+      let response = try await service.analyzePlan(request)
+      return response.sameSemantics.result
+    }
+  }
+
+  func semanticHash(_ plan: Plan) async throws -> Int32 {
+    try await withGPRC { client in
+      let service = SparkConnectService.Client(wrapping: client)
+      let request = analyze(self.sessionID!, {
+        var semanticHash = AnalyzePlanRequest.SemanticHash()
+        semanticHash.plan = plan
+        return OneOf_Analyze.semanticHash(semanticHash)
+      })
+      let response = try await service.analyzePlan(request)
+      return response.semanticHash.result
+    }
+  }
+
   static func getJoin(
     _ left: Relation, _ right: Relation, _ joinType: JoinType,
     joinCondition: String? = nil, usingColumns: [String]? = nil
diff --git a/Sources/SparkConnect/SparkSession.swift b/Sources/SparkConnect/SparkSession.swift
index b57ec76..478df5c 100644
--- a/Sources/SparkConnect/SparkSession.swift
+++ b/Sources/SparkConnect/SparkSession.swift
@@ -171,6 +171,14 @@ public actor SparkSession {
     await client.clearTags()
   }
 
+  func sameSemantics(_ plan: Plan, _ otherPlan: Plan) async throws -> Bool {
+    return try await client.sameSemantics(plan, otherPlan)
+  }
+
+  func semanticHash(_ plan: Plan) async throws -> Int32 {
+    return try await client.semanticHash(plan)
+  }
+
   /// This is defined as the return type of `SparkSession.sparkContext` method.
   /// This is an empty `Struct` type because `sparkContext` method is designed to throw
   /// `UNSUPPORTED_CONNECT_FEATURE.SESSION_SPARK_CONTEXT`.
diff --git a/Tests/SparkConnectTests/DataFrameTests.swift b/Tests/SparkConnectTests/DataFrameTests.swift
index 7fd1403..7f06925 100644
--- a/Tests/SparkConnectTests/DataFrameTests.swift
+++ b/Tests/SparkConnectTests/DataFrameTests.swift
@@ -124,6 +124,19 @@ struct DataFrameTests {
     await spark.stop()
   }
 
+  @Test
+  func sameSemantics() async throws {
+    let spark = try await SparkSession.builder.getOrCreate()
+    let df1 = try await spark.range(1)
+    let df2 = try await spark.range(1)
+    let df3 = try await spark.range(2)
+    #expect(try await df1.sameSemantics(other: df2))
+    #expect(try await df1.semanticHash() == df2.semanticHash())
+    #expect(try await df1.sameSemantics(other: df3) == false)
+    #expect(try await df1.semanticHash() != df3.semanticHash())
+    await spark.stop()
+  }
+
   @Test
   func explain() async throws {
     let spark = try await SparkSession.builder.getOrCreate()


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

(spark-connect-swift) branch main updated: [SPARK-51912] Support `semanticHash` and `sameSemantics` in `DataFrame`

Reply via email to