This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/spark-connect-swift.git
The following commit(s) were added to refs/heads/main by this push: new d069034 [SPARK-51912] Support `semanticHash` and `sameSemantics` in `DataFrame` d069034 is described below commit d069034e1dd5ba8761b2c22bcd6588ed49856b23 Author: Dongjoon Hyun <dongj...@apache.org> AuthorDate: Fri Apr 25 15:55:17 2025 +0900 [SPARK-51912] Support `semanticHash` and `sameSemantics` in `DataFrame` ### What changes were proposed in this pull request? This PR aims to support `semanticHash` and `sameSemantics` in `DataFrame`. ### Why are the changes needed? For feature parity. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Pass the CIs. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #89 from dongjoon-hyun/SPARK-51912. Authored-by: Dongjoon Hyun <dongj...@apache.org> Signed-off-by: Dongjoon Hyun <dongj...@apache.org> --- Sources/SparkConnect/DataFrame.swift | 14 ++++++++++++++ Sources/SparkConnect/SparkConnectClient.swift | 27 +++++++++++++++++++++++++++ Sources/SparkConnect/SparkSession.swift | 8 ++++++++ Tests/SparkConnectTests/DataFrameTests.swift | 13 +++++++++++++ 4 files changed, 62 insertions(+) diff --git a/Sources/SparkConnect/DataFrame.swift b/Sources/SparkConnect/DataFrame.swift index 8679362..3e3b484 100644 --- a/Sources/SparkConnect/DataFrame.swift +++ b/Sources/SparkConnect/DataFrame.swift @@ -468,6 +468,20 @@ public actor DataFrame: Sendable { } } + /// Returns a `hashCode` of the logical query plan against this ``DataFrame``. + /// - Returns: A hashcode value. + public func semanticHash() async throws -> Int32 { + return try await self.spark.semanticHash(self.plan) + } + + /// Returns `true` when the logical query plans inside both ``Dataset``s are equal and therefore + /// return same results. + /// - Parameter other: A ``DataFrame`` to compare. + /// - Returns: Whether the both logical plans are equal. + public func sameSemantics(other: DataFrame) async throws -> Bool { + return try await self.spark.sameSemantics(self.plan, other.getPlan() as! Plan) + } + /// Prints the physical plan to the console for debugging purposes. public func explain() async throws { try await explain("simple") diff --git a/Sources/SparkConnect/SparkConnectClient.swift b/Sources/SparkConnect/SparkConnectClient.swift index 38d7df2..2d4e2a9 100644 --- a/Sources/SparkConnect/SparkConnectClient.swift +++ b/Sources/SparkConnect/SparkConnectClient.swift @@ -568,6 +568,33 @@ public actor SparkConnectClient { } } + func sameSemantics(_ plan: Plan, _ otherPlan: Plan) async throws -> Bool { + try await withGPRC { client in + let service = SparkConnectService.Client(wrapping: client) + let request = analyze(self.sessionID!, { + var sameSemantics = AnalyzePlanRequest.SameSemantics() + sameSemantics.targetPlan = plan + sameSemantics.otherPlan = otherPlan + return OneOf_Analyze.sameSemantics(sameSemantics) + }) + let response = try await service.analyzePlan(request) + return response.sameSemantics.result + } + } + + func semanticHash(_ plan: Plan) async throws -> Int32 { + try await withGPRC { client in + let service = SparkConnectService.Client(wrapping: client) + let request = analyze(self.sessionID!, { + var semanticHash = AnalyzePlanRequest.SemanticHash() + semanticHash.plan = plan + return OneOf_Analyze.semanticHash(semanticHash) + }) + let response = try await service.analyzePlan(request) + return response.semanticHash.result + } + } + static func getJoin( _ left: Relation, _ right: Relation, _ joinType: JoinType, joinCondition: String? = nil, usingColumns: [String]? = nil diff --git a/Sources/SparkConnect/SparkSession.swift b/Sources/SparkConnect/SparkSession.swift index b57ec76..478df5c 100644 --- a/Sources/SparkConnect/SparkSession.swift +++ b/Sources/SparkConnect/SparkSession.swift @@ -171,6 +171,14 @@ public actor SparkSession { await client.clearTags() } + func sameSemantics(_ plan: Plan, _ otherPlan: Plan) async throws -> Bool { + return try await client.sameSemantics(plan, otherPlan) + } + + func semanticHash(_ plan: Plan) async throws -> Int32 { + return try await client.semanticHash(plan) + } + /// This is defined as the return type of `SparkSession.sparkContext` method. /// This is an empty `Struct` type because `sparkContext` method is designed to throw /// `UNSUPPORTED_CONNECT_FEATURE.SESSION_SPARK_CONTEXT`. diff --git a/Tests/SparkConnectTests/DataFrameTests.swift b/Tests/SparkConnectTests/DataFrameTests.swift index 7fd1403..7f06925 100644 --- a/Tests/SparkConnectTests/DataFrameTests.swift +++ b/Tests/SparkConnectTests/DataFrameTests.swift @@ -124,6 +124,19 @@ struct DataFrameTests { await spark.stop() } + @Test + func sameSemantics() async throws { + let spark = try await SparkSession.builder.getOrCreate() + let df1 = try await spark.range(1) + let df2 = try await spark.range(1) + let df3 = try await spark.range(2) + #expect(try await df1.sameSemantics(other: df2)) + #expect(try await df1.semanticHash() == df2.semanticHash()) + #expect(try await df1.sameSemantics(other: df3) == false) + #expect(try await df1.semanticHash() != df3.semanticHash()) + await spark.stop() + } + @Test func explain() async throws { let spark = try await SparkSession.builder.getOrCreate() --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org