This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 3ed7de4cb885 [SPARK-53081][CORE][SQL][CONNECT] Support `contentEquals` in `SparkFileUtils`
3ed7de4cb885 is described below

commit 3ed7de4cb88543def8e78e6b2543bea0b912d67f
Author: Dongjoon Hyun <dongj...@apache.org>
AuthorDate: Sun Aug 3 07:18:44 2025 -0700

    [SPARK-53081][CORE][SQL][CONNECT] Support `contentEquals` in `SparkFileUtils`
    
    ### What changes were proposed in this pull request?
    
    This PR aims to support `contentEquals` in `SparkFileUtils`.
    
    ### Why are the changes needed?
    
    To improve Spark's file utility features.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No behavior change.
    
    ### How was this patch tested?
    
    Pass the CIs.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #51793 from dongjoon-hyun/SPARK-53081.
    
    Authored-by: Dongjoon Hyun <dongj...@apache.org>
    Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
---
 .../scala/org/apache/spark/util/SparkFileUtils.scala  | 19 +++++++++++++++++++
 scalastyle-config.xml                                 |  5 +++++
 .../apache/spark/sql/connect/ClientE2ETestSuite.scala |  5 ++---
 .../apache/spark/sql/artifact/ArtifactManager.scala   |  4 ++--
 4 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/common/utils/src/main/scala/org/apache/spark/util/SparkFileUtils.scala b/common/utils/src/main/scala/org/apache/spark/util/SparkFileUtils.scala
index 37caa85342c6..964211a49c71 100644
--- a/common/utils/src/main/scala/org/apache/spark/util/SparkFileUtils.scala
+++ b/common/utils/src/main/scala/org/apache/spark/util/SparkFileUtils.scala
@@ -168,6 +168,25 @@ private[spark] trait SparkFileUtils extends Logging {
     }
     Files.copy(src.toPath(), dst.toPath(), StandardCopyOption.REPLACE_EXISTING)
   }
+
+  /** Return true if the content of the files are equal or they both don't exist */
+  def contentEquals(file1: File, file2: File): Boolean = {
+    if (file1 == null && file2 != null || file1 != null && file2 == null) {
+      false
+    } else if (file1 == null && file2 == null || !file1.exists() && !file2.exists()) {
+      true
+    } else if (!file1.exists() || !file2.exists()) {
+      false
+    } else if (file1.isDirectory() || file2.isDirectory()) {
+      throw new IllegalArgumentException(s"Input is not a file: $file1 or $file2")
+    } else if (file1.length != file2.length) {
+      false
+    } else {
+      val path1 = file1.toPath
+      val path2 = file2.toPath
+      Files.isSameFile(path1, path2) || Files.mismatch(path1, path2) == -1L
+    }
+  }
 }
 
 private[spark] object SparkFileUtils extends SparkFileUtils
diff --git a/scalastyle-config.xml b/scalastyle-config.xml
index dbddba673285..3605daa081c4 100644
--- a/scalastyle-config.xml
+++ b/scalastyle-config.xml
@@ -337,6 +337,11 @@ This file is divided into 3 sections:
     <customMessage>Use copyDirectory of JavaUtils/SparkFileUtils/Utils instead.</customMessage>
   </check>
 
+  <check customId="contentEquals" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
+    <parameters><parameter name="regex">[^k]FileUtils\.contentEquals</parameter></parameters>
+    <customMessage>Use contentEquals of SparkFileUtils or Utils instead.</customMessage>
+  </check>
+
   <check customId="commonslang2" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
     <parameters><parameter name="regex">org\.apache\.commons\.lang\.</parameter></parameters>
     <customMessage>Use Commons Lang 3 classes (package org.apache.commons.lang3.*) instead
diff --git a/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/ClientE2ETestSuite.scala b/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/ClientE2ETestSuite.scala
index cc297f4c4987..338d532ce2df 100644
--- a/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/ClientE2ETestSuite.scala
+++ b/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/ClientE2ETestSuite.scala
@@ -26,7 +26,6 @@ import scala.concurrent.{ExecutionContext, Future}
 import scala.concurrent.duration.{DurationInt, FiniteDuration}
 import scala.jdk.CollectionConverters._
 
-import org.apache.commons.io.FileUtils
 import org.apache.commons.io.output.TeeOutputStream
 import org.scalactic.TolerantNumerics
 import org.scalatest.PrivateMethodTester
@@ -46,7 +45,7 @@ import org.apache.spark.sql.connect.test.SparkConnectServerUtils.port
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SqlApiConf
 import org.apache.spark.sql.types._
-import org.apache.spark.util.SparkThreadUtils
+import org.apache.spark.util.{SparkFileUtils, SparkThreadUtils}
 
 class ClientE2ETestSuite
     extends QueryTest
@@ -346,7 +345,7 @@ class ClientE2ETestSuite
       .listFiles()
       .filter(file => file.getPath.endsWith(".csv"))(0)
 
-    assert(FileUtils.contentEquals(testDataPath.toFile, outputFile))
+    assert(SparkFileUtils.contentEquals(testDataPath.toFile, outputFile))
   }
 
   test("read path collision") {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/artifact/ArtifactManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/artifact/ArtifactManager.scala
index c5c998eed8f1..98e045bcb295 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/artifact/ArtifactManager.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/artifact/ArtifactManager.scala
@@ -28,7 +28,7 @@ import java.util.concurrent.atomic.AtomicBoolean
 import scala.jdk.CollectionConverters._
 import scala.reflect.ClassTag
 
-import org.apache.commons.io.{FilenameUtils, FileUtils}
+import org.apache.commons.io.FilenameUtils
 import org.apache.hadoop.fs.{LocalFileSystem, Path => FSPath}
 
 import org.apache.spark.{JobArtifactSet, JobArtifactState, SparkContext, SparkEnv, SparkException, SparkRuntimeException, SparkUnsupportedOperationException}
@@ -213,7 +213,7 @@ class ArtifactManager(session: SparkSession) extends AutoCloseable with Logging
       // Disallow overwriting with modified version
       if (Files.exists(target)) {
         // makes the query idempotent
-        if (FileUtils.contentEquals(target.toFile, serverLocalStagingPath.toFile)) {
+        if (Utils.contentEquals(target.toFile, serverLocalStagingPath.toFile)) {
           return
         }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to