This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 3ed7de4cb885 [SPARK-53081][CORE][SQL][CONNECT] Support `contentEquals` in `SparkFileUtils`
3ed7de4cb885 is described below

commit 3ed7de4cb88543def8e78e6b2543bea0b912d67f
Author: Dongjoon Hyun <dongj...@apache.org>
AuthorDate: Sun Aug 3 07:18:44 2025 -0700

    [SPARK-53081][CORE][SQL][CONNECT] Support `contentEquals` in `SparkFileUtils`

    ### What changes were proposed in this pull request?

    This PR aims to support `contentEquals` in `SparkFileUtils`.

    ### Why are the changes needed?

    To improve Spark's file utility features.

    ### Does this PR introduce _any_ user-facing change?

    No behavior change.

    ### How was this patch tested?

    Pass the CIs.

    ### Was this patch authored or co-authored using generative AI tooling?

    No.

    Closes #51793 from dongjoon-hyun/SPARK-53081.

    Authored-by: Dongjoon Hyun <dongj...@apache.org>
    Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
---
 .../scala/org/apache/spark/util/SparkFileUtils.scala | 19 +++++++++++++++++++
 scalastyle-config.xml | 5 +++++
 .../apache/spark/sql/connect/ClientE2ETestSuite.scala | 5 ++---
 .../apache/spark/sql/artifact/ArtifactManager.scala | 4 ++--
 4 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/common/utils/src/main/scala/org/apache/spark/util/SparkFileUtils.scala b/common/utils/src/main/scala/org/apache/spark/util/SparkFileUtils.scala
index 37caa85342c6..964211a49c71 100644
--- a/common/utils/src/main/scala/org/apache/spark/util/SparkFileUtils.scala
+++ b/common/utils/src/main/scala/org/apache/spark/util/SparkFileUtils.scala
@@ -168,6 +168,25 @@ private[spark] trait SparkFileUtils extends Logging {
     }
     Files.copy(src.toPath(), dst.toPath(), StandardCopyOption.REPLACE_EXISTING)
   }
+
+  /** Return true if the content of the files are equal or they both don't exist */
+  def contentEquals(file1: File, file2: File): Boolean = {
+    if (file1 == null && file2 != null || file1 != null && file2 == null) {
+      false
+    } else if (file1 == null && file2 == null || !file1.exists() && !file2.exists()) {
+      true
+    } else if (!file1.exists() || !file2.exists()) {
+      false
+    } else if (file1.isDirectory() || file2.isDirectory()) {
+      throw new IllegalArgumentException(s"Input is not a file: $file1 or $file2")
+    } else if (file1.length != file2.length) {
+      false
+    } else {
+      val path1 = file1.toPath
+      val path2 = file2.toPath
+      Files.isSameFile(path1, path2) || Files.mismatch(path1, path2) == -1L
+    }
+  }
 }
 
 private[spark] object SparkFileUtils extends SparkFileUtils
diff --git a/scalastyle-config.xml b/scalastyle-config.xml
index dbddba673285..3605daa081c4 100644
--- a/scalastyle-config.xml
+++ b/scalastyle-config.xml
@@ -337,6 +337,11 @@ This file is divided into 3 sections:
     <customMessage>Use copyDirectory of JavaUtils/SparkFileUtils/Utils instead.</customMessage>
   </check>
 
+  <check customId="contentEquals" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
+    <parameters><parameter name="regex">[^k]FileUtils\.contentEquals</parameter></parameters>
+    <customMessage>Use contentEquals of SparkFileUtils or Utils instead.</customMessage>
+  </check>
+
   <check customId="commonslang2" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
     <parameters><parameter name="regex">org\.apache\.commons\.lang\.</parameter></parameters>
     <customMessage>Use Commons Lang 3 classes (package org.apache.commons.lang3.*) instead
diff --git a/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/ClientE2ETestSuite.scala b/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/ClientE2ETestSuite.scala
index cc297f4c4987..338d532ce2df 100644
--- a/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/ClientE2ETestSuite.scala
+++ b/sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/ClientE2ETestSuite.scala
@@ -26,7 +26,6 @@ import scala.concurrent.{ExecutionContext, Future}
 import scala.concurrent.duration.{DurationInt, FiniteDuration}
 import scala.jdk.CollectionConverters._
 
-import org.apache.commons.io.FileUtils
 import org.apache.commons.io.output.TeeOutputStream
 import org.scalactic.TolerantNumerics
 import org.scalatest.PrivateMethodTester
@@ -46,7 +45,7 @@ import org.apache.spark.sql.connect.test.SparkConnectServerUtils.port
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SqlApiConf
 import org.apache.spark.sql.types._
-import org.apache.spark.util.SparkThreadUtils
+import org.apache.spark.util.{SparkFileUtils, SparkThreadUtils}
 
 class ClientE2ETestSuite
     extends QueryTest
@@ -346,7 +345,7 @@ class ClientE2ETestSuite
       .listFiles()
       .filter(file => file.getPath.endsWith(".csv"))(0)
 
-    assert(FileUtils.contentEquals(testDataPath.toFile, outputFile))
+    assert(SparkFileUtils.contentEquals(testDataPath.toFile, outputFile))
   }
 
   test("read path collision") {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/artifact/ArtifactManager.scala b/sql/core/src/main/scala/org/apache/spark/sql/artifact/ArtifactManager.scala
index c5c998eed8f1..98e045bcb295 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/artifact/ArtifactManager.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/artifact/ArtifactManager.scala
@@ -28,7 +28,7 @@ import java.util.concurrent.atomic.AtomicBoolean
 import scala.jdk.CollectionConverters._
 import scala.reflect.ClassTag
 
-import org.apache.commons.io.{FilenameUtils, FileUtils}
+import org.apache.commons.io.FilenameUtils
 import org.apache.hadoop.fs.{LocalFileSystem, Path => FSPath}
 
 import org.apache.spark.{JobArtifactSet, JobArtifactState, SparkContext, SparkEnv, SparkException, SparkRuntimeException, SparkUnsupportedOperationException}
@@ -213,7 +213,7 @@ class ArtifactManager(session: SparkSession) extends AutoCloseable with Logging
       // Disallow overwriting with modified version
       if (Files.exists(target)) {
         // makes the query idempotent
-        if (FileUtils.contentEquals(target.toFile, serverLocalStagingPath.toFile)) {
+        if (Utils.contentEquals(target.toFile, serverLocalStagingPath.toFile)) {
           return
         }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org