This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new 423bca63dec4 [SPARK-53214][CORE][SQL][K8S] Use Java `HexFormat` instead of `Hex.encodeHexString`
423bca63dec4 is described below

commit 423bca63dec4772d96a2dbf293318cd5c428a598
Author: Dongjoon Hyun <dongj...@apache.org>
AuthorDate: Sat Aug 9 17:42:16 2025 -0700

[SPARK-53214][CORE][SQL][K8S] Use Java `HexFormat` instead of `Hex.encodeHexString`

### What changes were proposed in this pull request?

This PR aims to use Java 17+ `HexFormat` instead of Apache Commons Codec `Hex.encodeHexString`.

### Why are the changes needed?

The Java native method is ***roughly 2x*** faster than the Apache Commons Codec API.

```scala
scala> val a = new Array[Byte](1_000_000_000)

scala> spark.time(org.apache.commons.codec.binary.Hex.encodeHexString(a, false).length)
Time taken: 1559 ms
val res0: Int = 2000000000

scala> spark.time(java.util.HexFormat.of().withUpperCase().formatHex(a).length)
Time taken: 672 ms
val res1: Int = 2000000000
```

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Pass the CIs.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #51939 from dongjoon-hyun/SPARK-53214.

Authored-by: Dongjoon Hyun <dongj...@apache.org>
Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
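As background for the diff below: `Hex.encodeHexString` emits lowercase hex by default, and its two-argument overload takes a `toLowerCase` flag, while `HexFormat.of()` is lowercase by default and `withUpperCase()` flips the case. That is why the secret generator keeps a plain `formatHex`, while the SQL literal paths, which previously passed `toLowerCase = false`, add `withUpperCase()`. A minimal sketch of the equivalence (the `bytes` value here is illustrative, not taken from the commit):

```scala
import java.util.HexFormat

val bytes = Array[Byte](0x0a, 0x1b, 0x2c)

// Old: org.apache.commons.codec.binary.Hex.encodeHexString(bytes) -> "0a1b2c"
// New: HexFormat's default is also lowercase.
val lower = HexFormat.of().formatHex(bytes)                 // "0a1b2c"

// Old: Hex.encodeHexString(bytes, false)  (toLowerCase = false) -> "0A1B2C"
// New: withUpperCase() produces the matching uppercase output.
val upper = HexFormat.of().withUpperCase().formatHex(bytes) // "0A1B2C"
```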
---
 core/src/main/scala/org/apache/spark/util/Utils.scala               | 5 ++---
 .../main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala    | 5 ++---
 scalastyle-config.xml                                               | 5 +++++
 .../org/apache/spark/sql/catalyst/expressions/literals.scala        | 7 +++----
 .../org/apache/spark/sql/connector/expressions/expressions.scala    | 4 ++--
 .../src/test/scala/org/apache/spark/sql/jdbc/JDBCV2Suite.scala      | 5 ++---
 6 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala b/core/src/main/scala/org/apache/spark/util/Utils.scala
index a9ceadc208dd..3b525cd69430 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -28,7 +28,7 @@ import java.nio.channels.Channels
 import java.nio.charset.StandardCharsets
 import java.nio.file.Files
 import java.security.SecureRandom
-import java.util.{Locale, Properties, Random, UUID}
+import java.util.{HexFormat, Locale, Properties, Random, UUID}
 import java.util.concurrent._
 import java.util.concurrent.TimeUnit.NANOSECONDS
 import java.util.zip.{GZIPInputStream, ZipInputStream}
@@ -48,7 +48,6 @@ import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache}
 import com.google.common.collect.Interners
 import com.google.common.net.InetAddresses
 import jakarta.ws.rs.core.UriBuilder
-import org.apache.commons.codec.binary.Hex
 import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileSystem, FileUtil, Path}
 import org.apache.hadoop.fs.audit.CommonAuditContext.currentAuditContext
@@ -2919,7 +2918,7 @@ private[spark] object Utils
     val rnd = new SecureRandom()
     val secretBytes = new Array[Byte](bits / JByte.SIZE)
     rnd.nextBytes(secretBytes)
-    Hex.encodeHexString(secretBytes)
+    HexFormat.of().formatHex(secretBytes)
   }

   /**
diff --git a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala
index 5f79c894c232..a6eb0fcc98b9 100644
--- a/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala
+++ b/resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/k8s/KubernetesUtils.scala
@@ -19,13 +19,12 @@ package org.apache.spark.deploy.k8s
 import java.io.{File, IOException}
 import java.net.URI
 import java.security.SecureRandom
-import java.util.{Collections, UUID}
+import java.util.{Collections, HexFormat, UUID}

 import scala.jdk.CollectionConverters._

 import io.fabric8.kubernetes.api.model.{Container, ContainerBuilder, ContainerStateRunning, ContainerStateTerminated, ContainerStateWaiting, ContainerStatus, EnvVar, EnvVarBuilder, EnvVarSourceBuilder, HasMetadata, OwnerReferenceBuilder, Pod, PodBuilder, Quantity}
 import io.fabric8.kubernetes.client.KubernetesClient
-import org.apache.commons.codec.binary.Hex
 import org.apache.hadoop.fs.{FileSystem, Path}

 import org.apache.spark.{SparkConf, SparkException}
@@ -241,7 +240,7 @@ object KubernetesUtils extends Logging {
     }
     val time = java.lang.Long.toHexString(clock.getTimeMillis() & 0xFFFFFFFFFFL)
-    Hex.encodeHexString(random) + time
+    HexFormat.of().formatHex(random) + time
   }

   /**
diff --git a/scalastyle-config.xml b/scalastyle-config.xml
index bc28bfe295a8..48362ca6f9a2 100644
--- a/scalastyle-config.xml
+++ b/scalastyle-config.xml
@@ -499,6 +499,11 @@ This file is divided into 3 sections:
     <customMessage>Use Utils.strip method instead</customMessage>
   </check>

+  <check customId="encodeHexString" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
+    <parameters><parameter name="regex">\bHex\.encodeHexString\b</parameter></parameters>
+    <customMessage>Use java.util.HexFormat instead</customMessage>
+  </check>
+
   <check customId="commonsiofileutils" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
     <parameters><parameter name="regex">org\.apache\.commons\.io\.FileUtils\b</parameter></parameters>
     <customMessage>Use Java API or Spark's JavaUtils/SparkSystemUtils/Utils instead</customMessage>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
index b1d8c636118d..c799415dfc70 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/literals.scala
@@ -30,14 +30,13 @@ import java.nio.charset.StandardCharsets
 import java.sql.{Date, Timestamp}
 import java.time.{Duration, Instant, LocalDate, LocalDateTime, LocalTime, Period, ZoneOffset}
 import java.util
-import java.util.Objects
+import java.util.{HexFormat, Objects}

 import scala.collection.{immutable, mutable}
 import scala.math.{BigDecimal, BigInt}
 import scala.reflect.runtime.universe.TypeTag
 import scala.util.Try

-import org.apache.commons.codec.binary.{Hex => ApacheHex}
 import org.json4s.JsonAST._

 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, FunctionIdentifier, InternalRow, ScalaReflection}
@@ -432,7 +431,7 @@ case class Literal (value: Any, dataType: DataType) extends LeafExpression {
   override def toString: String = value match {
     case null => "null"
-    case binary: Array[Byte] => "0x" + ApacheHex.encodeHexString(binary, false)
+    case binary: Array[Byte] => "0x" + HexFormat.of().withUpperCase().formatHex(binary)
     case d: ArrayBasedMapData => s"map(${d.toString})"
     case other =>
       dataType match {
@@ -578,7 +577,7 @@ case class Literal (value: Any, dataType: DataType) extends LeafExpression {
       s"TIMESTAMP_NTZ '$toString'"
     case (i: CalendarInterval, CalendarIntervalType) =>
'${i.toString}'" - case (v: Array[Byte], BinaryType) => s"X'${ApacheHex.encodeHexString(v, false)}'" + case (v: Array[Byte], BinaryType) => s"X'${HexFormat.of().withUpperCase().formatHex(v)}'" case (i: Long, DayTimeIntervalType(startField, endField)) => toDayTimeIntervalString(i, ANSI_STYLE, startField, endField) case (i: Int, YearMonthIntervalType(startField, endField)) => diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/expressions/expressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/expressions/expressions.scala index d9d91e187e8a..18d94969aa27 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/expressions/expressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/connector/expressions/expressions.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.connector.expressions -import org.apache.commons.codec.binary.Hex +import java.util.HexFormat import org.apache.spark.SparkException import org.apache.spark.sql.catalyst @@ -393,7 +393,7 @@ private[sql] final case class LiteralValue[T](value: T, dataType: DataType) exte case BinaryType => assert(value.isInstanceOf[Array[Byte]]) val bytes = value.asInstanceOf[Array[Byte]] - "0x" + Hex.encodeHexString(bytes, false) + "0x" + HexFormat.of().withUpperCase().formatHex(bytes) case _ => s"$value" } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCV2Suite.scala b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCV2Suite.scala index d19585a3b8f2..c6a48d98fa98 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCV2Suite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/jdbc/JDBCV2Suite.scala @@ -18,11 +18,10 @@ package org.apache.spark.sql.jdbc import java.sql.{Connection, DriverManager} -import java.util.Properties +import java.util.{HexFormat, Properties} import scala.util.control.NonFatal -import org.apache.commons.codec.binary.Hex import test.org.apache.spark.sql.connector.catalog.functions.JavaStrLen.JavaStrLenStaticMagic import org.apache.spark.{SparkConf, SparkException, SparkIllegalArgumentException} @@ -3103,7 +3102,7 @@ class JDBCV2Suite extends QueryTest with SharedSparkSession with ExplainSuiteHel } test("SPARK-50792: Format binary data as a binary literal in JDBC.") { - val hexBinary = Hex.encodeHexString(testBytes, false) + val hexBinary = HexFormat.of().withUpperCase().formatHex(testBytes) val binary = "X'" + hexBinary + "'" val df = sql(s"SELECT * FROM h2.test.binary_tab WHERE b = $binary") checkFiltersRemoved(df) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org