Skip to content

Commit 22d63c7

Browse files
committed
[SPARK-56492][CORE][SQL] Support digestToHexString/md5Hex/sha256Hex in JavaUtils
### What changes were proposed in this pull request? This PR replaces `org.apache.commons.codec.digest.DigestUtils` with JDK's `java.security.MessageDigest` and `java.util.HexFormat` by adding `digestToHexString/md5Hex/sha256Hex` utility methods to `JavaUtils`. It also adds a Scalastyle rule to ban `org.apache.commons.codec.digest` imports going forward. ### Why are the changes needed? Apache Spark can use JDK built-in APIs (`java.security.MessageDigest` + `java.util.HexFormat`) instead of the third-party `commons-codec` library for SHA digest computation. This reduces external dependency usage where standard library alternatives exist. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Pass the CIs. ### Was this patch authored or co-authored using generative AI tooling? Generated-by: Claude Code (claude-opus-4-6) Closes #55353 from dongjoon-hyun/SPARK-56492. Authored-by: Dongjoon Hyun <dongjoon@apache.org> Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
1 parent a4ebe27 commit 22d63c7

7 files changed

Lines changed: 84 additions & 30 deletions

File tree

common/utils-java/src/main/java/org/apache/spark/network/util/JavaUtils.java

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
import java.nio.file.SimpleFileVisitor;
3232
import java.nio.file.StandardCopyOption;
3333
import java.nio.file.attribute.BasicFileAttributes;
34+
import java.security.MessageDigest;
35+
import java.security.NoSuchAlgorithmException;
3436
import java.util.*;
3537
import java.util.concurrent.TimeUnit;
3638
import java.util.concurrent.atomic.AtomicLong;
@@ -767,4 +769,58 @@ public static void checkState(boolean check, String msg, Object... args) {
767769
throw new IllegalStateException(String.format(msg, args));
768770
}
769771
}
772+
773+
private static final HexFormat LOWERCASE_HEX = HexFormat.of();
774+
775+
/**
776+
* Computes the digest of the input bytes using the given algorithm
777+
* and returns the result as a lowercase hex string.
778+
*/
779+
public static String digestToHexString(String algorithm, byte[] input) {
780+
try {
781+
return LOWERCASE_HEX.formatHex(MessageDigest.getInstance(algorithm).digest(input));
782+
} catch (NoSuchAlgorithmException e) {
783+
throw new RuntimeException(e);
784+
}
785+
}
786+
787+
/**
788+
* Computes the digest of the input string using the given algorithm
789+
* and returns the result as a lowercase hex string.
790+
*/
791+
public static String digestToHexString(String algorithm, String input) {
792+
return digestToHexString(algorithm, input.getBytes(StandardCharsets.UTF_8));
793+
}
794+
795+
/**
796+
* Computes the MD5 digest of the input bytes
797+
* and returns the result as a lowercase hex string.
798+
*/
799+
public static String md5Hex(byte[] input) {
800+
return digestToHexString("MD5", input);
801+
}
802+
803+
/**
804+
* Computes the MD5 digest of the input string
805+
* and returns the result as a lowercase hex string.
806+
*/
807+
public static String md5Hex(String input) {
808+
return digestToHexString("MD5", input);
809+
}
810+
811+
/**
812+
* Computes the SHA-256 digest of the input bytes
813+
* and returns the result as a lowercase hex string.
814+
*/
815+
public static String sha256Hex(byte[] input) {
816+
return digestToHexString("SHA-256", input);
817+
}
818+
819+
/**
820+
* Computes the SHA-256 digest of the input string
821+
* and returns the result as a lowercase hex string.
822+
*/
823+
public static String sha256Hex(String input) {
824+
return digestToHexString("SHA-256", input);
825+
}
770826
}

scalastyle-config.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -629,6 +629,11 @@ This file is divided into 3 sections:
629629
<customMessage>Please use Apache Log4j 2 instead.</customMessage>
630630
</check>
631631

632+
<!-- Ban commons-codec digest utilities: JavaUtils provides JDK-based replacements. -->
<check level="error" class="org.scalastyle.scalariform.IllegalImportsChecker" enabled="true">
  <parameters><parameter name="illegalImports"><![CDATA[org.apache.commons.codec.digest]]></parameter></parameters>
  <customMessage>Please use org.apache.spark.network.util.JavaUtils.digestToHexString/md5Hex/sha256Hex instead.</customMessage>
</check>
636+
632637

633638
<!-- ================================================================================ -->
634639
<!-- rules we'd like to enforce, but haven't cleaned up the codebase yet -->

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala

Lines changed: 15 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,8 @@ import java.util.zip.CRC32
2323

2424
import scala.annotation.tailrec
2525

26-
import org.apache.commons.codec.digest.DigestUtils
27-
import org.apache.commons.codec.digest.MessageDigestAlgorithms
28-
26+
import org.apache.spark.network.util.JavaUtils
27+
import org.apache.spark.network.util.JavaUtils.{digestToHexString, md5Hex, sha256Hex}
2928
import org.apache.spark.sql.catalyst.InternalRow
3029
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
3130
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch
@@ -72,11 +71,11 @@ case class Md5(child: Expression)
7271
override def contextIndependentFoldable: Boolean = child.contextIndependentFoldable
7372

7473
protected override def nullSafeEval(input: Any): Any =
75-
UTF8String.fromString(DigestUtils.md5Hex(input.asInstanceOf[Array[Byte]]))
74+
UTF8String.fromString(md5Hex(input.asInstanceOf[Array[Byte]]))
7675

7776
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
7877
defineCodeGen(ctx, ev, c =>
79-
s"UTF8String.fromString(${classOf[DigestUtils].getName}.md5Hex($c))")
78+
s"UTF8String.fromString(${classOf[JavaUtils].getName}.md5Hex($c))")
8079
}
8180

8281
override protected def withNewChildInternal(newChild: Expression): Md5 = copy(child = newChild)
@@ -122,35 +121,29 @@ case class Sha2(left: Expression, right: Expression)
122121
val input = input1.asInstanceOf[Array[Byte]]
123122
bitLength match {
124123
case 224 =>
125-
UTF8String.fromString(
126-
new DigestUtils(MessageDigestAlgorithms.SHA_224).digestAsHex(input))
124+
UTF8String.fromString(digestToHexString("SHA-224", input))
127125
case 256 | 0 =>
128-
UTF8String.fromString(DigestUtils.sha256Hex(input))
126+
UTF8String.fromString(sha256Hex(input))
129127
case 384 =>
130-
UTF8String.fromString(DigestUtils.sha384Hex(input))
128+
UTF8String.fromString(digestToHexString("SHA-384", input))
131129
case 512 =>
132-
UTF8String.fromString(DigestUtils.sha512Hex(input))
130+
UTF8String.fromString(digestToHexString("SHA-512", input))
133131
case _ => null
134132
}
135133
}
136134

137135
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
138-
val digestUtils = classOf[DigestUtils].getName
139-
val messageDigestAlgorithms = classOf[MessageDigestAlgorithms].getName
136+
val javaUtils = classOf[JavaUtils].getName
140137
nullSafeCodeGen(ctx, ev, (eval1, eval2) => {
141138
s"""
142139
if ($eval2 == 224) {
143-
${ev.value} = UTF8String.fromString(
144-
new $digestUtils($messageDigestAlgorithms.SHA_224).digestAsHex($eval1));
140+
${ev.value} = UTF8String.fromString($javaUtils.digestToHexString("SHA-224", $eval1));
145141
} else if ($eval2 == 256 || $eval2 == 0) {
146-
${ev.value} =
147-
UTF8String.fromString($digestUtils.sha256Hex($eval1));
142+
${ev.value} = UTF8String.fromString($javaUtils.sha256Hex($eval1));
148143
} else if ($eval2 == 384) {
149-
${ev.value} =
150-
UTF8String.fromString($digestUtils.sha384Hex($eval1));
144+
${ev.value} = UTF8String.fromString($javaUtils.digestToHexString("SHA-384", $eval1));
151145
} else if ($eval2 == 512) {
152-
${ev.value} =
153-
UTF8String.fromString($digestUtils.sha512Hex($eval1));
146+
${ev.value} = UTF8String.fromString($javaUtils.digestToHexString("SHA-512", $eval1));
154147
} else {
155148
${ev.isNull} = true;
156149
}
@@ -186,11 +179,11 @@ case class Sha1(child: Expression)
186179
override def contextIndependentFoldable: Boolean = child.contextIndependentFoldable
187180

188181
protected override def nullSafeEval(input: Any): Any =
189-
UTF8String.fromString(DigestUtils.sha1Hex(input.asInstanceOf[Array[Byte]]))
182+
UTF8String.fromString(digestToHexString("SHA-1", input.asInstanceOf[Array[Byte]]))
190183

191184
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
192185
defineCodeGen(ctx, ev, c =>
193-
s"UTF8String.fromString(${classOf[DigestUtils].getName}.sha1Hex($c))"
186+
s"""UTF8String.fromString(${classOf[JavaUtils].getName}.digestToHexString("SHA-1", $c))"""
194187
)
195188
}
196189

sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/HashExpressionsSuite.scala

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,10 @@ import java.time.{Duration, LocalTime, Period, ZoneId, ZoneOffset}
2323
import scala.collection.mutable.ArrayBuffer
2424
import scala.language.implicitConversions
2525

26-
import org.apache.commons.codec.digest.DigestUtils
2726
import org.scalatest.exceptions.TestFailedException
2827

2928
import org.apache.spark.SparkFunSuite
29+
import org.apache.spark.network.util.JavaUtils.{digestToHexString, sha256Hex}
3030
import org.apache.spark.sql.{RandomDataGenerator, Row}
3131
import org.apache.spark.sql.catalyst.InternalRow
3232
import org.apache.spark.sql.catalyst.encoders.{ExamplePointUDT, ExpressionEncoder}
@@ -65,13 +65,13 @@ class HashExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
6565
checkEvaluation(Sha2(Literal("ABC".getBytes(StandardCharsets.UTF_8)), Literal(224)),
6666
"107c5072b799c4771f328304cfe1ebb375eb6ea7f35a3aa753836fad")
6767
checkEvaluation(Sha2(Literal("ABC".getBytes(StandardCharsets.UTF_8)), Literal(0)),
68-
DigestUtils.sha256Hex("ABC"))
68+
sha256Hex("ABC"))
6969
checkEvaluation(Sha2(Literal("ABC".getBytes(StandardCharsets.UTF_8)), Literal(256)),
70-
DigestUtils.sha256Hex("ABC"))
70+
sha256Hex("ABC"))
7171
checkEvaluation(Sha2(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType), Literal(384)),
72-
DigestUtils.sha384Hex(Array[Byte](1, 2, 3, 4, 5, 6)))
72+
digestToHexString("SHA-384", Array[Byte](1, 2, 3, 4, 5, 6)))
7373
checkEvaluation(Sha2(Literal("ABC".getBytes(StandardCharsets.UTF_8)), Literal(512)),
74-
DigestUtils.sha512Hex("ABC"))
74+
digestToHexString("SHA-512", "ABC"))
7575
// unsupported bit length
7676
checkEvaluation(Sha2(Literal.create(null, BinaryType), Literal(1024)), null)
7777
// null input and valid bit length

sql/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/ArtifactSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@ import java.util.zip.CRC32
2626
import com.google.protobuf.ByteString
2727
import io.grpc.{ManagedChannel, Server}
2828
import io.grpc.inprocess.{InProcessChannelBuilder, InProcessServerBuilder}
29-
import org.apache.commons.codec.digest.DigestUtils.sha256Hex
3029

3130
import org.apache.spark.connect.proto.AddArtifactsRequest
31+
import org.apache.spark.network.util.JavaUtils.sha256Hex
3232
import org.apache.spark.sql.Artifact
3333
import org.apache.spark.sql.connect.client.SparkConnectClient.Configuration
3434
import org.apache.spark.sql.connect.test.ConnectFunSuite

sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/ArtifactManager.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,12 @@ import scala.util.control.NonFatal
3232
import com.google.protobuf.ByteString
3333
import io.grpc.StatusRuntimeException
3434
import io.grpc.stub.StreamObserver
35-
import org.apache.commons.codec.digest.DigestUtils.sha256Hex
3635

3736
import org.apache.spark.SparkException
3837
import org.apache.spark.connect.proto
3938
import org.apache.spark.connect.proto.AddArtifactsResponse
4039
import org.apache.spark.connect.proto.AddArtifactsResponse.ArtifactSummary
40+
import org.apache.spark.network.util.JavaUtils.sha256Hex
4141
import org.apache.spark.sql.Artifact
4242
import org.apache.spark.sql.Artifact.{newCacheArtifact, newIvyArtifacts}
4343
import org.apache.spark.util.{SparkFileUtils, SparkStringUtils, SparkThreadUtils}

sql/connect/server/src/test/scala/org/apache/spark/sql/connect/service/ArtifactStatusesHandlerSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,10 @@ import scala.concurrent.duration._
2323
import scala.jdk.CollectionConverters._
2424

2525
import io.grpc.stub.StreamObserver
26-
import org.apache.commons.codec.digest.DigestUtils.sha256Hex
2726

2827
import org.apache.spark.connect.proto
2928
import org.apache.spark.connect.proto.ArtifactStatusesResponse
29+
import org.apache.spark.network.util.JavaUtils.sha256Hex
3030
import org.apache.spark.sql.connect.ResourceHelper
3131
import org.apache.spark.sql.test.SharedSparkSession
3232
import org.apache.spark.util.ThreadUtils

0 commit comments

Comments
 (0)