This is an automated email from the ASF dual-hosted git repository.
dmollitor pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/avro.git
The following commit(s) were added to refs/heads/main by this push:
new b61427fdb AVRO-4060: Use JDK to Hash Byte Array in UTF8 (#3175)
b61427fdb is described below
commit b61427fdb9bae00cf4d14225b40c3399c7c823aa
Author: belugabehr <[email protected]>
AuthorDate: Sat Feb 1 22:43:47 2025 -0500
AVRO-4060: Use JDK to Hash Byte Array in UTF8 (#3175)
---
.../src/main/java/org/apache/avro/util/Utf8.java | 17 ++++++++++++++---
.../src/test/java/org/apache/avro/util/TestUtf8.java | 20 ++++++++++++++++++++
2 files changed, 34 insertions(+), 3 deletions(-)
diff --git a/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java b/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java
index 22c21c76b..b609e166c 100644
--- a/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java
+++ b/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java
@@ -68,6 +68,11 @@ public class Utf8 implements Comparable<Utf8>, CharSequence, Externalizable {
this.length = length;
}
+ Utf8(String string, int length) {
+ this(string);
+ this.length = length;
+ }
+
/**
* Return UTF-8 encoded bytes. Only valid through {@link #getByteLength()}
* assuming the bytes have been fully copied into the underlying buffer from the
@@ -173,9 +178,15 @@ public class Utf8 implements Comparable<Utf8>, CharSequence, Externalizable {
if (h == 0) {
byte[] bytes = this.bytes;
int length = this.length;
- h = 1;
- for (int i = 0; i < length; i++) {
- h = h * 31 + bytes[i];
+ // If the array is filled, use the underlying JDK hash functionality.
+ // Starting with JDK 21, the underlying implementation is vectorized.
+ if (length > 7 && bytes.length == length) {
+ h = Arrays.hashCode(bytes);
+ } else {
+ h = 1;
+ for (int i = 0; i < length; i++) {
+ h = h * 31 + bytes[i];
+ }
}
this.hash = h;
}
diff --git a/lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java b/lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java
index 91618ca5e..3e36d9a02 100644
--- a/lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java
+++ b/lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java
@@ -99,6 +99,26 @@ public class TestUtf8 {
assertEquals(4122302, u.hashCode());
}
+ /**
+ * There are two different code paths that hashcode() can call depending on the
+ * state of the internal buffer. If the buffer is full (string length is equal
+ * to buffer length) then the JDK hashcode function can be used. However, if the
+ * buffer is not full (string length is less than the internal buffer length),
+ * then the JDK does not support this prior to JDK 23 and a scalar
+ * implementation is the only option today. This difference can be resolved with
+ * JDK 23 as it supports both cases.
+ */
+ @Test
+ void hashCodeBasedOnCapacity() {
+ // string = 8; buffer = 8
+ Utf8 fullCapacity = new Utf8("abcdefgh", 8);
+
+ // string = 8; buffer = 9
+ Utf8 partialCapacity = new Utf8("abcdefghX", 8);
+
+ assertEquals(fullCapacity.hashCode(), partialCapacity.hashCode());
+ }
+
@Test
void oversizeUtf8() {
Utf8 u = new Utf8();