(avro) branch main updated: AVRO-4060: Use JDK to Hash Byte Array in UTF8 (#3175)

dmollitor Sat, 01 Feb 2025 19:44:23 -0800

This is an automated email from the ASF dual-hosted git repository.

dmollitor pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/avro.git



The following commit(s) were added to refs/heads/main by this push:
     new b61427fdb AVRO-4060: Use JDK to Hash Byte Array in UTF8 (#3175)
b61427fdb is described below

commit b61427fdb9bae00cf4d14225b40c3399c7c823aa
Author: belugabehr <[email protected]>
AuthorDate: Sat Feb 1 22:43:47 2025 -0500

    AVRO-4060: Use JDK to Hash Byte Array in UTF8 (#3175)
---
 .../src/main/java/org/apache/avro/util/Utf8.java     | 17 ++++++++++++++---
 .../src/test/java/org/apache/avro/util/TestUtf8.java | 20 ++++++++++++++++++++
 2 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java b/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java
index 22c21c76b..b609e166c 100644
--- a/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java
+++ b/lang/java/avro/src/main/java/org/apache/avro/util/Utf8.java
@@ -68,6 +68,11 @@ public class Utf8 implements Comparable<Utf8>, CharSequence, Externalizable {
     this.length = length;
   }
 
+  Utf8(String string, int length) {
+    this(string);
+    this.length = length;
+  }
+
   /**
    * Return UTF-8 encoded bytes. Only valid through {@link #getByteLength()}
    * assuming the bytes have been fully copied into the underlying buffer from the
@@ -173,9 +178,15 @@ public class Utf8 implements Comparable<Utf8>, CharSequence, Externalizable {
     if (h == 0) {
       byte[] bytes = this.bytes;
       int length = this.length;
-      h = 1;
-      for (int i = 0; i < length; i++) {
-        h = h * 31 + bytes[i];
+      // If the array is filled, use the underlying JDK hash functionality.
+      // Starting with JDK 21, the underlying implementation is vectorized.
+      if (length > 7 && bytes.length == length) {
+        h = Arrays.hashCode(bytes);
+      } else {
+        h = 1;
+        for (int i = 0; i < length; i++) {
+          h = h * 31 + bytes[i];
+        }
       }
       this.hash = h;
     }
diff --git a/lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java b/lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java
index 91618ca5e..3e36d9a02 100644
--- a/lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java
+++ b/lang/java/avro/src/test/java/org/apache/avro/util/TestUtf8.java
@@ -99,6 +99,26 @@ public class TestUtf8 {
     assertEquals(4122302, u.hashCode());
   }
 
+  /**
+   * There are two different code paths that hashcode() can call depending on the
+   * state of the internal buffer. If the buffer is full (string length is equal
+   * to buffer length) then the JDK hashcode function can be used. However, if the
+   * buffer is not full (string length is less than the internal buffer length),
+   * then the JDK does not support this prior to JDK 23 and a scalar
+   * implementation is the only option today. This difference can be resolved with
+   * JDK 23 as it supports both cases.
+   */
+  @Test
+  void hashCodeBasedOnCapacity() {
+    // string = 8; buffer = 8
+    Utf8 fullCapacity = new Utf8("abcdefgh", 8);
+
+    // string = 8; buffer = 9
+    Utf8 partialCapacity = new Utf8("abcdefghX", 8);
+
+    assertEquals(fullCapacity.hashCode(), partialCapacity.hashCode());
+  }
+
   @Test
   void oversizeUtf8() {
     Utf8 u = new Utf8();

(avro) branch main updated: AVRO-4060: Use JDK to Hash Byte Array in UTF8 (#3175)

Reply via email to