This is an automated email from the ASF dual-hosted git repository.

aherbert pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-codec.git

commit 48b23313e7c7428fc421942352c8dba4d1e35200
Author: aherbert <aherb...@apache.org>
AuthorDate: Thu Nov 21 16:54:34 2019 +0000

    Fix MurmurHash2 code in-line with MurmurHash3.
    
    This updates the documentation and code so the two can be compared.
    
    No binary functionality is changed.
---
 .../apache/commons/codec/digest/MurmurHash2.java   | 174 ++++++++++++++++-----
 .../apache/commons/codec/digest/MurmurHash3.java   |   3 +-
 2 files changed, 137 insertions(+), 40 deletions(-)

diff --git a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java 
b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
index 0e07bfe..e349ff9 100644
--- a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
+++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java
@@ -18,30 +18,42 @@
 package org.apache.commons.codec.digest;
 
 /**
- * MurmurHash2 yields a 32-bit or 64-bit value.
+ * Implementation of the MurmurHash2 32-bit and 64-bit hash functions.
  *
- * MurmurHash is a non-cryptographic hash function suitable for general
+ * <p>MurmurHash is a non-cryptographic hash function suitable for general
  * hash-based lookup. The name comes from two basic operations, multiply (MU)
  * and rotate (R), used in its inner loop. Unlike cryptographic hash functions,
  * it is not specifically designed to be difficult to reverse by an adversary,
- * making it unsuitable for cryptographic purposes.
+ * making it unsuitable for cryptographic purposes.</p>
  *
- * This is a re-implementation of the original C code plus some additional
- * features.
+ * <p>This contains a Java port of the 32-bit hash function {@code MurmurHash2}
+ * and the 64-bit hash function {@code MurmurHash64A} from Austin Applyby's
+ * original {@code c++} code in SMHasher.</p>
  *
- * Public domain.
+ * <p>This is a re-implementation of the original C code plus some additional
+ * features.</p>
+ *
+ * <p>This is public domain code with no copyrights. From homepage of
+ * <a href="https://github.com/aappleby/smhasher";>SMHasher</a>:</p>
+ *
+ * <blockquote>
+ * "All MurmurHash versions are public domain software, and the author
+ * disclaims all copyright to their code."
+ * </blockquote>
  *
  * @see <a href="https://en.wikipedia.org/wiki/MurmurHash";>MurmurHash</a>
+ * @see <a 
href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp";>
+ *   Original MurmurHash2 c++ code</a>
  * @since 1.13
  */
 public final class MurmurHash2 {
 
-    // all methods static; private constructor.
+    /** No instance methods. */
     private MurmurHash2() {
     }
 
     /**
-     * Generates 32 bit hash from byte array of the given length and seed.
+     * Generates 32 bit hash from byte array with the given length and seed.
      *
      * @param data   byte array to hash
      * @param length length of the array to hash
@@ -56,12 +68,14 @@ public final class MurmurHash2 {
 
         // Initialize the hash to a random value
         int h = seed ^ length;
-        final int length4 = length / 4;
 
-        for (int i = 0; i < length4; i++) {
-            final int i4 = i * 4;
-            int k = (data[i4 + 0] & 0xff) + ((data[i4 + 1] & 0xff) << 8) + 
((data[i4 + 2] & 0xff) << 16)
-                    + ((data[i4 + 3] & 0xff) << 24);
+        // Mix 4 bytes at a time into the hash
+        final int nblocks = length >> 2;
+
+        // body
+        for (int i = 0; i < nblocks; i++) {
+            final int index = (i << 2);
+            int k = getLittleEndianInt(data, index);
             k *= m;
             k ^= k >>> r;
             k *= m;
@@ -70,16 +84,19 @@ public final class MurmurHash2 {
         }
 
         // Handle the last few bytes of the input array
-        switch (length % 4) {
+        final int index = (nblocks << 2);
+        switch (length - index) {
         case 3:
-            h ^= (data[(length & ~3) + 2] & 0xff) << 16;
+            h ^= (data[index + 2] & 0xff) << 16;
         case 2:
-            h ^= (data[(length & ~3) + 1] & 0xff) << 8;
+            h ^= (data[index + 1] & 0xff) << 8;
         case 1:
-            h ^= (data[length & ~3] & 0xff);
+            h ^= (data[index] & 0xff);
             h *= m;
         }
 
+        // Do a few final mixes of the hash to ensure the last few
+        // bytes are well-incorporated.
         h ^= h >>> 13;
         h *= m;
         h ^= h >>> 15;
@@ -88,21 +105,37 @@ public final class MurmurHash2 {
     }
 
     /**
-     * Generates 32 bit hash from byte array with default seed value.
+     * Generates 32 bit hash from byte array with the given length and a 
default seed value.
+     * This is a helper method that will produce the same result as:
+     *
+     * <pre>
+     * int seed = 0x9747b28c;
+     * int hash = MurmurHash2.hash32(data, offset, length, seed);
+     * </pre>
      *
      * @param data   byte array to hash
      * @param length length of the array to hash
      * @return 32 bit hash of the given array
+     * @see #hash32(byte[], int, int)
      */
     public static int hash32(final byte[] data, final int length) {
         return hash32(data, length, 0x9747b28c);
     }
 
     /**
-     * Generates 32 bit hash from a string.
+     * Generates 32 bit hash from a string with a default seed.
+     * The string is converted to bytes using the default encoding.
+     * This is a helper method that will produce the same result as:
+     *
+     * <pre>
+     * int seed = 0x9747b28c;
+     * byte[] bytes = data.getBytes();
+     * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
+     * </pre>
      *
      * @param text string to hash
      * @return 32 bit hash of the given string
+     * @see #hash32(byte[], int, int)
      */
     public static int hash32(final String text) {
         final byte[] bytes = text.getBytes();
@@ -110,12 +143,21 @@ public final class MurmurHash2 {
     }
 
     /**
-     * Generates 32 bit hash from a substring.
+     * Generates 32 bit hash from a substring with a default seed value.
+     * The string is converted to bytes using the default encoding.
+     * This is a helper method that will produce the same result as:
+     *
+     * <pre>
+     * int seed = 0x9747b28c;
+     * byte[] bytes = text.substring(from, from + length).getBytes();
+     * int hash = MurmurHash2.hash32(bytes, bytes.length, seed);
+     * </pre>
      *
      * @param text   string to hash
      * @param from   starting index
      * @param length length of the substring to hash
      * @return 32 bit hash of the given string
+     * @see #hash32(byte[], int, int)
      */
     public static int hash32(final String text, final int from, final int 
length) {
         return hash32(text.substring(from, from + length));
@@ -135,14 +177,12 @@ public final class MurmurHash2 {
 
         long h = (seed & 0xffffffffl) ^ (length * m);
 
-        final int length8 = length / 8;
+        final int nblocks = length >> 3;
 
-        for (int i = 0; i < length8; i++) {
-            final int i8 = i * 8;
-            long k = ((long) data[i8 + 0] & 0xff) + (((long) data[i8 + 1] & 
0xff) << 8)
-                    + (((long) data[i8 + 2] & 0xff) << 16) + (((long) data[i8 
+ 3] & 0xff) << 24)
-                    + (((long) data[i8 + 4] & 0xff) << 32) + (((long) data[i8 
+ 5] & 0xff) << 40)
-                    + (((long) data[i8 + 6] & 0xff) << 48) + (((long) data[i8 
+ 7] & 0xff) << 56);
+        // body
+        for (int i = 0; i < nblocks; i++) {
+            final int index = (i << 3);
+            long k = getLittleEndianLong(data, index);
 
             k *= m;
             k ^= k >>> r;
@@ -152,21 +192,22 @@ public final class MurmurHash2 {
             h *= m;
         }
 
-        switch (length % 8) {
+        final int index = (nblocks << 3);
+        switch (length - index) {
         case 7:
-            h ^= (long) (data[(length & ~7) + 6] & 0xff) << 48;
+            h ^= ((long) data[index + 6] & 0xff) << 48;
         case 6:
-            h ^= (long) (data[(length & ~7) + 5] & 0xff) << 40;
+            h ^= ((long) data[index + 5] & 0xff) << 40;
         case 5:
-            h ^= (long) (data[(length & ~7) + 4] & 0xff) << 32;
+            h ^= ((long) data[index + 4] & 0xff) << 32;
         case 4:
-            h ^= (long) (data[(length & ~7) + 3] & 0xff) << 24;
+            h ^= ((long) data[index + 3] & 0xff) << 24;
         case 3:
-            h ^= (long) (data[(length & ~7) + 2] & 0xff) << 16;
+            h ^= ((long) data[index + 2] & 0xff) << 16;
         case 2:
-            h ^= (long) (data[(length & ~7) + 1] & 0xff) << 8;
+            h ^= ((long) data[index + 1] & 0xff) << 8;
         case 1:
-            h ^= data[length & ~7] & 0xff;
+            h ^= ((long) data[index] & 0xff);
             h *= m;
         }
 
@@ -178,21 +219,37 @@ public final class MurmurHash2 {
     }
 
     /**
-     * Generates 64 bit hash from byte array with default seed value.
+     * Generates 64 bit hash from byte array with given length and a default 
seed value.
+     * This is a helper method that will produce the same result as:
+     *
+     * <pre>
+     * int seed = 0xe17a1465;
+     * int hash = MurmurHash2.hash64(data, offset, length, seed);
+     * </pre>
      *
      * @param data   byte array to hash
      * @param length length of the array to hash
      * @return 64 bit hash of the given string
+     * @see #hash64(byte[], int, int)
      */
     public static long hash64(final byte[] data, final int length) {
         return hash64(data, length, 0xe17a1465);
     }
 
     /**
-     * Generates 64 bit hash from a string.
+     * Generates 64 bit hash from a string with a default seed.
+     * The string is converted to bytes using the default encoding.
+     * This is a helper method that will produce the same result as:
+     *
+     * <pre>
+     * int seed = 0xe17a1465;
+     * byte[] bytes = data.getBytes();
+     * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
+     * </pre>
      *
      * @param text string to hash
      * @return 64 bit hash of the given string
+     * @see #hash64(byte[], int, int)
      */
     public static long hash64(final String text) {
         final byte[] bytes = text.getBytes();
@@ -200,14 +257,55 @@ public final class MurmurHash2 {
     }
 
     /**
-     * Generates 64 bit hash from a substring.
+     * Generates 64 bit hash from a substring with a default seed value.
+     * The string is converted to bytes using the default encoding.
+     * This is a helper method that will produce the same result as:
+     *
+     * <pre>
+     * int seed = 0xe17a1465;
+     * byte[] bytes = text.substring(from, from + length).getBytes();
+     * int hash = MurmurHash2.hash64(bytes, bytes.length, seed);
+     * </pre>
      *
      * @param text   string to hash
      * @param from   starting index
      * @param length length of the substring to hash
      * @return 64 bit hash of the given array
+     * @see #hash64(byte[], int, int)
      */
     public static long hash64(final String text, final int from, final int 
length) {
         return hash64(text.substring(from, from + length));
     }
+
+    /**
+     * Gets the little-endian int from 4 bytes starting at the specified index.
+     *
+     * @param data The data
+     * @param index The index
+     * @return The little-endian int
+     */
+    private static int getLittleEndianInt(final byte[] data, final int index) {
+        return ((data[index    ] & 0xff)      ) |
+               ((data[index + 1] & 0xff) <<  8) |
+               ((data[index + 2] & 0xff) << 16) |
+               ((data[index + 3] & 0xff) << 24);
+    }
+
+    /**
+     * Gets the little-endian long from 8 bytes starting at the specified 
index.
+     *
+     * @param data The data
+     * @param index The index
+     * @return The little-endian long
+     */
+    private static long getLittleEndianLong(final byte[] data, final int 
index) {
+        return (((long) data[index    ] & 0xff)      ) |
+               (((long) data[index + 1] & 0xff) <<  8) |
+               (((long) data[index + 2] & 0xff) << 16) |
+               (((long) data[index + 3] & 0xff) << 24) |
+               (((long) data[index + 4] & 0xff) << 32) |
+               (((long) data[index + 5] & 0xff) << 40) |
+               (((long) data[index + 6] & 0xff) << 48) |
+               (((long) data[index + 7] & 0xff) << 56);
+    }
 }
diff --git a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java 
b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
index 1fda9ac..513ace8 100644
--- a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
+++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
@@ -18,7 +18,7 @@
 package org.apache.commons.codec.digest;
 
 /**
- * MurmurHash3 yields a 32-bit or 128-bit value.
+ * Implementation of the MurmurHash3 32-bit and 128-bit hash functions.
  *
  * <p>MurmurHash is a non-cryptographic hash function suitable for general
  * hash-based lookup. The name comes from two basic operations, multiply (MU)
@@ -860,7 +860,6 @@ public final class MurmurHash3 {
         return new long[] { h1, h2 };
     }
 
-
     /**
      * Gets the little-endian long from 8 bytes starting at the specified 
index.
      *

Reply via email to