This is an automated email from the ASF dual-hosted git repository. aherbert pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-codec.git
commit 48b23313e7c7428fc421942352c8dba4d1e35200 Author: aherbert <aherb...@apache.org> AuthorDate: Thu Nov 21 16:54:34 2019 +0000 Fix MurmurHash2 code in-line with MurmurHash3. This updates the documentation and code so the two can be compared. No binary functionality is changed. --- .../apache/commons/codec/digest/MurmurHash2.java | 174 ++++++++++++++++----- .../apache/commons/codec/digest/MurmurHash3.java | 3 +- 2 files changed, 137 insertions(+), 40 deletions(-) diff --git a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java index 0e07bfe..e349ff9 100644 --- a/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java +++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash2.java @@ -18,30 +18,42 @@ package org.apache.commons.codec.digest; /** - * MurmurHash2 yields a 32-bit or 64-bit value. + * Implementation of the MurmurHash2 32-bit and 64-bit hash functions. * - * MurmurHash is a non-cryptographic hash function suitable for general + * <p>MurmurHash is a non-cryptographic hash function suitable for general * hash-based lookup. The name comes from two basic operations, multiply (MU) * and rotate (R), used in its inner loop. Unlike cryptographic hash functions, * it is not specifically designed to be difficult to reverse by an adversary, - * making it unsuitable for cryptographic purposes. + * making it unsuitable for cryptographic purposes.</p> * - * This is a re-implementation of the original C code plus some additional - * features. + * <p>This contains a Java port of the 32-bit hash function {@code MurmurHash2} + * and the 64-bit hash function {@code MurmurHash64A} from Austin Applyby's + * original {@code c++} code in SMHasher.</p> * - * Public domain. + * <p>This is a re-implementation of the original C code plus some additional + * features.</p> + * + * <p>This is public domain code with no copyrights. From homepage of + * <a href="https://github.com/aappleby/smhasher">SMHasher</a>:</p> + * + * <blockquote> + * "All MurmurHash versions are public domain software, and the author + * disclaims all copyright to their code." + * </blockquote> * * @see <a href="https://en.wikipedia.org/wiki/MurmurHash">MurmurHash</a> + * @see <a href="https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp"> + * Original MurmurHash2 c++ code</a> * @since 1.13 */ public final class MurmurHash2 { - // all methods static; private constructor. + /** No instance methods. */ private MurmurHash2() { } /** - * Generates 32 bit hash from byte array of the given length and seed. + * Generates 32 bit hash from byte array with the given length and seed. * * @param data byte array to hash * @param length length of the array to hash @@ -56,12 +68,14 @@ public final class MurmurHash2 { // Initialize the hash to a random value int h = seed ^ length; - final int length4 = length / 4; - for (int i = 0; i < length4; i++) { - final int i4 = i * 4; - int k = (data[i4 + 0] & 0xff) + ((data[i4 + 1] & 0xff) << 8) + ((data[i4 + 2] & 0xff) << 16) - + ((data[i4 + 3] & 0xff) << 24); + // Mix 4 bytes at a time into the hash + final int nblocks = length >> 2; + + // body + for (int i = 0; i < nblocks; i++) { + final int index = (i << 2); + int k = getLittleEndianInt(data, index); k *= m; k ^= k >>> r; k *= m; @@ -70,16 +84,19 @@ public final class MurmurHash2 { } // Handle the last few bytes of the input array - switch (length % 4) { + final int index = (nblocks << 2); + switch (length - index) { case 3: - h ^= (data[(length & ~3) + 2] & 0xff) << 16; + h ^= (data[index + 2] & 0xff) << 16; case 2: - h ^= (data[(length & ~3) + 1] & 0xff) << 8; + h ^= (data[index + 1] & 0xff) << 8; case 1: - h ^= (data[length & ~3] & 0xff); + h ^= (data[index] & 0xff); h *= m; } + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. h ^= h >>> 13; h *= m; h ^= h >>> 15; @@ -88,21 +105,37 @@ public final class MurmurHash2 { } /** - * Generates 32 bit hash from byte array with default seed value. + * Generates 32 bit hash from byte array with the given length and a default seed value. + * This is a helper method that will produce the same result as: + * + * <pre> + * int seed = 0x9747b28c; + * int hash = MurmurHash2.hash32(data, offset, length, seed); + * </pre> * * @param data byte array to hash * @param length length of the array to hash * @return 32 bit hash of the given array + * @see #hash32(byte[], int, int) */ public static int hash32(final byte[] data, final int length) { return hash32(data, length, 0x9747b28c); } /** - * Generates 32 bit hash from a string. + * Generates 32 bit hash from a string with a default seed. + * The string is converted to bytes using the default encoding. + * This is a helper method that will produce the same result as: + * + * <pre> + * int seed = 0x9747b28c; + * byte[] bytes = data.getBytes(); + * int hash = MurmurHash2.hash32(bytes, bytes.length, seed); + * </pre> * * @param text string to hash * @return 32 bit hash of the given string + * @see #hash32(byte[], int, int) */ public static int hash32(final String text) { final byte[] bytes = text.getBytes(); @@ -110,12 +143,21 @@ public final class MurmurHash2 { } /** - * Generates 32 bit hash from a substring. + * Generates 32 bit hash from a substring with a default seed value. + * The string is converted to bytes using the default encoding. + * This is a helper method that will produce the same result as: + * + * <pre> + * int seed = 0x9747b28c; + * byte[] bytes = text.substring(from, from + length).getBytes(); + * int hash = MurmurHash2.hash32(bytes, bytes.length, seed); + * </pre> * * @param text string to hash * @param from starting index * @param length length of the substring to hash * @return 32 bit hash of the given string + * @see #hash32(byte[], int, int) */ public static int hash32(final String text, final int from, final int length) { return hash32(text.substring(from, from + length)); @@ -135,14 +177,12 @@ public final class MurmurHash2 { long h = (seed & 0xffffffffl) ^ (length * m); - final int length8 = length / 8; + final int nblocks = length >> 3; - for (int i = 0; i < length8; i++) { - final int i8 = i * 8; - long k = ((long) data[i8 + 0] & 0xff) + (((long) data[i8 + 1] & 0xff) << 8) - + (((long) data[i8 + 2] & 0xff) << 16) + (((long) data[i8 + 3] & 0xff) << 24) - + (((long) data[i8 + 4] & 0xff) << 32) + (((long) data[i8 + 5] & 0xff) << 40) - + (((long) data[i8 + 6] & 0xff) << 48) + (((long) data[i8 + 7] & 0xff) << 56); + // body + for (int i = 0; i < nblocks; i++) { + final int index = (i << 3); + long k = getLittleEndianLong(data, index); k *= m; k ^= k >>> r; @@ -152,21 +192,22 @@ public final class MurmurHash2 { h *= m; } - switch (length % 8) { + final int index = (nblocks << 3); + switch (length - index) { case 7: - h ^= (long) (data[(length & ~7) + 6] & 0xff) << 48; + h ^= ((long) data[index + 6] & 0xff) << 48; case 6: - h ^= (long) (data[(length & ~7) + 5] & 0xff) << 40; + h ^= ((long) data[index + 5] & 0xff) << 40; case 5: - h ^= (long) (data[(length & ~7) + 4] & 0xff) << 32; + h ^= ((long) data[index + 4] & 0xff) << 32; case 4: - h ^= (long) (data[(length & ~7) + 3] & 0xff) << 24; + h ^= ((long) data[index + 3] & 0xff) << 24; case 3: - h ^= (long) (data[(length & ~7) + 2] & 0xff) << 16; + h ^= ((long) data[index + 2] & 0xff) << 16; case 2: - h ^= (long) (data[(length & ~7) + 1] & 0xff) << 8; + h ^= ((long) data[index + 1] & 0xff) << 8; case 1: - h ^= data[length & ~7] & 0xff; + h ^= ((long) data[index] & 0xff); h *= m; } @@ -178,21 +219,37 @@ public final class MurmurHash2 { } /** - * Generates 64 bit hash from byte array with default seed value. + * Generates 64 bit hash from byte array with given length and a default seed value. + * This is a helper method that will produce the same result as: + * + * <pre> + * int seed = 0xe17a1465; + * int hash = MurmurHash2.hash64(data, offset, length, seed); + * </pre> * * @param data byte array to hash * @param length length of the array to hash * @return 64 bit hash of the given string + * @see #hash64(byte[], int, int) */ public static long hash64(final byte[] data, final int length) { return hash64(data, length, 0xe17a1465); } /** - * Generates 64 bit hash from a string. + * Generates 64 bit hash from a string with a default seed. + * The string is converted to bytes using the default encoding. + * This is a helper method that will produce the same result as: + * + * <pre> + * int seed = 0xe17a1465; + * byte[] bytes = data.getBytes(); + * int hash = MurmurHash2.hash64(bytes, bytes.length, seed); + * </pre> * * @param text string to hash * @return 64 bit hash of the given string + * @see #hash64(byte[], int, int) */ public static long hash64(final String text) { final byte[] bytes = text.getBytes(); @@ -200,14 +257,55 @@ public final class MurmurHash2 { } /** - * Generates 64 bit hash from a substring. + * Generates 64 bit hash from a substring with a default seed value. + * The string is converted to bytes using the default encoding. + * This is a helper method that will produce the same result as: + * + * <pre> + * int seed = 0xe17a1465; + * byte[] bytes = text.substring(from, from + length).getBytes(); + * int hash = MurmurHash2.hash64(bytes, bytes.length, seed); + * </pre> * * @param text string to hash * @param from starting index * @param length length of the substring to hash * @return 64 bit hash of the given array + * @see #hash64(byte[], int, int) */ public static long hash64(final String text, final int from, final int length) { return hash64(text.substring(from, from + length)); } + + /** + * Gets the little-endian int from 4 bytes starting at the specified index. + * + * @param data The data + * @param index The index + * @return The little-endian int + */ + private static int getLittleEndianInt(final byte[] data, final int index) { + return ((data[index ] & 0xff) ) | + ((data[index + 1] & 0xff) << 8) | + ((data[index + 2] & 0xff) << 16) | + ((data[index + 3] & 0xff) << 24); + } + + /** + * Gets the little-endian long from 8 bytes starting at the specified index. + * + * @param data The data + * @param index The index + * @return The little-endian long + */ + private static long getLittleEndianLong(final byte[] data, final int index) { + return (((long) data[index ] & 0xff) ) | + (((long) data[index + 1] & 0xff) << 8) | + (((long) data[index + 2] & 0xff) << 16) | + (((long) data[index + 3] & 0xff) << 24) | + (((long) data[index + 4] & 0xff) << 32) | + (((long) data[index + 5] & 0xff) << 40) | + (((long) data[index + 6] & 0xff) << 48) | + (((long) data[index + 7] & 0xff) << 56); + } } diff --git a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java index 1fda9ac..513ace8 100644 --- a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java +++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java @@ -18,7 +18,7 @@ package org.apache.commons.codec.digest; /** - * MurmurHash3 yields a 32-bit or 128-bit value. + * Implementation of the MurmurHash3 32-bit and 128-bit hash functions. * * <p>MurmurHash is a non-cryptographic hash function suitable for general * hash-based lookup. The name comes from two basic operations, multiply (MU) @@ -860,7 +860,6 @@ public final class MurmurHash3 { return new long[] { h1, h2 }; } - /** * Gets the little-endian long from 8 bytes starting at the specified index. *