This is an automated email from the ASF dual-hosted git repository. aherbert pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-codec.git
commit 1645ab953ce98bffaff2424cd0b3d2501e21b04f Author: aherbert <aherb...@apache.org> AuthorDate: Thu Nov 21 15:13:42 2019 +0000 [CODEC-267] Add hash32x86 methods to fix sign extension error. Adds a new API to preserve behavioural compatibility with the released version: hash32x86(byte[]) hash32x86(byte[], int offset, int length, int seed) IncrementalHash32x64 The methods with the sign extension bug have been deprecated. The new API methods use zero as the default seed. mvn clirr:check is OK with adding a new class IncrementalHash32x64 as the parent of IncrementalHash32. --- src/changes/changes.xml | 3 +- .../apache/commons/codec/digest/MurmurHash3.java | 159 +++++++++++++++++++-- .../commons/codec/digest/MurmurHash3Test.java | 122 +++++++++++++++- 3 files changed, 267 insertions(+), 17 deletions(-) diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 1f3a2d1..74fed85 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -43,7 +43,8 @@ The <action> type attribute can be add,update,fix,remove. <body> <release version="1.14" date="TBD" description="Feature and fix release."> - <action issue="CODEC-269" dev="aherbert" type="fix">Allow repeat calls to IncrementalHash32.end() to generate the same value.</action> + <action issue="CODEC-267" dev="aherbert" due-to="Claude Warren" type="add">Add MurmurHash3.hash32x86 methods and IncrementalHash32x86 to fix sign extension error in hash32 methods.</action> + <action issue="CODEC-269" dev="aherbert" type="fix">Allow repeat calls to MurmurHash3.IncrementalHash32.end() to generate the same value.</action> </release> <release version="1.13" date="2019-07-20" description="Feature and fix release."> diff --git a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java index c7a149e..8b28b64 100644 --- a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java +++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java @@ -217,7 +217,9 @@ public final class MurmurHash3 { * @param data The input byte array * @return The 32-bit hash * @see #hash32(byte[], int, int, int) + * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes. */ + @Deprecated public static int hash32(final byte[] data) { return hash32(data, 0, data.length, DEFAULT_SEED); } @@ -240,7 +242,9 @@ public final class MurmurHash3 { * @param data The input string * @return The 32-bit hash * @see #hash32(byte[], int, int, int) + * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes. */ + @Deprecated public static int hash32(final String data) { final byte[] bytes = data.getBytes(); return hash32(bytes, 0, bytes.length, DEFAULT_SEED); @@ -263,7 +267,9 @@ public final class MurmurHash3 { * @param length The length of array * @return The 32-bit hash * @see #hash32(byte[], int, int, int) + * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes. */ + @Deprecated public static int hash32(final byte[] data, final int length) { return hash32(data, length, DEFAULT_SEED); } @@ -285,7 +291,9 @@ public final class MurmurHash3 { * @param seed The initial seed value * @return The 32-bit hash * @see #hash32(byte[], int, int, int) + * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes. */ + @Deprecated public static int hash32(final byte[] data, final int length, final int seed) { return hash32(data, 0, length, seed); } @@ -305,7 +313,9 @@ public final class MurmurHash3 { * @param length The length of array * @param seed The initial seed value * @return The 32-bit hash + * @deprecated Use {@link #hash32x86(byte[], int, int, int)}. This corrects the processing of trailing bytes. */ + @Deprecated public static int hash32(final byte[] data, final int offset, final int length, final int seed) { int hash = seed; final int nblocks = length >> 2; @@ -343,6 +353,69 @@ public final class MurmurHash3 { } /** + * Generates 32-bit hash from the byte array with a seed of zero. + * This is a helper method that will produce the same result as: + * + * <pre> + * int offset = 0; + * int seed = 0; + * int hash = hash32x86(data, offset, data.length, seed); + * </pre> + * + * @param data The input byte array + * @return The 32-bit hash + * @see #hash32x86(byte[], int, int, int) + */ + public static int hash32x86(final byte[] data) { + return hash32x86(data, 0, data.length, 0); + } + + /** + * Generates 32-bit hash from the byte array with the given offset, length and seed. + * + * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32} + * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p> + * + * @param data The input byte array + * @param offset The offset of data + * @param length The length of array + * @param seed The initial seed value + * @return The 32-bit hash + */ + public static int hash32x86(final byte[] data, final int offset, final int length, final int seed) { + int hash = seed; + final int nblocks = length >> 2; + + // body + for (int i = 0; i < nblocks; i++) { + final int index = offset + (i << 2); + final int k = getLittleEndianInt(data, index); + hash = mix32(k, hash); + } + + // tail + final int index = offset + (nblocks << 2); + int k1 = 0; + switch (offset + length - index) { + case 3: + k1 ^= (data[index + 2] & 0xff) << 16; + case 2: + k1 ^= (data[index + 1] & 0xff) << 8; + case 1: + k1 ^= (data[index] & 0xff); + + // mix functions + k1 *= C1_32; + k1 = Integer.rotateLeft(k1, R1_32); + k1 *= C2_32; + hash ^= k1; + } + + hash ^= length; + return fmix32(hash); + } + + /** * Generates 64-bit hash from a long with a default seed. * * <p>This is a Murmur3-like 64-bit variant. @@ -804,12 +877,8 @@ public final class MurmurHash3 { * * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32} * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p> - * - * <p>This implementation contains a sign-extension bug in the finalisation step of - * any bytes left over from dividing the length by 4. This manifests if any of these - * bytes are negative.<p> */ - public static class IncrementalHash32 { + public static class IncrementalHash32x86 { /** The size of byte blocks that are processed together. */ private static final int BLOCK_SIZE = 4; @@ -916,26 +985,33 @@ public final class MurmurHash3 { * Generate the 32-bit hash value. Repeat calls to this method with no additional data * will generate the same hash value. * - * <p>This implementation contains a sign-extension bug in the finalisation step of - * any bytes left over from dividing the length by 4. This manifests if any of these - * bytes are negative.<p> - * * @return The 32-bit hash */ public final int end() { // Allow calling end() again after adding no data to return the same result. + return finalise(hash, unprocessedLength, unprocessed, totalLen); + } + + /** + * Finalise the running hash to the output 32-bit hash by processing remaining bytes + * and performing final mixing. + * + * @param hash The running hash + * @param unprocessedLength The number of unprocessed bytes in the tail data. + * @param unprocessed Up to 3 unprocessed bytes from input data. + * @param totalLen The total number of input bytes added since the start. + * @return The 32-bit hash + */ + int finalise(int hash, int unprocessedLength, byte[] unprocessed, int totalLen) { int result = hash; - // ************ - // Note: This fails to apply masking using 0xff to the 3 remaining bytes. - // ************ int k1 = 0; switch (unprocessedLength) { case 3: - k1 ^= unprocessed[2] << 16; + k1 ^= (unprocessed[2] & 0xff) << 16; case 2: - k1 ^= unprocessed[1] << 8; + k1 ^= (unprocessed[1] & 0xff) << 8; case 1: - k1 ^= unprocessed[0]; + k1 ^= (unprocessed[0] & 0xff); // mix functions k1 *= C1_32; @@ -964,4 +1040,57 @@ public final class MurmurHash3 { return (b1 & 0xff) | ((b2 & 0xff) << 8) | ((b3 & 0xff) << 16) | ((b4 & 0xff) << 24); } } + + /** + * Generates 32-bit hash from input bytes. Bytes can be added incrementally and the new + * hash computed. + * + * <p>This is an implementation of the 32-bit hash function {@code MurmurHash3_x86_32} + * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p> + * + * <p>This implementation contains a sign-extension bug in the finalisation step of + * any bytes left over from dividing the length by 4. This manifests if any of these + * bytes are negative.<p> + * + * @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes. + */ + @Deprecated + public static class IncrementalHash32 extends IncrementalHash32x86 { + /** + * {@inheritDoc} + * + * <p>This implementation contains a sign-extension bug in the finalisation step of + * any bytes left over from dividing the length by 4. This manifests if any of these + * bytes are negative.<p> + * + * @deprecated Use IncrementalHash32x86. This corrects the processing of trailing bytes. + */ + @Override + @Deprecated + int finalise(int hash, int unprocessedLength, byte[] unprocessed, int totalLen) { + int result = hash; + // ************ + // Note: This fails to apply masking using 0xff to the 3 remaining bytes. + // ************ + int k1 = 0; + switch (unprocessedLength) { + case 3: + k1 ^= unprocessed[2] << 16; + case 2: + k1 ^= unprocessed[1] << 8; + case 1: + k1 ^= unprocessed[0]; + + // mix functions + k1 *= C1_32; + k1 = Integer.rotateLeft(k1, R1_32); + k1 *= C2_32; + result ^= k1; + } + + // finalization + result ^= totalLen; + return fmix32(result); + } + } } diff --git a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java index d7c21aa..f975ca8 100644 --- a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java +++ b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java @@ -24,6 +24,7 @@ import java.util.Arrays; import java.util.concurrent.ThreadLocalRandom; import org.apache.commons.codec.digest.MurmurHash3.IncrementalHash32; +import org.apache.commons.codec.digest.MurmurHash3.IncrementalHash32x86; import org.junit.Test; /** @@ -349,7 +350,7 @@ public class MurmurHash3Test { * if the final 1, 2, or 3 bytes are negative. */ @Test - public void testHash32With1TrailingSignedByteIsInvalid() { + public void testHash32WithTrailingNegativeSignedBytesIsInvalid() { // import mmh3 // import numpy as np // mmh3.hash(np.uint8([-1])) @@ -367,6 +368,71 @@ public class MurmurHash3Test { } /** + * Test the {@link MurmurHash3#hash32x86(byte[])} algorithm. + * + * <p>Reference data is taken from the Python library {@code mmh3}.</p> + * + * @see <a href="https://pypi.org/project/mmh3/">mmh3</a> + */ + @Test + public void testhash32x86() { + // Note: Default seed is zero. + + // mmh3.hash(bytes, 0) + Assert.assertEquals(1546271276, MurmurHash3.hash32x86(RANDOM_BYTES)); + + // Test with all sizes up to 31 bytes. This ensures a full round of 16-bytes plus up to + // 15 bytes remaining. + // for x in range(0, 32): + // print(mmh3.hash(bytes[:x], 0), ',') + final int[] answers = {0, -1353253853, 915381745, -734983419, 1271125654, -1042265893, -1204521619, 735845843, + 138310876, -1918938664, 1399647898, -1126342309, 2067593280, 1220975287, 1941281084, -1289513180, 942412060, + -618173583, -269546647, -1645631262, 1162379906, -1960125577, -1856773195, 1980513522, 1174612855, + 905810751, 1044578220, -1758486689, -491393913, 839836946, -435014415, 2044851178,}; + for (int i = 0; i < answers.length; i++) { + final byte[] bytes = Arrays.copyOf(RANDOM_BYTES, i); + Assert.assertEquals(answers[i], MurmurHash3.hash32x86(bytes)); + } + } + + /** + * Test the {@link MurmurHash3#hash32x86(byte[], int, int, int)} algorithm. + * + * <p>Reference data is taken from the Python library {@code mmh3}.</p> + * + * @see <a href="https://pypi.org/project/mmh3/">mmh3</a> + */ + @Test + public void testHash32x86WithOffsetLengthAndSeed() { + // Data as above for testing MurmurHash3.hash32(byte[], int, int, int). + final int seed = -42; + final int offset = 13; + final int[] answers = {192929823, -27171978, -1282326280, -816314453, -1176217753, -1904531247, 1962794233, + -1302316624, -1151850323, -1464386748, -369299427, 972232488, 1747314487, 2137398916, 690986564, + -1985866226, -678669121, -2123325690, -253319081, 46181235, 656058278, 1401175653, 1750113912, -1567219725, + 2032742772, -2024269989, -305340794, 1161737942, -661265418, 172838872, -650122718, -1934812417,}; + for (int i = 0; i < answers.length; i++) { + Assert.assertEquals(answers[i], MurmurHash3.hash32x86(RANDOM_BYTES, offset, i, seed)); + } + } + + /** + * Test to demonstrate {@link MurmurHash3#hash32x86(byte[], int, int, int)} is OK + * if the final 1, 2, or 3 bytes are negative. + */ + @Test + public void testHash32x86WithTrailingNegativeSignedBytes() { + // Data as above for testing MurmurHash3.hash32(byte[], int, int, int). + // This test uses assertEquals(). + Assert.assertEquals(-43192051, MurmurHash3.hash32x86(new byte[] {-1}, 0, 1, 0)); + Assert.assertEquals(-582037868, MurmurHash3.hash32x86(new byte[] {0, -1}, 0, 2, 0)); + Assert.assertEquals(922088087, MurmurHash3.hash32x86(new byte[] {0, 0, -1}, 0, 3, 0)); + Assert.assertEquals(-1309567588, MurmurHash3.hash32x86(new byte[] {-1, 0}, 0, 2, 0)); + Assert.assertEquals(-363779670, MurmurHash3.hash32x86(new byte[] {-1, 0, 0}, 0, 3, 0)); + Assert.assertEquals(-225068062, MurmurHash3.hash32x86(new byte[] {0, -1, 0}, 0, 3, 0)); + } + + /** * Test the {@link MurmurHash3#hash64(byte[])} algorithm. * Unknown origin of test data. It may be from the Apache Hive project. */ @@ -600,4 +666,58 @@ public class MurmurHash3Test { Assert.assertEquals("Hashes differ after no additional data", h1, inc.end()); } } + + /** + * Test {@link IncrementalHash32x86} returns the same values as + * {@link MurmurHash3#hash32x86(byte[], int, int, int)}. + */ + @Test + public void testIncrementalHash32x86() { + final byte[] bytes = new byte[1023]; + ThreadLocalRandom.current().nextBytes(bytes); + // The seed does not matter + for (final int seed : new int[] {-567, 0, 6787990}) { + // Cases are constructed to hit all edge cases of processing: + // Nothing added + assertIncrementalHash32x86(bytes, seed, 0, 0); + // Add single bytes + assertIncrementalHash32x86(bytes, seed, 1, 1, 1, 1, 1, 1, 1, 1); + // Leading unprocessed 1, 2, 3 + assertIncrementalHash32x86(bytes, seed, 1, 4); + assertIncrementalHash32x86(bytes, seed, 2, 4); + assertIncrementalHash32x86(bytes, seed, 3, 4); + // Trailing unprocessed 1, 2, 3 + assertIncrementalHash32x86(bytes, seed, 4, 1); + assertIncrementalHash32x86(bytes, seed, 4, 2); + assertIncrementalHash32x86(bytes, seed, 4, 3); + // Complete blocks + assertIncrementalHash32x86(bytes, seed, 4, 16, 64); + } + } + + /** + * Assert {@link IncrementalHash32x86} returns the same values as + * {@link MurmurHash3#hash32x86(byte[], int, int, int)}. + * + * <p>The bytes are added to the incremental hash in the given blocks.</p> + * + * @param bytes the bytes + * @param seed the seed + * @param blocks the blocks + */ + private static void assertIncrementalHash32x86(byte[] bytes, int seed, int... blocks) { + int offset = 0; + int total = 0; + final IncrementalHash32x86 inc = new IncrementalHash32x86(); + inc.start(seed); + for (final int block : blocks) { + total += block; + final int h1 = MurmurHash3.hash32x86(bytes, 0, total, seed); + inc.add(bytes, offset, block); + offset += block; + final int h2 = inc.end(); + Assert.assertEquals("Hashes differ", h1, h2); + Assert.assertEquals("Hashes differ after no additional data", h1, inc.end()); + } + } }