This is an automated email from the ASF dual-hosted git repository. aherbert pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/commons-codec.git
commit 82b108f3df4b8db2a2ce56afce19a2b0a97c5539 Author: aherbert <[email protected]> AuthorDate: Thu Nov 21 15:32:11 2019 +0000 [CODEC-264] Add hash128x64 methods to fix sign extension error. Adds a new API to preserve behavioural compatibility with the released version: hash128x64(byte[]) hash128x64(byte[], int offset, int length, int seed) The methods with the sign extension bug have been deprecated. The new API methods use zero as the default seed. --- src/changes/changes.xml | 1 + .../apache/commons/codec/digest/MurmurHash3.java | 60 +++++++++++- .../commons/codec/digest/MurmurHash3Test.java | 109 +++++++++++++++++++++ 3 files changed, 167 insertions(+), 3 deletions(-) diff --git a/src/changes/changes.xml b/src/changes/changes.xml index 74fed85..4949fcb 100644 --- a/src/changes/changes.xml +++ b/src/changes/changes.xml @@ -43,6 +43,7 @@ The <action> type attribute can be add,update,fix,remove. <body> <release version="1.14" date="TBD" description="Feature and fix release."> + <action issue="CODEC-264" dev="aherbert" due-to="Claude Warren" type="add">Add MurmurHash3.hash128x64 methods to fix sign extension error during seeding in hash128 methods.</action> <action issue="CODEC-267" dev="aherbert" due-to="Claude Warren" type="add">Add MurmurHash3.hash32x86 methods and IncrementalHash32x86 to fix sign extension error in hash32 methods.</action> <action issue="CODEC-269" dev="aherbert" type="fix">Allow repeat calls to MurmurHash3.IncrementalHash32.end() to generate the same value.</action> </release> diff --git a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java index 8b28b64..a8ef5f3 100644 --- a/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java +++ b/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java @@ -554,7 +554,7 @@ public final class MurmurHash3 { * * <pre> * int seed = 104729; - * long hash = hash64(data, offset, length, seed); + * long hash = MurmurHash3.hash64(data, offset, length, seed); * <pre> * * @param data The input byte array @@ -645,8 +645,9 @@ public final class MurmurHash3 { * This is a helper method that will produce the same result as: * * <pre> + * int offset = 0; * int seed = 104729; - * int hash = hash128(data, 0, data.length, seed); + * int hash = MurmurHash3.hash128(data, offset, data.length, seed); * </pre> * * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect @@ -661,6 +662,24 @@ public final class MurmurHash3 { } /** + * Generates 128-bit hash from the byte array with a seed of zero. + * This is a helper method that will produce the same result as: + * + * <pre> + * int offset = 0; + * int seed = 0; + * int hash = MurmurHash3.hash128x64(data, offset, data.length, 0); + * </pre> + * + * @param data The input byte array + * @return The 128-bit hash (2 longs) + * @see #hash128x64(byte[], int, int, int) + */ + public static long[] hash128x64(final byte[] data) { + return hash128x64(data, 0, data.length, 0); + } + + /** * Generates 128-bit hash from a string with a default seed. * The string is converted to bytes using the default encoding. * This is a helper method that will produce the same result as: @@ -668,7 +687,7 @@ public final class MurmurHash3 { * <pre> * int seed = 104729; * byte[] bytes = data.getBytes(); - * int hash = hash128(bytes, 0, bytes.length, seed); + * int hash = MurmurHash3.hash128(bytes, 0, bytes.length, seed); * </pre> * * <p>Note: The sign extension bug in {@link #hash128(byte[], int, int, int)} does not effect @@ -697,11 +716,46 @@ public final class MurmurHash3 { * @param length The length of array * @param seed The initial seed value * @return The 128-bit hash (2 longs) + * @deprecated Use {@link #hash128x64(byte[], int, int, int)}. This corrects the seed initialisation. */ + @Deprecated public static long[] hash128(final byte[] data, final int offset, final int length, final int seed) { // ************ // Note: This fails to apply masking using 0xffffffffL to the seed. // ************ + return hash128x64(data, offset, length, seed); + } + + /** + * Generates 128-bit hash from the byte array with the given offset, length and seed. + * + * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128} + * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p> + * + * @param data The input byte array + * @param offset The first element of array + * @param length The length of array + * @param seed The initial seed value + * @return The 128-bit hash (2 longs) + */ + public static long[] hash128x64(final byte[] data, final int offset, final int length, final int seed) { + // Use an unsigned 32-bit integer as the seed + return hash128x64(data, offset, length, seed & 0xffffffffL); + } + + /** + * Generates 128-bit hash from the byte array with the given offset, length and seed. + * + * <p>This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128} + * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.</p> + * + * @param data The input byte array + * @param offset The first element of array + * @param length The length of array + * @param seed The initial seed value + * @return The 128-bit hash (2 longs) + */ + private static long[] hash128x64(final byte[] data, final int offset, final int length, final long seed) { long h1 = seed; long h2 = seed; final int nblocks = length >> 4; diff --git a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java index f975ca8..79787f3 100644 --- a/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java +++ b/src/test/java/org/apache/commons/codec/digest/MurmurHash3Test.java @@ -30,6 +30,7 @@ import org.junit.Test; /** * Test for {@link MurmurHash3}. */ +@SuppressWarnings("deprecation") public class MurmurHash3Test { /** Test data for the hash64 method. */ @@ -614,6 +615,114 @@ public class MurmurHash3Test { } /** + * Test the {@link MurmurHash3#hash128x64(byte[])} algorithm. + * + * <p>Reference data is taken from the Python library {@code mmh3}.</p> + * + * @see <a href="https://pypi.org/project/mmh3/">mmh3</a> + */ + @Test + public void testHash128x64() { + // Note: Default seed is zero. + + // mmh3.hash64(bytes, 0) + Assert.assertArrayEquals(new long[] {1972113670104592209L, 5171809317673151911L}, + MurmurHash3.hash128x64(RANDOM_BYTES)); + + // Test with all sizes up to 31 bytes. This ensures a full round of 16-bytes plus up to + // 15 bytes remaining. + // for x in range(0, 32): + // print(mmh3.hash64(bytes[:x], 0), ',') + final long[][] answers = {{0L, 0L}, {-2808653841080383123L, -2531784594030660343L}, + {-1284575471001240306L, -8226941173794461820L}, {1645529003294647142L, 4109127559758330427L}, + {-4117979116203940765L, -8362902660322042742L}, {2559943399590596158L, 4738005461125350075L}, + {-1651760031591552651L, -5386079254924224461L}, {-6208043960690815609L, 7862371518025305074L}, + {-5150023478423646337L, 8346305334874564507L}, {7658274117911906792L, -4962914659382404165L}, + {1309458104226302269L, 570003296096149119L}, {7440169453173347487L, -3489345781066813740L}, + {-5698784298612201352L, 3595618450161835420L}, {-3822574792738072442L, 6878153771369862041L}, + {3705084673301918328L, 3202155281274291907L}, {-6797166743928506931L, -4447271093653551597L}, + {5240533565589385084L, -5575481185288758327L}, {-8467620131382649428L, -6450630367251114468L}, + {3632866961828686471L, -5957695976089313500L}, {-6450283648077271139L, -7908632714374518059L}, + {226350826556351719L, 8225586794606475685L}, {-2382996224496980401L, 2188369078123678011L}, + {-1337544762358780825L, 7004253486151757299L}, {2889033453638709716L, -4099509333153901374L}, + {-8644950936809596954L, -5144522919639618331L}, {-5628571865255520773L, -839021001655132087L}, + {-5226774667293212446L, -505255961194269502L}, {1337107025517938142L, 3260952073019398505L}, + {9149852874328582511L, 1880188360994521535L}, {-4035957988359881846L, -7709057850766490780L}, + {-3842593823306330815L, 3805147088291453755L}, {4030161393619149616L, -2813603781312455238L},}; + for (int i = 0; i < answers.length; i++) { + final byte[] bytes = Arrays.copyOf(RANDOM_BYTES, i); + Assert.assertArrayEquals(answers[i], MurmurHash3.hash128x64(bytes)); + } + } + + /** + * Test the {@link MurmurHash3#hash128x64(byte[], int, int, int)} algorithm. + * + * <p>Reference data is taken from the Python library {@code mmh3}.</p> + * + * @see <a href="https://pypi.org/project/mmh3/">mmh3</a> + */ + @Test + public void testHash128x64WithOffsetLengthAndSeed() { + // Seed can be positive + final int seed = 42; + final int offset = 13; + + // Test with all sizes up to 31 bytes. This ensures a full round of 16-bytes plus up to + // 15 bytes remaining. + // for x in range(0, 32): + // print(mmh3.hash64(bytes[13:x+13], 42), ',') + final long[][] answers = {{-1140915396076141277L, -3386313222241793095L}, + {2745805417334040752L, -3045882272665292331L}, {6807939080212835946L, -1975749467247671127L}, + {-7924884987449335214L, -4468571497642087939L}, {3005389733967167773L, -5809440073240597398L}, + {8032745196600164727L, 4545709434702374224L}, {2095398623732573832L, 1778447136435513908L}, + {4492723708121417255L, -7411125500882394867L}, {8467397417110552178L, -1503802302645548949L}, + {4189760269121918355L, -8004336343217265057L}, {4939298084211301953L, -8419135013628844658L}, + {5497136916151148085L, -394028342910298191L}, {3405983294878231737L, -3216533807498089078L}, + {5833223403351466775L, -1792451370239813325L}, {7730583391236194819L, 5356157313842354092L}, + {3111977482488580945L, -3119414725698132191L}, {3314524606404365027L, -1923219843083192742L}, + {7299569240140613949L, 4176392429810027494L}, {6398084683727166117L, 7703960505857395788L}, + {-8594572031068184774L, 4394224719145783692L}, {-7589785442804461713L, 4110439243215224554L}, + {-5343610105946840628L, -4423992782020122809L}, {-522490326525787270L, 289136460641968781L}, + {-5320637070354802556L, -7845553044730489027L}, {1344456408744313334L, 3803048032054968586L}, + {1131205296221907191L, -6256656049039287019L}, {8583339267101027117L, 8934225022848628726L}, + {-6379552869905441749L, 8973517768420051734L}, {5076646564516328801L, 8561479196844000567L}, + {-4610341636137642517L, -6694266039505142069L}, {-758896383254029789L, 4050360662271552727L}, + {-6123628195475753507L, 4283875822581966645L},}; + for (int i = 0; i < answers.length; i++) { + Assert.assertArrayEquals("Length: " + i, answers[i], MurmurHash3.hash128x64(RANDOM_BYTES, offset, i, seed)); + } + + // Seed can be negative + final int seed2 = -42; + + // Test with all sizes up to 31 bytes. This ensures a full round of 16-bytes plus up to + // 15 bytes remaining. + // for x in range(0, 32): + // print(mmh3.hash64(bytes[13:x+13], -42), ',') + final long[][] answers2 = {{7182599573337898253L, -6490979146667806054L}, + {-461284136738605467L, 7073284964362976233L}, {-3090354666589400212L, 2978755180788824810L}, + {5052807367580803906L, -4497188744879598335L}, {5003711854877353474L, -6616808651483337088L}, + {2043501804923817748L, -760668448196918637L}, {6813003268375229932L, -1818545210475363684L}, + {4488070015393027916L, 8520186429078977003L}, {4709278711722456062L, -2262080641289046033L}, + {-5944514262756048380L, 5968714500873552518L}, {-2304397529301122510L, 6451500469518446488L}, + {-1054078041081348909L, -915114408909600132L}, {1300471646869277217L, -399493387666437046L}, + {-2821780479886030222L, -9061571187511294733L}, {8005764841242557505L, 4135287855434326053L}, + {318307346637037498L, -5355856739901286522L}, {3380719536119187025L, 1890890833937151467L}, + {2691044185935730001L, 7963546423617895734L}, {-5277462388534000227L, 3613853764390780573L}, + {8504421722476165699L, 2058020162708103700L}, {-6578421288092422241L, 3311200163790829579L}, + {-5915037218487974215L, -7385137075895184179L}, {659642911937398022L, 854071824595671049L}, + {-7007237968866727198L, 1372258010932080058L}, {622891376282772539L, -4140783491297489868L}, + {8357110718969014985L, -4737117827581590306L}, {2208857857926305405L, -8360240839768465042L}, + {858120048221036376L, -5822288789703639119L}, {-1988334009458340679L, 1262479472434068698L}, + {-8580307083590783934L, 3634449965473715778L}, {6705664584730187559L, 5192304951463791556L}, + {-6426410954037604142L, -1579122709247558101L},}; + for (int i = 0; i < answers.length; i++) { + Assert.assertArrayEquals("Length: " + i, answers2[i], MurmurHash3.hash128x64(RANDOM_BYTES, offset, i, seed2)); + } + } + + /** * Test {@link IncrementalHash32} returns the same values as * {@link MurmurHash3#hash32(byte[], int, int, int)}. */
