jpountz commented on code in PR #12868: URL: https://github.com/apache/lucene/pull/12868#discussion_r1755498311
########## lucene/core/src/java/org/apache/lucene/util/StringHelper.java: ########## @@ -209,6 +209,156 @@ public static int murmurhash3_x86_32(BytesRef bytes, int seed) { return murmurhash3_x86_32(bytes.bytes, bytes.offset, bytes.length, seed); } + /** + * Generates 128-bit hash from the byte array with the given offset, length and seed. + * + * <p>The code is adopted from Apache Commons (<a + * href="https://commons.apache.org/proper/commons-codec/jacoco/org.apache.commons.codec.digest/MurmurHash3.java.html">link</a>) + * + * @param data The input byte array + * @param offset The first element of array + * @param length The length of array + * @param seed The initial seed value + * @return The 128-bit hash (2 longs) + */ + public static long[] murmurhash3_x64_128( + final byte[] data, final int offset, final int length, final int seed) { + // Use an unsigned 32-bit integer as the seed + return murmurhash3_x64_128(data, offset, length, seed & 0xFFFFFFFFL); + } + + @SuppressWarnings("fallthrough") + private static long[] murmurhash3_x64_128( + final byte[] data, final int offset, final int length, final long seed) { + long h1 = seed; + long h2 = seed; + final int nblocks = length >> 4; + + // Constants for 128-bit variant + final long C1 = 0x87c37b91114253d5L; + final long C2 = 0x4cf5ad432745937fL; + final int R1 = 31; + final int R2 = 27; + final int R3 = 33; + final int M = 5; + final int N1 = 0x52dce729; + final int N2 = 0x38495ab5; + + // body + for (int i = 0; i < nblocks; i++) { + final int index = offset + (i << 4); + long k1 = (long) BitUtil.VH_LE_LONG.get(data, index); + long k2 = (long) BitUtil.VH_LE_LONG.get(data, index + 8); + + // mix functions for k1 + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + h1 = Long.rotateLeft(h1, R2); + h1 += h2; + h1 = h1 * M + N1; + + // mix functions for k2 + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + h2 = Long.rotateLeft(h2, R1); + h2 += h1; + h2 = h2 * M + N2; + } + + // tail + long k1 = 0; + long k2 = 0; + final int index = offset + (nblocks << 4); + switch (offset + length - index) { Review Comment: `offset + length - index` is the same as `length & 0x0F`, saving a few instructions? ########## lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java: ########## @@ -150,9 +150,10 @@ private FuzzySet(FixedBitSet filter, int bloomSize, int hashCount) { * @return NO or MAYBE */ public ContainsResult contains(BytesRef value) { - long hash = hashFunction.hash(value); - int msb = (int) (hash >>> Integer.SIZE); - int lsb = (int) hash; + long[] hash = hashFunction.hash128(value); + + int msb = ((int) hash[0] >>> Integer.SIZE) >>> 1 + ((int) hash[1] >>> Integer.SIZE) >>> 1; + int lsb = ((int) hash[0]) >>> 1 + ((int) hash[1]) >>> 1; Review Comment: I'd rather switch to murmur3 accepting the slowdown given that murmur3 is expected to yield less collisions than murmur2, than introduce this weird way of combining bits that we don't understand why it's needed. ########## lucene/codecs/src/java/org/apache/lucene/codecs/bloom/FuzzySet.java: ########## @@ -150,9 +150,10 @@ private FuzzySet(FixedBitSet filter, int bloomSize, int hashCount) { * @return NO or MAYBE */ public ContainsResult contains(BytesRef value) { - long hash = hashFunction.hash(value); - int msb = (int) (hash >>> Integer.SIZE); - int lsb = (int) hash; + long[] hash = hashFunction.hash128(value); + + int msb = ((int) hash[0] >>> Integer.SIZE) >>> 1 + ((int) hash[1] >>> Integer.SIZE) >>> 1; + int lsb = ((int) hash[0]) >>> 1 + ((int) hash[1]) >>> 1; Review Comment: If we want to postpone the time when we drop some bits, we could change a few lines below to `int bloomPos = (int) (hash[0] + i * hash[1]);` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org