rmuir commented on code in PR #13076: URL: https://github.com/apache/lucene/pull/13076#discussion_r1478382295
########## lucene/core/src/java20/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java: ########## @@ -576,4 +578,114 @@ private int squareDistanceBody128(byte[] a, byte[] b, int limit) { // reduce return acc1.add(acc2).reduceLanes(ADD); } + + private static final ByteVector lookup128 = + ByteVector.fromArray( + ByteVector.SPECIES_128, + new byte[] { + 0, 1, 1, 2, 1, 2, 2, 3, + 1, 2, 2, 3, 2, 3, 3, 4, + }, + 0); + + private static final ByteVector lookup256 = + (ByteVector) lookup128.castShape(ByteVector.SPECIES_256, 0); + private static final ByteVector lookup512 = + (ByteVector) lookup128.castShape(ByteVector.SPECIES_512, 0); + + @Override + public int binaryHammingDistance(byte[] a, byte[] b) { + int res = 0; + int i = 0; + if (a.length >= 16) { + if (VECTOR_BITSIZE >= 512) { + i += ByteVector.SPECIES_512.loopBound(a.length); + res += binaryHammingDistanceBody512(a, b, i); + } else if (VECTOR_BITSIZE == 256) { + i += ByteVector.SPECIES_256.loopBound(a.length); + res += binaryHammingDistanceBody256(a, b, i); + } else { + i += ByteVector.SPECIES_128.loopBound(a.length); + res += binaryHammingDistanceBody128(a, b, i); + } + } + + // scalar tail + for (; i < a.length; i++) { + res += HAMMING_DISTANCE_LOOKUP_TABLE[(a[i] ^ b[i]) & 0xFF]; + } + return res; + } + + private int binaryHammingDistanceBody512(byte[] a, byte[] b, int limit) { + int res = 0; + for (int i = 0; i < limit; i += ByteVector.SPECIES_512.length()) { + ByteVector bva64 = ByteVector.fromArray(ByteVector.SPECIES_512, a, i); + ByteVector bvb64 = ByteVector.fromArray(ByteVector.SPECIES_512, b, i); + ByteVector xor64 = bva64.lanewise(XOR, bvb64); + + ByteVector low_mask = ByteVector.broadcast(ByteVector.SPECIES_512, 0x0f); + ByteVector low_bits = xor64.and(low_mask); + ByteVector high_bits = xor64.lanewise(LSHR, 4).and(low_mask); + + var popcnt1 = lookup512.rearrange(low_bits.toShuffle()); + var popcnt2 = lookup512.rearrange(high_bits.toShuffle()); + + var total = popcnt1.add(popcnt2); + + // Need to break up the total ByteVector as the result might not + // fit in a byte + var acc1 = total.castShape(ShortVector.SPECIES_512, 0); + var acc2 = total.castShape(ShortVector.SPECIES_512, 1); Review Comment: Vector castShape() with part number > 0 really needs to be avoided. It is incredibly slow. Have you benchmarked non-mac machines with 256 or 512-bit vectors? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org For additional commands, e-mail: issues-h...@lucene.apache.org