rmuir commented on code in PR #13076:
URL: https://github.com/apache/lucene/pull/13076#discussion_r1478382295


##########
lucene/core/src/java20/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java:
##########
@@ -576,4 +578,114 @@ private int squareDistanceBody128(byte[] a, byte[] b, int limit) {
     // reduce
     return acc1.add(acc2).reduceLanes(ADD);
   }
+
+  private static final ByteVector lookup128 =
+      ByteVector.fromArray(
+          ByteVector.SPECIES_128,
+          new byte[] {
+            0, 1, 1, 2, 1, 2, 2, 3,
+            1, 2, 2, 3, 2, 3, 3, 4,
+          },
+          0);
+
+  private static final ByteVector lookup256 =
+      (ByteVector) lookup128.castShape(ByteVector.SPECIES_256, 0);
+  private static final ByteVector lookup512 =
+      (ByteVector) lookup128.castShape(ByteVector.SPECIES_512, 0);
+
+  @Override
+  public int binaryHammingDistance(byte[] a, byte[] b) {
+    int res = 0;
+    int i = 0;
+    if (a.length >= 16) {
+      if (VECTOR_BITSIZE >= 512) {
+        i += ByteVector.SPECIES_512.loopBound(a.length);
+        res += binaryHammingDistanceBody512(a, b, i);
+      } else if (VECTOR_BITSIZE == 256) {
+        i += ByteVector.SPECIES_256.loopBound(a.length);
+        res += binaryHammingDistanceBody256(a, b, i);
+      } else {
+        i += ByteVector.SPECIES_128.loopBound(a.length);
+        res += binaryHammingDistanceBody128(a, b, i);
+      }
+    }
+
+    // scalar tail
+    for (; i < a.length; i++) {
+      res += HAMMING_DISTANCE_LOOKUP_TABLE[(a[i] ^ b[i]) & 0xFF];
+    }
+    return res;
+  }
+
+  private int binaryHammingDistanceBody512(byte[] a, byte[] b, int limit) {
+    int res = 0;
+    for (int i = 0; i < limit; i += ByteVector.SPECIES_512.length()) {
+      ByteVector bva64 = ByteVector.fromArray(ByteVector.SPECIES_512, a, i);
+      ByteVector bvb64 = ByteVector.fromArray(ByteVector.SPECIES_512, b, i);
+      ByteVector xor64 = bva64.lanewise(XOR, bvb64);
+
+      ByteVector low_mask = ByteVector.broadcast(ByteVector.SPECIES_512, 0x0f);
+      ByteVector low_bits = xor64.and(low_mask);
+      ByteVector high_bits = xor64.lanewise(LSHR, 4).and(low_mask);
+
+      var popcnt1 = lookup512.rearrange(low_bits.toShuffle());
+      var popcnt2 = lookup512.rearrange(high_bits.toShuffle());
+
+      var total = popcnt1.add(popcnt2);
+
+      // Need to break up the total ByteVector as the result might not
+      // fit in a byte
+      var acc1 = total.castShape(ShortVector.SPECIES_512, 0);
+      var acc2 = total.castShape(ShortVector.SPECIES_512, 1);

Review Comment:
   Vector castShape() with a part number > 0 really needs to be avoided: it is
incredibly slow. Have you benchmarked on non-Mac machines with 256- or 512-bit
vectors?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@lucene.apache.org
For additional commands, e-mail: issues-h...@lucene.apache.org

Reply via email to