https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101927

            Bug ID: 101927
           Summary: There is no vector mode popcount for aarch64
           Product: gcc
           Version: 12.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: enhancement
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: pinskia at gcc dot gnu.org
  Target Milestone: ---
            Target: aarch64

Take:

#include <stdlib.h>
#include <stdint.h>

/* Hamming distance: return the number of bit positions in which the first
   L bytes of A and B differ.  The total is accumulated in a size_t.  */
size_t hd (const uint8_t *restrict a, const uint8_t *restrict b, size_t l) {
  size_t r = 0, x;
  for (x = 0; x < l; x++)
    /* XOR leaves a 1 in every bit where the two bytes differ;
       __builtin_popcount counts those set bits.  */
    r += __builtin_popcount (a[x] ^ b[x]);

  return r;
}

At -O3, GCC does not vectorize this loop.
Clang/LLVM does:
.LBB0_5:                                // =>This Inner Loop Header: Depth=1
        ld1     { v3.b }[0], [x8]
        sub     x12, x8, #2
        ld1     { v5.b }[0], [x10]
        ld1     { v4.b }[0], [x12]
        sub     x12, x10, #2
        ld1     { v6.b }[0], [x12]
        add     x12, x8, #1
        ld1     { v3.b }[4], [x12]
        add     x12, x10, #1
        ld1     { v5.b }[4], [x12]
        sub     x12, x8, #1
        ld1     { v4.b }[4], [x12]
        sub     x12, x10, #1
        ld1     { v6.b }[4], [x12]
        eor     v3.8b, v5.8b, v3.8b
        ushll   v3.2d, v3.2s, #0
        and     v3.16b, v3.16b, v1.16b
        eor     v4.8b, v6.8b, v4.8b
        ushll   v4.2d, v4.2s, #0
        and     v4.16b, v4.16b, v1.16b
        cnt     v3.16b, v3.16b
        cnt     v4.16b, v4.16b
        uaddlp  v3.8h, v3.16b
        uaddlp  v4.8h, v4.16b
        uaddlp  v3.4s, v3.8h
        uaddlp  v4.4s, v4.8h
        add     x8, x8, #4
        subs    x11, x11, #4
        uadalp  v2.2d, v3.4s
        uadalp  v0.2d, v4.4s
        add     x10, x10, #4
        b.ne    .LBB0_5

------ CUT ----
Note: I think we could do better than this.

Reply via email to