https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116840

            Bug ID: 116840
           Summary: Optimise __builtin_parity for aarch64
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: enhancement
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: ktkachov at gcc dot gnu.org
  Target Milestone: ---
            Target: aarch64

/* Reproducer, 8-bit case: return the parity of the unsigned char X
   (1 if an odd number of bits are set, 0 otherwise), computed with the
   type-generic __builtin_parityg.  */
int
foob (unsigned char x)
{
  return __builtin_parityg (x);
}

/* Reproducer, 16-bit case: return the parity of the unsigned short X
   (1 if an odd number of bits are set, 0 otherwise), computed with the
   type-generic __builtin_parityg.  */
int
fooh (unsigned short x)
{
  return __builtin_parityg (x);
}

/* Reproducer, 32-bit case: return the parity of the unsigned int X
   (1 if an odd number of bits are set, 0 otherwise), computed with
   __builtin_parity.  */
int
foo (unsigned int x)
{
  return __builtin_parity (x);
}

/* Reproducer, 64-bit case: return the parity of X (1 if an odd number
   of bits are set, 0 otherwise), computed with __builtin_parityll.
   NOTE(review): X is declared unsigned long while the builtin takes
   unsigned long long; presumably both are 64 bits on the LP64 aarch64
   target this report is about -- confirm if ILP32 matters.  */
int
fooll (unsigned long x)
{
  return __builtin_parityll (x);
}

For -O2 aarch64 we emit:
foob:
        and     x0, x0, 255
        fmov    d31, x0
        cnt     v31.8b, v31.8b
        addv    b31, v31.8b
        fmov    w0, s31
        and     w0, w0, 1
        ret
fooh:
        and     x0, x0, 65535
        fmov    d31, x0
        cnt     v31.8b, v31.8b
        addv    b31, v31.8b
        fmov    w0, s31
        and     w0, w0, 1
        ret
foo:
        fmov    s31, w0
        cnt     v31.8b, v31.8b
        addv    b31, v31.8b
        fmov    x0, d31
        and     w0, w0, 1
        ret
fooll:
        fmov    d31, x0
        cnt     v31.8b, v31.8b
        addv    b31, v31.8b
        fmov    x0, d31
        and     w0, w0, 1
        ret

For all functions but fooll this looks suboptimal. For the 32-bit foo LLVM
emits:
foo:
        eor     w8, w0, w0, lsr #16
        eor     w8, w8, w8, lsr #8
        eor     w8, w8, w8, lsr #4
        eor     w8, w8, w8, lsr #2
        eor     w8, w8, w8, lsr #1
        and     w0, w8, #0x1
        ret

This is one instruction longer than the GCC sequence but should be preferable,
as its instructions are all 1-cycle on many modern AArch64 cores and it does not
involve any cross-bank moves or vector reductions. I think we should use this
sequence for (!TARGET_CSSC || !TARGET_SIMD) expansions of the paritym2 optab,
maybe gated on optimize_function_for_speed_p (cfun) as well.

Reply via email to