https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109885

--- Comment #1 from Andrew Pinski <pinskia at gcc dot gnu.org> ---
Just FYI, GCC does better on aarch64 with sum.
GCC:
        // Counts how many of the 8 floats at [x0] compare equal to 0.0; result in w0.
        ldp     q29, q30, [x0]                  // first four floats -> v29, next four -> v30
        movi    v31.4s, 0x1                     // v31 = {1, 1, 1, 1}
        fcmeq   v29.4s, v29.4s, 0               // lane = all-ones (-1) if == 0.0, else 0
        fcmeq   v30.4s, v30.4s, 0
        and     v31.16b, v31.16b, v29.16b       // 1 in each lane where the first half matched
        sub     v31.4s, v31.4s, v30.4s          // subtracting -1 adds 1 where the second half matched
        addv    s31, v31.4s                     // horizontal sum of the four lanes = total count
        fmov    w0, s31
        ret

vs. this mess from clang/LLVM:
        // Builds a bitmask of the floats at [x0] that equal 0.0, then pop-counts it into w0.
        sub     sp, sp, #16                     // reserves 16 bytes; no stack access appears in this listing
        ldp     q1, q0, [x0]                    // first four floats -> v1, next four -> v0
        adrp    x8, .LCPI0_0                    // page address of a constant-pool entry
        fcmeq   v1.4s, v1.4s, #0.0              // lane = all-ones if == 0.0, else 0
        fcmeq   v0.4s, v0.4s, #0.0
        uzp1    v0.8h, v1.8h, v0.8h             // keep the even halfwords: eight 16-bit masks in one register
        ldr     q1, [x8, :lo12:.LCPI0_0]        // constant not shown here; presumably per-lane bit weights (1, 2, 4, ...) - confirm
        and     v0.16b, v0.16b, v1.16b
        addv    h0, v0.8h                       // horizontal sum of the masked lanes
        fmov    w8, s0
        and     w8, w8, #0xff                   // keep only the low 8 bits
        fmov    s0, w8
        cnt     v0.8b, v0.8b                    // per-byte population count
        uaddlv  h0, v0.8b                       // sum the per-byte counts
        fmov    w0, s0
        add     sp, sp, #16
        ret

The reason appears to be that clang/LLVM is tuned to try to use x86's
movmskps/testps, while GCC is tuned to do just a sum reduction in general.
Though I think GCC could be slightly better here too:
        // Proposed sequence: count of the 8 floats at [x0] that equal 0.0, returned in w0.
        ldp     q29, q30, [x0]
        fcmeq   v29.4s, v29.4s, 0               // lane = -1 if == 0.0, else 0
        fcmeq   v30.4s, v30.4s, 0
        add     v31.4s, v29.4s, v30.4s          // must be a 32-bit lane add: byte-wise (.16b) turns -1 + -1 into 0xFEFEFEFE, not -2
        addv    s31, v31.4s                     // = -(number of matches)
        fmov    w0, s31
        neg     w0, w0                          // flip the sign to get the count
        ret

I think this might be the best code for an aarch64 reduction of bools.

Reply via email to