https://gcc.gnu.org/bugzilla/show_bug.cgi?id=103781

            Bug ID: 103781
           Summary: [AArch64, 11 regr.] Failed partial vectorization of
                    mulv2di3
           Product: gcc
           Version: 11.2.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: husseydevin at gmail dot com
  Target Milestone: ---

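As of GCC 11, the AArch64 backend is very greedy in trying to vectorize
64-bit multiplies (mulv2di3). However, there is no mulv2di3 pattern, so the
multiplications are still done in scalar registers and the products are then
inserted into a vector register just to perform the addition.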

The bad codegen should be obvious. 

#include <stdint.h>

/*
 * Reproducer: multiply-accumulate over two input streams into two sums.
 *
 * The loop runs 16384 times. Each iteration reads two consecutive elements
 * from x and two from y, adding the first product to acc[0] and the second
 * product to acc[1]. In total 32768 elements are read from each of x and y.
 *
 * acc: two-element accumulator, read and updated in place.
 * x:   first input array (at least 32768 elements are read).
 * y:   second input array (at least 32768 elements are read).
 */
void fma_u64(uint64_t *restrict acc, const uint64_t *restrict x, const uint64_t
*restrict y)
{
    for (int i = 0; i < 16384; i++){
        acc[0] += *x++ * *y++;  /* first element pair of this iteration */
        acc[1] += *x++ * *y++;  /* second element pair of this iteration */
    }
}

gcc-11 -O3

// GCC 11 -O3 output for fma_u64 (AAPCS64: x0 = acc, x1 = x, x2 = y).
// Both accumulators live in vector register v1; each product is computed
// in scalar registers and then inserted into v0 for a vector add.
fma_u64:
.LFB0:
        .cfi_startproc
        ldr     q1, [x0]                // v1 = { acc[0], acc[1] }
        add     x6, x1, 262144          // x6 = end of x: 16384 iterations * 16 bytes
        .p2align 3,,7
.L2:
        ldr     x4, [x1], 16            // x4 = first x element; x1 += 16 (post-index)
        ldr     x5, [x2], 16            // x5 = first y element; x2 += 16 (post-index)
        ldr     x3, [x1, -8]            // x3 = second x element (x1 already advanced)
        mul     x4, x4, x5              // first product, scalar
        ldr     x5, [x2, -8]            // x5 = second y element (x2 already advanced)
        fmov    d0, x4                  // v0 lane 0 = first product
        ins     v0.d[1], x5             // lane 1 = raw y element; overwritten by the
                                        // next ins before v0 is read
        mul     x3, x3, x5              // second product, scalar
        ins     v0.d[1], x3             // v0 lane 1 = second product
        add     v1.2d, v1.2d, v0.2d     // accumulate both products at once
        cmp     x1, x6                  // reached the end of x?
        bne     .L2
        str     q1, [x0]                // write both accumulators back to acc
        ret
        .cfi_endproc

GCC 10.2.1 emits better code: both accumulators stay in scalar registers and
each multiply-add is a single madd.

// GCC 10.2.1 output for fma_u64 (AAPCS64: x0 = acc, x1 = x, x2 = y).
// Both accumulators live in scalar registers (x4, x3) and each
// multiply-accumulate is a single madd.
fma_u64:
.LFB0:
        .cfi_startproc
        ldp     x4, x3, [x0]            // x4 = acc[0], x3 = acc[1]
        add     x9, x1, 262144          // x9 = end of x: 16384 iterations * 16 bytes
        .p2align 3,,7
.L2:
        ldr     x8, [x1], 16            // x8 = first x element; x1 += 16 (post-index)
        ldr     x7, [x2], 16            // x7 = first y element; x2 += 16 (post-index)
        ldr     x6, [x1, -8]            // x6 = second x element (x1 already advanced)
        ldr     x5, [x2, -8]            // x5 = second y element (x2 already advanced)
        madd    x4, x8, x7, x4          // acc[0] += first product
        madd    x3, x6, x5, x3          // acc[1] += second product
        cmp     x9, x1                  // reached the end of x?
        bne     .L2
        stp     x4, x3, [x0]            // write both accumulators back to acc
        ret
        .cfi_endproc

However, the ideal code would be a 2-iteration unroll of the loop.

Side note: why is ldp not used for the loads inside the loop?

Reply via email to