https://gcc.gnu.org/bugzilla/show_bug.cgi?id=118188

            Bug ID: 118188
           Summary: [15 regression] aarch64: 30% regression in TSVC s4115
                    since r15-5565-gdbc38dd9e96a99
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: dhruvc at nvidia dot com
  Target Milestone: ---

Testcase:

===
/* Number of times s4115 repeats its inner loop (outer-loop trip count). */
#define iterations 100000
/* Element count of the global arrays and trip count of the inner loop. */
#define LEN_1D 32000
/* Not referenced anywhere in this reduced testcase. */
#define LEN_2D 256
/* Byte alignment requested for the global arrays via the aligned attribute. */
#define ARRAY_ALIGNMENT 64
#include <sys/time.h>

/*
 * Argument bundle handed to the kernel.
 * Only arg_info is read by s4115 in this testcase; t1 and t2 are never
 * touched here (presumably timing stamps used by the full benchmark
 * harness -- not verifiable from this reduced testcase).
 */
struct args_t {
    struct timeval t1;
    struct timeval t2;
    /* Untyped kernel-specific payload; s4115 reads it as an int index array. */
    void * __restrict__ arg_info;
};

/* Element type used by the kernel; single-precision float in this testcase. */
typedef float real_t;

/*
 * Global arrays of LEN_1D elements, declared with an
 * ARRAY_ALIGNMENT-byte aligned attribute.
 * Only a and b are referenced by s4115; c, d and e are unused in this testcase.
 */
__attribute__((aligned(ARRAY_ALIGNMENT))) real_t
a[LEN_1D],b[LEN_1D],c[LEN_1D],d[LEN_1D],e[LEN_1D];

/*
 * Reproducer kernel: dot product of a[] with elements of b[] selected
 * through an index array, i.e. sum of a[i] * b[ip[i]] for i in [0, LEN_1D).
 *
 * func_args->arg_info supplies the index array ip. The whole reduction is
 * repeated `iterations` times and sum is reset at the start of each
 * repetition, so the value returned is the result of the final repetition.
 *
 * The indexed read b[ip[i]] is the access the report is about: the "Before"
 * listing loads it with one gather instruction, the "After" listing with
 * separate scalar loads.
 */
real_t s4115(struct args_t * func_args)
{
    /* Index array, passed in through the untyped arg_info pointer. */
    int * __restrict__ ip = func_args->arg_info;
    real_t sum;
    /* Outer loop only repeats the same reduction. */
    for (int nl = 0; nl < iterations; nl++) {
        sum = 0.;
        for (int i = 0; i < LEN_1D; i++) {
            /* a is read contiguously; b is read indirectly through ip[i]. */
            sum += a[i] * b[ip[i]];
        }
    }
    return sum;
}
===

Before:

===
.L2:
        mov     x0, 0
        mov     p7.b, p15.b
        movi    d0, #0
        .p2align 5,,15
.L3:
        ld1w    z29.s, p7/z, [x5, x0, lsl 2]
        ld1w    z31.s, p7/z, [x3, x0, lsl 2]
        ld1w    z30.s, p7/z, [x2, z29.s, sxtw 2]
        add     x0, x0, x4
        fmla    z0.s, p7/m, z31.s, z30.s
        whilelo p7.s, w0, w1
        b.any   .L3
        subs    w6, w6, #1
        bne     .L2
        ptrue   p7.b, all
        faddv   s0, p7, z0.s
        ret
===

After:

===
.L2:
        movi    v0.4s, 0
        mov     x0, 0
        .p2align 5,,15
.L3:
        add     x2, x3, x0
        ldrsw   x4, [x3, x0]
        ldrsw   x6, [x2, 4]
        ldpsw   x2, x5, [x2, 8]
        ldr     s1, [x1, x4, lsl 2]
        ldr     s30, [x1, x6, lsl 2]
        ldr     s31, [x1, x5, lsl 2]
        ldr     s29, [x1, x2, lsl 2]
        uzp1    v30.2s, v30.2s, v31.2s
        ldr     q31, [x7, x0]
        add     x0, x0, 16
        uzp1    v1.2s, v1.2s, v29.2s
        zip1    v30.4s, v1.4s, v30.4s
        fmla    v0.4s, v31.4s, v30.4s
        cmp     x0, x8
        bne     .L3
        subs    w9, w9, #1
        bne     .L2
        faddp   v0.4s, v0.4s, v0.4s
        faddp   v0.4s, v0.4s, v0.4s
        ret
===

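I think the scalar loads in the "After" loop (the ldrsw/ldpsw index loads and
the four ldr loads from b, which replace the single SVE gather ld1w in the
"Before" loop) are causing the slowdown. This appears to have been an
optimization in GCC 15 that has regressed again.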

Command line:
gcc -std=c99 -march=native -Ofast -fstrict-aliasing -fivopts -ftree-vectorize
-S -mcpu=grace -c src/tsvc.c

Reply via email to