https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121290

            Bug ID: 121290
           Summary: Regressions in TSVC s119, s3113, s312, s313, s314,
                    s315, s316 since commit 3bf2aa834e1
           Product: gcc
           Version: 16.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: soumyaa at gcc dot gnu.org
                CC: rguenth at gcc dot gnu.org
  Target Milestone: ---

Hi,

Commit 3bf2aa834e1
[https://gcc.gnu.org/cgit/gcc/commit/?id=3bf2aa834e1270e3167c9559bef9a8ef1f668604]
has led to the following regressions in TSVC kernels:

Summary:

s3113 [230% Regression] -Ofast -mcpu=neoverse-v2
https://godbolt.org/z/En8rYf7sE

s312 [300%] -Ofast -mcpu=neoverse-v2
https://godbolt.org/z/WbT6rfW3h

s313 [92%] -Ofast -mcpu=neoverse-v2
https://godbolt.org/z/Kz1vv54hh

s314 [288%] -Ofast -mcpu=neoverse-v2
https://godbolt.org/z/9eo6dc9Pv

s316 [186%] -Ofast -mcpu=neoverse-v2
https://godbolt.org/z/9Mqzxr9aE

s315 [48%] -O3 -mcpu=neoverse-v2 -msve-vector-bits=128
https://godbolt.org/z/8465KKMc4

------

s3113, s312, s313, s314 and s316 are affected by missing loop unrolling. For
example, in s3113:

#define iterations 100000
#define LEN_1D 32000

float a[LEN_1D];

int main()
{
    for (int i = 0; i < LEN_1D; i++) {
        a[i] = i;
    }
    float max;
    for (int nl = 0; nl < iterations*4; nl++) {
        max = fabsf(a[0]);
        for (int i = 0; i < LEN_1D; i++) {
            if (fabsf(a[i]) > max) {
                max = fabsf(a[i]);
            }
        }
    }

    return max;
} 


Now:
        ldr     s24, [x0], 4
        fabs    s24, s24
        dup     v24.4s, v24.s[0]
        fmaxnm  v25.4s, v25.4s, v24.4s
        cmp     x1, x0
        bne     .L3
        add     w2, w2, 1
        cmp     w2, w4
        bne     .L4
        mov     w0, 1148846080
        dup     s25, v25.s[3]
        fmov    s31, w0
        fcmpe   s25, s31
        cset    w0, gt
        ret

Before:
        ldp     q2, q1, [x0]
        ldp     q0, q21, [x0, 32]
        add     x0, x0, 64
        fabs    v2.4s, v2.4s
        fabs    v1.4s, v1.4s
        fabs    v0.4s, v0.4s
        fabs    v21.4s, v21.4s
        fmaxnm  v25.4s, v25.4s, v2.4s
        fmaxnm  v22.4s, v22.4s, v1.4s
        fmaxnm  v23.4s, v23.4s, v0.4s
        fmaxnm  v24.4s, v24.4s, v21.4s
        cmp     x1, x0
        bne     .L3
        subs    w2, w2, #1
        bne     .L4
        fmaxnm  v22.4s, v25.4s, v22.4s
        mov     w0, 1148846080
        fmaxnm  v23.4s, v23.4s, v24.4s
        fmov    s31, w0
        fmaxnm  v23.4s, v22.4s, v23.4s
        fmaxnmv s23, v23.4s
        fcmpe   s23, s31
        cset    w0, gt
        ret

s315 is odd: the new code removes the branch by using bit/bsl, but still executes slower:

#define iterations 100000
#define LEN_1D 32000

float a[LEN_1D];

int main()
{
    for (int i = 0; i < LEN_1D; i++) {
        a[i] = (i * 7) % LEN_1D;
    }
    float x, chksum;
    int index;
    for (int nl = 0; nl < iterations; nl++) {
        x = a[0];
        index = 0;
        for (int i = 0; i < LEN_1D; ++i) {
            if (a[i] > x) {
                x = a[i];
                index = i;
            }
        }
        chksum = x + (float) index;
    }

    return index + x > 1;
} 

Now:

.L4:
        movi    v23.4s, 0
        mov     v24.16b, v26.16b
        mov     x0, x3
        mov     v22.16b, v23.16b
.L3:
        ld1r    {v1.4s}, [x0], 4
        fcmgt   v20.4s, v1.4s, v24.4s
        bit     v23.16b, v22.16b, v20.16b
        bsl     v20.16b, v1.16b, v24.16b
        add     v22.4s, v22.4s, v25.4s
        mov     v24.16b, v20.16b
        cmp     x1, x0
        bne     .L3
        add     w2, w2, 1
        cmp     w2, w4
        bne     .L4
        dup     s23, v23.s[3]
        dup     s20, v20.s[3]
        fmov    s21, 1.0e+0
        scvtf   s0, s23
        fadd    s20, s0, s20
        fcmpe   s20, s21
        cset    w0, gt
        ret

Before:
.L6:
        fmov    s25, s1
        movi    v26.2d, #0
        mov     x0, 0
.L5:
        ldr     s0, [x1, x0, lsl 2]
        fcmpe   s25, s0
        bmi     .L7
.L3:
        add     x0, x0, 1
        cmp     x0, x2
        bne     .L5
        subs    w3, w3, #1
        bne     .L6
        scvtf   s26, s26
        fmov    s24, 1.0e+0
        fadd    s26, s26, s25
        fcmpe   s26, s24
        cset    w0, gt
        ret
.L7:
        fmov    s26, w0
        fmov    s25, s0
        b       .L3


Thanks,
Soumya

Reply via email to