https://gcc.gnu.org/bugzilla/show_bug.cgi?id=118505

            Bug ID: 118505
           Summary: [15 regression] aarch64: 25% regression in TSVC s258
                    since r15-3436-gb2b20b277988ab
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: dhruvc at nvidia dot com
  Target Milestone: ---

Test case:

===
/* Number of times s258 repeats its inner loop over the arrays.  */
#define iterations 100000
/* Element count of each 1-D global array (a, b, c, d, e).  */
#define LEN_1D 32000
/* Side length of the square 2-D array aa; also the inner-loop trip count
   in s258.  */
#define LEN_2D 256
/* Byte alignment requested for the global arrays.  */
#define ARRAY_ALIGNMENT 64
#include <sys/time.h>

/* Argument bundle passed to the kernel.  s258 in this test case receives a
   pointer to it but never reads any field.  */
struct args_t
{
  /* NOTE(review): presumably start/end timestamps filled in by the benchmark
     harness -- not shown in this test case, confirm against TSVC.  */
  struct timeval t1;
  struct timeval t2;
  /* Opaque, restrict-qualified pointer; not dereferenced in this test case.  */
  void *__restrict__ arg_info;
};

/* Element type used by the kernel: single-precision float.  */
typedef float real_t;

/* Global working arrays.  s258 reads a, c, d and row 0 of aa, and writes
   b and e.  */
__attribute__ ((aligned (ARRAY_ALIGNMENT))) real_t a[LEN_1D], b[LEN_1D],
  c[LEN_1D], d[LEN_1D], e[LEN_1D], aa[LEN_2D][LEN_2D];

/* Reproducer kernel for the reported regression.

   Repeats, `iterations' times, a pass over the first LEN_2D elements of the
   global arrays.  Within a pass the scalar `s' is conditionally replaced by
   d[i] * d[i] when a[i] > 0 and otherwise keeps the value from the previous
   element, so each b[i] / e[i] depends on the most recent positive a[] seen
   in that pass.

   FUNC_ARGS is not used by the body.  Nothing is returned; the results are
   the stores to the global arrays b and e.  */
void
s258 (struct args_t *func_args)
{
  /* Carried across inner-loop iterations; see the conditional update.  */
  real_t s;
  for (int nl = 0; nl < iterations; nl++)
    {
      /* Each outer repetition starts from s == 0, so every pass computes
         the same values.  */
      s = 0.;
      for (int i = 0; i < LEN_2D; ++i)
        {
          /* Conditional update of the carried scalar: only overwritten
             when a[i] is positive, otherwise the old s is reused.  */
          if (a[i] > 0.)
            {
              s = d[i] * d[i];
            }
          /* Both stores use the (possibly just updated) value of s.  */
          b[i] = s * c[i] + d[i];
          e[i] = (s + (real_t) 1.) * aa[0][i];
        }
    }
}
===

Before the patch:

===
s258:
.LFB0:
        .cfi_startproc
        fmov    s31, 1.0e+0
        adrp    x2, d
        adrp    x1, a
        adrp    x6, c
        adrp    x5, b
        adrp    x4, e
        adrp    x3, aa
        mov     w7, 34464
        add     x2, x2, :lo12:d
        add     x1, x1, :lo12:a
        add     x6, x6, :lo12:c
        add     x5, x5, :lo12:b
        add     x4, x4, :lo12:e
        add     x3, x3, :lo12:aa
        movk    w7, 0x1, lsl 16
.L7:
        movi    v30.2s, #0
        mov     x0, 0
        .p2align 5,,15
.L6:
        ldr     s4, [x1, x0]
        fadd    s27, s30, s31
        ldr     s29, [x2, x0]
        fcmpe   s4, #0.0
        bls     .L13
        fmul    s30, s29, s29
        ldr     s3, [x6, x0]
        ldr     s1, [x3, x0]
        fadd    s2, s30, s31
        fmadd   s3, s3, s30, s29
        fmul    s1, s1, s2
        str     s3, [x5, x0]
        str     s1, [x4, x0]
        add     x0, x0, 4
        cmp     x0, 1024
        bne     .L6
        subs    w7, w7, #1
        bne     .L7
.L15:
        ret
        .p2align 2,,3
.L13:
        ldr     s0, [x6, x0]
        ldr     s28, [x3, x0]
        fmadd   s0, s30, s0, s29
        fmul    s28, s27, s28
        str     s28, [x4, x0]
        str     s0, [x5, x0]
        add     x0, x0, 4
        cmp     x0, 1024
        bne     .L6
        subs    w7, w7, #1
        bne     .L7
        b       .L15
        .cfi_endproc
===

After the patch:

===
s258:
.LFB0:
        .cfi_startproc
        fmov    s31, 1.0e+0
        adrp    x6, d
        adrp    x5, a
        adrp    x4, c
        adrp    x3, b
        adrp    x2, e
        adrp    x1, aa
        mov     w7, 34464
        add     x6, x6, :lo12:d
        add     x5, x5, :lo12:a
        add     x4, x4, :lo12:c
        add     x3, x3, :lo12:b
        add     x2, x2, :lo12:e
        add     x1, x1, :lo12:aa
        movk    w7, 0x1, lsl 16
.L5:
        movi    v30.2s, #0
        mov     x0, 0
        .p2align 5,,15
.L4:
        ldr     s2, [x5, x0]
        ldr     s29, [x6, x0]
        ldr     s0, [x4, x0]
        fcmpe   s2, #0.0
        ldr     s28, [x1, x0]
        fmul    s1, s29, s29
        fcsel   s30, s1, s30, gt
        fadd    s27, s30, s31
        fmadd   s0, s30, s0, s29
        fmul    s28, s27, s28
        str     s0, [x3, x0]
        str     s28, [x2, x0]
        add     x0, x0, 4
        cmp     x0, 1024
        bne     .L4
        subs    w7, w7, #1
        bne     .L5
        ret
        .cfi_endproc
===

Compiled on NVIDIA Grace with:

gcc -std=c99 -march=native -Ofast -fstrict-aliasing -fivopts -ftree-vectorize

Before: 0.02s
After:  0.025s

The part I find strange is that the faster (pre-patch) code has an extra
section (label .L13) containing a second copy of the loop tail, including an
extra fmadd. Could the fcsel in the post-patch code be causing the performance
hit?

Reply via email to