https://gcc.gnu.org/bugzilla/show_bug.cgi?id=118505
Bug ID: 118505 Summary: [15 regression] aarch64: 25% regression in TSVC s258 since r15-3436-gb2b20b277988ab Product: gcc Version: 15.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: rtl-optimization Assignee: unassigned at gcc dot gnu.org Reporter: dhruvc at nvidia dot com Target Milestone: --- Test case: === #define iterations 100000 #define LEN_1D 32000 #define LEN_2D 256 #define ARRAY_ALIGNMENT 64 #include <sys/time.h> struct args_t { struct timeval t1; struct timeval t2; void *__restrict__ arg_info; }; typedef float real_t; __attribute__ ((aligned (ARRAY_ALIGNMENT))) real_t a[LEN_1D], b[LEN_1D], c[LEN_1D], d[LEN_1D], e[LEN_1D], aa[LEN_2D][LEN_2D]; void s258 (struct args_t *func_args) { real_t s; for (int nl = 0; nl < iterations; nl++) { s = 0.; for (int i = 0; i < LEN_2D; ++i) { if (a[i] > 0.) { s = d[i] * d[i]; } b[i] = s * c[i] + d[i]; e[i] = (s + (real_t) 1.) * aa[0][i]; } } } === Before the patch: === s258: .LFB0: .cfi_startproc fmov s31, 1.0e+0 adrp x2, d adrp x1, a adrp x6, c adrp x5, b adrp x4, e adrp x3, aa mov w7, 34464 add x2, x2, :lo12:d add x1, x1, :lo12:a add x6, x6, :lo12:c add x5, x5, :lo12:b add x4, x4, :lo12:e add x3, x3, :lo12:aa movk w7, 0x1, lsl 16 .L7: movi v30.2s, #0 mov x0, 0 .p2align 5,,15 .L6: ldr s4, [x1, x0] fadd s27, s30, s31 ldr s29, [x2, x0] fcmpe s4, #0.0 bls .L13 fmul s30, s29, s29 ldr s3, [x6, x0] ldr s1, [x3, x0] fadd s2, s30, s31 fmadd s3, s3, s30, s29 fmul s1, s1, s2 str s3, [x5, x0] str s1, [x4, x0] add x0, x0, 4 cmp x0, 1024 bne .L6 subs w7, w7, #1 bne .L7 .L15: ret .p2align 2,,3 .L13: ldr s0, [x6, x0] ldr s28, [x3, x0] fmadd s0, s30, s0, s29 fmul s28, s27, s28 str s28, [x4, x0] str s0, [x5, x0] add x0, x0, 4 cmp x0, 1024 bne .L6 subs w7, w7, #1 bne .L7 b .L15 .cfi_endproc === After the patch: === s258: .LFB0: .cfi_startproc fmov s31, 1.0e+0 adrp x6, d adrp x5, a adrp x4, c adrp x3, b adrp x2, e adrp x1, aa mov w7, 34464 add x6, x6, :lo12:d add x5, x5, :lo12:a add x4, x4, :lo12:c add x3, x3, :lo12:b add x2, 
x2, :lo12:e add x1, x1, :lo12:aa movk w7, 0x1, lsl 16 .L5: movi v30.2s, #0 mov x0, 0 .p2align 5,,15 .L4: ldr s2, [x5, x0] ldr s29, [x6, x0] ldr s0, [x4, x0] fcmpe s2, #0.0 ldr s28, [x1, x0] fmul s1, s29, s29 fcsel s30, s1, s30, gt fadd s27, s30, s31 fmadd s0, s30, s0, s29 fmul s28, s27, s28 str s0, [x3, x0] str s28, [x2, x0] add x0, x0, 4 cmp x0, 1024 bne .L4 subs w7, w7, #1 bne .L5 ret .cfi_endproc === Compiled on NVIDIA Grace with: gcc -std=c99 -march=native -Ofast -fstrict-aliasing -fivopts -ftree-vectorize Timings: before the patch 0.02s, after the patch 0.025s. The part I find strange is that the faster (pre-patch) code has an extra section (label .L13) containing an extra fmadd, yet it still runs faster than the shorter post-patch code. Could the fcsel be causing the performance hit?