https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110647
--- Comment #2 from Jan Hubicka <hubicka at gcc dot gnu.org> --- This is a testcase based on our testsuite version so it can be copied to Compiler Explorer: #define iterations 10000 #define LEN_1D 32000 #define LEN_2D 256 #define ARRAY_ALIGNMENT 64 typedef float real_t; #define ABS fabsf __attribute__((aligned(ARRAY_ALIGNMENT))) real_t flat_2d_array[LEN_2D * LEN_2D]; __attribute__((aligned(ARRAY_ALIGNMENT))) real_t x[LEN_1D]; __attribute__((aligned(ARRAY_ALIGNMENT))) real_t a[LEN_1D], b[LEN_1D], c[LEN_1D], d[LEN_1D], e[LEN_1D], aa[LEN_2D][LEN_2D], bb[LEN_2D][LEN_2D], cc[LEN_2D][LEN_2D], tt[LEN_2D][LEN_2D]; __attribute__((aligned(ARRAY_ALIGNMENT))) int indx[LEN_1D]; int dummy(real_t[LEN_1D], real_t[LEN_1D], real_t[LEN_1D], real_t[LEN_1D], real_t[LEN_1D], real_t[LEN_2D][LEN_2D], real_t[LEN_2D][LEN_2D], real_t[LEN_2D][LEN_2D], real_t); real_t s2712(struct args_t * func_args) { // control flow // if to elemental min for (int nl = 0; nl < 4*iterations; nl++) { for (int i = 0; i < LEN_1D; i++) { if (a[i] >= b[i]) { a[i] += b[i] * c[i]; } } dummy(a, b, c, d, e, aa, bb, cc, 0.); } return 0; } So with GCC 13 I get: s2712(args_t*): stp x29, x30, [sp, -96]! 
mov x29, sp stp x19, x20, [sp, 16] adrp x19, a adrp x20, b add x19, x19, :lo12:a add x20, x20, :lo12:b stp x21, x22, [sp, 32] adrp x22, c mov x21, 62464 add x22, x22, :lo12:c stp x23, x24, [sp, 48] adrp x24, e adrp x23, d add x24, x24, :lo12:e add x23, x23, :lo12:d stp x25, x26, [sp, 64] adrp x26, bb adrp x25, aa add x26, x26, :lo12:bb add x25, x25, :lo12:aa stp x27, x28, [sp, 80] adrp x27, cc add x27, x27, :lo12:cc mov w28, 40000 movk x21, 0x1, lsl 16 .L2: mov x0, 0 .L5: ldr s0, [x19, x0] ldr s1, [x20, x0] fcmpe s0, s1 bge .L7 .L3: add x0, x0, 4 cmp x0, x21 bne .L5 movi v0.2s, #0 mov x7, x27 mov x6, x26 mov x5, x25 mov x4, x24 mov x3, x23 mov x2, x22 mov x1, x20 mov x0, x19 bl dummy(float*, float*, float*, float*, float*, float (*) [256], float (*) [256], float (*) [256], float) subs w28, w28, #1 bne .L2 ldp x19, x20, [sp, 16] movi v0.2s, #0 ldp x21, x22, [sp, 32] ldp x23, x24, [sp, 48] ldp x25, x26, [sp, 64] ldp x27, x28, [sp, 80] ldp x29, x30, [sp], 96 ret .L7: ldr s2, [x22, x0] fmadd s0, s1, s2, s0 str s0, [x19, x0] b .L3 and trunk: s2712(args_t*): stp x29, x30, [sp, -96]! 
mov x29, sp stp x19, x20, [sp, 16] adrp x19, a adrp x20, b add x19, x19, :lo12:a add x20, x20, :lo12:b stp x21, x22, [sp, 32] adrp x22, c mov x21, 62464 add x22, x22, :lo12:c stp x23, x24, [sp, 48] adrp x24, e adrp x23, d add x24, x24, :lo12:e add x23, x23, :lo12:d stp x25, x26, [sp, 64] adrp x26, bb adrp x25, aa add x26, x26, :lo12:bb add x25, x25, :lo12:aa stp x27, x28, [sp, 80] adrp x27, cc add x27, x27, :lo12:cc mov w28, 40000 movk x21, 0x1, lsl 16 .L2: mov x0, 0 .L5: ldr s31, [x19, x0] ldr s30, [x20, x0] fcmpe s31, s30 bge .L7 .L3: add x0, x0, 4 cmp x0, x21 bne .L5 movi v0.2s, #0 mov x7, x27 mov x6, x26 mov x5, x25 mov x4, x24 mov x3, x23 mov x2, x22 mov x1, x20 mov x0, x19 bl dummy(float*, float*, float*, float*, float*, float (*) [256], float (*) [256], float (*) [256], float) subs w28, w28, #1 bne .L2 ldp x19, x20, [sp, 16] movi v0.2s, #0 ldp x21, x22, [sp, 32] ldp x23, x24, [sp, 48] ldp x25, x26, [sp, 64] ldp x27, x28, [sp, 80] ldp x29, x30, [sp], 96 ret .L7: ldr s29, [x22, x0] fmadd s31, s30, s29, s31 str s31, [x19, x0] b .L3 The only difference seems to be: .L2: mov x0, 0 .L5: - ldr s31, [x19, x0] - ldr s30, [x20, x0] - fcmpe s31, s30 + ldr s0, [x19, x0] + ldr s1, [x20, x0] + fcmpe s0, s1 bge .L7 .L3: add x0, x0, 4 @@ -57,7 +57,7 @@ ldp x29, x30, [sp], 96 ret .L7: - ldr s29, [x22, x0] - fmadd s31, s30, s29, s31 - str s31, [x19, x0] + ldr s2, [x22, x0] + fmadd s0, s1, s2, s0 + str s0, [x19, x0] b .L3 which suggests that this is just noise (caused by a code layout change in the whole benchmark) after all?