https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110647
--- Comment #2 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
This is a testcase based on our testsuite version, so it can be copied to
Compiler Explorer:
// Base repeat count; s2712 runs its outer loop 4*iterations times.
#define iterations 10000
// Element count of the 1-D arrays.
#define LEN_1D 32000
// Side length of the square 2-D arrays.
#define LEN_2D 256
// Byte alignment requested for every global array below.
#define ARRAY_ALIGNMENT 64
// Element type used by all floating-point arrays in this testcase.
typedef float real_t;
// Not used by the code shown here; note that this snippet does not include
// <math.h>, where fabsf is declared.
#define ABS fabsf
// flat_2d_array and x are defined but not referenced by s2712.
__attribute__((aligned(ARRAY_ALIGNMENT)))
real_t flat_2d_array[LEN_2D * LEN_2D];
__attribute__((aligned(ARRAY_ALIGNMENT))) real_t x[LEN_1D];
// Working arrays. s2712 reads a, b and c and writes a; d, e, aa, bb and cc
// are only passed on to dummy(); tt is not referenced by s2712.
__attribute__((aligned(ARRAY_ALIGNMENT))) real_t a[LEN_1D], b[LEN_1D],
c[LEN_1D], d[LEN_1D], e[LEN_1D], aa[LEN_2D][LEN_2D], bb[LEN_2D][LEN_2D],
cc[LEN_2D][LEN_2D], tt[LEN_2D][LEN_2D];
// Integer index array; not referenced by s2712.
__attribute__((aligned(ARRAY_ALIGNMENT))) int indx[LEN_1D];
// Declared only: no definition of dummy() is visible in this testcase, so the
// compiler has to treat the call as opaque.
int dummy(real_t[LEN_1D], real_t[LEN_1D], real_t[LEN_1D], real_t[LEN_1D],
real_t[LEN_1D], real_t[LEN_2D][LEN_2D], real_t[LEN_2D][LEN_2D],
real_t[LEN_2D][LEN_2D], real_t);
// Kernel s2712: conditionally accumulate b[i]*c[i] into a[i].
//
// func_args is never read in this reduced testcase. The function always
// returns 0. The whole sweep over the arrays is repeated 4*iterations times,
// with a call to the externally defined dummy() after every sweep.
real_t s2712(struct args_t * func_args)
{
// control flow
// if to elemental min
for (int nl = 0; nl < 4*iterations; nl++) {
for (int i = 0; i < LEN_1D; i++) {
// The store to a[i] happens only when the comparison holds, so a[i] is
// left unwritten on the other path.
if (a[i] >= b[i]) {
a[i] += b[i] * c[i];
}
}
// The last argument is the double literal 0., converted to the real_t
// (float) parameter declared in dummy()'s prototype.
dummy(a, b, c, d, e, aa, bb, cc, 0.);
}
return 0;
}
So with GCC 13 I get:
// GCC 13 output for s2712 (AArch64). The inner loop is scalar: it handles one
// float per iteration (byte offset stepped by 4), and the conditional update
// lives out of line at .L7.
//
// Register roles established by the setup code below:
//   x19 = &a   x20 = &b   x22 = &c   x23 = &d   x24 = &e
//   x25 = &aa  x26 = &bb  x27 = &cc
//   x21 = 128000 (62464 + (1 << 16)) = LEN_1D * 4, the inner-loop byte bound
//   w28 = 40000 = 4 * iterations, the outer-loop down-counter
//   x0  = byte offset of the current element inside the inner loop
s2712(args_t*):
// Prologue: allocate a 96-byte frame, save FP/LR and callee-saved x19-x28.
// The address and constant setup is interleaved with the saves.
stp x29, x30, [sp, -96]!
mov x29, sp
stp x19, x20, [sp, 16]
adrp x19, a
adrp x20, b
add x19, x19, :lo12:a
add x20, x20, :lo12:b
stp x21, x22, [sp, 32]
adrp x22, c
// Low 16 bits of the byte bound; the movk further down completes it.
mov x21, 62464
add x22, x22, :lo12:c
stp x23, x24, [sp, 48]
adrp x24, e
adrp x23, d
add x24, x24, :lo12:e
add x23, x23, :lo12:d
stp x25, x26, [sp, 64]
adrp x26, bb
adrp x25, aa
add x26, x26, :lo12:bb
add x25, x25, :lo12:aa
stp x27, x28, [sp, 80]
adrp x27, cc
add x27, x27, :lo12:cc
mov w28, 40000
movk x21, 0x1, lsl 16
// Outer loop head: restart the byte offset at 0 for each sweep.
.L2:
mov x0, 0
// Inner loop head: s0 = a[i], s1 = b[i]; go to .L7 when a[i] >= b[i].
.L5:
ldr s0, [x19, x0]
ldr s1, [x20, x0]
fcmpe s0, s1
bge .L7
// Inner loop latch: advance one float and loop until the byte bound in x21.
.L3:
add x0, x0, 4
cmp x0, x21
bne .L5
// Set up the call to dummy: the float argument 0.0 goes in s0, the eight
// array pointers in x0-x7.
movi v0.2s, #0
mov x7, x27
mov x6, x26
mov x5, x25
mov x4, x24
mov x3, x23
mov x2, x22
mov x1, x20
mov x0, x19
bl dummy(float*, float*, float*, float*, float*, float (*) [256],
float (*) [256], float (*) [256], float)
// Outer loop latch: count w28 down to zero.
subs w28, w28, #1
bne .L2
// Epilogue: restore callee-saved registers, put the return value 0.0 in s0,
// release the frame and return.
ldp x19, x20, [sp, 16]
movi v0.2s, #0
ldp x21, x22, [sp, 32]
ldp x23, x24, [sp, 48]
ldp x25, x26, [sp, 64]
ldp x27, x28, [sp, 80]
ldp x29, x30, [sp], 96
ret
// Out-of-line taken path: a[i] = a[i] + b[i] * c[i], then rejoin the latch.
.L7:
ldr s2, [x22, x0]
fmadd s0, s1, s2, s0
str s0, [x19, x0]
b .L3
and trunk:
// Trunk output for s2712 (AArch64). The inner loop is scalar: it handles one
// float per iteration (byte offset stepped by 4), and the conditional update
// lives out of line at .L7. The floating-point temporaries here are s31, s30
// and s29.
//
// Register roles established by the setup code below:
//   x19 = &a   x20 = &b   x22 = &c   x23 = &d   x24 = &e
//   x25 = &aa  x26 = &bb  x27 = &cc
//   x21 = 128000 (62464 + (1 << 16)) = LEN_1D * 4, the inner-loop byte bound
//   w28 = 40000 = 4 * iterations, the outer-loop down-counter
//   x0  = byte offset of the current element inside the inner loop
s2712(args_t*):
// Prologue: allocate a 96-byte frame, save FP/LR and callee-saved x19-x28.
// The address and constant setup is interleaved with the saves.
stp x29, x30, [sp, -96]!
mov x29, sp
stp x19, x20, [sp, 16]
adrp x19, a
adrp x20, b
add x19, x19, :lo12:a
add x20, x20, :lo12:b
stp x21, x22, [sp, 32]
adrp x22, c
// Low 16 bits of the byte bound; the movk further down completes it.
mov x21, 62464
add x22, x22, :lo12:c
stp x23, x24, [sp, 48]
adrp x24, e
adrp x23, d
add x24, x24, :lo12:e
add x23, x23, :lo12:d
stp x25, x26, [sp, 64]
adrp x26, bb
adrp x25, aa
add x26, x26, :lo12:bb
add x25, x25, :lo12:aa
stp x27, x28, [sp, 80]
adrp x27, cc
add x27, x27, :lo12:cc
mov w28, 40000
movk x21, 0x1, lsl 16
// Outer loop head: restart the byte offset at 0 for each sweep.
.L2:
mov x0, 0
// Inner loop head: s31 = a[i], s30 = b[i]; go to .L7 when a[i] >= b[i].
.L5:
ldr s31, [x19, x0]
ldr s30, [x20, x0]
fcmpe s31, s30
bge .L7
// Inner loop latch: advance one float and loop until the byte bound in x21.
.L3:
add x0, x0, 4
cmp x0, x21
bne .L5
// Set up the call to dummy: the float argument 0.0 goes in s0, the eight
// array pointers in x0-x7.
movi v0.2s, #0
mov x7, x27
mov x6, x26
mov x5, x25
mov x4, x24
mov x3, x23
mov x2, x22
mov x1, x20
mov x0, x19
bl dummy(float*, float*, float*, float*, float*, float (*) [256],
float (*) [256], float (*) [256], float)
// Outer loop latch: count w28 down to zero.
subs w28, w28, #1
bne .L2
// Epilogue: restore callee-saved registers, put the return value 0.0 in s0,
// release the frame and return.
ldp x19, x20, [sp, 16]
movi v0.2s, #0
ldp x21, x22, [sp, 32]
ldp x23, x24, [sp, 48]
ldp x25, x26, [sp, 64]
ldp x27, x28, [sp, 80]
ldp x29, x30, [sp], 96
ret
// Out-of-line taken path: a[i] = a[i] + b[i] * c[i], then rejoin the latch.
.L7:
ldr s29, [x22, x0]
fmadd s31, s30, s29, s31
str s31, [x19, x0]
b .L3
The only difference seems to be the choice of floating-point registers (`-` lines are trunk, `+` lines are GCC 13):
.L2:
mov x0, 0
.L5:
- ldr s31, [x19, x0]
- ldr s30, [x20, x0]
- fcmpe s31, s30
+ ldr s0, [x19, x0]
+ ldr s1, [x20, x0]
+ fcmpe s0, s1
bge .L7
.L3:
add x0, x0, 4
@@ -57,7 +57,7 @@
ldp x29, x30, [sp], 96
ret
.L7:
- ldr s29, [x22, x0]
- fmadd s31, s30, s29, s31
- str s31, [x19, x0]
+ ldr s2, [x22, x0]
+ fmadd s0, s1, s2, s0
+ str s0, [x19, x0]
b .L3
which suggests that it is just noise (caused by a code layout change in the whole
benchmark) after all?