https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110647

--- Comment #2 from Jan Hubicka <hubicka at gcc dot gnu.org> ---
This is a testcase based on our testsuite version, so that it can be copied
to Compiler Explorer:

// Base repetition count; s2712 below runs its outer loop 4*iterations times.
#define iterations 10000
// Number of elements in each 1-D array (x, a, b, c, d, e, indx).
#define LEN_1D 32000
// Side length of the square 2-D arrays (aa, bb, cc, tt); flat_2d_array holds
// LEN_2D * LEN_2D elements.
#define LEN_2D 256
// Value passed to the aligned attribute on the global arrays below.
#define ARRAY_ALIGNMENT 64

typedef float real_t;
// Not referenced by the code shown in this testcase.
#define ABS fabsf
__attribute__((aligned(ARRAY_ALIGNMENT)))
real_t flat_2d_array[LEN_2D * LEN_2D];
__attribute__((aligned(ARRAY_ALIGNMENT))) real_t x[LEN_1D];
__attribute__((aligned(ARRAY_ALIGNMENT))) real_t a[LEN_1D], b[LEN_1D],
    c[LEN_1D], d[LEN_1D], e[LEN_1D], aa[LEN_2D][LEN_2D], bb[LEN_2D][LEN_2D],
    cc[LEN_2D][LEN_2D], tt[LEN_2D][LEN_2D];
__attribute__((aligned(ARRAY_ALIGNMENT))) int indx[LEN_1D];

// Declared only; no definition is part of this testcase.
// NOTE(review): presumably an opaque external call that keeps the compiler
// from collapsing the repeated loop in s2712 -- confirm against the testsuite.
int dummy(real_t[LEN_1D], real_t[LEN_1D], real_t[LEN_1D], real_t[LEN_1D],
          real_t[LEN_1D], real_t[LEN_2D][LEN_2D], real_t[LEN_2D][LEN_2D],
          real_t[LEN_2D][LEN_2D], real_t);
// Repeatedly sweeps the global arrays a, b and c, applying a conditional
// multiply-add to a. func_args is not used by the body, and the function
// always returns 0.
real_t s2712(struct args_t * func_args)
{
//    control flow
//    if to elemental min


    // The whole sweep is repeated 4*iterations (= 40000) times.
    for (int nl = 0; nl < 4*iterations; nl++) {
        // a[i] is updated only where a[i] >= b[i]; all other elements of a
        // are left unchanged, so the store to a[i] is conditional.
        for (int i = 0; i < LEN_1D; i++) {
            if (a[i] >= b[i]) {
                a[i] += b[i] * c[i];
            }
        }
        // Called once per sweep with the global arrays and a zero scalar.
        // NOTE(review): dummy's body is not visible here; presumably it exists
        // so the sweeps cannot be merged or removed -- confirm.
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
    }
return 0;
}

So with GCC 13 I get:
s2712(args_t*):
        stp     x29, x30, [sp, -96]!
        mov     x29, sp
        stp     x19, x20, [sp, 16]
        adrp    x19, a
        adrp    x20, b
        add     x19, x19, :lo12:a
        add     x20, x20, :lo12:b
        stp     x21, x22, [sp, 32]
        adrp    x22, c
        mov     x21, 62464
        add     x22, x22, :lo12:c
        stp     x23, x24, [sp, 48]
        adrp    x24, e
        adrp    x23, d
        add     x24, x24, :lo12:e
        add     x23, x23, :lo12:d
        stp     x25, x26, [sp, 64]
        adrp    x26, bb
        adrp    x25, aa
        add     x26, x26, :lo12:bb
        add     x25, x25, :lo12:aa
        stp     x27, x28, [sp, 80]
        adrp    x27, cc
        add     x27, x27, :lo12:cc
        mov     w28, 40000
        movk    x21, 0x1, lsl 16
.L2:
        mov     x0, 0
.L5:
        ldr     s0, [x19, x0]
        ldr     s1, [x20, x0]
        fcmpe   s0, s1
        bge     .L7
.L3:
        add     x0, x0, 4
        cmp     x0, x21
        bne     .L5
        movi    v0.2s, #0
        mov     x7, x27
        mov     x6, x26
        mov     x5, x25
        mov     x4, x24
        mov     x3, x23
        mov     x2, x22
        mov     x1, x20
        mov     x0, x19
        bl      dummy(float*, float*, float*, float*, float*, float (*) [256],
float (*) [256], float (*) [256], float)
        subs    w28, w28, #1
        bne     .L2
        ldp     x19, x20, [sp, 16]
        movi    v0.2s, #0
        ldp     x21, x22, [sp, 32]
        ldp     x23, x24, [sp, 48]
        ldp     x25, x26, [sp, 64]
        ldp     x27, x28, [sp, 80]
        ldp     x29, x30, [sp], 96
        ret
.L7:
        ldr     s2, [x22, x0]
        fmadd   s0, s1, s2, s0
        str     s0, [x19, x0]
        b       .L3

and trunk:
s2712(args_t*):
        stp     x29, x30, [sp, -96]!
        mov     x29, sp
        stp     x19, x20, [sp, 16]
        adrp    x19, a
        adrp    x20, b
        add     x19, x19, :lo12:a
        add     x20, x20, :lo12:b
        stp     x21, x22, [sp, 32]
        adrp    x22, c
        mov     x21, 62464
        add     x22, x22, :lo12:c
        stp     x23, x24, [sp, 48]
        adrp    x24, e
        adrp    x23, d
        add     x24, x24, :lo12:e
        add     x23, x23, :lo12:d
        stp     x25, x26, [sp, 64]
        adrp    x26, bb
        adrp    x25, aa
        add     x26, x26, :lo12:bb
        add     x25, x25, :lo12:aa
        stp     x27, x28, [sp, 80]
        adrp    x27, cc
        add     x27, x27, :lo12:cc
        mov     w28, 40000
        movk    x21, 0x1, lsl 16
.L2:
        mov     x0, 0
.L5:
        ldr     s31, [x19, x0]
        ldr     s30, [x20, x0]
        fcmpe   s31, s30
        bge     .L7
.L3:
        add     x0, x0, 4
        cmp     x0, x21
        bne     .L5
        movi    v0.2s, #0
        mov     x7, x27
        mov     x6, x26
        mov     x5, x25
        mov     x4, x24
        mov     x3, x23
        mov     x2, x22
        mov     x1, x20
        mov     x0, x19
        bl      dummy(float*, float*, float*, float*, float*, float (*) [256],
float (*) [256], float (*) [256], float)
        subs    w28, w28, #1
        bne     .L2
        ldp     x19, x20, [sp, 16]
        movi    v0.2s, #0
        ldp     x21, x22, [sp, 32]
        ldp     x23, x24, [sp, 48]
        ldp     x25, x26, [sp, 64]
        ldp     x27, x28, [sp, 80]
        ldp     x29, x30, [sp], 96
        ret
.L7:
        ldr     s29, [x22, x0]
        fmadd   s31, s30, s29, s31
        str     s31, [x19, x0]
        b       .L3

The only difference seems to be which FP registers are used (the diff below
goes from the trunk output to the GCC 13 output):
 .L2:
         mov     x0, 0
 .L5:
-        ldr     s31, [x19, x0]
-        ldr     s30, [x20, x0]
-        fcmpe   s31, s30
+        ldr     s0, [x19, x0]
+        ldr     s1, [x20, x0]
+        fcmpe   s0, s1
         bge     .L7
 .L3:
         add     x0, x0, 4
@@ -57,7 +57,7 @@
         ldp     x29, x30, [sp], 96
         ret
 .L7:
-        ldr     s29, [x22, x0]
-        fmadd   s31, s30, s29, s31
-        str     s31, [x19, x0]
+        ldr     s2, [x22, x0]
+        fmadd   s0, s1, s2, s0
+        str     s0, [x19, x0]
         b       .L3

so it seems that this is just noise (caused by a code layout change in the
whole benchmark) after all?

Reply via email to