https://gcc.gnu.org/bugzilla/show_bug.cgi?id=119442

            Bug ID: 119442
           Summary: [14/15 Regression] Regression in creating SVE
                    predicate
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Keywords: aarch64-sve
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: ktkachov at gcc dot gnu.org
  Target Milestone: ---
            Target: aarch64

The testcase is nonsense in itself but is heavily reduced from a real
application. I'd appreciate help with crafting a better one:
/* Global accumulator: every inner-loop iteration adds one value to it. */
float fasten_main_etot_0;
/* Reduced reproducer for the predicate-construction regression.  The report
   itself calls the testcase nonsense; its exact shape is what matters, so it
   is deliberately left as-is.  */
void fasten_main() {
  /* The outer loop has no increment expression; l only advances in the
     inner loop.  */
  for (int l = 0; l < 2;) {
    /* Declared without an initialiser and never assigned before the read
       below.  */
    int phphb_nz;
    for (; l < 32; l++) {
      /* The 0/1 result of the logical AND is converted to float.  */
      float dslv_e = l && phphb_nz;
      fasten_main_etot_0 += dslv_e;
    }
  }
}

When compiled with -O3 -march=armv8.2-a+sve -msve-vector-bits=128, GCC tries to
create a governing predicate but ends up scalarising the operation into this
inefficient sequence:
// Regressed code generation quoted by the report (AArch64 SVE, GNU syntax).
// A predicate is built by spilling to the stack, patching the bits in a
// general register with bfi, and reloading it as a predicate.
fasten_main:
        // Reserve 16 bytes of stack; all sp-relative accesses below fall in it.
        sub     sp, sp, #16
        // Spill an all-false predicate to the stack.
        // NOTE(review): with 128-bit vectors a predicate register is 2 bytes,
        // so "#6, mul vl" should address the same slot as [sp, 12] - confirm.
        pfalse  p15.b
        str     p15, [sp, #6, mul vl]
        mov     w0, 0
        adrp    x1, .LANCHOR0
        ptrue   p14.b, vl1
        ptrue   p7.b, vl16
        ptrue   p15.s, vl4
        eor     p15.b, p15/z, p14.b, p15.b
        // Read the stack slot back into a general register.
        ldr     w2, [sp, 12]
        // s31 = current value at .LANCHOR0 (the accumulator for the faddas).
        ldr     s31, [x1, #:lo12:.LANCHOR0]
        // Four bfi instructions each insert the low 4 bits of w0 (zero, from
        // the mov above) into one 4-bit field of w2, covering bits 0..15.
        bfi     w2, w0, 0, 4
        uxtw    x2, w2
        bfi     w2, w0, 4, 4
        uxtw    x2, w2
        bfi     w2, w0, 8, 4
        uxtw    x2, w2
        bfi     w2, w0, 12, 4
        // Write the patched bits back and reload them as predicate p14.
        str     w2, [sp, 12]
        ldr     p14, [sp, #6, mul vl]
        and     p15.b, p14/z, p15.b, p15.b
        // Materialise integer 1 in the active lanes (0 elsewhere), then
        // convert both vectors to float under the all-true p7.
        mov     z30.s, p14/z, #1
        mov     z0.s, p15/z, #1
        scvtf   z30.s, p7/m, z30.s
        scvtf   z0.s, p7/m, z0.s
        // Eight fadda reductions into s31: the first adds z0, the other
        // seven add z30.
        fadda   s31, p7, s31, z0.s
        fadda   s31, p7, s31, z30.s
        fadda   s31, p7, s31, z30.s
        fadda   s31, p7, s31, z30.s
        fadda   s31, p7, s31, z30.s
        fadda   s31, p7, s31, z30.s
        fadda   s31, p7, s31, z30.s
        fadda   s31, p7, s31, z30.s
        // Store the accumulated value back to .LANCHOR0 and release the stack.
        str     s31, [x1, #:lo12:.LANCHOR0]
        add     sp, sp, 16
        ret

This seems to be a regression from GCC 13, which generates:
// Code generation attributed to GCC 13 by the report (AArch64 SVE, GNU
// syntax).  There is no stack adjustment and no general-register bfi
// sequence: predicates are produced with ptrue / eor / cmpne only.
fasten_main:
        adrp    x0, .LANCHOR0
        // z1 = all-zero vector, used as the compare operand and as the
        // "inactive lane" value below.
        mov     z1.b, #0
        ptrue   p0.s, vl4
        ptrue   p1.b, vl1
        eor     p1.b, p0/z, p1.b, p0.b
        // Compares z1 with itself under the predicate just built.
        cmpne   p1.s, p1/z, z1.s, z1.s
        ptrue   p0.b, vl16
        // s0 = current value at .LANCHOR0 (the accumulator for the faddas).
        ldr     s0, [x0, #:lo12:.LANCHOR0]
        // z2 = copy of z1 with integer 1 merged into the lanes active in p1.
        movprfx z2, z1
        mov     z2.s, p1/m, #1
        cmpne   p1.s, p0/z, z1.s, z1.s
        scvtf   z2.s, p0/m, z2.s
        mov     z1.s, p1/m, #1
        // Eight fadda reductions into s0: the first adds z2, the other
        // seven add z1.
        fadda   s0, p0, s0, z2.s
        scvtf   z1.s, p0/m, z1.s
        fadda   s0, p0, s0, z1.s
        fadda   s0, p0, s0, z1.s
        fadda   s0, p0, s0, z1.s
        fadda   s0, p0, s0, z1.s
        fadda   s0, p0, s0, z1.s
        fadda   s0, p0, s0, z1.s
        fadda   s0, p0, s0, z1.s
        // Store the accumulated value back to .LANCHOR0.
        str     s0, [x0, #:lo12:.LANCHOR0]
        ret

As said, the testcase itself is nonsense (and Clang optimises most of the
function away) but the predicate construction regression is real.
It only triggers for -msve-vector-bits VLS code.

Reply via email to