https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121536

            Bug ID: 121536
           Summary: Performance regression in RAJAPerf workload due to
                    vectorization differences
           Product: gcc
           Version: 16.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: jschmitz at gcc dot gnu.org
                CC: rguenth at gcc dot gnu.org, tamar.christina at arm dot com
  Target Milestone: ---
            Target: aarch64

We are seeing a performance regression of approx. 8% in the RAJAPerf kernels
Basic_PI_ATOMIC and Basic_PI_REDUCE on Neoverse V2. Bisection points to
https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=fb59c5719c17a04ecfd58b5e566eccd6d2ac583a
(Avoid passing vectype != NULL when costing scalar IL) as the commit that
introduced the regression.

Investigation of the kernel Basic_PI_REDUCE shows differences in codegen where
several code sections are no longer being vectorized. The following code
snippet was reduced from Basic_PI_REDUCE and illustrates the codegen
differences:

using a = long;
using b = a;
using c = double;
b d;
c e;
void f() {
  for (b g; g < d; ++g)
    e += g;
}
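
As an aside, the reduced snippet leaves g uninitialized, which is a common
artifact of automated test-case reduction. A well-defined variant of the same
reduction pattern, assuming it exercises the same vectorization decision,
would look like the following sketch (f_defined is a hypothetical name):

// Hedged sketch: same reduction pattern, but with the induction variable
// initialized to 0 instead of being left uninitialized as in the reduced
// testcase above. Whether this variant produces identical codegen is an
// assumption.
using a = long;
using b = a;
using c = double;
b d;
c e;
void f_defined() {
  for (b g = 0; g < d; ++g)
    e += g;   // g (long) is converted to double and accumulated into e
}

The compile flags and codegen comparison below refer to the original reduced
snippet.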

Compiled with: -std=c++14 -O3 -Wl,-z,muldefs -lm -Wl,--sort-section=name
-fpermissive -march=native -mcpu=neoverse-v2 -msve-vector-bits=128

was previously compiled to (commit da88f9bd):
    adrp    x0, .LANCHOR0
    ldr    x2, [x0, #:lo12:.LANCHOR0]
    add    x3, x0, :lo12:.LANCHOR0
    cmp    x2, 0
    ble    .L1
    mov    x0, 0
    ldr    d31, [x3, 8]
    sub    x4, x2, x0
    sub    x1, x4, #1
    cmp    x1, 8
    bls    .L3
    lsr    x1, x4, 3
    index    z30.d, #0, #1
    ptrue    p7.b, vl16
.L4:
    add    x0, x0, 1
    movprfx    z29, z30
    add    z29.d, z29.d, #2
    movprfx    z16, z30
    scvtf    z16.d, p7/m, z30.d
    movprfx    z28, z30
    add    z28.d, z28.d, #4
    movprfx    z27, z30
    add    z27.d, z27.d, #6
    scvtf    z29.d, p7/m, z29.d
    fadda    d31, p7, d31, z16.d
    scvtf    z28.d, p7/m, z28.d
    fadda    d31, p7, d31, z29.d
    scvtf    z27.d, p7/m, z27.d
    fadda    d31, p7, d31, z28.d
    add    z30.d, z30.d, #8
    fadda    d31, p7, d31, z27.d
    cmp    x0, x1
    bne    .L4
    lsl    x0, x0, 3
    cmp    x4, x0
    beq    .L5
.L3:
    scvtf    d7, x0
    add    x1, x0, 1
    fadd    d31, d31, d7
    cmp    x2, x1
    ble    .L5
    scvtf    d6, x1
    add    x1, x0, 2
    fadd    d31, d31, d6
    cmp    x2, x1
    ble    .L5
    scvtf    d5, x1
    add    x1, x0, 3
    fadd    d31, d31, d5
    cmp    x2, x1
    ble    .L5
    scvtf    d4, x1
    add    x1, x0, 4
    fadd    d31, d31, d4
    cmp    x2, x1
    ble    .L5
    scvtf    d3, x1
    add    x1, x0, 5
    fadd    d31, d31, d3
    cmp    x2, x1
    ble    .L5
    scvtf    d2, x1
    add    x1, x0, 6
    fadd    d31, d31, d2
    cmp    x2, x1
    ble    .L5
    scvtf    d1, x1
    add    x1, x0, 7
    fadd    d31, d31, d1
    cmp    x2, x1
    ble    .L5
    scvtf    d0, x1
    add    x0, x0, 8
    fadd    d31, d31, d0
    cmp    x2, x0
    ble    .L5
    scvtf    d26, x0
    fadd    d31, d31, d26
.L5:
    str    d31, [x3, 8]
.L1:
    ret

and is now compiled to (commit fb59c571):
    adrp    x2, .LANCHOR0
    mov    x0, 0
    ldr    x1, [x2, #:lo12:.LANCHOR0]
    add    x2, x2, :lo12:.LANCHOR0
    cmp    x0, x1
    bge    .L1
    ldr    d31, [x2, 8]
.L3:
    scvtf    d30, x0
    add    x0, x0, 1
    fadd    d31, d31, d30
    cmp    x0, x1
    bne    .L3
    str    d31, [x2, 8]
.L1:
    ret

Comparing the 185t.vect dumps shows that scalar_stmt previously cost 2, while
it now costs 1. The cheaper scalar estimate makes vectorization appear less
profitable relative to the scalar loop.
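
To illustrate how a lower scalar_stmt cost can tip the decision, here is a
simplified sketch of the profitability comparison. It is not GCC's actual
vect_estimate_min_profitable_iters logic (which also accounts for
prologue/epilogue peeling and loop versioning), and all numbers except the
scalar_stmt change from 2 to 1 are made up:

// Simplified, hypothetical model of the loop-vectorizer profitability check.
// Only the scalar_stmt cost (2 vs. 1) comes from the 185t.vect dumps; the
// vector-loop costs, statement count, and iteration count are invented for
// illustration.
#include <cstdio>

static bool profitable(int scalar_stmt_cost, int scalar_stmts_per_iter,
                       int vec_inside_cost, int vec_outside_cost,
                       int vf, long niters) {
  long scalar_cost = (long)scalar_stmt_cost * scalar_stmts_per_iter * niters;
  long vector_cost = vec_outside_cost + (long)vec_inside_cost * (niters / vf);
  return vector_cost < scalar_cost;
}

int main() {
  const int stmts = 3, vec_inside = 10, vec_outside = 4, vf = 2;
  printf("scalar_stmt=2: %s\n",
         profitable(2, stmts, vec_inside, vec_outside, vf, 100)
             ? "vectorize" : "stay scalar");
  printf("scalar_stmt=1: %s\n",
         profitable(1, stmts, vec_inside, vec_outside, vf, 100)
             ? "vectorize" : "stay scalar");
  return 0;
}

With the made-up vector costs above, the loop is vectorized at a scalar_stmt
cost of 2 but not at 1, mirroring the codegen difference shown earlier.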

It looks like the lack of vectorization is causing the performance regression,
and it could be a cost model issue that is now blocking vectorization.
