https://gcc.gnu.org/bugzilla/show_bug.cgi?id=109499
Bug ID: 109499
Summary: Unnecessary zeroing in SVE loops
Product: gcc
Version: 13.0
Status: UNCONFIRMED
Severity: enhancement
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: rsandifo at gcc dot gnu.org
CC: rguenth at gcc dot gnu.org
Target Milestone: ---
Target: aarch64*-*-*
The following two loops contain unnecessary zeroing operations:
// -march=armv8.2-a+sve -O2
void
f (int *__restrict x, int *__restrict y, int n)
{
for (int i = 0; i < n; i++)
x[i] = x[i] ? y[i] : 0;
}
void
g (int *__restrict x, int *__restrict y, int n)
{
for (int i = 0; i < n; i++)
x[i] = x[i] ? y[i] & 15 : 0;
}
Output:
f(int*, int*, int):
cmp w2, 0
ble .L1
mov x3, 0
cntw x4
whilelo p0.s, wzr, w2
mov z1.b, #0
.L3:
ld1w z0.s, p0/z, [x0, x3, lsl 2]
cmpne p1.s, p0/z, z0.s, #0
ld1w z0.s, p1/z, [x1, x3, lsl 2] // Sets inactive lanes to zero
sel z0.s, p1, z0.s, z1.s // Not needed
st1w z0.s, p0, [x0, x3, lsl 2]
add x3, x3, x4
whilelo p0.s, w3, w2
b.any .L3
.L1:
ret
g(int*, int*, int):
cmp w2, 0
ble .L6
mov x3, 0
cntw x4
whilelo p0.s, wzr, w2
mov z1.s, #15
.L8:
ld1w z0.s, p0/z, [x0, x3, lsl 2]
cmpne p1.s, p0/z, z0.s, #0
ld1w z0.s, p1/z, [x1, x3, lsl 2] // Sets inactive lanes to zero
movprfx z0.s, p1/z, z0.s // Not needed
and z0.s, p1/m, z0.s, z1.s // Could be AND (immediate)
st1w z0.s, p0, [x0, x3, lsl 2]
add x3, x3, x4
whilelo p0.s, w3, w2
b.any .L8
.L6:
ret
It would be good to somehow model that IFN_MASK_LOAD has a zeroing effect on
AArch64, so that this is exposed at the gimple level. At the same time, we
probably don't want the behaviour of the ifn to depend on target hooks. Not
sure what the best design is here.