https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115629
Bug ID: 115629
Summary: Inefficient if-convert of masked conditionals
Product: gcc
Version: 15.0
Status: UNCONFIRMED
Keywords: missed-optimization
Severity: normal
Priority: P3
Component: tree-optimization
Assignee: unassigned at gcc dot gnu.org
Reporter: tnfchris at gcc dot gnu.org
Target Milestone: ---
With cases such as:
void foo1 (int *restrict a, int *restrict b, int *restrict c,
int *restrict d, int *restrict res, int n)
{
for (int i = 0; i < n; i++)
res[i] = a[i] ? b[i] : (c[i] ? b[i] : d[i]);
}
we generate:
foo1:
cmp w5, 0
ble .L1
mov x6, 0
whilelo p7.s, wzr, w5
ptrue p3.b, all
.L3:
ld1w z31.s, p7/z, [x0, x6, lsl 2]
cmpeq p6.s, p7/z, z31.s, #0
cmpne p5.s, p7/z, z31.s, #0
ld1w z0.s, p6/z, [x2, x6, lsl 2]
ld1w z30.s, p5/z, [x1, x6, lsl 2]
cmpne p15.s, p3/z, z0.s, #0
orr z0.d, z31.d, z0.d
and p6.b, p15/z, p6.b, p6.b
cmpeq p4.s, p7/z, z0.s, #0
ld1w z28.s, p6/z, [x1, x6, lsl 2]
ld1w z29.s, p4/z, [x3, x6, lsl 2]
sel z29.s, p15, z28.s, z29.s
sel z29.s, p5, z30.s, z29.s
st1w z29.s, p7, [x4, x6, lsl 2]
incw x6
whilelo p7.s, w6, w5
b.any .L3
Here b is loaded twice: once with the mask a[i] != 0, and once with the mask
a[i] == 0 && c[i] != 0.
Clearly we don't need the second compare or the second load. This loop can be
optimally handled as:
foo1:
cmp w5, 0
ble .L1
mov x6, 0
cntw x7
whilelo p7.s, wzr, w5
.p2align 5,,15
.L3:
ld1w z1.s, p7/z, [x0, x6, lsl 2]
ld1w z0.s, p7/z, [x2, x6, lsl 2]
orr z0.d, z1.d, z0.d
cmpne p6.s, p7/z, z0.s, #0
cmpeq p5.s, p7/z, z0.s, #0
ld1w z31.s, p6/z, [x1, x6, lsl 2]
ld1w z30.s, p5/z, [x3, x6, lsl 2]
sel z30.s, p6, z31.s, z30.s
st1w z30.s, p7, [x4, x6, lsl 2]
add x6, x6, x7
whilelo p7.s, w6, w5
b.any .L3
.L1:
ret
i.e. transform a ? b : (c ? b : d) into (a || c) ? b : d.
This transform is actually also beneficial for scalar code, but that is no longer
the case when one of the conditions has to be inverted; i.e. cases 2 to 4 are
beneficial for vector masked operations but not for scalar:
/* Convert a ? b : (c ? b : d) into (a || c) ? b : d. */
void foo1 (int *restrict a, int *restrict b, int *restrict c,
int *restrict d, int *restrict res, int n)
{
for (int i = 0; i < n; i++)
res[i] = a[i] ? b[i] : (c[i] ? b[i] : d[i]);
}
/* Convert a ? (c ? b : d) : b into (-a || c) ? b : d. */
void foo2 (int *restrict a, int *restrict b, int *restrict c,
int *restrict d, int *restrict res, int n)
{
for (int i = 0; i < n; i++)
res[i] = a[i] ? (c[i] ? b[i] : d[i]) : b[i];
}
/* Convert a ? (c ? d : b) : b into (-a || -c) ? b : d. */
void foo3 (int *restrict a, int *restrict b, int *restrict c,
int *restrict d, int *restrict res, int n)
{
for (int i = 0; i < n; i++)
res[i] = a[i] ? (c[i] ? d[i] : b[i]) : b[i];
}
/* Convert a ? b : (c ? d : b) into (a || -c) ? b : d. */
void foo4 (int *restrict a, int *restrict b, int *restrict c,
int *restrict d, int *restrict res, int n)
{
for (int i = 0; i < n; i++)
res[i] = a[i] ? b[i] : (c[i] ? d[i] : b[i]);
}
for instance case 3 is currently vectorized as:
foo3:
cmp w5, 0
ble .L10
mov x6, 0
whilelo p7.s, wzr, w5
ptrue p3.b, all
.L12:
ld1w z1.s, p7/z, [x0, x6, lsl 2]
cmpeq p5.s, p7/z, z1.s, #0
cmpne p6.s, p7/z, z1.s, #0
ld1w z29.s, p5/z, [x1, x6, lsl 2]
ld1w z0.s, p6/z, [x2, x6, lsl 2]
cmpne p15.s, p3/z, z0.s, #0
cmpeq p4.s, p6/z, z0.s, #0
and p5.b, p15/z, p6.b, p6.b
ld1w z30.s, p4/z, [x1, x6, lsl 2]
ld1w z31.s, p5/z, [x3, x6, lsl 2]
sel z30.s, p15, z31.s, z30.s
sel z30.s, p6, z30.s, z29.s
st1w z30.s, p7, [x4, x6, lsl 2]
incw x6
whilelo p7.s, w6, w5
b.any .L12
but can be
foo3:
cmp w5, 0
ble .L10
mov x6, 0
cntw x7
whilelo p6.s, wzr, w5
ptrue p5.b, all
.p2align 5,,15
.L12:
ld1w z29.s, p6/z, [x0, x6, lsl 2]
ld1w z28.s, p6/z, [x2, x6, lsl 2]
cmpeq p15.s, p5/z, z29.s, #0
cmpeq p14.s, p5/z, z28.s, #0
orr p15.b, p5/z, p15.b, p14.b
and p4.b, p6/z, p15.b, p15.b
bic p7.b, p5/z, p6.b, p15.b
ld1w z31.s, p4/z, [x1, x6, lsl 2]
ld1w z30.s, p7/z, [x3, x6, lsl 2]
sel z30.s, p4, z31.s, z30.s
st1w z30.s, p6, [x4, x6, lsl 2]
add x6, x6, x7
whilelo p6.s, w6, w5
b.any .L12
with suitable match.pd rules. However, this likely needs to be done in if-convert
instead, since not all of the transforms are beneficial for scalar code: the
negation adds an extra instruction before the branch.
I think this would have to be done before conditional load lowering, because
we'd have to transform:
_32 = _4 != 0;
iftmp.0_22 = .MASK_LOAD (_5, 32B, _32);
_61 = _4 == 0;
_7 = .MASK_LOAD (_6, 32B, _61);
_26 = _7 != 0;
_13 = _26 & _61;
iftmp.0_21 = .MASK_LOAD (_5, 32B, _13);
_66 = _4 | _7;
_36 = _66 == 0;
iftmp.0_19 = .MASK_LOAD (_9, 32B, _36);
_ifc__56 = _26 ? iftmp.0_21 : iftmp.0_19;
iftmp.0_12 = _32 ? iftmp.0_22 : _ifc__56;
into
_4 = *_3;
_6 = *_5;
_7 = _4 | _6;
_35 = _7 != 0;
iftmp.0_21 = .MASK_LOAD (_8, 32B, _35);
_51 = _7 == 0;
iftmp.0_19 = .MASK_LOAD (_9, 32B, _51);
iftmp.0_12 = _35 ? iftmp.0_21 : iftmp.0_19;
which would be quite hard.