Hi All,
This RFC is trying to address the following inefficiency when vectorizing
conditional statements with SVE.
Consider the case
void f10(double * restrict z, double * restrict w, double * restrict x,
double * restrict y, int n)
{
for (int i = 0; i < n; i++) {
z[i] = (w[i] > 0) ? x[i] + w[i] : y[i] - w[i];
}
}
For which we currently generate at -O3:
f10:
cmp w4, 0
ble .L1
mov x5, 0
whilelo p1.d, wzr, w4
ptrue p3.b, all
.L3:
ld1d z1.d, p1/z, [x1, x5, lsl 3]
fcmgt p2.d, p1/z, z1.d, #0.0
fcmgt p0.d, p3/z, z1.d, #0.0
ld1d z2.d, p2/z, [x2, x5, lsl 3]
bic p0.b, p3/z, p1.b, p0.b
ld1d z0.d, p0/z, [x3, x5, lsl 3]
fsub z0.d, p0/m, z0.d, z1.d
movprfx z0.d, p2/m, z1.d
fadd z0.d, p2/m, z0.d, z2.d
st1d z0.d, p1, [x0, x5, lsl 3]
incd x5
whilelo p1.d, w5, w4
b.any .L3
.L1:
ret
Notice that the condition for the else branch duplicates the same predicate as
the then branch and then uses BIC to negate the results.
The reason for this is that during instruction generation in the vectorizer we
emit
mask__41.11_66 = vect__4.10_64 > vect_cst__65;
vec_mask_and_69 = mask__41.11_66 & loop_mask_63;
vec_mask_and_71 = mask__41.11_66 & loop_mask_63;
mask__43.16_73 = ~mask__41.11_66;
vec_mask_and_76 = mask__43.16_73 & loop_mask_63;
vec_mask_and_78 = mask__43.16_73 & loop_mask_63;
which ultimately gets optimized to
mask__41.11_66 = vect__4.10_64 > { 0.0, ... };
vec_mask_and_69 = loop_mask_63 & mask__41.11_66;
mask__43.16_73 = ~mask__41.11_66;
vec_mask_and_76 = loop_mask_63 & mask__43.16_73;
Notice how the negate is on the operation and not the predicate resulting from
the operation. When this is expanded this turns into RTL where the negate is on
the compare directly. This means the RTL is different from the one without the
negate, and so CSE is unable to recognize that they are essentially the same
operation.
To fix this my patch changes it so you negate the mask rather than the operation
mask__41.13_55 = vect__4.12_53 > { 0.0, ... };
vec_mask_and_58 = loop_mask_52 & mask__41.13_55;
vec_mask_op_67 = ~vec_mask_and_58;
vec_mask_and_65 = loop_mask_52 & vec_mask_op_67;
which means the negate ends up on the masked operation. This removes the
additional comparisons:
f10:
cmp w4, 0
ble .L1
mov x5, 0
whilelo p0.d, wzr, w4
ptrue p3.b, all
.p2align 5,,15
.L3:
ld1d z1.d, p0/z, [x1, x5, lsl 3]
fcmgt p1.d, p0/z, z1.d, #0.0
bic p2.b, p3/z, p0.b, p1.b
ld1d z2.d, p1/z, [x2, x5, lsl 3]
ld1d z0.d, p2/z, [x3, x5, lsl 3]
fsub z0.d, p2/m, z0.d, z1.d
movprfx z0.d, p1/m, z1.d
fadd z0.d, p1/m, z0.d, z2.d
st1d z0.d, p0, [x0, x5, lsl 3]
incd x5
whilelo p0.d, w5, w4
b.any .L3
.L1:
ret
But is still not optimal. The problem is the BIC pattern,
aarch64_pred_<nlogical><mode>_z which will replace the NOT and AND with BIC.
However, in this case, since p1 is the result of a predicate operation on p0,
the BIC should instead be a predicated NOT. Disabling the pattern for combine
(adding && reload_completed)
gives me the codegen I'm after:
f10:
cmp w4, 0
ble .L1
mov x5, 0
whilelo p0.d, wzr, w4
.p2align 5,,15
.L3:
ld1d z1.d, p0/z, [x1, x5, lsl 3]
fcmgt p1.d, p0/z, z1.d, #0.0
not p2.b, p0/z, p1.b
ld1d z2.d, p1/z, [x2, x5, lsl 3]
ld1d z0.d, p2/z, [x3, x5, lsl 3]
fsub z0.d, p2/m, z0.d, z1.d
movprfx z0.d, p1/m, z1.d
fadd z0.d, p1/m, z0.d, z2.d
st1d z0.d, p0, [x0, x5, lsl 3]
incd x5
whilelo p0.d, w5, w4
b.any .L3
.L1:
ret
This uses a NOT predicated on p0 instead, which is what the code was pre-combine,
and it also removes the need for a third predicate p3 as in the BIC case.
I can't rely on combine to remove the BIC since, in the case above, the fcmgt
isn't single use, so combine won't try. Of course, disabling the early recog for
BIC isn't ideal either, since you then miss genuine BICs.
Any feedback on the approach and how to fix the BIC issue? I did try an
approach using combine where I matched against the full sequence and split it
early in combine. This works for the case above but falls apart in other cases
where the cmp isn't single use from the start.
Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
But would like to solve the remaining issues.
Thanks,
Tamar
gcc/ChangeLog:
* tree-vect-stmts.c (prepare_load_store_mask): Expand unary operators
on the mask instead of the operation.
--- inline copy of patch --
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index
eeef96a2eb60853e9c18a288af9e49ae9ad65128..35e5212c77d7cb26b1a2b9645cbac22c30078fb8
100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1785,8 +1785,38 @@ prepare_load_store_mask (tree mask_type, tree loop_mask,
tree vec_mask,
gcc_assert (TREE_TYPE (loop_mask) == mask_type);
tree and_res = make_temp_ssa_name (mask_type, NULL, "vec_mask_and");
+ tree final_mask = vec_mask;
+
+ /* Check if what vec_mask is pointing at is a unary operator and if so
+ expand the operand before the mask and not on the operation to allow
+ for better CSE. */
+ if (TREE_CODE (vec_mask) == SSA_NAME)
+ {
+ gimple *stmt = SSA_NAME_DEF_STMT (vec_mask);
+ if (is_gimple_assign (stmt)
+ && gimple_assign_rhs_class (stmt) == GIMPLE_UNARY_RHS)
+ {
+ tree_code code = gimple_assign_rhs_code (stmt);
+ tree pred_op = gimple_assign_rhs1 (stmt);
+
+ /* Predicate the operation first. */
+ gimple *pred_stmt;
+ tree pred_res1 = make_temp_ssa_name (mask_type, NULL, "vec_mask_op");
+ pred_stmt = gimple_build_assign (pred_res1, BIT_AND_EXPR,
+ pred_op, loop_mask);
+ gsi_insert_before (gsi, pred_stmt, GSI_SAME_STMT);
+
+ /* Now move the operation to the top and predicate it. */
+ tree pred_res2 = make_temp_ssa_name (mask_type, NULL, "vec_mask_op");
+ pred_stmt = gimple_build_assign (pred_res2, code,
+ pred_res1);
+ gsi_insert_before (gsi, pred_stmt, GSI_SAME_STMT);
+ final_mask = pred_res2;
+ }
+ }
+
gimple *and_stmt = gimple_build_assign (and_res, BIT_AND_EXPR,
- vec_mask, loop_mask);
+ final_mask, loop_mask);
gsi_insert_before (gsi, and_stmt, GSI_SAME_STMT);
return and_res;
}
--
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index eeef96a2eb60853e9c18a288af9e49ae9ad65128..35e5212c77d7cb26b1a2b9645cbac22c30078fb8 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1785,8 +1785,38 @@ prepare_load_store_mask (tree mask_type, tree loop_mask, tree vec_mask,
gcc_assert (TREE_TYPE (loop_mask) == mask_type);
tree and_res = make_temp_ssa_name (mask_type, NULL, "vec_mask_and");
+ tree final_mask = vec_mask;
+
+ /* Check if what vec_mask is pointing at is a unary operator and if so
+ expand the operand before the mask and not on the operation to allow
+ for better CSE. */
+ if (TREE_CODE (vec_mask) == SSA_NAME)
+ {
+ gimple *stmt = SSA_NAME_DEF_STMT (vec_mask);
+ if (is_gimple_assign (stmt)
+ && gimple_assign_rhs_class (stmt) == GIMPLE_UNARY_RHS)
+ {
+ tree_code code = gimple_assign_rhs_code (stmt);
+ tree pred_op = gimple_assign_rhs1 (stmt);
+
+ /* Predicate the operation first. */
+ gimple *pred_stmt;
+ tree pred_res1 = make_temp_ssa_name (mask_type, NULL, "vec_mask_op");
+ pred_stmt = gimple_build_assign (pred_res1, BIT_AND_EXPR,
+ pred_op, loop_mask);
+ gsi_insert_before (gsi, pred_stmt, GSI_SAME_STMT);
+
+ /* Now move the operation to the top and predicate it. */
+ tree pred_res2 = make_temp_ssa_name (mask_type, NULL, "vec_mask_op");
+ pred_stmt = gimple_build_assign (pred_res2, code,
+ pred_res1);
+ gsi_insert_before (gsi, pred_stmt, GSI_SAME_STMT);
+ final_mask = pred_res2;
+ }
+ }
+
gimple *and_stmt = gimple_build_assign (and_res, BIT_AND_EXPR,
- vec_mask, loop_mask);
+ final_mask, loop_mask);
gsi_insert_before (gsi, and_stmt, GSI_SAME_STMT);
return and_res;
}