Hi All,

This RFC is trying to address the following inefficiency when vectorizing
conditional statements with SVE.
Consider the case:

void f10(double * restrict z, double * restrict w, double * restrict x,
         double * restrict y, int n)
{
    for (int i = 0; i < n; i++) {
        z[i] = (w[i] > 0) ? x[i] + w[i] : y[i] - w[i];
    }
}

for which we currently generate at -O3:

f10:
        cmp     w4, 0
        ble     .L1
        mov     x5, 0
        whilelo p1.d, wzr, w4
        ptrue   p3.b, all
.L3:
        ld1d    z1.d, p1/z, [x1, x5, lsl 3]
        fcmgt   p2.d, p1/z, z1.d, #0.0
        fcmgt   p0.d, p3/z, z1.d, #0.0
        ld1d    z2.d, p2/z, [x2, x5, lsl 3]
        bic     p0.b, p3/z, p1.b, p0.b
        ld1d    z0.d, p0/z, [x3, x5, lsl 3]
        fsub    z0.d, p0/m, z0.d, z1.d
        movprfx z0.d, p2/m, z1.d
        fadd    z0.d, p2/m, z0.d, z2.d
        st1d    z0.d, p1, [x0, x5, lsl 3]
        incd    x5
        whilelo p1.d, w5, w4
        b.any   .L3
.L1:
        ret

Notice that the condition for the else branch duplicates the same predicate as
the then branch and then uses a BIC to negate the result.

The reason for this is that during instruction generation in the vectorizer we
emit

  mask__41.11_66 = vect__4.10_64 > vect_cst__65;
  vec_mask_and_69 = mask__41.11_66 & loop_mask_63;
  vec_mask_and_71 = mask__41.11_66 & loop_mask_63;
  mask__43.16_73 = ~mask__41.11_66;
  vec_mask_and_76 = mask__43.16_73 & loop_mask_63;
  vec_mask_and_78 = mask__43.16_73 & loop_mask_63;

which ultimately gets optimized to

  mask__41.11_66 = vect__4.10_64 > { 0.0, ... };
  vec_mask_and_69 = loop_mask_63 & mask__41.11_66;
  mask__43.16_73 = ~mask__41.11_66;
  vec_mask_and_76 = loop_mask_63 & mask__43.16_73;

Notice how the negate is applied to the comparison operation and not to the
predicate resulting from it.  When this is expanded it turns into RTL where the
negate is on the compare directly.  This means the RTL is different from the
one without the negate, and so CSE is unable to recognize that they are
essentially the same operation.

To fix this, my patch changes it so that we negate the mask rather than the
operation:

  mask__41.13_55 = vect__4.12_53 > { 0.0, ... };
  vec_mask_and_58 = loop_mask_52 & mask__41.13_55;
  vec_mask_op_67 = ~vec_mask_and_58;
  vec_mask_and_65 = loop_mask_52 & vec_mask_op_67;

which means the negate ends up on the masked operation.  This removes the
additional comparison:

f10:
        cmp     w4, 0
        ble     .L1
        mov     x5, 0
        whilelo p0.d, wzr, w4
        ptrue   p3.b, all
        .p2align 5,,15
.L3:
        ld1d    z1.d, p0/z, [x1, x5, lsl 3]
        fcmgt   p1.d, p0/z, z1.d, #0.0
        bic     p2.b, p3/z, p0.b, p1.b
        ld1d    z2.d, p1/z, [x2, x5, lsl 3]
        ld1d    z0.d, p2/z, [x3, x5, lsl 3]
        fsub    z0.d, p2/m, z0.d, z1.d
        movprfx z0.d, p1/m, z1.d
        fadd    z0.d, p1/m, z0.d, z2.d
        st1d    z0.d, p0, [x0, x5, lsl 3]
        incd    x5
        whilelo p0.d, w5, w4
        b.any   .L3
.L1:
        ret

but this is still not optimal.  The problem is the BIC pattern
aarch64_pred_<nlogical><mode>_z, which replaces the NOT and AND with a BIC.
However, in this case, since p1 is the result of an operation predicated on
p0, the BIC should instead be a NOT.  Disabling the pattern for combine (by
adding && reload_completed) gives me the codegen I'm after:

f10:
        cmp     w4, 0
        ble     .L1
        mov     x5, 0
        whilelo p0.d, wzr, w4
        .p2align 5,,15
.L3:
        ld1d    z1.d, p0/z, [x1, x5, lsl 3]
        fcmgt   p1.d, p0/z, z1.d, #0.0
        not     p2.b, p0/z, p1.b
        ld1d    z2.d, p1/z, [x2, x5, lsl 3]
        ld1d    z0.d, p2/z, [x3, x5, lsl 3]
        fsub    z0.d, p2/m, z0.d, z1.d
        movprfx z0.d, p1/m, z1.d
        fadd    z0.d, p1/m, z0.d, z2.d
        st1d    z0.d, p0, [x0, x5, lsl 3]
        incd    x5
        whilelo p0.d, w5, w4
        b.any   .L3
.L1:
        ret

which uses a NOT predicated on p0 instead.  This is what the code was
pre-combine, and it also removes the need for the third predicate p3 that the
BIC case requires.  I can't get combine to remove the BIC since in the case
above the fcmgt isn't single use, so combine won't try.  Of course, disabling
the early recog of the BIC isn't ideal since you miss genuine BICs.
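As a sanity check on the reasoning above, here is a small scalar model (purely
illustrative, not part of the patch) that treats each bit of an unsigned value
as one lane of a 4-lane predicate.  It checks the identity that makes the NOT
form legal: with p1 = p0 & cmp (the fcmgt zero-predicated on p0), the else
mask p0 & ~cmp is produced equally well by a BIC under an all-true p3 or by a
NOT zero-predicated on p0 alone, so the p3 register is not needed:

/* Illustrative model only: bits of an unsigned value stand for lanes of a
   4-lane predicate.  Verifies that BIC under an all-true predicate and NOT
   zero-predicated on p0 compute the same else mask when the compare result
   is already predicated on p0.  */
#include <assert.h>
#include <stdio.h>

int
main (void)
{
  const unsigned p3 = 0xf;                   /* ptrue p3.b for 4 lanes  */
  for (unsigned p0 = 0; p0 < 16; p0++)       /* loop mask  */
    for (unsigned cmp = 0; cmp < 16; cmp++)  /* unpredicated compare result  */
      {
        unsigned p1 = p0 & cmp;              /* fcmgt p1.d, p0/z, z1.d, #0.0  */
        unsigned bic = p3 & p0 & ~p1;        /* bic p2.b, p3/z, p0.b, p1.b  */
        unsigned notp = p0 & ~p1;            /* not p2.b, p0/z, p1.b  */
        assert (bic == notp);                /* same else mask either way  */
        assert (notp == (p0 & ~cmp));        /* and it is p0 & ~cmp  */
      }
  puts ("predicate identity holds for all 4-lane masks");
  return 0;
}

In other words, once the compare is already predicated on p0, the all-true
governing predicate of the BIC buys nothing, which is why the NOT form can
drop p3.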
Any feedback on the approach, and on how to fix the BIC issue?  I did try an
approach using combine where I matched against the full sequence and split it
early in combine.  This works for the case above but falls apart in other
cases where the cmp isn't single use from the start.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues, but I
would like to solve the remaining issues.

Thanks,
Tamar

gcc/ChangeLog:

        * tree-vect-stmts.c (prepare_load_store_mask): Expand unary
        operators on the mask instead of the operation.

--- inline copy of patch --
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index eeef96a2eb60853e9c18a288af9e49ae9ad65128..35e5212c77d7cb26b1a2b9645cbac22c30078fb8 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1785,8 +1785,38 @@ prepare_load_store_mask (tree mask_type, tree loop_mask, tree vec_mask,
   gcc_assert (TREE_TYPE (loop_mask) == mask_type);
 
   tree and_res = make_temp_ssa_name (mask_type, NULL, "vec_mask_and");
+  tree final_mask = vec_mask;
+
+  /* Check if what vec_mask is pointing at is a unary operator and if so
+     expand the operand before the mask and not on the operation to allow
+     for better CSE.  */
+  if (TREE_CODE (vec_mask) == SSA_NAME)
+    {
+      gimple *stmt = SSA_NAME_DEF_STMT (vec_mask);
+      if (is_gimple_assign (stmt)
+          && gimple_assign_rhs_class (stmt) == GIMPLE_UNARY_RHS)
+        {
+          tree_code code = gimple_assign_rhs_code (stmt);
+          tree pred_op = gimple_assign_rhs1 (stmt);
+
+          /* Predicate the operation first.  */
+          gimple *pred_stmt;
+          tree pred_res1 = make_temp_ssa_name (mask_type, NULL, "vec_mask_op");
+          pred_stmt = gimple_build_assign (pred_res1, BIT_AND_EXPR,
+                                           pred_op, loop_mask);
+          gsi_insert_before (gsi, pred_stmt, GSI_SAME_STMT);
+
+          /* Now move the operation to the top and predicate it.  */
+          tree pred_res2 = make_temp_ssa_name (mask_type, NULL, "vec_mask_op");
+          pred_stmt = gimple_build_assign (pred_res2, code,
+                                           pred_res1);
+          gsi_insert_before (gsi, pred_stmt, GSI_SAME_STMT);
+          final_mask = pred_res2;
+        }
+    }
+
   gimple *and_stmt = gimple_build_assign (and_res, BIT_AND_EXPR,
-                                          vec_mask, loop_mask);
+                                          final_mask, loop_mask);
   gsi_insert_before (gsi, and_stmt, GSI_SAME_STMT);
   return and_res;
 }

--