Hi All,

This RFC is trying to address the following inefficiency when vectorizing
conditional statements with SVE.
Consider the case:

void f10(double * restrict z, double * restrict w, double * restrict x,
         double * restrict y, int n)
{
    for (int i = 0; i < n; i++) {
        z[i] = (w[i] > 0) ? x[i] + w[i] : y[i] - w[i];
    }
}

for which we currently generate at -O3:

f10:
        cmp     w4, 0
        ble     .L1
        mov     x5, 0
        whilelo p1.d, wzr, w4
        ptrue   p3.b, all
.L3:
        ld1d    z1.d, p1/z, [x1, x5, lsl 3]
        fcmgt   p2.d, p1/z, z1.d, #0.0
        fcmgt   p0.d, p3/z, z1.d, #0.0
        ld1d    z2.d, p2/z, [x2, x5, lsl 3]
        bic     p0.b, p3/z, p1.b, p0.b
        ld1d    z0.d, p0/z, [x3, x5, lsl 3]
        fsub    z0.d, p0/m, z0.d, z1.d
        movprfx z0.d, p2/m, z1.d
        fadd    z0.d, p2/m, z0.d, z2.d
        st1d    z0.d, p1, [x0, x5, lsl 3]
        incd    x5
        whilelo p1.d, w5, w4
        b.any   .L3
.L1:
        ret

Notice that the condition for the else branch duplicates the same predicate as
the then branch and then uses a BIC to negate the result.

The reason for this is that during instruction generation in the vectorizer we
emit

  mask__41.11_66 = vect__4.10_64 > vect_cst__65;
  vec_mask_and_69 = mask__41.11_66 & loop_mask_63;
  vec_mask_and_71 = mask__41.11_66 & loop_mask_63;
  mask__43.16_73 = ~mask__41.11_66;
  vec_mask_and_76 = mask__43.16_73 & loop_mask_63;
  vec_mask_and_78 = mask__43.16_73 & loop_mask_63;

which ultimately gets optimized to

  mask__41.11_66 = vect__4.10_64 > { 0.0, ... };
  vec_mask_and_69 = loop_mask_63 & mask__41.11_66;
  mask__43.16_73 = ~mask__41.11_66;
  vec_mask_and_76 = loop_mask_63 & mask__43.16_73;

Notice how the negate is applied to the comparison operation and not to the
predicate resulting from it.  When this is expanded it turns into RTL where the
negate is on the compare directly.  This means the RTL is different from the
one without the negate, and so CSE is unable to recognize that they are
essentially the same operation.

To fix this, my patch changes it so that we negate the mask rather than the
operation:

  mask__41.13_55 = vect__4.12_53 > { 0.0, ... };
  vec_mask_and_58 = loop_mask_52 & mask__41.13_55;
  vec_mask_op_67 = ~vec_mask_and_58;
  vec_mask_and_65 = loop_mask_52 & vec_mask_op_67;

which means the negate ends up on the masked operation.  This removes the
additional comparison:

f10:
        cmp     w4, 0
        ble     .L1
        mov     x5, 0
        whilelo p0.d, wzr, w4
        ptrue   p3.b, all
        .p2align 5,,15
.L3:
        ld1d    z1.d, p0/z, [x1, x5, lsl 3]
        fcmgt   p1.d, p0/z, z1.d, #0.0
        bic     p2.b, p3/z, p0.b, p1.b
        ld1d    z2.d, p1/z, [x2, x5, lsl 3]
        ld1d    z0.d, p2/z, [x3, x5, lsl 3]
        fsub    z0.d, p2/m, z0.d, z1.d
        movprfx z0.d, p1/m, z1.d
        fadd    z0.d, p1/m, z0.d, z2.d
        st1d    z0.d, p0, [x0, x5, lsl 3]
        incd    x5
        whilelo p0.d, w5, w4
        b.any   .L3
.L1:
        ret

but this is still not optimal.  The problem is the BIC pattern
aarch64_pred_<nlogical><mode>_z, which replaces the NOT and AND with a BIC.
However, in this case, since p1 is the result of an operation predicated on
p0, the BIC should instead be a NOT.  Disabling the pattern for combine (by
adding && reload_completed) gives me the codegen I'm after:

f10:
        cmp     w4, 0
        ble     .L1
        mov     x5, 0
        whilelo p0.d, wzr, w4
        .p2align 5,,15
.L3:
        ld1d    z1.d, p0/z, [x1, x5, lsl 3]
        fcmgt   p1.d, p0/z, z1.d, #0.0
        not     p2.b, p0/z, p1.b
        ld1d    z2.d, p1/z, [x2, x5, lsl 3]
        ld1d    z0.d, p2/z, [x3, x5, lsl 3]
        fsub    z0.d, p2/m, z0.d, z1.d
        movprfx z0.d, p1/m, z1.d
        fadd    z0.d, p1/m, z0.d, z2.d
        st1d    z0.d, p0, [x0, x5, lsl 3]
        incd    x5
        whilelo p0.d, w5, w4
        b.any   .L3
.L1:
        ret

which uses a NOT predicated on p0 instead.  This is what the code was
pre-combine, and it also removes the need for the third predicate p3 that the
BIC case requires.  I can't get combine to remove the BIC since in the case
above the fcmgt isn't single use, so combine won't try.  Of course, disabling
the early recog of the BIC isn't ideal since you miss genuine BICs.
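As a sanity check on the reasoning above, here is a small scalar model (purely
illustrative, not part of the patch) that treats each bit of an unsigned value
as one lane of a 4-lane predicate.  It checks the identity that makes the NOT
form legal: with p1 = p0 & cmp (the fcmgt zero-predicated on p0), the else
mask p0 & ~cmp is produced equally well by a BIC under an all-true p3 or by a
NOT zero-predicated on p0 alone, so the p3 register is not needed:

/* Illustrative model only: bits of an unsigned value stand for lanes of a
   4-lane predicate.  Verifies that BIC under an all-true predicate and NOT
   zero-predicated on p0 compute the same else mask when the compare result
   is already predicated on p0.  */
#include <assert.h>
#include <stdio.h>

int
main (void)
{
  const unsigned p3 = 0xf;                   /* ptrue p3.b for 4 lanes  */
  for (unsigned p0 = 0; p0 < 16; p0++)       /* loop mask  */
    for (unsigned cmp = 0; cmp < 16; cmp++)  /* unpredicated compare result  */
      {
        unsigned p1 = p0 & cmp;              /* fcmgt p1.d, p0/z, z1.d, #0.0  */
        unsigned bic = p3 & p0 & ~p1;        /* bic p2.b, p3/z, p0.b, p1.b  */
        unsigned notp = p0 & ~p1;            /* not p2.b, p0/z, p1.b  */
        assert (bic == notp);                /* same else mask either way  */
        assert (notp == (p0 & ~cmp));        /* and it is p0 & ~cmp  */
      }
  puts ("predicate identity holds for all 4-lane masks");
  return 0;
}

In other words, once the compare is already predicated on p0, the all-true
governing predicate of the BIC buys nothing, which is why the NOT form can
drop p3.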
Any feedback on the approach, and on how to fix the BIC issue?  I did try an
approach using combine where I matched against the full sequence and split it
early in combine.  This works for the case above but falls apart in other
cases where the cmp isn't single use from the start.

Bootstrapped and regtested on aarch64-none-linux-gnu with no issues, but I
would like to solve the remaining issues.

Thanks,
Tamar

gcc/ChangeLog:

        * tree-vect-stmts.c (prepare_load_store_mask): Expand unary
        operators on the mask instead of the operation.

--- inline copy of patch --
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index eeef96a2eb60853e9c18a288af9e49ae9ad65128..35e5212c77d7cb26b1a2b9645cbac22c30078fb8 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1785,8 +1785,38 @@ prepare_load_store_mask (tree mask_type, tree loop_mask, tree vec_mask,
   gcc_assert (TREE_TYPE (loop_mask) == mask_type);
 
   tree and_res = make_temp_ssa_name (mask_type, NULL, "vec_mask_and");
+  tree final_mask = vec_mask;
+
+  /* Check if what vec_mask is pointing at is a unary operator and if so
+     expand the operand before the mask and not on the operation to allow
+     for better CSE.  */
+  if (TREE_CODE (vec_mask) == SSA_NAME)
+    {
+      gimple *stmt = SSA_NAME_DEF_STMT (vec_mask);
+      if (is_gimple_assign (stmt)
+          && gimple_assign_rhs_class (stmt) == GIMPLE_UNARY_RHS)
+        {
+          tree_code code = gimple_assign_rhs_code (stmt);
+          tree pred_op = gimple_assign_rhs1 (stmt);
+
+          /* Predicate the operation first.  */
+          gimple *pred_stmt;
+          tree pred_res1 = make_temp_ssa_name (mask_type, NULL, "vec_mask_op");
+          pred_stmt = gimple_build_assign (pred_res1, BIT_AND_EXPR,
+                                           pred_op, loop_mask);
+          gsi_insert_before (gsi, pred_stmt, GSI_SAME_STMT);
+
+          /* Now move the operation to the top and predicate it.  */
+          tree pred_res2 = make_temp_ssa_name (mask_type, NULL, "vec_mask_op");
+          pred_stmt = gimple_build_assign (pred_res2, code,
+                                           pred_res1);
+          gsi_insert_before (gsi, pred_stmt, GSI_SAME_STMT);
+          final_mask = pred_res2;
+        }
+    }
+
   gimple *and_stmt = gimple_build_assign (and_res, BIT_AND_EXPR,
-                                          vec_mask, loop_mask);
+                                          final_mask, loop_mask);
   gsi_insert_before (gsi, and_stmt, GSI_SAME_STMT);
   return and_res;
 }

--