Hi,

Thanks for the feedback, here is the updated patch:
> > Note: This patch series is working incrementally towards generating the
> > most efficient code for this and other loops in small steps.
>
> It looks like this could be done in the vectoriser via an extension of the
> scalar_cond_masked_set mechanism.  We have:
>
>     mask__54.13_59 = vect_a_15.9_55 > vect_b_17.12_58;
>     vec_mask_and_60 = loop_mask_32 & mask__54.13_59;
>     …
>     mask__30.17_67 = vect_a_15.9_55 > vect_cst__66;
>     mask__29.18_68 = mask__54.13_59 & mask__30.17_67;
>     vec_mask_and_69 = loop_mask_32 & mask__29.18_68;
>
> When vectorising mask__29.18_68, we could test whether each side of the
> "&" is already in scalar_cond_masked_set and AND in the loop mask if so, like
> we do in vectorizable_condition.  We could then separately record that the &
> result includes the loop mask.

Whenever a mask is being generated from a BIT_AND we now mask the operands of
the AND instead and then just AND the results.  This allows us to CSE the masks
and generate the right combination.  However, because reassoc will try to
re-order the masks in the AND, we now have to perform a small local CSE on the
vectorized loop if vectorization is successful.

Note: This patch series is working incrementally towards generating the most
efficient code for this and other loops in small steps.

Bootstrapped and regtested on aarch64-none-linux-gnu and x86_64-linux-gnu with
no issues.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

	* tree-vectorizer.c (vectorize_loops): Do local CSE through RPO VN
	upon successful vectorization.
	* tree-vect-stmts.c (prepare_load_store_mask): When combining two
	masks mask the operands instead of the combined operation.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/sve/pred-combine-and.c: New test.

--- inline copy of patch ---

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
new file mode 100644
index 0000000000000000000000000000000000000000..d395b7f84bb15b588493611df5a47549726ac24a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-combine-and.c
@@ -0,0 +1,18 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O3 --save-temps" } */
+
+void f5(float * restrict z0, float * restrict z1, float *restrict x, float * restrict y, float c, int n)
+{
+    for (int i = 0; i < n; i++) {
+        float a = x[i];
+        float b = y[i];
+        if (a > b) {
+            z0[i] = a + b;
+            if (a > c) {
+                z1[i] = a - b;
+            }
+        }
+    }
+}
+
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.s, p[0-9]+/z, z[0-9]+\.s, z[0-9]+\.s} 2 } } */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 4e0b2adf1dc2404bc345af30cfeb9c819084894e..717a25f46aa72534eebeb382c92b9145d7d44d04 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1799,6 +1799,19 @@ prepare_load_store_mask (tree mask_type, tree loop_mask, tree vec_mask,
     return vec_mask;
 
   gcc_assert (TREE_TYPE (loop_mask) == mask_type);
+
+  /* Check if the mask is a combination of two different masks.  */
+  gimple *def_stmt = SSA_NAME_DEF_STMT (vec_mask);
+  if (is_gimple_assign (def_stmt)
+      && gimple_assign_rhs_code (def_stmt) == BIT_AND_EXPR)
+    {
+      tree lhs1 = gimple_assign_rhs1 (def_stmt);
+      tree lhs2 = gimple_assign_rhs2 (def_stmt);
+
+      vec_mask = prepare_load_store_mask (mask_type, loop_mask, lhs1, gsi);
+      loop_mask = prepare_load_store_mask (mask_type, loop_mask, lhs2, gsi);
+    }
+
   tree and_res = make_temp_ssa_name (mask_type, NULL, "vec_mask_and");
   gimple *and_stmt = gimple_build_assign (and_res, BIT_AND_EXPR,
					  vec_mask, loop_mask);
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 3aa3e2a678328baccc4869fe2c6546e700b92255..84bcd146af7c4dedf6acdd7317954010ad3f281b 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -81,7 +81,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "gimple-pretty-print.h"
 #include "opt-problem.h"
 #include "internal-fn.h"
-
+#include "tree-ssa-sccvn.h"
 
 /* Loop or bb location, with hotness information.  */
 dump_user_location_t vect_location;
@@ -1321,6 +1321,27 @@ vectorize_loops (void)
	 ???  Also while we try hard to update loop-closed SSA form we fail
	 to properly do this in some corner-cases (see PR56286).  */
       rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa_only_virtuals);
+
+      for (i = 1; i < number_of_loops (cfun); i++)
+	{
+	  loop = get_loop (cfun, i);
+	  if (!loop || !single_exit (loop))
+	    continue;
+
+	  bitmap exit_bbs;
+	  /* Perform local CSE, this esp. helps because we emit code for
+	     predicates that need to be shared for optimal predicate usage.
+	     However reassoc will re-order them and prevent CSE from working
+	     as it should.  CSE only the loop body, not the entry.  */
+	  exit_bbs = BITMAP_ALLOC (NULL);
+	  bitmap_set_bit (exit_bbs, single_exit (loop)->dest->index);
+	  bitmap_set_bit (exit_bbs, loop->latch->index);
+
+	  do_rpo_vn (cfun, loop_preheader_edge (loop), exit_bbs);
+
+	  BITMAP_FREE (exit_bbs);
+	}
+
       return TODO_cleanup_cfg;
     }
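
For illustration, reusing the SSA names from the quoted dump above (the names
actually emitted after vectorization will differ, and the second temporary
below is purely illustrative), the intent of the prepare_load_store_mask change
is that the combined mask is now built roughly as:

    Before:
      mask__29.18_68  = mask__54.13_59 & mask__30.17_67;
      vec_mask_and_69 = loop_mask_32 & mask__29.18_68;

    After:
      vec_mask_and_60 = loop_mask_32 & mask__54.13_59;
      vec_mask_and_61 = loop_mask_32 & mask__30.17_67;
      vec_mask_and_69 = vec_mask_and_60 & vec_mask_and_61;

so the first operand of the final AND computes the same value as the existing
vec_mask_and_60 and the local RPO VN run added in vectorize_loops can CSE it,
even after reassoc has re-ordered the operands of the scalar AND.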