Hi All,

The following testcase shows incorrect masked codegen:

#define N 512
#define START 1
#define END 505
 
int x[N] __attribute__((aligned(32)));

int __attribute__((noipa))
foo (void)
{
  int z = 0;
  for (unsigned int i = START; i < END; ++i)
    {
      z++;
      if (x[i] > 0)
        continue;
    
      return z;
    }
  return -1;
}

Notice how there's a continue there instead of a break.  This means the control
flow we generate keeps the success case inside the loop iteration:

  mask_patt_9.12_46 = vect__1.11_45 > { 0, 0, 0, 0 };
  vec_mask_and_47 = mask_patt_9.12_46 & loop_mask_41;
  if (vec_mask_and_47 == { -1, -1, -1, -1 })
    goto <bb 4>; [41.48%]
  else
    goto <bb 15>; [58.52%]

However, when loop_mask_41 is a partial mask this comparison can produce an
incorrect result.  In this case the mask is:

  # loop_mask_41 = PHI <next_mask_63(6), { 0, -1, -1, -1 }(2)>

due to peeling for alignment with masking and compiling with
-msve-vector-bits=128.
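
To see why the comparison goes wrong, here is a lane-by-lane sketch (purely
illustrative, assuming the elements under the three active lanes are all
positive):

  loop_mask_41      = {  0, -1, -1, -1 }
  mask_patt_9.12_46 = {  ?, -1, -1, -1 }
  vec_mask_and_47   = {  0, -1, -1, -1 }

vec_mask_and_47 == { -1, -1, -1, -1 } is false even though every active lane
passed the compare, so an all-ones constant is not the right value to test a
forall against once the loop mask is partial (it would have to be loop_mask_41
itself).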

At codegen time we generate:

        ptrue   p15.s, vl4
        ptrue   p7.b, vl1
        not     p7.b, p15/z, p7.b
.L5:
        ld1w    z29.s, p7/z, [x1, x0, lsl 2]
        cmpgt   p7.s, p7/z, z29.s, #0
        not     p7.b, p15/z, p7.b
        ptest   p15, p7.b
        b.none  .L2
        ...<early exit>...

Here the basic blocks are rotated and a not is generated.  But the generated
not is unmasked (or rather, predicated over an all-true mask in this case).
This has the unintended side effect of flipping the results of the inactive
lanes (which were zeroed by the cmpgt) to -1, which then incorrectly causes us
not to take the branch to .L2.
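
As an illustration with the same mask, assuming the three active elements are
all positive:

  p7 after the cmpgt             = { 0, 1, 1, 1 }   (inactive lane zeroed)
  p7 after not p7.b, p15/z, p7.b = { 1, 0, 0, 0 }   (p15 is all-true, so the
                                                      inactive lane flips)
  ptest p15, p7.b                -> a bit is set, so b.none .L2 falls through

All active lanes passed the compare, so we should have taken the branch to .L2,
but the stray bit from the inactive lane sends us down the early-exit path
instead.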

This is happening because we're not comparing against the right value for the
forall case.  This patch gets rid of the forall case by rewriting
if (all(mask)) into if (!all(mask)), which is the same as if (any(~mask)), by
negating the masks and flipping the branches:

        1. For unmasked loops we simply reduce the ~mask.
        2. For masked loops we reduce (~mask & loop_mask), which is the same as
           doing (mask & loop_mask) ^ loop_mask (see the lane-by-lane sketch
           below).
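
A lane-by-lane sketch with the same example mask (all active lanes true):

  mask & loop_mask               = { 0, -1, -1, -1 }
  (~mask) & loop_mask            = { 0,  0,  0,  0 }
  (mask & loop_mask) ^ loop_mask = { 0,  0,  0,  0 }

The any() reduction over this is false, so the (now flipped) branch correctly
keeps us in the loop; had an active lane failed the compare, its bit would
survive the & loop_mask and the reduction would be true.  The inactive lane can
never contribute a spurious set bit because it is cleared by loop_mask.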

For the above we now generate:

.L5:
        ld1w    z28.s, p7/z, [x1, x0, lsl 2]
        cmple   p7.s, p7/z, z28.s, #0
        ptest   p15, p7.b
        b.none  .L2

This fixes a miscompilation of gromacs with more than one OpenMP thread and
improves performance.

Bootstrapped and regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf, and x86_64-pc-linux-gnu
(-m32 and -m64) with no issues.

Committed to GCC-14.

Thanks,
Tamar

gcc/ChangeLog:

        PR tree-optimization/119351
        * tree-vect-stmts.cc (vectorizable_early_exit): Mask both operands of
        the gcond for partial masking support.

gcc/testsuite/ChangeLog:

        PR tree-optimization/119351
        * gcc.target/aarch64/sve/pr119351.c: New test.
        * gcc.target/aarch64/sve/pr119351_run.c: New test.

    (cherry picked from commit 7cf5503e0af52f5b726da4274a148590c57a458a)

---
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr119351.c b/gcc/testsuite/gcc.target/aarch64/sve/pr119351.c
new file mode 100644
index 0000000000000000000000000000000000000000..a5ad0bf2507c57df43027a81fd3654737c40a6a6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr119351.c
@@ -0,0 +1,35 @@
+/* Fix for PR119351 alignment peeling with vectors and VLS.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=2 -fdump-tree-vect-details" } */
+/* { dg-final { check-function-bodies "**" "" ""} } */
+
+#define N 512
+#define START 0
+#define END 505
+ 
+int x[N] __attribute__((aligned(32)));
+
+/*
+** foo:
+**	...
+**	orr	p[0-9]+.b, p[0-9]+/z, p[0-9]+.b, p[0-9]+.b
+**	...
+*/
+
+int __attribute__((noipa))
+foo (void)
+{
+  int z = 0;
+  for (unsigned short i = START; i < END; ++i)
+    {
+      z++;
+      if (x[i] > 0)
+        continue;
+    
+      return z;
+    }
+  return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr119351_run.c b/gcc/testsuite/gcc.target/aarch64/sve/pr119351_run.c
new file mode 100644
index 0000000000000000000000000000000000000000..e1838589f92bb320cc4e8f24798448def5da256b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr119351_run.c
@@ -0,0 +1,20 @@
+/* Fix for PR119351 alignment peeling with vectors and VLS.  */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast --param aarch64-autovec-preference=2" } */
+/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
+
+#include "pr119351.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  x[0] = 1;
+  x[1] = 21;
+  x[2] = 39;
+  x[3] = 59;
+  int res = foo ();
+  if (res != 5)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index e1e7465080e9b8025b2525188df11a264caaddd5..307d989fbe3cb9b8430d1c06d4b4e60bc4970227 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -12915,29 +12915,23 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
      codegen so we must replace the original insn.  */
   gimple *orig_stmt = STMT_VINFO_STMT (vect_orig_stmt (stmt_info));
   gcond *cond_stmt = as_a <gcond *>(orig_stmt);
+
+  tree cst = build_zero_cst (vectype);
+  auto bb = gimple_bb (cond_stmt);
+  edge exit_true_edge = EDGE_SUCC (bb, 0);
+  if (exit_true_edge->flags & EDGE_FALSE_VALUE)
+    exit_true_edge = EDGE_SUCC (bb, 1);
+  gcc_assert (exit_true_edge->flags & EDGE_TRUE_VALUE);
+
   /* When vectorizing we assume that if the branch edge is taken that we're
      exiting the loop.  This is not however always the case as the compiler will
      rewrite conditions to always be a comparison against 0.  To do this it
      sometimes flips the edges.  This is fine for scalar,  but for vector we
-     then have to flip the test, as we're still assuming that if you take the
-     branch edge that we found the exit condition.  i.e. we need to know whether
-     we are generating a `forall` or an `exist` condition.  */
-  auto new_code = NE_EXPR;
-  auto reduc_optab = ior_optab;
-  auto reduc_op = BIT_IOR_EXPR;
-  tree cst = build_zero_cst (vectype);
-  edge exit_true_edge = EDGE_SUCC (gimple_bb (cond_stmt), 0);
-  if (exit_true_edge->flags & EDGE_FALSE_VALUE)
-    exit_true_edge = EDGE_SUCC (gimple_bb (cond_stmt), 1);
-  gcc_assert (exit_true_edge->flags & EDGE_TRUE_VALUE);
-  if (flow_bb_inside_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
-			     exit_true_edge->dest))
-    {
-      new_code = EQ_EXPR;
-      reduc_optab = and_optab;
-      reduc_op = BIT_AND_EXPR;
-      cst = build_minus_one_cst (vectype);
-    }
+     then have to negate the result of the test, as we're still assuming that if
+     you take the branch edge that we found the exit condition.  i.e. we need to
+     know whether we are generating a `forall` or an `exist` condition.  */
+  bool flipped = flow_bb_inside_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
+					exit_true_edge->dest);
 
   /* Analyze only.  */
   if (!vec_stmt)
@@ -12953,14 +12947,13 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
 	}
 
       if (ncopies > 1
-	  && direct_optab_handler (reduc_optab, mode) == CODE_FOR_nothing)
+	  && direct_optab_handler (ior_optab, mode) == CODE_FOR_nothing)
 	{
 	  if (dump_enabled_p ())
 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 			       "can't vectorize early exit because the "
-			       "target does not support boolean vector %s "
+			       "target does not support boolean vector IOR "
 			       "for type %T.\n",
-			       reduc_optab == ior_optab ? "OR" : "AND",
 			       vectype);
 	  return false;
 	}
@@ -13010,6 +13003,29 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
 	stmts.quick_push (gimple_assign_lhs (stmt));
     }
 
+  /* If we're comparing against a previous forall we need to negate the results
+     before we do the final comparison or reduction.  */
+  if (flipped)
+    {
+      /* Rewrite the if(all(mask)) into if (!all(mask)) which is the same as
+	 if (any(~mask)) by negating the masks and flipping the branches.
+
+	1. For unmasked loops we simply reduce the ~mask.
+	2. For masked loops we reduce (~mask & loop_mask) which is the same as
+	   doing (mask & loop_mask) ^ loop_mask.  */
+      for (unsigned i = 0; i < stmts.length (); i++)
+	{
+	  tree inv_lhs = make_temp_ssa_name (vectype, NULL, "vexit_inv");
+	  auto inv_stmt = gimple_build_assign (inv_lhs, BIT_NOT_EXPR, stmts[i]);
+	  vect_finish_stmt_generation (loop_vinfo, stmt_info, inv_stmt,
+				       &cond_gsi);
+	  stmts[i] = inv_lhs;
+	}
+
+      EDGE_SUCC (bb, 0)->flags ^= (EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
+      EDGE_SUCC (bb, 1)->flags ^= (EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
+    }
+
   /* Determine if we need to reduce the final value.  */
   if (stmts.length () > 1)
     {
@@ -13039,7 +13055,7 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
 	  new_temp = make_temp_ssa_name (vectype, NULL, "vexit_reduc");
 	  tree arg0 = workset.pop ();
 	  tree arg1 = workset.pop ();
-	  new_stmt = gimple_build_assign (new_temp, reduc_op, arg0, arg1);
+	  new_stmt = gimple_build_assign (new_temp, BIT_IOR_EXPR, arg0, arg1);
 	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
 				       &cond_gsi);
 	  workset.quick_insert (0, new_temp);
@@ -13059,7 +13075,7 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
 
   gcc_assert (new_temp);
 
-  gimple_cond_set_condition (cond_stmt, new_code, new_temp, cst);
+  gimple_cond_set_condition (cond_stmt, NE_EXPR, new_temp, cst);
   update_stmt (orig_stmt);
 
   if (slp_node)
