Hi All,

The following testcase shows incorrect masked codegen:

#define N 512
#define START 1
#define END 505
 
int x[N] __attribute__((aligned(32)));

int __attribute__((noipa))
foo (void)
{
  int z = 0;
  for (unsigned int i = START; i < END; ++i)
    {
      z++;
      if (x[i] > 0)
        continue;
    
      return z;
    }
  return -1;
}

notice how there's a continue there instead of a break.  This means we generate
control flow where the success case stays within the loop iteration:

  mask_patt_9.12_46 = vect__1.11_45 > { 0, 0, 0, 0 };
  vec_mask_and_47 = mask_patt_9.12_46 & loop_mask_41;
  if (vec_mask_and_47 == { -1, -1, -1, -1 })
    goto <bb 4>; [41.48%]
  else
    goto <bb 15>; [58.52%]

However, when loop_mask_41 is a partial mask this comparison can lead to an
incorrect match.  In this case the mask is:

  # loop_mask_41 = PHI <next_mask_63(6), { 0, -1, -1, -1 }(2)>

due to peeling for alignment with masking and compiling with
-msve-vector-bits=128.

At codegen time we generate:

        ptrue   p15.s, vl4
        ptrue   p7.b, vl1
        not     p7.b, p15/z, p7.b
.L5:
        ld1w    z29.s, p7/z, [x1, x0, lsl 2]
        cmpgt   p7.s, p7/z, z29.s, #0
        not     p7.b, p15/z, p7.b
        ptest   p15, p7.b
        b.none  .L2
        ...<early exit>...

notice how at expand time the basic blocks are inverted and a NOT is generated.
But the generated NOT is unmasked (or rather, predicated over an all-true mask
in this case).  This has the unintended side effect of flipping the results of
the inactive lanes (which were zeroed by the cmpgt) to -1, which then
incorrectly causes us to not take the branch to .L2.
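To make the failure concrete, here is a lane-by-lane trace of the sequence
above for the partial first iteration (the data values are illustrative; lane 0
is the inactive lane):

  loop_mask (p7):  { 0, -1, -1, -1 }   lane 0 inactive after peeling
  cmpgt (p7/z):    { 0, -1, -1, -1 }   all active lanes true, inactive zeroed
  not (p15/z):     { -1, 0, 0, 0 }     inactive lane flipped to -1
  ptest p15:       a bit is set, so b.none is not taken and we fall through
                   into the early exit even though every active lane
                   compared true.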

This happens because the expander has no context about the mask, and since we
can't mask a gcond, we do the next best thing, which is to mask both operands.

We already mask the compare, but this patch now also masks the constant.  In the
normal case this means we drop it, since {0, ..} & mask = {0, ..}, but in the
case of a forall comparison we'll keep the mask, allowing the generated code to
correctly mask the results.
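To illustrate the fix in isolation, here is a small scalar sketch (purely
illustrative; the four-lane arrays and the vec_eq helper are invented for the
example and are not vectorizer code) that models both comparisons:

#include <stdio.h>

/* Four-lane model of a predicate: -1 = lane true, 0 = lane false/inactive.  */
static int
vec_eq (const int a[4], const int b[4])
{
  for (int i = 0; i < 4; i++)
    if (a[i] != b[i])
      return 0;
  return 1;
}

int
main (void)
{
  int all_ones[4]  = { -1, -1, -1, -1 };
  int loop_mask[4] = { 0, -1, -1, -1 };   /* partial mask after peeling  */
  int cmp[4], masked_cst[4];

  /* Masked compare: every active lane compares true, inactive lanes are
     zeroed, as with cmpgt under /z predication.  */
  for (int i = 0; i < 4; i++)
    cmp[i] = -1 & loop_mask[i];

  /* Before the patch: compare against the unmasked all-ones constant.
     This fails even though every active lane is true.  */
  printf ("unmasked constant: %d (want 1)\n", vec_eq (cmp, all_ones));

  /* After the patch: mask the constant too; { -1, ... } & mask is just
     the mask, so the comparison only considers active lanes.  */
  for (int i = 0; i < 4; i++)
    masked_cst[i] = all_ones[i] & loop_mask[i];
  printf ("masked constant:   %d (want 1)\n", vec_eq (cmp, masked_cst));

  return 0;
}

Comparing against the masked constant reduces to comparing the compare result
against the loop mask itself, which is exactly what the new code tests.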

For the above we now generate:

.L5:
        ld1w    z28.s, p7/z, [x1, x0, lsl 2]
        cmpgt   p14.s, p7/z, z28.s, #0
        eors    p7.b, p15/z, p7.b, p14.b
        b.none  .L2
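The eors here is a predicated exclusive OR that also sets the condition flags:
the XOR result is zero exactly when the masked compare equals the loop mask,
i.e. when every active lane compared true, so b.none now branches to .L2 in
precisely the right cases.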

This fixes gromacs when run with more than one OpenMP thread and improves
performance.

Bootstrapped and regtested on aarch64-none-linux-gnu,
arm-none-linux-gnueabihf, and x86_64-pc-linux-gnu
(-m32 and -m64) with no issues.

Ok for master, and for a backport to GCC 14?

Thanks,
Tamar


gcc/ChangeLog:

        PR tree-optimization/119351
        * tree-vect-stmts.cc (vectorizable_early_exit): Mask both operands of
        the gcond for partial masking support.

gcc/testsuite/ChangeLog:

        PR tree-optimization/119351
        * gcc.target/aarch64/sve/pr119351.c: New test.
        * gcc.target/aarch64/sve/pr119351_run.c: New test.

---
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr119351.c b/gcc/testsuite/gcc.target/aarch64/sve/pr119351.c
new file mode 100644
index 0000000000000000000000000000000000000000..198f7edb0fc01bfc74ae231db7823e9a6f0bc119
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr119351.c
@@ -0,0 +1,38 @@
+/* Fix for PR119351 alignment peeling with vectors and VLS.  */
+/* { dg-do compile } */
+/* { dg-options "-Ofast -msve-vector-bits=256 --param aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
+/* { dg-final { check-function-bodies "**" "" ""} } */
+
+#define N 512
+#define START 1
+#define END 505
+ 
+int x[N] __attribute__((aligned(32)));
+
+/*
+** foo:
+**	...
+**	ld1w	z[0-9]+.s, p[0-9]+/z, \[x[0-9], x[0-9], lsl 2\]
+**	cmpgt	p[0-9]+.s, p[0-9]+/z, z[0-9]+.s, #0
+**	eors	p[0-9]+.b, p[0-9]+/z, p[0-9]+.b, p[0-9]+.b
+**	...
+*/
+
+int __attribute__((noipa))
+foo (void)
+{
+  int z = 0;
+  for (unsigned int i = START; i < END; ++i)
+    {
+      z++;
+      if (x[i] > 0)
+        continue;
+    
+      return z;
+    }
+  return -1;
+}
+
+/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
+/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
+/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pr119351_run.c b/gcc/testsuite/gcc.target/aarch64/sve/pr119351_run.c
new file mode 100644
index 0000000000000000000000000000000000000000..d36ab0eb7a900504e7dc2266ec5a19d1beeb5123
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pr119351_run.c
@@ -0,0 +1,20 @@
+/* Fix for PR119351 alignment peeling with vectors and VLS.  */
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
+/* { dg-additional-options "-msve-vector-bits=256" { target aarch64_sve256_hw } } */
+/* { dg-additional-options "-msve-vector-bits=128" { target aarch64_sve128_hw } } */
+
+#include "pr119351.c"
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  x[0] = 1;
+  x[1] = 21;
+  x[2] = 39;
+  x[3] = 59;
+  int res = foo ();
+  if (res != 4)
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 7f874354e75e8d4d3a7196df4e9b559ac641827c..8f67483a943705d89e3f6d744cb2bdb5cf8b44db 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -13781,6 +13781,20 @@ vectorizable_early_exit (vec_info *vinfo, stmt_vec_info stmt_info,
 
   gcc_assert (new_temp);
 
+  /* We have to mask both operands of the gcond because if one element of the
+     mask is inactive and was zero'd we would incorrectly compare true for the
+     inactive element.  */
+  if (masked_loop_p)
+    {
+      tree mask
+	= vect_get_loop_mask (loop_vinfo, gsi, masks, ncopies, vectype, 0);
+      cst
+	= prepare_vec_mask (loop_vinfo, TREE_TYPE (mask), mask, cst, &cond_gsi);
+    }
+  else if (len_loop_p)
+    cst = vect_gen_loop_len_mask (loop_vinfo, gsi, &cond_gsi, lens,
+				  ncopies, vectype, cst, 0, 1);
+
   gimple_cond_set_condition (cond_stmt, new_code, new_temp, cst);
   update_stmt (orig_stmt);
 
