https://gcc.gnu.org/g:f4e259b4a66c81c234608056117836e13606e4c8

commit r15-6810-gf4e259b4a66c81c234608056117836e13606e4c8
Author: Alex Coplan <alex.cop...@arm.com>
Date:   Thu Jul 25 16:34:05 2024 +0000

    vect: Ensure we add vector skip guard even when versioning for aliasing 
[PR118211]
    
    This fixes a latent wrong code issue whereby vect_do_peeling determined
    the wrong condition for inserting the vector skip guard.  Specifically
    in the case where the loop niters are unknown at compile time we used to
    check:
    
      !LOOP_REQUIRES_VERSIONING (loop_vinfo)
    
    but LOOP_REQUIRES_VERSIONING is true for loops which we have versioned
    for aliasing, and that has nothing to do with prolog peeling.  I think
    this condition should instead be checking specifically if we aren't
    versioning for alignment.
    
    As it stands, when we version for alignment, we don't peel, so the
    vector skip guard is indeed redundant in that case.
    
    With the testcase added (reduced from the Fortran frontend) we would
    version for aliasing, omit the vector skip guard, and then at runtime we
    would peel sufficient iterations for alignment that there wasn't a full
    vector iteration left when we entered the vector body, thus overflowing
    the output buffer.
    
    gcc/ChangeLog:
    
            PR tree-optimization/118211
            PR tree-optimization/116126
            * tree-vect-loop-manip.cc (vect_do_peeling): Adjust skip_vector
            condition to only omit the edge if we're versioning for
            alignment.
    
    gcc/testsuite/ChangeLog:
    
            PR tree-optimization/118211
            PR tree-optimization/116126
            * gcc.dg/vect/vect-early-break_130.c: New test.

Diff:
---
 gcc/testsuite/gcc.dg/vect/vect-early-break_130.c | 91 ++++++++++++++++++++++++
 gcc/tree-vect-loop-manip.cc                      |  2 +-
 2 files changed, 92 insertions(+), 1 deletion(-)

diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_130.c 
b/gcc/testsuite/gcc.dg/vect/vect-early-break_130.c
new file mode 100644
index 000000000000..ce43fcd5681c
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_130.c
@@ -0,0 +1,91 @@
+/* { dg-require-effective-target mmap } */
+/* { dg-add-options vect_early_break } */
+
+#include <sys/mman.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+
+/* This was reduced from gcc/fortran/scanner.cc:gfc_widechar_to_char.
+   The problem was that we omitted adding the vector skip guard when
+   versioning for aliasing.  When invoked on a string that is 28 bytes
+   long, that caused us to enter the vector body after having peeled 15
+   iterations, leaving only 13 iterations to be performed as vector, but
+   the vector body performs 16 (thus overflowing the res buffer by three
+   bytes).  */
+__attribute__((noipa))
+void f (const uint32_t *s, char *res, int length)
+{
+  unsigned long i;
+
+  for (i = 0; i < length; i++)
+    {
+      if (s[i] > 255)
+        __builtin_abort ();
+      res[i] = (char)s[i];
+    }
+}
+
+int main(void)
+{
+  long pgsz = sysconf (_SC_PAGESIZE);
+  if (pgsz == -1) {
+    fprintf (stderr, "sysconf failed: %m\n");
+    return 0;
+  }
+
+  void *p = mmap (NULL,
+      pgsz * 2,
+      PROT_READ | PROT_WRITE,
+      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  if (p == MAP_FAILED) {
+    fprintf (stderr, "mmap failed: %m\n");
+    return 0;
+  }
+
+  if (mprotect (p + pgsz, pgsz, PROT_NONE)) {
+    fprintf (stderr, "mprotect failed: %m\n");
+    return 0;
+  }
+
+  uint32_t in[128];
+  memset (in, 0, sizeof(in));
+
+  uintptr_t x = (uintptr_t)in;
+
+  /* We want to make our input pointer maximally misaligned (so we have
+     to peel the greatest possible number of iterations for alignment).
+     We need two bits of alignment for our uint32_t pointer to be
+     aligned.  Assuming we process 16 chars per vector iteration, we
+     will need to load 16 uint32_ts, thus we need a further 4 bits of
+     alignment.  */
+  const uintptr_t align_bits = 2 + 4;
+  const uintptr_t align_p2 = (1 << align_bits);
+  const uintptr_t align_p2m1 = align_p2 - 1;
+
+  if (x & align_p2m1 <= 4)
+    x &= -align_p2; /* Round down.  */
+  else
+    x = (x + align_p2m1) & -align_p2; /* Round up.  */
+
+  /* Add one uint32_t to get maximally misaligned.  */
+  uint32_t *inp = (uint32_t *)x + 1;
+
+  const char *str = "dec-comparison-complex_1.f90";
+  long n;
+#pragma GCC novector
+  for (n = 0; str[n]; n++)
+    inp[n] = str[n];
+
+  if (n > pgsz)
+    __builtin_abort ();
+
+  char *buf = p + pgsz - n;
+  f (inp, buf, n);
+
+#pragma GCC novector
+  for (int i = 0; i < n; i++)
+    if (buf[i] != str[i])
+      __builtin_abort ();
+}
diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
index 9a55a5611ccc..06ca99eaab95 100644
--- a/gcc/tree-vect-loop-manip.cc
+++ b/gcc/tree-vect-loop-manip.cc
@@ -3271,7 +3271,7 @@ vect_do_peeling (loop_vec_info loop_vinfo, tree niters, 
tree nitersm1,
   bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
                      ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
                                  bound_prolog + bound_epilog)
-                     : (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+                     : (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
                         || vect_epilogues));
 
   /* Epilog loop must be executed if the number of iterations for epilog

Reply via email to