https://gcc.gnu.org/g:6669dc51515313dd1e60c493596dbc90429fc362
commit r15-1239-g6669dc51515313dd1e60c493596dbc90429fc362
Author: Richard Biener <rguent...@suse.de>
Date:   Fri Jun 7 14:47:12 2024 +0200

    tree-optimization/115385 - handle more gaps with peeling of a single iteration

    The following makes peeling of a single scalar iteration handle more
    gaps, including non-power-of-two cases.  This can be done by rounding
    up the remaining access to the next power of two, which ensures that
    the next scalar iteration covers at least the number of excess
    elements we access.

    I've added a correctness testcase and an x86-specific testcase
    scanning for the optimization.

            PR tree-optimization/115385
            * tree-vect-stmts.cc (get_group_load_store_type): Peeling
            of a single scalar iteration is sufficient if we can narrow
            the access to the next power of two of the bits in the last
            access.
            (vectorizable_load): Ensure that the last access is narrowed.

            * gcc.dg/vect/pr115385.c: New testcase.
            * gcc.target/i386/vect-pr115385.c: Likewise.

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr115385.c          | 88 +++++++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/vect-pr115385.c | 53 ++++++++++++++++
 gcc/tree-vect-stmts.cc                        | 44 ++++++++++++--
 3 files changed, 180 insertions(+), 5 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr115385.c b/gcc/testsuite/gcc.dg/vect/pr115385.c
new file mode 100644
index 000000000000..a18cd665d7d0
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr115385.c
@@ -0,0 +1,88 @@
+/* { dg-require-effective-target mmap } */
+
+#include <sys/mman.h>
+#include <stdio.h>
+
+#define COUNT 511
+#define MMAP_SIZE 0x20000
+#define ADDRESS 0x1122000000
+#define TYPE unsigned char
+
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS MAP_ANON
+#endif
+
+void __attribute__((noipa)) foo(TYPE * __restrict x,
+                                TYPE *y, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      x[16*i+0] = y[3*i+0];
+      x[16*i+1] = y[3*i+1];
+      x[16*i+2] = y[3*i+2];
+      x[16*i+3] = y[3*i+0];
+      x[16*i+4] = y[3*i+1];
+      x[16*i+5] = y[3*i+2];
+      x[16*i+6] = y[3*i+0];
+      x[16*i+7] = y[3*i+1];
+      x[16*i+8] = y[3*i+2];
+      x[16*i+9] = y[3*i+0];
+      x[16*i+10] = y[3*i+1];
+      x[16*i+11] = y[3*i+2];
+      x[16*i+12] = y[3*i+0];
+      x[16*i+13] = y[3*i+1];
+      x[16*i+14] = y[3*i+2];
+      x[16*i+15] = y[3*i+0];
+    }
+}
+
+void __attribute__((noipa)) bar(TYPE * __restrict x,
+                                TYPE *y, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      x[16*i+0] = y[5*i+0];
+      x[16*i+1] = y[5*i+1];
+      x[16*i+2] = y[5*i+2];
+      x[16*i+3] = y[5*i+3];
+      x[16*i+4] = y[5*i+4];
+      x[16*i+5] = y[5*i+0];
+      x[16*i+6] = y[5*i+1];
+      x[16*i+7] = y[5*i+2];
+      x[16*i+8] = y[5*i+3];
+      x[16*i+9] = y[5*i+4];
+      x[16*i+10] = y[5*i+0];
+      x[16*i+11] = y[5*i+1];
+      x[16*i+12] = y[5*i+2];
+      x[16*i+13] = y[5*i+3];
+      x[16*i+14] = y[5*i+4];
+      x[16*i+15] = y[5*i+0];
+    }
+}
+
+TYPE x[COUNT * 16];
+
+int
+main (void)
+{
+  void *y;
+  TYPE *end_y;
+
+  y = mmap ((void *) ADDRESS, MMAP_SIZE, PROT_READ | PROT_WRITE,
+            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  if (y == MAP_FAILED)
+    {
+      perror ("mmap");
+      return 1;
+    }
+
+  end_y = (TYPE *) ((char *) y + MMAP_SIZE);
+
+  foo (x, end_y - COUNT * 3, COUNT);
+  bar (x, end_y - COUNT * 5, COUNT);
+
+  return 0;
+}
+
+/* We always require a scalar epilogue here but we don't know which
+   targets support vector composition this way.  */
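To put numbers to the rounding rule, here is a small standalone
illustration (not part of the patch) applied to the two load groups in
the testcase above.  It assumes the last vector access still needs
"remain" elements and that remain equals the load group size; that is a
simplification, since the patch derives the value from the vectorization
factor and the vector lane count:

/* Illustration only, not GCC source: apply the patch's rounding of the
   remaining elements (1 << ceil_log2 (cremain) in the tree-vect-stmts.cc
   changes below) and print how far past the group each load width
   reads.  */
#include <stdio.h>

static unsigned
next_pow2 (unsigned n)
{
  unsigned p = 1;
  while (p < n)
    p <<= 1;
  return p;
}

int
main (void)
{
  const unsigned nunits = 16;          /* lanes of a 16-byte char vector */
  const unsigned groups[] = { 3, 5 };  /* load group sizes of foo and bar */

  for (unsigned i = 0; i < 2; ++i)
    {
      unsigned remain = groups[i];     /* simplifying assumption, see above */
      unsigned part = next_pow2 (remain);
      printf ("group %u: full %u-lane load overreads %u elements, "
              "narrowed %u-lane load overreads %u\n",
              groups[i], nunits, nunits - remain, part, part - remain);
    }
  return 0;
}

For foo this prints an overread of 13 elements for a full-width load but
only 1 for the narrowed 4-lane load; roughly speaking, the peeled scalar
iteration reads 3 further elements of y, so the narrowed access stays
within the mapped page, which is what the mmap-based test checks.
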
diff --git a/gcc/testsuite/gcc.target/i386/vect-pr115385.c b/gcc/testsuite/gcc.target/i386/vect-pr115385.c
new file mode 100644
index 000000000000..a6be9ce4e543
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-pr115385.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -msse4.1 -mno-avx -fdump-tree-vect-details" } */
+
+void __attribute__((noipa)) foo(unsigned char * __restrict x,
+                                unsigned char *y, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      x[16*i+0] = y[3*i+0];
+      x[16*i+1] = y[3*i+1];
+      x[16*i+2] = y[3*i+2];
+      x[16*i+3] = y[3*i+0];
+      x[16*i+4] = y[3*i+1];
+      x[16*i+5] = y[3*i+2];
+      x[16*i+6] = y[3*i+0];
+      x[16*i+7] = y[3*i+1];
+      x[16*i+8] = y[3*i+2];
+      x[16*i+9] = y[3*i+0];
+      x[16*i+10] = y[3*i+1];
+      x[16*i+11] = y[3*i+2];
+      x[16*i+12] = y[3*i+0];
+      x[16*i+13] = y[3*i+1];
+      x[16*i+14] = y[3*i+2];
+      x[16*i+15] = y[3*i+0];
+    }
+}
+
+void __attribute__((noipa)) bar(unsigned char * __restrict x,
+                                unsigned char *y, int n)
+{
+  for (int i = 0; i < n; ++i)
+    {
+      x[16*i+0] = y[5*i+0];
+      x[16*i+1] = y[5*i+1];
+      x[16*i+2] = y[5*i+2];
+      x[16*i+3] = y[5*i+3];
+      x[16*i+4] = y[5*i+4];
+      x[16*i+5] = y[5*i+0];
+      x[16*i+6] = y[5*i+1];
+      x[16*i+7] = y[5*i+2];
+      x[16*i+8] = y[5*i+3];
+      x[16*i+9] = y[5*i+4];
+      x[16*i+10] = y[5*i+0];
+      x[16*i+11] = y[5*i+1];
+      x[16*i+12] = y[5*i+2];
+      x[16*i+13] = y[5*i+3];
+      x[16*i+14] = y[5*i+4];
+      x[16*i+15] = y[5*i+0];
+    }
+}
+
+/* { dg-final { scan-tree-dump "Data access with gaps requires scalar epilogue loop" "vect"} } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect"} } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index b151f53c2fbd..05fe523da993 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2151,11 +2151,24 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
                                             nunits, &tem, &remain)
                       || maybe_lt (remain + group_size, nunits)))
             {
-              if (dump_enabled_p ())
-                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                 "peeling for gaps insufficient for "
-                                 "access\n");
-              return false;
+              /* But peeling a single scalar iteration is enough if
+                 we can use the next power-of-two sized partial
+                 access.  */
+              unsigned HOST_WIDE_INT cnunits, cvf, cremain, cpart_size;
+              if (!nunits.is_constant (&cnunits)
+                  || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf)
+                  || ((cremain = remain.to_constant (), true)
+                      && ((cpart_size = (1 << ceil_log2 (cremain))) != cnunits)
+                      && vector_vector_composition_type
+                           (vectype, cnunits / cpart_size,
+                            &half_vtype) == NULL_TREE))
+                {
+                  if (dump_enabled_p ())
+                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                     "peeling for gaps insufficient for "
+                                     "access\n");
+                  return false;
+                }
             }
 
           /* If this is single-element interleaving with an element
@@ -11584,6 +11597,27 @@ vectorizable_load (vec_info *vinfo,
                   gcc_assert (new_vtype
                               || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
+                  /* But still reduce the access size to the next
+                     required power-of-two so peeling a single
+                     scalar iteration is sufficient.  */
+                  unsigned HOST_WIDE_INT cremain;
+                  if (remain.is_constant (&cremain))
+                    {
+                      unsigned HOST_WIDE_INT cpart_size
+                        = 1 << ceil_log2 (cremain);
+                      if (known_gt (nunits, cpart_size)
+                          && constant_multiple_p (nunits, cpart_size,
+                                                  &num))
+                        {
+                          tree ptype;
+                          new_vtype
+                            = vector_vector_composition_type (vectype,
+                                                              num,
+                                                              &ptype);
+                          if (new_vtype)
+                            ltype = ptype;
+                        }
+                    }
                 }
             }
 
           tree offset
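
The two tree-vect-stmts.cc hunks implement the same idea at analysis
time (get_group_load_store_type) and at code-generation time
(vectorizable_load).  The following standalone sketch restates the
combined logic under simplifying assumptions: sizes are plain integers
rather than poly_ints, and vector_vector_composition_type is reduced to
a divisibility test, whereas the real code also asks the target whether
it can build a vector out of the pieces.  The helper and struct names
are made up for the sketch:

/* Hypothetical sketch, not GCC source.  nunits is the lane count of
   the vector type, remain the number of elements the last vector
   access still needs.  */
#include <stdbool.h>

struct narrowed_access
{
  unsigned part_size;  /* lanes per narrowed load */
  unsigned num;        /* narrowed pieces composed into one vector */
};

/* Mirrors 1 << ceil_log2 (n) in the patch.  */
static unsigned
next_pow2 (unsigned n)
{
  unsigned p = 1;
  while (p < n)
    p <<= 1;
  return p;
}

/* Analysis side: peeling one scalar iteration is sufficient when the
   last access can be narrowed to power-of-two sized parts that the
   target can recompose into a full vector.  */
static bool
single_iteration_peeling_ok (unsigned nunits, unsigned remain)
{
  unsigned part_size = next_pow2 (remain);
  if (part_size == nunits)
    return true;            /* full-width access, nothing to narrow */
  /* Stand-in for the vector_vector_composition_type query.  */
  return nunits % part_size == 0;
}

/* Code-generation side: narrow the last access and record how many
   pieces compose one full vector.  */
static struct narrowed_access
narrow_last_access (unsigned nunits, unsigned remain)
{
  struct narrowed_access a = { nunits, 1 };  /* default: full width */
  unsigned part_size = next_pow2 (remain);
  if (part_size < nunits && nunits % part_size == 0)
    {
      a.part_size = part_size;
      a.num = nunits / part_size;
    }
  return a;
}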
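
Worked through for foo's 3-element group with 16-lane byte vectors, the
sketch yields part_size 4 and num 4: the last full-width load is
replaced by 4-lane pieces whose single-element overread is covered by
the peeled scalar iteration.  On x86 with SSE4.1 this composition is
presumably what the i386 testcase pins down when it scans both for the
"Data access with gaps requires scalar epilogue loop" message and for
both loops still being vectorized.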