[gcc r15-4383] Enhance gather fallback for PR65518 with SLP

Richard Biener via Gcc-cvs Wed, 16 Oct 2024 04:05:08 -0700

https://gcc.gnu.org/g:62cdddd4e621a8182c58161188009f1e9b256e1b


commit r15-4383-g62cdddd4e621a8182c58161188009f1e9b256e1b
Author: Richard Biener <rguent...@suse.de>
Date:   Wed Oct 16 10:09:36 2024 +0200

    Enhance gather fallback for PR65518 with SLP
    
    With SLP forced we fail to use gather for PR65518 on RISC-V as expected
    because the access is rejected when peeling for gaps is not effective.
    The following moves the memory_access_type adjustment before all of
    the overrun checking, since using VMAT_ELEMENTWISE means there's no
    overrun.
    
            * tree-vect-stmts.cc (get_group_load_store_type): Move
            VMAT_ELEMENTWISE fallback for single-element interleaving
            of too large groups before overrun checking.
    
            * gcc.dg/vect/pr65518.c: Adjust.

Diff:
---
 gcc/testsuite/gcc.dg/vect/pr65518.c | 109 ++++++++++++++++++------------------
 gcc/tree-vect-stmts.cc              |  58 ++++++++++---------
 2 files changed, 85 insertions(+), 82 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/vect/pr65518.c b/gcc/testsuite/gcc.dg/vect/pr65518.c
index 189a65534f61..6d8515061694 100644
--- a/gcc/testsuite/gcc.dg/vect/pr65518.c
+++ b/gcc/testsuite/gcc.dg/vect/pr65518.c
@@ -1,54 +1,55 @@
-#include "tree-vect.h"
-
-#if VECTOR_BITS > 256
-#define NINTS (VECTOR_BITS / 32)
-#else
-#define NINTS 8
-#endif
-
-#define N (NINTS * 2)
-#define RESULT (NINTS * (NINTS - 1) / 2 * N + NINTS)
-
-extern void abort (void);
-
-typedef struct giga
-{
-  unsigned int g[N];
-} giga;
-
-unsigned long __attribute__((noinline,noclone))
-addfst(giga const *gptr, int num)
-{
-  unsigned int retval = 0;
-  int i;
-  for (i = 0; i < num; i++)
-    retval += gptr[i].g[0];
-  return retval;
-}
-
-int main ()
-{
-  struct giga g[NINTS];
-  unsigned int n = 1;
-  int i, j;
-  check_vect ();
-  for (i = 0; i < NINTS; ++i)
-    for (j = 0; j < N; ++j)
-      {
-       g[i].g[j] = n++;
-       __asm__ volatile ("");
-      }
-  if (addfst (g, NINTS) != RESULT)
-    abort ();
-  return 0;
-}
-
-/* We don't want to vectorize the single-element interleaving in the way
-   we currently do that (without ignoring not needed vectors in the
-   gap between gptr[0].g[0] and gptr[1].g[0]), because that's very
-   sub-optimal and causes memory explosion (even though the cost model
-   should reject that in the end).  */
-
-/* { dg-final { scan-tree-dump-times "vectorized 0 loops in function" 2 "vect" { target {! riscv*-*-* } } } } */
-/* We end up using gathers for the strided load on RISC-V which would be OK.  */
-/* { dg-final { scan-tree-dump "using gather/scatter for strided/grouped access" "vect" { target { riscv*-*-* } } } } */
+#include "tree-vect.h"
+
+#if VECTOR_BITS > 256
+#define NINTS (VECTOR_BITS / 32)
+#else
+#define NINTS 8
+#endif
+
+#define N (NINTS * 2)
+#define RESULT (NINTS * (NINTS - 1) / 2 * N + NINTS)
+
+extern void abort (void);
+
+typedef struct giga
+{
+  unsigned int g[N];
+} giga;
+
+unsigned long __attribute__((noinline,noclone))
+addfst(giga const *gptr, int num)
+{
+  unsigned int retval = 0;
+  int i;
+  for (i = 0; i < num; i++)
+    retval += gptr[i].g[0];
+  return retval;
+}
+
+int main ()
+{
+  struct giga g[NINTS];
+  unsigned int n = 1;
+  int i, j;
+  check_vect ();
+  for (i = 0; i < NINTS; ++i)
+    for (j = 0; j < N; ++j)
+      {
+       g[i].g[j] = n++;
+       __asm__ volatile ("");
+      }
+  if (addfst (g, NINTS) != RESULT)
+    abort ();
+  return 0;
+}
+
+/* We don't want to vectorize the single-element interleaving in the way
+   we currently do that (without ignoring not needed vectors in the
+   gap between gptr[0].g[0] and gptr[1].g[0]), because that's very
+   sub-optimal and causes memory explosion (even though the cost model
+   should reject that in the end).  */
+
+/* { dg-final { scan-tree-dump-times "vectorized 0 loops in function" 2 "vect" { target {! riscv*-*-* } } } } */
+/* We should end up using gathers for the strided load on RISC-V.  */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 1 "vect" { target { riscv*-*-* } } } } */
+/* { dg-final { scan-tree-dump "using gather/scatter for strided/grouped access" "vect" { target { riscv*-*-* } } } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 9b14b96cb5a6..6967d50288e9 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2081,6 +2081,35 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
          else
            *memory_access_type = VMAT_CONTIGUOUS;
 
+         /* If this is single-element interleaving with an element
+            distance that leaves unused vector loads around punt - we
+            at least create very sub-optimal code in that case (and
+            blow up memory, see PR65518).  */
+         if (loop_vinfo
+             && *memory_access_type == VMAT_CONTIGUOUS
+             && single_element_p
+             && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
+           {
+             if (SLP_TREE_LANES (slp_node) == 1)
+               {
+                 *memory_access_type = VMAT_ELEMENTWISE;
+                 overrun_p = false;
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "single-element interleaving not supported "
+                                    "for not adjacent vector loads, using "
+                                    "elementwise access\n");
+               }
+             else
+               {
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "single-element interleaving not supported "
+                                    "for not adjacent vector loads\n");
+                 return false;
+               }
+           }
+
          overrun_p = loop_vinfo && gap != 0;
          if (overrun_p && vls_type != VLS_LOAD)
            {
@@ -2149,6 +2178,7 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
                                 "Peeling for outer loop is not supported\n");
              return false;
            }
+
          /* Peeling for gaps assumes that a single scalar iteration
             is enough to make sure the last vector iteration doesn't
             access excess elements.  */
@@ -2179,34 +2209,6 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
                  return false;
                }
            }
-
-         /* If this is single-element interleaving with an element
-            distance that leaves unused vector loads around punt - we
-            at least create very sub-optimal code in that case (and
-            blow up memory, see PR65518).  */
-         if (loop_vinfo
-             && *memory_access_type == VMAT_CONTIGUOUS
-             && single_element_p
-             && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
-           {
-             if (SLP_TREE_LANES (slp_node) == 1)
-               {
-                 *memory_access_type = VMAT_ELEMENTWISE;
-                 if (dump_enabled_p ())
-                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                    "single-element interleaving not supported "
-                                    "for not adjacent vector loads, using "
-                                    "elementwise access\n");
-               }
-             else
-               {
-                 if (dump_enabled_p ())
-                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                    "single-element interleaving not supported "
-                                    "for not adjacent vector loads\n");
-                 return false;
-               }
-           }
        }
     }
   else

[gcc r15-4383] Enhance gather fallback for PR65518 with SLP

Reply via email to