https://gcc.gnu.org/g:62cdddd4e621a8182c58161188009f1e9b256e1b
commit r15-4383-g62cdddd4e621a8182c58161188009f1e9b256e1b Author: Richard Biener <rguent...@suse.de> Date: Wed Oct 16 10:09:36 2024 +0200 Enhance gather fallback for PR65518 with SLP With SLP forced we fail to use gather for PR65518 on RISC-V as expected because we're failing due to not effective peeling for gaps. The following appropriately moves the memory_access_type adjustment before doing all the overrun checking since using VMAT_ELEMENTWISE means there's no overrun. * tree-vect-stmts.cc (get_group_load_store_type): Move VMAT_ELEMENTWISE fallback for single-element interleaving of too large groups before overrun checking. * gcc.dg/vect/pr65518.c: Adjust. Diff: --- gcc/testsuite/gcc.dg/vect/pr65518.c | 109 ++++++++++++++++++------------------ gcc/tree-vect-stmts.cc | 58 ++++++++++--------- 2 files changed, 85 insertions(+), 82 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/pr65518.c b/gcc/testsuite/gcc.dg/vect/pr65518.c index 189a65534f61..6d8515061694 100644 --- a/gcc/testsuite/gcc.dg/vect/pr65518.c +++ b/gcc/testsuite/gcc.dg/vect/pr65518.c @@ -1,54 +1,55 @@ -#include "tree-vect.h" - -#if VECTOR_BITS > 256 -#define NINTS (VECTOR_BITS / 32) -#else -#define NINTS 8 -#endif - -#define N (NINTS * 2) -#define RESULT (NINTS * (NINTS - 1) / 2 * N + NINTS) - -extern void abort (void); - -typedef struct giga -{ - unsigned int g[N]; -} giga; - -unsigned long __attribute__((noinline,noclone)) -addfst(giga const *gptr, int num) -{ - unsigned int retval = 0; - int i; - for (i = 0; i < num; i++) - retval += gptr[i].g[0]; - return retval; -} - -int main () -{ - struct giga g[NINTS]; - unsigned int n = 1; - int i, j; - check_vect (); - for (i = 0; i < NINTS; ++i) - for (j = 0; j < N; ++j) - { - g[i].g[j] = n++; - __asm__ volatile (""); - } - if (addfst (g, NINTS) != RESULT) - abort (); - return 0; -} - -/* We don't want to vectorize the single-element interleaving in the way - we currently do that (without ignoring not needed vectors in the - gap between gptr[0].g[0] and gptr[1].g[0]), because that's very - sub-optimal and causes memory explosion (even though the cost model - should reject that in the end). */ - -/* { dg-final { scan-tree-dump-times "vectorized 0 loops in function" 2 "vect" { target {! riscv*-*-* } } } } */ -/* We end up using gathers for the strided load on RISC-V which would be OK. */ -/* { dg-final { scan-tree-dump "using gather/scatter for strided/grouped access" "vect" { target { riscv*-*-* } } } } */ +#include "tree-vect.h" + +#if VECTOR_BITS > 256 +#define NINTS (VECTOR_BITS / 32) +#else +#define NINTS 8 +#endif + +#define N (NINTS * 2) +#define RESULT (NINTS * (NINTS - 1) / 2 * N + NINTS) + +extern void abort (void); + +typedef struct giga +{ + unsigned int g[N]; +} giga; + +unsigned long __attribute__((noinline,noclone)) +addfst(giga const *gptr, int num) +{ + unsigned int retval = 0; + int i; + for (i = 0; i < num; i++) + retval += gptr[i].g[0]; + return retval; +} + +int main () +{ + struct giga g[NINTS]; + unsigned int n = 1; + int i, j; + check_vect (); + for (i = 0; i < NINTS; ++i) + for (j = 0; j < N; ++j) + { + g[i].g[j] = n++; + __asm__ volatile (""); + } + if (addfst (g, NINTS) != RESULT) + abort (); + return 0; +} + +/* We don't want to vectorize the single-element interleaving in the way + we currently do that (without ignoring not needed vectors in the + gap between gptr[0].g[0] and gptr[1].g[0]), because that's very + sub-optimal and causes memory explosion (even though the cost model + should reject that in the end). */ + +/* { dg-final { scan-tree-dump-times "vectorized 0 loops in function" 2 "vect" { target {! riscv*-*-* } } } } */ +/* We should end up using gathers for the strided load on RISC-V. */ +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 1 "vect" { target { riscv*-*-* } } } } */ +/* { dg-final { scan-tree-dump "using gather/scatter for strided/grouped access" "vect" { target { riscv*-*-* } } } } */ diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc index 9b14b96cb5a6..6967d50288e9 100644 --- a/gcc/tree-vect-stmts.cc +++ b/gcc/tree-vect-stmts.cc @@ -2081,6 +2081,35 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, else *memory_access_type = VMAT_CONTIGUOUS; + /* If this is single-element interleaving with an element + distance that leaves unused vector loads around punt - we + at least create very sub-optimal code in that case (and + blow up memory, see PR65518). */ + if (loop_vinfo + && *memory_access_type == VMAT_CONTIGUOUS + && single_element_p + && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype))) + { + if (SLP_TREE_LANES (slp_node) == 1) + { + *memory_access_type = VMAT_ELEMENTWISE; + overrun_p = false; + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "single-element interleaving not supported " + "for not adjacent vector loads, using " + "elementwise access\n"); + } + else + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "single-element interleaving not supported " + "for not adjacent vector loads\n"); + return false; + } + } + overrun_p = loop_vinfo && gap != 0; if (overrun_p && vls_type != VLS_LOAD) { @@ -2149,6 +2178,7 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, "Peeling for outer loop is not supported\n"); return false; } + /* Peeling for gaps assumes that a single scalar iteration is enough to make sure the last vector iteration doesn't access excess elements. */ @@ -2179,34 +2209,6 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info, return false; } } - - /* If this is single-element interleaving with an element - distance that leaves unused vector loads around punt - we - at least create very sub-optimal code in that case (and - blow up memory, see PR65518). */ - if (loop_vinfo - && *memory_access_type == VMAT_CONTIGUOUS - && single_element_p - && maybe_gt (group_size, TYPE_VECTOR_SUBPARTS (vectype))) - { - if (SLP_TREE_LANES (slp_node) == 1) - { - *memory_access_type = VMAT_ELEMENTWISE; - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "single-element interleaving not supported " - "for not adjacent vector loads, using " - "elementwise access\n"); - } - else - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "single-element interleaving not supported " - "for not adjacent vector loads\n"); - return false; - } - } } } else