Richard Biener <rguent...@suse.de> writes:
> The testcase shows that we can end up with a contiguous access across
> loop iterations but by means of permutations the elements accessed
> might only cover parts of a vector. In this case we end up with
> GROUP_GAP == 0 but still need to avoid accessing excess elements
> in the last loop iterations. Peeling for gaps is designed to cover
> this but a single scalar iteration might not cover all of the excess
> elements. The following ensures peeling for gaps is done in this
> situation and when that isn't sufficient because we need to peel
> more than one iteration (gcc.dg/vect/pr103116-2.c), fail the SLP
> vectorization.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu.
>
> OK?
LGTM. In principle I think we could (in future) handle some of the !multiple_p cases for variable-length vectors, but I don't think it would ever trigger in practice yet, given the limited permutes we support in that case. Thanks, Richard > > Thanks, > Richard. > > 2022-05-04 Richard Biener <rguent...@suse.de> > > PR tree-optimization/103116 > * tree-vect-stmts.cc (get_group_load_store_type): Handle the > case we need peeling for gaps even though GROUP_GAP is zero. > > * gcc.dg/vect/pr103116-1.c: New testcase. > * gcc.dg/vect/pr103116-2.c: Likewise. > --- > gcc/testsuite/gcc.dg/vect/pr103116-1.c | 50 ++++++++++++++++++++++ > gcc/testsuite/gcc.dg/vect/pr103116-2.c | 59 ++++++++++++++++++++++++++ > gcc/tree-vect-stmts.cc | 31 ++++++++++++++ > 3 files changed, 140 insertions(+) > create mode 100644 gcc/testsuite/gcc.dg/vect/pr103116-1.c > create mode 100644 gcc/testsuite/gcc.dg/vect/pr103116-2.c > > diff --git a/gcc/testsuite/gcc.dg/vect/pr103116-1.c > b/gcc/testsuite/gcc.dg/vect/pr103116-1.c > new file mode 100644 > index 00000000000..d3639fc8cfd > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/pr103116-1.c > @@ -0,0 +1,50 @@ > +/* { dg-require-effective-target mmap } */ > + > +#include <sys/mman.h> > +#include <stdio.h> > + > +#define COUNT 128 > +#define MMAP_SIZE 0x20000 > +#define ADDRESS 0x1122000000 > +#define TYPE unsigned int > + > +#ifndef MAP_ANONYMOUS > +#define MAP_ANONYMOUS MAP_ANON > +#endif > + > +void __attribute__((noipa)) > +loop (TYPE *restrict x, TYPE *restrict y) > +{ > + for (int i = 0; i < COUNT; ++i) > + { > + x[i * 4] = y[i * 2] + 1; > + x[i * 4 + 1] = y[i * 2] + 2; > + x[i * 4 + 2] = y[i * 2 + 1] + 3; > + x[i * 4 + 3] = y[i * 2 + 1] + 4; > + } > +} > + > +TYPE x[COUNT * 4]; > + > +int > +main (void) > +{ > + void *y; > + TYPE *end_y; > + > + y = mmap ((void *) ADDRESS, MMAP_SIZE, PROT_READ | PROT_WRITE, > + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); > + if (y == MAP_FAILED) > + { > + perror ("mmap"); > + return 1; > + } > + > + end_y = (TYPE *) 
((char *) y + MMAP_SIZE); > + > + loop (x, end_y - COUNT * 2); > + > + return 0; > +} > + > +/* { dg-final { scan-tree-dump "Data access with gaps requires scalar > epilogue loop" "vect" { target { vect_perm && vect_int } } } } */ > diff --git a/gcc/testsuite/gcc.dg/vect/pr103116-2.c > b/gcc/testsuite/gcc.dg/vect/pr103116-2.c > new file mode 100644 > index 00000000000..2f4ed0f404c > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/vect/pr103116-2.c > @@ -0,0 +1,59 @@ > +/* { dg-require-effective-target mmap } */ > +/* { dg-additional-options "-mssse3" { target x86_64-*-* i?86-*-* } } */ > + > +#include <sys/mman.h> > +#include <stdio.h> > +#include "tree-vect.h" > + > +#define COUNT 128 > +#define MMAP_SIZE 0x20000 > +#define ADDRESS 0x1122000000 > +#define TYPE unsigned short > +#define GROUP_SIZE 2 > + > +#ifndef MAP_ANONYMOUS > +#define MAP_ANONYMOUS MAP_ANON > +#endif > + > +void __attribute__((noipa)) > +loop (TYPE *restrict x, TYPE *restrict y) > +{ > + for (int i = 0; i < COUNT; ++i) > + { > + x[i * 8] = y[i * GROUP_SIZE] + 1; > + x[i * 8 + 1] = y[i * GROUP_SIZE] + 2; > + x[i * 8 + 2] = y[i * GROUP_SIZE + 1] + 3; > + x[i * 8 + 3] = y[i * GROUP_SIZE + 1] + 4; > + x[i * 8 + 4] = y[i * GROUP_SIZE] + 5; > + x[i * 8 + 5] = y[i * GROUP_SIZE] + 6; > + x[i * 8 + 6] = y[i * GROUP_SIZE + 1] + 7; > + x[i * 8 + 7] = y[i * GROUP_SIZE + 1] + 8; > + } > +} > + > +TYPE x[COUNT * 4]; > + > +int > +main (void) > +{ > + void *y; > + TYPE *end_y; > + > + check_vect (); > + > + y = mmap ((void *) ADDRESS, MMAP_SIZE, PROT_READ | PROT_WRITE, > + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); > + if (y == MAP_FAILED) > + { > + perror ("mmap"); > + return 1; > + } > + > + end_y = (TYPE *) ((char *) y + MMAP_SIZE); > + > + loop (x, end_y - COUNT * GROUP_SIZE); > + > + return 0; > +} > + > +/* { dg-final { scan-tree-dump "peeling for gaps insufficient for access" > "vect" { target { vect_perm_short } } } } */ > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc > index 
c9534ef9b1e..d8da13e312a 100644 > --- a/gcc/tree-vect-stmts.cc > +++ b/gcc/tree-vect-stmts.cc > @@ -2293,6 +2293,37 @@ get_group_load_store_type (vec_info *vinfo, > stmt_vec_info stmt_info, > gcc_assert (!loop_vinfo || cmp > 0); > *memory_access_type = VMAT_CONTIGUOUS; > } > + > + /* When we have a contiguous access across loop iterations > + but the access in the loop doesn't cover the full vector > + we can end up with no gap recorded but still excess > + elements accessed, see PR103116. Make sure we peel for > + gaps if necessary and sufficient and give up if not. */ > + if (loop_vinfo > + && *memory_access_type == VMAT_CONTIGUOUS > + && SLP_TREE_LOAD_PERMUTATION (slp_node).exists () > + && !multiple_p (group_size * LOOP_VINFO_VECT_FACTOR (loop_vinfo), > + nunits)) > + { > + unsigned HOST_WIDE_INT cnunits, cvf; > + if (!can_overrun_p > + || !nunits.is_constant (&cnunits) > + || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf) > + /* Peeling for gaps assumes that a single scalar iteration > + is enough to make sure the last vector iteration doesn't > + access excess elements. > + ??? Enhancements include peeling multiple iterations > + or using masked loads with a static mask. */ > + || (group_size * cvf) % cnunits + group_size < cnunits) > + { > + if (dump_enabled_p ()) > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, > + "peeling for gaps insufficient for " > + "access\n"); > + return false; > + } > + overrun_p = true; > + } > } > } > else