Richard Biener <rguent...@suse.de> writes:
> The testcase shows that we can end up with an access that is contiguous
> across loop iterations but where, because of permutations, the elements
> accessed only cover part of a vector.  In this case we end up with
> GROUP_GAP == 0 but still need to avoid accessing excess elements
> in the last loop iterations.  Peeling for gaps is designed to cover
> this, but a single scalar iteration might not cover all of the excess
> elements.  The following ensures peeling for gaps is done in this
> situation and, when that isn't sufficient because we would need to peel
> more than one iteration (gcc.dg/vect/pr103116-2.c), fails the SLP
> vectorization.
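
For reference, here's how I picture the pr103116-1.c case, assuming
128-bit vectors so that the loads from y are V4SI (the exact numbers
are of course target-dependent):

  /* Scalar iteration i uses y[i*2] and y[i*2+1]; the group is
     contiguous across iterations, so GROUP_GAP == 0.  But the SLP
     load permutation (something like {0, 0, 1, 1}) makes each
     vector iteration do a full V4SI load starting at &y[i*2]:

       iteration i: load y[i*2 .. i*2+3]   // only y[i*2], y[i*2+1] used

     so without peeling for gaps the last vector iteration would read
     two elements beyond the last y element the scalar loop touches.  */
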
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu.
>
> OK?

LGTM.  In principle I think we could (in future) handle some of the
!multiple_p cases for variable-length vectors, but I don't think that
would trigger in practice yet, given the limited permutes we support
in that case.

Thanks,
Richard

>
> Thanks,
> Richard.
>
> 2022-05-04  Richard Biener  <rguent...@suse.de>
>
>       PR tree-optimization/103116
>       * tree-vect-stmts.cc (get_group_load_store_type): Handle the
>       case we need peeling for gaps even though GROUP_GAP is zero.
>
>       * gcc.dg/vect/pr103116-1.c: New testcase.
>       * gcc.dg/vect/pr103116-2.c: Likewise.
> ---
>  gcc/testsuite/gcc.dg/vect/pr103116-1.c | 50 ++++++++++++++++++++++
>  gcc/testsuite/gcc.dg/vect/pr103116-2.c | 59 ++++++++++++++++++++++++++
>  gcc/tree-vect-stmts.cc                 | 31 ++++++++++++++
>  3 files changed, 140 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/pr103116-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/vect/pr103116-2.c
>
> diff --git a/gcc/testsuite/gcc.dg/vect/pr103116-1.c b/gcc/testsuite/gcc.dg/vect/pr103116-1.c
> new file mode 100644
> index 00000000000..d3639fc8cfd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/pr103116-1.c
> @@ -0,0 +1,50 @@
> +/* { dg-require-effective-target mmap } */
> +
> +#include <sys/mman.h>
> +#include <stdio.h>
> +
> +#define COUNT 128
> +#define MMAP_SIZE 0x20000
> +#define ADDRESS 0x1122000000
> +#define TYPE unsigned int
> +
> +#ifndef MAP_ANONYMOUS
> +#define MAP_ANONYMOUS MAP_ANON
> +#endif
> +
> +void __attribute__((noipa))
> +loop (TYPE *restrict x, TYPE *restrict y)
> +{
> +  for (int i = 0; i < COUNT; ++i)
> +    {
> +      x[i * 4] = y[i * 2] + 1;
> +      x[i * 4 + 1] = y[i * 2] + 2;
> +      x[i * 4 + 2] = y[i * 2 + 1] + 3;
> +      x[i * 4 + 3] = y[i * 2 + 1] + 4;
> +    }
> +}
> +
> +TYPE x[COUNT * 4];
> +
> +int
> +main (void)
> +{
> +  void *y;
> +  TYPE *end_y;
> +
> +  y = mmap ((void *) ADDRESS, MMAP_SIZE, PROT_READ | PROT_WRITE,
> +            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> +  if (y == MAP_FAILED)
> +    {
> +      perror ("mmap");
> +      return 1;
> +    }
> +
> +  end_y = (TYPE *) ((char *) y + MMAP_SIZE);
> +
> +  loop (x, end_y - COUNT * 2);
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump "Data access with gaps requires scalar epilogue loop" "vect" { target { vect_perm && vect_int } } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/pr103116-2.c b/gcc/testsuite/gcc.dg/vect/pr103116-2.c
> new file mode 100644
> index 00000000000..2f4ed0f404c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/pr103116-2.c
> @@ -0,0 +1,59 @@
> +/* { dg-require-effective-target mmap } */
> +/* { dg-additional-options "-mssse3" { target x86_64-*-* i?86-*-* } } */
> +
> +#include <sys/mman.h>
> +#include <stdio.h>
> +#include "tree-vect.h"
> +
> +#define COUNT 128
> +#define MMAP_SIZE 0x20000
> +#define ADDRESS 0x1122000000
> +#define TYPE unsigned short
> +#define GROUP_SIZE 2
> +
> +#ifndef MAP_ANONYMOUS
> +#define MAP_ANONYMOUS MAP_ANON
> +#endif
> +
> +void __attribute__((noipa))
> +loop (TYPE *restrict x, TYPE *restrict y)
> +{
> +  for (int i = 0; i < COUNT; ++i)
> +    {
> +      x[i * 8] = y[i * GROUP_SIZE] + 1;
> +      x[i * 8 + 1] = y[i * GROUP_SIZE] + 2;
> +      x[i * 8 + 2] = y[i * GROUP_SIZE + 1] + 3;
> +      x[i * 8 + 3] = y[i * GROUP_SIZE + 1] + 4;
> +      x[i * 8 + 4] = y[i * GROUP_SIZE] + 5;
> +      x[i * 8 + 5] = y[i * GROUP_SIZE] + 6;
> +      x[i * 8 + 6] = y[i * GROUP_SIZE + 1] + 7;
> +      x[i * 8 + 7] = y[i * GROUP_SIZE + 1] + 8;
> +    }
> +}
> +
> +TYPE x[COUNT * 4];
> +
> +int
> +main (void)
> +{
> +  void *y;
> +  TYPE *end_y;
> +
> +  check_vect ();
> +
> +  y = mmap ((void *) ADDRESS, MMAP_SIZE, PROT_READ | PROT_WRITE,
> +            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> +  if (y == MAP_FAILED)
> +    {
> +      perror ("mmap");
> +      return 1;
> +    }
> +
> +  end_y = (TYPE *) ((char *) y + MMAP_SIZE);
> +
> +  loop (x, end_y - COUNT * GROUP_SIZE);
> +
> +  return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump "peeling for gaps insufficient for access" "vect" { target { vect_perm_short } } } } */
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index c9534ef9b1e..d8da13e312a 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -2293,6 +2293,37 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
>             gcc_assert (!loop_vinfo || cmp > 0);
>             *memory_access_type = VMAT_CONTIGUOUS;
>           }
> +
> +       /* When we have a contiguous access across loop iterations
> +          but the access in the loop doesn't cover the full vector
> +          we can end up with no gap recorded but still excess
> +          elements accessed, see PR103116.  Make sure we peel for
> +          gaps if necessary and sufficient and give up if not.  */
> +       if (loop_vinfo
> +           && *memory_access_type == VMAT_CONTIGUOUS
> +           && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
> +           && !multiple_p (group_size * LOOP_VINFO_VECT_FACTOR (loop_vinfo),
> +                           nunits))
> +         {
> +           unsigned HOST_WIDE_INT cnunits, cvf;
> +           if (!can_overrun_p
> +               || !nunits.is_constant (&cnunits)
> +               || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&cvf)
> +               /* Peeling for gaps assumes that a single scalar iteration
> +                  is enough to make sure the last vector iteration doesn't
> +                  access excess elements.
> +                  ???  Enhancements include peeling multiple iterations
> +                  or using masked loads with a static mask.  */
> +               || (group_size * cvf) % cnunits + group_size < cnunits)
> +             {
> +               if (dump_enabled_p ())
> +                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                                  "peeling for gaps insufficient for "
> +                                  "access\n");
> +               return false;
> +             }
> +           overrun_p = true;
> +         }
>       }
>      }
>    else
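
Working through the new check on the two testcases helped me convince
myself of the bound, so for the record (numbers again assume 128-bit
vectors; cvf should be 1 here since the store group already fills a
vector):

  pr103116-1.c: unsigned int, cnunits = 4, load group_size = 2, cvf = 1
    (2 * 1) % 4 + 2 == 4, which is not < 4
    -> the final V4SI load overruns by 2 elements, and peeling a
       single scalar iteration trims exactly those 2, so setting
       overrun_p is enough.

  pr103116-2.c: unsigned short, cnunits = 8, load group_size = 2, cvf = 1
    (2 * 1) % 8 + 2 == 4, which is < 8
    -> the final V8HI load overruns by 6 elements, but one peeled
       scalar iteration only trims 2, so we correctly give up.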
