On Tue, 4 Feb 2025, Tamar Christina wrote:

> Looks like a last-minute change I made accidentally blocked SVE.  Fixed and
> re-sending:
> 
> Hi All,
> 
> This fixes two PRs on early break vectorization by delaying the safety
> checks to vectorizable_load, where the VF, VMAT and vectype are all known.
> 
> This patch adds two new restrictions:
> 
> 1. On LOAD_LANES targets, where the buffer size is known, we reject uneven
>    group sizes, as they become unaligned every other iteration and so may
>    unwittingly cross a page (a rough sketch of such an access follows
>    below).
> 
> 2. On LOAD_LANES targets where the buffer size is unknown, we reject
>    vectorization if we cannot peel for alignment, as the alignment
>    requirement is quite large at GROUP_SIZE * vectype_size.  This is
>    unlikely to ever be beneficial, so we don't support it for now.
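> 
> To make restriction 1 concrete, here's a rough sketch (illustrative only,
> not one of the new testcases) of the kind of access it rejects on
> LOAD_LANES targets when the buffer size is known:
> 
> char vect_a[1024];
> char vect_b[1024];
> 
> unsigned f (char x, int n)
> {
>   unsigned ret = 0;
>   for (int i = 0; i < n - 3; i += 3)
>     {
>       /* Group size 3: each vector iteration reads three vectors' worth of
>          data, and since 3 * vector-size is not a power of two, peeling
>          cannot guarantee that every such block stays within one page.  */
>       if (vect_a[i] > x || vect_a[i + 1] > x || vect_a[i + 2] > x)
>         return 1;
>       vect_b[i] = x;
>     }
>   return ret;
> }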
> 
> There are other steps documented inside the code itself so that the reasoning
> is next to the code.
> 
> Bootstrapped and regtested on aarch64-none-linux-gnu and x86_64-pc-linux-gnu
> (-m32 and -m64) with no issues.
> 
> On arm-none-linux-gnueabihf some tests fail to vectorize because it looks
> like LOAD_LANES is often misaligned.  I need to debug those a bit more to
> see whether it's the patch or the backend.
> 
> For now I think the patch itself is fine.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>       PR tree-optimization/118464
>       PR tree-optimization/116855
>       * doc/invoke.texi (min-pagesize): Update docs with vectorizer use.
>       * tree-vect-data-refs.cc (vect_analyze_early_break_dependences): Delay
>       checks.
>       (vect_compute_data_ref_alignment): Remove alignment checks and move to
>       vectorizable_load.
>       (vect_enhance_data_refs_alignment): Add note to comment needing
>       investigating.
>       (vect_analyze_data_refs_alignment): Likewise.
>       (vect_supportable_dr_alignment): For group loads look at first DR.
>       * tree-vect-stmts.cc (get_load_store_type, vectorizable_load):
>       Perform safety checks for early break pfa.
>       * tree-vectorizer.h (dr_peeling_alignment): New.
> 
> gcc/testsuite/ChangeLog:
> 
>       PR tree-optimization/118464
>       PR tree-optimization/116855
>       * gcc.dg/vect/bb-slp-pr65935.c: Update, it now vectorizes because the
>       load type is relaxed later.
>       * gcc.dg/vect/vect-early-break_121-pr114081.c: Update.
>       * gcc.dg/vect/vect-early-break_22.c: Reject for load_lanes targets.
>       * g++.dg/vect/vect-early-break_7-pr118464.cc: New test.
>       * gcc.dg/vect/vect-early-break_132-pr118464.c: New test.
>       * gcc.dg/vect/vect-early-break_133_pfa1.c: New test.
>       * gcc.dg/vect/vect-early-break_133_pfa10.c: New test.
>       * gcc.dg/vect/vect-early-break_133_pfa2.c: New test.
>       * gcc.dg/vect/vect-early-break_133_pfa3.c: New test.
>       * gcc.dg/vect/vect-early-break_133_pfa4.c: New test.
>       * gcc.dg/vect/vect-early-break_133_pfa5.c: New test.
>       * gcc.dg/vect/vect-early-break_133_pfa6.c: New test.
>       * gcc.dg/vect/vect-early-break_133_pfa7.c: New test.
>       * gcc.dg/vect/vect-early-break_133_pfa8.c: New test.
>       * gcc.dg/vect/vect-early-break_133_pfa9.c: New test.
> 
> -- inline copy of patch --
> 
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index dddde54a287dbdf504f540bc499e024d077746a8..85f9c49eff437221f2cea77c114064a6a603b732 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -17246,7 +17246,7 @@ Maximum number of relations the oracle will register in a basic block.
>  Work bound when discovering transitive relations from existing relations.
>  
>  @item min-pagesize
> -Minimum page size for warning purposes.
> +Minimum page size for warning and early break vectorization purposes.
>  
>  @item openacc-kernels
>  Specify mode of OpenACC `kernels' constructs handling.
> diff --git a/gcc/testsuite/g++.dg/vect/vect-early-break_7-pr118464.cc b/gcc/testsuite/g++.dg/vect/vect-early-break_7-pr118464.cc
> new file mode 100644
> index 0000000000000000000000000000000000000000..5e50e56ad17515e278c05c92263af120c3ab2c21
> --- /dev/null
> +++ b/gcc/testsuite/g++.dg/vect/vect-early-break_7-pr118464.cc
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-add-options vect_early_break } */
> +/* { dg-require-effective-target vect_early_break } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-O3" } */
> +
> +#include <cstddef>
> +
> +struct ts1 {
> +  int spans[6][2];
> +};
> +struct gg {
> +  int t[6];
> +};
> +ts1 f(size_t t, struct ts1 *s1, struct gg *s2) {
> +  ts1 ret;
> +  for (size_t i = 0; i != t; i++) {
> +    if (!(i < t)) __builtin_abort();
> +    ret.spans[i][0] = s1->spans[i][0] + s2->t[i];
> +    ret.spans[i][1] = s1->spans[i][1] + s2->t[i];
> +  }
> +  return ret;
> +}

This one ICEd at some point?  Otherwise it doesn't test anything?

> diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c
> index 9ef1330b47c817e16baaafa44c2b15108b9dd3a9..4c8255895b976653228233d93c950629f3231554 100644
> --- a/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c
> +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr65935.c
> @@ -55,7 +55,9 @@ int main()
>         }
>      }
>    rephase ();
> +#pragma GCC novector
>    for (i = 0; i < 32; ++i)
> +#pragma GCC novector
>      for (j = 0; j < 3; ++j)
>  #pragma GCC novector
>        for (k = 0; k < 3; ++k)
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_121-pr114081.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_121-pr114081.c
> index 423ff0b566b18bf04ce4f67a45b94dc1a021a4a0..8bd85f3893f08157e640414b5b252b716a8ba93a 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_121-pr114081.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_121-pr114081.c
> @@ -5,7 +5,8 @@
>  /* { dg-additional-options "-O3" } */
>  /* { dg-additional-options "-mavx2" { target { x86_64-*-* i?86-*-* } } } */
>  
> -/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* Arm creates a group size of 3 here, which we can't support yet.  */
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! 
> arm*-*-* } } } } */
>  
>  typedef struct filter_list_entry {
>    const char *name;
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_132-pr118464.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_132-pr118464.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..9bf0cbc8853f74de550e8ac83ab569fc9fbde126
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_132-pr118464.c
> @@ -0,0 +1,25 @@
> +/* { dg-do compile } */
> +/* { dg-add-options vect_early_break } */
> +/* { dg-require-effective-target vect_early_break } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-O3" } */
> +
> +int a, b, c, d, e, f;
> +short g[1];
> +int main() {
> +  int h;
> +  while (a) {
> +    while (h)
> +      ;
> +    for (b = 2; b; b--) {
> +      while (c)
> +        ;
> +      f = g[a];
> +      if (d)
> +        break;
> +    }
> +    while (e)
> +      ;
> +  }
> +  return 0;
> +}

again, what does this test?

> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa1.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa1.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..dc771186efafe25bb65490da7a383ad7f6ceb0a7
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa1.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-add-options vect_early_break } */
> +/* { dg-require-effective-target vect_early_break } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-O3" } */
> +
> +char string[1020];
> +
> +char * find(int n, char c)
> +{
> +    for (int i = 1; i < n; i++) {
> +        if (string[i] == c)
> +            return &string[i];
> +    }
> +    return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" 
> "vect" } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa10.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa10.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..1cf58e4f6307f3d258cf093afe6c86a998cdd216
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa10.c
> @@ -0,0 +1,24 @@
> +/* { dg-add-options vect_early_break } */
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_early_break } */
> +/* { dg-require-effective-target vect_int } */
> +
> +/* { dg-additional-options "-Ofast" } */
> +
> +/* Alignment requirement too big, load lanes targets can't safely vectorize this.  */
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" { target { ! vect_load_lanes } } } } */
> +/* { dg-final { scan-tree-dump-not "Peeling for alignment will be applied" "vect" { target { ! vect_load_lanes } } } } */
> +

Any reason to not use "Alignment of access forced using peeling" like in
the other testcases?

> +unsigned test4(char x, char *restrict vect_a, char *restrict vect_b, int n)
> +{  
> + unsigned ret = 0;
> + for (int i = 0; i < (n - 2); i+=2)
> + {
> +   if (vect_a[i] > x || vect_a[i+2] > x)
> +     return 1;
> +
> +   vect_b[i] = x;
> +   vect_b[i+1] = x+1;
> + }
> + return ret;
> +}
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa2.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa2.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..7d56772fbf380ce42ac758ca29a5f3f9d3f6e0d1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa2.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-add-options vect_early_break } */
> +/* { dg-require-effective-target vect_early_break } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-O3" } */
> +
> +char string[1020];
> +
> +char * find(int n, char c)
> +{
> +    for (int i = 0; i < n; i++) {
> +        if (string[i] == c)
> +            return &string[i];
> +    }
> +    return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump-not "Alignment of access forced using 
> peeling" "vect" } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa3.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa3.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..374a051b945e97eedb9be9da423cf54b5e564d6f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa3.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-add-options vect_early_break } */
> +/* { dg-require-effective-target vect_early_break } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-O3" } */
> +
> +char string[1020] __attribute__((aligned(1)));
> +
> +char * find(int n, char c)
> +{
> +    for (int i = 1; i < n; i++) {
> +        if (string[i] == c)
> +            return &string[i];
> +    }
> +    return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" 
> "vect" } } */
> +/* { dg-final { scan-tree-dump "force alignment of string" "vect" } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa4.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa4.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..297fb7e9b9beffa25ab8f257ceea1c065fcc6ae9
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa4.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile } */
> +/* { dg-add-options vect_early_break } */
> +/* { dg-require-effective-target vect_early_break } */
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-additional-options "-O3" } */
> +
> +char string[1020] __attribute__((aligned(1)));
> +
> +char * find(int n, char c)
> +{
> +    for (int i = 0; i < n; i++) {
> +        if (string[i] == c)
> +            return &string[i];
> +    }
> +    return 0;
> +}
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump-not "Alignment of access forced using 
> peeling" "vect" } } */
> +/* { dg-final { scan-tree-dump "force alignment of string" "vect" } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa5.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa5.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..32a4cee68f3418ed4fc6604ffd03ff4d8ff53d6b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa5.c
> @@ -0,0 +1,23 @@
> +/* { dg-add-options vect_early_break } */
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_early_break } */
> +/* { dg-require-effective-target vect_int } */
> +
> +/* { dg-additional-options "-Ofast" } */
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +
> +unsigned test4(char x, char *vect, int n)
> +{  
> + unsigned ret = 0;
> + for (int i = 0; i < n; i++)
> + {
> +   if (vect[i] > x)
> +     return 1;
> +
> +   vect[i] = x;
> + }
> + return ret;
> +}
> +
> +/* { dg-final { scan-tree-dump "Peeling for alignment will be applied" 
> "vect" } } */

See above.

> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa6.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa6.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..ee123df6ed2ba97e92307c64a61c97b1b6268743
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa6.c
> @@ -0,0 +1,23 @@
> +/* { dg-add-options vect_early_break } */
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_early_break } */
> +/* { dg-require-effective-target vect_int } */
> +
> +/* { dg-additional-options "-Ofast" } */
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +
> +unsigned test4(char x, char *vect_a, char *vect_b, int n)
> +{  
> + unsigned ret = 0;
> + for (int i = 1; i < n; i++)
> + {
> +   if (vect_a[i] > x || vect_b[i] > x)
> +     return 1;
> +
> +   vect_a[i] = x;
> + }
> + return ret;
> +}
> +
> +/* { dg-final { scan-tree-dump "Versioning for alignment will be applied" 
> "vect" } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa7.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa7.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..51bad4e745b67cfdaad20f50776299531824ce9c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa7.c
> @@ -0,0 +1,23 @@
> +/* { dg-add-options vect_early_break } */
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_early_break } */
> +/* { dg-require-effective-target vect_int } */
> +
> +/* { dg-additional-options "-Ofast" } */
> +
> +/* This should be vectorizable through load_lanes and linear targets.  */
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +
> +unsigned test4(char x, char * restrict vect_a, char * restrict vect_b, int n)
> +{  
> + unsigned ret = 0;
> + for (int i = 0; i < n; i+=2)
> + {
> +   if (vect_a[i] > x || vect_a[i+1] > x)
> +     return 1;
> +
> +   vect_b[i] = x;
> +   vect_b[i+1] = x+1;
> + }
> + return ret;
> +}
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa8.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa8.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..d4d87768a60803ed943f95ccbda19e7e7812bf29
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa8.c
> @@ -0,0 +1,27 @@
> +/* { dg-add-options vect_early_break } */
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_early_break } */
> +/* { dg-require-effective-target vect_int } */
> +
> +/* { dg-additional-options "-Ofast" } */
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "Access will not read beyond buffer to due 
> known size buffer" "vect" } } */
> +
> +
> +char vect_a[1025];
> +char vect_b[1025];
> +
> +unsigned test4(char x, int n)
> +{  
> + unsigned ret = 0;
> + for (int i = 1; i < (n - 2); i+=2)
> + {
> +   if (vect_a[i] > x || vect_a[i+1] > x)
> +     return 1;
> +
> +   vect_b[i] = x;
> +   vect_b[i+1] = x+1;
> + }
> + return ret;
> +}
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa9.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa9.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..8419d7a9201dc8c433a238f59520dea7e35c666e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_133_pfa9.c
> @@ -0,0 +1,28 @@
> +/* { dg-add-options vect_early_break } */
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_early_break } */
> +/* { dg-require-effective-target vect_int } */
> +
> +/* { dg-additional-options "-Ofast" } */
> +
> +/* Group size is uneven, load lanes targets can't safely vectorize this.  */
> +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump-not "Peeling for alignment will be applied" 
> "vect" } } */

Likewise.

> +
> +
> +char vect_a[1025];
> +char vect_b[1025];
> +
> +unsigned test4(char x, int n)
> +{  
> + unsigned ret = 0;
> + for (int i = 1; i < (n - 2); i+=2)
> + {
> +   if (vect_a[i-1] > x || vect_a[i+2] > x)
> +     return 1;
> +
> +   vect_b[i] = x;
> +   vect_b[i+1] = x+1;
> + }
> + return ret;
> +}
> diff --git a/gcc/testsuite/gcc.dg/vect/vect-early-break_22.c b/gcc/testsuite/gcc.dg/vect/vect-early-break_22.c
> index b3f5984f682f30f79331d48a264c2cc4af3e2503..a4bab5a72e369892c65569a04ec7507e32993ce8 100644
> --- a/gcc/testsuite/gcc.dg/vect/vect-early-break_22.c
> +++ b/gcc/testsuite/gcc.dg/vect/vect-early-break_22.c
> @@ -42,4 +42,6 @@ main ()
>    return 0;
>  }
>  
> -/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 2 
> "vect" } } */
> +/* Targets that make this LOAD_LANES fail due to the group misalignment.  */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 1 
> "vect" { target vect_load_lanes } } } */
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops in function" 2 
> "vect" { target { ! vect_load_lanes } } } } */
> diff --git a/gcc/tree-vect-data-refs.cc b/gcc/tree-vect-data-refs.cc
> index 6eda40267bd1382938a77826d11f20dcc959a166..d0148d4938cafc3c59edfa6a60002933f384f65b 100644
> --- a/gcc/tree-vect-data-refs.cc
> +++ b/gcc/tree-vect-data-refs.cc
> @@ -731,7 +731,8 @@ vect_analyze_early_break_dependences (loop_vec_info loop_vinfo)
>         if (is_gimple_debug (stmt))
>           continue;
>  
> -       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt);
> +       stmt_vec_info stmt_vinfo
> +         = vect_stmt_to_vectorize (loop_vinfo->lookup_stmt (stmt));
>         auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo);
>         if (!dr_ref)
>           continue;
> @@ -748,26 +749,15 @@ vect_analyze_early_break_dependences (loop_vec_info loop_vinfo)
>            bounded by VF so accesses are within range.  We only need to check
>            the reads since writes are moved to a safe place where if we get
>            there we know they are safe to perform.  */
> -       if (DR_IS_READ (dr_ref)
> -           && !ref_within_array_bound (stmt, DR_REF (dr_ref)))
> +       if (DR_IS_READ (dr_ref))
>           {
> -           if (STMT_VINFO_GATHER_SCATTER_P (stmt_vinfo)
> -               || STMT_VINFO_STRIDED_P (stmt_vinfo))
> -             {
> -               const char *msg
> -                 = "early break not supported: cannot peel "
> -                   "for alignment, vectorization would read out of "
> -                   "bounds at %G";
> -               return opt_result::failure_at (stmt, msg, stmt);
> -             }
> -
>             dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_vinfo);
>             dr_info->need_peeling_for_alignment = true;

You're setting the flag on any DR of a DR group here ...

>             if (dump_enabled_p ())
>               dump_printf_loc (MSG_NOTE, vect_location,
> -                              "marking DR (read) as needing peeling for "
> -                              "alignment at %G", stmt);
> +                              "marking DR (read) as possibly needing peeling "
> +                              "for alignment at %G", stmt);
>           }
>  
>         if (DR_IS_READ (dr_ref))
> @@ -1326,9 +1316,6 @@ vect_record_base_alignments (vec_info *vinfo)
>     Compute the misalignment of the data reference DR_INFO when vectorizing
>     with VECTYPE.
>  
> -   RESULT is non-NULL iff VINFO is a loop_vec_info.  In that case, *RESULT will
> -   be set appropriately on failure (but is otherwise left unchanged).
> -
>     Output:
>     1. initialized misalignment info for DR_INFO
>  
> @@ -1337,7 +1324,7 @@ vect_record_base_alignments (vec_info *vinfo)
>  
>  static void
>  vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
> -                              tree vectype, opt_result *result = nullptr)
> +                              tree vectype)
>  {
>    stmt_vec_info stmt_info = dr_info->stmt;
>    vec_base_alignments *base_alignments = &vinfo->base_alignments;
> @@ -1365,66 +1352,6 @@ vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info *dr_info,
>      = exact_div (targetm.vectorize.preferred_vector_alignment (vectype),
>                BITS_PER_UNIT);
>  
> -  /* If this DR needs peeling for alignment for correctness, we must
> -     ensure the target alignment is a constant power-of-two multiple of the
> -     amount read per vector iteration (overriding the above hook where
> -     necessary).  */
> -  if (dr_info->need_peeling_for_alignment)
> -    {
> -      /* Vector size in bytes.  */
> -      poly_uint64 safe_align = tree_to_poly_uint64 (TYPE_SIZE_UNIT (vectype));
> -
> -      /* We can only peel for loops, of course.  */
> -      gcc_checking_assert (loop_vinfo);
> -
> -      /* Calculate the number of vectors read per vector iteration.  If
> -      it is a power of two, multiply through to get the required
> -      alignment in bytes.  Otherwise, fail analysis since alignment
> -      peeling wouldn't work in such a case.  */
> -      poly_uint64 num_scalars = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> -      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> -     num_scalars *= DR_GROUP_SIZE (stmt_info);
> -
> -      auto num_vectors = vect_get_num_vectors (num_scalars, vectype);
> -      if (!pow2p_hwi (num_vectors))
> -     {
> -       *result = opt_result::failure_at (vect_location,
> -                                         "non-power-of-two num vectors %u "
> -                                         "for DR needing peeling for "
> -                                         "alignment at %G",
> -                                         num_vectors, stmt_info->stmt);
> -       return;
> -     }
> -
> -      safe_align *= num_vectors;
> -      if (maybe_gt (safe_align, 4096U))
> -     {
> -       pretty_printer pp;
> -       pp_wide_integer (&pp, safe_align);
> -       *result = opt_result::failure_at (vect_location,
> -                                         "alignment required for correctness"
> -                                         " (%s) may exceed page size",
> -                                         pp_formatted_text (&pp));
> -       return;
> -     }
> -
> -      unsigned HOST_WIDE_INT multiple;
> -      if (!constant_multiple_p (vector_alignment, safe_align, &multiple)
> -       || !pow2p_hwi (multiple))
> -     {
> -       if (dump_enabled_p ())
> -         {
> -           dump_printf_loc (MSG_NOTE, vect_location,
> -                            "forcing alignment for DR from preferred (");
> -           dump_dec (MSG_NOTE, vector_alignment);
> -           dump_printf (MSG_NOTE, ") to safe align (");
> -           dump_dec (MSG_NOTE, safe_align);
> -           dump_printf (MSG_NOTE, ") for stmt: %G", stmt_info->stmt);
> -         }
> -       vector_alignment = safe_align;
> -     }
> -    }
> -
>    SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
>  
>    /* If the main loop has peeled for alignment we have no way of knowing
> @@ -2479,7 +2406,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
>        || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
>                                      loop_preheader_edge (loop))
>        || loop->inner
> -      || LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
> +      || LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)) // <<-- ??

Spurious change(?)

>      do_peeling = false;
>  
>    struct _vect_peel_extended_info peel_for_known_alignment;
> @@ -2942,12 +2869,9 @@ vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo)
>         if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
>             && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info->stmt)
>           continue;
> -       opt_result res = opt_result::success ();
> +
>         vect_compute_data_ref_alignment (loop_vinfo, dr_info,
> -                                        STMT_VINFO_VECTYPE (dr_info->stmt),
> -                                        &res);
> -       if (!res)
> -         return res;
> +                                        STMT_VINFO_VECTYPE (dr_info->stmt));
>       }
>      }
>  
> @@ -7211,7 +7135,7 @@ vect_supportable_dr_alignment (vec_info *vinfo, dr_vec_info *dr_info,
>  
>    if (misalignment == 0)
>      return dr_aligned;
> -  else if (dr_info->need_peeling_for_alignment)
> +  else if (dr_peeling_alignment (stmt_info))
>      return dr_unaligned_unsupported;
>  
>    /* For now assume all conditional loads/stores support unaligned
> diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> index 1b639ae3b174779bb838d0f7ce4886d84ecafcfe..c213ec46c4bdb456f99a43a3aff7c1af80e8769a 100644
> --- a/gcc/tree-vect-stmts.cc
> +++ b/gcc/tree-vect-stmts.cc
> @@ -2597,6 +2597,69 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
>        return false;
>      }
>  
> +  auto unit_size = GET_MODE_UNIT_SIZE (TYPE_MODE (vectype));
> +  /* Check if a misalignment with an unsupported peeling for early break is
> +     still OK.  First we need to distinguish between when we've reached here do

due to

> +     to dependency analysis or when the user has requested -mstrict-align or
> +     similar.  In those cases we must not override it.  */
> +  if (dr_peeling_alignment (stmt_info)
> +      && *alignment_support_scheme == dr_unaligned_unsupported
> +      /* We can only attempt to override if the misalignment is a multiple of
> +      the element being loaded, otherwise peeling or versioning would have
> +      really been required.  */
> +      && multiple_p (*misalignment, unit_size))

Hmm, but wouldn't that mean dr_info->target_alignment is bigger
than the vector size?  Does that ever happen?  I'll note that
*alignment_support_scheme == dr_aligned means alignment according
to dr_info->target_alignment, which might actually be less than
the vector size (we've noticed this recently in other PRs), so
we might want to make sure that dr_info->target_alignment is
at least the vector size when ->need_peeling_for_alignment, I think.

So - which of the testcases gets you here?  I think we set
*misalignment modulo target_alignment, so it is never larger than that.
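For concreteness, the arithmetic I have in mind (standalone toy C with
hypothetical numbers, not vectorizer internals):

#include <stdio.h>

int main (void)
{
  unsigned target_alignment = 16; /* assumed vector size in bytes */
  unsigned unit_size = 4;         /* element size, e.g. int */
  for (unsigned offset = 0; offset < 32; offset += 3)
    {
      /* misalignment is the DR offset modulo target_alignment, so it is
         always in 0..15 here and never exceeds target_alignment.  */
      unsigned misalignment = offset % target_alignment;
      printf ("offset %2u -> misalignment %2u, element aligned: %s\n",
              offset, misalignment,
              misalignment % unit_size == 0 ? "yes" : "no");
    }
  return 0;
}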

> +    {
> +      bool inbounds
> +     = ref_within_array_bound (STMT_VINFO_STMT (stmt_info),
> +                               DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
> +      /* If we have a known misalignment, and are doing a group load for a DR
> +      that requires aligned access, check if the misalignment is a multiple
> +      of the unit size.  In which case the group load will be issued aligned
> +      as long as the first load in the group is aligned.
> +
> +      For the non-inbound case we'd need group_size * vectype alignment.  But
> +      this is quite huge and unlikely to ever happen so if we can't peel for
> +      it, just reject it.  */
> +      if (*memory_access_type == VMAT_LOAD_STORE_LANES
> +       && (STMT_VINFO_GROUPED_ACCESS (stmt_info) || slp_node))
> +     {
> +       /* ?? This needs updating whenever we support slp group > 1.  */
> +       auto group_size = DR_GROUP_SIZE (stmt_info);
> +       /* For the inbound case it's enough to check for an alignment of
> +          GROUP_SIZE * element size.  */
> +       if (inbounds
> +           && (*misalignment % (group_size * unit_size)) == 0
> +           && group_size % 2 == 0)
> +         {
> +           if (dump_enabled_p ())
> +             dump_printf_loc (MSG_NOTE, vect_location,
> +                      "Assuming grouped access is aligned due to load "
> +                      "lanes, overriding alignment scheme\n");
> +
> +           *alignment_support_scheme = dr_unaligned_supported;
> +         }
> +     }
> +      /* If we have a linear access and know the misalignment and know we won't
> +      read out of bounds then it's also ok if the misalignment is a multiple
> +      of the element size.  We get this when the loop has known misalignments
> +      but the misalignments of the DRs can't be peeled to reach mutual
> +      alignment.  Because the misalignments are known however we also know
> +      that versioning won't work.  If the target does support unaligned
> +      accesses and we know we are free to read the entire buffer then we
> +      can allow the unaligned access if it's on elements for an early break
> +      condition.  */
> +      else if (*memory_access_type != VMAT_GATHER_SCATTER
> +            && inbounds)
> +     {
> +       if (dump_enabled_p ())
> +         dump_printf_loc (MSG_NOTE, vect_location,
> +                      "Access will not read beyond buffer due to known size "
> +                      "buffer, overriding alignment scheme\n");
> +
> +       *alignment_support_scheme = dr_unaligned_supported;
> +     }
> +    }
> +
>    if (*alignment_support_scheme == dr_unaligned_unsupported)
>      {
>        if (dump_enabled_p ())
> @@ -10520,6 +10583,68 @@ vectorizable_load (vec_info *vinfo,
>    /* Transform.  */
>
>    dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info), *first_dr_info = NULL;
> +
> +  /* Check if we support the operation if early breaks are needed.  */
> +  if (loop_vinfo
> +      && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> +      && (memory_access_type == VMAT_GATHER_SCATTER
> +       || memory_access_type == VMAT_STRIDED_SLP))
> +    {
> +      if (dump_enabled_p ())
> +       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                        "early break not supported: cannot peel for "
> +                        "alignment. With non-contiguous memory vectorization"
> +                        " could read out of bounds at %G ",
> +                        STMT_VINFO_STMT (stmt_info));

Hmm, this is now more restrictive than the original check in
vect_analyze_early_break_dependences because it covers all accesses.
The simplest fix would be to leave it there.
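I.e. keep something like the original rejection there (a sketch pieced
together from the deleted hunk above, untested):

       if (DR_IS_READ (dr_ref)
           && !ref_within_array_bound (stmt, DR_REF (dr_ref))
           && (STMT_VINFO_GATHER_SCATTER_P (stmt_vinfo)
               || STMT_VINFO_STRIDED_P (stmt_vinfo)))
         return opt_result::failure_at (stmt,
                                        "early break not supported: cannot "
                                        "peel for alignment, vectorization "
                                        "would read out of bounds at %G",
                                        stmt);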

> +      return false;
> +    }
> +
> +  /* If this DR needs peeling for alignment for correctness, we must
> +     ensure the target alignment is a constant power-of-two multiple of the
> +     amount read per vector iteration (overriding the above hook where
> +     necessary).  We don't support group loads, which would have been filtered
> +     out in the check above.  For now that means we don't have to look at the
> +     group info and can just check that the load is contiguous, using
> +     dr_info directly.  For known size buffers we still need to check if the
> +     vector is misaligned, and if so we need to peel.  */
> +  if (costing_p && dr_info->need_peeling_for_alignment)

dr_peeling_alignment ()

I think this belongs in get_load_store_type, specifically I think
we want to only allow VMAT_CONTIGUOUS for need_peeling_for_alignment refs
and by construction dr_aligned should ensure type size alignment
(by altering target_alignment for the respective refs).  Given that
both VF and vector type are fixed at vect_analyze_data_refs_alignment
time we should be able to compute the appropriate target alignment
there (I'm not sure we support peeling of more than VF-1 iterations
though).
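For illustration, the computation could look like this (a sketch with my
own naming, not existing GCC API):

#include <stdint.h>

/* Bytes read per vector iteration for a grouped early-break load: VF
   scalar iterations, each touching GROUP_SIZE elements.  Peeling can only
   reach this alignment if it is (rounded up to) a power of two.  */
static uint64_t
required_target_alignment (uint64_t vf, uint64_t group_size,
                           uint64_t elem_size)
{
  uint64_t bytes = vf * group_size * elem_size;
  uint64_t align = 1;
  while (align < bytes) /* round up to the next power of two */
    align <<= 1;
  return align;
}

E.g. VF 16, group size 3, char elements gives 48 bytes, rounded up to 64;
whether we can actually peel that many iterations is the open question
above.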

> +    {
> +      /* Vector size in bytes.  */
> +      poly_uint64 safe_align = tree_to_poly_uint64 (TYPE_SIZE_UNIT (vectype));
> +
> +      /* We can only peel for loops, of course.  */
> +      gcc_checking_assert (loop_vinfo);
> +
> +      auto num_vectors = ncopies;
> +      if (!pow2p_hwi (num_vectors))
> +     {
> +       if (dump_enabled_p ())
> +         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                          "non-power-of-two num vectors %u "
> +                          "for DR needing peeling for "
> +                          "alignment at %G",
> +                          num_vectors, STMT_VINFO_STMT (stmt_info));
> +       return false;
> +     }
> +
> +      safe_align *= num_vectors;
> +      if (known_gt (safe_align, (unsigned)param_min_pagesize)
> +       /* For VLA we don't support PFA when any unrolling needs to be done.
> +          We could, but it's too much work for GCC 15.  For now we assume a
> +          vector is not larger than a page size, so allow single loads.  */
> +       && (num_vectors > 1 && !vf.is_constant ()))
> +     {
> +       if (dump_enabled_p ())
> +         {
> +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> +                            "alignment required for correctness (");
> +           dump_dec (MSG_MISSED_OPTIMIZATION, safe_align);
> +           dump_printf (MSG_MISSED_OPTIMIZATION, ") may exceed page size\n");
> +         }
> +       return false;
> +     }
> +    }
> +
>    ensure_base_align (dr_info);
>  
>    if (memory_access_type == VMAT_INVARIANT)
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 44d3a1d46c409597f1e67a275211a1da414fc7c7..6ef97ee84336ac9a0e1922145f3d418436d709f4 100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -1998,6 +1998,19 @@ dr_target_alignment (dr_vec_info *dr_info)
>  }
>  #define DR_TARGET_ALIGNMENT(DR) dr_target_alignment (DR)
>  
> +/* Return whether the stmt_vec_info requires peeling for alignment.  */
> +inline bool
> +dr_peeling_alignment (stmt_vec_info stmt_info)
> +{
> +  dr_vec_info *dr_info;
> +  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> +    dr_info = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (stmt_info));

... but checking it on the first only.
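E.g. make the setter agree by marking the group leader where the flag is
set in vect_analyze_early_break_dependences (sketch, untested):

      stmt_vec_info mark_vinfo = stmt_vinfo;
      if (STMT_VINFO_GROUPED_ACCESS (mark_vinfo))
        mark_vinfo = DR_GROUP_FIRST_ELEMENT (mark_vinfo);
      STMT_VINFO_DR_INFO (mark_vinfo)->need_peeling_for_alignment = true;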

> +  else
> +    dr_info = STMT_VINFO_DR_INFO (stmt_info);
> +
> +  return dr_info->need_peeling_for_alignment;
> +}
> +
>  inline void
>  set_dr_target_alignment (dr_vec_info *dr_info, poly_uint64 val)
>  {
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
