On Thu, 30 Oct 2025, Tamar Christina wrote:

> This fixes the two reported ICEs by accounting for that neutral_op can be NULL
> and that we should require IFN_VEC_SHL_INSERT for non reduction defs.
> 
> The conditions were taken from what is required to delay the reduction with 
> the
> initial values later on in vect_transform_cycle_phi.
> 
> Bootstrapped Regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, x86_64-pc-linux-gnu
> -m32, -m64 and no issues
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>       PR tree-optimization/122474
>       PR tree-optimization/122475
>       * tree-vect-loop.cc (vectorizable_reduction): Tighten IFN_VEC_SHL_INSERT
>       requirement.
> 
> gcc/testsuite/ChangeLog:
> 
>       PR tree-optimization/122474
>       PR tree-optimization/122475
>       * gcc.dg/vect/pr122474.c: New test.
>       * gcc.dg/vect/pr122475.c: New test.
>       * gcc.target/aarch64/sve/vect-reduc-bool-19.c: New test.
>       * gcc.target/aarch64/sve/vect-reduc-bool-20.c: New test.
> 
> ---
> diff --git a/gcc/testsuite/gcc.dg/vect/pr122474.c 
> b/gcc/testsuite/gcc.dg/vect/pr122474.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..c5d70b518a0078a6ec887a37d1c898365ec6f01d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/pr122474.c
> @@ -0,0 +1,19 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-fwrapv -Waggressive-loop-optimizations -w" } */
> +/* { dg-additional-options "-march=armv8-a+sve" { target aarch64*-*-* } } */
> +/* Check that we don't ICE.  */
> +
> +_Bool a;
> +void b(unsigned c[][8][8][8][8][8]) {
> +  for (int d = 1; d; d += 3)
> +    for (int e = 0; e < 11; e += 2)
> +      for (short f = 0; f < 1; f = 30482)
> +        for (short g = 0; g < 014; g++)
> +          a = ({
> +            int h = a;
> +            int i = c[2][f][d][d][d][d] ^ c[g][g][1][2][g][g];
> +            i ? h : i;
> +          });
> +}
> +
> +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 1 "vect" { 
> target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/pr122475.c 
> b/gcc/testsuite/gcc.dg/vect/pr122475.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..ed229c5475cc96fc0ef3beea71b101db89e08916
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/pr122475.c
> @@ -0,0 +1,13 @@
> +/* { dg-additional-options "-march=armv8-a+sve" { target aarch64*-*-* } } */
> +/* Check that we don't ICE.  */
> +int a;
> +int b;
> +int main() {
> +  for (char t = 0; t < 14; t += 2)
> +    for (int u = 0; u < 242; u += 4) {
> +      a = a < 0 ? a : 0;
> +      b = b < 0 ? b : 0;
> +    }
> +}
> +
> +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 1 "vect" { 
> target aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-19.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-19.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..6492c44c065159d06d7e69499dfc9834721d62ef
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-19.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-mautovec-preference=sve-only 
> -fdump-tree-vect-details -O3 --param vect-epilogues-nomask=0" } */
> +
> +int p[128];
> +
> +bool __attribute__((noipa))
> +fand (int n, bool r1, bool r2)
> +{
> +  bool r = true;
> +  for (int i = 0; i < (n/2); i+=2)
> +    {
> +      r &= (p[i] != 0) & r1;
> +      r &= (p[i+1] != 0) & r2;
> +    }
> +  return r;
> +}
> +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 1 "vect" } 
> } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-20.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-20.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..83c5c206dd2597c3776317005a3dd38ee5991f16
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/vect-reduc-bool-20.c
> @@ -0,0 +1,32 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-mautovec-preference=sve-only 
> -fdump-tree-vect-details -O3 --param vect-epilogues-nomask=0" } */
> +
> +#include <stdbool.h>
> +#include <stdint.h>
> +
> +void vec_slp_cmp (char* restrict a, char* restrict b, int n) {
> +    bool x0 = b[0] != 0;
> +    bool x1 = b[1] != 0;
> +    bool x2 = b[2] != 0;
> +    bool x3 = b[3] != 0;
> +    for (int i = 0; i < n; ++i) {
> +        x0 &= (a[i * 4] != 0);
> +        x1 &= (a[i * 4 + 1] != 0);
> +        x2 &= (a[i * 4 + 2] != 0);
> +        x3 &= (a[i * 4 + 3] != 0);
> +    }
> +    b[0] = x0;
> +    b[1] = x1;
> +    b[2] = x2;
> +    b[3] = x3;
> +}
> +
> +void vec_slp_cmp1 (char* restrict a, char* restrict b, int n) {
> +    bool x0 = b[0] != 0;
> +    for (int i = 0; i < n; ++i) {
> +        x0 &= (a[i] != 0);
> +    }
> +    b[0] = x0;
> +}
> +
> +/* { dg-final { scan-tree-dump-times "optimized: loop vectorized" 2 "vect" { 
> target aarch64*-*-* } } } */
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 
> 50cdc2a90fa29c1e0d116c0589bc246e6d8fcc84..bfcea0d52337eead7ee80572963f986bee00591d
>  100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -7577,8 +7577,14 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
>       values into the low-numbered elements.  */
>    if ((double_reduc || neutral_op)
>        && !nunits_out.is_constant ()
> -      && (SLP_TREE_LANES (slp_node) != 1 && !reduc_chain)
> -      && !operand_equal_p (neutral_op, vect_phi_initial_value 
> (reduc_def_phi))
> +      && (!neutral_op
> +       || (!reduc_chain
> +            && (SLP_TREE_LANES (slp_node) != 1
> +                || (!VECT_REDUC_INFO_REUSED_ACCUMULATOR (reduc_info)

So this will be always true given we set 
VECT_REDUC_INFO_REUSED_ACCUMULATOR only during transform.  Meaning
this will be a source of mismatch which is why I earlier suggested
that we'd need to ensure we can later drop the requirement, like
forcing of no accumulator re-use.  Thus, after the checks we can
perform here, if ...

> +                    && STMT_VINFO_DEF_TYPE (stmt_info)
> +                             != vect_reduction_def))
> +            && !operand_equal_p (neutral_op,
> +                                 vect_phi_initial_value (reduc_def_phi))))
>        && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
>                                         vectype_out, OPTIMIZE_FOR_SPEED))

... this fails, allow to continue but set some
VECT_REDUC_INFO_MAY_NOT_REUSE_ACCUMULATOR or so?

I'd suggest to push the !neutral_op change and rework the rest in
a way suggested?

That said, the accumulator re-use might be not an actual blocker
to delay the initial value adjustment, but just an implementation
detail missing.  Does the other bug run into exactly that?

Richard.

>      {
> 
> 
> 

-- 
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)

Reply via email to