On Tue, 15 Apr 2025, Tamar Christina wrote:

> Hi All,
> 
> The following example:
> 
> #define N 512
> #define START 2
> #define END 505
> 
> int x[N] __attribute__((aligned(32)));
> 
> int __attribute__((noipa))
> foo (void)
> {
>   for (signed int i = START; i < END; ++i)
>     {
>       if (x[i] == 0)
>         return i;
>     }
>   return -1;
> }
> 
> generates incorrect code with fixed-length SVE because for early break we need
> to know which value to start the scalar loop with if we take an early exit.
> 
> Historically this has meant taking the first element of every induction.  This
> is because there is an assumption in place that, even with masked loops, the
> masks come from a whilel* instruction.
> 
> As such we reduce using a BIT_FIELD_REF <, 0>.
> 
> When PFA (peeling for alignment) was added this assumption was correct for
> non-masked loops; however, we assumed that PFA for VLA would not work for now
> and disabled it using the alignment requirement checks.  We also expected VLS
> to peel for alignment using scalar loops.
> 
> However, as this PR shows, for VLS the vectorizer can, and in some
> circumstances does, choose to peel using masks, by masking the first iteration
> of the loop with an additional alignment mask.
> 
> When this is done, the first elements of the predicate can be inactive.  In
> this example element 1 is inactive based on the calculated misalignment, hence
> the -1 value in the first vector IV element.
> 
> When we reduce using BIT_FIELD_REF we get the wrong value.
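> 
> To make the failure mode concrete, here is a small standalone sketch (not GCC
> code; the values VL, start, step and skip are made up for illustration) of why
> lane 0 of the vector IV is the wrong restart value once the alignment mask
> turns off the leading lanes, and what the corrected value looks like:
> 
>   #include <assert.h>
> 
>   int
>   main (void)
>   {
>     const int VL = 8;     /* Lanes per vector, e.g. 256-bit SVE with int.  */
>     const int start = 2;  /* First scalar iteration of the loop.  */
>     const int step = 1;   /* Induction step.  */
>     const int skip = 3;   /* Hypothetical number of leading inactive lanes
>                              produced by the alignment mask on the first
>                              iteration.  */
> 
>     /* First-iteration vector IV: lane k holds start + (k - skip) * step so
>        that the first active lane (k == skip) holds start.  */
>     int iv[VL];
>     for (int k = 0; k < VL; ++k)
>       iv[k] = start + (k - skip) * step;
> 
>     /* BIT_FIELD_REF <iv, 0> extracts lane 0, an inactive lane; with these
>        numbers it holds -1 rather than the value the scalar loop must restart
>        at.  */
>     assert (iv[0] == start - skip * step);
> 
>     /* Adding back the skipped lanes recovers the right value, which is what
>        the new scalar PHI introduced below makes possible.  */
>     assert (iv[0] + step * skip == start);
>     return 0;
>   }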
> 
> This patch fixes the issue by creating a new scalar PHI that keeps track of
> whether we are in the first iteration of the loop (with the additional
> masking) or whether we have already taken a loop iteration.
> 
> The generated sequence:
> 
> pre-header:
>   bb1:
>     i_1 = <number of leading inactive elements>
> 
> header:
>   bb2:
>     i_2 = PHI <i_1(bb1), 0(latch)>
>     …
> 
> early-exit:
>   bb3:
>     i_3 = iv_step * i_2 + PHI<vector-iv>
> 
> This eliminates the need for an expensive mask-based reduction.
> 
> This fixes gromacs with one OpenMP thread, but with more than one thread there
> is still an issue.
> 
> Bootstrapped and regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, and x86_64-pc-linux-gnu
> (-m32 and -m64) with no issues.
> 
> Ok for master?
> 
> Thanks,
> Tamar
> 
> gcc/ChangeLog:
> 
>       PR tree-optimization/119351
>       * tree-vect-loop-manip.cc (vect_can_advance_ivs_p): Record non-linear
>       inductions.
>       * tree-vectorizer.h (LOOP_VINFO_MASK_NITERS_PFA_OFFSET,
>       LOOP_VINFO_NON_LINEAR_IV): New.
>       (class _loop_vec_info): Add mask_skip_niters_pfa_offset and
>       nonlinear_iv.
>       (vect_use_loop_mask_for_alignment_p): Reject non-linear inductions and
>       early break needing peeling.
>       * tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Initialize them.
>       (vectorizable_induction): If early break and PFA using masking, create a
>       new phi which tracks where the scalar code needs to start...
>       (vectorizable_live_operation): ...and generate the adjustments here.
> 
> gcc/testsuite/ChangeLog:
> 
>       PR tree-optimization/119351
>       * gcc.target/aarch64/sve/peel_ind_10.c: New test.
>       * gcc.target/aarch64/sve/peel_ind_10_run.c: New test.
>       * gcc.target/aarch64/sve/peel_ind_5.c: New test.
>       * gcc.target/aarch64/sve/peel_ind_5_run.c: New test.
>       * gcc.target/aarch64/sve/peel_ind_6.c: New test.
>       * gcc.target/aarch64/sve/peel_ind_6_run.c: New test.
>       * gcc.target/aarch64/sve/peel_ind_7.c: New test.
>       * gcc.target/aarch64/sve/peel_ind_7_run.c: New test.
>       * gcc.target/aarch64/sve/peel_ind_8.c: New test.
>       * gcc.target/aarch64/sve/peel_ind_8_run.c: New test.
>       * gcc.target/aarch64/sve/peel_ind_9.c: New test.
>       * gcc.target/aarch64/sve/peel_ind_9_run.c: New test.
> 
> ---
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..b7a7bc5cb0cfdfdb74adb120c54ba15019832cf1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10.c
> @@ -0,0 +1,24 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS.  */
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -msve-vector-bits=256 --param 
> aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
> +
> +#define N 512
> +#define START 0
> +#define END 505
> + 
> +int x[N] __attribute__((aligned(32)));
> +
> +int __attribute__((noipa))
> +foo (int start)
> +{
> +  for (unsigned int i = start; i < END; ++i)
> +    {
> +      if (x[i] == 0)
> +        return i;
> +    }
> +  return -1;
> +}
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
> +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" 
> "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10_run.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10_run.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..6169aebcc40cc1553f30c1af61ccec91b51cdb42
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10_run.c
> @@ -0,0 +1,17 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS.  */
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
> +/* { dg-additional-options "-msve-vector-bits=256" { target 
> aarch64_sve256_hw } } */
> +/* { dg-additional-options "-msve-vector-bits=128" { target 
> aarch64_sve128_hw } } */
> +
> +#include "peel_ind_10.c"
> +
> +int __attribute__ ((optimize (1)))
> +main (void)
> +{
> +  int res = foo (START);
> +  asm volatile ("");
> +  if (res != START)
> +    __builtin_abort ();
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..a03bb1dec21ef75aa0cbfb22c8bb02b99644239e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5.c
> @@ -0,0 +1,24 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS.  */
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -msve-vector-bits=256 --param 
> aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
> +
> +#define N 512
> +#define START 2
> +#define END 505
> + 
> +int x[N] __attribute__((aligned(32)));
> +
> +int __attribute__((noipa))
> +foo (void)
> +{
> +  for (signed int i = START; i < END; ++i)
> +    {
> +      if (x[i] == 0)
> +        return i;
> +    }
> +  return -1;
> +}
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
> +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" 
> "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5_run.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5_run.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..f26befeab7e53561f84b037aec857b44cf018456
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5_run.c
> @@ -0,0 +1,17 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS.  */
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
> +/* { dg-additional-options "-msve-vector-bits=256" { target 
> aarch64_sve256_hw } } */
> +/* { dg-additional-options "-msve-vector-bits=128" { target 
> aarch64_sve128_hw } } */
> +
> +#include "peel_ind_5.c"
> +
> +int __attribute__ ((optimize (1)))
> +main (void)
> +{
> +  int res = foo ();
> +  asm volatile ("");
> +  if (res != START)
> +    __builtin_abort ();
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..9bfd1a65c4feb0c140d4abf98508fc8af08042ba
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6.c
> @@ -0,0 +1,24 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS.  */
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -msve-vector-bits=256 --param 
> aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
> +
> +#define N 512
> +#define START 1
> +#define END 505
> + 
> +int x[N] __attribute__((aligned(32)));
> +
> +int __attribute__((noipa))
> +foo (int start)
> +{
> +  for (unsigned int i = start; i < END; ++i)
> +    {
> +      if (x[i] == 0)
> +        return i;
> +    }
> +  return -1;
> +}
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
> +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" 
> "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6_run.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6_run.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..4fdf3e4e7cac70dc48bad487db37e1e5838b87ab
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6_run.c
> @@ -0,0 +1,17 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS.  */
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
> +/* { dg-additional-options "-msve-vector-bits=256" { target 
> aarch64_sve256_hw } } */
> +/* { dg-additional-options "-msve-vector-bits=128" { target 
> aarch64_sve128_hw } } */
> +
> +#include "peel_ind_6.c"
> +
> +int __attribute__ ((optimize (1)))
> +main (void)
> +{
> +  int res = foo (START);
> +  asm volatile ("");
> +  if (res != START)
> +    __builtin_abort ();
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..0182e131a173b7b05e88c3393ba854b2da25c6b2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7.c
> @@ -0,0 +1,24 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS.  */
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -msve-vector-bits=256 --param 
> aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
> +
> +#define N 512
> +#define START 1
> +#define END 505
> + 
> +int x[N] __attribute__((aligned(32)));
> +
> +int __attribute__((noipa))
> +foo (void)
> +{
> +  for (unsigned int i = START; i < END; ++i)
> +    {
> +      if (x[i] == 0)
> +        return i;
> +    }
> +  return -1;
> +}
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */
> +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" 
> "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7_run.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7_run.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..05608dd85f13912f8555ac3f39284f6894875998
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7_run.c
> @@ -0,0 +1,17 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS.  */
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
> +/* { dg-additional-options "-msve-vector-bits=256" { target 
> aarch64_sve256_hw } } */
> +/* { dg-additional-options "-msve-vector-bits=128" { target 
> aarch64_sve128_hw } } */
> +
> +#include "peel_ind_7.c"
> +
> +int __attribute__ ((optimize (1)))
> +main (void)
> +{
> +  int res = foo ();
> +  asm volatile ("");
> +  if (res != START)
> +    __builtin_abort ();
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..043348b55d0e8e5e5a5c461b4a4f22b45dfba8e8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8.c
> @@ -0,0 +1,24 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS.  */
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -msve-vector-bits=256 --param 
> aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
> +
> +#define N 512
> +#define START 1
> +#define END 505
> + 
> +int x[N] __attribute__((aligned(32)));
> +
> +int __attribute__((noipa))
> +foo (void)
> +{
> +  for (unsigned int i = START; i < END; i*=2)
> +    {
> +      if (x[i] == 0)
> +        return i;
> +    }
> +  return -1;
> +}
> +
> +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */
> +/* { dg-final { scan-tree-dump-not "pfa_iv_offset" "vect" } } */
> +/* { dg-final { scan-tree-dump-not "Alignment of access forced using 
> peeling" "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8_run.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8_run.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..aa8612248bffdc9f4367b8f6699d395ab2726dec
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8_run.c
> @@ -0,0 +1,17 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS.  */
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
> +/* { dg-additional-options "-msve-vector-bits=256" { target 
> aarch64_sve256_hw } } */
> +/* { dg-additional-options "-msve-vector-bits=128" { target 
> aarch64_sve128_hw } } */
> +
> +#include "peel_ind_8.c"
> +
> +int __attribute__ ((optimize (1)))
> +main (void)
> +{
> +  int res = foo ();
> +  asm volatile ("");
> +  if (res != START)
> +    __builtin_abort ();
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..cc904e88170f072e1d3c6be86643d99a7cd5cb12
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9.c
> @@ -0,0 +1,25 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS.  */
> +/* { dg-do compile } */
> +/* { dg-options "-Ofast -msve-vector-bits=256 --param 
> aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */
> +
> +#define N 512
> +#define START 1
> +#define END 505
> + 
> +int x[N] __attribute__((aligned(32)));
> +
> +int __attribute__((noipa))
> +foo (void)
> +{
> +  for (int *p = x + START; p < x + END; p++)
> +    {
> +      if (*p == 0)
> +        return START;
> +    }
> +  return -1;
> +}
> +
> +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */
> +/* Peels using a scalar loop.  */
> +/* { dg-final { scan-tree-dump-not "pfa_iv_offset" "vect" } } */
> +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" 
> "vect" } } */
> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9_run.c 
> b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9_run.c
> new file mode 100644
> index 
> 0000000000000000000000000000000000000000..767f8bd284ca7c3b9f595c5428c20175ed176d96
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9_run.c
> @@ -0,0 +1,17 @@
> +/* Fix for PR119351 alignment peeling with vectors and VLS.  */
> +/* { dg-do run { target aarch64_sve_hw } } */
> +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */
> +/* { dg-additional-options "-msve-vector-bits=256" { target 
> aarch64_sve256_hw } } */
> +/* { dg-additional-options "-msve-vector-bits=128" { target 
> aarch64_sve128_hw } } */
> +
> +#include "peel_ind_9.c"
> +
> +int __attribute__ ((optimize (1)))
> +main (void)
> +{
> +  int res = foo ();
> +  asm volatile ("");
> +  if (res != START)
> +    __builtin_abort ();
> +  return 0;
> +}
> diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc
> index 
> 56a4e9a8b63f3cae0bf596bf5d22893887dc80e8..ea22c1c6050bd2867ee2ecf28379b342b89fddc9
>  100644
> --- a/gcc/tree-vect-loop-manip.cc
> +++ b/gcc/tree-vect-loop-manip.cc
> @@ -2244,6 +2244,8 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
>        induction_type = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info);
>        if (induction_type != vect_step_op_add)
>       {
> +       /* Mark if we have a non-linear IV.  */
> +       LOOP_VINFO_NON_LINEAR_IV (loop_vinfo) = true;

Please move this to vect_analyze_scalar_cycles_1 at

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;

where you can check STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_vinfo) != 
vect_step_op_add.  Doing it in vect_can_advance_ivs_p makes it non-obvious
that it will be set reliably before you query it in
vect_use_loop_mask_for_alignment_p.
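
A rough sketch of the suggested placement (illustrative only: it assumes
stmt_vinfo and loop_vinfo are both in scope at that point, as in the snippet
above, and it uses the LOOP_VINFO_NON_LINEAR_IV macro this patch introduces):

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;

      /* Record non-linear IVs as soon as the induction is classified.  */
      if (STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_vinfo) != vect_step_op_add)
        LOOP_VINFO_NON_LINEAR_IV (loop_vinfo) = true;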

OK with that change.

Thanks,
Richard.

>         if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, phi_info))
>           return false;
>  
> diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc
> index 
> 9413dcef702597ab27165e676546b190e2bd36ba..efb870e8f60315c47c4e5ea18940988ed9986306
>  100644
> --- a/gcc/tree-vect-loop.cc
> +++ b/gcc/tree-vect-loop.cc
> @@ -1046,12 +1046,14 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, 
> vec_info_shared *shared)
>      suggested_unroll_factor (1),
>      max_vectorization_factor (0),
>      mask_skip_niters (NULL_TREE),
> +    mask_skip_niters_pfa_offset (NULL_TREE),
>      rgroup_compare_type (NULL_TREE),
>      simd_if_cond (NULL_TREE),
>      partial_vector_style (vect_partial_vectors_none),
>      unaligned_dr (NULL),
>      peeling_for_alignment (0),
>      ptr_mask (0),
> +    nonlinear_iv (false),
>      ivexpr_map (NULL),
>      scan_map (NULL),
>      slp_unrolling_factor (1),
> @@ -10678,6 +10680,54 @@ vectorizable_induction (loop_vec_info loop_vinfo,
>                                      LOOP_VINFO_MASK_SKIP_NITERS 
> (loop_vinfo));
>         peel_mul = gimple_build_vector_from_val (&init_stmts,
>                                                  step_vectype, peel_mul);
> +
> +       /* If early break then we have to create a new PHI which we can use as
> +         an offset to adjust the induction reduction in early exits.
> +
> +         This is because when peeling for alignment using masking, the first
> +         few elements of the vector can be inactive.  As such if we find the
> +         entry in the first iteration we have to adjust the starting point of
> +         the scalar code.
> +
> +         We do this by creating a new scalar PHI that keeps track of whether
> +         we are in the first iteration of the loop (with the additional masking)
> +         or whether we have taken a loop iteration already.
> +
> +         The generated sequence:
> +
> +         pre-header:
> +           bb1:
> +             i_1 = <number of leading inactive elements>
> +
> +         header:
> +           bb2:
> +             i_2 = PHI <i_1(bb1), 0(latch)>
> +             …
> +
> +         early-exit:
> +           bb3:
> +             i_3 = iv_step * i_2 + PHI<vector-iv>
> +
> +         The first part of the adjustment, creating i_1 and i_2, is done here
> +         and the last part, creating i_3, is done in
> +         vectorizable_live_operation when the induction extraction is
> +         materialized.  */
> +       if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> +           && !LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo))
> +         {
> +           auto skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
> +           tree ty_skip_niters = TREE_TYPE (skip_niters);
> +           tree break_lhs_phi = vect_get_new_vect_var (ty_skip_niters,
> +                                                       vect_scalar_var,
> +                                                       "pfa_iv_offset");
> +           gphi *nphi = create_phi_node (break_lhs_phi, bb);
> +           add_phi_arg (nphi, skip_niters, pe, UNKNOWN_LOCATION);
> +           add_phi_arg (nphi, build_zero_cst (ty_skip_niters),
> +                        loop_latch_edge (iv_loop), UNKNOWN_LOCATION);
> +
> +           LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo)
> +             = PHI_RESULT (nphi);
> +         }
>       }
>        tree step_mul = NULL_TREE;
>        unsigned ivn;
> @@ -11565,8 +11615,10 @@ vectorizable_live_operation (vec_info *vinfo, 
> stmt_vec_info stmt_info,
>             /* For early exit where the exit is not in the BB that leads
>                to the latch then we're restarting the iteration in the
>                scalar loop.  So get the first live value.  */
> -           if ((all_exits_as_early_p || !main_exit_edge)
> -               && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
> +           bool early_break_first_element_p
> +             = (all_exits_as_early_p || !main_exit_edge)
> +                && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def;
> +           if (early_break_first_element_p)
>               {
>                 tmp_vec_lhs = vec_lhs0;
>                 tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
> @@ -11581,6 +11633,41 @@ vectorizable_live_operation (vec_info *vinfo, 
> stmt_vec_info stmt_info,
>                                                lhs_type, &exit_gsi);
>  
>             auto gsi = gsi_for_stmt (use_stmt);
> +           if (early_break_first_element_p
> +               && LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo))
> +             {
> +               tree step_expr
> +                 = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
> +               tree break_lhs_phi
> +                 = LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo);
> +               tree ty_skip_niters = TREE_TYPE (break_lhs_phi);
> +               gimple_seq iv_stmts = NULL;
> +
> +               /* Now create the PHI for the outside loop usage to
> +                  retrieve the value for the offset counter.  */
> +               tree rphi_step
> +                 = gimple_convert (&iv_stmts, ty_skip_niters, step_expr);
> +               tree tmp2
> +                 = gimple_build (&iv_stmts, MULT_EXPR,
> +                                 ty_skip_niters, rphi_step,
> +                                 break_lhs_phi);
> +
> +               if (POINTER_TYPE_P (TREE_TYPE (new_tree)))
> +                 tmp2 = gimple_build (&iv_stmts, POINTER_PLUS_EXPR,
> +                                      TREE_TYPE (new_tree), new_tree, tmp2);
> +               else
> +                 {
> +                   tmp2 = gimple_convert (&iv_stmts, TREE_TYPE (new_tree),
> +                                          tmp2);
> +                   tmp2 = gimple_build (&iv_stmts, PLUS_EXPR,
> +                                        TREE_TYPE (new_tree), new_tree,
> +                                        tmp2);
> +                 }
> +
> +               new_tree = tmp2;
> +               gsi_insert_seq_before (&exit_gsi, iv_stmts, GSI_SAME_STMT);
> +             }
> +
>             tree lhs_phi = gimple_phi_result (use_stmt);
>             remove_phi_node (&gsi, false);
>             gimple *copy = gimple_build_assign (lhs_phi, new_tree);
> diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> index 
> 97caf61b345735d297ec49fd6ca64797435b46fc..01d19c77656198f92f06619f73752598edd47fab
>  100644
> --- a/gcc/tree-vectorizer.h
> +++ b/gcc/tree-vectorizer.h
> @@ -818,6 +818,11 @@ public:
>       elements that should be false in the first mask).  */
>    tree mask_skip_niters;
>  
> +  /* If we are using a loop mask to align memory addresses and we're in an
> +     early break loop then this variable contains the number of elements that
> +     were skipped during the initial iteration of the loop.  */
> +  tree mask_skip_niters_pfa_offset;
> +
>    /* The type that the loop control IV should be converted to before
>       testing which of the VF scalars are active and inactive.
>       Only meaningful if LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
> @@ -854,6 +859,9 @@ public:
>    /* The mask used to check the alignment of pointers or arrays.  */
>    int ptr_mask;
>  
> +  /* Indicates whether the loop has any non-linear IV.  */
> +  bool nonlinear_iv;
> +
>    /* Data Dependence Relations defining address ranges that are candidates
>       for a run-time aliasing check.  */
>    auto_vec<ddr_p> may_alias_ddrs;
> @@ -1064,6 +1072,7 @@ public:
>  #define LOOP_VINFO_MASKS(L)                (L)->masks
>  #define LOOP_VINFO_LENS(L)                 (L)->lens
>  #define LOOP_VINFO_MASK_SKIP_NITERS(L)     (L)->mask_skip_niters
> +#define LOOP_VINFO_MASK_NITERS_PFA_OFFSET(L) (L)->mask_skip_niters_pfa_offset
>  #define LOOP_VINFO_RGROUP_COMPARE_TYPE(L)  (L)->rgroup_compare_type
>  #define LOOP_VINFO_RGROUP_IV_TYPE(L)       (L)->rgroup_iv_type
>  #define LOOP_VINFO_PARTIAL_VECTORS_STYLE(L) (L)->partial_vector_style
> @@ -1073,6 +1082,7 @@ public:
>  #define LOOP_VINFO_DDRS(L)                 (L)->shared->ddrs
>  #define LOOP_VINFO_INT_NITERS(L)           (TREE_INT_CST_LOW 
> ((L)->num_iters))
>  #define LOOP_VINFO_PEELING_FOR_ALIGNMENT(L) (L)->peeling_for_alignment
> +#define LOOP_VINFO_NON_LINEAR_IV(L)        (L)->nonlinear_iv
>  #define LOOP_VINFO_UNALIGNED_DR(L)         (L)->unaligned_dr
>  #define LOOP_VINFO_MAY_MISALIGN_STMTS(L)   (L)->may_misalign_stmts
>  #define LOOP_VINFO_MAY_ALIAS_DDRS(L)       (L)->may_alias_ddrs
> @@ -2138,8 +2148,14 @@ unlimited_cost_model (loop_p loop)
>  inline bool
>  vect_use_loop_mask_for_alignment_p (loop_vec_info loop_vinfo)
>  {
> +  /* With early break vectorization we don't know whether the accesses will 
> stay
> +     inside the loop or not.  TODO: The early break adjustment code can be
> +     implemented the same way as vectorizable_linear_induction.  However, we
> +     can't test this today, so reject it.  */
>    return (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
> -       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
> +       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
> +       && !(LOOP_VINFO_NON_LINEAR_IV (loop_vinfo)
> +            && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)));
>  }
>  
>  /* Return the number of vectors of type VECTYPE that are needed to get
> 
> 
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
