On Tue, 15 Apr 2025, Tamar Christina wrote:

> Hi All,
>
> The following example:
>
> #define N 512
> #define START 2
> #define END 505
>
> int x[N] __attribute__((aligned(32)));
>
> int __attribute__((noipa))
> foo (void)
> {
>   for (signed int i = START; i < END; ++i)
>     {
>       if (x[i] == 0)
>         return i;
>     }
>   return -1;
> }
>
> generates incorrect code with fixed-length SVE, because for early break we
> need to know which value to start the scalar loop with if we take an early
> exit.
>
> Historically this means that we take the first element of every induction.
> This is because there is an assumption in place that, even with masked
> loops, the masks come from a whilel* instruction.
>
> As such we reduce using a BIT_FIELD_REF <, 0>.
>
> When PFA (peeling for alignment) was added this assumption was correct for
> non-masked loops.  However, we assumed that PFA for VLA wouldn't work for
> now and disabled it using the alignment requirement checks.  We also
> expected VLS to PFA using scalar loops.
>
> However, as this PR shows, for VLS the vectorizer can, and in some
> circumstances does, choose to peel using masks by masking the first
> iteration of the loop with an additional alignment mask.
>
> When this is done, the first elements of the predicate can be inactive.
> In this example element 1 is inactive based on the calculated misalignment,
> hence the -1 value in the first vector IV element.
>
> When we reduce using BIT_FIELD_REF we get the wrong value.
>
> This patch updates it by creating a new scalar PHI that keeps track of
> whether we are in the first iteration of the loop (with the additional
> masking) or whether we have taken a loop iteration already.
>
> The generated sequence:
>
> pre-header:
> bb1:
>   i_1 = <number of leading inactive elements>
>
> header:
> bb2:
>   i_2 = PHI <i_1(bb1), 0(latch)>
>   …
>
> early-exit:
> bb3:
>   i_3 = iv_step * i_2 + PHI<vector-iv>
>
> This eliminates the need for an expensive mask-based reduction.
>
> This fixes gromacs with one OpenMP thread.  But with > 1 there is still an
> issue.
>
> Bootstrapped and regtested on aarch64-none-linux-gnu,
> arm-none-linux-gnueabihf, x86_64-pc-linux-gnu -m32, -m64 and no issues.
>
> Ok for master?
>
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
>	PR tree-optimization/119351
>	* tree-vect-loop-manip.cc (vect_can_advance_ivs_p): Record non-linear
>	inductions.
>	* tree-vectorizer.h (LOOP_VINFO_MASK_NITERS_PFA_OFFSET,
>	LOOP_VINFO_NON_LINEAR_IV): New.
>	(class _loop_vec_info): Add mask_skip_niters_pfa_offset and
>	nonlinear_iv.
>	* tree-vect-loop.cc (_loop_vec_info::_loop_vec_info): Initialize them.
>	(vectorizable_induction): If early break and PFA using masking, create
>	a new phi which tracks where the scalar code needs to start...
>	(vectorizable_live_operation): ...and generate the adjustments here.
>	(vect_use_loop_mask_for_alignment_p): Reject non-linear inductions and
>	early break needing peeling.
>
> gcc/testsuite/ChangeLog:
>
>	PR tree-optimization/119351
>	* gcc.target/aarch64/sve/peel_ind_10.c: New test.
>	* gcc.target/aarch64/sve/peel_ind_10_run.c: New test.
>	* gcc.target/aarch64/sve/peel_ind_5.c: New test.
>	* gcc.target/aarch64/sve/peel_ind_5_run.c: New test.
>	* gcc.target/aarch64/sve/peel_ind_6.c: New test.
>	* gcc.target/aarch64/sve/peel_ind_6_run.c: New test.
>	* gcc.target/aarch64/sve/peel_ind_7.c: New test.
>	* gcc.target/aarch64/sve/peel_ind_7_run.c: New test.
>	* gcc.target/aarch64/sve/peel_ind_8.c: New test.
>	* gcc.target/aarch64/sve/peel_ind_8_run.c: New test.
>	* gcc.target/aarch64/sve/peel_ind_9.c: New test.
> * gcc.target/aarch64/sve/peel_ind_9_run.c: New test. > > --- > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..b7a7bc5cb0cfdfdb74adb120c54ba15019832cf1 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10.c > @@ -0,0 +1,24 @@ > +/* Fix for PR119351 alignment peeling with vectors and VLS. */ > +/* { dg-do compile } */ > +/* { dg-options "-Ofast -msve-vector-bits=256 --param > aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */ > + > +#define N 512 > +#define START 0 > +#define END 505 > + > +int x[N] __attribute__((aligned(32))); > + > +int __attribute__((noipa)) > +foo (int start) > +{ > + for (unsigned int i = start; i < END; ++i) > + { > + if (x[i] == 0) > + return i; > + } > + return -1; > +} > + > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ > +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */ > +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" > "vect" } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10_run.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10_run.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..6169aebcc40cc1553f30c1af61ccec91b51cdb42 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_10_run.c > @@ -0,0 +1,17 @@ > +/* Fix for PR119351 alignment peeling with vectors and VLS. */ > +/* { dg-do run { target aarch64_sve_hw } } */ > +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */ > +/* { dg-additional-options "-msve-vector-bits=256" { target > aarch64_sve256_hw } } */ > +/* { dg-additional-options "-msve-vector-bits=128" { target > aarch64_sve128_hw } } */ > + > +#include "peel_ind_10.c" > + > +int __attribute__ ((optimize (1))) > +main (void) > +{ > + int res = foo (START); > + asm volatile (""); > + if (res != START) > + __builtin_abort (); > + return 0; > +} > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..a03bb1dec21ef75aa0cbfb22c8bb02b99644239e > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5.c > @@ -0,0 +1,24 @@ > +/* Fix for PR119351 alignment peeling with vectors and VLS. */ > +/* { dg-do compile } */ > +/* { dg-options "-Ofast -msve-vector-bits=256 --param > aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */ > + > +#define N 512 > +#define START 2 > +#define END 505 > + > +int x[N] __attribute__((aligned(32))); > + > +int __attribute__((noipa)) > +foo (void) > +{ > + for (signed int i = START; i < END; ++i) > + { > + if (x[i] == 0) > + return i; > + } > + return -1; > +} > + > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ > +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */ > +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" > "vect" } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5_run.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5_run.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..f26befeab7e53561f84b037aec857b44cf018456 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_5_run.c > @@ -0,0 +1,17 @@ > +/* Fix for PR119351 alignment peeling with vectors and VLS. 
*/ > +/* { dg-do run { target aarch64_sve_hw } } */ > +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */ > +/* { dg-additional-options "-msve-vector-bits=256" { target > aarch64_sve256_hw } } */ > +/* { dg-additional-options "-msve-vector-bits=128" { target > aarch64_sve128_hw } } */ > + > +#include "peel_ind_5.c" > + > +int __attribute__ ((optimize (1))) > +main (void) > +{ > + int res = foo (); > + asm volatile (""); > + if (res != START) > + __builtin_abort (); > + return 0; > +} > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..9bfd1a65c4feb0c140d4abf98508fc8af08042ba > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6.c > @@ -0,0 +1,24 @@ > +/* Fix for PR119351 alignment peeling with vectors and VLS. */ > +/* { dg-do compile } */ > +/* { dg-options "-Ofast -msve-vector-bits=256 --param > aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */ > + > +#define N 512 > +#define START 1 > +#define END 505 > + > +int x[N] __attribute__((aligned(32))); > + > +int __attribute__((noipa)) > +foo (int start) > +{ > + for (unsigned int i = start; i < END; ++i) > + { > + if (x[i] == 0) > + return i; > + } > + return -1; > +} > + > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ > +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */ > +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" > "vect" } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6_run.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6_run.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..4fdf3e4e7cac70dc48bad487db37e1e5838b87ab > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_6_run.c > @@ -0,0 +1,17 @@ > +/* Fix for PR119351 alignment peeling with vectors and VLS. */ > +/* { dg-do run { target aarch64_sve_hw } } */ > +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */ > +/* { dg-additional-options "-msve-vector-bits=256" { target > aarch64_sve256_hw } } */ > +/* { dg-additional-options "-msve-vector-bits=128" { target > aarch64_sve128_hw } } */ > + > +#include "peel_ind_6.c" > + > +int __attribute__ ((optimize (1))) > +main (void) > +{ > + int res = foo (START); > + asm volatile (""); > + if (res != START) > + __builtin_abort (); > + return 0; > +} > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..0182e131a173b7b05e88c3393ba854b2da25c6b2 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7.c > @@ -0,0 +1,24 @@ > +/* Fix for PR119351 alignment peeling with vectors and VLS. 
*/ > +/* { dg-do compile } */ > +/* { dg-options "-Ofast -msve-vector-bits=256 --param > aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */ > + > +#define N 512 > +#define START 1 > +#define END 505 > + > +int x[N] __attribute__((aligned(32))); > + > +int __attribute__((noipa)) > +foo (void) > +{ > + for (unsigned int i = START; i < END; ++i) > + { > + if (x[i] == 0) > + return i; > + } > + return -1; > +} > + > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ > +/* { dg-final { scan-tree-dump "pfa_iv_offset" "vect" } } */ > +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" > "vect" } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7_run.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7_run.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..05608dd85f13912f8555ac3f39284f6894875998 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_7_run.c > @@ -0,0 +1,17 @@ > +/* Fix for PR119351 alignment peeling with vectors and VLS. */ > +/* { dg-do run { target aarch64_sve_hw } } */ > +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */ > +/* { dg-additional-options "-msve-vector-bits=256" { target > aarch64_sve256_hw } } */ > +/* { dg-additional-options "-msve-vector-bits=128" { target > aarch64_sve128_hw } } */ > + > +#include "peel_ind_7.c" > + > +int __attribute__ ((optimize (1))) > +main (void) > +{ > + int res = foo (); > + asm volatile (""); > + if (res != START) > + __builtin_abort (); > + return 0; > +} > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..043348b55d0e8e5e5a5c461b4a4f22b45dfba8e8 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8.c > @@ -0,0 +1,24 @@ > +/* Fix for PR119351 alignment peeling with vectors and VLS. */ > +/* { dg-do compile } */ > +/* { dg-options "-Ofast -msve-vector-bits=256 --param > aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */ > + > +#define N 512 > +#define START 1 > +#define END 505 > + > +int x[N] __attribute__((aligned(32))); > + > +int __attribute__((noipa)) > +foo (void) > +{ > + for (unsigned int i = START; i < END; i*=2) > + { > + if (x[i] == 0) > + return i; > + } > + return -1; > +} > + > +/* { dg-final { scan-tree-dump-not "LOOP VECTORIZED" "vect" } } */ > +/* { dg-final { scan-tree-dump-not "pfa_iv_offset" "vect" } } */ > +/* { dg-final { scan-tree-dump-not "Alignment of access forced using > peeling" "vect" } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8_run.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8_run.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..aa8612248bffdc9f4367b8f6699d395ab2726dec > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_8_run.c > @@ -0,0 +1,17 @@ > +/* Fix for PR119351 alignment peeling with vectors and VLS. 
*/ > +/* { dg-do run { target aarch64_sve_hw } } */ > +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */ > +/* { dg-additional-options "-msve-vector-bits=256" { target > aarch64_sve256_hw } } */ > +/* { dg-additional-options "-msve-vector-bits=128" { target > aarch64_sve128_hw } } */ > + > +#include "peel_ind_8.c" > + > +int __attribute__ ((optimize (1))) > +main (void) > +{ > + int res = foo (); > + asm volatile (""); > + if (res != START) > + __builtin_abort (); > + return 0; > +} > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..cc904e88170f072e1d3c6be86643d99a7cd5cb12 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9.c > @@ -0,0 +1,25 @@ > +/* Fix for PR119351 alignment peeling with vectors and VLS. */ > +/* { dg-do compile } */ > +/* { dg-options "-Ofast -msve-vector-bits=256 --param > aarch64-autovec-preference=sve-only -fdump-tree-vect-details" } */ > + > +#define N 512 > +#define START 1 > +#define END 505 > + > +int x[N] __attribute__((aligned(32))); > + > +int __attribute__((noipa)) > +foo (void) > +{ > + for (int *p = x + START; p < x + END; p++) > + { > + if (*p == 0) > + return START; > + } > + return -1; > +} > + > +/* { dg-final { scan-tree-dump "LOOP VECTORIZED" "vect" } } */ > +/* Peels using a scalar loop. */ > +/* { dg-final { scan-tree-dump-not "pfa_iv_offset" "vect" } } */ > +/* { dg-final { scan-tree-dump "Alignment of access forced using peeling" > "vect" } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9_run.c > b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9_run.c > new file mode 100644 > index > 0000000000000000000000000000000000000000..767f8bd284ca7c3b9f595c5428c20175ed176d96 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/sve/peel_ind_9_run.c > @@ -0,0 +1,17 @@ > +/* Fix for PR119351 alignment peeling with vectors and VLS. */ > +/* { dg-do run { target aarch64_sve_hw } } */ > +/* { dg-options "-Ofast --param aarch64-autovec-preference=sve-only" } */ > +/* { dg-additional-options "-msve-vector-bits=256" { target > aarch64_sve256_hw } } */ > +/* { dg-additional-options "-msve-vector-bits=128" { target > aarch64_sve128_hw } } */ > + > +#include "peel_ind_9.c" > + > +int __attribute__ ((optimize (1))) > +main (void) > +{ > + int res = foo (); > + asm volatile (""); > + if (res != START) > + __builtin_abort (); > + return 0; > +} > diff --git a/gcc/tree-vect-loop-manip.cc b/gcc/tree-vect-loop-manip.cc > index > 56a4e9a8b63f3cae0bf596bf5d22893887dc80e8..ea22c1c6050bd2867ee2ecf28379b342b89fddc9 > 100644 > --- a/gcc/tree-vect-loop-manip.cc > +++ b/gcc/tree-vect-loop-manip.cc > @@ -2244,6 +2244,8 @@ vect_can_advance_ivs_p (loop_vec_info loop_vinfo) > induction_type = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (phi_info); > if (induction_type != vect_step_op_add) > { > + /* Mark if we have a non-linear IV. */ > + LOOP_VINFO_NON_LINEAR_IV (loop_vinfo) = true;
Please move this to vect_analyze_scalar_cycles_1 at if (dump_enabled_p ()) dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n"); STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def; where you can check STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_vinfo) != vect_step_op_add. Doing it in vect_can_advance_ivs_p makes it non-obvious that it will be set reliably before you query it in vect_use_loop_mask_for_alignment_p. OK with that change. Thanks, Richard. > if (!vect_can_peel_nonlinear_iv_p (loop_vinfo, phi_info)) > return false; > > diff --git a/gcc/tree-vect-loop.cc b/gcc/tree-vect-loop.cc > index > 9413dcef702597ab27165e676546b190e2bd36ba..efb870e8f60315c47c4e5ea18940988ed9986306 > 100644 > --- a/gcc/tree-vect-loop.cc > +++ b/gcc/tree-vect-loop.cc > @@ -1046,12 +1046,14 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, > vec_info_shared *shared) > suggested_unroll_factor (1), > max_vectorization_factor (0), > mask_skip_niters (NULL_TREE), > + mask_skip_niters_pfa_offset (NULL_TREE), > rgroup_compare_type (NULL_TREE), > simd_if_cond (NULL_TREE), > partial_vector_style (vect_partial_vectors_none), > unaligned_dr (NULL), > peeling_for_alignment (0), > ptr_mask (0), > + nonlinear_iv (false), > ivexpr_map (NULL), > scan_map (NULL), > slp_unrolling_factor (1), > @@ -10678,6 +10680,54 @@ vectorizable_induction (loop_vec_info loop_vinfo, > LOOP_VINFO_MASK_SKIP_NITERS > (loop_vinfo)); > peel_mul = gimple_build_vector_from_val (&init_stmts, > step_vectype, peel_mul); > + > + /* If early break then we have to create a new PHI which we can use as > + an offset to adjust the induction reduction in early exits. > + > + This is because when peeling for alignment using masking, the first > + few elements of the vector can be inactive. As such if we find the > + entry in the first iteration we have adjust the starting point of > + the scalar code. > + > + We do this by creating a new scalar PHI that keeps track of whether > + we are the first iteration of the loop (with the additional masking) > + or whether we have taken a loop iteration already. > + > + The generated sequence: > + > + pre-header: > + bb1: > + i_1 = <number of leading inactive elements> > + > + header: > + bb2: > + i_2 = PHI <i_1(bb1), 0(latch)> > + … > + > + early-exit: > + bb3: > + i_3 = iv_step * i_2 + PHI<vector-iv> > + > + The first part of the adjustment to create i_1 and i_2 are done here > + and the last part creating i_3 is done in > + vectorizable_live_operations when the induction extraction is > + materialized. */ > + if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo) > + && !LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo)) > + { > + auto skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo); > + tree ty_skip_niters = TREE_TYPE (skip_niters); > + tree break_lhs_phi = vect_get_new_vect_var (ty_skip_niters, > + vect_scalar_var, > + "pfa_iv_offset"); > + gphi *nphi = create_phi_node (break_lhs_phi, bb); > + add_phi_arg (nphi, skip_niters, pe, UNKNOWN_LOCATION); > + add_phi_arg (nphi, build_zero_cst (ty_skip_niters), > + loop_latch_edge (iv_loop), UNKNOWN_LOCATION); > + > + LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo) > + = PHI_RESULT (nphi); > + } > } > tree step_mul = NULL_TREE; > unsigned ivn; > @@ -11565,8 +11615,10 @@ vectorizable_live_operation (vec_info *vinfo, > stmt_vec_info stmt_info, > /* For early exit where the exit is not in the BB that leads > to the latch then we're restarting the iteration in the > scalar loop. So get the first live value. 
*/ > - if ((all_exits_as_early_p || !main_exit_edge) > - && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def) > + bool early_break_first_element_p > + = (all_exits_as_early_p || !main_exit_edge) > + && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def; > + if (early_break_first_element_p) > { > tmp_vec_lhs = vec_lhs0; > tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart)); > @@ -11581,6 +11633,41 @@ vectorizable_live_operation (vec_info *vinfo, > stmt_vec_info stmt_info, > lhs_type, &exit_gsi); > > auto gsi = gsi_for_stmt (use_stmt); > + if (early_break_first_element_p > + && LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo)) > + { > + tree step_expr > + = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info); > + tree break_lhs_phi > + = LOOP_VINFO_MASK_NITERS_PFA_OFFSET (loop_vinfo); > + tree ty_skip_niters = TREE_TYPE (break_lhs_phi); > + gimple_seq iv_stmts = NULL; > + > + /* Now create the PHI for the outside loop usage to > + retrieve the value for the offset counter. */ > + tree rphi_step > + = gimple_convert (&iv_stmts, ty_skip_niters, step_expr); > + tree tmp2 > + = gimple_build (&iv_stmts, MULT_EXPR, > + ty_skip_niters, rphi_step, > + break_lhs_phi); > + > + if (POINTER_TYPE_P (TREE_TYPE (new_tree))) > + tmp2 = gimple_build (&iv_stmts, POINTER_PLUS_EXPR, > + TREE_TYPE (new_tree), new_tree, tmp2); > + else > + { > + tmp2 = gimple_convert (&iv_stmts, TREE_TYPE (new_tree), > + tmp2); > + tmp2 = gimple_build (&iv_stmts, PLUS_EXPR, > + TREE_TYPE (new_tree), new_tree, > + tmp2); > + } > + > + new_tree = tmp2; > + gsi_insert_seq_before (&exit_gsi, iv_stmts, GSI_SAME_STMT); > + } > + > tree lhs_phi = gimple_phi_result (use_stmt); > remove_phi_node (&gsi, false); > gimple *copy = gimple_build_assign (lhs_phi, new_tree); > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h > index > 97caf61b345735d297ec49fd6ca64797435b46fc..01d19c77656198f92f06619f73752598edd47fab > 100644 > --- a/gcc/tree-vectorizer.h > +++ b/gcc/tree-vectorizer.h > @@ -818,6 +818,11 @@ public: > elements that should be false in the first mask). */ > tree mask_skip_niters; > > + /* If we are using a loop mask to align memory addresses and we're in an > + early break loop then this variable contains the number of elements that > + were skipped during the initial iteration of the loop. */ > + tree mask_skip_niters_pfa_offset; > + > /* The type that the loop control IV should be converted to before > testing which of the VF scalars are active and inactive. > Only meaningful if LOOP_VINFO_USING_PARTIAL_VECTORS_P. */ > @@ -854,6 +859,9 @@ public: > /* The mask used to check the alignment of pointers or arrays. */ > int ptr_mask; > > + /* Indicates whether the loop has any non-linear IV. */ > + bool nonlinear_iv; > + > /* Data Dependence Relations defining address ranges that are candidates > for a run-time aliasing check. 
*/ > auto_vec<ddr_p> may_alias_ddrs; > @@ -1064,6 +1072,7 @@ public: > #define LOOP_VINFO_MASKS(L) (L)->masks > #define LOOP_VINFO_LENS(L) (L)->lens > #define LOOP_VINFO_MASK_SKIP_NITERS(L) (L)->mask_skip_niters > +#define LOOP_VINFO_MASK_NITERS_PFA_OFFSET(L) (L)->mask_skip_niters_pfa_offset > #define LOOP_VINFO_RGROUP_COMPARE_TYPE(L) (L)->rgroup_compare_type > #define LOOP_VINFO_RGROUP_IV_TYPE(L) (L)->rgroup_iv_type > #define LOOP_VINFO_PARTIAL_VECTORS_STYLE(L) (L)->partial_vector_style > @@ -1073,6 +1082,7 @@ public: > #define LOOP_VINFO_DDRS(L) (L)->shared->ddrs > #define LOOP_VINFO_INT_NITERS(L) (TREE_INT_CST_LOW > ((L)->num_iters)) > #define LOOP_VINFO_PEELING_FOR_ALIGNMENT(L) (L)->peeling_for_alignment > +#define LOOP_VINFO_NON_LINEAR_IV(L) (L)->nonlinear_iv > #define LOOP_VINFO_UNALIGNED_DR(L) (L)->unaligned_dr > #define LOOP_VINFO_MAY_MISALIGN_STMTS(L) (L)->may_misalign_stmts > #define LOOP_VINFO_MAY_ALIAS_DDRS(L) (L)->may_alias_ddrs > @@ -2138,8 +2148,14 @@ unlimited_cost_model (loop_p loop) > inline bool > vect_use_loop_mask_for_alignment_p (loop_vec_info loop_vinfo) > { > + /* With early break vectorization we don't know whether the accesses will > stay > + inside the loop or not. TODO: The early break adjustment code can be > + implemented the same way as vectorizable_linear_induction. However we > + can't test this today so reject it. */ > return (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo) > - && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)); > + && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > + && !(LOOP_VINFO_NON_LINEAR_IV (loop_vinfo) > + && LOOP_VINFO_EARLY_BREAKS (loop_vinfo))); > } > > /* Return the number of vectors of type VECTYPE that are needed to get > > > -- Richard Biener <rguent...@suse.de> SUSE Software Solutions Germany GmbH, Frankenstrasse 146, 90461 Nuernberg, Germany; GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
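
A minimal standalone C sketch of the adjustment described in the cover letter
(i_3 = iv_step * i_2 + element 0 of the vector IV).  The vector length VF, the
pfa_offset value, and the assumption that the vector IV is generated as if the
loop had started pfa_offset iterations earlier are illustrative only and not
taken from the patch; they are chosen so that lane 0 holds -1 as in the PR.

    /* Illustrative sketch, not part of the patch.  Models the scalar restart
       index computed on an early exit from the first, alignment-masked vector
       iteration of a VLS loop that peels for alignment using masking.  */

    #include <stdio.h>

    #define VF 8   /* Assumed number of int lanes in a 256-bit vector.  */

    int
    main (void)
    {
      int start = 2;       /* START from the example in the cover letter.  */
      int iv_step = 1;     /* Step of the scalar induction variable i.  */
      int pfa_offset = 3;  /* i_1/i_2: number of leading inactive lanes in the
                              first iteration (assumed value).  */

      /* Vector IV of the first iteration, assuming it is built as if the loop
         had started pfa_offset iterations earlier; the leading lanes are
         simply masked off.  */
      int vec_iv[VF];
      for (int lane = 0; lane < VF; lane++)
        vec_iv[lane] = start - pfa_offset + lane * iv_step;

      /* Old reduction: BIT_FIELD_REF <vec_iv, 0>, wrong when lane 0 is
         inactive.  */
      int old_restart = vec_iv[0];

      /* New reduction: i_3 = iv_step * i_2 + vec_iv[0].  In later vector
         iterations i_2 is 0, so the adjustment is a no-op.  */
      int new_restart = iv_step * pfa_offset + vec_iv[0];

      printf ("old = %d, new = %d, expected = %d\n",
              old_restart, new_restart, start);
      return 0;
    }

With these assumed values the sketch prints old = -1 and new = 2, i.e. the
scalar loop restarts at START as intended, matching the -1 first-lane value
described in the PR.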