On Fri, 7 Feb 2025, Tamar Christina wrote:
> > -----Original Message-----
> > From: Richard Biener <[email protected]>
> > Sent: Wednesday, February 5, 2025 1:15 PM
> > To: Tamar Christina <[email protected]>
> > Cc: [email protected]; nd <[email protected]>
> > Subject: RE: [PATCH]middle-end: delay checking for alignment to load
> > [PR118464]
> >
> > On Wed, 5 Feb 2025, Tamar Christina wrote:
> >
> > [...]
> >
> > > >
> > 6eda40267bd1382938a77826d11f20dcc959a166..d0148d4938cafc3c59edfa6a
> > > > > > 60002933f384f65b 100644
> > > > > > > --- a/gcc/tree-vect-data-refs.cc
> > > > > > > +++ b/gcc/tree-vect-data-refs.cc
> > > > > > > @@ -731,7 +731,8 @@ vect_analyze_early_break_dependences
> > > > (loop_vec_info
> > > > > > loop_vinfo)
> > > > > > > if (is_gimple_debug (stmt))
> > > > > > > continue;
> > > > > > >
> > > > > > > - stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (stmt);
> > > > > > > + stmt_vec_info stmt_vinfo
> > > > > > > + = vect_stmt_to_vectorize (loop_vinfo->lookup_stmt (stmt));
> > > > > > > auto dr_ref = STMT_VINFO_DATA_REF (stmt_vinfo);
> > > > > > > if (!dr_ref)
> > > > > > > continue;
> > > > > > > @@ -748,26 +749,15 @@ vect_analyze_early_break_dependences
> > > > > > (loop_vec_info loop_vinfo)
> > > > > > > bounded by VF so accesses are within range. We only need
> > > > > > > to
> > check
> > > > > > > the reads since writes are moved to a safe place where if
> > > > > > > we get
> > > > > > > there we know they are safe to perform. */
> > > > > > > - if (DR_IS_READ (dr_ref)
> > > > > > > - && !ref_within_array_bound (stmt, DR_REF (dr_ref)))
> > > > > > > + if (DR_IS_READ (dr_ref))
> > > > > > > {
> > > > > > > - if (STMT_VINFO_GATHER_SCATTER_P (stmt_vinfo)
> > > > > > > - || STMT_VINFO_STRIDED_P (stmt_vinfo))
> > > > > > > - {
> > > > > > > - const char *msg
> > > > > > > - = "early break not supported: cannot peel "
> > > > > > > - "for alignment, vectorization would read out of "
> > > > > > > - "bounds at %G";
> > > > > > > - return opt_result::failure_at (stmt, msg, stmt);
> > > > > > > - }
> > > > > > > -
> > > > > > > dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_vinfo);
> > > > > > > dr_info->need_peeling_for_alignment = true;
> > > > > >
> > > > > > You're setting the flag on any DR of a DR group here ...
> > > > > >
> > > > > > > if (dump_enabled_p ())
> > > > > > > dump_printf_loc (MSG_NOTE, vect_location,
> > > > > > > - "marking DR (read) as needing peeling
> > > > > > > for
> > "
> > > > > > > - "alignment at %G", stmt);
> > > > > > > + "marking DR (read) as possibly needing
> > peeling "
> > > > > > > + "for alignment at %G", stmt);
> > > > > > > }
> > > > > > >
> > > > > > > if (DR_IS_READ (dr_ref))
> > > > > > > @@ -1326,9 +1316,6 @@ vect_record_base_alignments (vec_info
> > > > > > > *vinfo)
> > > > > > > Compute the misalignment of the data reference DR_INFO when
> > vectorizing
> > > > > > > with VECTYPE.
> > > > > > >
> > > > > > > - RESULT is non-NULL iff VINFO is a loop_vec_info. In that
> > > > > > > case, *RESULT
> > will
> > > > > > > - be set appropriately on failure (but is otherwise left
> > > > > > > unchanged).
> > > > > > > -
> > > > > > > Output:
> > > > > > > 1. initialized misalignment info for DR_INFO
> > > > > > >
> > > > > > > @@ -1337,7 +1324,7 @@ vect_record_base_alignments (vec_info
> > > > > > > *vinfo)
> > > > > > >
> > > > > > > static void
> > > > > > > vect_compute_data_ref_alignment (vec_info *vinfo, dr_vec_info
> > > > > > > *dr_info,
> > > > > > > - tree vectype, opt_result *result =
> > > > > > > nullptr)
> > > > > > > + tree vectype)
> > > > > > > {
> > > > > > > stmt_vec_info stmt_info = dr_info->stmt;
> > > > > > > vec_base_alignments *base_alignments = &vinfo->base_alignments;
> > > > > > > @@ -1365,66 +1352,6 @@ vect_compute_data_ref_alignment (vec_info
> > > > *vinfo,
> > > > > > dr_vec_info *dr_info,
> > > > > > > = exact_div (targetm.vectorize.preferred_vector_alignment
> > > > > > > (vectype),
> > > > > > > BITS_PER_UNIT);
> > > > > > >
> > > > > > > - /* If this DR needs peeling for alignment for correctness, we
> > > > > > > must
> > > > > > > - ensure the target alignment is a constant power-of-two
> > > > > > > multiple of the
> > > > > > > - amount read per vector iteration (overriding the above hook
> > > > > > > where
> > > > > > > - necessary). */
> > > > > > > - if (dr_info->need_peeling_for_alignment)
> > > > > > > - {
> > > > > > > - /* Vector size in bytes. */
> > > > > > > - poly_uint64 safe_align = tree_to_poly_uint64
> > > > > > > (TYPE_SIZE_UNIT
> > > > (vectype));
> > > > > > > -
> > > > > > > - /* We can only peel for loops, of course. */
> > > > > > > - gcc_checking_assert (loop_vinfo);
> > > > > > > -
> > > > > > > - /* Calculate the number of vectors read per vector
> > > > > > > iteration. If
> > > > > > > - it is a power of two, multiply through to get the required
> > > > > > > - alignment in bytes. Otherwise, fail analysis since alignment
> > > > > > > - peeling wouldn't work in such a case. */
> > > > > > > - poly_uint64 num_scalars = LOOP_VINFO_VECT_FACTOR
> > > > > > > (loop_vinfo);
> > > > > > > - if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> > > > > > > - num_scalars *= DR_GROUP_SIZE (stmt_info);
> > > > > > > -
> > > > > > > - auto num_vectors = vect_get_num_vectors (num_scalars,
> > > > > > > vectype);
> > > > > > > - if (!pow2p_hwi (num_vectors))
> > > > > > > - {
> > > > > > > - *result = opt_result::failure_at (vect_location,
> > > > > > > - "non-power-of-two num
> > vectors %u "
> > > > > > > - "for DR needing peeling for
> > > > > > > "
> > > > > > > - "alignment at %G",
> > > > > > > - num_vectors,
> > > > > > > stmt_info->stmt);
> > > > > > > - return;
> > > > > > > - }
> > > > > > > -
> > > > > > > - safe_align *= num_vectors;
> > > > > > > - if (maybe_gt (safe_align, 4096U))
> > > > > > > - {
> > > > > > > - pretty_printer pp;
> > > > > > > - pp_wide_integer (&pp, safe_align);
> > > > > > > - *result = opt_result::failure_at (vect_location,
> > > > > > > - "alignment required for
> > correctness"
> > > > > > > - " (%s) may exceed page
> > > > > > > size",
> > > > > > > - pp_formatted_text (&pp));
> > > > > > > - return;
> > > > > > > - }
> > > > > > > -
> > > > > > > - unsigned HOST_WIDE_INT multiple;
> > > > > > > - if (!constant_multiple_p (vector_alignment, safe_align,
> > > > > > > &multiple)
> > > > > > > - || !pow2p_hwi (multiple))
> > > > > > > - {
> > > > > > > - if (dump_enabled_p ())
> > > > > > > - {
> > > > > > > - dump_printf_loc (MSG_NOTE, vect_location,
> > > > > > > - "forcing alignment for DR from preferred
> > > > > > > (");
> > > > > > > - dump_dec (MSG_NOTE, vector_alignment);
> > > > > > > - dump_printf (MSG_NOTE, ") to safe align (");
> > > > > > > - dump_dec (MSG_NOTE, safe_align);
> > > > > > > - dump_printf (MSG_NOTE, ") for stmt: %G", stmt_info->stmt);
> > > > > > > - }
> > > > > > > - vector_alignment = safe_align;
> > > > > > > - }
> > > > > > > - }
> > > > > > > -
> > > > > > > SET_DR_TARGET_ALIGNMENT (dr_info, vector_alignment);
> > > > > > >
> > > > > > > /* If the main loop has peeled for alignment we have no way of
> > > > > > > knowing
> > > > > > > @@ -2479,7 +2406,7 @@ vect_enhance_data_refs_alignment
> > > > (loop_vec_info
> > > > > > loop_vinfo)
> > > > > > > || !slpeel_can_duplicate_loop_p (loop, LOOP_VINFO_IV_EXIT
> > > > (loop_vinfo),
> > > > > > > loop_preheader_edge (loop))
> > > > > > > || loop->inner
> > > > > > > - || LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
> > > > > > > + || LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo)) //
> > > > > > > <<--
> > ??
> > > > > >
> > > > > > Spurious change(?)
> > > > >
> > > > > I was actually wondering why this is here. I'm curious why we're
> > > > > saying we
> > can't
> > > > > peel for alignment on an inverted loop.
> > > >
> > > > No idea either.
> > > >
> > > > > >
> > > > > > > do_peeling = false;
> > > > > > >
> > > > > > > struct _vect_peel_extended_info peel_for_known_alignment;
> > > > > > > @@ -2942,12 +2869,9 @@ vect_analyze_data_refs_alignment
> > > > (loop_vec_info
> > > > > > loop_vinfo)
> > > > > > > if (STMT_VINFO_GROUPED_ACCESS (dr_info->stmt)
> > > > > > > && DR_GROUP_FIRST_ELEMENT (dr_info->stmt) != dr_info-
> > >stmt)
> > > > > > > continue;
> > > > > > > - opt_result res = opt_result::success ();
> > > > > > > +
> > > > > > > vect_compute_data_ref_alignment (loop_vinfo, dr_info,
> > > > > > > - STMT_VINFO_VECTYPE (dr_info-
> > >stmt),
> > > > > > > - &res);
> > > > > > > - if (!res)
> > > > > > > - return res;
> > > > > > > + STMT_VINFO_VECTYPE (dr_info-
> > >stmt));
> > > > > > > }
> > > > > > > }
> > > > > > >
> > > > > > > @@ -7211,7 +7135,7 @@ vect_supportable_dr_alignment (vec_info
> > *vinfo,
> > > > > > dr_vec_info *dr_info,
> > > > > > >
> > > > > > > if (misalignment == 0)
> > > > > > > return dr_aligned;
> > > > > > > - else if (dr_info->need_peeling_for_alignment)
> > > > > > > + else if (dr_peeling_alignment (stmt_info))
> > > > > > > return dr_unaligned_unsupported;
> > > > > > >
> > > > > > > /* For now assume all conditional loads/stores support
> > > > > > > unaligned
> > > > > > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > > > > > > index
> > > > > >
> > > >
> > 1b639ae3b174779bb838d0f7ce4886d84ecafcfe..c213ec46c4bdb456f99a43a3a
> > > > > > ff7c1af80e8769a 100644
> > > > > > > --- a/gcc/tree-vect-stmts.cc
> > > > > > > +++ b/gcc/tree-vect-stmts.cc
> > > > > > > @@ -2597,6 +2597,69 @@ get_load_store_type (vec_info *vinfo,
> > > > > > stmt_vec_info stmt_info,
> > > > > > > return false;
> > > > > > > }
> > > > > > >
> > > > > > > + auto unit_size = GET_MODE_UNIT_SIZE (TYPE_MODE (vectype));
> > > > > > > + /* Check if a misalignment with an unsupported peeling for
> > > > > > > early break
> > is
> > > > > > > + still OK. First we need to distinguish between when we've
> > > > > > > reached
> > here
> > > > do
> > > > > >
> > > > > > due to
> > > > > >
> > > > > > > + to dependency analysis or when the user has requested
> > > > > > > -mstrict-align
> > or
> > > > > > > + similar. In those cases we must not override it. */
> > > > > > > + if (dr_peeling_alignment (stmt_info)
> > > > > > > + && *alignment_support_scheme == dr_unaligned_unsupported
> > > > > > > + /* We can only attempt to override if the misalignment is
> > > > > > > a multiple of
> > > > > > > + the element being loaded, otherwise peeling or versioning would
> > have
> > > > > > > + really been required. */
> > > > > > > + && multiple_p (*misalignment, unit_size))
> > > > > >
> > > > > > Hmm, but wouldn't that mean dr_info->target_alignment is bigger
> > > > > > than the vector size? Does that ever happen? I'll note that
> > > > > > *alignment_support_scheme == dr_aligned means alignment according
> > > > > > to dr_info->target_alignment which might be actually less than
> > > > > > the vector size (we've noticed this recently in other PRs), so
> > > > > > we might want to make sure that dr_info->target_alignment is
> > > > > > at least vector size when ->need_peeling_for_alignment I think.
> > > > > >
> > > > >
> > > > > One reason I block LOAD_LANES from the non-inbound case is that in
> > > > > those cases dr_info->target_alignment would need to be GROUP_SIZE *
> > vector
> > > > size
> > > > > to ensure that the entire access doesn't cross a page. Because this
> > > > > puts an
> > > > > excessive alignment request in place I currently just reject the loop.
> > > >
> > > > But that's true for all grouped accesses and one reason we wanted to
> > > > move
> > > > this code here - we know group_size and the vectorization factor.
> > > >
> > >
> > > Yes, I mentioned LOAD_LANES since that's what AArch64 makes group
> > > accesses,
> > > But yes you're right, this is about group accessed, but maybe my
> > > understanding
> > > is wrong here, I thought the only VMAT that can result in a group access
> > > is a
> > LOAD_LANES?
> >
> > No, even VMAT_CONTIGUOUS can be a group access.
> >
>
> Hmm That does correspond to what I think I was trying to fixup with the hunk
> below it
> and I now see what you meant by your review comment.
>
> I hadn't expected
>
> if (vect_a[i] > x || vect_a[i+1] > x)
> break;
>
> to still be a VMAT_CONTIGUOUS with slp lane == 1 as you'd need a permute as
> well
> so I was expecting VMAT_CONTIGUOUS_PERMUTE. But reading the comments on
> the enum indicates that this is only used for the non-SLP vectorizer.
>
> I guess SLP always models this as linear load + separate permutation. Which
> in hindsight
> makes sense looking at the node:
>
> note: node 0x627f608 (max_nunits=2, refcnt=1) vector(2) unsigned int
> note: op template: _6 = vect_a[_3];
> note: stmt 0 _6 = vect_a[_3];
> note: load permutation { 0 }
>
> The comments here make sense now, sorry was confused about the representation.
>
> > > But yes I agree I should drop the *memory_access_type ==
> > VMAT_LOAD_STORE_LANES bit
> > > and only look at the group access flag.
> > >
> > > > > But In principal it can happen, however the above checks for element
> > > > > size, not
> > > > > vector size. This fails when the user has intentionally misaligned
> > > > > the array and
> > we
> > > > > don't support peeling for the access type to correct it.
> > > > >
> > > > > So something like vect-early-break_133_pfa3.c with a grouped access
> > > > > for
> > > > instance.
> > > > >
> > > > > > So - which of the testcases gets you here? I think we
> > > > > > set *misalignment to be modulo target_alignment, never larger than
> > > > > > that.
> > > > > >
> > > > >
> > > > > The condition passes for most cases yes, unless we can't peel in
> > > > > which case we
> > > > > vect-early-break_22.c and vect-early-break_121-pr114081.c two that get
> > > > rejected
> > > > > inside the block.
> > > > >
> > > > > > > + {
> > > > > > > + bool inbounds
> > > > > > > + = ref_within_array_bound (STMT_VINFO_STMT (stmt_info),
> > > > > > > + DR_REF (STMT_VINFO_DATA_REF
> > (stmt_info)));
> > > > > > > + /* If we have a known misalignment, and are doing a group
> > > > > > > load for a
> > DR
> > > > > > > + that requires aligned access, check if the misalignment is a
> > > > > > > multiple
> > > > > > > + of the unit size. In which case the group load will be issued
> > > > > > > aligned
> > > > > > > + as long as the first load in the group is aligned.
> > > > > > > +
> > > > > > > + For the non-inbound case we'd need goup_size * vectype
> > alignment. But
> > > > > > > + this is quite huge and unlikely to ever happen so if we can't
> > > > > > > peel
> > for
> > > > > > > + it, just reject it. */
> > > >
> > > > I don't think the in-bound case is any different from the non-in-bound
> > > > case unless the size of the object is a multiple of the whole vector
> > > > access size as well.
> > > >
> > >
> > > The idea was that we know the accesses are all within the boundary of the
> > object,
> > > What we want to know is whether the accesses done are aligned. Since we
> > > don't
> > > support group loads with gaps here we know the elements must be sequential
> > and so
> > > the relaxation was that if we know all this that then all we really need
> > > to know is
> > that
> > > it is safe to read a multiple of GROUP_SIZE * element sizes, since we
> > > would still be
> > in
> > > range of the buffer. Because these are the number of scalar loads that
> > > would
> > have
> > > been done per iteration and so we can relax the alignment requirement
> > > based on
> > > the information known in the inbounds case.
> >
> > Huh, I guess I fail to parse this. ref_within_array_bound computes
> > whether we know all _scalar_ accesses are within bounds.
> >
> > At this point we know *alignment_support_scheme ==
> > dr_unaligned_unsupported which means the access is not aligned
> > but the misalignment is a multiple of the vector type unit size.
> >
> > How can the access not be aligned then? Only when target_alignment
> > > type unit size? But I think this implies a grouped access, thus
> > ncopies > 1 which is why I think you need to consider the VF?
> >
>
> But if we do that means we effectively can't vectorize a simple group access
> Such as
>
> if (vect_a[i] > x || vect_a[i+1] > x)
> break;
>
> even though we know it's safe to do so. If vect_a is V4SI, requiring 256 bit
> alignment
> on a 128-bit target alignment just doesn't work no?
Well, if you set ->target_alignment to 256 then peeling should make that
work. There's then the missed optimization (or wrong-code in other
context) when we decide on whether to emit an aligned or unaligned load
because we do that based on dr_aligned (to target_alignment) rather than
on byte alignment with respect to mode alignment of the load.
> But in the loop:
>
> #define N 1024
> unsigned vect_a[N];
> unsigned vect_b[N];
>
> unsigned test4(unsigned x)
> {
> unsigned ret = 0;
> for (int i = 1; i < (N/2)-1; i+=2)
> {
> if (vect_a[i] > x || vect_a[i+1] > x)
> break;
> vect_a[i] += x * vect_b[i];
> vect_a[i+1] += x * vect_b[i+1];
> }
> return ret;
> }
>
> Should be safe to vectorize though..
Yes.
> Or are you saying that in order to vectorize this we have to raise the
> alignment to 32 bytes?
For the former loop, yes.
> I.e. override the targets preferred alignment if the new alignment is a
> multiple of the preferred?
Yes. That's where we need to look at the VF - even a non-grouped load
can end up with two vector loads needed when there are multiple types
in the loop.
> > > > > > > + if (*memory_access_type == VMAT_LOAD_STORE_LANES
> > > > > > > + && (STMT_VINFO_GROUPED_ACCESS (stmt_info) || slp_node))
> > > > > > > + {
> > > > > > > + /* ?? This needs updating whenever we support slp group > 1.
> > > > > > > */
> > > >
> > > > ?
> > > >
> > > > > > > + auto group_size = DR_GROUP_SIZE (stmt_info);
> > > > > > > + /* For the inbound case it's enough to check for an alignment
> > > > > > > of
> > > > > > > + GROUP_SIZE * element size. */
> > > > > > > + if (inbounds
> > > > > > > + && (*misalignment % (group_size * unit_size)) == 0
> > > > > > > + && group_size % 2 == 0)
> > > >
> > > > It looks fishy that you do not need to consider the VF here.
> > >
> > > This and the ? above from you are related. I don't think we have to
> > > consider VF
> > here
> > > since early break vectorization does not support slp group size > 1. So
> > > the VF can
> > at
> > > this time never be larger than a single vector. The note is there to
> > > update this
> > when we do.
> >
> > You don't need a SLP group-size > 1 to have more than one vector read.
> > Consider
> >
> > if (long_long[i])
> > early break;
> > char[i] = char2[i];
> >
> > where you with V2DI and V16QI end up with a VF of 16 and 4 V2DI loads
> > to compute the early break condition? That said, implying, without
> > double-checking, some constraints on early break vectorization here
> > looks a bit fragile - if there are unwritten constraints we'd better
> > check them here. That also makes it easier to understand.
> >
>
> I did consider this case of widening/unpacking. But I had only considered
> the opposite case where the char was inside the if.
>
> Yes I can see why ncopies needs to be considered here too.
>
> > > >
> > > > > > > + {
> > > > > > > + if (dump_enabled_p ())
> > > > > > > + dump_printf_loc (MSG_NOTE, vect_location,
> > > > > > > + "Assuming grouped access is aligned due to
> > > > > > > load "
> > > > > > > + "lanes, overriding alignment scheme\n");
> > > > > > > +
> > > > > > > + *alignment_support_scheme = dr_unaligned_supported;
> > > > > > > + }
> > > > > > > + }
> > > > > > > + /* If we have a linear access and know the misalignment
> > > > > > > and know we
> > > > won't
> > > > > > > + read out of bounds then it's also ok if the misalignment is a
> > multiple
> > > > > > > + of the element size. We get this when the loop has known
> > misalignments
> > > > > > > + but the misalignments of the DRs can't be peeled to reach
> > > > > > > mutual
> > > > > > > + alignment. Because the misalignments are known however we
> > also know
> > > > > > > + that versioning won't work. If the target does support
> > > > > > > unaligned
> > > > > > > + accesses and we know we are free to read the entire buffer then
> > we
> > > > > > > + can allow the unaligned access if it's on elements for an
> > > > > > > early break
> > > > > > > + condition. */
> > > >
> > > > See above - one of the PRs was exactly that we ovverread a decl even
> > > > if the original scalar accesses are all in-bounds. So we can't allow
> > > > this.
> > > >
> > >
> > > But that PR was only because it was misaligned to an unnatural alignment
> > > of the
> > type
> > > which resulted one element possibly being split across a page boundary.
> > > That is
> > still
> > > rejected here.
> >
> > By the multiple_p (*misalignment, unit_size) condition? Btw, you should
> > check known_alignment_for_access_p before using *misalignment or
> > check *misalignment != DR_MISALIGNMENT_UNKNOWN.
> >
> > > This check is saying that if you have mutual misalignment, but each DR is
> > > in itself
> > still
> > > aligned to the type's natural alignment and we know that all access are
> > > in bounds
> > that
> > > we should be safe to vectorize as we won't accidentally access an invalid
> > > part of
> > memory.
> >
> > See above - when we do two vector loads we only know the first one had
> > a corresponding scalar access, the second vector load can still be
> > out of bounds. But when we can align to N * unit_size then it's fine
> > the vectorize. And only then (target_alignment > unit_size) we can have
> > *misalignment > unit_size?
> >
>
> Where N here I guess is ncopies * nunits? i.e. number of scalar loads per
> iteration?
> or rather VF. With most group access this is quite large though..
Yep, that's why we ended up with the worry about PAGE_SIZE in the last
discussion.
> I think you'll probably answer my question above which clarifies this one.
>
> Thanks for the review.
>
> I think with the next reply I have enough to respin it properly.
Thanks - take your time.
Richard.
> Thanks,
> Tamar
>
> > Richard.
> >
> > > > > > > + else if (*memory_access_type != VMAT_GATHER_SCATTER
> > > > > > > + && inbounds)
> > > > > > > + {
> > > > > > > + if (dump_enabled_p ())
> > > > > > > + dump_printf_loc (MSG_NOTE, vect_location,
> > > > > > > + "Access will not read beyond buffer to due
> > > > > > > known
> > size "
> > > > > > > + "buffer, overriding alignment scheme\n");
> > > > > > > +
> > > > > > > + *alignment_support_scheme = dr_unaligned_supported;
> > > > > > > + }
> > > > > > > + }
> > > > > > > +
> > > > > > > if (*alignment_support_scheme == dr_unaligned_unsupported)
> > > > > > > {
> > > > > > > if (dump_enabled_p ())
> > > > > > > @@ -10520,6 +10583,68 @@ vectorizable_load (vec_info *vinfo,
> > > > > > > /* Transform. */
> > > > > > >
> > > > > > > dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info),
> > *first_dr_info =
> > > > > > NULL;
> > > > > > > +
> > > > > > > + /* Check if we support the operation if early breaks are
> > > > > > > needed. */
> > > > > > > + if (loop_vinfo
> > > > > > > + && LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> > > > > > > + && (memory_access_type == VMAT_GATHER_SCATTER
> > > > > > > + || memory_access_type == VMAT_STRIDED_SLP))
> > > > > > > + {
> > > > > > > + if (dump_enabled_p ())
> > > > > > > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > > > > > + "early break not supported: cannot peel for "
> > > > > > > + "alignment. With non-contiguous memory
> > vectorization"
> > > > > > > + " could read out of bounds at %G ",
> > > > > > > + STMT_VINFO_STMT (stmt_info));
> > > > > >
> > > > > > Hmm, this is now more restrictive than the original check in
> > > > > > vect_analyze_early_break_dependences because it covers all accesses.
> > > > > > The simplest fix would be to leave it there.
> > > > > >
> > > > >
> > > > > It covers all loads, which you're right is more restrictive, I think
> > > > > it just needs to
> > be
> > > > > moved inside if (costing_p && dr_info->need_peeling_for_alignment)
> > > > > block
> > > > > below it though.
> > > > >
> > > > > Delaying this to here instead of earlier has allowed us to vectorize
> > > > gcc.dg/vect/bb-slp-pr65935.c
> > > > > Which now vectorizes after the inner loops are unrolled.
> > > > >
> > > > > Are you happy with just moving it down?
> > > >
> > > > OK.
> > > >
> > > > > > > + return false;
> > > > > > > + }
> > > > > > > +
> > > > > > > + /* If this DR needs peeling for alignment for correctness, we
> > > > > > > must
> > > > > > > + ensure the target alignment is a constant power-of-two
> > > > > > > multiple of the
> > > > > > > + amount read per vector iteration (overriding the above hook
> > > > > > > where
> > > > > > > + necessary). We don't support group loads, which would have
> > > > > > > been
> > filterd
> > > > > > > + out in the check above. For now it means we don't have to
> > > > > > > look at the
> > > > > > > + group info and just check that the load is continguous and
> > > > > > > can just use
> > > > > > > + dr_info. For known size buffers we still need to check if
> > > > > > > the vector
> > > > > > > + is misaligned and if so we need to peel. */
> > > > > > > + if (costing_p && dr_info->need_peeling_for_alignment)
> > > > > >
> > > > > > dr_peeling_alignment ()
> > > > > >
> > > > > > I think this belongs in get_load_store_type, specifically I think
> > > > > > we want to only allow VMAT_CONTIGUOUS for
> > need_peeling_for_alignment
> > > > refs
> > > > > > and by construction dr_aligned should ensure type size alignment
> > > > > > (by altering target_alignment for the respective refs). Given that
> > > > > > both VF and vector type are fixed at
> > > > > > vect_analyze_data_refs_alignment
> > > > > > time we should be able to compute the appropriate target alignment
> > > > > > there (I'm not sure we support peeling of more than VF-1 iterations
> > > > > > though).
> > > > > >
> > > > > > > + {
> > > > > > > + /* Vector size in bytes. */
> > > > > > > + poly_uint64 safe_align = tree_to_poly_uint64
> > > > > > > (TYPE_SIZE_UNIT
> > > > (vectype));
> > > > > > > +
> > > > > > > + /* We can only peel for loops, of course. */
> > > > > > > + gcc_checking_assert (loop_vinfo);
> > > > > > > +
> > > > > > > + auto num_vectors = ncopies;
> > > > > > > + if (!pow2p_hwi (num_vectors))
> > > > > > > + {
> > > > > > > + if (dump_enabled_p ())
> > > > > > > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > > > > > + "non-power-of-two num vectors %u "
> > > > > > > + "for DR needing peeling for "
> > > > > > > + "alignment at %G",
> > > > > > > + num_vectors, STMT_VINFO_STMT (stmt_info));
> > > > > > > + return false;
> > > > > > > + }
> > > > > > > +
> > > > > > > + safe_align *= num_vectors;
> > > > > > > + if (known_gt (safe_align, (unsigned)param_min_pagesize)
> > > > > > > + /* For VLA we don't support PFA when any unrolling needs to be
> > done.
> > > > > > > + We could though but too much work for GCC 15. For now we
> > assume a
> > > > > > > + vector is not larger than a page size so allow single
> > > > > > > loads. */
> > > > > > > + && (num_vectors > 1 && !vf.is_constant ()))
> > > > > > > + {
> > > > > > > + if (dump_enabled_p ())
> > > > > > > + {
> > > > > > > + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > > > > > + "alignment required for correctness (");
> > > > > > > + dump_dec (MSG_MISSED_OPTIMIZATION, safe_align);
> > > > > > > + dump_printf (MSG_NOTE, ") may exceed page size\n");
> > > > > > > + }
> > > > > > > + return false;
> > > > > > > + }
> > > > > > > + }
> > > > > > > +
> > > > > > > ensure_base_align (dr_info);
> > > > > > >
> > > > > > > if (memory_access_type == VMAT_INVARIANT)
> > > > > > > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > > > > > > index
> > > > > >
> > > >
> > 44d3a1d46c409597f1e67a275211a1da414fc7c7..6ef97ee84336ac9a0e192214
> > > > > > 5f3d418436d709f4 100644
> > > > > > > --- a/gcc/tree-vectorizer.h
> > > > > > > +++ b/gcc/tree-vectorizer.h
> > > > > > > @@ -1998,6 +1998,19 @@ dr_target_alignment (dr_vec_info *dr_info)
> > > > > > > }
> > > > > > > #define DR_TARGET_ALIGNMENT(DR) dr_target_alignment (DR)
> > > > > > >
> > > > > > > +/* Return if the stmt_vec_info requires peeling for alignment.
> > > > > > > */
> > > > > > > +inline bool
> > > > > > > +dr_peeling_alignment (stmt_vec_info stmt_info)
> > > > > > > +{
> > > > > > > + dr_vec_info *dr_info;
> > > > > > > + if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> > > > > > > + dr_info = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT
> > > > (stmt_info));
> > > > > >
> > > > > > ... but checking it on the first only.
> > > > >
> > > > > I did it that way because I was under the assumption that group loads
> > > > > could
> > be
> > > > relaxed
> > > > > to e.g. element wise or some other form. If it's the case that the
> > > > > group cannot
> > > > grow or
> > > > > be changed I could instead set it only on the first access and then
> > > > > not need to
> > > > check it
> > > > > elsewhere if you prefer.
> > > >
> > > > I've merely noted the discrepancy - consider
> > > >
> > > > if (a[2*i+1])
> > > > early break;
> > > > ... = a[2*i];
> > > >
> > > > then you'd set ->needs_peeling on the 2nd group member but
> > > > dr_peeling_alignment would always check the first. So yes, I think
> > > > we always want to set the flag on the first element of a grouped
> > > > access. We're no longer splitting groups when loop vectorizing.
> > > >
> > >
> > > Ack,
> > >
> > > Will update.
> > >
> > > Thanks,
> > > Tamar
> > >
> > > > Richard.
> > > >
> > > > >
> > > > > Thanks,
> > > > > Tamar
> > > > > >
> > > > > > > + else
> > > > > > > + dr_info = STMT_VINFO_DR_INFO (stmt_info);
> > > > > > > +
> > > > > > > + return dr_info->need_peeling_for_alignment;
> > > > > > > +}
> > > > > > > +
> > > > > > > inline void
> > > > > > > set_dr_target_alignment (dr_vec_info *dr_info, poly_uint64 val)
> > > > > > > {
> > > > > > >
> > > > > >
> > > > > > --
> > > > > > Richard Biener <[email protected]>
> > > > > > SUSE Software Solutions Germany GmbH,
> > > > > > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > > > Nuernberg)
> > > > >
> > > >
> > > > --
> > > > Richard Biener <[email protected]>
> > > > SUSE Software Solutions Germany GmbH,
> > > > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > > > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG
> > Nuernberg)
> > >
> >
> > --
> > Richard Biener <[email protected]>
> > SUSE Software Solutions Germany GmbH,
> > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
>
--
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)