> > /* For now assume all conditional loads/stores support unaligned
> > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > index 6bbb16beff2c627fca11a7403ba5ee3a5faa21c1..b661deeeed400e5826fc1c4f70957b335d1741fa 100644
> > --- a/gcc/tree-vect-stmts.cc
> > +++ b/gcc/tree-vect-stmts.cc
> > @@ -2597,6 +2597,128 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> >        return false;
> >      }
> >
> > +  /* If this DR needs alignment for correctness, we must ensure the target
> > +     alignment is a constant power-of-two multiple of the amount read per
> > +     vector iteration or force masking.  */
> > +  if (dr_safe_speculative_read_required (stmt_info))
> > +    {
> > +      /* We can only peel for loops, of course.  */
> > +      gcc_checking_assert (loop_vinfo);
> > +
> > +      /* Check if we support the operation if early breaks are needed.  Here
> > +         we must ensure that we don't access any more than the scalar code
> > +         would have.  A masked operation would ensure this, so for these load
> > +         types force masking.  */
> > +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> > +          && (*memory_access_type == VMAT_GATHER_SCATTER
> > +              || *memory_access_type == VMAT_STRIDED_SLP))
> > +        {
> > +          if (dump_enabled_p ())
> > +            dump_printf_loc (MSG_NOTE, vect_location,
> > +                             "early break not supported: cannot peel for "
> > +                             "alignment. With non-contiguous memory vectorization"
> > +                             " could read out of bounds at %G ",
> > +                             STMT_VINFO_STMT (stmt_info));
> > +          LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
> > +        }
> > +
> > +      auto target_alignment
> > +        = DR_TARGET_ALIGNMENT (STMT_VINFO_DR_INFO (stmt_info));
> > +      unsigned HOST_WIDE_INT target_align;
> > +      bool inbounds
> > +        = DR_SCALAR_KNOWN_BOUNDS (STMT_VINFO_DR_INFO (stmt_info));
> > +
> > +      /* If the scalar loop is known to be in bounds, and we're using scalar
> > +         accesses, then there's no need to check further.  */
> > +      if (inbounds
> > +          && *memory_access_type == VMAT_ELEMENTWISE)
> > +        {
> > +          *alignment_support_scheme = dr_aligned;
>
> Nothing should look at *alignment_support_scheme for VMAT_ELEMENTWISE.
> Did you actually need this adjustment?
>
Yes, bitfields are relaxed a few lines up from contiguous to this:

  if (SLP_TREE_LANES (slp_node) == 1)
    {
      *memory_access_type = VMAT_ELEMENTWISE;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "single-element interleaving not supported "
                         "for not adjacent vector loads, using "
                         "elementwise access\n");
    }

This means we then reach:

  if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())

where we bail out because the permutes still exist: the code relaxed the load
to elementwise accesses but never removed the permutes or any associated
information.  If the permutes are removed (or worked around some other way),
you then hit

  if (!group_aligned && inbounds)
    LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;

because these aren't group loads.  Since the original load didn't have any
misalignment it never needed peeling and as such is dr_unaligned_supported.

So the only way to avoid checking elementwise is by guarding the top level with

  if (dr_safe_speculative_read_required (stmt_info)
      && *alignment_support_scheme == dr_aligned)
    {

instead of just

  if (dr_safe_speculative_read_required (stmt_info))
    {

which I wasn't sure was the right thing to do...

Anyway, if I do that I can remove...

> > +          return true;
> > +        }
> > +
> > +      bool group_aligned = false;
> > +      if (*alignment_support_scheme == dr_aligned
> > +          && target_alignment.is_constant (&target_align)
> > +          && nunits.is_constant ())
> > +        {
> > +          poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> > +          auto vectype_size
> > +            = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
> > +          poly_uint64 required_alignment = vf * vectype_size;
> > +          /* If we have a grouped access we require that the alignment be
> > +             N * elem.  */
> > +          if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> > +            required_alignment *=
> > +              DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
> > +          if (!multiple_p (target_alignment, required_alignment))
> > +            {
> > +              if (dump_enabled_p ())
> > +                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > +                                 "desired alignment %wu not met. Instead got %wu "
> > +                                 "for DR alignment at %G",
> > +                                 required_alignment.to_constant (),
> > +                                 target_align, STMT_VINFO_STMT (stmt_info));
> > +              return false;
> > +            }
> > +
> > +          if (!pow2p_hwi (target_align))
> > +            {
> > +              if (dump_enabled_p ())
> > +                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > +                                 "non-power-of-two vector alignment %wd "
> > +                                 "for DR alignment at %G",
> > +                                 target_align, STMT_VINFO_STMT (stmt_info));
> > +              return false;
> > +            }
> > +
> > +          /* For VLA we have to insert a runtime check that the vector loads
> > +             per iteration don't exceed a page size.  For now we can use
> > +             POLY_VALUE_MAX as a proxy as we can't peel for VLA.  */
> > +          if (known_gt (required_alignment, (unsigned)param_min_pagesize))
> > +            {
> > +              if (dump_enabled_p ())
> > +                {
> > +                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > +                                   "alignment required for correctness (");
> > +                  dump_dec (MSG_MISSED_OPTIMIZATION, required_alignment);
> > +                  dump_printf (MSG_NOTE, ") may exceed page size\n");
> > +                }
> > +              return false;
> > +            }
> > +
> > +          group_aligned = true;
> > +        }
> > +
> > +      /* There are multiple loads that have a misalignment that we couldn't
> > +         align.  We would need LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P to
> > +         vectorize.  */
> > +      if (!group_aligned)
> > +        LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
>
> I think we need to fail here unless scalar-access-in-bounds.
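
(As a side note, to make the requirement computed in the hunk above concrete,
with purely illustrative numbers rather than anything taken from the patch:
for a load group with DR_GROUP_SIZE 2, 4-byte elements and VF 4,
required_alignment works out to 4 * 4 * 2 = 32 bytes, so DR_TARGET_ALIGNMENT
has to be a power-of-two multiple of 32 and required_alignment has to stay at
or below param_min_pagesize for group_aligned to be set.)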
> > +
> > +      /* When using a group access the first element may be aligned but the
> > +         subsequent loads may not be.  For LOAD_LANES, since the loads are
> > +         based on the first DR, all loads in the group are aligned.  For
> > +         non-LOAD_LANES this is not the case.  In particular a load + blend
> > +         when there are gaps can have the non-first loads issued unaligned,
> > +         even partially overlapping the memory of the first load in order to
> > +         simplify the blend.  This is what the x86_64 backend does for
> > +         instance.  As such only the first load in the group is aligned, the
> > +         rest are not.  Because of this the permutes may break the alignment
> > +         requirements that have been set, and as such we should, for now,
> > +         reject them.  */
> > +      if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
> > +        {
> > +          if (dump_enabled_p ())
> > +            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > +                             "loads with load permutations not supported for "
> > +                             "speculative early break loads without partial "
> > +                             "vectors for %G",
> > +                             STMT_VINFO_STMT (stmt_info));
> > +          LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
>
> again, I think this doesn't save us.  Specifically ...
>
> > +        }
> > +
> > +      *alignment_support_scheme = dr_aligned;
>
> ... we must not simply claim the access is aligned when it wasn't
> analyzed as such.  If we committed to try peeling for a high
> target alignment we can't simply walk back here either.
>

...This.  That also solves your other comment that once we commit to peeling
we can't back out.

Are those changes ok?

Thanks,
Tamar

> Richard.
>
> > +    }
> > +
> >    if (*alignment_support_scheme == dr_unaligned_unsupported)
> >      {
> >        if (dump_enabled_p ())
> > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > index b0cb081cba0ae8b11fbfcfcb8c6d440ec451ccb5..97caf61b345735d297ec49fd6ca64797435b46fc 100644
> > --- a/gcc/tree-vectorizer.h
> > +++ b/gcc/tree-vectorizer.h
> > @@ -1281,7 +1281,11 @@ public:
> >
> >    /* Set by early break vectorization when this DR needs peeling for
> >       alignment for correctness.  */
> > -  bool need_peeling_for_alignment;
> > +  bool safe_speculative_read_required;
> > +
> > +  /* Set by early break vectorization when this DR's scalar accesses are
> > +     known to be inbounds of a known bounds loop.  */
> > +  bool scalar_access_known_in_bounds;
> >
> >    tree base_decl;
> >
> > @@ -1997,6 +2001,35 @@ dr_target_alignment (dr_vec_info *dr_info)
> >    return dr_info->target_alignment;
> >  }
> >  #define DR_TARGET_ALIGNMENT(DR) dr_target_alignment (DR)
> > +#define DR_SCALAR_KNOWN_BOUNDS(DR) (DR)->scalar_access_known_in_bounds
> > +
> > +/* Return if the stmt_vec_info requires peeling for alignment.  */
> > +inline bool
> > +dr_safe_speculative_read_required (stmt_vec_info stmt_info)
> > +{
> > +  dr_vec_info *dr_info;
> > +  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> > +    dr_info = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (stmt_info));
> > +  else
> > +    dr_info = STMT_VINFO_DR_INFO (stmt_info);
> > +
> > +  return dr_info->safe_speculative_read_required;
> > +}
> > +
> > +/* Set the safe_speculative_read_required for the stmt_vec_info; for a group
> > +   access set it on the first element, otherwise set it on the DR directly.  */
> > +inline void
> > +dr_set_safe_speculative_read_required (stmt_vec_info stmt_info,
> > +                                       bool requires_alignment)
> > +{
> > +  dr_vec_info *dr_info;
> > +  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> > +    dr_info = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (stmt_info));
> > +  else
> > +    dr_info = STMT_VINFO_DR_INFO (stmt_info);
> > +
> > +  dr_info->safe_speculative_read_required = requires_alignment;
> > +}
> >
> >  inline void
> >  set_dr_target_alignment (dr_vec_info *dr_info, poly_uint64 val)
>
> --
> Richard Biener <rguent...@suse.de>
> SUSE Software Solutions Germany GmbH,
> Frankenstrasse 146, 90461 Nuernberg, Germany;
> GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)