On Mon, 3 Mar 2025, Tamar Christina wrote:

> > >    /* For now assume all conditional loads/stores support unaligned
> > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > > index 6bbb16beff2c627fca11a7403ba5ee3a5faa21c1..b661deeeed400e5826fc1c4f70957b335d1741fa 100644
> > > --- a/gcc/tree-vect-stmts.cc
> > > +++ b/gcc/tree-vect-stmts.cc
> > > @@ -2597,6 +2597,128 @@ get_load_store_type (vec_info  *vinfo, stmt_vec_info stmt_info,
> > >        return false;
> > >      }
> > >
> > > +  /* If this DR needs alignment for correctness, we must ensure the target
> > > +     alignment is a constant power-of-two multiple of the amount read per
> > > +     vector iteration or force masking.  */
> > > +  if (dr_safe_speculative_read_required (stmt_info))
> > > +    {
> > > +      /* We can only peel for loops, of course.  */
> > > +      gcc_checking_assert (loop_vinfo);
> > > +
> > > +      /* Check if we support the operation if early breaks are needed.  Here we
> > > +  must ensure that we don't access any more than the scalar code would
> > > +  have.  A masked operation would ensure this, so for these load types
> > > +  force masking.  */
> > > +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> > > +   && (*memory_access_type == VMAT_GATHER_SCATTER
> > > +       || *memory_access_type == VMAT_STRIDED_SLP))
> > > + {
> > > +   if (dump_enabled_p ())
> > > +     dump_printf_loc (MSG_NOTE, vect_location,
> > > +                      "early break not supported: cannot peel for "
> > > +                      "alignment. With non-contiguous memory
> > vectorization"
> > > +                      " could read out of bounds at %G ",
> > > +                      STMT_VINFO_STMT (stmt_info));
> > > +   LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
> > > + }
> > > +
> > > +      auto target_alignment
> > > + = DR_TARGET_ALIGNMENT (STMT_VINFO_DR_INFO (stmt_info));
> > > +      unsigned HOST_WIDE_INT target_align;
> > > +      bool inbounds
> > > + = DR_SCALAR_KNOWN_BOUNDS (STMT_VINFO_DR_INFO (stmt_info));
> > > +
> > > +      /* If the scalar loop is known to be in bounds, and we're using scalar
> > > +  accesses then there's no need to check further.  */
> > > +      if (inbounds
> > > +   && *memory_access_type == VMAT_ELEMENTWISE)
> > > + {
> > > +   *alignment_support_scheme = dr_aligned;
> > 
> > Nothing should look at *alignment_support_scheme for VMAT_ELEMENTWISE.
> > Did you actually need this adjustment?
> > 
> 
> Yes, bitfields are relaxed a few lines up from contiguous to this:
> 
>             if (SLP_TREE_LANES (slp_node) == 1)
>               {
>                 *memory_access_type = VMAT_ELEMENTWISE;
>                 if (dump_enabled_p ())
>                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>                                    "single-element interleaving not supported 
> "
>                                    "for not adjacent vector loads, using "
>                                    "elementwise access\n");
>               }
> 
> This means we then reach:
>       if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
> 
> we bail out because the permutes still exist.  The code relaxed the load to 
> elements but
> never removed the permutes or any associated information.
> 
> If the permutes are removed or some other workaround is applied, you then hit
> 
>       if (!group_aligned && inbounds)
>       LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
> 
> This is because these aren't group loads.  Since the original loads didn't
> have any misalignment they never needed peeling and as such are
> dr_unaligned_supported.
> 
> So the only way to avoid checking elementwise is by guarding the top level
> with
> 
>   if (dr_safe_speculative_read_required (stmt_info)
>       && *alignment_support_scheme == dr_aligned)
>     {
> 
> Instead of just 
> 
>   if (dr_safe_speculative_read_required (stmt_info))
>     {
> 
> Which I wasn't sure was the right thing to do...  Anyway, if I do that I
> can remove...
> 
> > > +   return true;
> > > + }
> > > +
> > > +      bool group_aligned = false;
> > > +      if (*alignment_support_scheme == dr_aligned
> > > +   && target_alignment.is_constant (&target_align)
> > > +   && nunits.is_constant ())
> > > + {
> > > +   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> > > +   auto vectype_size
> > > +     = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
> > > +   poly_uint64 required_alignment = vf * vectype_size;
> > > +   /* If we have a grouped access we require that the alignment be N * elem.  */
> > > +   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> > > +     required_alignment *=
> > > +         DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
> > > +   if (!multiple_p (target_alignment, required_alignment))
> > > +     {
> > > +       if (dump_enabled_p ())
> > > +         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > +                      "desired alignment %wu not met. Instead got %wu "
> > > +                      "for DR alignment at %G",
> > > +                      required_alignment.to_constant (),
> > > +                      target_align, STMT_VINFO_STMT (stmt_info));
> > > +       return false;
> > > +     }
> > > +
> > > +   if (!pow2p_hwi (target_align))
> > > +     {
> > > +       if (dump_enabled_p ())
> > > +         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > +                      "non-power-of-two vector alignment %wd "
> > > +                      "for DR alignment at %G",
> > > +                      target_align, STMT_VINFO_STMT (stmt_info));
> > > +       return false;
> > > +     }
> > > +
> > > +   /* For VLA we have to insert a runtime check that the vector loads
> > > +      per iteration don't exceed a page size.  For now we can use
> > > +      POLY_VALUE_MAX as a proxy as we can't peel for VLA.  */
> > > +   if (known_gt (required_alignment, (unsigned)param_min_pagesize))
> > > +     {
> > > +       if (dump_enabled_p ())
> > > +         {
> > > +           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > +                        "alignment required for correctness (");
> > > +           dump_dec (MSG_MISSED_OPTIMIZATION, required_alignment);
> > > +           dump_printf (MSG_NOTE, ") may exceed page size\n");
> > > +         }
> > > +       return false;
> > > +     }
> > > +
> > > +   group_aligned = true;
> > > + }
> > > +
> > > +      /* There are multiple loads that have a misalignment that we couldn't
> > > +  align.  We would need LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P to
> > > +  vectorize. */
> > > +      if (!group_aligned)
> > > + LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
> > 
> > I think we need to fail here unless scalar-access-in-bounds.
> > 
> > > +
> > > +      /* When using a group access the first element may be aligned but the
> > > +  subsequent loads may not be.  For LOAD_LANES since the loads are based
> > > +  on the first DR then all loads in the group are aligned.  For
> > > +  non-LOAD_LANES this is not the case. In particular a load + blend when
> > > +  there are gaps can have the non first loads issued unaligned, even
> > > +  partially overlapping the memory of the first load in order to simplify
> > > +  the blend.  This is what the x86_64 backend does for instance.  As
> > > +  such only the first load in the group is aligned, the rest are not.
> > > +  Because of this the permutes may break the alignment requirements that
> > > +  have been set, and as such we should, for now, reject them.  */
> > > +      if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
> > > + {
> > > +   if (dump_enabled_p ())
> > > +     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > +                      "loads with load permutations not supported for "
> > > +                      "speculative early break loads without partial "
> > > +                      "vectors for %G",
> > > +                      STMT_VINFO_STMT (stmt_info));
> > > +   LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
> > 
> > again, I think this doesn't save us.  Specifically ...
> > 
> > > + }
> > > +
> > > +      *alignment_support_scheme = dr_aligned;
> > 
> > ... we must not simply claim the access is aligned when it wasn't
> > analyzed as such.  If we committed to try peeling for a high
> > target alignment we can't simply walk back here either.
> > 
> 
> ...This.
> 
> That also addresses your other comment that once we commit to peeling we
> can't back out.
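> 
> Concretely, something along these lines (a rough, untested sketch only; the
> rest of the block stays as in the patch and is elided here):
> 
>   /* Only enter the block when the access was actually analyzed as
>      aligned, so VMAT_ELEMENTWISE / dr_unaligned_supported accesses
>      skip it entirely.  */
>   if (dr_safe_speculative_read_required (stmt_info)
>       && *alignment_support_scheme == dr_aligned)
>     {
>       /* We can only peel for loops, of course.  */
>       gcc_checking_assert (loop_vinfo);
>       ...
>       /* The VMAT_ELEMENTWISE early return and the final
>          *alignment_support_scheme = dr_aligned assignment are dropped.  */
>     }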
> 
> Are those changes ok?

Yes, that sounds good to me.

Richard.

> Thanks,
> Tamar
> 
> > Richard.
> > 
> > > +    }
> > > +
> > >    if (*alignment_support_scheme == dr_unaligned_unsupported)
> > >      {
> > >        if (dump_enabled_p ())
> > > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > > index b0cb081cba0ae8b11fbfcfcb8c6d440ec451ccb5..97caf61b345735d297ec49fd6ca64797435b46fc 100644
> > > --- a/gcc/tree-vectorizer.h
> > > +++ b/gcc/tree-vectorizer.h
> > > @@ -1281,7 +1281,11 @@ public:
> > >
> > >    /* Set by early break vectorization when this DR needs peeling for alignment
> > >       for correctness.  */
> > > -  bool need_peeling_for_alignment;
> > > +  bool safe_speculative_read_required;
> > > +
> > > +  /* Set by early break vectorization when this DR's scalar accesses are known
> > > +     to be in bounds of a loop with known bounds.  */
> > > +  bool scalar_access_known_in_bounds;
> > >
> > >    tree base_decl;
> > >
> > > @@ -1997,6 +2001,35 @@ dr_target_alignment (dr_vec_info *dr_info)
> > >    return dr_info->target_alignment;
> > >  }
> > >  #define DR_TARGET_ALIGNMENT(DR) dr_target_alignment (DR)
> > > +#define DR_SCALAR_KNOWN_BOUNDS(DR) (DR)->scalar_access_known_in_bounds
> > > +
> > > +/* Return whether the stmt_vec_info requires peeling for alignment.  */
> > > +inline bool
> > > +dr_safe_speculative_read_required (stmt_vec_info stmt_info)
> > > +{
> > > +  dr_vec_info *dr_info;
> > > +  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> > > +    dr_info = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (stmt_info));
> > > +  else
> > > +    dr_info = STMT_VINFO_DR_INFO (stmt_info);
> > > +
> > > +  return dr_info->safe_speculative_read_required;
> > > +}
> > > +
> > > +/* Set the safe_speculative_read_required for the stmt_vec_info; if a group
> > > +   access then set it on the first element, otherwise set it on the DR directly.  */
> > > +inline void
> > > +dr_set_safe_speculative_read_required (stmt_vec_info stmt_info,
> > > +                                bool requires_alignment)
> > > +{
> > > +  dr_vec_info *dr_info;
> > > +  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> > > +    dr_info = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (stmt_info));
> > > +  else
> > > +    dr_info = STMT_VINFO_DR_INFO (stmt_info);
> > > +
> > > +  dr_info->safe_speculative_read_required = requires_alignment;
> > > +}
> > >
> > >  inline void
> > >  set_dr_target_alignment (dr_vec_info *dr_info, poly_uint64 val)
> > >
> > >
> > >
> > 
> > --
> > Richard Biener <rguent...@suse.de>
> > SUSE Software Solutions Germany GmbH,
> > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
