On Mon, 3 Mar 2025, Tamar Christina wrote:

> > > /* For now assume all conditional loads/stores support unaligned
> > > diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
> > > index 6bbb16beff2c627fca11a7403ba5ee3a5faa21c1..b661deeeed400e5826fc1c4f70957b335d1741fa 100644
> > > --- a/gcc/tree-vect-stmts.cc
> > > +++ b/gcc/tree-vect-stmts.cc
> > > @@ -2597,6 +2597,128 @@ get_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
> > >        return false;
> > >      }
> > >
> > > +  /* If this DR needs alignment for correctness, we must ensure the target
> > > +     alignment is a constant power-of-two multiple of the amount read per
> > > +     vector iteration or force masking.  */
> > > +  if (dr_safe_speculative_read_required (stmt_info))
> > > +    {
> > > +      /* We can only peel for loops, of course.  */
> > > +      gcc_checking_assert (loop_vinfo);
> > > +
> > > +      /* Check if we support the operation if early breaks are needed.  Here we
> > > +         must ensure that we don't access any more than the scalar code would
> > > +         have.  A masked operation would ensure this, so for these load types
> > > +         force masking.  */
> > > +      if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
> > > +          && (*memory_access_type == VMAT_GATHER_SCATTER
> > > +              || *memory_access_type == VMAT_STRIDED_SLP))
> > > +        {
> > > +          if (dump_enabled_p ())
> > > +            dump_printf_loc (MSG_NOTE, vect_location,
> > > +                             "early break not supported: cannot peel for "
> > > +                             "alignment. With non-contiguous memory vectorization"
> > > +                             " could read out of bounds at %G ",
> > > +                             STMT_VINFO_STMT (stmt_info));
> > > +          LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
> > > +        }
> > > +
> > > +      auto target_alignment
> > > +        = DR_TARGET_ALIGNMENT (STMT_VINFO_DR_INFO (stmt_info));
> > > +      unsigned HOST_WIDE_INT target_align;
> > > +      bool inbounds
> > > +        = DR_SCALAR_KNOWN_BOUNDS (STMT_VINFO_DR_INFO (stmt_info));
> > > +
> > > +      /* If the scalar loop is known to be in bounds, and we're using scalar
> > > +         accesses then there's no need to check further.  */
> > > +      if (inbounds
> > > +          && *memory_access_type == VMAT_ELEMENTWISE)
> > > +        {
> > > +          *alignment_support_scheme = dr_aligned;
> >
> > Nothing should look at *alignment_support_scheme for VMAT_ELEMENTWISE.
> > Did you actually need this adjustment?
> >
>
> Yes, bitfields are relaxed a few lines up from contiguous to this:
>
>       if (SLP_TREE_LANES (slp_node) == 1)
>         {
>           *memory_access_type = VMAT_ELEMENTWISE;
>           if (dump_enabled_p ())
>             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
>                              "single-element interleaving not supported "
>                              "for not adjacent vector loads, using "
>                              "elementwise access\n");
>         }
>
> This means we then reach
>
>   if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
>
> where we bail out because the permutes still exist.  The code relaxed the
> load to elementwise access but never removed the permutes or any associated
> information.
>
> If the permutes are removed, or some other workaround is applied, you then hit
>
>   if (!group_aligned && inbounds)
>     LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
>
> because these aren't group loads.  And because the original loads didn't have
> any misalignment, they never needed peeling and as such are
> dr_unaligned_supported.
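>
> Purely to illustrate what group_aligned means here (the helper name and the
> concrete numbers below are only an assumed sketch, they are not part of the
> patch): the requirement is that the target alignment is a power of two and a
> multiple of the bytes accessed per vector iteration, i.e. VF * element size,
> further multiplied by the group size for a grouped access:
>
>   #include <stdbool.h>
>   #include <stdint.h>
>
>   /* Sketch of the group_aligned requirement; pass group_size == 1 for a
>      non-grouped access.  The patch additionally rejects the case where
>      required_alignment exceeds param_min_pagesize.  */
>   static bool
>   group_aligned_p (uint64_t target_align, uint64_t vf,
>                    uint64_t elem_size, uint64_t group_size)
>   {
>     uint64_t required_alignment = vf * elem_size * group_size;
>     bool pow2 = target_align != 0
>                 && (target_align & (target_align - 1)) == 0;
>     return pow2
>            && required_alignment != 0
>            && target_align % required_alignment == 0;
>   }
>
>   /* E.g. VF = 4, 4-byte elements, group size 2 gives a required alignment
>      of 32 bytes, so a 32- or 64-byte target alignment qualifies while a
>      16-byte one does not.  */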
>
> So the only way to avoid checking the elementwise case is by guarding the
> top level with
>
>   if (dr_safe_speculative_read_required (stmt_info)
>       && *alignment_support_scheme == dr_aligned)
>     {
>
> instead of just
>
>   if (dr_safe_speculative_read_required (stmt_info))
>     {
>
> which I wasn't sure was the right thing to do...  Anyway, if I do that I can
> remove...
>
> > > +          return true;
> > > +        }
> > > +
> > > +      bool group_aligned = false;
> > > +      if (*alignment_support_scheme == dr_aligned
> > > +          && target_alignment.is_constant (&target_align)
> > > +          && nunits.is_constant ())
> > > +        {
> > > +          poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
> > > +          auto vectype_size
> > > +            = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
> > > +          poly_uint64 required_alignment = vf * vectype_size;
> > > +          /* If we have a grouped access we require that the alignment be N * elem.  */
> > > +          if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> > > +            required_alignment *=
> > > +              DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
> > > +          if (!multiple_p (target_alignment, required_alignment))
> > > +            {
> > > +              if (dump_enabled_p ())
> > > +                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > +                                 "desired alignment %wu not met. Instead got %wu "
> > > +                                 "for DR alignment at %G",
> > > +                                 required_alignment.to_constant (),
> > > +                                 target_align, STMT_VINFO_STMT (stmt_info));
> > > +              return false;
> > > +            }
> > > +
> > > +          if (!pow2p_hwi (target_align))
> > > +            {
> > > +              if (dump_enabled_p ())
> > > +                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > +                                 "non-power-of-two vector alignment %wd "
> > > +                                 "for DR alignment at %G",
> > > +                                 target_align, STMT_VINFO_STMT (stmt_info));
> > > +              return false;
> > > +            }
> > > +
> > > +          /* For VLA we have to insert a runtime check that the vector loads
> > > +             per iteration don't exceed a page size.  For now we can use
> > > +             POLY_VALUE_MAX as a proxy as we can't peel for VLA.  */
> > > +          if (known_gt (required_alignment, (unsigned)param_min_pagesize))
> > > +            {
> > > +              if (dump_enabled_p ())
> > > +                {
> > > +                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > +                                   "alignment required for correctness (");
> > > +                  dump_dec (MSG_MISSED_OPTIMIZATION, required_alignment);
> > > +                  dump_printf (MSG_NOTE, ") may exceed page size\n");
> > > +                }
> > > +              return false;
> > > +            }
> > > +
> > > +          group_aligned = true;
> > > +        }
> > > +
> > > +      /* There are multiple loads that have a misalignment that we couldn't
> > > +         align.  We would need LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P to
> > > +         vectorize.  */
> > > +      if (!group_aligned)
> > > +        LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
> >
> > I think we need to fail here unless scalar-access-in-bounds.
> >
> > > +
> > > +      /* When using a group access the first element may be aligned but the
> > > +         subsequent loads may not be.  For LOAD_LANES since the loads are based
> > > +         on the first DR then all loads in the group are aligned.  For
> > > +         non-LOAD_LANES this is not the case.  In particular a load + blend when
> > > +         there are gaps can have the non first loads issued unaligned, even
> > > +         partially overlapping the memory of the first load in order to simplify
> > > +         the blend.  This is what the x86_64 backend does for instance.  As
> > > +         such only the first load in the group is aligned, the rest are not.
> > > +         Because of this the permutes may break the alignment requirements that
> > > +         have been set, and as such we should, for now, reject them.  */
> > > +      if (SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
> > > +        {
> > > +          if (dump_enabled_p ())
> > > +            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
> > > +                             "loads with load permutations not supported for "
> > > +                             "speculative early break loads without partial "
> > > +                             "vectors for %G",
> > > +                             STMT_VINFO_STMT (stmt_info));
> > > +          LOOP_VINFO_MUST_USE_PARTIAL_VECTORS_P (loop_vinfo) = true;
> >
> > again, I think this doesn't save us.  Specifically ...
> >
> > > +        }
> > > +
> > > +      *alignment_support_scheme = dr_aligned;
> >
> > ... we must not simply claim the access is aligned when it wasn't
> > analyzed as such.  If we committed to try peeling for a high
> > target alignment we can't simply walk back here either.
> >
>
> ...This.
>
> That also solves your other comment about how once we commit to peeling we
> can't back out.
>
> Are those changes ok?

Yes, that sounds good to me.

Richard.

> Thanks,
> Tamar
>
> > Richard.
> >
> > > +    }
> > > +
> > >    if (*alignment_support_scheme == dr_unaligned_unsupported)
> > >      {
> > >        if (dump_enabled_p ())
> > > diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
> > > index b0cb081cba0ae8b11fbfcfcb8c6d440ec451ccb5..97caf61b345735d297ec49fd6ca64797435b46fc 100644
> > > --- a/gcc/tree-vectorizer.h
> > > +++ b/gcc/tree-vectorizer.h
> > > @@ -1281,7 +1281,11 @@ public:
> > >
> > >    /* Set by early break vectorization when this DR needs peeling for alignment
> > >       for correctness.  */
> > > -  bool need_peeling_for_alignment;
> > > +  bool safe_speculative_read_required;
> > > +
> > > +  /* Set by early break vectorization when this DR's scalar accesses are known
> > > +     to be inbounds of a known bounds loop.  */
> > > +  bool scalar_access_known_in_bounds;
> > >
> > >    tree base_decl;
> > >
> > > @@ -1997,6 +2001,35 @@ dr_target_alignment (dr_vec_info *dr_info)
> > >    return dr_info->target_alignment;
> > >  }
> > >  #define DR_TARGET_ALIGNMENT(DR) dr_target_alignment (DR)
> > > +#define DR_SCALAR_KNOWN_BOUNDS(DR) (DR)->scalar_access_known_in_bounds
> > > +
> > > +/* Return if the stmt_vec_info requires peeling for alignment.  */
> > > +inline bool
> > > +dr_safe_speculative_read_required (stmt_vec_info stmt_info)
> > > +{
> > > +  dr_vec_info *dr_info;
> > > +  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> > > +    dr_info = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (stmt_info));
> > > +  else
> > > +    dr_info = STMT_VINFO_DR_INFO (stmt_info);
> > > +
> > > +  return dr_info->safe_speculative_read_required;
> > > +}
> > > +
> > > +/* Set the safe_speculative_read_required for the stmt_vec_info; if a group
> > > +   access then set on the first element, otherwise set on the DR directly.  */
> > > +inline void
> > > +dr_set_safe_speculative_read_required (stmt_vec_info stmt_info,
> > > +                                       bool requires_alignment)
> > > +{
> > > +  dr_vec_info *dr_info;
> > > +  if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
> > > +    dr_info = STMT_VINFO_DR_INFO (DR_GROUP_FIRST_ELEMENT (stmt_info));
> > > +  else
> > > +    dr_info = STMT_VINFO_DR_INFO (stmt_info);
> > > +
> > > +  dr_info->safe_speculative_read_required = requires_alignment;
> > > +}
> > >
> > >  inline void
> > >  set_dr_target_alignment (dr_vec_info *dr_info, poly_uint64 val)
> > >
> >
> > --
> > Richard Biener <rguent...@suse.de>
> > SUSE Software Solutions Germany GmbH,
> > Frankenstrasse 146, 90461 Nuernberg, Germany;
> > GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
>

--
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)