The following tries to reduce the differences between
SLP and non-SLP for single-element interleaving load classification.
This fixes another fallout of --param vect-force-slp=1.
PR tree-optimization/117606
* tree-vect-stmts.cc (get_group_load_store_type): For single
element interleaving also fall back to VMAT_ELEMENTWISE if
a left-over permutation isn't supported.
---
gcc/tree-vect-stmts.cc | 27 +++++++++++++++++++++++++--
1 file changed, 25 insertions(+), 2 deletions(-)
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index ab5ea038d1d..7ffee2c4cc8 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2082,8 +2082,9 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info
stmt_info,
*memory_access_type = VMAT_CONTIGUOUS;
/* If this is single-element interleaving with an element
- distance that leaves unused vector loads around punt - we
- at least create very sub-optimal code in that case (and
+ distance that leaves unused vector loads around fall back
+ to elementwise access if possible - we otherwise at least
+ create very sub-optimal code in that case (and
blow up memory, see PR65518). */
if (loop_vinfo
&& single_element_p
@@ -2110,6 +2111,28 @@ get_group_load_store_type (vec_info *vinfo,
stmt_vec_info stmt_info,
}
}
+ /* For single-element interleaving also fall back to elementwise
+ access in case we did not lower a permutation and cannot
+ code generate it. */
+ auto_vec<tree> temv;
+ unsigned n_perms;
+ if (loop_vinfo
+ && single_element_p
+ && SLP_TREE_LANES (slp_node) == 1
+ && (*memory_access_type == VMAT_CONTIGUOUS
+ || *memory_access_type == VMAT_CONTIGUOUS_REVERSE)
+ && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()
+ && !vect_transform_slp_perm_load
+ (loop_vinfo, slp_node, temv, NULL,
+ LOOP_VINFO_VECT_FACTOR (loop_vinfo), true, &n_perms))
+ {
+ *memory_access_type = VMAT_ELEMENTWISE;
+ if (dump_enabled_p ())
+ dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+ "single-element interleaving permutation not "
+ "supported, using elementwise access\n");
+ }
+
overrun_p = (loop_vinfo && gap != 0
&& *memory_access_type != VMAT_ELEMENTWISE);
if (overrun_p && vls_type != VLS_LOAD)
--
2.43.0