The following addresses one shortcoming of the SLP permute
optimization phase, which is that it rejects layouts that cannot
be reverted to layout 0 on an edge, even when layout 0 itself isn't
actually valid. This hurts in particular on VLA vector targets where
many permutes cannot be code generated. While for fixed-length targets
costs eventually sort things out so that a "no permutation"
setting is chosen, the above check prevents VLA targets from arriving there.
The patch fixes up internal_node_cost to anticipate the eliding of
redundant permutes and short-circuits the above requirement on edges
to partitions that do not have a preferred layout, the idea being that
we shouldn't need to revert to layout zero for those.
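In standalone form, the redundancy check that the new
is_redundant_permutation helper below implements amounts to roughly
the following (a minimal sketch for the grouped-access case, using
hypothetical plain types like group_info and a plain unsigned VF; the
real code operates on SLP_TREE_* accessors, stmt_vec_info and a
poly_uint64 vectorization factor, and handles the non-grouped
single-element case separately):

  #include <vector>

  struct group_info { unsigned size; unsigned gap; };

  /* A load permutation is redundant when it is the identity and
     either the vectorization factor is 1 or the node covers its
     whole access group with no gap, so unrolling cannot expose a
     permutation.  */
  static bool
  redundant_load_permutation (const std::vector<unsigned> &perm,
                              const group_info &group, unsigned vf)
  {
    for (unsigned i = 0; i < perm.size (); ++i)
      if (perm[i] != i)
        return false;
    return vf == 1 || (perm.size () == group.size && group.gap == 0);
  }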
The downside is that we can now arrive at invalid layout
configurations, which we deal with by simply giving up.
A testcase for this is gcc.dg/vect/vect-reduc-chain-4.c.
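For illustration, a loop of the following general shape (a
hypothetical sketch, not the actual testcase source) builds an SLP
reduction chain whose interleaved loads carry an identity load
permutation over the whole group; such a permutation is redundant, but
internal_node_cost previously still asked the target whether it could
code generate it, which can fail on VLA targets:

  double
  f (double *a, double *b, int n)
  {
    double sum = 0.;
    for (int i = 0; i < n; ++i)
      {
        /* Two-lane SLP group; the load permutation is the identity
           over the full group and thus redundant.  */
        sum += a[2*i] * b[2*i];
        sum += a[2*i+1] * b[2*i+1];
      }
    return sum;
  }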
Bootstrapped and tested on x86_64-unknown-linux-gnu. I verified that
with RVV we can now use VLA vectors for the testcase. The patch
will run through the riscv CI; some aarch64 testing would be appreciated.

Comments on the patch are welcome as well; I did try to understand the
whole operation of the SLP permute optimization but likely failed to
capture some essential part ;)
Thanks,
Richard.
PR tree-optimization/120687
* tree-vect-slp.cc (vect_optimize_slp_pass::is_redundant_permutation):
New function, split out from ...
(vect_optimize_slp_pass::remove_redundant_permutations): ... here.
Use it.
(vect_optimize_slp_pass::backward_pass): Return whether
all layouts are possible.
(vect_optimize_slp_pass::internal_node_cost): When the effective
load permutation is redundant, assume zero cost.
(vect_optimize_slp_pass::forward_pass): For a forward edge
to a partition without a preferred layout do not require layout
zero to be valid.
(vect_optimize_slp_pass::run): When not all layouts are possible
after the backward_pass, do nothing.
---
gcc/tree-vect-slp.cc | 131 +++++++++++++++++++++++++------------------
1 file changed, 75 insertions(+), 56 deletions(-)
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 9698709f567..4f276771d57 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -6267,13 +6267,14 @@ private:
slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
void forward_pass ();
- void backward_pass ();
+ bool backward_pass ();
/* Rematerialization. */
slp_tree get_result_with_layout (slp_tree, unsigned int);
void materialize ();
/* Clean-up. */
+ bool is_redundant_permutation (slp_tree, const load_permutation_t&);
void remove_redundant_permutations ();
/* Masked load lanes discovery. */
@@ -6748,6 +6749,46 @@ change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
vect_slp_permute (m_perms[out_layout_i], perm, true);
}
+/* Return whether PERM is a redundant load permutation for NODE. */
+
+bool
+vect_optimize_slp_pass::is_redundant_permutation (slp_tree node,
+ const load_permutation_t&
+ perm)
+{
+ if (!is_a <loop_vec_info> (m_vinfo))
+ return false;
+ loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
+ bool this_load_permuted = false;
+ for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
+ if (perm[j] != j)
+ {
+ this_load_permuted = true;
+ break;
+ }
+ /* When this isn't a grouped access we know it's single element
+ and contiguous. */
+ if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
+ {
+ if (!this_load_permuted
+ && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
+ || SLP_TREE_LANES (node) == 1))
+ return true;
+ return false;
+ }
+ stmt_vec_info first_stmt_info
+ = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
+ if (!this_load_permuted
+ /* The load requires permutation when unrolling exposes
+ a gap either because the group is larger than the SLP
+ group-size or because there is a gap between the groups. */
+ && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
+ || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
+ && DR_GROUP_GAP (first_stmt_info) == 0)))
+ return true;
+ return false;
+}
+
/* Check whether the target allows NODE to be rearranged so that the node's
output has layout OUT_LAYOUT_I. Return the cost of the change if so,
in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
@@ -6831,9 +6872,11 @@ vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
poly_uint64 vf = 1;
if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
- unsigned int n_perms;
- if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
- nullptr, vf, true, false, &n_perms))
+ unsigned int n_perms = 0;
+ if (!is_redundant_permutation (node, tmp_perm)
+ && !vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
+ nullptr, vf, true, false,
+ &n_perms))
{
auto rep = SLP_TREE_REPRESENTATIVE (node);
if (out_layout_i == 0)
@@ -7338,19 +7381,22 @@ vect_optimize_slp_pass::forward_pass ()
else
layout_costs.in_cost.add_parallel_cost (cost);
}
- else
- /* Reject the layout if it would make layout 0 impossible
- for later partitions. This amounts to testing that the
- target supports reversing the layout change on edges
- to later partitions.
-
- In principle, it might be possible to push a layout
- change all the way down a graph, so that it never
- needs to be reversed and so that the target doesn't
- need to support the reverse operation. But it would
- be awkward to bail out if we hit a partition that
- does not support the new layout, especially since
- we are not dealing with a lattice. */
+ /* Reject the layout if it would make layout 0 impossible
+ for later partitions. This amounts to testing that the
+ target supports reversing the layout change on edges
+ to later partitions.
+
+ In principle, it might be possible to push a layout
+ change all the way down a graph, so that it never
+ needs to be reversed and so that the target doesn't
+ need to support the reverse operation. But it would
+ be awkward to bail out if we hit a partition that
+ does not support the new layout, especially since
+ we are not dealing with a lattice.
+
+ At least when the other partition doesn't have a
+ preferred layout, do not require the reverse permute. */
+ else if (m_partitions[other_vertex.partition].layout != -1)
is_possible &= edge_layout_cost (ud, other_node_i, 0,
layout_i).is_possible ();
};
@@ -7426,7 +7472,7 @@ vect_optimize_slp_pass::forward_pass ()
/* Make a backward pass through the partitions, accumulating output costs.
Make a final choice of layout for each partition. */
-void
+bool
vect_optimize_slp_pass::backward_pass ()
{
for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
@@ -7498,9 +7544,11 @@ vect_optimize_slp_pass::backward_pass ()
}
}
- gcc_assert (min_layout_cost.is_possible ());
+ if (!min_layout_cost.is_possible ())
+ return false;
partition.layout = min_layout_i;
}
+ return true;
}
/* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
@@ -7760,39 +7808,8 @@ vect_optimize_slp_pass::remove_redundant_permutations ()
}
else
{
- loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
- stmt_vec_info load_info;
- bool this_load_permuted = false;
- unsigned j;
- FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
- if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
- {
- this_load_permuted = true;
- break;
- }
- /* When this isn't a grouped access we know it's single element
- and contiguous. */
- if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
- {
- if (!this_load_permuted
- && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
- || SLP_TREE_LANES (node) == 1))
- SLP_TREE_LOAD_PERMUTATION (node).release ();
- continue;
- }
- stmt_vec_info first_stmt_info
- = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
- if (!this_load_permuted
- /* The load requires permutation when unrolling exposes
- a gap either because the group is larger than the SLP
- group-size or because there is a gap between the groups. */
- && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
- || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
- && DR_GROUP_GAP (first_stmt_info) == 0)))
- {
- SLP_TREE_LOAD_PERMUTATION (node).release ();
- continue;
- }
+ if (is_redundant_permutation (node, SLP_TREE_LOAD_PERMUTATION (node)))
+ SLP_TREE_LOAD_PERMUTATION (node).release ();
}
}
}
@@ -7995,10 +8012,12 @@ vect_optimize_slp_pass::run ()
if (m_perms.length () > 1)
{
forward_pass ();
- backward_pass ();
- if (dump_enabled_p ())
- dump ();
- materialize ();
+ if (backward_pass ())
+ {
+ if (dump_enabled_p ())
+ dump ();
+ materialize ();
+ }
while (!m_perms.is_empty ())
m_perms.pop ().release ();
}
--
2.51.0