Hi All, This change allows one to simplify lane permutes that select from multiple load leafs that load from the same DR group by promoting the VEC_PERM node into a load itself and pushing the lane permute into it as a load permute.
This saves us from having to calculate where to materialize a new load node. If the resulting loads are now unused they are freed and are removed from the graph. This allows us to handle cases where we would have generated: movi v4.4s, 0 adrp x3, .LC0 ldr q5, [x3, #:lo12:.LC0] mov x3, 0 .p2align 3,,7 .L2: mov v0.16b, v4.16b mov v3.16b, v4.16b ldr q1, [x1, x3] ldr q2, [x0, x3] fcmla v0.4s, v2.4s, v1.4s, #0 fcmla v3.4s, v1.4s, v2.4s, #0 fcmla v0.4s, v2.4s, v1.4s, #270 fcmla v3.4s, v1.4s, v2.4s, #270 mov v1.16b, v3.16b tbl v0.16b, {v0.16b - v1.16b}, v5.16b str q0, [x2, x3] add x3, x3, 16 cmp x3, 1600 bne .L2 ret and instead generate mov x3, 0 .p2align 3,,7 .L27: ldr q0, [x2, x3] ldr q1, [x0, x3] ldr q2, [x1, x3] fcmla v0.2d, v1.2d, v2.2d, #0 fcmla v0.2d, v1.2d, v2.2d, #270 str q0, [x2, x3] add x3, x3, 16 cmp x3, 512 bne .L27 ret This runs as a pre step such that permute simplification can still inspect this permute is needed Bootstrapped Regtested on aarch64-none-linux-gnu and no issues. Tests are included as part of the final patch as they need the SLP pattern matcher to insert permutes in between. Ok for master? Thanks, Tamar gcc/ChangeLog: * tree-vect-slp.c (vect_optimize_slp): Promote permutes. --
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index 48f615e1952707de4827f0e69e443c0a7db27d81..a3881b59b3c2aafb216ef320633255ed91f4dd45 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -2937,6 +2937,7 @@ vect_optimize_slp (vec_info *vinfo) unsigned i; auto_vec<slp_tree> vertices; auto_vec<int> leafs; + hash_set<int_hash<int, -1, -1> > dup_leafs; vect_slp_build_vertices (vinfo, vertices, leafs); struct graph *slpg = new_graph (vertices.length ()); @@ -2954,12 +2955,14 @@ vect_optimize_slp (vec_info *vinfo) graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL); auto_sbitmap n_visited (vertices.length ()); + auto_sbitmap n_replicated (vertices.length ()); auto_sbitmap n_materialize (vertices.length ()); auto_vec<int> n_perm (vertices.length ()); auto_vec<vec<unsigned> > perms; bitmap_clear (n_visited); bitmap_clear (n_materialize); + bitmap_clear (n_replicated); n_perm.quick_grow_cleared (vertices.length ()); perms.safe_push (vNULL); /* zero is no permute */ @@ -3000,6 +3003,11 @@ vect_optimize_slp (vec_info *vinfo) /* If there's no permute no need to split one out. */ if (!any_permute) continue; + + /* If the operation is a replication mark it for further inspection. */ + if (imin == imax) + dup_leafs.add (idx); + /* If the span doesn't match we'd disrupt VF computation, avoid that for now. */ if (imax - imin + 1 != SLP_TREE_LANES (node)) @@ -3028,6 +3036,100 @@ vect_optimize_slp (vec_info *vinfo) n_perm[idx] = perms.length () - 1; } + /* Inspect all replication node and determine if they are connected to a + permute operation that may linearize the load. */ + for (hash_set<int_hash<int, -1, -1> >::iterator iter = dup_leafs.begin (); + iter != dup_leafs.end (); ++iter) + { + int idx = *iter; + + graph_edge *edge = slpg->vertices[idx].pred; + do + { + slp_tree node = vertices[idx]; + unsigned src = edge->src; + /* If we've visited the permute node already leave it alone. This + prevents us from re-inspecting it for every leafs that lead to it. */ + if (bitmap_bit_p (n_replicated, src)) + continue; + + slp_tree parent = vertices[src]; + bitmap_set_bit (n_replicated, src); + + if (!SLP_TREE_LANE_PERMUTATION (parent).exists ()) + continue; + + /* Check if all edges lead to a load and that all the loads are + coming from the same group. */ + unsigned j; + bool distribute_p = SLP_TREE_CHILDREN (parent).length () > 0; + stmt_vec_info rep_stmt = SLP_TREE_REPRESENTATIVE (node); + stmt_vec_info dr_stmt = DR_GROUP_FIRST_ELEMENT (rep_stmt); + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (parent), j, node) + { + /* Check if this is one of the nodes we know have a replication + operation. */ + distribute_p = dup_leafs.contains (node->vertex); + if (!distribute_p) + break; + stmt_vec_info cur_dr_stmt = SLP_TREE_REPRESENTATIVE (node); + cur_dr_stmt = DR_GROUP_FIRST_ELEMENT (cur_dr_stmt); + distribute_p = dr_stmt == cur_dr_stmt; + if (!distribute_p) + break; + } + + /* If we have a valid node to optimize, do it and disconnect it from + the graph. */ + if (distribute_p) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "converting permute node %p into load\n", + parent); + + /* First convert this node into a load node and add it to the leaves + list and flatten the permute from a lane to a load one. If it's + unneeded it will be elided later. + + Question: I presume I need to add this new node to the instance's + loads list? *But I have no idea which instance I'm in. */ + vec<stmt_vec_info> stmts; + stmts.create (SLP_TREE_LANES (parent)); + load_permutation_t load_perm; + load_perm.create (SLP_TREE_LANES (parent)); + lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (parent); + for (j = 0; j < lane_perm.length (); j++) + { + std::pair<unsigned, unsigned> perm = lane_perm[j]; + node = SLP_TREE_CHILDREN (parent)[perm.first]; + stmts.safe_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]); + load_perm.safe_push (SLP_TREE_LOAD_PERMUTATION (node)[perm.second]); + } + SLP_TREE_REPRESENTATIVE (parent) = rep_stmt; + SLP_TREE_SCALAR_STMTS (parent) = stmts; + SLP_TREE_LANE_PERMUTATION (parent).release(); + SLP_TREE_LANE_PERMUTATION (parent) = vNULL; + SLP_TREE_LOAD_PERMUTATION (parent) = load_perm; + SLP_TREE_CODE (parent) = ERROR_MARK; + + /* One last iteration to free the nodes. */ + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (parent), j, node) + { + /* If we are the only reference to the node, remove the vertex. + We don't have to modify the graph since vertices lead the + graph traversal. */ + vect_free_slp_tree (node); + } + + SLP_TREE_CHILDREN (parent) = vNULL; + + /* And finally it to the leafs set and updated the vertices. */ + leafs.safe_push (parent->vertex); + } + } while ((edge = edge->pred_next)); + } + /* Propagate permutes along the graph and compute materialization points. */ bool changed; unsigned iteration = 0;