Hi All,

This change allows one to simplify lane permutes that select from multiple load
leafs that load from the same DR group by promoting the VEC_PERM node into a
load itself and pushing the lane permute into it as a load permute.

This saves us from having to calculate where to materialize a new load node.
If the resulting loads are now unused they are freed and are removed from the
graph.

This allows us to handle cases where we would have generated:

        movi    v4.4s, 0
        adrp    x3, .LC0
        ldr     q5, [x3, #:lo12:.LC0]
        mov     x3, 0
        .p2align 3,,7
.L2:
        mov     v0.16b, v4.16b
        mov     v3.16b, v4.16b
        ldr     q1, [x1, x3]
        ldr     q2, [x0, x3]
        fcmla   v0.4s, v2.4s, v1.4s, #0
        fcmla   v3.4s, v1.4s, v2.4s, #0
        fcmla   v0.4s, v2.4s, v1.4s, #270
        fcmla   v3.4s, v1.4s, v2.4s, #270
        mov     v1.16b, v3.16b
        tbl     v0.16b, {v0.16b - v1.16b}, v5.16b
        str     q0, [x2, x3]
        add     x3, x3, 16
        cmp     x3, 1600
        bne     .L2
        ret

and instead generate

        mov     x3, 0
        .p2align 3,,7
.L27:
        ldr     q0, [x2, x3]
        ldr     q1, [x0, x3]
        ldr     q2, [x1, x3]
        fcmla   v0.2d, v1.2d, v2.2d, #0
        fcmla   v0.2d, v1.2d, v2.2d, #270
        str     q0, [x2, x3]
        add     x3, x3, 16
        cmp     x3, 512
        bne     .L27
        ret

This runs as a pre step such that permute simplification can still inspect this
permute is needed

Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
Tests are included as part of the final patch as they need the SLP pattern
matcher to insert permutes in between.

Ok for master?

Thanks,
Tamar

gcc/ChangeLog:

        * tree-vect-slp.c (vect_optimize_slp): Promote permutes.

-- 
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 48f615e1952707de4827f0e69e443c0a7db27d81..a3881b59b3c2aafb216ef320633255ed91f4dd45 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -2937,6 +2937,7 @@ vect_optimize_slp (vec_info *vinfo)
   unsigned i;
   auto_vec<slp_tree> vertices;
   auto_vec<int> leafs;
+  hash_set<int_hash<int, -1, -1> > dup_leafs;
   vect_slp_build_vertices (vinfo, vertices, leafs);
 
   struct graph *slpg = new_graph (vertices.length ());
@@ -2954,12 +2955,14 @@ vect_optimize_slp (vec_info *vinfo)
   graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL);
 
   auto_sbitmap n_visited (vertices.length ());
+  auto_sbitmap n_replicated (vertices.length ());
   auto_sbitmap n_materialize (vertices.length ());
   auto_vec<int> n_perm (vertices.length ());
   auto_vec<vec<unsigned> > perms;
 
   bitmap_clear (n_visited);
   bitmap_clear (n_materialize);
+  bitmap_clear (n_replicated);
   n_perm.quick_grow_cleared (vertices.length ());
   perms.safe_push (vNULL); /* zero is no permute */
 
@@ -3000,6 +3003,11 @@ vect_optimize_slp (vec_info *vinfo)
       /* If there's no permute no need to split one out.  */
       if (!any_permute)
 	continue;
+
+      /* If the operation is a replication mark it for further inspection.  */
+      if (imin == imax)
+	dup_leafs.add (idx);
+
       /* If the span doesn't match we'd disrupt VF computation, avoid
 	 that for now.  */
       if (imax - imin + 1 != SLP_TREE_LANES (node))
@@ -3028,6 +3036,100 @@ vect_optimize_slp (vec_info *vinfo)
       n_perm[idx] = perms.length () - 1;
     }
 
+  /* Inspect all replication node and determine if they are connected to a
+     permute operation that may linearize the load.  */
+  for (hash_set<int_hash<int, -1, -1> >::iterator iter = dup_leafs.begin ();
+       iter != dup_leafs.end (); ++iter)
+    {
+      int idx = *iter;
+
+      graph_edge *edge = slpg->vertices[idx].pred;
+      do
+	{
+	  slp_tree node = vertices[idx];
+	  unsigned src = edge->src;
+	  /* If we've visited the permute node already leave it alone.  This
+	     prevents us from re-inspecting it for every leafs that lead to it.  */
+	  if (bitmap_bit_p (n_replicated, src))
+	    continue;
+
+	  slp_tree parent = vertices[src];
+	  bitmap_set_bit (n_replicated, src);
+
+	  if (!SLP_TREE_LANE_PERMUTATION (parent).exists ())
+	    continue;
+
+	  /* Check if all edges lead to a load and that all the loads are
+	     coming from the same group.  */
+	  unsigned j;
+	  bool distribute_p = SLP_TREE_CHILDREN (parent).length () > 0;
+	  stmt_vec_info rep_stmt = SLP_TREE_REPRESENTATIVE (node);
+	  stmt_vec_info dr_stmt = DR_GROUP_FIRST_ELEMENT (rep_stmt);
+	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (parent), j, node)
+	    {
+	      /* Check if this is one of the nodes we know have a replication
+		 operation.  */
+	      distribute_p = dup_leafs.contains (node->vertex);
+	      if (!distribute_p)
+		break;
+	      stmt_vec_info cur_dr_stmt = SLP_TREE_REPRESENTATIVE (node);
+	      cur_dr_stmt = DR_GROUP_FIRST_ELEMENT (cur_dr_stmt);
+	      distribute_p = dr_stmt == cur_dr_stmt;
+	      if (!distribute_p)
+		break;
+	    }
+
+	  /* If we have a valid node to optimize, do it and disconnect it from
+	     the graph.  */
+	  if (distribute_p)
+	  {
+	    if (dump_enabled_p ())
+	      dump_printf_loc (MSG_NOTE, vect_location,
+			       "converting permute node %p into load\n",
+			       parent);
+
+	    /* First convert this node into a load node and add it to the leaves
+	       list and flatten the permute from a lane to a load one.  If it's
+	       unneeded it will be elided later.
+
+	       Question: I presume I need to add this new node to the instance's
+	       loads list? *But I have no idea which instance I'm in.  */
+	    vec<stmt_vec_info> stmts;
+	    stmts.create (SLP_TREE_LANES (parent));
+	    load_permutation_t load_perm;
+	    load_perm.create (SLP_TREE_LANES (parent));
+	    lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (parent);
+	    for (j = 0; j < lane_perm.length (); j++)
+	      {
+		std::pair<unsigned, unsigned> perm = lane_perm[j];
+		node = SLP_TREE_CHILDREN (parent)[perm.first];
+		stmts.safe_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
+		load_perm.safe_push (SLP_TREE_LOAD_PERMUTATION (node)[perm.second]);
+	      }
+	    SLP_TREE_REPRESENTATIVE (parent) = rep_stmt;
+	    SLP_TREE_SCALAR_STMTS (parent) = stmts;
+	    SLP_TREE_LANE_PERMUTATION (parent).release();
+	    SLP_TREE_LANE_PERMUTATION (parent) = vNULL;
+	    SLP_TREE_LOAD_PERMUTATION (parent) = load_perm;
+	    SLP_TREE_CODE (parent) = ERROR_MARK;
+
+	    /* One last iteration to free the nodes.  */
+	    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (parent), j, node)
+	      {
+		/* If we are the only reference to the node, remove the vertex.
+		   We don't have to modify the graph since vertices lead the
+		   graph traversal.  */
+		vect_free_slp_tree (node);
+	      }
+
+	    SLP_TREE_CHILDREN (parent) = vNULL;
+
+	    /* And finally it to the leafs set and updated the vertices.  */
+	    leafs.safe_push (parent->vertex);
+	  }
+	} while ((edge = edge->pred_next));
+    }
+
   /* Propagate permutes along the graph and compute materialization points.  */
   bool changed;
   unsigned iteration = 0;

Reply via email to