The following refactors the vectorizer vector_costs target API
to add a new vector_costs::add_vector_cost entry which groups
all individual sub-stmts we create per "vector stmt", aka SLP
node.  This allows for the targets to more easily match on
complex cases like emulated gather/scatter or even just vector
construction.

The patch itself is just a prototype and leaves out BB vectorization
for simplicity.  It also does not fully group all vector stmts
but leaves some bare add_stmt_cost hook invocations.  I'd expect
the add_stmt_cost hook to still be used for scalar stmt costing and
for costing added branching around prologue/epilogue.  The
default implementation of add_vector_cost just dispatches to
add_stmt_cost for individual stmts.  Eventually the actual data
we track for the combined costing will diverge (no need to track
SLP node or stmt_info there?), so targets would eventually be
expected to implement both hooks and splice out common workers
to deal with "missing" information coming in from the different
entries.

This should eventually baby-step us towards the generic vectorizer
code being able to compute and compare latency and resource
utilization throughout the scalar / vector loop iteration based
on latency and throughput data determined on a stmt-by-stmt basis
from the target.  As given, the grouping should be an incremental
improvement, but I have not tried to see how it can simplify
the x86 hook implementation - this was prompted by the bootstrap
failure reported on aarch64 for the cleanup RFC I posted, since the
aarch64 code wants to identify a scalar load that's costed as part
of a gather/scatter operation.

Any comments or problems you foresee?

Thanks,
Richard.

        * tree-vectorizer.h (vector_costs::add_vector_cost): New method.
        (_slp_tree::cost_vec): New.
        * tree-vectorizer.cc (vector_costs::add_vector_cost): Add
        fallback implementation.
        * tree-vect-stmts.cc (vect_analyze_stmt): For loop vectorization
        record costs into the SLP node specific cost vector.
        * tree-vect-slp.cc (_slp_tree::_slp_tree): Initialize cost_vec.
        (_slp_tree::~_slp_tree): Release cost_vec.
        (vect_slp_add_node_cost): New.
        (vect_slp_analyze_operations): Cost the stmt groups recorded
        per SLP node for loop vectorization.
---
 gcc/tree-vect-slp.cc   | 27 +++++++++++++++++++++++++++
 gcc/tree-vect-stmts.cc | 32 +++++++++++++++++---------------
 gcc/tree-vectorizer.cc | 12 ++++++++++++
 gcc/tree-vectorizer.h  |  6 ++++++
 4 files changed, 62 insertions(+), 15 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 8d0a612577b..5c112800087 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -127,6 +127,7 @@ _slp_tree::_slp_tree ()
   SLP_TREE_REPRESENTATIVE (this) = NULL;
   SLP_TREE_MEMORY_ACCESS_TYPE (this) = VMAT_INVARIANT;
   SLP_TREE_REF_COUNT (this) = 1;
+  this->cost_vec = vNULL;
   this->failed = NULL;
   this->max_nunits = 1;
   this->lanes = 0;
@@ -149,6 +150,7 @@ _slp_tree::~_slp_tree ()
   SLP_TREE_LOAD_PERMUTATION (this).release ();
   SLP_TREE_LANE_PERMUTATION (this).release ();
   SLP_TREE_SIMD_CLONE_INFO (this).release ();
+  this->cost_vec.release ();
   if (this->failed)
     free (failed);
 }
@@ -8499,6 +8501,23 @@ vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
       vect_slp_prune_covered_roots (child, roots, visited);
 }
 
+/* Cost vectorization of NODE and children recursively.  */
+
+static void
+vect_slp_add_node_cost (vector_costs *vector_costs, slp_tree node,
+                       hash_set<slp_tree> &visited)
+{
+  if (visited.add (node))
+    return;
+
+  for (slp_tree child : SLP_TREE_CHILDREN (node))
+    if (child)
+      vect_slp_add_node_cost (vector_costs, child, visited);
+
+  if (node->cost_vec.exists ())
+    vector_costs->add_vector_cost (node, &node->cost_vec);
+}
+
 /* Analyze statements in SLP instances of VINFO.  Return true if the
    operations are supported. */
 
@@ -8582,6 +8601,14 @@ vect_slp_analyze_operations (vec_info *vinfo)
        }
     }
 
+  if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
+    {
+      visited.empty ();
+      for (auto instance : loop_vinfo->slp_instances)
+       vect_slp_add_node_cost (loop_vinfo->vector_costs,
+                               SLP_INSTANCE_TREE (instance), visited);
+    }
+
   /* Now look for SLP instances with a root that are covered by other
      instances and remove them.  */
   hash_set<stmt_vec_info> roots;
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 8f38d8bcb7c..19c29402068 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -14006,6 +14006,8 @@ vect_analyze_stmt (vec_info *vinfo,
     return opt_result::failure_at (stmt_info->stmt,
                                   "needs non-SLP handling\n");
 
+  gcc_assert (node->cost_vec.is_empty ());
+
   ok = true;
   if (!bb_vinfo
       && (STMT_VINFO_RELEVANT_P (stmt_info)
@@ -14013,34 +14015,34 @@ vect_analyze_stmt (vec_info *vinfo,
     /* Prefer vectorizable_call over vectorizable_simd_clone_call so
        -mveclibabi= takes preference over library functions with
        the simd attribute.  */
-    ok = (vectorizable_call (vinfo, stmt_info, NULL, NULL, node, cost_vec)
+    ok = (vectorizable_call (vinfo, stmt_info, NULL, NULL, node, &node->cost_vec)
          || vectorizable_simd_clone_call (vinfo, stmt_info, NULL, NULL, node,
-                                          cost_vec)
+                                          &node->cost_vec)
          || vectorizable_conversion (vinfo, stmt_info,
-                                     NULL, NULL, node, cost_vec)
+                                     NULL, NULL, node, &node->cost_vec)
          || vectorizable_operation (vinfo, stmt_info,
-                                    NULL, NULL, node, cost_vec)
+                                    NULL, NULL, node, &node->cost_vec)
          || vectorizable_assignment (vinfo, stmt_info,
-                                     NULL, NULL, node, cost_vec)
-         || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, cost_vec)
-         || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, cost_vec)
+                                     NULL, NULL, node, &node->cost_vec)
+         || vectorizable_load (vinfo, stmt_info, NULL, NULL, node, &node->cost_vec)
+         || vectorizable_store (vinfo, stmt_info, NULL, NULL, node, &node->cost_vec)
          || vectorizable_lane_reducing (as_a <loop_vec_info> (vinfo),
-                                        stmt_info, node, cost_vec)
+                                        stmt_info, node, &node->cost_vec)
          || vectorizable_reduction (as_a <loop_vec_info> (vinfo), stmt_info,
-                                    node, node_instance, cost_vec)
+                                    node, node_instance, &node->cost_vec)
          || vectorizable_induction (as_a <loop_vec_info> (vinfo), stmt_info,
-                                    NULL, node, cost_vec)
-         || vectorizable_shift (vinfo, stmt_info, NULL, NULL, node, cost_vec)
+                                    NULL, node, &node->cost_vec)
+         || vectorizable_shift (vinfo, stmt_info, NULL, NULL, node, &node->cost_vec)
          || vectorizable_condition (vinfo, stmt_info,
-                                    NULL, NULL, node, cost_vec)
+                                    NULL, NULL, node, &node->cost_vec)
          || vectorizable_comparison (vinfo, stmt_info, NULL, NULL, node,
-                                     cost_vec)
+                                     &node->cost_vec)
          || vectorizable_lc_phi (as_a <loop_vec_info> (vinfo),
                                  stmt_info, node)
          || vectorizable_recurr (as_a <loop_vec_info> (vinfo),
-                                  stmt_info, NULL, node, cost_vec)
+                                  stmt_info, NULL, node, &node->cost_vec)
          || vectorizable_early_exit (vinfo, stmt_info, NULL, NULL, node,
-                                     cost_vec));
+                                     &node->cost_vec));
   else
     {
       if (bb_vinfo)
diff --git a/gcc/tree-vectorizer.cc b/gcc/tree-vectorizer.cc
index 447f882c518..083ab46728c 100644
--- a/gcc/tree-vectorizer.cc
+++ b/gcc/tree-vectorizer.cc
@@ -1844,6 +1844,18 @@ vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
 
 /* See the comment above the declaration for details.  */
 
+unsigned int
+vector_costs::add_vector_cost (slp_tree, stmt_vector_for_cost *cost_vec)
+{
+  unsigned int sum = 0;
+  for (auto item : *cost_vec)
+    sum += add_stmt_cost (item.count, item.kind, item.stmt_info, item.node,
+                         item.vectype, item.misalign, item.where);
+  return sum;
+}
+
+/* See the comment above the declaration for details.  */
+
 void
 vector_costs::finish_cost (const vector_costs *)
 {
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 990072fca95..26fee63961e 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -243,6 +243,8 @@ struct _slp_tree {
      for linear arguments (pair of NULLs for other arguments).  */
   vec<tree> simd_clone_info;
 
+  stmt_vector_for_cost cost_vec;
+
   tree vectype;
   /* Vectorized defs.  */
   vec<tree> vec_defs;
@@ -1665,6 +1667,10 @@ public:
                                      tree vectype, int misalign,
                                      vect_cost_model_location where);
 
+  /* Update the costs in response to generating vector code for NODE
+     with the stmt parts described by COST_VEC.  */
+  virtual unsigned int add_vector_cost (slp_tree node, stmt_vector_for_cost *);
+
   /* Finish calculating the cost of the code.  The results can be
      read back using the functions below.
 
-- 
2.43.0

Reply via email to