Override the add_slp_cost hook so that the x86 cost model receives the
recorded stmt costs grouped per SLP node. The node-dependent costing that
used to live in add_stmt_cost -- the vec_construct/vec_deconstruct/
scalar_to_vec construction cost and the cross-lane vec_perm accounting --
moves to add_slp_cost, which handles those kinds itself and delegates
everything else (including construct kinds that have no SLP node) to
add_stmt_cost. The shared tail (default-cost fallback, Bonnell/Silvermont
fixups and frequency scaling) is factored into finish_stmt_cost, which
both paths call.
This is a refactor with no intended functional change.
Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
Ok for trunk?
gcc/ChangeLog:
* config/i386/i386.cc (ix86_vector_costs): Declare add_slp_cost,
ix86_slp_construct_cost, ix86_slp_account_perm and finish_stmt_cost.
(ix86_vector_costs::add_stmt_cost): Move the vec_construct and
vec_perm node costing out; finish via finish_stmt_cost.
(ix86_vector_costs::ix86_slp_construct_cost): New.
(ix86_vector_costs::ix86_slp_account_perm): New.
(ix86_vector_costs::finish_stmt_cost): New.
(ix86_vector_costs::add_slp_cost): New.
---
gcc/config/i386/i386.cc | 166 ++++++++++++++++++++++++++++++++--------
1 file changed, 135 insertions(+), 31 deletions(-)
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index e66958db7ac..94c2b3c0090 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -26165,6 +26165,9 @@ public:
stmt_vec_info stmt_info, slp_tree node,
tree vectype, int misalign,
vect_cost_model_location where) override;
+ unsigned int add_slp_cost (slp_tree node,
+ const array_slice<stmt_info_for_cost> &)
+ override;
void finish_cost (const vector_costs *) override;
bool better_main_loop_than_p (const vector_costs *) const override;
bool better_epilogue_loop_than_p (const vector_costs *other,
@@ -26174,6 +26177,20 @@ private:
/* Estimate register pressure of the vectorized code. */
void ix86_vect_estimate_reg_pressure ();
+ /* Cost of a vec_construct/vec_deconstruct/scalar_to_vec of SLP NODE, which
+ must be non-null, or -1 to fall back to the default vector cost. */
+ int ix86_slp_construct_cost (slp_tree node, vect_cost_for_stmt kind,
+ tree vectype, machine_mode mode, bool fp,
+ vect_cost_model_location where);
+ /* Account a vec_perm of SLP NODE to the cross-lane permute counters. */
+ void ix86_slp_account_perm (slp_tree node, int count, tree vectype,
+ stmt_vec_info stmt_info,
+ vect_cost_model_location where);
+ /* Common cost finalization shared by add_stmt_cost and add_slp_cost. */
+ unsigned int finish_stmt_cost (int stmt_cost, int count,
+ vect_cost_for_stmt kind,
+ stmt_vec_info stmt_info, machine_mode mode,
+ tree vectype, vect_cost_model_location where);
/* Number of GENERAL_REGS/SSE_REGS used in the vectorizer, it's used for
estimation of register pressure.
??? Currently it's only used by vec_construct/scalar_to_vec
@@ -26255,7 +26272,6 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
tree vectype, int,
vect_cost_model_location where)
{
- unsigned retval = 0;
bool scalar_p
= (kind == scalar_stmt || kind == scalar_load || kind == scalar_store);
int stmt_cost = - 1;
@@ -26732,18 +26748,35 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
+ ix86_vec_cost (mode, ix86_cost->sse_op);
}
+ /* vec_construct/vec_deconstruct/scalar_to_vec and vec_perm are costed
+ per SLP node in add_slp_cost, which does the node-specific work and
+ then calls finish_stmt_cost directly, as is done here. */
+ return finish_stmt_cost (stmt_cost, count, kind, stmt_info, mode, vectype,
+ where);
+}
+
+/* See the comment above the declaration for details. */
+
+int
+ix86_vector_costs::ix86_slp_construct_cost (slp_tree node,
+ vect_cost_for_stmt kind,
+ tree vectype, machine_mode mode,
+ bool fp,
+ vect_cost_model_location where)
+{
+ int stmt_cost = -1;
+
/* If we do elementwise loads into a vector then we are bound by
latency and execution resources for the many scalar loads
(AGU and load ports). Try to account for this by scaling the
construction cost by the number of elements involved. */
if ((kind == vec_construct || kind == vec_deconstruct)
- && ((node
- && (((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE
- || SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP)
- && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF
- (SLP_TREE_REPRESENTATIVE (node))))
- != INTEGER_CST))
- || mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node))))))
+ && (((SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_ELEMENTWISE
+ || SLP_TREE_MEMORY_ACCESS_TYPE (node) == VMAT_STRIDED_SLP)
+ && (TREE_CODE (DR_STEP (STMT_VINFO_DATA_REF
+ (SLP_TREE_REPRESENTATIVE (node))))
+ != INTEGER_CST))
+ || mat_gather_scatter_p (SLP_TREE_MEMORY_ACCESS_TYPE (node))))
{
auto lsdata = static_cast<vect_load_store_data *> (node->data);
tree ls_type = lsdata->ls_type ? lsdata->ls_type : vectype;
@@ -26755,7 +26788,6 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
/ GET_MODE_BITSIZE (TYPE_MODE (ls_eltype)) + 1);
}
else if ((kind == vec_construct || kind == scalar_to_vec)
- && node
&& SLP_TREE_DEF_TYPE (node) == vect_external_def)
{
stmt_cost = ix86_default_vector_cost (kind, mode);
@@ -26823,34 +26855,58 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
if (TREE_CODE (op) == SSA_NAME)
TREE_VISITED (op) = 0;
}
- if (stmt_cost == -1)
- stmt_cost = ix86_default_vector_cost (kind, mode);
+ return stmt_cost;
+}
+
+/* See the comment above the declaration for details. */
+
+void
+ix86_vector_costs::ix86_slp_account_perm (slp_tree node, int count,
+ tree vectype,
+ stmt_vec_info stmt_info,
+ vect_cost_model_location where)
+{
/* BIT_FIELD_REF <vect_**, 64, 0> with count 0 costs 0 in body. */
- if (kind == vec_perm && vectype && count != 0)
- {
- unsigned vec_size = GET_MODE_SIZE (TYPE_MODE (vectype));
- unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
- unsigned *num_vec_perm = NULL;
+ if (!vectype || count == 0)
+ return;
- if (vec_size == 32)
- num_vec_perm = m_num_avx256_vec_perm;
- else if (vec_size == 64)
- num_vec_perm = m_num_avx512_vec_perm;
+ unsigned vec_size = GET_MODE_SIZE (TYPE_MODE (vectype));
+ unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
+ unsigned *num_vec_perm = NULL;
- if (num_vec_perm && ix86_count_cross_lane_perm_p (m_vinfo, node, nunits))
+ if (vec_size == 32)
+ num_vec_perm = m_num_avx256_vec_perm;
+ else if (vec_size == 64)
+ num_vec_perm = m_num_avx512_vec_perm;
+
+ if (num_vec_perm && ix86_count_cross_lane_perm_p (m_vinfo, node, nunits))
+ {
+ num_vec_perm[where] += count;
+ if (dump_file && (dump_flags & TDF_DETAILS))
{
- num_vec_perm[where] += count;
- if (dump_file && (dump_flags & TDF_DETAILS))
- {
- fprintf (dump_file,
- "Detected avx%u cross-lane permutation: ", vec_size * 8);
- if (stmt_info)
- print_gimple_expr (dump_file, stmt_info->stmt, 0, TDF_SLIM);
- fprintf (dump_file, " \n");
- }
+ fprintf (dump_file,
+ "Detected avx%u cross-lane permutation: ", vec_size * 8);
+ if (stmt_info)
+ print_gimple_expr (dump_file, stmt_info->stmt, 0, TDF_SLIM);
+ fprintf (dump_file, " \n");
}
}
+}
+
+/* Apply the default-cost fallback, uarch fixups and frequency scaling to
+ STMT_COST (-1 if not yet computed), accumulate into m_costs[WHERE] and
+ return the result. */
+
+unsigned int
+ix86_vector_costs::finish_stmt_cost (int stmt_cost, int count,
+ vect_cost_for_stmt kind,
+ stmt_vec_info stmt_info,
+ machine_mode mode, tree vectype,
+ vect_cost_model_location where)
+{
+ if (stmt_cost == -1)
+ stmt_cost = ix86_default_vector_cost (kind, mode);
/* Penalize DFmode vector operations for Bonnell. */
if (TARGET_CPU_P (BONNELL) && kind == vector_stmt
@@ -26860,7 +26916,8 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
/* Statements in an inner loop relative to the loop being
vectorized are weighted more heavily. The value here is
arbitrary and could potentially be improved with analysis. */
- retval = adjust_cost_for_freq (stmt_info, where, count * stmt_cost);
+ unsigned int retval
+ = adjust_cost_for_freq (stmt_info, where, count * stmt_cost);
/* We need to multiply all vector stmt cost by 1.7 (estimated cost)
for Silvermont as it has out of order integer pipeline and can execute
@@ -26879,6 +26936,53 @@ ix86_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
return retval;
}
+/* Cost the entries of COST_VEC, which all belong to a single SLP node.
+ vec_construct/vec_deconstruct/scalar_to_vec and vec_perm get x86-specific
+ node-level costing here; everything else is handed to add_stmt_cost. */
+
+unsigned int
+ix86_vector_costs::add_slp_cost (slp_tree,
+ const array_slice<stmt_info_for_cost> &cost_vec)
+{
+ unsigned int sum = 0;
+ for (auto item : cost_vec)
+ {
+ tree vectype = item.vectype;
+ /* vec_perm is accounted even without a node (treated as cross-lane);
+ the construct cases need a node. */
+ if (item.kind == vec_perm
+ || (item.node
+ && (item.kind == vec_construct
+ || item.kind == vec_deconstruct
+ || item.kind == scalar_to_vec)))
+ {
+ machine_mode mode = vectype ? TYPE_MODE (vectype) : TImode;
+ bool fp = vectype ? FLOAT_TYPE_P (vectype) : false;
+ int stmt_cost = -1;
+
+ if (item.kind == vec_perm)
+ ix86_slp_account_perm (item.node, item.count, vectype,
+ item.stmt_info, item.where);
+ else
+ stmt_cost = ix86_slp_construct_cost (item.node, item.kind, vectype,
+ mode, fp, item.where);
+
+ unsigned int cost
+ = finish_stmt_cost (stmt_cost, item.count, item.kind,
+ item.stmt_info, mode, vectype, item.where);
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ dump_stmt_cost (dump_file, item.count, item.kind, item.stmt_info,
+ item.node, vectype, item.misalign, cost,
+ item.where);
+ sum += cost;
+ }
+ else
+ sum += ::add_stmt_cost (this, item.count, item.kind, item.stmt_info,
+ item.node, vectype, item.misalign, item.where);
+ }
+ return sum;
+}
+
void
ix86_vector_costs::ix86_vect_estimate_reg_pressure ()
{
--
2.34.1