This is an update of last year's attempt to tame down vectorization
when it runs into ABI inefficiencies at function return. I've
updated the patch to look for multi-reg returns instead of
!vector ones (due to word_mode vectorization) and to handle a few
more return cases, esp. the common pattern of
ret = vector;
D.1234 = ret;
ret = {CLOBBER};
return D.1234;
Bootstrapped and tested on x86_64-unknown-linux-gnu, OK?
I know this isn't the ultimate solution but we keep getting
duplicates of the PR.
Richard.
2019-03-28 Richard Biener <[email protected]>
PR tree-optimization/84101
* tree-vect-stmts.c: Include explow.h for hard_function_value,
regs.h for hard_regno_nregs.
(cfun_returns): New helper.
(vect_model_store_cost): When vectorizing a store to a decl
that we return and the function ABI returns in a multi-reg
location, account for the possible spilling that will happen.
* gcc.target/i386/pr84101.c: New testcase.
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c (revision 269987)
+++ gcc/tree-vect-stmts.c (working copy)
@@ -43,6 +43,7 @@ along with GCC; see the file COPYING3.
#include "tree-cfg.h"
#include "tree-ssa-loop-manip.h"
#include "cfgloop.h"
+#include "explow.h"
#include "tree-ssa-loop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
@@ -52,6 +53,7 @@ along with GCC; see the file COPYING3.
#include "vec-perm-indices.h"
#include "tree-ssa-loop-niter.h"
#include "gimple-fold.h"
+#include "regs.h"
/* For lang_hooks.types.type_for_mode. */
#include "langhooks.h"
@@ -948,6 +950,37 @@ vect_model_promotion_demotion_cost (stmt
"prologue_cost = %d .\n", inside_cost, prologue_cost);
}
+/* Returns true if the current function returns DECL. */
+
+static bool
+cfun_returns (tree decl)
+{
+ edge_iterator ei;
+ edge e;
+ FOR_EACH_EDGE (e, ei, EXIT_BLOCK_PTR_FOR_FN (cfun)->preds)
+ {
+ greturn *ret = safe_dyn_cast <greturn *> (last_stmt (e->src));
+ if (!ret)
+ continue;
+ if (gimple_return_retval (ret) == decl)
+ return true;
+ /* We often end up with an aggregate copy to the result decl,
+ handle that case as well. First skip intermediate clobbers
+ though. */
+ gimple *def = ret;
+ do
+ {
+ def = SSA_NAME_DEF_STMT (gimple_vuse (def));
+ }
+ while (gimple_clobber_p (def));
+ if (is_a <gassign *> (def)
+ && gimple_assign_lhs (def) == gimple_return_retval (ret)
+ && gimple_assign_rhs1 (def) == decl)
+ return true;
+ }
+ return false;
+}
+
/* Function vect_model_store_cost
Models cost for stores. In the case of grouped accesses, one access
@@ -1032,6 +1065,37 @@ vect_model_store_cost (stmt_vec_info stm
vec_to_scalar, stmt_info, 0, vect_body);
}
+ /* When vectorizing a store into the function result assign
+ a penalty if the function returns in a multi-register location.
+ In this case we assume we'll end up with having to spill the
+ vector result and do piecewise loads. */
+ tree base = get_base_address (STMT_VINFO_DATA_REF (stmt_info)->ref);
+ if (base
+ && (TREE_CODE (base) == RESULT_DECL
+ || (DECL_P (base) && cfun_returns (base)))
+ && !aggregate_value_p (base, cfun->decl))
+ {
+ rtx reg = hard_function_value (TREE_TYPE (base), cfun->decl, 0, 1);
+ /* ??? Handle PARALLEL in some way. */
+ if (REG_P (reg))
+ {
+ int nregs = hard_regno_nregs (REGNO (reg), GET_MODE (reg));
+ /* Assume that a reg-reg move is possible and cheap,
+ do not account for vector to gp register move cost. */
+ if (nregs > 1)
+ {
+ /* Spill. */
+ prologue_cost += record_stmt_cost (cost_vec, ncopies,
+ vector_store,
+ stmt_info, 0, vect_epilogue);
+ /* Loads. */
+ prologue_cost += record_stmt_cost (cost_vec, ncopies * nregs,
+ scalar_load,
+ stmt_info, 0, vect_epilogue);
+ }
+ }
+ }
+
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"vect_model_store_cost: inside_cost = %d, "
Index: gcc/testsuite/gcc.target/i386/pr84101.c
===================================================================
--- gcc/testsuite/gcc.target/i386/pr84101.c (nonexistent)
+++ gcc/testsuite/gcc.target/i386/pr84101.c (working copy)
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -fdump-tree-slp2-details" } */
+
+typedef struct uint64_pair uint64_pair_t ;
+struct uint64_pair
+{
+ unsigned long w0 ;
+ unsigned long w1 ;
+} ;
+
+uint64_pair_t pair(int num)
+{
+ uint64_pair_t p ;
+
+ p.w0 = num << 1 ;
+ p.w1 = num >> 1 ;
+
+ return p ;
+}
+
+/* { dg-final { scan-tree-dump-not "basic block vectorized" "slp2" } } */