The following implements some easy improvements to the SLP cost model for PR37150, which shows excess cost being accounted for dead loads and permutes when vectorizing a basic block.
Bootstrapped on x86_64-unknown-linux-gnu, testing in progress.

Still doesn't vectorize the testcase in that PR without -fno-vect-cost-model
though.  Real improvements are only possible with re-doing the vectorizer
data structures.

Richard.

2016-11-04  Richard Biener  <rguent...@suse.de>

	PR tree-optimization/37150
	* tree-vectorizer.h (vect_transform_slp_perm_load): Add n_perms
	parameter.
	* tree-vect-slp.c (vect_supported_load_permutation_p): Adjust.
	(vect_analyze_slp_cost_1): Account for the real number of
	permutations emitted and for dead loads.
	(vect_transform_slp_perm_load): Add n_perms parameter counting
	the number of emitted permutations.
	* tree-vect-stmts.c (vectorizable_load): Adjust.

Index: gcc/tree-vectorizer.h
===================================================================
--- gcc/tree-vectorizer.h	(revision 241791)
+++ gcc/tree-vectorizer.h	(working copy)
@@ -1166,7 +1168,7 @@ extern int vect_get_known_peeling_cost (
 extern void vect_free_slp_instance (slp_instance);
 extern bool vect_transform_slp_perm_load (slp_tree, vec<tree> ,
                                           gimple_stmt_iterator *, int,
-                                          slp_instance, bool);
+                                          slp_instance, bool, unsigned *);
 extern bool vect_slp_analyze_operations (vec<slp_instance> slp_instances,
                                          void *);
 extern bool vect_schedule_slp (vec_info *);
Index: gcc/tree-vect-slp.c
===================================================================
--- gcc/tree-vect-slp.c	(revision 241791)
+++ gcc/tree-vect-slp.c	(working copy)
@@ -1461,8 +1461,9 @@ vect_supported_load_permutation_p (slp_i
 	{
 	  /* Verify the permutation can be generated.  */
 	  vec<tree> tem;
+	  unsigned n_perms;
 	  if (!vect_transform_slp_perm_load (node, tem, NULL,
-					     1, slp_instn, true))
+					     1, slp_instn, true, &n_perms))
 	    {
 	      dump_printf_loc (MSG_MISSED_OPTIMIZATION,
 			       vect_location,
@@ -1475,11 +1476,13 @@ vect_supported_load_permutation_p (slp_i
     }

   /* For loop vectorization verify we can generate the permutation.  */
+  unsigned n_perms;
   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
     if (node->load_permutation.exists ()
 	&& !vect_transform_slp_perm_load
 	      (node, vNULL, NULL,
-	       SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
+	       SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true,
+	       &n_perms))
       return false;

   return true;
@@ -1548,14 +1551,38 @@ vect_analyze_slp_cost_1 (slp_instance in
 	  stmt = GROUP_FIRST_ELEMENT (stmt_info);
 	  stmt_info = vinfo_for_stmt (stmt);
 	  /* Record the cost for the permutation.  */
-	  record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
+	  unsigned n_perms;
+	  vect_transform_slp_perm_load (node, vNULL, NULL,
+					ncopies_for_cost, instance, true,
+					&n_perms);
+	  record_stmt_cost (body_cost_vec, n_perms, vec_perm,
 			    stmt_info, 0, vect_body);
-	  /* And adjust the number of loads performed.  */
 	  unsigned nunits
 	    = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
-	  ncopies_for_cost
-	    = (GROUP_SIZE (stmt_info) - GROUP_GAP (stmt_info)
-	       + nunits - 1) / nunits;
+	  /* And adjust the number of loads performed.  This handles
+	     redundancies as well as loads that are later dead.  */
+	  auto_sbitmap perm (GROUP_SIZE (stmt_info));
+	  bitmap_clear (perm);
+	  for (i = 0; i < SLP_TREE_LOAD_PERMUTATION (node).length (); ++i)
+	    bitmap_set_bit (perm, SLP_TREE_LOAD_PERMUTATION (node)[i]);
+	  ncopies_for_cost = 0;
+	  bool load_seen = false;
+	  for (i = 0; i < GROUP_SIZE (stmt_info); ++i)
+	    {
+	      if (i % nunits == 0)
+		{
+		  if (load_seen)
+		    ncopies_for_cost++;
+		  load_seen = false;
+		}
+	      if (bitmap_bit_p (perm, i))
+		load_seen = true;
+	    }
+	  if (load_seen)
+	    ncopies_for_cost++;
+	  gcc_assert (ncopies_for_cost
+		      <= (GROUP_SIZE (stmt_info) - GROUP_GAP (stmt_info)
+			  + nunits - 1) / nunits);
 	  ncopies_for_cost *= SLP_INSTANCE_UNROLLING_FACTOR (instance);
 	}
       /* Record the cost for the vector loads.  */
@@ -3402,7 +3489,8 @@ vect_create_mask_and_perm (gimple *stmt,
 bool
 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
 			      gimple_stmt_iterator *gsi, int vf,
-			      slp_instance slp_node_instance, bool analyze_only)
+			      slp_instance slp_node_instance, bool analyze_only,
+			      unsigned *n_perms)
 {
   gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
@@ -3457,6 +3545,7 @@ vect_transform_slp_perm_load (slp_tree n
   int first_vec_index = -1;
   int second_vec_index = -1;
   bool noop_p = true;
+  *n_perms = 0;

   for (int j = 0; j < unroll_factor; j++)
     {
@@ -3513,6 +3602,9 @@ vect_transform_slp_perm_load (slp_tree n
 		  return false;
 		}

+	      if (! noop_p)
+		++*n_perms;
+
 	      if (!analyze_only)
 		{
 		  tree mask_vec = NULL_TREE;
Index: gcc/tree-vect-stmts.c
===================================================================
--- gcc/tree-vect-stmts.c	(revision 241791)
+++ gcc/tree-vect-stmts.c	(working copy)
@@ -6978,8 +7041,11 @@ vectorizable_load (gimple *stmt, gimple_
 	    }
 	}
       if (slp_perm)
-	vect_transform_slp_perm_load (slp_node, dr_chain, gsi, vf,
-				      slp_node_instance, false);
+	{
+	  unsigned n_perms;
+	  vect_transform_slp_perm_load (slp_node, dr_chain, gsi, vf,
+					slp_node_instance, false, &n_perms);
+	}
       return true;
     }

@@ -7497,8 +7563,10 @@ vectorizable_load (gimple *stmt, gimple_

       if (slp_perm)
 	{
+	  unsigned n_perms;
 	  if (!vect_transform_slp_perm_load (slp_node, dr_chain, gsi, vf,
-					     slp_node_instance, false))
+					     slp_node_instance, false,
+					     &n_perms))
 	    {
 	      dr_chain.release ();
 	      return false;
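
For readers who want to follow the new cost computation outside of the
vectorizer sources, here is a minimal standalone sketch of the load-copy
counting that vect_analyze_slp_cost_1 now does.  It is not GCC code:
count_load_copies, group_size, nunits and load_permutation are hypothetical
stand-ins for GROUP_SIZE, TYPE_VECTOR_SUBPARTS and SLP_TREE_LOAD_PERMUTATION,
and std::vector<bool> replaces the sbitmap.

/* Standalone sketch (not GCC code) of the load-copy counting added to
   vect_analyze_slp_cost_1.  */

#include <cstdio>
#include <vector>

/* Count how many vector loads of NUNITS lanes are needed to cover the
   lanes of a GROUP_SIZE scalar load group that are referenced by
   LOAD_PERMUTATION.  Lanes that are never referenced (dead loads) or
   referenced multiple times (redundancies) do not add extra copies.  */
static unsigned
count_load_copies (unsigned group_size, unsigned nunits,
                   const std::vector<unsigned> &load_permutation)
{
  std::vector<bool> lane_used (group_size, false);
  for (unsigned idx : load_permutation)
    lane_used[idx] = true;

  unsigned ncopies = 0;
  bool load_seen = false;
  for (unsigned i = 0; i < group_size; ++i)
    {
      /* A new vector load covers lanes [i, i + nunits); the previous one
         is only charged if at least one of its lanes was used.  */
      if (i % nunits == 0)
        {
          if (load_seen)
            ncopies++;
          load_seen = false;
        }
      if (lane_used[i])
        load_seen = true;
    }
  if (load_seen)
    ncopies++;
  return ncopies;
}

int
main ()
{
  /* Group of 8 scalar loads, 4 lanes per vector, but only lanes 0 and 2
     are used by the SLP node: one vector load, the second one is dead.  */
  printf ("%u\n", count_load_copies (8, 4, {0, 2}));                    /* 1 */
  /* All lanes used: two vector loads, as before the patch.  */
  printf ("%u\n", count_load_copies (8, 4, {0, 1, 2, 3, 4, 5, 6, 7}));  /* 2 */
  return 0;
}

The point of the walk is that a vector load is only charged when at least
one of its nunits lanes is actually referenced by the load permutation, so
duplicate and dead lanes no longer inflate ncopies_for_cost.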
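
Similarly, a rough sketch of the n_perms accounting: in analysis mode
vect_transform_slp_perm_load now reports how many permutations it would
really emit, and only non-identity (non-noop) masks are counted, so
record_stmt_cost charges vec_perm for those instead of once per vector
copy.  noop_mask_p and count_emitted_perms below are hypothetical helpers,
and the per-copy masks are a stand-in for what the vectorizer builds from
the SLP load permutation.

/* Standalone sketch (not GCC code) of counting only the permutations that
   really need a VEC_PERM_EXPR.  */

#include <cstdio>
#include <vector>

/* A mask is a no-op if it selects lanes 0, 1, ... of the first input
   vector in order; such a permutation is never emitted.  */
static bool
noop_mask_p (const std::vector<unsigned> &mask)
{
  for (unsigned i = 0; i < mask.size (); ++i)
    if (mask[i] != i)
      return false;
  return true;
}

/* Count the non-noop masks; this mirrors what the patch feeds to
   record_stmt_cost instead of ncopies_for_cost.  */
static unsigned
count_emitted_perms (const std::vector<std::vector<unsigned> > &masks)
{
  unsigned n_perms = 0;
  for (const std::vector<unsigned> &mask : masks)
    if (!noop_mask_p (mask))
      ++n_perms;
  return n_perms;
}

int
main ()
{
  /* Two vector copies: the first needs an even-lane select, the second is
     already in order, so only one vec_perm is charged.  */
  std::vector<std::vector<unsigned> > masks = { { 0, 2, 4, 6 }, { 0, 1, 2, 3 } };
  printf ("%u\n", count_emitted_perms (masks));   /* 1 */
  return 0;
}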