The following fixes PR68892: the BB vectorizer now happily creates a large number of dead vector loads (we had a similar bug with loop single-element interleaving support in the past). This is fixed as a side effect of making the SLP load cost reflect reality.
Bootstrapped on x86_64-unknown-linux-gnu, testing in progress. Richard. 2015-12-14 Richard Biener <rguent...@suse.de> PR tree-optimization/68892 * tree-vect-slp.c (vect_analyze_slp_cost_1): Properly compute cost for permuted loads. * gcc.dg/vect/bb-slp-pr68892.c: New testcase. Index: gcc/tree-vect-slp.c =================================================================== *** gcc/tree-vect-slp.c (revision 231617) --- gcc/tree-vect-slp.c (working copy) *************** vect_analyze_slp_cost_1 (slp_instance in *** 1405,1414 **** { unsigned i; slp_tree child; ! gimple *stmt, *s; stmt_vec_info stmt_info; tree lhs; - unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance); /* Recurse down the SLP tree. */ FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) --- 1405,1413 ---- { unsigned i; slp_tree child; ! gimple *stmt; stmt_vec_info stmt_info; tree lhs; /* Recurse down the SLP tree. */ FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) *************** vect_analyze_slp_cost_1 (slp_instance in *** 1427,1470 **** node, prologue_cost_vec, body_cost_vec); else { - int i; gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))); - /* If the load is permuted then the alignment is determined by - the first group element not by the first scalar stmt DR. */ if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) { stmt = GROUP_FIRST_ELEMENT (stmt_info); stmt_info = vinfo_for_stmt (stmt); } vect_model_load_cost (stmt_info, ncopies_for_cost, false, node, prologue_cost_vec, body_cost_vec); - /* If the load is permuted record the cost for the permutation. - ??? Loads from multiple chains are let through here only - for a single special case involving complex numbers where - in the end no permutation is necessary. 
*/ - FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, s) - if ((STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo_for_stmt (s)) - == STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) - && vect_get_place_in_interleaving_chain - (s, STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info)) != i) - { - record_stmt_cost (body_cost_vec, group_size, vec_perm, - stmt_info, 0, vect_body); - break; - } } } ! else { record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt, stmt_info, 0, vect_body); ! if (SLP_TREE_TWO_OPERATORS (node)) ! { ! record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt, ! stmt_info, 0, vect_body); ! record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm, ! stmt_info, 0, vect_body); ! } } /* Scan operands and account for prologue cost of constants/externals. --- 1426,1464 ---- node, prologue_cost_vec, body_cost_vec); else { gcc_checking_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))); if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) { + /* If the load is permuted then the alignment is determined by + the first group element not by the first scalar stmt DR. */ stmt = GROUP_FIRST_ELEMENT (stmt_info); stmt_info = vinfo_for_stmt (stmt); + /* Record the cost for the permutation. */ + record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm, + stmt_info, 0, vect_body); + /* And adjust the number of loads performed. */ + unsigned nunits + = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); + ncopies_for_cost + = (GROUP_SIZE (stmt_info) - GROUP_GAP (stmt_info) + + nunits - 1) / nunits; + ncopies_for_cost *= SLP_INSTANCE_UNROLLING_FACTOR (instance); } + /* Record the cost for the vector loads. */ vect_model_load_cost (stmt_info, ncopies_for_cost, false, node, prologue_cost_vec, body_cost_vec); } + return; } ! ! record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt, ! stmt_info, 0, vect_body); ! if (SLP_TREE_TWO_OPERATORS (node)) { record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt, stmt_info, 0, vect_body); ! 
record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm, ! stmt_info, 0, vect_body); } /* Scan operands and account for prologue cost of constants/externals. Index: gcc/testsuite/gcc.dg/vect/bb-slp-pr68892.c =================================================================== *** gcc/testsuite/gcc.dg/vect/bb-slp-pr68892.c (revision 0) --- gcc/testsuite/gcc.dg/vect/bb-slp-pr68892.c (revision 0) *************** *** 0 **** --- 1,16 ---- + /* { dg-do compile } */ + /* { dg-additional-options "-fvect-cost-model=dynamic" } */ + + double a[128][128]; + double b[128]; + + void foo(void) + { + b[0] = a[0][0]; + b[1] = a[1][0]; + b[2] = a[2][0]; + b[3] = a[3][0]; + } + + /* { dg-final { scan-tree-dump "not profitable" "slp2" } } */ + /* { dg-final { scan-tree-dump-times "Basic block will be vectorized" 0 "slp2" } } */