The following fixes a bug in vect_transform_slp_perm_load, which tries to be clever in computing something like "ncopies" but fails to do so correctly (and in fact it cannot be done). Instead, just compute all loads/permutations manually.
The fix is as simple as Index: tree-vect-slp.c =================================================================== --- tree-vect-slp.c (revision 245279) +++ tree-vect-slp.c (working copy) @@ -3412,7 +3412,7 @@ vect_transform_slp_perm_load (slp_tree n int second_vec_index = -1; bool noop_p = true; - for (int j = 0; j < unroll_factor; j++) + for (int j = 0; j < vf; j++) { for (int k = 0; k < group_size; k++) { @@ -3486,7 +3486,7 @@ vect_transform_slp_perm_load (slp_tree n vect_create_mask_and_perm (stmt, mask_vec, first_vec_index, second_vec_index, gsi, node, vectype, dr_chain, - ncopies, vect_stmts_counter++); + 1, vect_stmts_counter++); } index = 0; but it allows dead code to be removed and the now pointless helper vect_create_mask_and_perm to vanish. Bootstrapped and tested on x86_64-unknown-linux-gnu, applied. Richard. 2017-03-08 Richard Biener <rguent...@suse.de> PR tree-optimization/79920 * tree-vect-slp.c (vect_create_mask_and_perm): Remove and inline with ncopies == 1 to ... (vect_transform_slp_perm_load): ... here. Properly compute all element loads by iterating VF times over the group. Do not handle ncopies (computed in a broken way) in vect_create_mask_and_perm. * gcc.dg/vect/pr79920.c: New testcase. Index: gcc/tree-vect-slp.c =================================================================== *** gcc/tree-vect-slp.c (revision 245947) --- gcc/tree-vect-slp.c (working copy) *************** vect_get_slp_defs (vec<tree> ops, slp_tr *** 3379,3444 **** } } - - /* Create NCOPIES permutation statements using the mask MASK_BYTES (by - building a vector of type MASK_TYPE from it) and two input vectors placed in - DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and - shifting by STRIDE elements of DR_CHAIN for every copy. - (STRIDE is the number of vectorized stmts for NODE divided by the number of - copies). - VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where - the created stmts must be inserted. 
*/ - - static inline void - vect_create_mask_and_perm (gimple *stmt, - tree mask, int first_vec_indx, int second_vec_indx, - gimple_stmt_iterator *gsi, slp_tree node, - tree vectype, vec<tree> dr_chain, - int ncopies, int vect_stmts_counter) - { - tree perm_dest; - gimple *perm_stmt = NULL; - int i, stride_in, stride_out; - tree first_vec, second_vec, data_ref; - - stride_out = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies; - stride_in = dr_chain.length () / ncopies; - - /* Initialize the vect stmts of NODE to properly insert the generated - stmts later. */ - for (i = SLP_TREE_VEC_STMTS (node).length (); - i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++) - SLP_TREE_VEC_STMTS (node).quick_push (NULL); - - perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype); - for (i = 0; i < ncopies; i++) - { - first_vec = dr_chain[first_vec_indx]; - second_vec = dr_chain[second_vec_indx]; - - /* Generate the permute statement if necessary. */ - if (mask) - { - perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR, - first_vec, second_vec, mask); - data_ref = make_ssa_name (perm_dest, perm_stmt); - gimple_set_lhs (perm_stmt, data_ref); - vect_finish_stmt_generation (stmt, perm_stmt, gsi); - } - else - /* If mask was NULL_TREE generate the requested identity transform. */ - perm_stmt = SSA_NAME_DEF_STMT (first_vec); - - /* Store the vector statement in NODE. */ - SLP_TREE_VEC_STMTS (node)[stride_out * i + vect_stmts_counter] - = perm_stmt; - - first_vec_indx += stride_in; - second_vec_indx += stride_in; - } - } - - /* Generate vector permute statements from a list of loads in DR_CHAIN. If ANALYZE_ONLY is TRUE, only check that it is possible to create valid permute statements for the SLP node NODE of the SLP instance --- 3379,3384 ---- *************** vect_transform_slp_perm_load (slp_tree n *** 3456,3462 **** int nunits, vec_index = 0; tree vectype = STMT_VINFO_VECTYPE (stmt_info); int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); ! 
int unroll_factor, mask_element, ncopies; unsigned char *mask; machine_mode mode; --- 3396,3402 ---- int nunits, vec_index = 0; tree vectype = STMT_VINFO_VECTYPE (stmt_info); int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); ! int mask_element; unsigned char *mask; machine_mode mode; *************** vect_transform_slp_perm_load (slp_tree n *** 3474,3484 **** mask_type = get_vectype_for_scalar_type (mask_element_type); nunits = TYPE_VECTOR_SUBPARTS (vectype); mask = XALLOCAVEC (unsigned char, nunits); - unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance); ! /* Number of copies is determined by the final vectorization factor ! relatively to SLP_NODE_INSTANCE unrolling factor. */ ! ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance); /* Generate permutation masks for every NODE. Number of masks for each NODE is equal to GROUP_SIZE. --- 3414,3426 ---- mask_type = get_vectype_for_scalar_type (mask_element_type); nunits = TYPE_VECTOR_SUBPARTS (vectype); mask = XALLOCAVEC (unsigned char, nunits); ! /* Initialize the vect stmts of NODE to properly insert the generated ! stmts later. */ ! if (! analyze_only) ! for (unsigned i = SLP_TREE_VEC_STMTS (node).length (); ! i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++) ! SLP_TREE_VEC_STMTS (node).quick_push (NULL); /* Generate permutation masks for every NODE. Number of masks for each NODE is equal to GROUP_SIZE. *************** vect_transform_slp_perm_load (slp_tree n *** 3505,3511 **** bool noop_p = true; *n_perms = 0; ! for (int j = 0; j < unroll_factor; j++) { for (int k = 0; k < group_size; k++) { --- 3447,3453 ---- bool noop_p = true; *n_perms = 0; ! for (int j = 0; j < vf; j++) { for (int k = 0; k < group_size; k++) { *************** vect_transform_slp_perm_load (slp_tree n *** 3578,3587 **** if (second_vec_index == -1) second_vec_index = first_vec_index; ! vect_create_mask_and_perm (stmt, mask_vec, first_vec_index, ! second_vec_index, ! gsi, node, vectype, dr_chain, ! 
ncopies, vect_stmts_counter++); } index = 0; --- 3520,3549 ---- if (second_vec_index == -1) second_vec_index = first_vec_index; ! ! /* Generate the permute statement if necessary. */ ! tree first_vec = dr_chain[first_vec_index]; ! tree second_vec = dr_chain[second_vec_index]; ! gimple *perm_stmt; ! if (! noop_p) ! { ! tree perm_dest ! = vect_create_destination_var (gimple_assign_lhs (stmt), ! vectype); ! perm_dest = make_ssa_name (perm_dest); ! perm_stmt = gimple_build_assign (perm_dest, ! VEC_PERM_EXPR, ! first_vec, second_vec, ! mask_vec); ! vect_finish_stmt_generation (stmt, perm_stmt, gsi); ! } ! else ! /* If mask was NULL_TREE generate the requested ! identity transform. */ ! perm_stmt = SSA_NAME_DEF_STMT (first_vec); ! ! /* Store the vector statement in NODE. */ ! SLP_TREE_VEC_STMTS (node)[vect_stmts_counter++] = perm_stmt; } index = 0; Index: gcc/testsuite/gcc.dg/vect/pr79920.c =================================================================== *** gcc/testsuite/gcc.dg/vect/pr79920.c (nonexistent) --- gcc/testsuite/gcc.dg/vect/pr79920.c (working copy) *************** *** 0 **** --- 1,44 ---- + /* { dg-do run } */ + /* { dg-additional-options "-O3" } */ + + #include "tree-vect.h" + + double __attribute__((noinline,noclone)) + compute_integral (double w_1[18]) + { + double A = 0; + double t33[2][6] = {{0.0, 0.0, 0.0, 0.0, 0.0, 0.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}}; + double t43[2] = {0.0, 0.0}; + double t31[2][2] = {{1.0, 1.0}, {1.0, 1.0}}; + double t32[2][3] = {{0.0, 0.0, 1.0}, {0.0, 0.0, 1.0}}; + + for (int ip_1 = 0; ip_1 < 2; ++ip_1) + { + for (int i_0 = 0; i_0 < 6; ++i_0) + t33[ip_1][i_0] = ((w_1[i_0*3] * t32[ip_1][0]) + + (w_1[i_0*3+2] * t32[ip_1][2])); + t43[ip_1] = 2.0; + } + for (int i_0 = 0; i_0 < 6; ++i_0) + A += t43[1]*t33[1][i_0]; + return A; + } + + int main() + { + check_vect (); + + double w_1[18] = {0., 1.0, 1.0, + 0., 1.0, 1.0, + 0., 1.0, 1.0, + 0., 1.0, 1.0, + 0., 1.0, 1.0, + 0., 1.0, 1.0}; + double A = compute_integral(w_1); + if (A != 
12.0) + __builtin_abort (); + return 0; + } + + /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_double && { vect_perm && vect_hw_misalign } } } } } */