This fixes the vectorizer part of PR66051 (an x86 target part remains for the testcase in the PR - PR68655). The issue is again a misplaced check for SLP detection:
/* Check that the size of interleaved loads group is not greater than the SLP group size. */ unsigned ncopies = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype); if (is_a <loop_vec_info> (vinfo) && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt && ((GROUP_SIZE (vinfo_for_stmt (stmt)) - GROUP_GAP (vinfo_for_stmt (stmt))) > ncopies * group_size)) { if (dump_enabled_p ()) { dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "Build SLP failed: the number " "of interleaved loads is greater than " "the SLP group size "); dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); } /* Fatal mismatch. */ matches[0] = false; return false; } I've relaxed this multiple times but that still doesn't make it necessary. It also uses a vectorization factor estimate as the vectorization factor is not yet determined. A good side-effect of the patch is that we can get rid of that estimate completely. Tested on the x86_64 vectorization tests so far. Bootstrap & regtest pending and I'll make sure SPEC CPU 2006 is happy as well. Thanks, Richard. 2015-12-02 Richard Biener <rguent...@suse.de> PR tree-optimization/66051 * tree-vect-slp.c (vect_build_slp_tree_1): Remove restriction on load group size. Do not pass in vectorization_factor. (vect_transform_slp_perm_load): Do not require any permute support. (vect_build_slp_tree): Do not pass in vectorization factor. (vect_analyze_slp_instance): Do not compute vectorization factor estimate. Use vector size instead of vectorization factor estimate to split store groups for BB vectorization. * gcc.dg/vect/slp-42.c: New testcase. Index: gcc/tree-vect-slp.c =================================================================== *** gcc/tree-vect-slp.c (revision 231167) --- gcc/tree-vect-slp.c (working copy) *************** static bool *** 430,437 **** vect_build_slp_tree_1 (vec_info *vinfo, vec<gimple *> stmts, unsigned int group_size, unsigned nops, unsigned int *max_nunits, ! 
unsigned int vectorization_factor, bool *matches, ! bool *two_operators) { unsigned int i; gimple *first_stmt = stmts[0], *stmt = stmts[0]; --- 430,436 ---- vect_build_slp_tree_1 (vec_info *vinfo, vec<gimple *> stmts, unsigned int group_size, unsigned nops, unsigned int *max_nunits, ! bool *matches, bool *two_operators) { unsigned int i; gimple *first_stmt = stmts[0], *stmt = stmts[0]; *************** vect_build_slp_tree_1 (vec_info *vinfo, *** 523,533 **** /* In case of multiple types we need to detect the smallest type. */ if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype)) ! { ! *max_nunits = TYPE_VECTOR_SUBPARTS (vectype); ! if (is_a <bb_vec_info> (vinfo)) ! vectorization_factor = *max_nunits; ! } if (gcall *call_stmt = dyn_cast <gcall *> (stmt)) { --- 522,528 ---- /* In case of multiple types we need to detect the smallest type. */ if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype)) ! *max_nunits = TYPE_VECTOR_SUBPARTS (vectype); if (gcall *call_stmt = dyn_cast <gcall *> (stmt)) { *************** vect_build_slp_tree_1 (vec_info *vinfo, *** 700,730 **** else { /* Load. */ - /* Check that the size of interleaved loads group is not - greater than the SLP group size. */ - unsigned ncopies - = vectorization_factor / TYPE_VECTOR_SUBPARTS (vectype); - if (is_a <loop_vec_info> (vinfo) - && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt - && ((GROUP_SIZE (vinfo_for_stmt (stmt)) - - GROUP_GAP (vinfo_for_stmt (stmt))) - > ncopies * group_size)) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Build SLP failed: the number " - "of interleaved loads is greater than " - "the SLP group size "); - dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, - stmt, 0); - dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); - } - /* Fatal mismatch. 
*/ - matches[0] = false; - return false; - } - first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)); if (prev_first_load) { --- 695,700 ---- *************** vect_build_slp_tree (vec_info *vinfo, *** 871,877 **** slp_tree *node, unsigned int group_size, unsigned int *max_nunits, vec<slp_tree> *loads, - unsigned int vectorization_factor, bool *matches, unsigned *npermutes, unsigned *tree_size, unsigned max_tree_size) { --- 841,846 ---- *************** vect_build_slp_tree (vec_info *vinfo, *** 895,902 **** bool two_operators = false; if (!vect_build_slp_tree_1 (vinfo, SLP_TREE_SCALAR_STMTS (*node), group_size, nops, ! max_nunits, vectorization_factor, matches, ! &two_operators)) return false; SLP_TREE_TWO_OPERATORS (*node) = two_operators; --- 864,870 ---- bool two_operators = false; if (!vect_build_slp_tree_1 (vinfo, SLP_TREE_SCALAR_STMTS (*node), group_size, nops, ! max_nunits, matches, &two_operators)) return false; SLP_TREE_TWO_OPERATORS (*node) = two_operators; *************** vect_build_slp_tree (vec_info *vinfo, *** 959,966 **** } if (vect_build_slp_tree (vinfo, &child, ! group_size, max_nunits, loads, ! vectorization_factor, matches, npermutes, &this_tree_size, max_tree_size)) { /* If we have all children of child built up from scalars then just --- 927,933 ---- } if (vect_build_slp_tree (vinfo, &child, ! 
group_size, max_nunits, loads, matches, npermutes, &this_tree_size, max_tree_size)) { /* If we have all children of child built up from scalars then just *************** vect_build_slp_tree (vec_info *vinfo, *** 1074,1080 **** bool *tem = XALLOCAVEC (bool, group_size); if (vect_build_slp_tree (vinfo, &child, group_size, max_nunits, loads, - vectorization_factor, tem, npermutes, &this_tree_size, max_tree_size)) { --- 1041,1046 ---- *************** vect_analyze_slp_instance (vec_info *vin *** 1656,1662 **** unsigned int unrolling_factor = 1, nunits; tree vectype, scalar_type = NULL_TREE; gimple *next; - unsigned int vectorization_factor = 0; unsigned int i; unsigned int max_nunits = 0; vec<slp_tree> loads; --- 1622,1627 ---- *************** vect_analyze_slp_instance (vec_info *vin *** 1697,1708 **** return false; } - nunits = TYPE_VECTOR_SUBPARTS (vectype); - if (is_a <loop_vec_info> (vinfo)) - vectorization_factor = as_a <loop_vec_info> (vinfo)->vectorization_factor; - else - vectorization_factor = nunits; /* Calculate the unrolling factor. */ unrolling_factor = least_common_multiple (nunits, group_size) / group_size; --- 1662,1668 ---- *************** vect_analyze_slp_instance (vec_info *vin *** 1755,1762 **** unsigned npermutes = 0; if (vect_build_slp_tree (vinfo, &node, group_size, &max_nunits, &loads, ! vectorization_factor, matches, &npermutes, NULL, ! max_tree_size)) { /* Calculate the unrolling factor based on the smallest type. */ if (max_nunits > nunits) --- 1715,1721 ---- unsigned npermutes = 0; if (vect_build_slp_tree (vinfo, &node, group_size, &max_nunits, &loads, ! matches, &npermutes, NULL, max_tree_size)) { /* Calculate the unrolling factor based on the smallest type. */ if (max_nunits > nunits) *************** vect_analyze_slp_instance (vec_info *vin *** 1852,1858 **** loads.release (); /* For basic block SLP, try to break the group up into multiples of the ! vectorization factor. 
*/ if (is_a <bb_vec_info> (vinfo) && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) && STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))) --- 1811,1817 ---- loads.release (); /* For basic block SLP, try to break the group up into multiples of the ! vector size. */ if (is_a <bb_vec_info> (vinfo) && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) && STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))) *************** vect_analyze_slp_instance (vec_info *vin *** 1862,1872 **** for (i = 0; i < group_size; i++) if (!matches[i]) break; ! if (i >= vectorization_factor && i < group_size) { /* Split into two groups at the first vector boundary before i. */ ! gcc_assert ((vectorization_factor & (vectorization_factor - 1)) == 0); ! unsigned group1_size = i & ~(vectorization_factor - 1); gimple *rest = vect_split_slp_store_group (stmt, group1_size); bool res = vect_analyze_slp_instance (vinfo, stmt, max_tree_size); --- 1821,1831 ---- for (i = 0; i < group_size; i++) if (!matches[i]) break; ! if (i >= nunits && i < group_size) { /* Split into two groups at the first vector boundary before i. */ ! gcc_assert ((nunits & (nunits - 1)) == 0); ! unsigned group1_size = i & ~(nunits - 1); gimple *rest = vect_split_slp_store_group (stmt, group1_size); bool res = vect_analyze_slp_instance (vinfo, stmt, max_tree_size); *************** vect_analyze_slp_instance (vec_info *vin *** 1874,1882 **** skip the rest of that vector. */ if (group1_size < i) { ! i = group1_size + vectorization_factor; if (i < group_size) ! rest = vect_split_slp_store_group (rest, vectorization_factor); } if (i < group_size) res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size); --- 1833,1841 ---- skip the rest of that vector. */ if (group1_size < i) { ! i = group1_size + nunits; if (i < group_size) ! 
rest = vect_split_slp_store_group (rest, nunits); } if (i < group_size) res |= vect_analyze_slp_instance (vinfo, rest, max_tree_size); *************** vect_transform_slp_perm_load (slp_tree n *** 3274,3291 **** mode = TYPE_MODE (vectype); - if (!can_vec_perm_p (mode, false, NULL)) - { - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "no vect permute for "); - dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0); - dump_printf (MSG_MISSED_OPTIMIZATION, "\n"); - } - return false; - } - /* The generic VEC_PERM_EXPR code always uses an integral type of the same size as the vector element being permuted. */ mask_element_type = lang_hooks.types.type_for_mode --- 3233,3238 ---- Index: gcc/testsuite/gcc.dg/vect/slp-42.c =================================================================== *** gcc/testsuite/gcc.dg/vect/slp-42.c (revision 0) --- gcc/testsuite/gcc.dg/vect/slp-42.c (working copy) *************** *** 0 **** --- 1,19 ---- + /* { dg-do compile } */ + /* { dg-require-effective-target vect_int } */ + + int p[4096], q[4096]; + + void foo (int n) + { + int i; + for (i = 0; i < n; ++i) + { + p[i*4+0] = q[i*8+0] + q[i*8+4]; + p[i*4+1] = q[i*8+1] + q[i*8+5]; + p[i*4+2] = q[i*8+2] + q[i*8+6]; + p[i*4+3] = q[i*8+3] + q[i*8+7]; + } + } + + /* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */ + /* { dg-final { scan-tree-dump "vectorized 1 loops" "vect" } } */