This makes the vectorizer handle loads of SLP groups with "trailing gaps". That is, a (for now known) gap between the groups between loop iterations, as in the testcase
+ double self[1024]; + double a[1024][1024]; + double b[1024]; + + void __attribute__((noinline,noclone)) + foo (void) + { + int i, j; + for (i = 0; i < 1024; i+=6) + for (j = 0; j < 1024; j+=6) + { + self[i] = self[i] + a[i][j]*b[j]; + self[i+1] = self[i+1] + a[i][j+1]*b[j+1]; + } + } reduced to this issue from the complex multiplication vectorization issue in PR37021. A next step will eventually be to allow an unknown gap between the groups between iterations (including possible overlap). Bootstrapped and tested on x86_64-unknown-linux-gnu. Richard. 2013-03-27 Richard Biener <rguent...@suse.de> PR tree-optimization/37021 * tree-vect-slp.c (vect_build_slp_tree): When not unrolling do not restrict gaps between groups. * tree-vect-stmts.c (vectorizable_load): Properly account for a gap between groups. * gcc.dg/vect/fast-math-slp-38.c: New testcase. * gcc.dg/vect/O3-pr36098.c: Un-XFAIL. Index: gcc/tree-vect-slp.c =================================================================== *** gcc/tree-vect-slp.c.orig 2013-03-26 13:09:18.000000000 +0100 --- gcc/tree-vect-slp.c 2013-03-26 14:27:14.135697847 +0100 *************** vect_build_slp_tree (loop_vec_info loop_ *** 740,750 **** else { /* Load. */ ! /* FORNOW: Check that there is no gap between the loads. */ ! if ((GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt ! && GROUP_GAP (vinfo_for_stmt (stmt)) != 0) ! || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt ! && GROUP_GAP (vinfo_for_stmt (stmt)) != 1)) { if (dump_enabled_p ()) { --- 750,765 ---- else { /* Load. */ ! /* FORNOW: Check that there is no gap between the loads ! and no gap between the groups when we need to load ! multiple groups at once. ! ??? We should enhance this to only disallow gaps ! inside vectors. */ ! if ((ncopies > 1 ! && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt ! && GROUP_GAP (vinfo_for_stmt (stmt)) != 0) ! || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt ! 
&& GROUP_GAP (vinfo_for_stmt (stmt)) != 1)) { if (dump_enabled_p ()) { *************** vect_build_slp_tree (loop_vec_info loop_ *** 762,768 **** /* Check that the size of interleaved loads group is not greater than the SLP group size. */ if (loop_vinfo ! && GROUP_SIZE (vinfo_for_stmt (stmt)) > ncopies * group_size) { if (dump_enabled_p ()) { --- 777,786 ---- /* Check that the size of interleaved loads group is not greater than the SLP group size. */ if (loop_vinfo ! && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt ! && ((GROUP_SIZE (vinfo_for_stmt (stmt)) ! - GROUP_GAP (vinfo_for_stmt (stmt))) ! > ncopies * group_size)) { if (dump_enabled_p ()) { Index: gcc/testsuite/gcc.dg/vect/fast-math-slp-38.c =================================================================== *** /dev/null 1970-01-01 00:00:00.000000000 +0000 --- gcc/testsuite/gcc.dg/vect/fast-math-slp-38.c 2013-03-26 13:09:20.860002059 +0100 *************** *** 0 **** --- 1,22 ---- + /* { dg-do compile } */ + /* { dg-require-effective-target vect_double } */ + + double self[1024]; + double a[1024][1024]; + double b[1024]; + + void __attribute__((noinline,noclone)) + foo (void) + { + int i, j; + for (i = 0; i < 1024; i+=6) + for (j = 0; j < 1024; j+=6) + { + self[i] = self[i] + a[i][j]*b[j]; + self[i+1] = self[i+1] + a[i][j+1]*b[j+1]; + } + } + + /* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */ + /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */ + /* { dg-final { cleanup-tree-dump "vect" } } */ Index: gcc/testsuite/gcc.dg/vect/O3-pr36098.c =================================================================== *** gcc/testsuite/gcc.dg/vect/O3-pr36098.c.orig 2013-03-26 13:02:00.000000000 +0100 --- gcc/testsuite/gcc.dg/vect/O3-pr36098.c 2013-03-26 13:09:20.860002059 +0100 *************** void foo (int ncons, t_sortblock *sb, in *** 17,22 **** iatom[m]=sb[i].iatom[m]; } ! 
/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { xfail *-*-* } } } */ /* { dg-final { cleanup-tree-dump "vect" } } */ - --- 17,21 ---- iatom[m]=sb[i].iatom[m]; } ! /* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" } } */ /* { dg-final { cleanup-tree-dump "vect" } } */ Index: gcc/tree-vect-stmts.c =================================================================== *** gcc/tree-vect-stmts.c.orig 2013-03-26 13:02:00.000000000 +0100 --- gcc/tree-vect-stmts.c 2013-03-26 13:09:20.861002070 +0100 *************** vectorizable_load (gimple stmt, gimple_s *** 4316,4322 **** gimple ptr_incr; int nunits = TYPE_VECTOR_SUBPARTS (vectype); int ncopies; ! int i, j, group_size; tree msq = NULL_TREE, lsq; tree offset = NULL_TREE; tree realignment_token = NULL_TREE; --- 4316,4322 ---- gimple ptr_incr; int nunits = TYPE_VECTOR_SUBPARTS (vectype); int ncopies; ! int i, j, group_size, group_gap; tree msq = NULL_TREE, lsq; tree offset = NULL_TREE; tree realignment_token = NULL_TREE; *************** vectorizable_load (gimple stmt, gimple_s *** 4766,4780 **** vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); if (SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance).exists ()) slp_perm = true; } else ! vec_num = group_size; } else { first_stmt = stmt; first_dr = dr; group_size = vec_num = 1; } alignment_support_scheme = vect_supportable_dr_alignment (first_dr, false); --- 4766,4785 ---- vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); if (SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance).exists ()) slp_perm = true; + group_gap = GROUP_GAP (vinfo_for_stmt (first_stmt)); } else ! { ! vec_num = group_size; ! group_gap = 0; ! 
} } else { first_stmt = stmt; first_dr = dr; group_size = vec_num = 1; + group_gap = 0; } alignment_support_scheme = vect_supportable_dr_alignment (first_dr, false); *************** vectorizable_load (gimple stmt, gimple_s *** 5134,5139 **** --- 5139,5153 ---- if (slp && !slp_perm) SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt); } + /* Bump the vector pointer to account for a gap. */ + if (slp && group_gap != 0) + { + tree bump = size_binop (MULT_EXPR, + TYPE_SIZE_UNIT (elem_type), + size_int (group_gap)); + dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, + stmt, bump); + } } if (slp && !slp_perm)