This avoids falling back to elementwise accesses for strided SLP loads when the group size does not evenly divide the number of vector elements. Instead we can use a smaller vector or integer type for the load.
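As an illustrative sketch (not one of the included testcases; the function name is made up), consider a six-element strided load group:

  /* Hypothetical example, not part of the patch: with V4SI vectors the
     six-element strided load group from b below previously fell back to
     elementwise (scalar) loads; since gcd (6, 4) == 2 it can now be done
     with two-element (64-bit) chunks composed into full vectors.  */
  void bar (int * __restrict a, int *b, int s)
  {
    for (int i = 0; i < 1024; ++i)
      {
        a[6*i+0] = b[s*i+0];
        a[6*i+1] = b[s*i+1];
        a[6*i+2] = b[s*i+2];
        a[6*i+3] = b[s*i+3];
        a[6*i+4] = b[s*i+4];
        a[6*i+5] = b[s*i+5];
      }
  }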
For stores we can do the same, though the restrictions on the stores we handle and the fact that store-merging covers things up mean this is mostly effective for cost modeling; that shows for gcc.target/i386/vect-strided-3.c, which we now vectorize with V4SI vectors rather than just V2SI ones. For all of this there is still the opportunity to use non-uniform accesses, say for a 6-element group with a VF of two to do V4SI, { V2SI, V2SI }, V4SI, but that is left for a possible followup.

	* gcc.target/i386/vect-strided-1.c: New testcase.
	* gcc.target/i386/vect-strided-2.c: Likewise.
	* gcc.target/i386/vect-strided-3.c: Likewise.
	* gcc.target/i386/vect-strided-4.c: Likewise.
---
 .../gcc.target/i386/vect-strided-1.c          |  24 +++++
 .../gcc.target/i386/vect-strided-2.c          |  17 +++
 .../gcc.target/i386/vect-strided-3.c          |  20 ++++
 .../gcc.target/i386/vect-strided-4.c          |  20 ++++
 gcc/tree-vect-stmts.cc                        | 100 ++++++++----------
 5 files changed, 127 insertions(+), 54 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-strided-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-strided-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-strided-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-strided-4.c

diff --git a/gcc/testsuite/gcc.target/i386/vect-strided-1.c b/gcc/testsuite/gcc.target/i386/vect-strided-1.c
new file mode 100644
index 00000000000..db4a06711f1
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-strided-1.c
@@ -0,0 +1,24 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+
+void foo (int * __restrict a, int *b, int s)
+{
+  for (int i = 0; i < 1024; ++i)
+    {
+      a[8*i+0] = b[s*i+0];
+      a[8*i+1] = b[s*i+1];
+      a[8*i+2] = b[s*i+2];
+      a[8*i+3] = b[s*i+3];
+      a[8*i+4] = b[s*i+4];
+      a[8*i+5] = b[s*i+5];
+      a[8*i+6] = b[s*i+4];
+      a[8*i+7] = b[s*i+5];
+    }
+}
+
+/* Three two-element loads, two four-element stores.  On ia32 we elide
+   a permute and perform a redundant load.  */
+/* { dg-final { scan-assembler-times "movq" 2 } } */
+/* { dg-final { scan-assembler-times "movhps" 2 { target ia32 } } } */
+/* { dg-final { scan-assembler-times "movhps" 1 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movups" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-strided-2.c b/gcc/testsuite/gcc.target/i386/vect-strided-2.c
new file mode 100644
index 00000000000..6fd64e28cf0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-strided-2.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+
+void foo (int * __restrict a, int *b, int s)
+{
+  for (int i = 0; i < 1024; ++i)
+    {
+      a[4*i+0] = b[s*i+0];
+      a[4*i+1] = b[s*i+1];
+      a[4*i+2] = b[s*i+0];
+      a[4*i+3] = b[s*i+1];
+    }
+}
+
+/* One two-element load, one four-element store.  */
+/* { dg-final { scan-assembler-times "movq" 1 } } */
+/* { dg-final { scan-assembler-times "movups" 1 } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-strided-3.c b/gcc/testsuite/gcc.target/i386/vect-strided-3.c
new file mode 100644
index 00000000000..b462701a0b2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-strided-3.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx -fno-tree-slp-vectorize" } */
+
+void foo (int * __restrict a, int *b, int s)
+{
+  if (s >= 6)
+    for (int i = 0; i < 1024; ++i)
+      {
+        a[s*i+0] = b[4*i+0];
+        a[s*i+1] = b[4*i+1];
+        a[s*i+2] = b[4*i+2];
+        a[s*i+3] = b[4*i+3];
+        a[s*i+4] = b[4*i+0];
+        a[s*i+5] = b[4*i+1];
+      }
+}
+
+/* The vectorizer generates 6 uint64 stores.  */
+/* { dg-final { scan-assembler-times "movq" 4 } } */
+/* { dg-final { scan-assembler-times "movhps" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-strided-4.c b/gcc/testsuite/gcc.target/i386/vect-strided-4.c
new file mode 100644
index 00000000000..dd922926a2a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-strided-4.c
@@ -0,0 +1,20 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.2 -mno-avx -fno-tree-slp-vectorize" } */
+
+void foo (int * __restrict a, int * __restrict b, int *c, int s)
+{
+  if (s >= 2)
+    for (int i = 0; i < 1024; ++i)
+      {
+        a[s*i+0] = c[4*i+0];
+        a[s*i+1] = c[4*i+1];
+        b[s*i+0] = c[4*i+2];
+        b[s*i+1] = c[4*i+3];
+      }
+}
+
+/* Vectorization factor two, two two-element stores to a using movq
+   and two two-element stores to b via pextrq/movhps of the high part.  */
+/* { dg-final { scan-assembler-times "movq" 2 } } */
+/* { dg-final { scan-assembler-times "pextrq" 2 { target { ! ia32 } } } } */
+/* { dg-final { scan-assembler-times "movhps" 2 { target { ia32 } } } } */
diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index 701a44e44cd..d148e11a514 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -2036,15 +2036,10 @@ get_group_load_store_type (vec_info *vinfo, stmt_vec_info stmt_info,
         first_dr_info
           = STMT_VINFO_DR_INFO (SLP_TREE_SCALAR_STMTS (slp_node)[0]);
       if (STMT_VINFO_STRIDED_P (first_stmt_info))
-        {
-          /* Try to use consecutive accesses of DR_GROUP_SIZE elements,
-             separated by the stride, until we have a complete vector.
-             Fall back to scalar accesses if that isn't possible.  */
-          if (multiple_p (nunits, group_size))
-            *memory_access_type = VMAT_STRIDED_SLP;
-          else
-            *memory_access_type = VMAT_ELEMENTWISE;
-        }
+        /* Try to use consecutive accesses of as many elements as possible,
+           separated by the stride, until we have a complete vector.
+           Fall back to scalar accesses if that isn't possible.  */
+        *memory_access_type = VMAT_STRIDED_SLP;
       else
         {
          int cmp = compare_step_with_zero (vinfo, stmt_info);
@@ -8514,12 +8509,29 @@ vectorizable_store (vec_info *vinfo,
           tree lvectype = vectype;
           if (slp)
             {
-              if (group_size < const_nunits
-                  && const_nunits % group_size == 0)
+              HOST_WIDE_INT n = gcd (group_size, const_nunits);
+              if (n == const_nunits)
                 {
-                  nstores = const_nunits / group_size;
-                  lnel = group_size;
-                  ltype = build_vector_type (elem_type, group_size);
+                  int mis_align = dr_misalignment (first_dr_info, vectype);
+                  dr_alignment_support dr_align
+                    = vect_supportable_dr_alignment (vinfo, dr_info, vectype,
+                                                     mis_align);
+                  if (dr_align == dr_aligned
+                      || dr_align == dr_unaligned_supported)
+                    {
+                      nstores = 1;
+                      lnel = const_nunits;
+                      ltype = vectype;
+                      lvectype = vectype;
+                      alignment_support_scheme = dr_align;
+                      misalignment = mis_align;
+                    }
+                }
+              else if (n > 1)
+                {
+                  nstores = const_nunits / n;
+                  lnel = n;
+                  ltype = build_vector_type (elem_type, n);
                   lvectype = vectype;
 
                   /* First check if vec_extract optab doesn't support extraction
@@ -8528,7 +8540,7 @@ vectorizable_store (vec_info *vinfo,
                   machine_mode vmode;
                   if (!VECTOR_MODE_P (TYPE_MODE (vectype))
                       || !related_vector_mode (TYPE_MODE (vectype), elmode,
-                                               group_size).exists (&vmode)
+                                               n).exists (&vmode)
                       || (convert_optab_handler (vec_extract_optab,
                                                  TYPE_MODE (vectype), vmode)
                           == CODE_FOR_nothing))
@@ -8539,8 +8551,8 @@ vectorizable_store (vec_info *vinfo,
                          re-interpreting it as the original vector type if
                          supported.  */
                       unsigned lsize
-                        = group_size * GET_MODE_BITSIZE (elmode);
-                      unsigned int lnunits = const_nunits / group_size;
+                        = n * GET_MODE_BITSIZE (elmode);
+                      unsigned int lnunits = const_nunits / n;
                       /* If we can't construct such a vector fall back to
                          element extracts from the original vector type and
                          element size stores.  */
@@ -8553,7 +8565,7 @@ vectorizable_store (vec_info *vinfo,
                           != CODE_FOR_nothing))
                         {
                           nstores = lnunits;
-                          lnel = group_size;
+                          lnel = n;
                           ltype = build_nonstandard_integer_type (lsize, 1);
                           lvectype = build_vector_type (ltype, nstores);
                         }
@@ -8564,24 +8576,6 @@ vectorizable_store (vec_info *vinfo,
                          issue exists here for reasonable archs.  */
                     }
                 }
-              else if (group_size >= const_nunits
-                       && group_size % const_nunits == 0)
-                {
-                  int mis_align = dr_misalignment (first_dr_info, vectype);
-                  dr_alignment_support dr_align
-                    = vect_supportable_dr_alignment (vinfo, dr_info, vectype,
-                                                     mis_align);
-                  if (dr_align == dr_aligned
-                      || dr_align == dr_unaligned_supported)
-                    {
-                      nstores = 1;
-                      lnel = const_nunits;
-                      ltype = vectype;
-                      lvectype = vectype;
-                      alignment_support_scheme = dr_align;
-                      misalignment = mis_align;
-                    }
-                }
               ltype = build_aligned_type (ltype, TYPE_ALIGN (elem_type));
               ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
             }
@@ -10366,34 +10360,32 @@ vectorizable_load (vec_info *vinfo,
   auto_vec<tree> dr_chain;
   if (memory_access_type == VMAT_STRIDED_SLP)
     {
-      if (group_size < const_nunits)
+      HOST_WIDE_INT n = gcd (group_size, const_nunits);
+      /* Use the target vector type if the group size is a multiple
+         of it.  */
+      if (n == const_nunits)
+        {
+          nloads = 1;
+          lnel = const_nunits;
+          ltype = vectype;
+        }
+      /* Else use the biggest vector we can load the group without
+         accessing excess elements.  */
+      else if (n > 1)
         {
-          /* First check if vec_init optab supports construction from vector
-             elts directly.  Otherwise avoid emitting a constructor of
-             vector elements by performing the loads using an integer type
-             of the same size, constructing a vector of those and then
-             re-interpreting it as the original vector type.  This avoids a
-             huge runtime penalty due to the general inability to perform
-             store forwarding from smaller stores to a larger load.  */
          tree ptype;
          tree vtype
-           = vector_vector_composition_type (vectype,
-                                             const_nunits / group_size,
+           = vector_vector_composition_type (vectype, const_nunits / n,
                                              &ptype);
          if (vtype != NULL_TREE)
            {
-             nloads = const_nunits / group_size;
-             lnel = group_size;
+             nloads = const_nunits / n;
+             lnel = n;
              lvectype = vtype;
              ltype = ptype;
            }
        }
-      else
-        {
-          nloads = 1;
-          lnel = const_nunits;
-          ltype = vectype;
-        }
+      /* Else fall back to the default element-wise access.  */
       ltype = build_aligned_type (ltype, TYPE_ALIGN (TREE_TYPE (vectype)));
     }
   /* Load vector(1) scalar_type if it's 1 element-wise vectype.  */
-- 
2.35.3