To avoid any misunderstanding: I haven't committed this obvious fix yet. Is it OK to commit?
On Wed, Nov 12, 2014 at 2:15 PM, Evgeny Stupachenko <evstu...@gmail.com> wrote: > Committed r217359. > However, it turned out that AVX2 uses vperm2i128 for the shift here > (instead of palignr for SSSE3/AVX). To handle the AVX2 case we need to > modify the test case: > > diff --git a/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c > b/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c > index 1fbd258..020e983 100644 > --- a/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c > +++ b/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c > @@ -19,4 +19,4 @@ pair_mul_sum(byte *in, byte *out, int size) > } > } > > -/* { dg-final { scan-assembler "palignr" } } */ > +/* { dg-final { scan-assembler "perm2i128|palignr" } } */ > > On Tue, Nov 11, 2014 at 5:28 PM, Richard Biener > <richard.guent...@gmail.com> wrote: >> On Tue, Nov 11, 2014 at 3:21 PM, Evgeny Stupachenko <evstu...@gmail.com> >> wrote: >>> Hi, >>> >>> The patch extends the shift permutation technique to power-of-2 cases >>> (previously even/odd transformations were used unconditionally). >>> Basically the patch just adds a loop for the load group of length 2 case, as >>> is done in the "vect_permute_load_chain" function. >>> >>> For Silvermont it reduces the insn sequence for a load group of length 4 >>> from 31 to 20 insns. >>> Performance for the test in the patch improved by ~20%. >>> >>> Bootstrap passed. >>> Make check is in progress. >>> >>> Is it ok? >> >> Ok. >> >> Thanks, >> Richard. >> >>> 2014-11-11 Evgeny Stupachenko <evstu...@gmail.com> >>> >>> gcc/testsuite >>> * gcc.target/i386/pr52252-atom-1.c: New. >>> >>> gcc/ >>> * tree-vect-data-refs.c (vect_shift_permute_load_chain): Extend >>> shift >>> permutations on power of 2 cases. 
>>> >>> diff --git a/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c >>> b/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c >>> new file mode 100644 >>> index 0000000..1fbd258 >>> --- /dev/null >>> +++ b/gcc/testsuite/gcc.target/i386/pr52252-atom-1.c >>> @@ -0,0 +1,22 @@ >>> +/* { dg-do compile } */ >>> +/* { dg-require-effective-target ssse3 } */ >>> +/* { dg-options "-O2 -ftree-vectorize -mssse3 -mtune=slm" } */ >>> +#define byte unsigned char >>> + >>> +void >>> +pair_mul_sum(byte *in, byte *out, int size) >>> +{ >>> + int j; >>> + for(j = 0; j < size; j++) >>> + { >>> + byte a = in[0]; >>> + byte b = in[1]; >>> + byte c = in[2]; >>> + byte d = in[3]; >>> + out[0] = (byte)(a * b) + (byte)(b * c) + (byte)(c * d) + (byte)(d * >>> a); >>> + in += 4; >>> + out += 1; >>> + } >>> +} >>> + >>> +/* { dg-final { scan-assembler "palignr" } } */ >>> diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c >>> index 0bc0356..d2e0e93 100644 >>> --- a/gcc/tree-vect-data-refs.c >>> +++ b/gcc/tree-vect-data-refs.c >>> @@ -5379,8 +5379,9 @@ vect_shift_permute_load_chain (vec<tree> dr_chain, >>> memcpy (result_chain->address (), dr_chain.address (), >>> length * sizeof (tree)); >>> >>> - if (length == 2 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4) >>> + if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4) >>> { >>> + unsigned int j, log_length = exact_log2 (length); >>> for (i = 0; i < nelt / 2; ++i) >>> sel[i] = i * 2; >>> for (i = 0; i < nelt / 2; ++i) >>> @@ -5441,37 +5442,44 @@ vect_shift_permute_load_chain (vec<tree> dr_chain, >>> select_mask = vect_gen_perm_mask (vectype, sel); >>> gcc_assert (select_mask != NULL); >>> >>> - first_vect = dr_chain[0]; >>> - second_vect = dr_chain[1]; >>> - >>> - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); >>> - perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, >>> - first_vect, first_vect, >>> - perm2_mask1); >>> - vect_finish_stmt_generation (stmt, perm_stmt, gsi); >>> - 
vect[0] = data_ref; >>> + for (i = 0; i < log_length; i++) >>> + { >>> + for (j = 0; j < length; j += 2) >>> + { >>> + first_vect = dr_chain[j]; >>> + second_vect = dr_chain[j + 1]; >>> >>> - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2"); >>> - perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, >>> - second_vect, second_vect, >>> - perm2_mask2); >>> - vect_finish_stmt_generation (stmt, perm_stmt, gsi); >>> - vect[1] = data_ref; >>> + data_ref = make_temp_ssa_name (vectype, NULL, >>> "vect_shuffle2"); >>> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, >>> data_ref, >>> + first_vect, >>> first_vect, >>> + perm2_mask1); >>> + vect_finish_stmt_generation (stmt, perm_stmt, gsi); >>> + vect[0] = data_ref; >>> >>> - data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift"); >>> - perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, >>> - vect[0], vect[1], >>> - shift1_mask); >>> - vect_finish_stmt_generation (stmt, perm_stmt, gsi); >>> - (*result_chain)[1] = data_ref; >>> + data_ref = make_temp_ssa_name (vectype, NULL, >>> "vect_shuffle2"); >>> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, >>> data_ref, >>> + second_vect, >>> second_vect, >>> + perm2_mask2); >>> + vect_finish_stmt_generation (stmt, perm_stmt, gsi); >>> + vect[1] = data_ref; >>> >>> - data_ref = make_temp_ssa_name (vectype, NULL, "vect_select"); >>> - perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref, >>> - vect[0], vect[1], >>> - select_mask); >>> - vect_finish_stmt_generation (stmt, perm_stmt, gsi); >>> - (*result_chain)[0] = data_ref; >>> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift"); >>> + perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, >>> data_ref, >>> + vect[0], vect[1], >>> + shift1_mask); >>> + vect_finish_stmt_generation (stmt, perm_stmt, gsi); >>> + (*result_chain)[j/2 + length/2] = data_ref; >>> >>> + data_ref = make_temp_ssa_name (vectype, NULL, "vect_select"); >>> + perm_stmt = 
gimple_build_assign_with_ops (VEC_PERM_EXPR, >>> data_ref, >>> + vect[0], vect[1], >>> + select_mask); >>> + vect_finish_stmt_generation (stmt, perm_stmt, gsi); >>> + (*result_chain)[j/2] = data_ref; >>> + } >>> + memcpy (dr_chain.address (), result_chain->address (), >>> + length * sizeof (tree)); >>> + } >>> return true; >>> } >>> if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)