On Mon, Apr 28, 2014 at 9:48 AM, Evgeny Stupachenko <evstu...@gmail.com> wrote: > Hi, > > The patch enables use of "palignr with perm" instead of "2 pshufb, or > and perm" at AVX2 for some cases. > > Bootstrapped and passes make check on x86. > > Is it ok? > > 2014-04-28 Evgeny Stupachenko <evstu...@gmail.com> > > * config/i386/i386.c (expand_vec_perm_1): Try AVX2 vpshufb. > * config/i386/i386.c (expand_vec_perm_palignr): Extend to use AVX2 > PALINGR instruction.
Can you add testcases to verify that AVX2 vpshufb and paligngr are properly generated? > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index 88142a8..ae80477 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -42807,6 +42807,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) > return true; > } > > +static bool expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d); > + > /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D > in a single instruction. */ > > @@ -42946,6 +42948,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) > if (expand_vec_perm_pshufb (d)) > return true; > > + /* Try the AVX2 vpshufb. */ > + if (expand_vec_perm_vpshufb2_vpermq (d)) > + return true; > + > /* Try the AVX512F vpermi2 instructions. */ > rtx vec[64]; > enum machine_mode mode = d->vmode; > @@ -43004,7 +43010,7 @@ expand_vec_perm_pshuflw_pshufhw (struct > expand_vec_perm_d *d) > } > > /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify > - the permutation using the SSSE3 palignr instruction. This succeeds > + the permutation using the SSSE3/AVX2 palignr instruction. This succeeds > when all of the elements in PERM fit within one vector and we merely > need to shift them down so that a single vector permutation has a > chance to succeed. */ > @@ -43015,14 +43021,20 @@ expand_vec_perm_palignr (struct expand_vec_perm_d > *d) > unsigned i, nelt = d->nelt; > unsigned min, max; > bool in_order, ok; > - rtx shift, target; > + rtx shift, shift1, target, tmp; > struct expand_vec_perm_d dcopy; > > - /* Even with AVX, palignr only operates on 128-bit vectors. */ > - if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) > + /* PALIGNR of 2 256-bits registers on AVX2 costs only 2 instructions: > + PERM and PALIGNR. It is more profitable than 2 PSHUFB and PERM. > + PALIGNR of 2 128-bits registers takes only 1 instrucion. */ > + if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16 && ^^^ This should be only on the next lined. > + GET_MODE_SIZE (d->vmode) != 32)) > + return false; > + /* Only AVX2 or higher support PALIGNR on 256-bits registers. */ > + if (!TARGET_AVX2 && (GET_MODE_SIZE (d->vmode) == 32)) ^ No need for '(' here. Or you can use if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32)) > return false; > > - min = nelt, max = 0; > + min = 2 * nelt, max = 0; ^^^^^^^^^^^^^^^^^^^^^^^^^^ Will this change 128bit vector code? > for (i = 0; i < nelt; ++i) > { > unsigned e = d->perm[i]; > @@ -43041,9 +43053,34 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d) > > dcopy = *d; > shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode))); > - target = gen_reg_rtx (TImode); > - emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1), > - gen_lowpart (TImode, d->op0), shift)); > + shift1 = GEN_INT ((min - nelt / 2) * > + GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode))); > + > + if (GET_MODE_SIZE (d->vmode) != 32) > + { > + target = gen_reg_rtx (TImode); > + emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1), > + gen_lowpart (TImode, d->op0), shift)); > + } > + else > + { > + target = gen_reg_rtx (V2TImode); > + tmp = gen_reg_rtx (V4DImode); > + emit_insn (gen_avx2_permv2ti (tmp, > + gen_lowpart (V4DImode, d->op0), > + gen_lowpart (V4DImode, d->op1), > + GEN_INT (33))); > + if (min < nelt / 2) > + emit_insn (gen_avx2_palignrv2ti (target, > + gen_lowpart (V2TImode, tmp), > + gen_lowpart (V2TImode, d->op0), > + shift)); > + else > + emit_insn (gen_avx2_palignrv2ti (target, > + gen_lowpart (V2TImode, d->op1), > + gen_lowpart (V2TImode, tmp), > + shift1)); > + } > > dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target); > dcopy.one_operand_p = true; > > > Evgeny -- H.J.