The "expand_vec_perm_palignr" routine is similar for the SSSE3 and AVX2 cases, but AVX2 requires more instructions to complete the scheme.
The patch below adds AVX2 support for six instructions, leaving SSSE3 for two. Is it ok? diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 2cffcef..70fc832 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -43130,23 +43130,38 @@ expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d) return true; } +static bool +expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d); + /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify - the permutation using the SSSE3 palignr instruction. This succeeds + the permutation using the SSSE3/AVX2 palignr instruction. This succeeds when all of the elements in PERM fit within one vector and we merely need to shift them down so that a single vector permutation has a chance to succeed. */ static bool -expand_vec_perm_palignr (struct expand_vec_perm_d *d) +expand_vec_perm_palignr (struct expand_vec_perm_d *d, int insn_num) { unsigned i, nelt = d->nelt; unsigned min, max; bool in_order, ok; - rtx shift, target; + rtx shift, shift1, target, tmp; struct expand_vec_perm_d dcopy; - /* Even with AVX, palignr only operates on 128-bit vectors. */ - if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) + /* SSSE3 is required to apply PALIGNR on 16 bytes operands. */ + if (GET_MODE_SIZE (d->vmode) == 16) + { + if (!TARGET_SSSE3) + return false; + } + /* AVX2 is required to apply PALIGNR on 32 bytes operands. */ + else if (GET_MODE_SIZE (d->vmode) == 32) + { + if (!TARGET_AVX2) + return false; + } + /* Other sizes are not supported. 
*/ + else return false; min = nelt, max = 0; @@ -43168,9 +43183,34 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d) dcopy = *d; shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode))); - target = gen_reg_rtx (TImode); - emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1), - gen_lowpart (TImode, d->op0), shift)); + shift1 = GEN_INT ((min - nelt / 2) + * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode))); + + if (GET_MODE_SIZE (d->vmode) != 32) + { + target = gen_reg_rtx (TImode); + emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1), + gen_lowpart (TImode, d->op0), shift)); + } + else + { + target = gen_reg_rtx (V2TImode); + tmp = gen_reg_rtx (V4DImode); + emit_insn (gen_avx2_permv2ti (tmp, + gen_lowpart (V4DImode, d->op0), + gen_lowpart (V4DImode, d->op1), + GEN_INT (33))); + if (min < nelt / 2) + emit_insn (gen_avx2_palignrv2ti (target, + gen_lowpart (V2TImode, tmp), + gen_lowpart (V2TImode, d->op0), + shift)); + else + emit_insn (gen_avx2_palignrv2ti (target, + gen_lowpart (V2TImode, d->op1), + gen_lowpart (V2TImode, tmp), + shift1)); + } dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target); dcopy.one_operand_p = true; @@ -43192,9 +43232,22 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d) return true; } - ok = expand_vec_perm_1 (&dcopy); - gcc_assert (ok); - + /* For SSSE3 we need 1 instruction for palignr plus 1 for one + operand permutation. */ + if (insn_num == 2) + { + ok = expand_vec_perm_1 (&dcopy); + gcc_assert (ok); + } + /* For AVX2 we need 2 instructions for the shift: vpalignr and + vperm plus 4 instructions for one operand permutation. 
*/ + else if (insn_num == 6) + { + ok = expand_vec_perm_vpshufb2_vpermq (&dcopy); + gcc_assert (ok); + } + else + ok = false; return ok; } @@ -44627,7 +44680,7 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_pshuflw_pshufhw (d)) return true; - if (expand_vec_perm_palignr (d)) + if (expand_vec_perm_palignr (d, 2)) return true; if (expand_vec_perm_interleave2 (d)) @@ -44680,6 +44733,10 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_even_odd (d)) return true; + /* Try sequences of six instructions. */ + if (expand_vec_perm_palignr (d, 6)) + return true; + /* Even longer sequences. */ if (expand_vec_perm_vpshufb4_vpermq2 (d)) return true; On Mon, May 19, 2014 at 7:32 PM, Richard Henderson <r...@redhat.com> wrote: > On 05/05/2014 09:49 AM, Evgeny Stupachenko wrote: >> @@ -42946,6 +42948,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) >> if (expand_vec_perm_pshufb (d)) >> return true; >> >> + /* Try the AVX2 vpshufb. */ >> + if (expand_vec_perm_vpshufb2_vpermq (d)) >> + return true; > > Why is this here? It doesn't expand to 1 insn, which is > what expand_vec_perm_1 is intended to check. > > It's already called from ix86_expand_vec_perm_const_1. > If things need to be shuffled around in that function, > then that's the right place to do so. > > It's also clearly unrelated to "palignr", so has no > business at all within this patch. > >> - min = nelt, max = 0; >> + min = 2 * nelt, max = 0; > > This change to min is wrong for this patch. It probably > belonged in your 2/2 patch. > >> + shift1 = GEN_INT ((min - nelt / 2) * >> + GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode))); > > Coding convention sez > > shift1 = GEN_INT ((min - nelt / 2) > * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode))); > > > r~ >