Is the following patch ok? It passes bootstrap and make check.
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 88142a8..91f6f21 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -42807,6 +42807,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
return true;
}
+static bool expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d);
+
/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D
in a single instruction. */
@@ -42946,6 +42948,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_pshufb (d))
return true;
+ /* Try the AVX2 vpshufb. */
+ if (expand_vec_perm_vpshufb2_vpermq (d))
+ return true;
+
/* Try the AVX512F vpermi2 instructions. */
rtx vec[64];
enum machine_mode mode = d->vmode;
@@ -43004,7 +43010,7 @@ expand_vec_perm_pshuflw_pshufhw (struct
expand_vec_perm_d *d)
}
/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify
- the permutation using the SSSE3 palignr instruction. This succeeds
+ the permutation using the SSSE3/AVX2 palignr instruction. This succeeds
when all of the elements in PERM fit within one vector and we merely
need to shift them down so that a single vector permutation has a
chance to succeed. */
@@ -43015,14 +43021,26 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d)
unsigned i, nelt = d->nelt;
unsigned min, max;
bool in_order, ok;
- rtx shift, target;
+ rtx shift, shift1, target, tmp;
struct expand_vec_perm_d dcopy;
- /* Even with AVX, palignr only operates on 128-bit vectors. */
- if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
+ /* SSSE3 is required to apply PALIGNR on 16 bytes operands. */
+ if (GET_MODE_SIZE (d->vmode) == 16)
+ {
+ if (!TARGET_SSSE3)
+ return false;
+ }
+ /* AVX2 is required to apply PALIGNR on 32 bytes operands. */
+ else if (GET_MODE_SIZE (d->vmode) == 32)
+ {
+ if (!TARGET_AVX2)
+ return false;
+ }
+ /* Other sizes are not supported. */
+ else
return false;
- min = nelt, max = 0;
+ min = 2 * nelt, max = 0;
for (i = 0; i < nelt; ++i)
{
unsigned e = d->perm[i];
@@ -43041,9 +43059,35 @@ expand_vec_perm_palignr (struct expand_vec_perm_d *d)
dcopy = *d;
shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
- target = gen_reg_rtx (TImode);
- emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
- gen_lowpart (TImode, d->op0), shift));
+ shift1 = GEN_INT ((min - nelt / 2) *
+ GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
+
+ if (GET_MODE_SIZE (d->vmode) != 32)
+ {
+ target = gen_reg_rtx (TImode);
+ emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
+ gen_lowpart (TImode, d->op0), shift));
+ }
+ else
+ {
+ target = gen_reg_rtx (V2TImode);
+ tmp = gen_reg_rtx (V4DImode);
+ emit_insn (gen_avx2_permv2ti (tmp,
+ gen_lowpart (V4DImode, d->op0),
+ gen_lowpart (V4DImode, d->op1),
+ GEN_INT (33)));
+ if (min < nelt / 2)
+ emit_insn (gen_avx2_palignrv2ti (target,
+ gen_lowpart (V2TImode, tmp),
+ gen_lowpart (V2TImode, d->op0),
+ shift));
+ else
+ emit_insn (gen_avx2_palignrv2ti (target,
+ gen_lowpart (V2TImode, d->op1),
+ gen_lowpart (V2TImode, tmp),
+ shift1));
+ }
+
dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
dcopy.one_operand_p = true;
On Tue, Apr 29, 2014 at 1:03 AM, Richard Henderson <[email protected]> wrote:
> On 04/28/2014 01:43 PM, Evgeny Stupachenko wrote:
>> Agree on checks:
>>
>> /* PALIGNR of 2 128-bits registers takes only 1 instrucion.
>> Requires SSSE3. */
>> if (GET_MODE_SIZE (d->vmode) == 16)
>> {
>> if(!TARGET_SSSE3)
>> return false;
>> }
>> /* PALIGNR of 2 256-bits registers on AVX2 costs only 2 instructions:
>> PERM and PALIGNR. It is more profitable than 2 PSHUFB and PERM. */
>> else if (GET_MODE_SIZE (d->vmode) == 32)
>> {
>> if(!TARGET_AVX2)
>> return false;
>> }
>> else
>> return false;
>
> Thanks, much better.
>
>
> r~