Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Ready to push to trunk.
gcc/ChangeLog:

	PR target/113090
	* config/i386/i386-expand.cc
	(expand_vec_perm_punpckldq_pshuf): New function.
	(ix86_expand_vec_perm_const_1): Try
	expand_vec_perm_punpckldq_pshuf for sequence of 2
	instructions.

gcc/testsuite/ChangeLog:

	* gcc.target/i386/pr113090.c: New test.
---
 gcc/config/i386/i386-expand.cc           | 81 ++++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr113090.c | 25 +++++++++
 2 files changed, 106 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr113090.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 8bb8f21e686..fd49d866004 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -20813,6 +20813,84 @@ expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
   return true;
 }
 
+/* Try to permute 2 64-bit vectors by punpckldq + 128-bit vector shuffle.
+   D->op0 and D->op1 are concatenated into one 128-bit vector, a
+   one-operand permutation of that vector is expanded by
+   expand_vec_perm_1, and the low 64 bits of the result are moved to
+   D->target.  Return true on success.  If D->testing_p, only check the
+   mode and target requirements and emit no insns.  */
+static bool
+expand_vec_perm_punpckldq_pshuf (struct expand_vec_perm_d *d)
+{
+  if (GET_MODE_BITSIZE (d->vmode) != 64
+      || !TARGET_MMX_WITH_SSE
+      || d->one_operand_p)
+    return false;
+
+  machine_mode widen_vmode;
+  switch (d->vmode)
+    {
+    /* pshufd.  */
+    case E_V2SImode:
+      widen_vmode = V4SImode;
+      break;
+
+    /* pshufd.  */
+    case E_V2SFmode:
+      widen_vmode = V4SFmode;
+      break;
+
+    case E_V4HImode:
+      widen_vmode = V8HImode;
+      /* pshufb.  */
+      if (!TARGET_SSSE3)
+	return false;
+      break;
+
+    case E_V8QImode:
+      /* pshufb.  */
+      widen_vmode = V16QImode;
+      if (!TARGET_SSSE3)
+	return false;
+      break;
+
+    default:
+      return false;
+    }
+
+  if (d->testing_p)
+    return true;
+
+  struct expand_vec_perm_d dperm;
+  dperm.target = gen_reg_rtx (widen_vmode);
+  rtx op0 = gen_reg_rtx (widen_vmode);
+  emit_move_insn (op0, gen_rtx_VEC_CONCAT (widen_vmode, d->op0, d->op1));
+  dperm.op0 = op0;
+  dperm.op1 = op0;
+  dperm.vmode = widen_vmode;
+  unsigned nelt = GET_MODE_NUNITS (widen_vmode);
+  dperm.nelt = nelt;
+  dperm.one_operand_p = true;
+  dperm.testing_p = false;
+
+  /* Only the low half of the widened result is used, so the high half
+     simply repeats the low-half selectors.  */
+  for (unsigned i = 0; i != nelt / 2; i++)
+    {
+      dperm.perm[i] = d->perm[i];
+      dperm.perm[i + nelt / 2] = d->perm[i];
+    }
+
+  /* Call expand_vec_perm_1 outside of gcc_assert so that the insns are
+     emitted even if the assertion macro does not evaluate its operand.  */
+  bool ok = expand_vec_perm_1 (&dperm);
+  gcc_assert (ok);
+  emit_move_insn (d->target, lowpart_subreg (d->vmode,
+					     dperm.target,
+					     dperm.vmode));
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Try to simplify
    the permutation using the SSSE3 palignr instruction.  This succeeds
    when all of the elements in PERM fit within one vector and we merely
@@ -23325,6 +23403,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_shufps_shufps (d))
     return true;
 
+  if (expand_vec_perm_punpckldq_pshuf (d))
+    return true;
+
   /* Try sequences of three instructions.  */
 
   if (expand_vec_perm_even_odd_pack (d))
diff --git a/gcc/testsuite/gcc.target/i386/pr113090.c b/gcc/testsuite/gcc.target/i386/pr113090.c
new file mode 100644
index 00000000000..0f0b7cc0084
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr113090.c
@@ -0,0 +1,25 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -msse4.1" } */
+/* { dg-final { scan-assembler-times "pshufd" 3 } } */
+
+typedef int v2si __attribute__((vector_size(8)));
+typedef short v4hi __attribute__((vector_size(8)));
+typedef char v8qi __attribute__((vector_size(8)));
+
+v2si
+foo (v2si a, v2si b)
+{
+  return __builtin_shufflevector (a, b, 1, 2);
+}
+
+v4hi
+foo1 (v4hi a, v4hi b)
+{
+  return __builtin_shufflevector (a, b, 2, 3, 4, 5);
+}
+
+v8qi
+foo2 (v8qi a, v8qi b)
+{
+  return __builtin_shufflevector (a, b, 4, 5, 6, 7, 8, 9, 10, 11);
+}
-- 
2.31.1