On Mon, May 9, 2022 at 1:22 PM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> The basic sequence is psrldq + pslldq + por (3 instructions).  When
> pand/pandn are additionally needed to clear the upper/lower bits of
> the operands, the permutation takes 4-5 instructions, which is still
> better than scalar code.
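>
> For reference, here is a rough hand-written SSE2 equivalent of the
> 5-instruction case.  It corresponds to foo3 in the new test (bytes
> 5..14 of the first operand followed by bytes 1..6 of the second);
> the helper name and the way the mask constant is built are only
> illustrative:
>
>   #include <emmintrin.h>
>
>   static __m128i
>   foo3_by_hand (__m128i a, __m128i b)
>   {
>     __m128i op0 = _mm_srli_si128 (a, 5);  /* psrldq: a[5..15] into bytes 0..10.  */
>     __m128i op1 = _mm_slli_si128 (b, 9);  /* pslldq: b[0..6] into bytes 9..15.  */
>     /* Ones in bytes 0..9, zeros in bytes 10..15.  */
>     __m128i mask = _mm_set_epi8 (0, 0, 0, 0, 0, 0, -1, -1,
>                                  -1, -1, -1, -1, -1, -1, -1, -1);
>     op0 = _mm_and_si128 (op0, mask);     /* pand: drop the stray a[15].  */
>     op1 = _mm_andnot_si128 (mask, op1);  /* pandn: drop the stray b[0].  */
>     return _mm_or_si128 (op0, op1);      /* por: combine the two halves.  */
>   }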
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}.
> Ok for trunk?
>
>
> gcc/ChangeLog:
>
>         PR target/105354
>         * config/i386/i386-expand.cc
>         (expand_vec_perm_pslldq_psrldq_por): New function.
>         (ix86_expand_vec_perm_const_1): Try
>         expand_vec_perm_pslldq_psrldq_por for both 3-instruction and
>         4/5-instruction sequences.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/i386/pr105354-1.c: New test.
>         * gcc.target/i386/pr105354-2.c: New test.
> ---
>  gcc/config/i386/i386-expand.cc             | 109 +++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr105354-1.c | 130 +++++++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr105354-2.c | 110 +++++++++++++++++
>  3 files changed, 349 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr105354-1.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr105354-2.c
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index bc806ffa283..49231e964ba 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -20941,6 +20941,108 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
>    return true;
>  }
>
> +/* Implement permutation with pslldq + psrldq + por when pshufb is not
> +   available.  */
> +static bool
> +expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
> +{
> +  unsigned i, nelt = d->nelt;
> +  unsigned start1, end1 = -1;
> +  machine_mode vmode = d->vmode, imode;
> +  int start2 = -1;
> +  bool clear_op0, clear_op1;
> +  unsigned inner_size;
> +  rtx op0, op1, dop1;
> +  rtx (*gen_vec_shr) (rtx, rtx, rtx);
> +  rtx (*gen_vec_shl) (rtx, rtx, rtx);
> +
> +  /* pshufb is available under TARGET_SSSE3.  */
> +  if (TARGET_SSSE3 || !TARGET_SSE2
> +      /* pshufd can be used for V4SI/V2DI under TARGET_SSE2.  */
> +      || (vmode != E_V16QImode && vmode != E_V8HImode))
> +    return false;
> +
> +  start1 = d->perm[0];
> +  for (i = 1; i < nelt; i++)
> +    {
> +      if (d->perm[i] != d->perm[i-1] + 1)
> +       {
> +         if (start2 == -1)
> +           {
> +             start2 = d->perm[i];
> +             end1 = d->perm[i-1];
> +           }
> +         else
> +           return false;
> +       }
> +      else if (d->perm[i] >= nelt
> +              && start2 == -1)
> +       {
> +         start2 = d->perm[i];
> +         end1 = d->perm[i-1];
> +       }
> +    }
> +
> +  clear_op0 = end1 != nelt - 1;
> +  clear_op1 = start2 % nelt != 0;
> +  /* pand/pandn is needed to clear the upper/lower bits of op0/op1.  */
> +  if (!pandn && (clear_op0 || clear_op1))
> +    return false;
> +
> +  if (d->testing_p)
> +    return true;
> +
> +  gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
> +  gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
> +  imode = GET_MODE_INNER (vmode);
> +  inner_size = GET_MODE_BITSIZE (imode);
> +  op0 = gen_reg_rtx (vmode);
> +  op1 = gen_reg_rtx (vmode);
> +
> +  if (start1)
> +    emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
> +  else
> +    emit_move_insn (op0, d->op0);
> +
> +  dop1 = d->op1;
> +  if (d->one_operand_p)
> +    dop1 = d->op0;
> +
> +  int shl_offset = end1 - start1 + 1 - start2 % nelt;
> +  if (shl_offset)
> +    emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
> +  else
> +    emit_move_insn (op1, dop1);
> +
> +  /* Clear the upper/lower bits of op0/op1.  */
> +  if (clear_op0 || clear_op1)
> +    {
> +      rtx vec[16];
> +      rtx const_vec;
> +      rtx clear;
> +      for (i = 0; i != nelt; i++)
> +       {
> +         if (i < (end1 - start1 + 1))
> +           vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode);
> +         else
> +           vec[i] = CONST0_RTX (imode);
> +       }
> +      const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
> +      const_vec = validize_mem (force_const_mem (vmode, const_vec));
> +      clear = force_reg (vmode, const_vec);
> +
> +      if (clear_op0)
> +       emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
> +      if (clear_op1)
> +       emit_move_insn (op1, gen_rtx_AND (vmode,
> +                                         gen_rtx_NOT (vmode, clear),
> +                                         op1));
> +    }
> +
> +  emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
> +  return true;
> +}
> +
>  /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
>     and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
>     operands with two "and" and "pack" or two "shift" and "pack" insns.
> @@ -21853,6 +21955,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>    if (expand_vec_perm_pshufb2 (d))
>      return true;
>
> +  if (expand_vec_perm_pslldq_psrldq_por (d, false))
> +    return true;
> +
>    if (expand_vec_perm_interleave3 (d))
>      return true;
>
> @@ -21891,6 +21996,10 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>    if (expand_vec_perm_even_odd (d))
>      return true;
>
> +  /* Generate four or five instructions.  */
> +  if (expand_vec_perm_pslldq_psrldq_por (d, true))
> +    return true;
> +
>    /* Even longer sequences.  */
>    if (expand_vec_perm_vpshufb4_vpermq2 (d))
>      return true;
> diff --git a/gcc/testsuite/gcc.target/i386/pr105354-1.c b/gcc/testsuite/gcc.target/i386/pr105354-1.c
> new file mode 100644
> index 00000000000..8d91ded7420
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr105354-1.c
> @@ -0,0 +1,130 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-ssse3" } */
> +/* { dg-final { scan-assembler-times {(?n)psrldq[\t ]+} 16 } } */
> +/* { dg-final { scan-assembler-times {(?n)pslldq[\t ]+} 16 } } */
> +/* { dg-final { scan-assembler-times {(?n)por[\t ]+} 16 } } */
> +/* { dg-final { scan-assembler-times {(?n)pandn[\t ]+} 8 } } */
> +/* { dg-final { scan-assembler-times {(?n)pand[\t ]+} 8 } } */
> +
> +typedef short v8hi __attribute__((vector_size (16)));
> +typedef char v16qi __attribute__((vector_size (16)));
> +
> +v16qi
> +__attribute__((noipa))
> +foo (v16qi a, v16qi b)
> +{
> +  return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12,
> +                                 13, 14, 15, 16, 17, 18, 19, 20);
> +}
> +
> +v16qi
> +__attribute__((noipa))
> +foo1 (v16qi a, v16qi b)
> +{
> +  return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12,
> +                                 13, 14, 15, 18, 19, 20, 21, 22);
> +}
> +
> +v16qi
> +__attribute__((noipa))
> +foo2 (v16qi a, v16qi b)
> +{
> +  return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12,
> +                                 13, 14, 16, 17, 18, 19, 20, 21);
> +}
> +
> +v16qi
> +__attribute__((noipa))
> +foo3 (v16qi a, v16qi b)
> +{
> +  return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12,
> +                                 13, 14, 17, 18, 19, 20, 21, 22);
> +}
> +
> +v8hi
> +__attribute__((noipa))
> +foo4 (v8hi a, v8hi b)
> +{
> +  return __builtin_shufflevector (a, b, 5, 6, 7, 8, 9, 10, 11, 12);
> +}
> +
> +v8hi
> +__attribute__((noipa))
> +foo5 (v8hi a, v8hi b)
> +{
> +  return __builtin_shufflevector (a, b, 5, 6, 7, 9, 10, 11, 12, 13);
> +}
> +
> +v8hi
> +__attribute__((noipa))
> +foo6 (v8hi a, v8hi b)
> +{
> +  return __builtin_shufflevector (a, b, 5, 6, 8, 9, 10, 11, 12, 13);
> +}
> +
> +v8hi
> +__attribute__((noipa))
> +foo7 (v8hi a, v8hi b)
> +{
> +  return __builtin_shufflevector (a, b, 5, 6, 9, 10, 11, 12, 13, 14);
> +}
> +
> +v16qi
> +__attribute__((noipa))
> +foo8 (v16qi a)
> +{
> +  return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12,
> +                                 13, 14, 15, 16, 17, 18, 19, 20);
> +}
> +
> +v16qi
> +__attribute__((noipa))
> +foo9 (v16qi a)
> +{
> +  return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12,
> +                                 13, 14, 15, 18, 19, 20, 21, 22);
> +}
> +
> +v16qi
> +__attribute__((noipa))
> +foo10 (v16qi a)
> +{
> +  return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12,
> +                                 13, 14, 16, 17, 18, 19, 20, 21);
> +}
> +
> +v16qi
> +__attribute__((noipa))
> +foo11 (v16qi a)
> +{
> +  return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12,
> +                                 13, 14, 17, 18, 19, 20, 21, 22);
> +}
> +
> +v8hi
> +__attribute__((noipa))
> +foo12 (v8hi a)
> +{
> +  return __builtin_shufflevector (a, a, 5, 6, 7, 8, 9, 10, 11, 12);
> +}
> +
> +v8hi
> +__attribute__((noipa))
> +foo13 (v8hi a)
> +{
> +  return __builtin_shufflevector (a, a, 5, 6, 7, 9, 10, 11, 12, 13);
> +}
> +
> +v8hi
> +__attribute__((noipa))
> +foo14 (v8hi a)
> +{
> +  return __builtin_shufflevector (a, a, 5, 6, 8, 9, 10, 11, 12, 13);
> +}
> +
> +v8hi
> +__attribute__((noipa))
> +foo15 (v8hi a)
> +{
> +  return __builtin_shufflevector (a, a, 5, 6, 9, 10, 11, 12, 13, 14);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr105354-2.c b/gcc/testsuite/gcc.target/i386/pr105354-2.c
> new file mode 100644
> index 00000000000..b78b62e1e7e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr105354-2.c
> @@ -0,0 +1,110 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -msse2 -mno-ssse3" } */
> +/* { dg-require-effective-target sse2 } */
> +
> +#include "sse2-check.h"
> +
> +#include "pr105354-1.c"
> +void
> +sse2_test (void)
> +{
> +  union128i_b a, b, res_ab, exp_ab;
> +  union128i_w c, d, res_cd, exp_cd;
> +
> +  for (int i = 0; i != 16; i++)
> +    {
> +      a.a[i] = i;
> +      b.a[i] = i + 16;
> +      res_ab.a[i] = 0;
> +      exp_ab.a[i] = -1;
> +      if (i < 8)
> +       {
> +         c.a[i] = i;
> +         d.a[i] = i + 8;
> +         res_cd.a[i] = 0;
> +         exp_cd.a[i] = -1;
> +       }
> +    }
> +
> +  res_ab.x = (__m128i)foo ((v16qi)a.x, (v16qi)b.x);
> +  exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20 };
> +  if (check_union128i_b (exp_ab, res_ab.a))
> +    abort ();
> +
> +  exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22 };
> +  res_ab.x = (__m128i)foo1 ((v16qi)a.x, (v16qi)b.x);
> +  if (check_union128i_b (exp_ab, res_ab.a))
> +    abort();
> +
> +  exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21 };
> +  res_ab.x = (__m128i)foo2 ((v16qi)a.x, (v16qi)b.x);
> +  if (check_union128i_b (exp_ab, res_ab.a))
> +    abort();
> +
> +  exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22 };
> +  res_ab.x = (__m128i)foo3 ((v16qi)a.x, (v16qi)b.x);
> +  if (check_union128i_b (exp_ab, res_ab.a))
> +    abort();
> +
> +  res_ab.x = (__m128i)foo8 ((v16qi)a.x);
> +  exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4 };
> +  if (check_union128i_b (exp_ab, res_ab.a))
> +    abort ();
> +
> +  exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2, 3, 4, 5, 6 };
> +  res_ab.x = (__m128i)foo9 ((v16qi)a.x);
> +  if (check_union128i_b (exp_ab, res_ab.a))
> +    abort();
> +
> +  exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, 1, 2, 3, 4, 5 };
> +  res_ab.x = (__m128i)foo10 ((v16qi)a.x);
> +  if (check_union128i_b (exp_ab, res_ab.a))
> +    abort();
> +
> +  exp_ab.x = __extension__(__m128i) (v16qi) { 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1, 2, 3, 4, 5, 6 };
> +  res_ab.x = (__m128i)foo11 ((v16qi)a.x);
> +  if (check_union128i_b (exp_ab, res_ab.a))
> +    abort();
> +
> +  res_cd.x = (__m128i)foo4 ((v8hi)c.x, (v8hi)d.x);
> +  exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 8, 9, 10, 11, 12 };
> +  if (check_union128i_w (exp_cd, res_cd.a))
> +    abort ();
> +
> +  exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 9, 10, 11, 12, 13 };
> +  res_cd.x = (__m128i)foo5 ((v8hi)c.x, (v8hi)d.x);
> +  if (check_union128i_w (exp_cd, res_cd.a))
> +    abort();
> +
> +  exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 8, 9, 10, 11, 12, 13 };
> +  res_cd.x = (__m128i)foo6 ((v8hi)c.x, (v8hi)d.x);
> +  if (check_union128i_w (exp_cd, res_cd.a))
> +    abort();
> +
> +  res_cd.x = (__m128i)foo7 ((v8hi)c.x, (v8hi)d.x);
> +  exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 9, 10, 11, 12, 13, 14 };
> +  if (check_union128i_w (exp_cd, res_cd.a))
> +    abort ();
> +
> +  exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 0, 1, 2, 3, 4 };
> +  res_cd.x = (__m128i)foo12 ((v8hi)c.x);
> +  if (check_union128i_w (exp_cd, res_cd.a))
> +    abort();
> +
> +  exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 7, 1, 2, 3, 4, 5 };
> +  res_cd.x = (__m128i)foo13 ((v8hi)c.x);
> +  if (check_union128i_w (exp_cd, res_cd.a))
> +    abort();
> +
> +  exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 0, 1, 2, 3, 4, 5 };
> +  res_cd.x = (__m128i)foo14 ((v8hi)c.x);
> +  if (check_union128i_w (exp_cd, res_cd.a))
> +    abort();
> +
> +  exp_cd.x = __extension__(__m128i) (v8hi) { 5, 6, 1, 2, 3, 4, 5, 6 };
> +  res_cd.x = (__m128i)foo15 ((v8hi)c.x);
> +  if (check_union128i_w (exp_cd, res_cd.a))
> +    abort();
> +
> +}
> +
> --
> 2.18.1
>


-- 
BR,
Hongtao
