On Thu, May 9, 2024 at 11:12 AM Levy Hsu <ad...@levyhsu.com> wrote:
>
> Hi All
>
> We've introduced a new subroutine in ix86_expand_vec_perm_const_1
> to optimize byte-swapping vector shuffles for the V16QI and V8QI types on x86.
> This patch uses a three-instruction sequence psrlw, psllw, and por
> to handle specific vector shuffle operations more efficiently.
> The change aims to improve assembly code generation for configurations
> supporting SSE2.
>
> Bootstrapped and tested on x86_64-linux-gnu, OK for trunk?
>
> Best
> Levy
>
> gcc/ChangeLog:
>
>         PR target/107563
>         * config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): New
>         subroutine.
>         (ix86_expand_vec_perm_const_1): New Entry.

Please say (ix86_expand_vec_perm_const_1): Call expand_vec_perm_psrlw_psllw_por.

>
> gcc/testsuite/ChangeLog:
>
>         PR target/107563
>         * g++.target/i386/pr107563-a.C: New test.
>         * g++.target/i386/pr107563-b.C: New test.

OK with the above adjustment.

Thanks,
Uros.

> ---
>  gcc/config/i386/i386-expand.cc             | 64 ++++++++++++++++++++++
>  gcc/testsuite/g++.target/i386/pr107563-a.C | 13 +++++
>  gcc/testsuite/g++.target/i386/pr107563-b.C | 12 ++++
>  3 files changed, 89 insertions(+)
>  create mode 100755 gcc/testsuite/g++.target/i386/pr107563-a.C
>  create mode 100755 gcc/testsuite/g++.target/i386/pr107563-b.C
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index 2f27bfb484c..5098d2886bb 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -22362,6 +22362,67 @@ expand_vec_perm_2perm_pblendv (struct 
> expand_vec_perm_d *d, bool two_insn)
>    return true;
>  }
>
> +/* A subroutine of ix86_expand_vec_perm_const_1.
> +   Implement a one-operand permutation that swaps the two bytes of each
> +   16-bit lane with psrlw, psllw and por.  It handles the cases:
> +   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
> +   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6);
> +   Return true if D matches; insns are emitted unless D->testing_p.  */
> +
> +static bool
> +expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
> +{
> +  rtx (*gen_shr) (rtx, rtx, rtx);
> +  rtx (*gen_shl) (rtx, rtx, rtx);
> +  rtx (*gen_or) (rtx, rtx, rtx);
> +  machine_mode mode = VOIDmode;
> +
> +  if (!TARGET_SSE2 || !d->one_operand_p || !rtx_equal_p (d->op0, d->op1))
> +    return false;
> +
> +  switch (d->vmode)
> +    {
> +    case E_V8QImode:
> +      if (!TARGET_MMX_WITH_SSE)
> +       return false;
> +      mode = V4HImode;
> +      /* The right shift must be logical: psraw would smear the sign bit.  */
> +      gen_shr = gen_lshrv4hi3;
> +      gen_shl = gen_ashlv4hi3;
> +      gen_or = gen_iorv4hi3;
> +      break;
> +    case E_V16QImode:
> +      mode = V8HImode;
> +      gen_shr = gen_vlshrv8hi3;
> +      gen_shl = gen_vashlv8hi3;
> +      gen_or = gen_iorv8hi3;
> +      break;
> +    default: return false;
> +    }
> +
> +  /* Each adjacent even/odd pair of byte indices must be exchanged.  */
> +  for (unsigned i = 0; i < d->nelt; i += 2)
> +    if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
> +      return false;
> +
> +  if (d->testing_p)
> +    return true;
> +
> +  /* On 16-bit lanes, (x >> 8) | (x << 8) swaps the two bytes.  */
> +  rtx tmp1 = gen_reg_rtx (mode);
> +  rtx tmp2 = gen_reg_rtx (mode);
> +  rtx op0 = force_reg (d->vmode, d->op0);
> +
> +  emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
> +  emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
> +  emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
> +  emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
> +  emit_insn (gen_or (tmp1, tmp1, tmp2));
> +  emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
> +
> +  return true;
> +}
> +
>  /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
>     permutation using two vperm2f128, followed by a vshufpd insn blending
>     the two vectors together.  */
> @@ -23782,6 +23843,9 @@ ix86_expand_vec_perm_const_1 (struct 
> expand_vec_perm_d *d)
>    if (expand_vec_perm_2perm_pblendv (d, false))
>      return true;
>
> +  if (expand_vec_perm_psrlw_psllw_por (d))
> +    return true;
> +
>    /* Try sequences of four instructions.  */
>
>    if (expand_vec_perm_even_odd_trunc (d))
> diff --git a/gcc/testsuite/g++.target/i386/pr107563-a.C 
> b/gcc/testsuite/g++.target/i386/pr107563-a.C
> new file mode 100755
> index 00000000000..605c1bdf814
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/i386/pr107563-a.C
> @@ -0,0 +1,13 @@
> +/* PR target/107563.C */
> +/* { dg-do compile { target { ! ia32 } } } */
> +/* { dg-options "-std=c++2b -O3 -msse2" } */
> +/* { dg-final { scan-assembler-times "psllw" 1 } } */
> +/* { dg-final { scan-assembler-times "psrlw" 1 } } */
> +/* { dg-final { scan-assembler-times "por" 1 } } */
> +
> +using temp_vec_type2 [[__gnu__::__vector_size__(8)]] = char;
> +
> +void foo2(temp_vec_type2& v) noexcept
> +{
> +  v = __builtin_shufflevector(v, v, 1, 0, 3, 2, 5, 4, 7, 6);
> +}
> diff --git a/gcc/testsuite/g++.target/i386/pr107563-b.C 
> b/gcc/testsuite/g++.target/i386/pr107563-b.C
> new file mode 100755
> index 00000000000..0ce3e8263bb
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/i386/pr107563-b.C
> @@ -0,0 +1,12 @@
> +/* PR target/107563.C */
> +/* { dg-options "-std=c++2b -O3 -msse2" } */
> +/* { dg-final { scan-assembler-times "psllw" 1 } } */
> +/* { dg-final { scan-assembler-times "psrlw" 1 } } */
> +/* { dg-final { scan-assembler-times "por" 1 } } */
> +
> +using temp_vec_type [[__gnu__::__vector_size__(16)]] = char;
> +
> +void foo(temp_vec_type& v) noexcept
> +{
> +  v = __builtin_shufflevector(v, v, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 
> 13, 12, 15, 14);
> +}
> --
> 2.31.1
>

Reply via email to