Re: [PATCH] Optimize 128-bit vector permutation with pand, pandn and por.

Hongtao Liu Sun, 24 Nov 2024 21:04:40 -0800

On Wed, Nov 20, 2024 at 8:03 PM Cui, Lili <lili....@intel.com> wrote:
>
> Hi, all
>
> This patch aims to handle certain vector shuffle operations using pand, pandn 
> and por more efficiently.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk?
Although it's stage 3, I think this one is low risk, so Ok for trunk.
>
> Regards,
> Lili.
>
>
> This patch introduces a new subroutine in ix86_expand_vec_perm_const_1.
> On x86, use mixed constant permutation for V8HImode and V16QImode when
> SSE2 is supported. This patch handles certain vector shuffle operations
> more efficiently using pand, pandn and por. This change is intended to
> improve assembly code generation for configurations that support SSE2.
>
> gcc/ChangeLog:
>
>         PR target/116675
>         * config/i386/i386-expand.cc (expand_vec_perm_pand_pandn_por):
>         New subroutine.
>         (ix86_expand_vec_perm_const_1): Call expand_vec_perm_pand_pandn_por.
>
> gcc/testsuite/ChangeLog:
>
>         PR target/116675
>         * gcc.target/i386/pr116675.c: New test.
> ---
>  gcc/config/i386/i386-expand.cc           | 50 ++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr116675.c | 75 ++++++++++++++++++++++++
>  2 files changed, 125 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr116675.c
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index a6e6e738a52..f9fa0281298 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -23103,6 +23103,53 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
>    return true;
>  }
>
> +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement a
> +   permutation (which is a blend) with and, andnot and or when pshufb is not available.
> +
> +   It handles case:
> +   __builtin_shufflevector (v1, v2, 0, 9, 2, 11, 4, 13, 6, 15);
> +   __builtin_shufflevector (v1, v2, 8, 1, 2, 11, 4, 13, 6, 15);
> +
> +   An element[i] must be chosen between op0[i] and op1[i] to satisfy the
> +   requirement.
> + */
> +
> +static bool
> +expand_vec_perm_pand_pandn_por (struct expand_vec_perm_d *d)
> +{
> +  rtx rperm[16], vperm;
> +  unsigned int i, nelt = d->nelt;
> +
> +  if (!TARGET_SSE2
> +      || d->one_operand_p
> +      || (d->vmode != V16QImode && d->vmode != V8HImode))
> +    return false;
> +
> +  if (d->perm[0] != 0)
> +    return false;
> +
> +  /* The dest[i] must select an element between op0[i] and op1[i].  */
> +  for (i = 1; i < nelt; i++)
> +    if ((d->perm[i] % nelt) != i)
> +      return false;
> +
> +  if (d->testing_p)
> +     return true;
> +
> +  /* Generate a blend mask for the operators AND and ANDNOT.  */
> +  machine_mode inner_mode = GET_MODE_INNER (d->vmode);
> +  for (i = 0; i < nelt; i++)
> +    rperm[i] = (d->perm[i] <  nelt) ? CONSTM1_RTX (inner_mode)
> +      : CONST0_RTX (inner_mode);
> +
> +  vperm = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (nelt, rperm));
> +  vperm = force_reg (d->vmode, vperm);
> +
> +  ix86_expand_sse_movcc (d->target, vperm, d->op0, d->op1);
> +
> +  return true;
> +}
> +
>  /* Implement permutation with pslldq + psrldq + por when pshufb is not
>     available.  */
>  static bool
> @@ -24162,6 +24209,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>    if (expand_vec_perm_psrlw_psllw_por (d))
>      return true;
>
> +  if (expand_vec_perm_pand_pandn_por (d))
> +    return true;
> +
>    /* Try sequences of four instructions.  */
>
>    if (expand_vec_perm_even_odd_trunc (d))
> diff --git a/gcc/testsuite/gcc.target/i386/pr116675.c b/gcc/testsuite/gcc.target/i386/pr116675.c
> new file mode 100644
> index 00000000000..e463dd8415f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr116675.c
> @@ -0,0 +1,75 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -msse2 -mno-ssse3" } */
> +/* { dg-final { scan-assembler-times "pand" 4 } } */
> +/* { dg-final { scan-assembler-times "pandn" 4 } } */
> +/* { dg-final { scan-assembler-times "por" 4 } } */
> +
> +#include <emmintrin.h>
> +
> +__attribute__((noinline, noclone, target("sse2")))
> +static __v8hi foo1 (__v8hi a, __v8hi b)
> +{
> +  return __builtin_shufflevector (a, b, 0, 9, 2, 11, 4, 13, 6, 15);
> +}
> +
> +__attribute__((noinline, noclone, target("sse2")))
> +static __v8hi foo2 (__v8hi a, __v8hi b)
> +{
> +  return __builtin_shufflevector (a, b, 8, 9, 2, 3, 4, 13, 14, 15);
> +}
> +
> +__attribute__((noinline, noclone, target("sse2")))
> +static __v16qi foo3 (__v16qi a, __v16qi b)
> +{
> +  return __builtin_shufflevector (a, b, 0, 17, 2, 19, 4, 21, 6, 23,
> +                                 8, 25, 10, 27, 12, 29, 14, 31);
> +}
> +
> +__attribute__((noinline, noclone, target("sse2")))
> +static __v16qi foo4 (__v16qi a, __v16qi b)
> +{
> +  return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 21, 6, 23,
> +                                        8, 25, 10, 27,12,29,14,31);
> +}
> +
> +__attribute__((noinline, noclone)) void
> +compare_v8hi (__v8hi a,  __v8hi b)
> +{
> +  for (int i = 0; i < 8; i++)
> +    if (a[i] != b[i])
> +      __builtin_abort ();
> +}
> +
> +__attribute__((noinline, noclone)) void
> +compare_v16qi (__v16qi a,  __v16qi b)
> +{
> +  for (int i = 0; i < 16; i++)
> +    if (a[i] != b[i])
> +      __builtin_abort ();
> +}
> +
> +int main (void)
> +{
> +  __v8hi s1, s2, s3, s4, s5, s6;
> +  __v16qi s7, s8, s9, s10, s11, s12;
> +  s1 = (__v8hi) {0, 1, 2, 3, 4, 5, 6, 7};
> +  s2 = (__v8hi) {8, 9, 10, 11, 12, 13, 14, 15};
> +  s7 = (__v16qi) {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
> +  s8 = (__v16qi) {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
> +
> +  s3  = foo1 (s1, s2);
> +  s4  = foo2 (s1, s2);
> +  s9  = foo3 (s7, s8);
> +  s10 = foo4 (s7, s8);
> +
> +  s5 = (__v8hi) {0, 9, 2, 11, 4, 13, 6, 15};
> +  s6 = (__v8hi) {8, 9, 2, 3, 4, 13, 14, 15};
> +  s11 = (__v16qi) {0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31};
> +  s12 = (__v16qi) {0, 1, 2, 3, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31};
> +
> +  compare_v8hi (s3, s5);
> +  compare_v8hi (s4, s6);
> +  compare_v16qi (s9, s11);
> +  compare_v16qi (s10, s12);
> +  return 0;
> +}
> --
> 2.34.1
>



-- 
BR,
Hongtao

Re: [PATCH] Optimize 128-bit vector permutation with pand, pandn and por.

Reply via email to