On Wed, Nov 20, 2024 at 8:03 PM Cui, Lili <lili....@intel.com> wrote: > > Hi, all > > This patch aims to handle certain vector shuffle operations using pand, pandn > and por more efficiently. > > Bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk? Although it's stage 3, I think this one is low risk, so Ok for trunk. > > Regards, > Lili. > > > This patch introduces a new subroutine in ix86_expand_vec_perm_const_1. > On x86, use mixed constant permutation for V8HImode and V16QImode when > SSE2 is supported. This patch handles certain vector shuffle operations > more efficiently using pand, pandn and por. This change is intended to > improve assembly code generation for configurations that support SSE2. > > gcc/ChangeLog: > > PR target/116675 > * config/i386/i386-expand.cc (expand_vec_perm_pand_pandn_por): > New subroutine. > (ix86_expand_vec_perm_const_1): Call expand_vec_perm_pand_pandn_por. > > gcc/testsuite/ChangeLog: > > PR target/116675 > * gcc.target/i386/pr116675.c: New test. > --- > gcc/config/i386/i386-expand.cc | 50 ++++++++++++++++ > gcc/testsuite/gcc.target/i386/pr116675.c | 75 ++++++++++++++++++++++++ > 2 files changed, 125 insertions(+) > create mode 100644 gcc/testsuite/gcc.target/i386/pr116675.c > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc > index a6e6e738a52..f9fa0281298 100644 > --- a/gcc/config/i386/i386-expand.cc > +++ b/gcc/config/i386/i386-expand.cc > @@ -23103,6 +23103,53 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct > expand_vec_perm_d *d) > return true; > } > > +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement a > + permutation (which is a blend) with and, andnot and or when pshufb is not > available. > + > + It handles case: > + __builtin_shufflevector (v1, v2, 0, 9, 2, 11, 4, 13, 6, 15); > + __builtin_shufflevector (v1, v2, 8, 1, 2, 11, 4, 13, 6, 15); > + > + An element[i] must be chosen between op0[i] and op1[i] to satisfy the > + requirement. 
> + */ > + > +static bool > +expand_vec_perm_pand_pandn_por (struct expand_vec_perm_d *d) > +{ > + rtx rperm[16], vperm; > + unsigned int i, nelt = d->nelt; > + > + if (!TARGET_SSE2 > + || d->one_operand_p > + || (d->vmode != V16QImode && d->vmode != V8HImode)) > + return false; > + > + if (d->perm[0] != 0) > + return false; > + > + /* The dest[i] must select an element between op0[i] and op1[i]. */ > + for (i = 1; i < nelt; i++) > + if ((d->perm[i] % nelt) != i) > + return false; > + > + if (d->testing_p) > + return true; > + > + /* Generates a blend mask for the operators AND and ANDNOT. */ > + machine_mode inner_mode = GET_MODE_INNER (d->vmode); > + for (i = 0; i < nelt; i++) > + rperm[i] = (d->perm[i] < nelt) ? CONSTM1_RTX (inner_mode) > + : CONST0_RTX (inner_mode); > + > + vperm = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (nelt, rperm)); > + vperm = force_reg (d->vmode, vperm); > + > + ix86_expand_sse_movcc (d->target, vperm, d->op0, d->op1); > + > + return true; > +} > + > /* Implement permutation with pslldq + psrldq + por when pshufb is not > available. */ > static bool > @@ -24162,6 +24209,9 @@ ix86_expand_vec_perm_const_1 (struct > expand_vec_perm_d *d) > if (expand_vec_perm_psrlw_psllw_por (d)) > return true; > > + if (expand_vec_perm_pand_pandn_por (d)) > + return true; > + > /* Try sequences of four instructions. 
*/ > > if (expand_vec_perm_even_odd_trunc (d)) > diff --git a/gcc/testsuite/gcc.target/i386/pr116675.c > b/gcc/testsuite/gcc.target/i386/pr116675.c > new file mode 100644 > index 00000000000..e463dd8415f > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr116675.c > @@ -0,0 +1,75 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -msse2 -mno-ssse3" } */ > +/* { dg-final { scan-assembler-times "pand" 4 } } */ > +/* { dg-final { scan-assembler-times "pandn" 4 } } */ > +/* { dg-final { scan-assembler-times "por" 4 } } */ > + > +#include <emmintrin.h> > + > +__attribute__((noinline, noclone, target("sse2"))) > +static __v8hi foo1 (__v8hi a, __v8hi b) > +{ > + return __builtin_shufflevector (a, b, 0, 9, 2, 11, 4, 13, 6, 15); > +} > + > +__attribute__((noinline, noclone, target("sse2"))) > +static __v8hi foo2 (__v8hi a, __v8hi b) > +{ > + return __builtin_shufflevector (a, b, 8, 9, 2, 3, 4, 13, 14, 15); > +} > + > +__attribute__((noinline, noclone, target("sse2"))) > +static __v16qi foo3 (__v16qi a, __v16qi b) > +{ > + return __builtin_shufflevector (a, b, 0, 17, 2, 19, 4, 21, 6, 23, > + 8, 25, 10, 27, 12, 29, 14, 31); > +} > + > +__attribute__((noinline, noclone, target("sse2"))) > +static __v16qi foo4 (__v16qi a, __v16qi b) > +{ > + return __builtin_shufflevector (a, b, 0, 1, 2, 3, 4, 21, 6, 23, > + 8, 25, 10, 27,12,29,14,31); > +} > + > +__attribute__((noinline, noclone)) void > +compare_v8hi (__v8hi a, __v8hi b) > +{ > + for (int i = 0; i < 8; i++) > + if (a[i] != b[i]) > + __builtin_abort (); > +} > + > +__attribute__((noinline, noclone)) void > +compare_v16qi (__v16qi a, __v16qi b) > +{ > + for (int i = 0; i < 16; i++) > + if (a[i] != b[i]) > + __builtin_abort (); > +} > + > +int main (void) > +{ > + __v8hi s1, s2, s3, s4, s5, s6; > + __v16qi s7, s8, s9, s10, s11, s12; > + s1 = (__v8hi) {0, 1, 2, 3, 4, 5, 6, 7}; > + s2 = (__v8hi) {8, 9, 10, 11, 12, 13, 14, 15}; > + s7 = (__v16qi) {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; > + s8 = (__v16qi) {16, 17, 
18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, > 30, 31}; > + > + s3 = foo1 (s1, s2); > + s4 = foo2 (s1, s2); > + s9 = foo3 (s7, s8); > + s10 = foo4 (s7, s8); > + > + s5 = (__v8hi) {0, 9, 2, 11, 4, 13, 6, 15}; > + s6 = (__v8hi) {8, 9, 2, 3, 4, 13, 14, 15}; > + s11 = (__v16qi) {0, 17, 2, 19, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, > 31}; > + s12 = (__v16qi) {0, 1, 2, 3, 4, 21, 6, 23, 8, 25, 10, 27, 12, 29, 14, 31}; > + > + compare_v8hi (s3, s5); > + compare_v8hi (s4, s6); > + compare_v16qi (s9, s11); > + compare_v16qi (s10, s12); > + return 0; > +} > -- > 2.34.1 >
-- BR, Hongtao