On Thu, Aug 29, 2019 at 10:41 AM Jakub Jelinek <ja...@redhat.com> wrote: > > Hi! > > The following patch improves especially V8SFmode permutations for > AVX (non-AVX2) ISA, where we punted way too often, even when we can handle > it. > On the > typedef float __v8sf __attribute__((vector_size (32))); > typedef double __v4df __attribute__((vector_size (32))); > typedef int __v8si __attribute__((vector_size (32))); > typedef long long __v4di __attribute__((vector_size (32))); > #ifdef __clang__ > #define S(x, y, t, ...) __builtin_shufflevector (x, y, __VA_ARGS__) > #else > #define S(x, y, t, ...) __builtin_shuffle (x, y, (t) { __VA_ARGS__ }) > #endif > > __v8sf f1 (__v8sf x, __v8sf y) { return S (x, y, __v8si, 0, 8, 9, 10, 11, 12, > 13, 14 ); } > __v8sf f2 (__v8sf x, __v8sf y) { return S (x, y, __v8si, 0, 1, 8, 9, 10, 11, > 12, 13 ); } > testcase we used to emit terrible code (8 BIT_FIELD_REFs + composition > back), while LLVM emits: > vpermilps $144, %xmm1, %xmm2 # xmm2 = xmm1[0,0,1,2] > vextractf128 $1, %ymm1, %xmm3 > vblendps $8, %xmm1, %xmm3, %xmm1 # xmm1 = xmm3[0,1,2],xmm1[3] > vpermilps $147, %xmm1, %xmm1 # xmm1 = xmm1[3,0,1,2] > vinsertf128 $1, %xmm1, %ymm2, %ymm1 > vblendps $1, %ymm0, %ymm1, %ymm0 # ymm0 = > ymm0[0],ymm1[1,2,3,4,5,6,7] > and > vextractf128 $1, %ymm1, %xmm2 > vshufpd $1, %xmm2, %xmm1, %xmm2 # xmm2 = xmm1[1],xmm2[0] > vmovddup %xmm1, %xmm1 # xmm1 = xmm1[0,0] > vinsertf128 $1, %xmm2, %ymm1, %ymm1 > vblendps $3, %ymm0, %ymm1, %ymm0 # ymm0 = > ymm0[0,1],ymm1[2,3,4,5,6,7] > With the patch we emit: > vpermilps $144, %ymm1, %ymm2 > vpermilps .LC0(%rip), %ymm1, %ymm1 > vblendps $238, %ymm2, %ymm0, %ymm0 > vperm2f128 $1, %ymm1, %ymm1, %ymm1 > vblendps $16, %ymm1, %ymm0, %ymm0 > and > vshufps $68, %ymm1, %ymm0, %ymm0 > vpermilps .LC1(%rip), %ymm1, %ymm1 > vperm2f128 $1, %ymm1, %ymm1, %ymm1 > vblendps $48, %ymm1, %ymm0, %ymm0 > so one insn each shorter than what LLVM emits. > > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? > > 2019-08-29 Jakub Jelinek <ja...@redhat.com> > > PR target/91560 > * config/i386/i386-expand.c (expand_vec_perm_movs, > expand_vec_perm_blend, expand_vec_perm_vpermil, > expand_vec_perm_pshufb, expand_vec_perm_1, > expand_vec_perm_pshuflw_pshufhw, expand_vec_perm_palignr, > expand_vec_perm_interleave2, expand_vec_perm_vpermq_perm_1, > expand_vec_perm_vperm2f128, expand_vec_perm_interleave3, > expand_vec_perm_vperm2f128_vblend, expand_vec_perm_2vperm2f128_vshuf, > expand_vec_perm_even_odd, expand_vec_perm_broadcast): Adjust function > comments - replace ix86_expand_vec_perm_builtin_1 with > ix86_expand_vec_perm_const_1. > (expand_vec_perm2_vperm2f128_vblend): New function. > (ix86_expand_vec_perm_const_1): New forward declaration. Call > expand_vec_perm2_vperm2f128_vblend as last resort. > (canonicalize_perm): Formatting fix. > > * gcc.dg/torture/vshuf-8.inc: Add two further permutations.
LGTM, but actually your area ;) Thanks, Uros. > --- gcc/config/i386/i386-expand.c.jj 2019-08-27 12:26:25.383089132 +0200 > +++ gcc/config/i386/i386-expand.c 2019-08-28 15:22:43.911004586 +0200 > @@ -16372,7 +16372,7 @@ expand_vselect_vconcat (rtx target, rtx > return ok; > } > > -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D > +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D > using movss or movsd. */ > static bool > expand_vec_perm_movs (struct expand_vec_perm_d *d) > @@ -16408,7 +16408,7 @@ expand_vec_perm_movs (struct expand_vec_ > return true; > } > > -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D > +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D > in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */ > > static bool > @@ -16633,7 +16633,7 @@ expand_vec_perm_blend (struct expand_vec > return true; > } > > -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D > +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D > in terms of the variable form of vpermilps. > > Note that we will have already failed the immediate input vpermilps, > @@ -16709,7 +16709,7 @@ valid_perm_using_mode_p (machine_mode vm > return true; > } > > -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D > +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D > in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128. */ > > static bool > @@ -17026,7 +17026,7 @@ ix86_expand_vec_one_operand_perm_avx512 > > static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool); > > -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to instantiate D > +/* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D > in a single instruction. */ > > static bool > @@ -17216,7 +17216,7 @@ expand_vec_perm_1 (struct expand_vec_per > return false; > } > > -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D > +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D > in terms of a pair of pshuflw + pshufhw instructions. */ > > static bool > @@ -17257,7 +17257,7 @@ expand_vec_perm_pshuflw_pshufhw (struct > return true; > } > > -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify > +/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify > the permutation using the SSSE3 palignr instruction. This succeeds > when all of the elements in PERM fit within one vector and we merely > need to shift them down so that a single vector permutation has a > @@ -17474,7 +17474,7 @@ expand_vec_perm_pblendv (struct expand_v > > static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d); > > -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify > +/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify > a two vector permutation into a single vector permutation by using > an interleave operation to merge the vectors. */ > > @@ -17752,7 +17752,7 @@ expand_vec_perm_interleave2 (struct expa > return true; > } > > -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify > +/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify > a single vector cross-lane permutation into vpermq followed > by any of the single insn permutations. */ > > @@ -17833,7 +17833,7 @@ expand_vec_perm_vpermq_perm_1 (struct ex > > static bool canonicalize_perm (struct expand_vec_perm_d *d); > > -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to expand > +/* A subroutine of ix86_expand_vec_perm_const_1. Try to expand > a vector permutation using two instructions, vperm2f128 resp. > vperm2i128 followed by any single in-lane permutation. */ > > @@ -17950,7 +17950,7 @@ expand_vec_perm_vperm2f128 (struct expan > return false; > } > > -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to simplify > +/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify > a two vector permutation using 2 intra-lane interleave insns > and cross-lane shuffle for 32-byte vectors. */ > > @@ -18026,7 +18026,7 @@ expand_vec_perm_interleave3 (struct expa > return true; > } > > -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement > +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement > a single vector permutation using a single intra-lane vector > permutation, vperm2f128 swapping the lanes and vblend* insn blending > the non-swapped and swapped vectors together. */ > @@ -18094,7 +18094,7 @@ expand_vec_perm_vperm2f128_vblend (struc > return true; > } > > -/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement a V4DF > +/* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF > permutation using two vperm2f128, followed by a vshufpd insn blending > the two vectors together. */ > > @@ -18145,6 +18145,106 @@ expand_vec_perm_2vperm2f128_vshuf (struc > return true; > } > > +static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *); > + > +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement > + a two vector permutation using two intra-lane vector > + permutations, vperm2f128 swapping the lanes and vblend* insn blending > + the non-swapped and swapped vectors together. */ > + > +static bool > +expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d) > +{ > + struct expand_vec_perm_d dfirst, dsecond, dthird; > + unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = > 0; > + rtx_insn *seq1, *seq2; > + bool ok; > + rtx (*blend) (rtx, rtx, rtx, rtx) = NULL; > + > + if (!TARGET_AVX > + || TARGET_AVX2 > + || (d->vmode != V8SFmode && d->vmode != V4DFmode) > + || d->one_operand_p) > + return false; > + > + dfirst = *d; > + dsecond = *d; > + for (i = 0; i < nelt; i++) > + { > + dfirst.perm[i] = 0xff; > + dsecond.perm[i] = 0xff; > + } > + for (i = 0, msk = 0; i < nelt; i++) > + { > + j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2; > + if (j == i) > + { > + dfirst.perm[j] = d->perm[i]; > + which1 |= (d->perm[i] < nelt ? 1 : 2); > + } > + else > + { > + dsecond.perm[j] = d->perm[i]; > + which2 |= (d->perm[i] < nelt ? 1 : 2); > + msk |= (1U << i); > + } > + } > + if (msk == 0 || msk == (1U << nelt) - 1) > + return false; > + > + if (!d->testing_p) > + { > + dfirst.target = gen_reg_rtx (dfirst.vmode); > + dsecond.target = gen_reg_rtx (dsecond.vmode); > + } > + > + for (i = 0; i < nelt; i++) > + { > + if (dfirst.perm[i] == 0xff) > + dfirst.perm[i] = (which1 == 2 ? i + nelt : i); > + if (dsecond.perm[i] == 0xff) > + dsecond.perm[i] = (which2 == 2 ? i + nelt : i); > + } > + canonicalize_perm (&dfirst); > + start_sequence (); > + ok = ix86_expand_vec_perm_const_1 (&dfirst); > + seq1 = get_insns (); > + end_sequence (); > + > + if (!ok) > + return false; > + > + canonicalize_perm (&dsecond); > + start_sequence (); > + ok = ix86_expand_vec_perm_const_1 (&dsecond); > + seq2 = get_insns (); > + end_sequence (); > + > + if (!ok) > + return false; > + > + if (d->testing_p) > + return true; > + > + emit_insn (seq1); > + emit_insn (seq2); > + > + dthird = *d; > + dthird.op0 = dsecond.target; > + dthird.op1 = dsecond.target; > + dthird.one_operand_p = true; > + dthird.target = gen_reg_rtx (dthird.vmode); > + for (i = 0; i < nelt; i++) > + dthird.perm[i] = i ^ nelt2; > + > + ok = expand_vec_perm_1 (&dthird); > + gcc_assert (ok); > + > + blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256; > + emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk))); > + return true; > +} > + > /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word > permutation with two pshufb insns and an ior. We should have already > failed all two instruction sequences. */ > @@ -18534,7 +18634,7 @@ expand_vec_perm_even_odd_trunc (struct e > return true; > } > > -/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even > +/* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even > and extract-odd permutations. */ > > static bool > @@ -18743,7 +18843,7 @@ expand_vec_perm_even_odd_1 (struct expan > return true; > } > > -/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match > +/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match > extract-even and extract-odd permutations. */ > > static bool > @@ -18762,7 +18862,7 @@ expand_vec_perm_even_odd (struct expand_ > return expand_vec_perm_even_odd_1 (d, odd); > } > > -/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement broadcast > +/* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast > permutations. We assume that expand_vec_perm_1 has already failed. */ > > static bool > @@ -18841,7 +18941,7 @@ expand_vec_perm_broadcast_1 (struct expa > } > } > > -/* A subroutine of ix86_expand_vec_perm_builtin_1. Pattern match > +/* A subroutine of ix86_expand_vec_perm_const_1. Pattern match > broadcast permutations. */ > > static bool > @@ -19137,6 +19237,10 @@ ix86_expand_vec_perm_const_1 (struct exp > return true; > } > > + /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */ > + if (expand_vec_perm2_vperm2f128_vblend (d)) > + return true; > + > return false; > } > > @@ -19149,7 +19253,7 @@ canonicalize_perm (struct expand_vec_per > int i, which, nelt = d->nelt; > > for (i = which = 0; i < nelt; ++i) > - which |= (d->perm[i] < nelt ? 1 : 2); > + which |= (d->perm[i] < nelt ? 1 : 2); > > d->one_operand_p = true; > switch (which) > --- gcc/testsuite/gcc.dg/torture/vshuf-8.inc.jj 2015-12-04 09:24:31.234396066 > +0100 > +++ gcc/testsuite/gcc.dg/torture/vshuf-8.inc 2019-08-28 15:11:35.778754247 > +0200 > @@ -25,7 +25,9 @@ T (21, 4, 12, 5, 13, 6, 14, 7, 15) \ > T (22, 1, 2, 3, 4, 5, 6, 7, 0) \ > T (23, 6, 5, 4, 3, 2, 1, 0, 7) \ > T (24, 0, 1, 2, 3, 8, 9, 10, 11) \ > -T (25, 0, 1, 2, 3, 12, 13, 14, 15) > +T (25, 0, 1, 2, 3, 12, 13, 14, 15) \ > +T (26, 0, 1, 8, 9, 10, 11, 12, 13) \ > +T (27, 0, 8, 9, 10, 11, 12, 13, 14) > #define EXPTESTS \ > T (116, 9, 3, 9, 4, 7, 0, 0, 6) \ > T (117, 4, 14, 12, 8, 9, 6, 0, 10) \ > > Jakub