On Wed, Oct 01, 2014 at 04:12:15PM +0200, Jakub Jelinek wrote:
> For PR62128, IMHO the right fix is attached. Note, this is again covered
> in vshuf-*.c tests (test 22 in both vshuf-v32*.c and vshuf-v16*.c).
> With that attached patch, pr62128.c (aka test_22 in vshuf-v32qi.c), changes:
> - vpshufb .LC0(%rip), %ymm0, %ymm1
> - vpshufb .LC1(%rip), %ymm0, %ymm0
> - vpermq $78, %ymm1, %ymm1
> - vpor %ymm1, %ymm0, %ymm0
> + vpermq $78, %ymm0, %ymm1
> + vpalignr $1, %ymm0, %ymm1, %ymm0
> ret
>
> 2014-10-01 Jakub Jelinek <[email protected]>
>
> PR target/62128
> * config/i386/i386.c (expand_vec_perm_1): Try expand_vec_perm_palignr
> if it expands to a single insn only.
> (expand_vec_perm_palignr): Add SINGLE_INSN_ONLY_P argument. If true,
> fail unless in_order is true. Add forward declaration.
> (expand_vec_perm_vperm2f128): Fix up comment about which permutation
> is useful for one_operand_p.
> (ix86_expand_vec_perm_const_1): Adjust expand_vec_perm_palignr caller.
Now bootstrapped/regtested on x86_64-linux and i686-linux (and additionally
tested also with --target_board=unix/-mavx2), ok for trunk?
> --- gcc/config/i386/i386.c.jj 2014-10-01 15:42:28.000000000 +0200
> +++ gcc/config/i386/i386.c 2014-10-01 15:50:06.234079887 +0200
> @@ -39636,6 +39636,7 @@ struct expand_vec_perm_d
> static bool canonicalize_perm (struct expand_vec_perm_d *d);
> static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
> static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
> +static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
>
> /* Get a vector mode of the same size as the original but with elements
> twice as wide. This is only guaranteed to apply to integral vectors. */
> @@ -43225,6 +43226,10 @@ expand_vec_perm_1 (struct expand_vec_per
> if (expand_vec_perm_pshufb (d))
> return true;
>
> + /* Try the AVX2 vpalignr instruction. */
> + if (expand_vec_perm_palignr (d, true))
> + return true;
> +
> /* Try the AVX512F vpermi2 instructions. */
> rtx vec[64];
> enum machine_mode mode = d->vmode;
> @@ -43286,10 +43291,11 @@ expand_vec_perm_pshuflw_pshufhw (struct
> the permutation using the SSSE3 palignr instruction. This succeeds
> when all of the elements in PERM fit within one vector and we merely
> need to shift them down so that a single vector permutation has a
> - chance to succeed. */
> + chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
> + the vpalignr instruction itself can perform the requested permutation. */
>
> static bool
> -expand_vec_perm_palignr (struct expand_vec_perm_d *d)
> +expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool
> single_insn_only_p)
> {
> unsigned i, nelt = d->nelt;
> unsigned min, max;
> @@ -43320,8 +43326,9 @@ expand_vec_perm_palignr (struct expand_v
>
> /* Given that we have SSSE3, we know we'll be able to implement the
> single operand permutation after the palignr with pshufb for
> - 128-bit vectors. */
> - if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16)
> + 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
> + first. */
> + if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
> return true;
>
> dcopy = *d;
> @@ -43342,6 +43349,9 @@ expand_vec_perm_palignr (struct expand_v
> }
> dcopy.one_operand_p = true;
>
> + if (single_insn_only_p && !in_order)
> + return false;
> +
> /* For AVX2, test whether we can permute the result in one instruction. */
> if (d->testing_p)
> {
> @@ -43922,7 +43932,8 @@ expand_vec_perm_vperm2f128 (struct expan
> return true;
> }
>
> - /* For one operand, the only useful vperm2f128 permutation is 0x10. */
> + /* For one operand, the only useful vperm2f128 permutation is 0x01
> + aka lanes swap. */
> if (d->one_operand_p)
> return false;
> }
> @@ -44811,7 +44822,7 @@ ix86_expand_vec_perm_const_1 (struct exp
> if (expand_vec_perm_pshuflw_pshufhw (d))
> return true;
>
> - if (expand_vec_perm_palignr (d))
> + if (expand_vec_perm_palignr (d, false))
> return true;
>
> if (expand_vec_perm_interleave2 (d))
Jakub