On Wed, Oct 01, 2014 at 02:25:01PM +0200, Uros Bizjak wrote:
> OK.

And now the expand_vec_perm_palignr improvement, tested with
GCC_TEST_RUN_EXPENSIVE=1 make check-gcc \
  RUNTESTFLAGS='--target_board=unix/-mavx2 dg-torture.exp=vshuf*.c'

E.g.

typedef unsigned long long V __attribute__ ((vector_size (32)));
extern void abort (void);
V a, b, c, d;

void
test_14 (void)
{
  V mask = { 6, 1, 3, 4 };
  int i;
  c = __builtin_shuffle (a, mask);
  d = __builtin_shuffle (a, b, mask);
}

(distilled from test 15 in vshuf-v4di.c) results in:

-	vmovdqa	a(%rip), %ymm0
-	vpermq	$54, %ymm0, %ymm1
-	vpshufb	.LC1(%rip), %ymm0, %ymm0
-	vmovdqa	%ymm1, c(%rip)
-	vmovdqa	b(%rip), %ymm1
-	vpshufb	.LC0(%rip), %ymm1, %ymm1
-	vpermq	$78, %ymm1, %ymm1
-	vpor	%ymm1, %ymm0, %ymm0
+	vmovdqa	a(%rip), %ymm1
+	vpermq	$54, %ymm1, %ymm0
+	vmovdqa	%ymm0, c(%rip)
+	vmovdqa	b(%rip), %ymm0
+	vpalignr	$8, %ymm1, %ymm0, %ymm0
+	vpermq	$99, %ymm0, %ymm0
 	vmovdqa	%ymm0, d(%rip)
 	vzeroupper
 	ret

change (and two fewer .rodata constants).
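For illustration only (this is not part of the patch): the following
standalone program does the same per-128-bit-lane index folding the patch
performs for 256-bit modes, applied to the { 6, 1, 3, 4 } mask above.

/* Standalone sketch, not GCC code: fold the cross-operand indices of the
   V4DI mask { 6, 1, 3, 4 } (0-3 pick from a, 4-7 from b) into per-128-bit
   lane indices, the way the patch does for GET_MODE_SIZE (d->vmode) == 32,
   and compute min/max.  */
#include <stdio.h>

int
main (void)
{
  unsigned nelt = 4, perm[4] = { 6, 1, 3, 4 };
  unsigned i, min = 2 * nelt, max = 0;
  for (i = 0; i < nelt; ++i)
    {
      /* Keep the position within the lane, move the "which operand"
	 bit down next to it.  */
      unsigned e = (perm[i] & (nelt / 2 - 1)) | ((perm[i] & nelt) >> 1);
      if (e < min)
	min = e;
      if (e > max)
	max = e;
    }
  printf ("min=%u max=%u\n", min, max);	/* Prints min=1 max=2.  */
  return 0;
}

Because min != 0 and max - min < nelt / 2, the palignr path applies: the
shift is one 64-bit element in each lane (the vpalignr $8 above), and the
leftover permutation is single operand, so expand_vec_perm_1 emits it as
the single vpermq $99.  The one operand c = __builtin_shuffle (a, mask) is
unaffected and stays a single vpermq $54.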
Ok for trunk?

2014-10-01  Jakub Jelinek  <ja...@redhat.com>

	* config/i386/i386.c (expand_vec_perm_palignr): Handle
	256-bit vectors for TARGET_AVX2.

--- gcc/config/i386/i386.c.jj	2014-10-01 14:24:16.483138899 +0200
+++ gcc/config/i386/i386.c	2014-10-01 14:27:53.577222011 +0200
@@ -43297,44 +43297,75 @@ expand_vec_perm_palignr (struct expand_v
   rtx shift, target;
   struct expand_vec_perm_d dcopy;
 
-  /* Even with AVX, palignr only operates on 128-bit vectors.  */
-  if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
+  /* Even with AVX, palignr only operates on 128-bit vectors,
+     in AVX2 palignr operates on both 128-bit lanes.  */
+  if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
+      && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
     return false;
 
-  min = nelt, max = 0;
+  min = 2 * nelt, max = 0;
   for (i = 0; i < nelt; ++i)
     {
       unsigned e = d->perm[i];
+      if (GET_MODE_SIZE (d->vmode) == 32)
+	e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
       if (e < min)
 	min = e;
       if (e > max)
 	max = e;
     }
-  if (min == 0 || max - min >= nelt)
+  if (min == 0
+      || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
     return false;
 
   /* Given that we have SSSE3, we know we'll be able to implement the
-     single operand permutation after the palignr with pshufb.  */
-  if (d->testing_p)
+     single operand permutation after the palignr with pshufb for
+     128-bit vectors.  */
+  if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16)
     return true;
 
   dcopy = *d;
-  shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
-  target = gen_reg_rtx (TImode);
-  emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
-				  gen_lowpart (TImode, d->op0), shift));
-
-  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
-  dcopy.one_operand_p = true;
 
   in_order = true;
   for (i = 0; i < nelt; ++i)
     {
-      unsigned e = dcopy.perm[i] - min;
+      unsigned e = dcopy.perm[i];
+      if (GET_MODE_SIZE (d->vmode) == 32
+	  && e >= nelt
+	  && (e & (nelt / 2 - 1)) < min)
+	e = e - min - (nelt / 2);
+      else
+	e = e - min;
       if (e != i)
 	in_order = false;
       dcopy.perm[i] = e;
     }
+  dcopy.one_operand_p = true;
+
+  /* For AVX2, test whether we can permute the result in one instruction.  */
+  if (d->testing_p)
+    {
+      if (in_order)
+	return true;
+      dcopy.op1 = dcopy.op0;
+      return expand_vec_perm_1 (&dcopy);
+    }
+
+  shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode)));
+  if (GET_MODE_SIZE (d->vmode) == 16)
+    {
+      target = gen_reg_rtx (TImode);
+      emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1),
+				      gen_lowpart (TImode, d->op0), shift));
+    }
+  else
+    {
+      target = gen_reg_rtx (V2TImode);
+      emit_insn (gen_avx2_palignrv2ti (target, gen_lowpart (V2TImode, d->op1),
+				       gen_lowpart (V2TImode, d->op0), shift));
+    }
+
+  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
 
   /* Test for the degenerate case where the alignment by itself
      produces the desired permutation.  */
@@ -43345,7 +43376,7 @@ expand_vec_perm_palignr (struct expand_v
     }
 
   ok = expand_vec_perm_1 (&dcopy);
-  gcc_assert (ok);
+  gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
 
   return ok;
 }

	Jakub