On Wed, Oct 1, 2014 at 2:56 PM, Jakub Jelinek <ja...@redhat.com> wrote:
> And now the expand_vec_perm_palignr improvement, tested > with GCC_TEST_RUN_EXPENSIVE=1 make check-gcc \ > RUNTESTFLAGS='--target_board=unix/-mavx2 dg-torture.exp=vshuf*.c' > E.g. > typedef unsigned long long V __attribute__ ((vector_size (32))); > extern void abort (void); > V a, b, c, d; > void test_14 (void) > { > V mask = { 6, 1, 3, 4 }; > int i; > c = __builtin_shuffle (a, mask); > d = __builtin_shuffle (a, b, mask); > } > (distilled from test 15 in vshuf-v4di.c) results in: > - vmovdqa a(%rip), %ymm0 > - vpermq $54, %ymm0, %ymm1 > - vpshufb .LC1(%rip), %ymm0, %ymm0 > - vmovdqa %ymm1, c(%rip) > - vmovdqa b(%rip), %ymm1 > - vpshufb .LC0(%rip), %ymm1, %ymm1 > - vpermq $78, %ymm1, %ymm1 > - vpor %ymm1, %ymm0, %ymm0 > + vmovdqa a(%rip), %ymm1 > + vpermq $54, %ymm1, %ymm0 > + vmovdqa %ymm0, c(%rip) > + vmovdqa b(%rip), %ymm0 > + vpalignr $8, %ymm1, %ymm0, %ymm0 > + vpermq $99, %ymm0, %ymm0 > vmovdqa %ymm0, d(%rip) > vzeroupper > ret > change (and two fewer .rodata constants). > > Ok for trunk? > > 2014-10-01 Jakub Jelinek <ja...@redhat.com> > > * config/i386/i386.c (expand_vec_perm_palignr): Handle > 256-bit vectors for TARGET_AVX2. Please mention PR 62128 and include the testcase from the PR. Also, please add a version of gcc.target/i386/pr52252-atom.c, compiled with -mavx2 (perhaps named pr52252-avx2.c). OK with a small adjustment below. Thanks, Uros. > --- gcc/config/i386/i386.c.jj 2014-10-01 14:24:16.483138899 +0200 > +++ gcc/config/i386/i386.c 2014-10-01 14:27:53.577222011 +0200 > @@ -43297,44 +43297,75 @@ expand_vec_perm_palignr (struct expand_v > rtx shift, target; > struct expand_vec_perm_d dcopy; > > - /* Even with AVX, palignr only operates on 128-bit vectors. */ > - if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) > + /* Even with AVX, palignr only operates on 128-bit vectors, > + in AVX2 palignr operates on both 128-bit lanes. 
*/ > + if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) > + && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32)) Please simplify the above condition ... > return false; > > - min = nelt, max = 0; > + min = 2 * nelt, max = 0; > for (i = 0; i < nelt; ++i) > { > unsigned e = d->perm[i]; > + if (GET_MODE_SIZE (d->vmode) == 32) > + e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1); > if (e < min) > min = e; > if (e > max) > max = e; > } > - if (min == 0 || max - min >= nelt) > + if (min == 0 > + || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt)) > return false; > > /* Given that we have SSSE3, we know we'll be able to implement the > - single operand permutation after the palignr with pshufb. */ > - if (d->testing_p) > + single operand permutation after the palignr with pshufb for > + 128-bit vectors. */ > + if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16) > return true; > > dcopy = *d; > - shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode))); > - target = gen_reg_rtx (TImode); > - emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1), > - gen_lowpart (TImode, d->op0), shift)); > - > - dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target); > - dcopy.one_operand_p = true; > > in_order = true; > for (i = 0; i < nelt; ++i) > { > - unsigned e = dcopy.perm[i] - min; > + unsigned e = dcopy.perm[i]; > + if (GET_MODE_SIZE (d->vmode) == 32 > + && e >= nelt > + && (e & (nelt / 2 - 1)) < min) > + e = e - min - (nelt / 2); > + else > + e = e - min; > if (e != i) > in_order = false; > dcopy.perm[i] = e; > } > + dcopy.one_operand_p = true; > + > + /* For AVX2, test whether we can permute the result in one instruction. 
*/ > + if (d->testing_p) > + { > + if (in_order) > + return true; > + dcopy.op1 = dcopy.op0; > + return expand_vec_perm_1 (&dcopy); > + } > + > + shift = GEN_INT (min * GET_MODE_BITSIZE (GET_MODE_INNER (d->vmode))); > + if (GET_MODE_SIZE (d->vmode) == 16) > + { > + target = gen_reg_rtx (TImode); > + emit_insn (gen_ssse3_palignrti (target, gen_lowpart (TImode, d->op1), > + gen_lowpart (TImode, d->op0), shift)); > + } > + else > + { > + target = gen_reg_rtx (V2TImode); > + emit_insn (gen_avx2_palignrv2ti (target, gen_lowpart (V2TImode, > d->op1), > + gen_lowpart (V2TImode, d->op0), > shift)); > + } > + > + dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target); > > /* Test for the degenerate case where the alignment by itself > produces the desired permutation. */ > @@ -43345,7 +43376,7 @@ expand_vec_perm_palignr (struct expand_v > } > > ok = expand_vec_perm_1 (&dcopy); > - gcc_assert (ok); > + gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32); > > return ok; > } > > > Jakub