On Wed, Oct 01, 2014 at 01:45:54PM +0200, Uros Bizjak wrote:
> OK.
Thanks.  The second step is a tiny optimization.  For the
simplified 122 (now 24) vshuf-v4di.c testcase:
/* V is a 32-byte vector of unsigned long long (GCC vector extension);
   the loop below treats it as having four elements.  */
typedef unsigned long long V __attribute__ ((vector_size (32)));
/* a and b are the shuffle inputs, c is the shuffle result computed from
   them, and d is the reference result computed from constant vectors.  */
V a, b, c, d;
/* Reproducer: shuffle the globals a and b with the constant mask
   { 2, 5, 6, 3 } and check the result against the same shuffle applied
   to constant vectors.  Aborts on mismatch, returns 0 otherwise.  */
int
main ()
{
int i;
/* Fill a with { 2, 3, 4, 5 } and b with { 6, 7, 8, 9 }.  */
for (i = 0; i < 4; ++i)
{
a[i] = i + 2;
b[i] = 4 + i + 2;
}
/* Empty asm with a "memory" clobber: a compiler barrier, so the
   compiler may not assume it still knows the contents of a and b when
   the shuffle below reads them.  */
asm volatile ("" : : : "memory");
/* Two-operand shuffle: mask indices 0-3 pick from a, 4-7 pick from b,
   so c becomes { a[2], b[1], b[2], a[3] }.  */
c = __builtin_shuffle (a, b, (V) { 2, 5, 6, 3 });
/* The same shuffle on constant vectors equal to the values stored in
   a and b above; this gives the expected result { 4, 7, 8, 5 }.  */
d = __builtin_shuffle ((V) { 2, 3, 4, 5 }, (V) { 6, 7, 8, 9 }, (V) { 2, 5, 6,
3 });
/* Byte-wise comparison of the two results; any difference is a failure.  */
if (__builtin_memcmp (&c, &d, sizeof (c)))
__builtin_abort ();
return 0;
}
this patch allows better code to be generated:
- vmovdqa b(%rip), %ymm0
+ vpermq $238, a(%rip), %ymm1
movl $32, %edx
- movl $d, %esi
- vmovdqa a(%rip), %ymm1
+ vmovdqa b(%rip), %ymm0
+ movl $d, %esi
movl $c, %edi
- vperm2i128 $17, %ymm0, %ymm1, %ymm1
vpblendd $195, %ymm1, %ymm0, %ymm0
vmovdqa %ymm0, c(%rip)
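That is because vperm2i128 $17 unnecessarily uses two operands
even though all the data it selects comes from a single operand
(immediate $17 picks the high 128-bit lane of the first source for
both halves of the result).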
So, by canonicalizing the permutation, we can emit
vpermq $238 instead.  Other places might also benefit from
extra canonicalize_perm calls (two spots already call it, in addition
to the single call at the expansion/testing entry point).
Tested again with
GCC_TEST_RUN_EXPENSIVE=1 make check-gcc \
RUNTESTFLAGS='--target_board=unix/-mavx2 dg-torture.exp=vshuf*.c'
on x86_64-linux. Ok for trunk?
2014-10-01 Jakub Jelinek <[email protected]>
* config/i386/i386.c (expand_vec_perm_vperm2f128): Canonicalize
dfirst permutation.
--- gcc/config/i386/i386.c.jj 2014-10-01 13:00:30.000000000 +0200
+++ gcc/config/i386/i386.c 2014-10-01 13:59:40.061956852 +0200
@@ -43905,15 +43905,16 @@ expand_vec_perm_vperm2f128 (struct expan
dfirst.perm[i] = (i & (nelt2 - 1))
+ ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
+ canonicalize_perm (&dfirst);
ok = expand_vec_perm_1 (&dfirst);
gcc_assert (ok);
/* And dsecond is some single insn shuffle, taking
d->op0 and result of vperm2f128 (if perm < 16) or
d->op1 and result of vperm2f128 (otherwise). */
- dsecond.op1 = dfirst.target;
if (perm >= 16)
- dsecond.op0 = dfirst.op1;
+ dsecond.op0 = dsecond.op1;
+ dsecond.op1 = dfirst.target;
ok = expand_vec_perm_1 (&dsecond);
gcc_assert (ok);
Jakub