Some permutations can be implemented without the costly PSHUFB instruction, e.g.:
{ 8,9,10,11,12,13,14,15, 0,1,2,3,4,5,6,7 } with PALIGNR, { 0,1,2,3, 4,5,6,7, 4,5,6,7, 12,13,14,15 } with PSHUFD, { 0,1, 2,3, 2,3, 6,7, 8,9,10,11,12,13,14,15 } with PSHUFLW and { 0,1,2,3,4,5,6,7, 8,9, 10,11, 10,11, 14,15 } with PSHUFHW. All these instructions take a constant shuffle control mask and do not need to load the shuffle mask from memory into a temporary XMM register. 2021-06-11 Uroš Bizjak <ubiz...@gmail.com> gcc/ PR target/101021 * config/i386/i386-expand.c (expand_vec_perm_pshufb): Return false if the permutation can be implemented with constant permutation instruction in wider mode. (canonicalize_vector_int_perm): Move above expand_vec_perm_pshufb. Handle V8QImode and V4HImode. gcc/testsuite/ PR target/101021 * gcc.target/i386/pr101021-1.c: New test. * gcc.target/i386/pr101021-2.c: Ditto. Bootstrapped and regression tested on x86_64-linux-gnu {,-m32}. Additionally tested with: GCC_TEST_RUN_EXPENSIVE=1 make check-gcc RUNTESTFLAGS='--target_board=unix/-mavx dg-torture.exp=vshuf*.c' Pushed to master. Uros.
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 9ee5257adf9..2fa3a18dc6a 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -17354,6 +17354,59 @@ expand_vec_perm_vpermil (struct expand_vec_perm_d *d) return true; } +/* For V*[QHS]Imode permutations, check if the same permutation + can't be performed in a 2x, 4x or 8x wider inner mode. */ + +static bool +canonicalize_vector_int_perm (const struct expand_vec_perm_d *d, + struct expand_vec_perm_d *nd) +{ + int i; + machine_mode mode = VOIDmode; + + switch (d->vmode) + { + case E_V8QImode: mode = V4HImode; break; + case E_V16QImode: mode = V8HImode; break; + case E_V32QImode: mode = V16HImode; break; + case E_V64QImode: mode = V32HImode; break; + case E_V4HImode: mode = V2SImode; break; + case E_V8HImode: mode = V4SImode; break; + case E_V16HImode: mode = V8SImode; break; + case E_V32HImode: mode = V16SImode; break; + case E_V4SImode: mode = V2DImode; break; + case E_V8SImode: mode = V4DImode; break; + case E_V16SImode: mode = V8DImode; break; + default: return false; + } + for (i = 0; i < d->nelt; i += 2) + if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1) + return false; + nd->vmode = mode; + nd->nelt = d->nelt / 2; + for (i = 0; i < nd->nelt; i++) + nd->perm[i] = d->perm[2 * i] / 2; + if (GET_MODE_INNER (mode) != DImode) + canonicalize_vector_int_perm (nd, nd); + if (nd != d) + { + nd->one_operand_p = d->one_operand_p; + nd->testing_p = d->testing_p; + if (d->op0 == d->op1) + nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0); + else + { + nd->op0 = gen_lowpart (nd->vmode, d->op0); + nd->op1 = gen_lowpart (nd->vmode, d->op1); + } + if (d->testing_p) + nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1); + else + nd->target = gen_reg_rtx (nd->vmode); + } + return true; +} + /* Return true if permutation D can be performed as VMODE permutation instead. 
*/ @@ -17391,6 +17444,7 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) unsigned i, nelt, eltsz, mask; unsigned char perm[64]; machine_mode vmode = V16QImode; + struct expand_vec_perm_d nd; rtx rperm[64], vperm, target, op0, op1; nelt = d->nelt; @@ -17539,6 +17593,10 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) return false; } + /* Try to avoid variable permutation instruction. */ + if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) + return false; + if (d->testing_p) return true; @@ -17617,57 +17675,6 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) return true; } -/* For V*[QHS]Imode permutations, check if the same permutation - can't be performed in a 2x, 4x or 8x wider inner mode. */ - -static bool -canonicalize_vector_int_perm (const struct expand_vec_perm_d *d, - struct expand_vec_perm_d *nd) -{ - int i; - machine_mode mode = VOIDmode; - - switch (d->vmode) - { - case E_V16QImode: mode = V8HImode; break; - case E_V32QImode: mode = V16HImode; break; - case E_V64QImode: mode = V32HImode; break; - case E_V8HImode: mode = V4SImode; break; - case E_V16HImode: mode = V8SImode; break; - case E_V32HImode: mode = V16SImode; break; - case E_V4SImode: mode = V2DImode; break; - case E_V8SImode: mode = V4DImode; break; - case E_V16SImode: mode = V8DImode; break; - default: return false; - } - for (i = 0; i < d->nelt; i += 2) - if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1) - return false; - nd->vmode = mode; - nd->nelt = d->nelt / 2; - for (i = 0; i < nd->nelt; i++) - nd->perm[i] = d->perm[2 * i] / 2; - if (GET_MODE_INNER (mode) != DImode) - canonicalize_vector_int_perm (nd, nd); - if (nd != d) - { - nd->one_operand_p = d->one_operand_p; - nd->testing_p = d->testing_p; - if (d->op0 == d->op1) - nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0); - else - { - nd->op0 = gen_lowpart (nd->vmode, d->op0); - nd->op1 = gen_lowpart (nd->vmode, d->op1); - } - if (d->testing_p) - nd->target = gen_raw_REG (nd->vmode, 
LAST_VIRTUAL_REGISTER + 1); - else - nd->target = gen_reg_rtx (nd->vmode); - } - return true; -} - /* Try to expand one-operand permutation with constant mask. */ static bool diff --git a/gcc/testsuite/gcc.target/i386/pr101021-1.c b/gcc/testsuite/gcc.target/i386/pr101021-1.c new file mode 100644 index 00000000000..f4649c00338 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101021-1.c @@ -0,0 +1,35 @@ +/* PR target/101021 */ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-not "vpshufb" } } */ + +typedef char S; +typedef S V __attribute__((vector_size(16 * sizeof(S)))); + +V t1 (V x) +{ + return __builtin_shuffle (x, (V) { 8,9,10,11,12,13,14,15, 0,1,2,3,4,5,6,7 }); +} + +/* { dg-final { scan-assembler "vpalignr" } } */ + +V t2 (V x) +{ + return __builtin_shuffle (x, (V) { 0,1,2,3, 4,5,6,7, 4,5,6,7, 12,13,14,15 }); +} + +/* { dg-final { scan-assembler "vpshufd" } } */ + +V t3 (V x) +{ + return __builtin_shuffle (x, (V) { 0,1, 2,3, 2,3, 6,7, 8,9,10,11,12,13,14,15 }); +} + +/* { dg-final { scan-assembler "vpshuflw" } } */ + +V t4 (V x) +{ + return __builtin_shuffle (x, (V) { 0,1,2,3,4,5,6,7, 8,9, 10,11, 10,11, 14,15 }); +} + +/* { dg-final { scan-assembler "vpshufhw" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr101021-2.c b/gcc/testsuite/gcc.target/i386/pr101021-2.c new file mode 100644 index 00000000000..1e046f7d990 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101021-2.c @@ -0,0 +1,21 @@ +/* PR target/101021 */ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2 -mavx" } */ +/* { dg-final { scan-assembler-not "vpshufb" } } */ + +typedef char S; +typedef S V __attribute__((vector_size(8 * sizeof(S)))); + +V t1 (V x) +{ + return __builtin_shuffle (x, (V) { 4,5,6,7, 0,1,2,3 }); +} + +/* { dg-final { scan-assembler "vpshufd" } } */ + +V t2 (V x) +{ + return __builtin_shuffle (x, (V) { 0,1, 2,3, 2,3, 6,7 }); +} + +/* { dg-final { scan-assembler "vpshuflw" } } */