Thank you.
Patch with proposed fixes:
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 085eb54..09c0057 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -48322,6 +48322,120 @@ expand_vec_perm_vpshufb2_vpermq_even_odd
(struct expand_vec_perm_d *d)
return true;
}
+/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
+ and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
+ with two "and" and "pack" or two "shift" and "pack" insns. We should
+ have already failed all two instruction sequences. */
+
+static bool
+expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
+{
+ rtx op, dop0, dop1, t, rperm[16];
+ unsigned i, odd, c, s, nelt = d->nelt;
+ bool end_perm = false;
+ machine_mode half_mode;
+ rtx (*gen_and) (rtx, rtx, rtx);
+ rtx (*gen_pack) (rtx, rtx, rtx);
+ rtx (*gen_shift) (rtx, rtx, rtx);
+
+ /* Required for "pack". */
+ if (!TARGET_SSE4_2 || d->one_operand_p)
+ return false;
+
+ switch (d->vmode)
+ {
+ case V8HImode:
+ c = 0xffff;
+ s = 16;
+ half_mode = V4SImode;
+ gen_and = gen_andv4si3;
+ gen_pack = gen_sse4_1_packusdw;
+ gen_shift = gen_lshrv4si3;
+ break;
+ case V16QImode:
+ c = 0xff;
+ s = 8;
+ half_mode = V8HImode;
+ gen_and = gen_andv8hi3;
+ gen_pack = gen_sse2_packuswb;
+ gen_shift = gen_lshrv8hi3;
+ break;
+ case V16HImode:
+ c = 0xffff;
+ s = 16;
+ half_mode = V8SImode;
+ gen_and = gen_andv8si3;
+ gen_pack = gen_avx2_packusdw;
+ gen_shift = gen_lshrv8si3;
+ end_perm = true;
+ break;
+ case V32QImode:
+ c = 0xff;
+ s = 8;
+ half_mode = V16HImode;
+ gen_and = gen_andv16hi3;
+ gen_pack = gen_avx2_packuswb;
+ gen_shift = gen_lshrv16hi3;
+ end_perm = true;
+ break;
+ default:
+ /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
+ general shuffles. */
+ return false;
+ }
+
+ /* Check that permutation is even or odd. */
+ odd = d->perm[0];
+ if (odd > 1)
+ return false;
+
+ for (i = 1; i < nelt; ++i)
+ if (d->perm[i] != 2 * i + odd)
+ return false;
+
+ if (d->testing_p)
+ return true;
+
+ dop0 = gen_reg_rtx (half_mode);
+ dop1 = gen_reg_rtx (half_mode);
+ if (odd == 0)
+ {
+ for (i = 0; i < nelt / 2; i++)
+ rperm[i] = GEN_INT (c);
+ t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
+ t = force_reg (half_mode, t);
+ emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
+ emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
+ }
+ else
+ {
+ emit_insn (gen_shift (dop0,
+ gen_lowpart (half_mode, d->op0),
+ GEN_INT (s)));
+ emit_insn (gen_shift (dop1,
+ gen_lowpart (half_mode, d->op1),
+ GEN_INT (s)));
+ }
+ /* In AVX2 for 256 bit case we need to permute pack result. */
+ if (TARGET_AVX2 && end_perm)
+ {
+ op = gen_reg_rtx (d->vmode);
+ t = gen_reg_rtx (V4DImode);
+ emit_insn (gen_pack (op, dop0, dop1));
+ emit_insn (gen_avx2_permv4di_1 (t,
+ gen_lowpart (V4DImode, op),
+ const0_rtx,
+ const2_rtx,
+ const1_rtx,
+ GEN_INT (3)));
+ emit_move_insn (d->target, gen_lowpart (d->vmode, t));
+ }
+ else
+ emit_insn (gen_pack (d->target, dop0, dop1));
+
+ return true;
+}
+
/* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
and extract-odd permutations. */
@@ -48393,7 +48507,9 @@ expand_vec_perm_even_odd_1 (struct
expand_vec_perm_d *d, unsigned odd)
gcc_unreachable ();
case V8HImode:
- if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
+ if (TARGET_SSE4_2)
+ return expand_vec_perm_even_odd_pack (d);
+ else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
return expand_vec_perm_pshufb2 (d);
else
{
@@ -48416,7 +48532,9 @@ expand_vec_perm_even_odd_1 (struct
expand_vec_perm_d *d, unsigned odd)
break;
case V16QImode:
- if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
+ if (TARGET_SSE4_2)
+ return expand_vec_perm_even_odd_pack (d);
+ else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
return expand_vec_perm_pshufb2 (d);
else
{
@@ -48441,7 +48559,7 @@ expand_vec_perm_even_odd_1 (struct
expand_vec_perm_d *d, unsigned odd)
case V16HImode:
case V32QImode:
- return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
+ return expand_vec_perm_even_odd_pack (d);
case V4DImode:
if (!TARGET_AVX2)
@@ -48814,6 +48932,9 @@ ix86_expand_vec_perm_const_1 (struct
expand_vec_perm_d *d)
/* Try sequences of three instructions. */
+ if (expand_vec_perm_even_odd_pack (d))
+ return true;
+
if (expand_vec_perm_2vperm2f128_vshuf (d))
return true;
diff --git a/gcc/testsuite/gcc.target/i386/pr60451.c
b/gcc/testsuite/gcc.target/i386/pr60451.c
new file mode 100644
index 0000000..29f019d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr60451.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target sse4 } */
+/* { dg-options "-O2 -ftree-vectorize -msse4.2" } */
+
+void
+foo (unsigned char *a, unsigned char *b, unsigned char *c, int size)
+{
+ int i;
+
+ for (i = 0; i < size; i++)
+ a[i] = (unsigned char) ((unsigned int)1 + b[i] * c[i] * 117);
+}
+
+/* { dg-final { scan-assembler "packuswb|vpunpck" } } */
On Thu, Nov 20, 2014 at 3:26 PM, Uros Bizjak <[email protected]> wrote:
> On Thu, Nov 20, 2014 at 12:36 PM, Evgeny Stupachenko <[email protected]>
> wrote:
>> Hi,
>>
>> The patch expand even/odd permutation using:
>> "and, and, pack" in odd case
>> "shift, shift, pack" in even case
>>
>> instead of current "pshufb, pshufb, or" or big set of unpack insns.
>>
>> AVX2/CORE bootstrap and make check passed.
>> expensive tests are in progress
>>
>> Is it ok for trunk?
>>
>> Evgeny
>>
>> 2014-11-20 Evgeny Stupachenko <[email protected]>
>>
>> gcc/testsuite
>> PR target/60451
>> * gcc.target/i386/pr60451.c: New.
>>
>> gcc/
>> PR target/60451
>> * config/i386/i386.c (expand_vec_perm_even_odd_pack): New.
>>         (expand_vec_perm_even_odd_1): Add new expand for SSE cases;
>>         replace the existing expand for AVX2 cases.
>> (ix86_expand_vec_perm_const_1): Add new expand.
>
> OK with a couple of small adjustments below.
>
> Thanks,
> Uros.
>
>> +/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
>> + and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
>> + with two "and" and "pack" or two "shift" and "pack" insns. We should
>> + have already failed all two instruction sequences. */
>> +
>> +static bool
>> +expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
>> +{
>> + rtx op, dop0, dop1, t, rperm[16];
>> + unsigned i, odd, c, s, nelt = d->nelt;
>> + bool end_perm = false;
>> + machine_mode half_mode;
>> + rtx (*gen_and) (rtx, rtx, rtx);
>> + rtx (*gen_pack) (rtx, rtx, rtx);
>> + rtx (*gen_shift) (rtx, rtx, rtx);
>> +
>> + /* Required for "pack". */
>> + if (!TARGET_SSE4_2 || d->one_operand_p)
>> + return false;
>> +
>> + /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
>> general
>> + shuffles. */
>> + if (d->vmode == V8HImode)
>
> Use switch, as proposed by Jakub.
>
>> + {
>> + c = 0xffff;
>> + s = 16;
>> + half_mode = V4SImode;
>> + gen_and = gen_andv4si3;
>> + gen_pack = gen_sse4_1_packusdw;
>> + gen_shift = gen_lshrv4si3;
>> + }
>> + else if (d->vmode == V16QImode)
>> + {
>> + c = 0xff;
>> + s = 8;
>> + half_mode = V8HImode;
>> + gen_and = gen_andv8hi3;
>> + gen_pack = gen_sse2_packuswb;
>> + gen_shift = gen_lshrv8hi3;
>> + }
>> + else if (d->vmode == V16HImode)
>> + {
>> + c = 0xffff;
>> + s = 16;
>> + half_mode = V8SImode;
>> + gen_and = gen_andv8si3;
>> + gen_pack = gen_avx2_packusdw;
>> + gen_shift = gen_lshrv8si3;
>> + end_perm = true;
>> + }
>> + else if (d->vmode == V32QImode)
>> + {
>> + c = 0xff;
>> + s = 8;
>> + half_mode = V16HImode;
>> + gen_and = gen_andv16hi3;
>> + gen_pack = gen_avx2_packuswb;
>> + gen_shift = gen_lshrv16hi3;
>> + end_perm = true;
>> + }
>> + else
>> + return false;
>> +
>> + /* Check that permutation is even or odd. */
>> + odd = d->perm[0];
>> + if (odd != 0 && odd != 1)
>
> if (odd > 1)
>
>> + return false;
>> +
>> + for (i = 1; i < nelt; ++i)
>> + if (d->perm[i] != 2 * i + odd)
>> + return false;
>> +
>> + if (d->testing_p)
>> + return true;
>> +
>> + dop0 = gen_reg_rtx (half_mode);
>> + dop1 = gen_reg_rtx (half_mode);
>> + if (odd == 0)
>> + {
>> + for (i = 0; i < nelt / 2; rperm[i++] = GEN_INT (c));
>
> Please write above as:
>
> for (i = 0; i < nelt / 2; i++)
>     rperm[i] = GEN_INT (c);
>
>> + t = gen_rtx_CONST_VECTOR (half_mode, gen_rtvec_v (nelt / 2, rperm));
>> + t = force_reg (half_mode, t);
>> + emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
>> + emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
>> + }
>> + else
>> + {
>> + emit_insn (gen_shift (dop0,
>> + gen_lowpart (half_mode, d->op0),
>> + GEN_INT (s)));
>> + emit_insn (gen_shift (dop1,
>> + gen_lowpart (half_mode, d->op1),
>> + GEN_INT (s)));
>> + }
>> + /* In AVX2 for 256 bit case we need to permute pack result. */
>> + if (TARGET_AVX2 && end_perm)
>> + {
>> + op = gen_reg_rtx (d->vmode);
>> + t = gen_reg_rtx (V4DImode);
>> + emit_insn (gen_pack (op, dop0, dop1));
>> + emit_insn (gen_avx2_permv4di_1 (t, gen_lowpart (V4DImode, op),
>> const0_rtx,
>> + const2_rtx, const1_rtx, GEN_INT (3)));
>> + emit_move_insn (d->target, gen_lowpart (d->vmode, t));
>> + }
>> + else
>> + emit_insn (gen_pack (d->target, dop0, dop1));
>> +
>> + return true;
>> +}
>> +
>> /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even
>> and extract-odd permutations. */
>>
>> @@ -48393,6 +48503,8 @@ expand_vec_perm_even_odd_1 (struct
>> expand_vec_perm_d *d, unsigned odd)
>> gcc_unreachable ();
>>
>> case V8HImode:
>> + if (TARGET_SSE4_2)
>> + return expand_vec_perm_even_odd_pack (d);
>> if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
>
> "else if" in the above line, to be consistent with else below.
>
>> return expand_vec_perm_pshufb2 (d);
>> else
>> @@ -48416,6 +48528,8 @@ expand_vec_perm_even_odd_1 (struct
>> expand_vec_perm_d *d, unsigned odd)
>> break;
>>
>> case V16QImode:
>> + if (TARGET_SSE4_2)
>> + return expand_vec_perm_even_odd_pack (d);
>> if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
>
> "else if" in the above line.
>
>> return expand_vec_perm_pshufb2 (d);
>> else
>> @@ -48441,7 +48555,7 @@ expand_vec_perm_even_odd_1 (struct
>> expand_vec_perm_d *d, unsigned odd)
>>
>> case V16HImode:
>> case V32QImode:
>> - return expand_vec_perm_vpshufb2_vpermq_even_odd (d);
>> + return expand_vec_perm_even_odd_pack (d);
>>
>> case V4DImode:
>> if (!TARGET_AVX2)
>> @@ -48814,6 +48928,9 @@ ix86_expand_vec_perm_const_1 (struct
>> expand_vec_perm_d *d)
>>
>> /* Try sequences of three instructions. */
>>
>> + if (expand_vec_perm_even_odd_pack (d))
>> + return true;
>> +
>> if (expand_vec_perm_2vperm2f128_vshuf (d))
>> return true;
>>
>> diff --git a/gcc/testsuite/gcc.target/i386/pr60451.c
>> b/gcc/testsuite/gcc.target/i386/pr60451.c
>> new file mode 100644
>> index 0000000..29f019d
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/i386/pr60451.c
>> @@ -0,0 +1,14 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target sse4 } */
>> +/* { dg-options "-O2 -ftree-vectorize -msse4.2" } */
>> +
>> +void
>> +foo (unsigned char *a, unsigned char *b, unsigned char *c, int size)
>> +{
>> + int i;
>> +
>> + for (i = 0; i < size; i++)
>> + a[i] = (unsigned char) ((unsigned int)1 + b[i] * c[i] * 117);
>> +}
>> +
>> +/* { dg-final { scan-assembler "packuswb|vpunpck" } } */