Hi! The ix86_expand_vecop_qihi function has been adjusted for AVX512* just by changing i < 32 to i < 64 (where both were sometimes wasteful), but for !full_interleave that is even wrong, swapping the second and third quarter is something that works to undo AVX256 unpacks only, where we want 0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62, permutation. But, for AVX512 we want 0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126 where the current trunk code has been producing 0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62,64,66,68,70,72,74,76,78,96,98,100,102,104,106,108,110,80,82,84,86,88,90,92,94,112,114,116,118,120,122,124,126 instead.
Fixed thusly, bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2016-03-21 Jakub Jelinek <ja...@redhat.com> PR target/70329 * config/i386/i386.c (ix86_expand_vecop_qihi): Don't bother computing d.perm[i] for i >= d.nelt. If not full_interleave, compute d.perm[i] in a way that works also for AVX512BW. * gcc.target/i386/avx512bw-pr70329-1.c: New test. * gcc.target/i386/avx512bw-pr70329-2.c: New test. --- gcc/config/i386/i386.c.jj 2016-03-11 17:37:38.000000000 +0100 +++ gcc/config/i386/i386.c 2016-03-21 15:49:37.322044578 +0100 @@ -51910,7 +51910,7 @@ ix86_expand_vecop_qihi (enum rtx_code co { /* For SSE2, we used an full interleave, so the desired results are in the even elements. */ - for (i = 0; i < 64; ++i) + for (i = 0; i < d.nelt; ++i) d.perm[i] = i * 2; } else @@ -51918,8 +51918,8 @@ ix86_expand_vecop_qihi (enum rtx_code co /* For AVX, the interleave used above was not cross-lane. So the extraction is evens but with the second and third quarter swapped. Happily, that is even one insn shorter than even extraction. */ - for (i = 0; i < 64; ++i) - d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0); + for (i = 0; i < d.nelt; ++i) + d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15); } ok = ix86_expand_vec_perm_const_1 (&d); --- gcc/testsuite/gcc.target/i386/avx512bw-pr70329-1.c.jj 2016-03-21 16:06:44.073158614 +0100 +++ gcc/testsuite/gcc.target/i386/avx512bw-pr70329-1.c 2016-03-21 16:18:35.805538887 +0100 @@ -0,0 +1,27 @@ +/* PR target/70329 */ +/* { dg-do run } */ +/* { dg-options "-O0 -mavx512bw" } */ +/* { dg-require-effective-target avx512bw } */ + +#define AVX512BW +#include "avx512f-helper.h" + +typedef unsigned char A __attribute__ ((vector_size (64))); +typedef unsigned int B __attribute__ ((vector_size (64))); + +unsigned __attribute__ ((noinline, noclone)) +foo (A a, A b, B c) +{ + a *= b; + c[1] += a[8]; + return c[1]; +} + +void +TEST (void) +{ + A a = (A) { 1, 2, 3, 4, 5, 6, 7, 8, 9 }; + unsigned x = foo (a, a, (B) { 1, 2 }); + if (x != 83) + abort (); +} --- gcc/testsuite/gcc.target/i386/avx512bw-pr70329-2.c.jj 2016-03-21 16:09:37.667811745 +0100 +++ gcc/testsuite/gcc.target/i386/avx512bw-pr70329-2.c 2016-03-21 16:17:38.726309545 +0100 @@ -0,0 +1,33 @@ +/* PR target/70329 */ +/* { dg-do run } */ +/* { dg-options "-O2 -ftree-vectorize -mavx512bw" } */ +/* { dg-require-effective-target avx512bw } */ + +#define AVX512BW +#include "avx512f-helper.h" + +__attribute__((noinline, noclone)) void +foo (unsigned char *src1, unsigned char *src2, unsigned char *dst) +{ + int i; + + for (i = 0; i < 64; i++) + dst[i] = (unsigned char) ((int) src1[i] * (int) src2[i]); +} + +void +TEST (void) +{ + unsigned char a[64], b[64], c[64]; + int i; + + for (i = 0; i < 64; i++) + { + a[i] = i; + b[i] = (i + 1); + } + foo (a, b, c); + for (i = 0; i < 64; i++) + if (c[i] != (unsigned char) (i * (i + 1))) + abort (); +} Jakub