https://gcc.gnu.org/g:f5a8cdc1ef5d6aa2de60849c23658ac5298df7bb
commit r15-366-gf5a8cdc1ef5d6aa2de60849c23658ac5298df7bb Author: Roger Sayle <ro...@nextmovesoftware.com> Date: Fri May 10 20:26:40 2024 +0100 i386: Improve V[48]QI shifts on AVX512/SSE4.1 The following one-line patch improves the code generated for V8QI and V4QI shifts when AVX512BW and AVX512VL functionality is available. For the testcase (from gcc.target/i386/vect-shiftv8qi.c): typedef signed char v8qi __attribute__ ((__vector_size__ (8))); v8qi foo (v8qi x) { return x >> 5; } GCC with -O2 -march=cascadelake currently generates: foo: movl $67372036, %eax vpsraw $5, %xmm0, %xmm2 vpbroadcastd %eax, %xmm1 movl $117901063, %eax vpbroadcastd %eax, %xmm3 vmovdqa %xmm1, %xmm0 vmovdqa %xmm3, -24(%rsp) vpternlogd $120, -24(%rsp), %xmm2, %xmm0 vpsubb %xmm1, %xmm0, %xmm0 ret with this patch we now generate the much improved: foo: vpmovsxbw %xmm0, %xmm0 vpsraw $5, %xmm0, %xmm0 vpmovwb %xmm0, %xmm0 ret This patch also fixes the FAILs of gcc.target/i386/vect-shiftv[48]qi.c when run with the additional -march=cascadelake flag, by splitting these tests into two; one form testing code generation with -msse2 (and -mno-avx512vl) as originally intended, and the other testing AVX512 code generation with an explicit -march=cascadelake. 2024-05-10 Roger Sayle <ro...@nextmovesoftware.com> Hongtao Liu <hongtao....@intel.com> gcc/ChangeLog * config/i386/i386-expand.cc (ix86_expand_vecop_qihi_partial): Don't attempt ix86_expand_vec_shift_qihi_constant on SSE4.1. gcc/testsuite/ChangeLog * gcc.target/i386/vect-shiftv4qi.c: Specify -mno-avx512vl. * gcc.target/i386/vect-shiftv8qi.c: Likewise. * gcc.target/i386/vect-shiftv4qi-2.c: New test case. * gcc.target/i386/vect-shiftv8qi-2.c: Likewise. 
Diff: --- gcc/config/i386/i386-expand.cc | 3 ++ gcc/testsuite/gcc.target/i386/vect-shiftv4qi-2.c | 43 ++++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c | 2 +- gcc/testsuite/gcc.target/i386/vect-shiftv8qi-2.c | 43 ++++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/vect-shiftv8qi.c | 2 +- 5 files changed, 91 insertions(+), 2 deletions(-) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 2f27bfb484c2..1ab22fe79736 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -24283,6 +24283,9 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2) if (CONST_INT_P (op2) && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT) + /* With AVX512 it's cheaper to do vpmovsxbw/op/vpmovwb. + Even with SSE4.1 the alternative is better. */ + && !TARGET_SSE4_1 && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2)) { emit_move_insn (dest, gen_lowpart (qimode, qdest)); diff --git a/gcc/testsuite/gcc.target/i386/vect-shiftv4qi-2.c b/gcc/testsuite/gcc.target/i386/vect-shiftv4qi-2.c new file mode 100644 index 000000000000..abc1a276b043 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-shiftv4qi-2.c @@ -0,0 +1,43 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=cascadelake" } */ + +#define N 4 + +typedef unsigned char __vu __attribute__ ((__vector_size__ (N))); +typedef signed char __vi __attribute__ ((__vector_size__ (N))); + +__vu sll (__vu a, int n) +{ + return a << n; +} + +__vu sll_c (__vu a) +{ + return a << 5; +} + +/* { dg-final { scan-assembler-times "vpsllw" 2 } } */ + +__vu srl (__vu a, int n) +{ + return a >> n; +} + +__vu srl_c (__vu a) +{ + return a >> 5; +} + +/* { dg-final { scan-assembler-times "vpsrlw" 2 } } */ + +__vi sra (__vi a, int n) +{ + return a >> n; +} + +__vi sra_c (__vi a) +{ + return a >> 5; +} + +/* { dg-final { scan-assembler-times "vpsraw" 2 } } */ diff --git a/gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c 
b/gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c index b7e45c2e8799..9b52582d01f8 100644 --- a/gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c +++ b/gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c @@ -1,5 +1,5 @@ /* { dg-do compile } */ -/* { dg-options "-O2 -msse2" } */ +/* { dg-options "-O2 -msse2 -mno-avx2 -mno-avx512vl" } */ #define N 4 diff --git a/gcc/testsuite/gcc.target/i386/vect-shiftv8qi-2.c b/gcc/testsuite/gcc.target/i386/vect-shiftv8qi-2.c new file mode 100644 index 000000000000..52760f5a0607 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/vect-shiftv8qi-2.c @@ -0,0 +1,43 @@ +/* { dg-do compile { target { ! ia32 } } } */ +/* { dg-options "-O2 -march=cascadelake" } */ + +#define N 8 + +typedef unsigned char __vu __attribute__ ((__vector_size__ (N))); +typedef signed char __vi __attribute__ ((__vector_size__ (N))); + +__vu sll (__vu a, int n) +{ + return a << n; +} + +__vu sll_c (__vu a) +{ + return a << 5; +} + +/* { dg-final { scan-assembler-times "vpsllw" 2 } } */ + +__vu srl (__vu a, int n) +{ + return a >> n; +} + +__vu srl_c (__vu a) +{ + return a >> 5; +} + +/* { dg-final { scan-assembler-times "vpsrlw" 2 } } */ + +__vi sra (__vi a, int n) +{ + return a >> n; +} + +__vi sra_c (__vi a) +{ + return a >> 5; +} + +/* { dg-final { scan-assembler-times "vpsraw" 2 } } */ diff --git a/gcc/testsuite/gcc.target/i386/vect-shiftv8qi.c b/gcc/testsuite/gcc.target/i386/vect-shiftv8qi.c index 2471e6ed17d8..3dfcfd28a733 100644 --- a/gcc/testsuite/gcc.target/i386/vect-shiftv8qi.c +++ b/gcc/testsuite/gcc.target/i386/vect-shiftv8qi.c @@ -1,5 +1,5 @@ /* { dg-do compile { target { ! ia32 } } } */ -/* { dg-options "-O2 -msse2" } */ +/* { dg-options "-O2 -msse2 -mno-avx2 -mno-avx512vl" } */ #define N 8