[gcc r15-366] i386: Improve V[48]QI shifts on AVX512/SSE4.1

Roger Sayle via Gcc-cvs Fri, 10 May 2024 12:28:20 -0700

https://gcc.gnu.org/g:f5a8cdc1ef5d6aa2de60849c23658ac5298df7bb


commit r15-366-gf5a8cdc1ef5d6aa2de60849c23658ac5298df7bb
Author: Roger Sayle <ro...@nextmovesoftware.com>
Date:   Fri May 10 20:26:40 2024 +0100

    i386: Improve V[48]QI shifts on AVX512/SSE4.1
    
    The following one-line patch improves the code generated for V8QI and V4QI
    shifts when AVX512BW and AVX512VL functionality is available.
    
    For the testcase (from gcc.target/i386/vect-shiftv8qi.c):
    
    typedef signed char v8qi __attribute__ ((__vector_size__ (8)));
    v8qi foo (v8qi x) { return x >> 5; }
    
    GCC with -O2 -march=cascadelake currently generates:
    
    foo:    movl    $67372036, %eax
            vpsraw  $5, %xmm0, %xmm2
            vpbroadcastd    %eax, %xmm1
            movl    $117901063, %eax
            vpbroadcastd    %eax, %xmm3
            vmovdqa %xmm1, %xmm0
            vmovdqa %xmm3, -24(%rsp)
            vpternlogd      $120, -24(%rsp), %xmm2, %xmm0
            vpsubb  %xmm1, %xmm0, %xmm0
            ret
    
    with this patch we now generate the much improved:
    
    foo:    vpmovsxbw       %xmm0, %xmm0
            vpsraw  $5, %xmm0, %xmm0
            vpmovwb %xmm0, %xmm0
            ret
    
    This patch also fixes the FAILs of gcc.target/i386/vect-shiftv[48]qi.c
    when run with the additional -march=cascadelake flag, by splitting these
    tests into two; one form testing code generation with -msse2 (and
    -mno-avx512vl) as originally intended, and the other testing AVX512
    code generation with an explicit -march=cascadelake.
    
    2024-05-10  Roger Sayle  <ro...@nextmovesoftware.com>
                Hongtao Liu  <hongtao....@intel.com>
    
    gcc/ChangeLog
            * config/i386/i386-expand.cc (ix86_expand_vecop_qihi_partial):
            Don't attempt ix86_expand_vec_shift_qihi_constant on SSE4.1.
    
    gcc/testsuite/ChangeLog
            * gcc.target/i386/vect-shiftv4qi.c: Specify -mno-avx2 -mno-avx512vl.
            * gcc.target/i386/vect-shiftv8qi.c: Likewise.
            * gcc.target/i386/vect-shiftv4qi-2.c: New test case.
            * gcc.target/i386/vect-shiftv8qi-2.c: Likewise.

Diff:
---
 gcc/config/i386/i386-expand.cc                   |  3 ++
 gcc/testsuite/gcc.target/i386/vect-shiftv4qi-2.c | 43 ++++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c   |  2 +-
 gcc/testsuite/gcc.target/i386/vect-shiftv8qi-2.c | 43 ++++++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/vect-shiftv8qi.c   |  2 +-
 5 files changed, 91 insertions(+), 2 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 2f27bfb484c2..1ab22fe79736 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -24283,6 +24283,9 @@ ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
 
   if (CONST_INT_P (op2)
       && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
+      /* With AVX512 it's cheaper to do vpmovsxbw/op/vpmovwb.
+         Even with SSE4.1 the alternative is better.  */
+      && !TARGET_SSE4_1
       && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2))
     {
       emit_move_insn (dest, gen_lowpart (qimode, qdest));
diff --git a/gcc/testsuite/gcc.target/i386/vect-shiftv4qi-2.c b/gcc/testsuite/gcc.target/i386/vect-shiftv4qi-2.c
new file mode 100644
index 000000000000..abc1a276b043
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-shiftv4qi-2.c
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -march=cascadelake" } */
+
+#define N 4
+
+typedef unsigned char __vu __attribute__ ((__vector_size__ (N)));
+typedef signed char __vi __attribute__ ((__vector_size__ (N)));
+
+__vu sll (__vu a, int n)
+{
+  return a << n;
+}
+
+__vu sll_c (__vu a)
+{
+  return a << 5;
+}
+
+/* { dg-final { scan-assembler-times "vpsllw" 2 } } */
+
+__vu srl (__vu a, int n)
+{
+  return a >> n;
+}
+
+__vu srl_c (__vu a)
+{
+  return a >> 5;
+}
+
+/* { dg-final { scan-assembler-times "vpsrlw" 2 } } */
+
+__vi sra (__vi a, int n)
+{
+  return a >> n;
+}
+
+__vi sra_c (__vi a)
+{
+  return a >> 5;
+}
+
+/* { dg-final { scan-assembler-times "vpsraw" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c b/gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c
index b7e45c2e8799..9b52582d01f8 100644
--- a/gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c
+++ b/gcc/testsuite/gcc.target/i386/vect-shiftv4qi.c
@@ -1,5 +1,5 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -msse2" } */
+/* { dg-options "-O2 -msse2 -mno-avx2 -mno-avx512vl" } */
 
 #define N 4
 
diff --git a/gcc/testsuite/gcc.target/i386/vect-shiftv8qi-2.c b/gcc/testsuite/gcc.target/i386/vect-shiftv8qi-2.c
new file mode 100644
index 000000000000..52760f5a0607
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-shiftv8qi-2.c
@@ -0,0 +1,43 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-O2 -march=cascadelake" } */
+
+#define N 8
+
+typedef unsigned char __vu __attribute__ ((__vector_size__ (N)));
+typedef signed char __vi __attribute__ ((__vector_size__ (N)));
+
+__vu sll (__vu a, int n)
+{
+  return a << n;
+}
+
+__vu sll_c (__vu a)
+{
+  return a << 5;
+}
+
+/* { dg-final { scan-assembler-times "vpsllw" 2 } } */
+
+__vu srl (__vu a, int n)
+{
+  return a >> n;
+}
+
+__vu srl_c (__vu a)
+{
+  return a >> 5;
+}
+
+/* { dg-final { scan-assembler-times "vpsrlw" 2 } } */
+
+__vi sra (__vi a, int n)
+{
+  return a >> n;
+}
+
+__vi sra_c (__vi a)
+{
+  return a >> 5;
+}
+
+/* { dg-final { scan-assembler-times "vpsraw" 2 } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-shiftv8qi.c b/gcc/testsuite/gcc.target/i386/vect-shiftv8qi.c
index 2471e6ed17d8..3dfcfd28a733 100644
--- a/gcc/testsuite/gcc.target/i386/vect-shiftv8qi.c
+++ b/gcc/testsuite/gcc.target/i386/vect-shiftv8qi.c
@@ -1,5 +1,5 @@
 /* { dg-do compile { target { ! ia32 } } } */
-/* { dg-options "-O2 -msse2" } */
+/* { dg-options "-O2 -msse2 -mno-avx2 -mno-avx512vl" } */
 
 #define N 8

[gcc r15-366] i386: Improve V[48]QI shifts on AVX512/SSE4.1

Reply via email to