https://gcc.gnu.org/g:a71f90c5a7ae2942083921033cb23dcd63e70525

commit r15-499-ga71f90c5a7ae2942083921033cb23dcd63e70525
Author: Levy Hsu <ad...@levyhsu.com>
Date:   Thu May 9 16:50:56 2024 +0800

    x86: Add 3-instruction subroutine vector shift for V16QI in 
ix86_expand_vec_perm_const_1 [PR107563]
    
    Hi All
    
    We've introduced a new subroutine in ix86_expand_vec_perm_const_1
    to optimize vector shifting for the V16QI type on x86.
    This patch uses a three-instruction sequence psrlw, psllw, and por
    to handle specific vector shuffle operations more efficiently.
    The change aims to improve assembly code generation for configurations
    supporting SSE2.
    
    Bootstrapped and tested on x86_64-linux-gnu, OK for trunk?
    
    Best
    Levy
    
    gcc/ChangeLog:
    
            PR target/107563
            * config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): New
            subroutine.
            (ix86_expand_vec_perm_const_1): Call 
expand_vec_perm_psrlw_psllw_por.
    
    gcc/testsuite/ChangeLog:
    
            PR target/107563
            * g++.target/i386/pr107563-a.C: New test.
            * g++.target/i386/pr107563-b.C: New test.

Diff:
---
 gcc/config/i386/i386-expand.cc             | 64 ++++++++++++++++++++++++++++++
 gcc/testsuite/g++.target/i386/pr107563-a.C | 13 ++++++
 gcc/testsuite/g++.target/i386/pr107563-b.C | 12 ++++++
 3 files changed, 89 insertions(+)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 1ab22fe79736..e846a946de07 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -22362,6 +22362,67 @@ expand_vec_perm_2perm_pblendv (struct 
expand_vec_perm_d *d, bool two_insn)
   return true;
 }
 
+/* A subroutine of ix86_expand_vec_perm_const_1.
+   Implement a permutation with psrlw, psllw and por.
+   It handles case:
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6); */
+
+static bool
+expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
+{
+  unsigned i;
+  rtx (*gen_shr) (rtx, rtx, rtx);
+  rtx (*gen_shl) (rtx, rtx, rtx);
+  rtx (*gen_or) (rtx, rtx, rtx);
+  machine_mode mode = VOIDmode;
+
+  if (!TARGET_SSE2 || !d->one_operand_p)
+    return false;
+
+  switch (d->vmode)
+    {
+    case E_V8QImode:
+      if (!TARGET_MMX_WITH_SSE)
+       return false;
+      mode = V4HImode;
+      gen_shr = gen_ashrv4hi3;
+      gen_shl = gen_ashlv4hi3;
+      gen_or = gen_iorv4hi3;
+      break;
+    case E_V16QImode:
+      mode = V8HImode;
+      gen_shr = gen_vlshrv8hi3;
+      gen_shl = gen_vashlv8hi3;
+      gen_or = gen_iorv8hi3;
+      break;
+    default: return false;
+    }
+
+  if (!rtx_equal_p (d->op0, d->op1))
+    return false;
+
+  for (i = 0; i < d->nelt; i += 2)
+    if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  rtx tmp1 = gen_reg_rtx (mode);
+  rtx tmp2 = gen_reg_rtx (mode);
+  rtx op0 = force_reg (d->vmode, d->op0);
+
+  emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
+  emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
+  emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
+  emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
+  emit_insn (gen_or (tmp1, tmp1, tmp2));
+  emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
    permutation using two vperm2f128, followed by a vshufpd insn blending
    the two vectors together.  */
@@ -23782,6 +23843,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d 
*d)
   if (expand_vec_perm_2perm_pblendv (d, false))
     return true;
 
+  if (expand_vec_perm_psrlw_psllw_por (d))
+    return true;
+
   /* Try sequences of four instructions.  */
 
   if (expand_vec_perm_even_odd_trunc (d))
diff --git a/gcc/testsuite/g++.target/i386/pr107563-a.C 
b/gcc/testsuite/g++.target/i386/pr107563-a.C
new file mode 100755
index 000000000000..605c1bdf814b
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563-a.C
@@ -0,0 +1,13 @@
+/* PR target/107563.C */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-times "psllw" 1 } } */
+/* { dg-final { scan-assembler-times "psraw" 1 } } */
+/* { dg-final { scan-assembler-times "por" 1 } } */
+
+using temp_vec_type2 [[__gnu__::__vector_size__(8)]] = char;
+
+void foo2(temp_vec_type2& v) noexcept
+{
+  v = __builtin_shufflevector(v, v, 1, 0, 3, 2, 5, 4, 7, 6);
+}
diff --git a/gcc/testsuite/g++.target/i386/pr107563-b.C 
b/gcc/testsuite/g++.target/i386/pr107563-b.C
new file mode 100755
index 000000000000..0ce3e8263bb5
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563-b.C
@@ -0,0 +1,12 @@
+/* PR target/107563.C */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-times "psllw" 1 } } */
+/* { dg-final { scan-assembler-times "psrlw" 1 } } */
+/* { dg-final { scan-assembler-times "por" 1 } } */
+
+using temp_vec_type [[__gnu__::__vector_size__(16)]] = char;
+
+void foo(temp_vec_type& v) noexcept
+{
+  v = __builtin_shufflevector(v, v, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 
12, 15, 14);
+}

Reply via email to