Hi All, We've introduced a new subroutine of ix86_expand_vec_perm_const_1 to optimize byte-swapping vector shuffles (exchanging the two bytes of every 16-bit word) for the V16QI and V8QI types on x86. This patch uses a three-instruction sequence of psrlw, psllw, and por to handle these specific vector shuffle operations more efficiently. The change aims to improve assembly code generation for configurations supporting SSE2. This update addresses the issue detailed in Bugzilla report 107563 (PR target/107563).
Bootstrapped and tested on x86_64-linux-gnu, OK for trunk?

BRs,
Levy

gcc/ChangeLog:

	* config/i386/i386-expand.cc (expand_vec_perm_psrlw_psllw_por): New
	subroutine.
	(ix86_expand_vec_perm_const_1): New Entry.

gcc/testsuite/ChangeLog:

	* g++.target/i386/pr107563.C: New test.
---
 gcc/config/i386/i386-expand.cc           | 64 ++++++++++++++++++++++++
 gcc/testsuite/g++.target/i386/pr107563.C | 23 +++++++++
 2 files changed, 87 insertions(+)
 create mode 100755 gcc/testsuite/g++.target/i386/pr107563.C

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 2f27bfb484c..2718b0acb87 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -22362,6 +22362,67 @@ expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
   return true;
 }
 
+/* A subroutine of ix86_expand_vec_perm_const_1.
+   Implement a swap of the two bytes of each 16-bit word as (x >> 8) | (x << 8)
+   with psrlw (logical, so no sign bits reach the high byte), psllw and por:
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+   __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6);  */
+
+static bool
+expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
+{
+  unsigned i;
+  rtx (*gen_shr) (rtx, rtx, rtx);
+  rtx (*gen_shl) (rtx, rtx, rtx);
+  rtx (*gen_or) (rtx, rtx, rtx);
+  machine_mode mode = VOIDmode;
+
+  if (!TARGET_SSE2 || !d->one_operand_p)
+    return false;
+
+  switch (d->vmode)
+    {
+    case E_V8QImode:
+      if (!TARGET_MMX_WITH_SSE)
+	return false;
+      mode = V4HImode;
+      gen_shr = gen_lshrv4hi3;
+      gen_shl = gen_ashlv4hi3;
+      gen_or = gen_iorv4hi3;
+      break;
+    case E_V16QImode:
+      mode = V8HImode;
+      gen_shr = gen_lshrv8hi3;
+      gen_shl = gen_ashlv8hi3;
+      gen_or = gen_iorv8hi3;
+      break;
+    default: return false;
+    }
+
+  if (!rtx_equal_p (d->op0, d->op1))
+    return false;
+
+  for (i = 0; i < d->nelt; i += 2)
+    if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
+      return false;
+
+  if (d->testing_p)
+    return true;
+
+  rtx tmp1 = gen_reg_rtx (mode);
+  rtx tmp2 = gen_reg_rtx (mode);
+  rtx op0 = force_reg (d->vmode, d->op0);
+
+  emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
+  emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
+  emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
+  emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
+  emit_insn (gen_or (tmp1, tmp1, tmp2));
+  emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
+
+  return true;
+}
+
 /* A subroutine of ix86_expand_vec_perm_const_1.  Implement a V4DF
    permutation using two vperm2f128, followed by a vshufpd insn blending
    the two vectors together.  */
@@ -23781,6 +23842,9 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
 
   if (expand_vec_perm_2perm_pblendv (d, false))
     return true;
+
+  if (expand_vec_perm_psrlw_psllw_por (d))
+    return true;
 
   /* Try sequences of four instructions.  */
 
diff --git a/gcc/testsuite/g++.target/i386/pr107563.C b/gcc/testsuite/g++.target/i386/pr107563.C
new file mode 100755
index 00000000000..5b0c648e8f1
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr107563.C
@@ -0,0 +1,23 @@
+/* PR target/107563.C */
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-std=c++2b -O3 -msse2" } */
+/* { dg-final { scan-assembler-not "movzbl" } } */
+/* { dg-final { scan-assembler-not "salq" } } */
+/* { dg-final { scan-assembler-not "orq" } } */
+/* { dg-final { scan-assembler-not "punpcklqdq" } } */
+/* { dg-final { scan-assembler-times "psllw" 2 } } */
+/* { dg-final { scan-assembler-times "psrlw" 2 } } */
+/* { dg-final { scan-assembler-not "psraw" } } */
+/* { dg-final { scan-assembler-times "por" 2 } } */
+
+using temp_vec_type [[__gnu__::__vector_size__ (16)]] = char;
+void foo (temp_vec_type& v) noexcept
+{
+  v = __builtin_shufflevector(v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
+}
+
+using temp_vec_type2 [[__gnu__::__vector_size__ (8)]] = char;
+void foo2 (temp_vec_type2& v) noexcept
+{
+  v=__builtin_shufflevector(v,v,1,0,3,2,5,4,7,6);
+}
-- 
2.31.1