https://gcc.gnu.org/g:0ebaffccb294d90184ad78367de66b6307de3ac0
commit r15-717-g0ebaffccb294d90184ad78367de66b6307de3ac0
Author: liuhongt <hongtao....@intel.com>
Date: Fri Mar 22 14:40:00 2024 +0800

    Use pblendw instead of pand to clear upper 16 bits.

    For vec_pack_truncv8si/v4si w/o AVX512,
    (const_vector:v4si (const_int 0xffff) x4) is used as mask to clear
    upper 16 bits, but vpblendw with zero_vector can also be used, and
    zero vector is cheaper than (const_vector:v4si (const_int 0xffff) x4).

    gcc/ChangeLog:

            PR target/114427
            * config/i386/i386-expand.cc (expand_vec_perm_even_odd_pack):
            Use pblendw instead of pand to clear upper bits.

    gcc/testsuite/ChangeLog:

            * gcc.target/i386/pr114427.c: New test.

Diff:
---
 gcc/config/i386/i386-expand.cc | 34 ++++++++++++++++++++++++++++----
 gcc/testsuite/gcc.target/i386/pr114427.c | 18 +++++++++++++++++
 2 files changed, 48 insertions(+), 4 deletions(-)

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 100fb2afb3a..7142c0a9d77 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -22587,6 +22587,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
 {
   rtx op, dop0, dop1, t;
   unsigned i, odd, c, s, nelt = d->nelt;
+  int pblendw_i = 0;
   bool end_perm = false;
   machine_mode half_mode;
   rtx (*gen_and) (rtx, rtx, rtx);
@@ -22608,6 +22609,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
       gen_and = gen_andv2si3;
       gen_pack = gen_mmx_packusdw;
       gen_shift = gen_lshrv2si3;
+      pblendw_i = 0x5;
       break;
     case E_V8HImode:
       /* Required for "pack". */
@@ -22619,6 +22621,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
       gen_and = gen_andv4si3;
       gen_pack = gen_sse4_1_packusdw;
       gen_shift = gen_lshrv4si3;
+      pblendw_i = 0x55;
       break;
     case E_V8QImode:
       /* No check as all instructions are SSE2. */
@@ -22647,6 +22650,7 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
       gen_and = gen_andv8si3;
       gen_pack = gen_avx2_packusdw;
       gen_shift = gen_lshrv8si3;
+      pblendw_i = 0x5555;
       end_perm = true;
       break;
     case E_V32QImode:
@@ -22682,10 +22686,32 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
   dop1 = gen_reg_rtx (half_mode);
   if (odd == 0)
     {
-      t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
-      t = force_reg (half_mode, t);
-      emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
-      emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
+      /* Use pblendw since const_vector 0 should be cheaper than
+         const_vector 0xffff. */
+      if (d->vmode == V4HImode
+          || d->vmode == E_V8HImode
+          || d->vmode == E_V16HImode)
+        {
+          rtx dop0_t = gen_reg_rtx (d->vmode);
+          rtx dop1_t = gen_reg_rtx (d->vmode);
+          t = gen_reg_rtx (d->vmode);
+          emit_move_insn (t, CONST0_RTX (d->vmode));
+
+          emit_move_insn (dop0_t, gen_rtx_VEC_MERGE (d->vmode, d->op0, t,
+                                                     GEN_INT (pblendw_i)));
+          emit_move_insn (dop1_t, gen_rtx_VEC_MERGE (d->vmode, d->op1, t,
+                                                     GEN_INT (pblendw_i)));
+
+          emit_move_insn (dop0, gen_lowpart (half_mode, dop0_t));
+          emit_move_insn (dop1, gen_lowpart (half_mode, dop1_t));
+        }
+      else
+        {
+          t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
+          t = force_reg (half_mode, t);
+          emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
+          emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
+        }
     }
   else
     {
diff --git a/gcc/testsuite/gcc.target/i386/pr114427.c b/gcc/testsuite/gcc.target/i386/pr114427.c
new file mode 100644
index 00000000000..58b66db7fff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr114427.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-march=x86-64-v3 -O2 -mno-avx512f" } */
+/* { dg-final { scan-assembler-not "vpand" } } */
+/* { dg-final { scan-assembler-not "65535" } } */
+
+void
+foo (int* a, short* __restrict b, int* c)
+{
+  for (int i = 0; i != 16; i++)
+    b[i] = c[i] + a[i];
+}
+
+void
+foo1 (int* a, short* __restrict b, int* c)
+{
+  for (int i = 0; i != 8; i++)
+    b[i] = c[i] + a[i];
+}