https://gcc.gnu.org/g:b6fb4f7f651d2aa89548c5833fe2679af2638df5
commit r15-2940-gb6fb4f7f651d2aa89548c5833fe2679af2638df5 Author: Roger Sayle <ro...@nextmovesoftware.com> Date: Thu Aug 15 22:02:05 2024 +0100 i386: Improve split of *extendv2di2_highpart_stv_noavx512vl. This patch follows up on the previous patch to fix PR target/116275 by improving the code STV (ultimately) generates for highpart sign extensions like (x<<8)>>8. The arithmetic right shift is able to take advantage of the available common subexpressions from the preceding left shift. Hence previously with -O2 -m32 -mavx -mno-avx512vl we'd generate: vpsllq $8, %xmm0, %xmm0 vpsrad $8, %xmm0, %xmm1 vpsrlq $8, %xmm0, %xmm0 vpblendw $51, %xmm0, %xmm1, %xmm0 But with improved splitting, we now generate three instructions: vpslld $8, %xmm1, %xmm0 vpsrad $8, %xmm0, %xmm0 vpblendw $51, %xmm1, %xmm0, %xmm0 This patch also implements Uros' suggestion that the pre-reload splitter could introduce a new pseudo to hold the intermediate to potentially help reload with register allocation, which applies when not performing the above optimization, i.e. on TARGET_XOP. 2024-08-15 Roger Sayle <ro...@nextmovesoftware.com> Uros Bizjak <ubiz...@gmail.com> gcc/ChangeLog * config/i386/i386.md (*extendv2di2_highpart_stv_noavx512vl): Split to an improved implementation on !TARGET_XOP. On TARGET_XOP, use a new pseudo for the intermediate to simplify register allocation. gcc/testsuite/ChangeLog * g++.target/i386/pr116275-2.C: New test case. 
Diff: --- gcc/config/i386/i386.md | 32 ++++++++++++++++++++++++++++-- gcc/testsuite/g++.target/i386/pr116275-2.C | 19 ++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index efbab2f25ec..36108e5c2c9 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -17872,10 +17872,38 @@ && ix86_pre_reload_split ()" "#" "&& 1" - [(set (match_dup 0) + [(set (match_dup 4) (ashift:V2DI (match_dup 1) (match_dup 2))) (set (match_dup 0) - (ashiftrt:V2DI (match_dup 0) (match_dup 2)))]) + (ashiftrt:V2DI (match_dup 4) (match_dup 2)))] +{ + if (!TARGET_XOP) + { + rtx op0 = operands[0]; + rtx op2 = operands[2]; + rtx tmp1 = gen_reg_rtx (V4SImode); + rtx tmp2 = gen_reg_rtx (V4SImode); + rtx tmp3 = gen_reg_rtx (V4SImode); + rtx tmp4 = gen_reg_rtx (V4SImode); + emit_move_insn (tmp1, lowpart_subreg (V4SImode, operands[1], V2DImode)); + emit_insn (gen_ashlv4si3 (tmp2, tmp1, op2)); + emit_insn (gen_ashrv4si3 (tmp3, tmp2, op2)); + vec_perm_builder sel (4, 4, 1); + sel.quick_grow (4); + sel[0] = 0; + sel[1] = 5; + sel[2] = 2; + sel[3] = 7; + vec_perm_indices indices(sel, 2, 4); + bool ok = targetm.vectorize.vec_perm_const (V4SImode, V4SImode, tmp4, + tmp1, tmp3, indices); + gcc_assert (ok); + emit_move_insn (op0, lowpart_subreg (V2DImode, tmp4, V4SImode)); + DONE; + } + else + operands[4] = gen_reg_rtx (V2DImode); +}) ;; Rotate instructions diff --git a/gcc/testsuite/g++.target/i386/pr116275-2.C b/gcc/testsuite/g++.target/i386/pr116275-2.C new file mode 100644 index 00000000000..98d3c19e59c --- /dev/null +++ b/gcc/testsuite/g++.target/i386/pr116275-2.C @@ -0,0 +1,19 @@ +/* { dg-do compile { target ia32 } } */ +/* { dg-options "-O2 -mavx -mno-avx512vl -std=c++11" } */ + +struct SymbolDesc push_back(SymbolDesc); +struct SymbolDesc { + long long ELFLocalSymIdx; +}; +struct Expected { + long long &operator*(); +}; +void SymbolizableObjectFileaddSymbol() { + Expected SymbolAddressOrErr; + long long 
SymbolAddress = *SymbolAddressOrErr << 8 >> 8; + push_back({SymbolAddress}); +} + +/* { dg-final { scan-assembler "vpslld" } } */ +/* { dg-final { scan-assembler-not "vpsllq" } } */ +/* { dg-final { scan-assembler-not "vpsrlq" } } */