This patch follows up on the previous patch to fix PR target/116275 by
improving the code STV (ultimately) generates for highpart sign extensions
like (x<<8)>>8.  The arithmetic right shift is able to take advantage of
the available common subexpressions from the preceding left shift.

Hence, previously with -O2 -m32 -mavx -mno-avx512vl we'd generate four instructions:

        vpsllq  $8, %xmm0, %xmm0
        vpsrad  $8, %xmm0, %xmm1
        vpsrlq  $8, %xmm0, %xmm0
        vpblendw        $51, %xmm0, %xmm1, %xmm0

But with improved splitting, we now generate three instructions:

        vpslld  $8, %xmm1, %xmm0
        vpsrad  $8, %xmm0, %xmm0
        vpblendw        $51, %xmm1, %xmm0, %xmm0

This patch also implements Uros' suggestion that the pre-reload
splitter could introduce a new pseudo to hold the intermediate
result, potentially helping reload with register allocation; this applies
when not performing the above optimization, i.e. on TARGET_XOP.


This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
and make -k check, both with and without --target_board=unix{-m32}
with no new failures.  Ok for mainline?


2024-08-15  Roger Sayle  <ro...@nextmovesoftware.com>
            Uros Bizjak  <ubiz...@gmail.com>

gcc/ChangeLog
        * config/i386/i386.md (*extendv2di2_highpart_stv_noavx512vl): Split
        to an improved implementation on !TARGET_XOP.  On TARGET_XOP, use
        a new pseudo for the intermediate to simplify register allocation.

gcc/testsuite/ChangeLog
        * g++.target/i386/pr116275-2.C: New test case.


Thanks in advance,
Roger
--

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 1a6188f..6bd8e766 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -17407,10 +17407,38 @@
    && ix86_pre_reload_split ()"
   "#"
   "&& 1"
-  [(set (match_dup 0)
+  [(set (match_dup 4)
        (ashift:V2DI (match_dup 1) (match_dup 2)))
    (set (match_dup 0)
-       (ashiftrt:V2DI (match_dup 0) (match_dup 2)))])
+       (ashiftrt:V2DI (match_dup 4) (match_dup 2)))]
+{
+  if (!TARGET_XOP)
+    {
+      rtx op0 = operands[0];
+      rtx op2 = operands[2];
+      rtx tmp1 = gen_reg_rtx (V4SImode);
+      rtx tmp2 = gen_reg_rtx (V4SImode);
+      rtx tmp3 = gen_reg_rtx (V4SImode);
+      rtx tmp4 = gen_reg_rtx (V4SImode);
+      emit_move_insn (tmp1, lowpart_subreg (V4SImode, operands[1], V2DImode));
+      emit_insn (gen_ashlv4si3 (tmp2, tmp1, op2));
+      emit_insn (gen_ashrv4si3 (tmp3, tmp2, op2));
+      vec_perm_builder sel (4, 4, 1);
+      sel.quick_grow (4);
+      sel[0] = 0;
+      sel[1] = 5;
+      sel[2] = 2;
+      sel[3] = 7;
+      vec_perm_indices indices(sel, 2, 4);
+      bool ok = targetm.vectorize.vec_perm_const (V4SImode, V4SImode, tmp4,
+                                                 tmp1, tmp3, indices);
+      gcc_assert (ok);
+      emit_move_insn (op0, lowpart_subreg (V2DImode, tmp4, V4SImode));
+      DONE;
+    }
+  else
+    operands[4] = gen_reg_rtx (V2DImode);
+})
 
 ;; Rotate instructions
 
diff --git a/gcc/testsuite/g++.target/i386/pr116275-2.C b/gcc/testsuite/g++.target/i386/pr116275-2.C
new file mode 100644
index 0000000..98d3c19
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr116275-2.C
@@ -0,0 +1,19 @@
+/* { dg-do compile { target ia32 } } */
+/* { dg-options "-O2 -mavx -mno-avx512vl -std=c++11" } */
+
+struct SymbolDesc push_back(SymbolDesc);
+struct SymbolDesc {
+  long long ELFLocalSymIdx;
+};
+struct Expected {
+  long long &operator*();
+};
+void SymbolizableObjectFileaddSymbol() {
+  Expected SymbolAddressOrErr;
+  long long SymbolAddress = *SymbolAddressOrErr << 8 >> 8;
+  push_back({SymbolAddress});
+}
+
+/* { dg-final { scan-assembler "vpslld" } } */
+/* { dg-final { scan-assembler-not "vpsllq" } } */
+/* { dg-final { scan-assembler-not "vpsrlq" } } */

Reply via email to