[PATCH] target/98856 - split vpinsrq with new peephole2

Richard Biener Mon, 08 Mar 2021 03:04:42 -0800

This reduces the latency of constructing a V2DImode vector from two
GPRs.  The vpinsrq instruction used so far depends on the preceding
GPR->XMM move; splitting it instead allows the two GPR->XMM moves
to be scheduled and executed concurrently, with the insert then
performed using vpunpcklqdq.


Bootstrapped and tested on x86_64-unknown-linux-gnu.

OK for trunk, or do we want to defer this to GCC 12 unless we
can also solve the spilling in PR98856, which would then fix
the performance regression?

2021-03-05  Richard Biener  <[email protected]>

        PR target/98856
        * config/i386/sse.md (vpinsrq peephole): New peephole2
        splitting vpinsrq to a vmovq and vpunpcklqdq.

        * gcc.target/i386/pr98856.c: New testcase.
        * gcc.target/i386/avx512dq-concatv2di-1.c: Adjust.
        * gcc.target/i386/avx512vl-concatv2di-1.c: Likewise.
---
 gcc/config/i386/sse.md                        | 15 +++++++++++
 .../gcc.target/i386/avx512dq-concatv2di-1.c   |  4 +--
 .../gcc.target/i386/avx512vl-concatv2di-1.c   |  2 +-
 gcc/testsuite/gcc.target/i386/pr98856.c       | 25 +++++++++++++++++++
 4 files changed, 43 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr98856.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index ca4372d4164..7c9be80540b 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -1427,6 +1427,21 @@
   DONE;
 })
 
+;; Further split pinsrq variants of vec_concatv2di to hide the latency of
+;; the GPR->XMM transition(s): move operand 2 to a scratch, then concat.
+(define_peephole2
+  [(match_scratch:DI 3 "Yv")  ; scratch that receives operand 2
+   (set (match_operand:V2DI 0 "sse_reg_operand")
+       (vec_concat:V2DI (match_operand:DI 1 "sse_reg_operand")
+                        (match_operand:DI 2 "nonimmediate_gr_operand")))]
+  "TARGET_64BIT && TARGET_SSE4_1
+   && !optimize_insn_for_size_p ()"  ; emits two insns for one, so skip at -Os
+  [(set (match_dup 3)  ; first insn: copy operand 2 into the scratch
+        (match_dup 2))
+   (set (match_dup 0)  ; second insn: concat operand 1 with the scratch
+       (vec_concat:V2DI (match_dup 1)
+                        (match_dup 3)))])
+
 ;; Merge movsd/movhpd to movupd for TARGET_SSE_UNALIGNED_LOAD_OPTIMAL targets.
 (define_peephole2
   [(set (match_operand:V2DF 0 "sse_reg_operand")
diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-concatv2di-1.c b/gcc/testsuite/gcc.target/i386/avx512dq-concatv2di-1.c
index 82cb402575b..ac652bb1382 100644
--- a/gcc/testsuite/gcc.target/i386/avx512dq-concatv2di-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512dq-concatv2di-1.c
@@ -14,7 +14,7 @@ f1 (long long x, long long y)
   asm volatile ("" : "+v" (c));
 }
 
-/* { dg-final { scan-assembler "vpinsrq\[^\n\r]*\\\$1\[^\n\r]*%rsi\[^\n\r]*%xmm16\[^\n\r]*%xmm17" } } */
+/* { dg-final { scan-assembler "vpunpcklqdq\[^\n\r]*%xmm16\[^\n\r]*%xmm17" } } */
 
 void
 f2 (long long x, long long *y)
@@ -27,7 +27,7 @@ f2 (long long x, long long *y)
   asm volatile ("" : "+v" (c));
 }
 
-/* { dg-final { scan-assembler "vpinsrq\[^\n\r]*\\\$1\[^\n\r]*%\[re]si\[^\n\r]*%xmm18\[^\n\r]*%xmm19" } } */
+/* { dg-final { scan-assembler "vpunpcklqdq\[^\n\r]*%xmm18\[^\n\r]*%xmm19" } } */
 
 void
 f3 (long long x)
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-concatv2di-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-concatv2di-1.c
index 8e637071aa2..b8300371a21 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-concatv2di-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-concatv2di-1.c
@@ -28,7 +28,7 @@ f2 (long long x, long long *y)
   asm volatile ("" : "+v" (c));
 }
 
-/* { dg-final { scan-assembler "vmovhps\[^\n\r]*%\[re]si\[^\n\r]*%xmm18\[^\n\r]*%xmm19" } } */
+/* { dg-final { scan-assembler "vpunpcklqdq\[^\n\r]*%xmm18\[^\n\r]*%xmm19" } } */
 
 void
 f3 (long long x)
diff --git a/gcc/testsuite/gcc.target/i386/pr98856.c b/gcc/testsuite/gcc.target/i386/pr98856.c
new file mode 100644
index 00000000000..1ea24d0f1fb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr98856.c
@@ -0,0 +1,25 @@
+/* { dg-do compile { target lp64 } } */
+/* { dg-options "-O3 -march=znver2" } */
+
+typedef __UINT64_TYPE__ uint64_t;
+void poly_double_le2 (unsigned char *out, const unsigned char *in)
+{
+  uint64_t W[2];  /* 128-bit value as two words; W[0] is the less significant.  */
+
+  __builtin_memcpy (&W, in, 16);  /* Load the 16 input bytes into W.  */
+  uint64_t carry = (W[1] >> 63) * 135;  /* 135 if bit 63 of W[1] is set, else 0.  */
+  W[1] = (W[1] << 1) ^ (W[0] >> 63);  /* Shift left, pulling in bit 63 of W[0].  */
+  W[0] = (W[0] << 1) ^ carry;  /* Shift left and XOR in the carry term.  */
+  __builtin_memcpy (out, &W[0], 8);  /* Store W[0] to out[0..7].  */
+  __builtin_memcpy (out + 8, &W[1], 8);  /* Store W[1] to out[8..15].  */
+}
+
+/* We should split
+     vpinsrq $1, %rax, %xmm0, %xmm0
+   to
+     vmovq %rax, %xmm1
+     vpunpcklqdq %xmm1, %xmm0, %xmm0
+   to better hide the latency of the GPR->XMM transitions.  */
+
+/* { dg-final { scan-assembler-not "pinsrq" } } */
+/* { dg-final { scan-assembler-times "punpcklqdq" 1 } } */
-- 
2.26.2

[PATCH] target/98856 - split vpinsrq with new peephole2

Reply via email to