This reduces the latency of a V2DImode construction from two GPRs by avoiding the dependence of the vpinsrq instruction on the preceding GPR->XMM move; instead, the two GPR->XMM moves can be scheduled and executed concurrently, with the insert performed using vpunpcklqdq.
Bootstrapped and tested on x86_64-unknown-linux-gnu. OK for trunk, or do we want to defer this to GCC 12, unless perhaps we can also solve the spilling in PR98856, which would then fix the performance regression? 2021-03-05 Richard Biener <rguent...@suse.de> PR target/98856 * config/i386/sse.md (vpinsrq peephole): New peephole2 splitting vpinsrq to a vmovq and vpunpcklqdq. * gcc.target/i386/pr98856.c: New testcase. * gcc.target/i386/avx512dq-concatv2di-1.c: Adjust. * gcc.target/i386/avx512vl-concatv2di-1.c: Likewise. --- gcc/config/i386/sse.md | 15 +++++++++++ .../gcc.target/i386/avx512dq-concatv2di-1.c | 4 +-- .../gcc.target/i386/avx512vl-concatv2di-1.c | 2 +- gcc/testsuite/gcc.target/i386/pr98856.c | 25 +++++++++++++++++++ 4 files changed, 43 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr98856.c diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index ca4372d4164..7c9be80540b 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -1427,6 +1427,21 @@ DONE; }) +;; Further split pinsrq variants of vec_concatv2di to hide the latency of +;; the GPR->XMM transition(s). +(define_peephole2 + [(match_scratch:DI 3 "Yv") + (set (match_operand:V2DI 0 "sse_reg_operand") + (vec_concat:V2DI (match_operand:DI 1 "sse_reg_operand") + (match_operand:DI 2 "nonimmediate_gr_operand")))] + "TARGET_64BIT && TARGET_SSE4_1 + && !optimize_insn_for_size_p ()" + [(set (match_dup 3) + (match_dup 2)) + (set (match_dup 0) + (vec_concat:V2DI (match_dup 1) + (match_dup 3)))]) + ;; Merge movsd/movhpd to movupd for TARGET_SSE_UNALIGNED_LOAD_OPTIMAL targets. 
(define_peephole2 [(set (match_operand:V2DF 0 "sse_reg_operand") diff --git a/gcc/testsuite/gcc.target/i386/avx512dq-concatv2di-1.c b/gcc/testsuite/gcc.target/i386/avx512dq-concatv2di-1.c index 82cb402575b..ac652bb1382 100644 --- a/gcc/testsuite/gcc.target/i386/avx512dq-concatv2di-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512dq-concatv2di-1.c @@ -14,7 +14,7 @@ f1 (long long x, long long y) asm volatile ("" : "+v" (c)); } -/* { dg-final { scan-assembler "vpinsrq\[^\n\r]*\\\$1\[^\n\r]*%rsi\[^\n\r]*%xmm16\[^\n\r]*%xmm17" } } */ +/* { dg-final { scan-assembler "vpunpcklqdq\[^\n\r]*%xmm16\[^\n\r]*%xmm17" } } */ void f2 (long long x, long long *y) @@ -27,7 +27,7 @@ f2 (long long x, long long *y) asm volatile ("" : "+v" (c)); } -/* { dg-final { scan-assembler "vpinsrq\[^\n\r]*\\\$1\[^\n\r]*%\[re]si\[^\n\r]*%xmm18\[^\n\r]*%xmm19" } } */ +/* { dg-final { scan-assembler "vpunpcklqdq\[^\n\r]*%xmm18\[^\n\r]*%xmm19" } } */ void f3 (long long x) diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-concatv2di-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-concatv2di-1.c index 8e637071aa2..b8300371a21 100644 --- a/gcc/testsuite/gcc.target/i386/avx512vl-concatv2di-1.c +++ b/gcc/testsuite/gcc.target/i386/avx512vl-concatv2di-1.c @@ -28,7 +28,7 @@ f2 (long long x, long long *y) asm volatile ("" : "+v" (c)); } -/* { dg-final { scan-assembler "vmovhps\[^\n\r]*%\[re]si\[^\n\r]*%xmm18\[^\n\r]*%xmm19" } } */ +/* { dg-final { scan-assembler "vpunpcklqdq\[^\n\r]*%xmm18\[^\n\r]*%xmm19" } } */ void f3 (long long x) diff --git a/gcc/testsuite/gcc.target/i386/pr98856.c b/gcc/testsuite/gcc.target/i386/pr98856.c new file mode 100644 index 00000000000..1ea24d0f1fb --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr98856.c @@ -0,0 +1,25 @@ +/* { dg-do compile { target lp64 } } */ +/* { dg-options "-O3 -march=znver2" } */ + +typedef __UINT64_TYPE__ uint64_t; +void poly_double_le2 (unsigned char *out, const unsigned char *in) +{ + uint64_t W[2]; + + __builtin_memcpy (&W, in, 16); + uint64_t carry = 
(W[1] >> 63) * 135; + W[1] = (W[1] << 1) ^ (W[0] >> 63); + W[0] = (W[0] << 1) ^ carry; + __builtin_memcpy (out, &W[0], 8); + __builtin_memcpy (out + 8, &W[1], 8); +} + +/* We should split + vpinsrq $1, %rax, %xmm0, %xmm0 + to + vmovq %rax, %xmm1 + vpunpcklqdq %xmm0, %xmm1, %xmm0 + to better hide the latency of the GPR->XMM transitions. */ + +/* { dg-final { scan-assembler-not "pinsrq" } } */ +/* { dg-final { scan-assembler-times "punpcklqdq" 1 } } */ -- 2.26.2