https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111332

--- Comment #7 from d_vampile <d_vampile at 163 dot com> ---
(In reply to Andrew Pinski from comment #3)
> GCC 11+ produces:
> .L3:
>         vmovdqu (%rsi), %ymm2
>         vmovdqu 32(%rsi), %ymm1
>         subq    $-128, %rdi
>         subq    $-128, %rsi
>         vmovdqu -64(%rsi), %ymm0
>         vmovdqu -32(%rsi), %ymm3
>         vmovdqu %ymm2, -128(%rdi)
>         vmovdqu %ymm3, -32(%rdi)
>         vmovdqu %ymm1, -96(%rdi)
>         vmovdqu %ymm0, -64(%rdi)
>         cmpq    %rax, %rdi
>         jne     .L3
> 
> Which is the best code ...

GCC 7.3.0 produces (objdump -S output, disassembly interleaved with source):
extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_loadu_si256 (__m256i_u const *__P)
{
  return *__P;
  401170:       c5 fa 6f 1e             vmovdqu (%rsi),%xmm3
                dst = (uint8_t *)dst + 128;
  401174:       48 83 ef 80             sub    $0xffffffffffffff80,%rdi
                src = (const uint8_t *)src + 128;
  401178:       48 83 ee 80             sub    $0xffffffffffffff80,%rsi
  40117c:       c5 fa 6f 56 a0          vmovdqu -0x60(%rsi),%xmm2
  401181:       c4 e3 65 38 5e 90 01    vinserti128 $0x1,-0x70(%rsi),%ymm3,%ymm3
  401188:       c5 fa 6f 4e c0          vmovdqu -0x40(%rsi),%xmm1
  40118d:       c4 e3 6d 38 56 b0 01    vinserti128 $0x1,-0x50(%rsi),%ymm2,%ymm2
  401194:       c5 fa 6f 46 e0          vmovdqu -0x20(%rsi),%xmm0
  401199:       c4 e3 75 38 4e d0 01    vinserti128 $0x1,-0x30(%rsi),%ymm1,%ymm1
  4011a0:       c4 e3 7d 38 46 f0 01    vinserti128 $0x1,-0x10(%rsi),%ymm0,%ymm0
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm256_storeu_si256 (__m256i_u *__P, __m256i __A)
{
  *__P = __A;
  4011a7:       c5 f8 11 5f 80          vmovups %xmm3,-0x80(%rdi)
  4011ac:       c4 e3 7d 39 5f 90 01    vextracti128 $0x1,%ymm3,-0x70(%rdi)
  4011b3:       c5 f8 11 57 a0          vmovups %xmm2,-0x60(%rdi)
  4011b8:       c4 e3 7d 39 57 b0 01    vextracti128 $0x1,%ymm2,-0x50(%rdi)
  4011bf:       c5 f8 11 4f c0          vmovups %xmm1,-0x40(%rdi)
  4011c4:       c4 e3 7d 39 4f d0 01    vextracti128 $0x1,%ymm1,-0x30(%rdi)
  4011cb:       c5 f8 11 47 e0          vmovups %xmm0,-0x20(%rdi)
  4011d0:       c4 e3 7d 39 47 f0 01    vextracti128 $0x1,%ymm0,-0x10(%rdi)
        while (n >= 128) {
  4011d7:       48 39 c7                cmp    %rax,%rdi
  4011da:       75 94                   jne    401170 <rte_mov128blocks+0x20>
  4011dc:       c5 f8 77                vzeroupper

In terms of runtime performance, this GCC 7.3.0 code is the fastest.
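For reference, here is a minimal sketch of the loop both compilers are compiling, reconstructed from the interleaved source lines in the objdump output above. The rte_mov128blocks symbol comes from DPDK's rte_memcpy.h; the exact signature and variable names are assumptions:

#include <stdint.h>
#include <stddef.h>
#include <immintrin.h>

/* Sketch: copy n bytes (n assumed a multiple of 128) in 128-byte blocks,
 * four unaligned 256-bit loads and stores per iteration. */
static void
rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
{
    __m256i ymm0, ymm1, ymm2, ymm3;

    while (n >= 128) {
        ymm0 = _mm256_loadu_si256((const __m256i *)(src + 0 * 32));
        ymm1 = _mm256_loadu_si256((const __m256i *)(src + 1 * 32));
        ymm2 = _mm256_loadu_si256((const __m256i *)(src + 2 * 32));
        ymm3 = _mm256_loadu_si256((const __m256i *)(src + 3 * 32));
        n -= 128;
        src = (const uint8_t *)src + 128;
        _mm256_storeu_si256((__m256i *)(dst + 0 * 32), ymm0);
        _mm256_storeu_si256((__m256i *)(dst + 1 * 32), ymm1);
        _mm256_storeu_si256((__m256i *)(dst + 2 * 32), ymm2);
        _mm256_storeu_si256((__m256i *)(dst + 3 * 32), ymm3);
        dst = (uint8_t *)dst + 128;
    }
}

The difference between the two outputs is how each 256-bit unaligned access is emitted: GCC 11+ uses a single 32-byte vmovdqu, while GCC 7.3.0 splits it into two 16-byte halves (vmovdqu plus vinserti128 for loads, vmovups plus vextracti128 for stores). The splitting corresponds to the -mavx256-split-unaligned-load/-mavx256-split-unaligned-store tuning options, which older GCC releases appear to have enabled by default under generic tuning.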
