https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111829

            Bug ID: 111829
           Summary: Redundant register moves inside the loop
           Product: gcc
           Version: 14.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: crazylht at gmail dot com
  Target Milestone: ---
            Target: x86_64-*-* i?86-*-*

#include <immintrin.h>
int
foo (__m128i* __restrict pa, int* b,
     __m128i* __restrict pc, int n)
{
    /* Accumulate u8 x s8 dot products into four 32-bit lanes.  */
    __m128i vsum = _mm_setzero_si128();
    for (int i = 0; i != 100000; i++)
    {
        vsum = _mm_dpbusd_epi32 (vsum, pa[i], _mm_set1_epi32 (b[i]));
    }
    *pc = vsum;
    /* Horizontal sum of the four lanes.  */
    int ssum = 0;
    for (int i = 0; i != 4; i++)
      ssum += ((__v4si)vsum)[i];
    return ssum;
}
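
For reference, a plain-C sketch of what the testcase computes, assuming the
documented VPDPBUSD semantics (unsigned bytes of the first vector source
multiplied with signed bytes of the second, four products summed into each
32-bit lane, no saturation). The scalar function name and pointer types below
are illustrative only, not part of the original testcase:

/* Plain-C sketch of foo; assumes VPDPBUSD's u8 x s8 dot-product
   semantics and uses illustrative names/types.  */
#include <stdint.h>

int
foo_scalar (const uint8_t* pa, const int* b, int32_t* pc, int n)
{
    int32_t sum[4] = { 0, 0, 0, 0 };
    for (int i = 0; i != 100000; i++)
      for (int j = 0; j != 4; j++)      /* four 32-bit lanes */
        for (int k = 0; k != 4; k++)    /* four byte products per lane */
          sum[j] += (int32_t) pa[16 * i + 4 * j + k]
                    * (int8_t) (((unsigned) b[i] >> (8 * k)) & 0xff);
    int ssum = 0;
    for (int j = 0; j != 4; j++)
    {
        pc[j] = sum[j];                 /* mirrors *pc = vsum */
        ssum += sum[j];
    }
    return ssum;
}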

Compiled with gcc -O2 -mavxvnni, the generated code is:

foo(long long __vector(2)*, int*, long long __vector(2)*, int):
        leaq    400000(%rsi), %rax
        vpxor   %xmm0, %xmm0, %xmm0
.L2:
        vmovdqa (%rdi), %xmm2
        vmovdqa %xmm0, %xmm1   # redundant
        addq    $4, %rsi
        addq    $16, %rdi
        vpbroadcastd    -4(%rsi), %xmm3
        {vex} vpdpbusd  %xmm3, %xmm2, %xmm1
        vmovdqa %xmm1, %xmm0   # redundant
        cmpq    %rax, %rsi
        jne     .L2
        vmovdqa %xmm1, (%rdx)
        leaq    -24(%rsp), %rax
        leaq    -8(%rsp), %rcx
        xorl    %edx, %edx
.L3:
        vmovdqa %xmm0, -24(%rsp)
        addq    $4, %rax
        addl    -4(%rax), %edx
        cmpq    %rax, %rcx
        jne     .L3
        movl    %edx, %eax
        ret


Since vpdpbusd both reads and writes its destination operand, the accumulator
can stay in %xmm0 across iterations and both moves can be dropped. It can be
better with:


foo(long long __vector(2)*, int*, long long __vector(2)*, int):
        leaq    400000(%rsi), %rax
        vpxor   %xmm0, %xmm0, %xmm0
.L2:
        vmovdqa (%rdi), %xmm2
        addq    $4, %rsi
        addq    $16, %rdi
        vpbroadcastd    -4(%rsi), %xmm3
        {vex} vpdpbusd  %xmm3, %xmm2, %xmm0
        cmpq    %rax, %rsi
        jne     .L2
        vmovdqa %xmm0, (%rdx)
        leaq    -24(%rsp), %rax
        leaq    -8(%rsp), %rcx
        xorl    %edx, %edx
.L3:
        vmovdqa %xmm0, -24(%rsp)
        addq    $4, %rax
        addl    -4(%rax), %edx
        cmpq    %rax, %rcx
        jne     .L3
        movl    %edx, %eax
        ret
