https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111829
Bug ID: 111829
Summary: Redundant register moves inside the loop
Product: gcc
Version: 14.0
Status: UNCONFIRMED
Keywords: missed-optimization
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: crazylht at gmail dot com
Target Milestone: ---
Target: x86_64-*-* i?86-*-*

#include<immintrin.h>
int
foo (__m128i* __restrict pa, int* b,
     __m128i* __restrict pc, int n)
{
    __m128i vsum = _mm_setzero_si128();
    for (int i = 0; i != 100000; i++)
    {
        vsum = _mm_dpbusd_epi32 (vsum, pa[i], _mm_set1_epi32 (b[i]));
    }
    *pc = vsum;
    int ssum = 0;
    for (int i = 0; i != 4; i++)
        ssum += ((__v4si)vsum)[i];
    return ssum;
}

gcc -O2 -mavxvnni

foo(long long __vector(2)*, int*, long long __vector(2)*, int):
        leaq    400000(%rsi), %rax
        vpxor   %xmm0, %xmm0, %xmm0
.L2:
        vmovdqa (%rdi), %xmm2
        vmovdqa %xmm0, %xmm1    ---- redundant
        addq    $4, %rsi
        addq    $16, %rdi
        vpbroadcastd    -4(%rsi), %xmm3
        {vex} vpdpbusd  %xmm3, %xmm2, %xmm1
        vmovdqa %xmm1, %xmm0    --- redundant
        cmpq    %rax, %rsi
        jne     .L2
        vmovdqa %xmm1, (%rdx)
        leaq    -24(%rsp), %rax
        leaq    -8(%rsp), %rcx
        xorl    %edx, %edx
.L3:
        vmovdqa %xmm0, -24(%rsp)
        addq    $4, %rax
        addl    -4(%rax), %edx
        cmpq    %rax, %rcx
        jne     .L3
        movl    %edx, %eax
        ret

it can be better with

foo(long long __vector(2)*, int*, long long __vector(2)*, int):
        leaq    400000(%rsi), %rax
        vpxor   %xmm0, %xmm0, %xmm0
.L2:
        vmovdqa (%rdi), %xmm2
        addq    $4, %rsi
        addq    $16, %rdi
        vpbroadcastd    -4(%rsi), %xmm3
        {vex} vpdpbusd  %xmm3, %xmm2, %xmm0
        cmpq    %rax, %rsi
        jne     .L2
        vmovdqa %xmm0, (%rdx)
        leaq    -24(%rsp), %rax
        leaq    -8(%rsp), %rcx
        xorl    %edx, %edx
.L3:
        vmovdqa %xmm0, -24(%rsp)
        addq    $4, %rax
        addl    -4(%rax), %edx
        cmpq    %rax, %rcx
        jne     .L3
        movl    %edx, %eax
        ret