https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81502
Bug ID: 81502
Summary: In some cases the data is moved to memory
unnecessarily [partial regression]
Product: gcc
Version: 7.1.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: uzytkownik2 at gmail dot com
Target Milestone: ---
#include <emmintrin.h>
#include <stddef.h>
#include <string.h>
// Adds the contents of `foo` together as packed 32-bit integers, one 16-byte
// vector at a time, and returns the first 4 bytes (in memory order) of the
// accumulator reinterpreted as an int.
//
// SIZE is the array length, deduced from the reference parameter. Whole
// 16-byte chunks are loaded with memcpy; a trailing partial chunk is copied
// into a zero-initialised vector so the bytes past the end of the array
// contribute nothing to the sum.
template <size_t SIZE>
int foo(unsigned char (&foo)[SIZE]) {
    __m128i acc = _mm_set_epi32(0, 0, 0, 0);
    size_t i = 0;
    // Full vectors: runs only while a complete 16-byte chunk remains.
    for (; i + sizeof(__m128i) <= SIZE; i += sizeof(__m128i)) {
        __m128i word;
        memcpy(&word, foo + i, sizeof(__m128i));
        acc = _mm_add_epi32(word, acc);
    }
    // Tail: fewer than 16 bytes left, pad with zeros.
    if (i != SIZE) {
        __m128i word = _mm_set_epi32(0, 0, 0, 0);
        memcpy(&word, foo + i, SIZE - i); // (1)
        acc = _mm_add_epi32(word, acc);
    }
    // Extract the first 32-bit lane without type-punning through a pointer cast.
    int res;
    memcpy(&res, &acc, sizeof(res));
    return res;
}
// Copies the bytes of the pointer value itself (not what it points to) into a
// byte array and hands that array to foo.
int bar(void *ptr) {
    unsigned char raw[sizeof(void *)];
    memcpy(raw, &ptr, sizeof(raw));
    return foo(raw);
}
(Yes, the code is a dummy, but it is simplified from a real example.)
At both -O2 and -O3 it produces the following assembly:
bar(void*):
movq %rdi, -16(%rsp)
movq -16(%rsp), %xmm1
movq %xmm1, %xmm0
movd %xmm0, %eax
ret
The line marked (1) seems to be responsible for the unnecessary store to
-16(%rsp), even though the code does not need to touch anything outside
registers at all. GCC does not even seem to choose the optimal register (it
picks %xmm1 instead of %xmm0, which adds an extra copy).
The non-SSE code is somewhat better, though it still seems to have dead stores to memory:
#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
// Adds the contents of `foo` together as 64-bit words and returns the first
// 4 bytes (in memory order) of the 64-bit accumulator reinterpreted as an int.
//
// SIZE is the array length, deduced from the reference parameter. Whole
// 8-byte words are loaded with memcpy; a trailing partial word is copied into
// a zero-initialised uint64_t so the missing bytes contribute nothing.
template <size_t SIZE>
int foo(unsigned char (&foo)[SIZE]) {
    uint64_t acc = 0;
    size_t i = 0;
    // Full words: runs only while a complete 8-byte chunk remains.
    for (; i + sizeof(uint64_t) <= SIZE; i += sizeof(uint64_t)) {
        uint64_t word;
        // Copy exactly one word; copying sizeof(__m128i) bytes here would
        // overflow `word` and read past the chunk the loop condition checked.
        memcpy(&word, foo + i, sizeof(uint64_t));
        acc += word;
    }
    // Tail: fewer than 8 bytes left, pad with zeros.
    if (i != SIZE) {
        uint64_t word = 0;
        memcpy(&word, foo + i, SIZE - i);
        acc += word;
    }
    int res;
    memcpy(&res, &acc, sizeof(res));
    return res;
}
// Copies the bytes of the 32-bit argument into a byte array and hands that
// array to foo.
int bar(uint32_t ptr) {
    unsigned char raw[sizeof(uint32_t)];
    memcpy(raw, &ptr, sizeof(raw));
    return foo(raw);
}
bar(unsigned int):
movq $0, -8(%rsp)
movl %edi, %eax
movl %edi, -8(%rsp)
ret
PS. When the parameter type is uint32_t, the first (SSE) case seems to be optimized correctly by GCC 6.3 but not by 7.1:
bar(unsigned int): // 6.3
movl %edi, %eax
ret
bar(unsigned int): // 7.1
pxor %xmm0, %xmm0
movl %edi, %eax
movaps %xmm0, -24(%rsp)
movq %rax, -24(%rsp)
movl -24(%rsp), %eax
ret