https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81502
Bug ID: 81502
Summary: In some cases the data is moved to memory unnecessarily [partial regression]
Product: gcc
Version: 7.1.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: middle-end
Assignee: unassigned at gcc dot gnu.org
Reporter: uzytkownik2 at gmail dot com
Target Milestone: ---

#include <stdio.h>
#include <memory.h>
#include <emmintrin.h>

template<size_t SIZE>
int foo(unsigned char (&foo)[SIZE]) {
    __m128i acc = _mm_set_epi32(0, 0, 0, 0);
    size_t i = 0;
    for(; i + sizeof(__m128i) <= SIZE; i += sizeof(__m128i)) {
        __m128i word;
        memcpy(&word, foo + i, sizeof(__m128i));
        acc = _mm_add_epi32(word, acc);
    }
    if (i != SIZE) {
        __m128i word = _mm_set_epi32(0, 0, 0, 0);
        memcpy(&word, foo + i, SIZE - i); // (1)
        acc = _mm_add_epi32(word, acc);
    }
    int res;
    memcpy(&res, &acc, sizeof(res));
    return res;
}

int bar(void *ptr) {
    unsigned char buf[sizeof(ptr)];
    memcpy(buf, &ptr, sizeof(ptr));
    return foo(buf);
}

(Yes, the code is a dummy, but it is simplified from a real example.)

At both -O2 and -O3 it produces the following assembly:

bar(void*):
        movq    %rdi, -16(%rsp)
        movq    -16(%rsp), %xmm1
        movq    %xmm1, %xmm0
        movd    %xmm0, %eax
        ret

The line marked (1) seems to be responsible for the unnecessary move to -16(%rsp), even though the code does not need to touch anything outside registers at all. The compiler does not even seem to choose the optimal register (it chooses %xmm1 instead of %xmm0, adding an additional copy).
The non-SSE code is somewhat better, though it seems to have dead moves to memory:

#include <stdio.h>
#include <memory.h>
#include <emmintrin.h>
#include <stdint.h>

template<size_t SIZE>
int foo(unsigned char (&foo)[SIZE]) {
    uint64_t acc = 0;
    size_t i = 0;
    for(; i + sizeof(uint64_t) <= SIZE; i += sizeof(uint64_t)) {
        uint64_t word;
        memcpy(&word, foo + i, sizeof(__m128i));
        acc += word;
    }
    if (i != SIZE) {
        uint64_t word = 0;
        memcpy(&word, foo + i, SIZE - i);
        acc += word;
    }
    int res;
    memcpy(&res, &acc, sizeof(res));
    return res;
}

int bar(uint32_t ptr) {
    unsigned char buf[sizeof(ptr)];
    memcpy(buf, &ptr, sizeof(ptr));
    return foo(buf);
}

bar(unsigned int):
        movq    $0, -8(%rsp)
        movl    %edi, %eax
        movl    %edi, -8(%rsp)
        ret

PS. The first case seems to work when the type is uint32_t in 6.3, but not in 7.1:

bar(unsigned int): // 6.3
        movl    %edi, %eax
        ret

bar(unsigned int): // 7.1
        pxor    %xmm0, %xmm0
        movl    %edi, %eax
        movaps  %xmm0, -24(%rsp)
        movq    %rax, -24(%rsp)
        movl    -24(%rsp), %eax
        ret