https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81502

            Bug ID: 81502
           Summary: In some cases the data is moved to memory
                    unnecessarily [partial regression]
           Product: gcc
           Version: 7.1.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: middle-end
          Assignee: unassigned at gcc dot gnu.org
          Reporter: uzytkownik2 at gmail dot com
  Target Milestone: ---

#include <stdio.h>
#include <memory.h>
#include <emmintrin.h>

// Reproducer: walks the SIZE-byte array `foo` in sizeof(__m128i)-byte chunks,
// adds each chunk to an accumulator with _mm_add_epi32 (packed 32-bit adds),
// and returns the first sizeof(int) bytes of the accumulator as an int.
template<size_t SIZE>
int foo(unsigned char (&foo)[SIZE]) {
  __m128i acc = _mm_set_epi32(0, 0, 0, 0);
  size_t i = 0;
  // Full chunks: each one is loaded into a local __m128i through memcpy.
  for(; i + sizeof(__m128i) <= SIZE; i += sizeof(__m128i)) {
    __m128i word;
    memcpy(&word, foo + i, sizeof(__m128i));
    acc = _mm_add_epi32(word, acc);
  }
  // Tail: the remaining SIZE - i bytes are copied over a zero-initialised
  // vector, so the bytes past the end of the input contribute zero.
  if (i != SIZE) {
    __m128i word = _mm_set_epi32(0, 0, 0, 0);
    memcpy(&word, foo + i, SIZE - i); // (1)
    acc = _mm_add_epi32(word, acc);
  }
  // Extract the leading sizeof(int) bytes of the accumulator via memcpy.
  int res;
  memcpy(&res, &acc, sizeof(res));
  return res;
}

// Reproducer entry point: copies the bytes of the pointer value itself (not
// of whatever it points to) into a byte array and hands that array to foo,
// which is therefore instantiated with SIZE == sizeof(ptr).
int bar(void *ptr) {
  unsigned char buf[sizeof(ptr)];
  memcpy(buf, &ptr, sizeof(ptr));
  return foo(buf);
}

(Yes, the code is a dummy, but it is simplified from a real example.)

At both -O2 and -O3 it produces the following assembly:

bar(void*):
        movq    %rdi, -16(%rsp)
        movq    -16(%rsp), %xmm1
        movq    %xmm1, %xmm0
        movd    %xmm0, %eax
        ret

The line marked (1) seems to be responsible for the unnecessary move to
-16(%rsp), even though the code does not need to touch anything outside
registers at all. GCC does not even seem to choose the optimal register (it
picks %xmm1 instead of %xmm0, which adds an additional copy).

The non-SSE code is somewhat better, though it seems to contain dead stores to memory:

#include <stdio.h>
#include <memory.h>
#include <emmintrin.h>
#include <stdint.h>

// Non-SSE reproducer: walks the SIZE-byte array `foo` in sizeof(uint64_t)-byte
// words, sums them into a 64-bit accumulator, and returns the first
// sizeof(int) bytes of that accumulator as an int.
template<size_t SIZE>
int foo(unsigned char (&foo)[SIZE]) {
  uint64_t acc = 0;
  size_t i = 0;
  for(; i + sizeof(uint64_t) <= SIZE; i += sizeof(uint64_t)) {
    uint64_t word;
    // Copy exactly one word: the destination is only sizeof(uint64_t) bytes,
    // so a larger count would write past `word` and read past `foo`.
    memcpy(&word, foo + i, sizeof(word));
    acc += word;
  }
  // Tail: the remaining SIZE - i bytes are copied over a zeroed word, so the
  // bytes past the end of the input contribute zero.
  if (i != SIZE) {
    uint64_t word = 0;
    memcpy(&word, foo + i, SIZE - i);
    acc += word;
  }
  // Extract the leading sizeof(int) bytes of the accumulator via memcpy.
  int res;
  memcpy(&res, &acc, sizeof(res));
  return res;
}

// Reproducer entry point: copies the bytes of the 32-bit argument into a byte
// array and hands that array to foo, which is therefore instantiated with
// SIZE == sizeof(ptr).
int bar(uint32_t ptr) {
  unsigned char buf[sizeof(ptr)];
  memcpy(buf, &ptr, sizeof(ptr));
  return foo(buf);
}

bar(unsigned int):
        movq    $0, -8(%rsp)
        movl    %edi, %eax
        movl    %edi, -8(%rsp)
        ret

PS. The first case, with the argument type changed to uint32_t, seems to be optimized properly in 6.3 but not in 7.1:
bar(unsigned int): // 6.3
        movl    %edi, %eax
        ret
bar(unsigned int): // 7.1
        pxor    %xmm0, %xmm0
        movl    %edi, %eax
        movaps  %xmm0, -24(%rsp)
        movq    %rax, -24(%rsp)
        movl    -24(%rsp), %eax
        ret

Reply via email to