[Bug middle-end/81502] New: In some cases the data is moved to memory unnecessarily [partial regression]

2017-07-20 Thread uzytkownik2 at gmail dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=81502

Bug ID: 81502
   Summary: In some cases the data is moved to memory
unnecessarily [partial regression]
   Product: gcc
   Version: 7.1.0
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: middle-end
  Assignee: unassigned at gcc dot gnu.org
  Reporter: uzytkownik2 at gmail dot com
  Target Milestone: ---

#include <immintrin.h>
#include <cstring>
#include <cstddef>

template<size_t SIZE>
int foo(unsigned char (&foo)[SIZE]) {
  __m128i acc = _mm_set_epi32(0, 0, 0, 0);
  size_t i = 0;
  for(; i + sizeof(__m128i) <= SIZE; i += sizeof(__m128i)) {
    __m128i word;
    memcpy(&word, foo + i, sizeof(__m128i));
    acc = _mm_add_epi32(word, acc);
  }
  if (i != SIZE) {
    __m128i word = _mm_set_epi32(0, 0, 0, 0);
    memcpy(&word, foo + i, SIZE - i); // (1)
    acc = _mm_add_epi32(word, acc);
  }
  int res;
  memcpy(&res, &acc, sizeof(res));
  return res;
}

int bar(void *ptr) {
  unsigned char buf[sizeof(ptr)];
  memcpy(buf, &ptr, sizeof(ptr));
  return foo(buf);
}

(Yes, the code is a dummy, but it is simplified from a real example.)

With both -O2 and -O3 it produces the following assembly:

bar(void*):
        movq    %rdi, -16(%rsp)
        movq    -16(%rsp), %xmm1
        movq    %xmm1, %xmm0
        movd    %xmm0, %eax
        ret

The line marked (1) seems to be responsible for the unnecessary move to -16(%rsp),
even though the whole function does not need to touch anything outside registers at
all. The register choice does not look optimal either: it picks %xmm1 instead of
%xmm0, adding an extra copy.
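For what it's worth, the tail at (1) can also be written so that the memcpy
size stays a compile-time constant. The sketch below is only my reformulation
(the helper name foo_tail is mine), and I have not verified that it avoids the
spill; it just shows the alternative shape:

#include <immintrin.h>
#include <cstring>
#include <cstddef>

// Hypothetical tail handler: copy the SIZE - i remaining bytes into a
// zero-filled 16-byte buffer, then load the whole buffer with a
// constant-size memcpy.
template<size_t SIZE>
static __m128i foo_tail(const unsigned char (&buf)[SIZE], size_t i) {
  unsigned char padded[sizeof(__m128i)] = {0};
  for (size_t j = 0; i + j < SIZE; ++j)
    padded[j] = buf[i + j];
  __m128i word;
  memcpy(&word, padded, sizeof(word)); // size known at compile time
  return word;
}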

The non-SSE code is somewhat better, though it still seems to have dead stores to memory:

#include <cstring>
#include <cstdint>
#include <cstddef>

template<size_t SIZE>
int foo(unsigned char (&foo)[SIZE]) {
  uint64_t acc = 0;
  size_t i = 0;
  for(; i + sizeof(uint64_t) <= SIZE; i += sizeof(uint64_t)) {
    uint64_t word;
    memcpy(&word, foo + i, sizeof(uint64_t));
    acc += word;
  }
  if (i != SIZE) {
    uint64_t word = 0;
    memcpy(&word, foo + i, SIZE - i);
    acc += word;
  }
  int res;
  memcpy(&res, &acc, sizeof(res));
  return res;
}

int bar(uint32_t ptr) {
  unsigned char buf[sizeof(ptr)];
  memcpy(buf, &ptr, sizeof(ptr));
  return foo(buf);
}

bar(unsigned int):
        movq    $0, -8(%rsp)
        movl    %edi, %eax
        movl    %edi, -8(%rsp)
        ret

PS. In the first (SSE) case, if bar takes a uint32_t instead of a void*, GCC 6.3 generates optimal code but 7.1 does not:
bar(unsigned int): // 6.3
        movl    %edi, %eax
        ret

bar(unsigned int): // 7.1
        pxor    %xmm0, %xmm0
        movl    %edi, %eax
        movaps  %xmm0, -24(%rsp)
        movq    %rax, -24(%rsp)
        movl    -24(%rsp), %eax
        ret
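For reference, the uint32_t variant referred to above presumably looks like
this (my reconstruction, assuming only the parameter type of bar changes and
foo is the SSE version from the first example):

int bar(uint32_t ptr) {
  unsigned char buf[sizeof(ptr)];
  memcpy(buf, &ptr, sizeof(ptr));
  return foo(buf); // calls the __m128i-based foo<4>
}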

[Bug tree-optimization/82625] New: lower-optimization are not inlined with symbol multiversioning

2017-10-19 Thread uzytkownik2 at gmail dot com
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82625

Bug ID: 82625
   Summary: lower-optimization are not inlined with symbol
multiversioning
   Product: gcc
   Version: unknown
Status: UNCONFIRMED
  Severity: normal
  Priority: P3
 Component: tree-optimization
  Assignee: unassigned at gcc dot gnu.org
  Reporter: uzytkownik2 at gmail dot com
  Target Milestone: ---

Consider the following toy example:

__attribute__ ((target ("default")))
static uint32_t foo(const char *buf, size_t size) {
  return 1;
}

__attribute__ ((target ("avx")))
static uint32_t foo(const char *buf, size_t size) {
  return 2;
}

__attribute__ ((target ("default")))
uint32_t bar() {
  char buf[4096];
  uint32_t acc = 0;
  for (int i = 0; i < sizeof(buf); i++) {
    acc += foo(&buf[i], 1);
  }
  return acc;
}

__attribute__ ((target ("avx")))
uint32_t bar() {
  char buf[4096];
  uint32_t acc = 0;
  for (int i = 0; i < sizeof(buf); i++) {
    acc += foo(&buf[i], 1);
  }
  return acc;
}

bar.avx is correctly optimized down to a single mov:

bar() [clone .avx]:
        movl    $8192, %eax
        ret

However, even though the default bar could be optimized to a mov as well, it
goes through the loop and the multiversioning dispatch:

bar():
        pushq   %r12
        pushq   %rbp
        xorl    %ebp, %ebp
        pushq   %rbx
        subq    $4096, %rsp
        leaq    4096(%rsp), %r12
        movq    %rsp, %rbx
.L10:
        movq    %rbx, %rdi
        movl    $1, %esi
        addq    $1, %rbx
        call    _ZL3fooPKcm._GLOBALtmp_compiler_explorer_compiler117919_59_b8onwy.b8iqhyqfr_example.cpp__0x82e640d209aabe90.ifunc(char const*, unsigned long)
        addl    %eax, %ebp
        cmpq    %r12, %rbx
        jne     .L10
        addq    $4096, %rsp
        movl    %ebp, %eax
        popq    %rbx
        popq    %rbp
        popq    %r12
        ret

Possibly overlapping with bug #71990.
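If I understand the multiversioning machinery correctly, the dispatch is
roughly equivalent to the hand-written indirection sketched below (the names
foo_default, foo_avx, foo_resolver and foo_ptr are mine, not the real mangled
symbols). It illustrates why the default bar() cannot inline foo: every call
site only sees an indirect call through the resolved pointer, not foo's body.

#include <cstdint>
#include <cstddef>

// Rough hand-written model of what function multiversioning emits for foo:
// one implementation per target plus a resolver that picks one at load time.
static uint32_t foo_default(const char *buf, size_t size) { return 1; }
static uint32_t foo_avx(const char *buf, size_t size) { return 2; }

using foo_fn = uint32_t (*)(const char *, size_t);

static foo_fn foo_resolver() {
  __builtin_cpu_init(); // required before __builtin_cpu_supports in resolvers
  return __builtin_cpu_supports("avx") ? foo_avx : foo_default;
}

static foo_fn foo_ptr = foo_resolver();

uint32_t bar_default() {
  char buf[4096];
  uint32_t acc = 0;
  for (size_t i = 0; i < sizeof(buf); i++)
    acc += foo_ptr(&buf[i], 1); // indirect call: nothing to inline here
  return acc;
}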