https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65847

--- Comment #3 from Richard Biener <rguenth at gcc dot gnu.org> ---
Similarly

/* Four adjacent int fields; 16 bytes in total if int is 4 bytes wide. */
struct X { int a; int b; int c; int d; };

/*
 * Return the field-by-field sum of x and y.
 * Both operands are taken by value and the result is returned by value.
 */
struct X foo (struct X x, struct X y)
{
  struct X res;
  /* Four independent additions, one per field. */
  res.a = x.a + y.a;
  res.b = x.b + y.b;
  res.c = x.c + y.c;
  res.d = x.d + y.d;
  return res;
}

is vectorized as

# foo, vectorized form (x86-64, AT&T syntax).
# Assuming the SysV AMD64 convention: x arrives in %rdi:%rsi, y in %rdx:%rcx,
# and the 16-byte result is returned in %rax:%rdx.
# Every memory access is at a negative offset from %rsp with no stack
# adjustment, i.e. scratch space below the stack pointer of a leaf function.
foo:
.LFB0:
        .cfi_startproc
        movq    %rdi, -40(%rsp)         # store x, low 8 bytes
        movq    %rsi, -32(%rsp)         # store x, high 8 bytes
        movdqa  -40(%rsp), %xmm0        # 16-byte load of x, covering the two 8-byte stores just made
        movq    %rdx, -24(%rsp)         # store y, low 8 bytes
        movq    %rcx, -16(%rsp)         # store y, high 8 bytes
        paddd   -24(%rsp), %xmm0        # four 32-bit adds: %xmm0 += y, again a 16-byte read of two 8-byte stores
        movaps  %xmm0, -40(%rsp)        # write the 16-byte sum back to memory
        movq    -40(%rsp), %rax         # low 8 bytes of the sum -> %rax
        movq    -32(%rsp), %rdx         # high 8 bytes of the sum -> %rdx
        ret

which is bad because the on-stack construction of %xmm0 causes a
store-to-load forwarding (STLF) failure.  Unvectorized code (shown below)
isn't necessarily worse, but the vectorized sequence can be improved:

# foo, scalar form (x86-64, AT&T syntax): integer registers only, no memory access.
# Assuming the SysV AMD64 convention and little-endian field packing:
#   %rdi = x.a | x.b << 32,  %rsi = x.c | x.d << 32
#   %rdx = y.a | y.b << 32,  %rcx = y.c | y.d << 32
# and the result is returned as %rax = res.a | res.b << 32, %rdx = res.c | res.d << 32.
foo:
.LFB0:
        .cfi_startproc
        movq    %rdi, %rax              # rax = low half of x (x.a in its low 32 bits)
        movq    %rdi, %r10              # r10 = second copy of the low half of x
        movq    %rdx, %rdi              # rdi = low half of y (original %rdi already copied out)
        movq    %rsi, %r9               # r9  = high half of x
        sarq    $32, %r10               # r10 low 32 bits = x.b
        sarq    $32, %rdi               # rdi low 32 bits = y.b
        addl    %edx, %eax              # eax = x.a + y.a (32-bit write clears the upper half of %rax)
        movq    %rcx, %r8               # r8  = high half of y
        addl    %r10d, %edi             # edi = x.b + y.b
        sarq    $32, %r9                # r9 low 32 bits = x.d
        movl    %eax, %eax              # zero-extend eax; upper half is already clear after the addl
        leal    (%rsi,%rcx), %edx       # edx = x.c + y.c (low 32 bits of the 64-bit sum)
        movl    %edi, %edi              # zero-extend edi; upper half is already clear after the addl
        sarq    $32, %r8                # r8 low 32 bits = y.d
        salq    $32, %rdi               # move x.b + y.b into the upper 32 bits
        orq     %rdi, %rax              # rax = res.a | res.b << 32
        leal    (%r9,%r8), %edi         # edi = x.d + y.d
        salq    $32, %rdi               # move x.d + y.d into the upper 32 bits
        orq     %rdi, %rdx              # rdx = res.c | res.d << 32
        ret

In this case the spill is caused by LRA not knowing how to reload
the TImode reg built by pieces by the RTL expansion code.

Reply via email to