http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50713

             Bug #: 50713
           Summary: SLP vs loop: code generated differs
    Classification: Unclassified
           Product: gcc
           Version: 4.7.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: tree-optimization
        AssignedTo: unassig...@gcc.gnu.org
        ReportedBy: vincenzo.innoce...@cern.ch


In the following code:
for "float",
the code generated by "dosum" and "dosuml" differs (dosum is better);
for "complex",
"sum" does not vectorize! (a problem in itself)
"dosum" gets excellent vectorization; "dosuml" has the same issue as with floats.

If you have time, please have a look at what happens with aligned(32),
sse vs avx, float vs double… (not an urgent use case at the moment)

compiled with
c++ -Ofast -ftree-vectorizer-verbose=2 -c slp2.cc -mtune=corei7 -msse4.2

// Element type under test. Exactly one typedef is active at a time; swap in
// one of the commented-out alternatives to build the complex or double variant.
// typedef __complex__ float Value;
typedef float Value;
// typedef double Value;

// Aggregate holding four Value elements, with 16-byte alignment requested
// through the GCC `aligned` attribute.
struct A {
  Value a[4];  // the four elements that sum()/suml() add pairwise
}   __attribute__ ((aligned(16)));

// Globals used by dosum() and dosuml(): a2 and a3 are the operands,
// a1 receives the result.
A a1, a2, a3;

// Element-wise addition of two A values, written as four explicit statements
// (no loop). Both arguments are passed by value: each element of b is added
// into the local copy a, which is then returned.
A sum(A a, A b) {
  a.a[0]+=b.a[0];
  a.a[1]+=b.a[1];
  a.a[2]+=b.a[2];
  a.a[3]+=b.a[3];
  return a;
}

// Loop form of the element-wise addition: iterates i over 0..3 and adds
// b.a[i] into the by-value copy a, then returns that copy.
A suml(A a, A b) {
  for (int i=0;i!=4;++i) a.a[i]+=b.a[i];
  return a;
}


// Stores the result of the unrolled sum() applied to the globals a2 and a3
// into the global a1.
void dosum() {
  a1 = sum(a2,a3);
}

// Stores the result of the loop-based suml() applied to the globals a2 and a3
// into the global a1.
void dosuml() {
  a1 = suml(a2,a3);
}




float
========
sum(A, A):
        movq    %xmm2,0xc8(%rsp)
        movq    %xmm3,0xd0(%rsp)
        movq    %xmm0,0xd8(%rsp)
        movaps  0xc8(%rsp),%xmm0
        movq    %xmm1,0xe0(%rsp)
        addps   0xd8(%rsp),%xmm0
        movaps  %xmm0,0xa8(%rsp)
        movq    0xa8(%rsp),%rax
        movaps  %xmm0,0xe8(%rsp)
        movq    0xf0(%rsp),%xmm1
        movd    %rax,%xmm0
        ret
        nopl    (%rax)
suml(A, A):
        movq    %xmm2,0xc8(%rsp)
        movq    %xmm3,0xd0(%rsp)
        movq    %xmm0,0xd8(%rsp)
        movaps  0xc8(%rsp),%xmm0
        movq    %xmm1,0xe0(%rsp)
        addps   0xd8(%rsp),%xmm0
        movaps  %xmm0,0xa8(%rsp)
        movq    0xa8(%rsp),%rax
        movaps  %xmm0,0xd8(%rsp)
        movq    0xe0(%rsp),%xmm1
        movd    %rax,%xmm0
        ret
        nopl    (%rax)
dosum():
        movaps  _a2(%rip),%xmm0
        addps   _a3(%rip),%xmm0
        movaps  %xmm0,_a1(%rip)
        ret
        nopw    %cs:sum(A, A)(%rax,%rax)
dosuml():
        movq    _a2(%rip),%rax
        movq    %rax,0xb8(%rsp)
        movq    _a2+0x00000008(%rip),%rax
        movq    %rax,0xc0(%rsp)
        movq    _a3(%rip),%rax
        movq    %rax,0xc8(%rsp)
        movq    _a3+0x00000008(%rip),%rax
        movq    %rax,0xd0(%rsp)
        movaps  0xc8(%rsp),%xmm0
        addps   0xb8(%rsp),%xmm0
        movaps  %xmm0,0xa8(%rsp)
        movq    0xa8(%rsp),%rax
        movaps  %xmm0,0xb8(%rsp)
        movq    %rax,_a1(%rip)
        movq    0xc0(%rsp),%rax
        movq    %rax,_a1+0x00000008(%rip)
        ret


complex
==========

sum(A, A):
    movss    0x28(%rsp),%xmm7
    movq    %rdi,%rax
    movss    0x2c(%rsp),%xmm6
    movss    0x30(%rsp),%xmm5
    movss    0x34(%rsp),%xmm4
    movss    0x38(%rsp),%xmm3
    movss    0x3c(%rsp),%xmm2
    movss    0x40(%rsp),%xmm1
    movss    0x44(%rsp),%xmm0
    addss    0x08(%rsp),%xmm7
    addss    0x0c(%rsp),%xmm6
    addss    0x10(%rsp),%xmm5
    addss    0x14(%rsp),%xmm4
    movss    %xmm7,(%rdi)
    addss    0x18(%rsp),%xmm3
    movss    %xmm6,0x04(%rdi)
    addss    0x1c(%rsp),%xmm2
    movss    %xmm5,0x08(%rdi)
    addss    0x20(%rsp),%xmm1
    movss    %xmm4,0x0c(%rdi)
    addss    0x24(%rsp),%xmm0
    movss    %xmm3,0x10(%rdi)
    movss    %xmm2,0x14(%rdi)
    movss    %xmm1,0x18(%rdi)
    movss    %xmm0,0x1c(%rdi)
    ret
    nopl    sum(A, A)(%rax,%rax)
suml(A, A):
    movaps    0x08(%rsp),%xmm0
    movq    %rdi,%rax
    addps    0x28(%rsp),%xmm0
    movaps    %xmm0,0xe8(%rsp)
    movaps    %xmm0,0x08(%rsp)
    movaps    0x18(%rsp),%xmm0
    movq    0xe8(%rsp),%rcx
    addps    0x38(%rsp),%xmm0
    movaps    %xmm0,0xe8(%rsp)
    movq    0xe8(%rsp),%rdx
    movaps    %xmm0,0x18(%rsp)
    movq    %rcx,(%rdi)
    movq    0x10(%rsp),%rcx
    movq    %rdx,0x10(%rdi)
    movq    0x20(%rsp),%rdx
    movq    %rcx,0x08(%rdi)
    movq    %rdx,0x18(%rdi)
    ret
    nop
dosum():
    movaps    _a2+0x00000010(%rip),%xmm0
    movaps    _a2(%rip),%xmm1
    addps    _a3+0x00000010(%rip),%xmm0
    addps    _a3(%rip),%xmm1
    movaps    %xmm0,_a1+0x00000010(%rip)
    movaps    %xmm1,_a1(%rip)
    ret
    nopl    sum(A, A)(%rax,%rax)
dosuml():
    movq    _a2(%rip),%rax
    movq    %rax,0x98(%rsp)
    movq    _a2+0x00000008(%rip),%rax
    movq    %rax,0xa0(%rsp)
    movq    _a2+0x00000010(%rip),%rax
    movq    %rax,0xa8(%rsp)
    movq    _a2+0x00000018(%rip),%rax
    movq    %rax,0xb0(%rsp)
    movq    _a3(%rip),%rax
    movq    %rax,0xb8(%rsp)
    movq    _a3+0x00000008(%rip),%rax
    movq    %rax,0xc0(%rsp)
    movq    _a3+0x00000010(%rip),%rax
    movaps    0xb8(%rsp),%xmm0
    addps    0x98(%rsp),%xmm0
    movq    %rax,0xc8(%rsp)
    movq    _a3+0x00000018(%rip),%rax
    movaps    %xmm0,0x88(%rsp)
    movaps    %xmm0,0x98(%rsp)
    movaps    0xa8(%rsp),%xmm0
    movq    %rax,0xd0(%rsp)
    movq    0x88(%rsp),%rdx
    addps    0xc8(%rsp),%xmm0
    movaps    %xmm0,0x88(%rsp)
    movq    0x88(%rsp),%rax
    movaps    %xmm0,0xa8(%rsp)
    movq    %rdx,_a1(%rip)
    movq    0xa0(%rsp),%rdx
    movq    %rax,_a1+0x00000010(%rip)
    movq    0xb0(%rsp),%rax
    movq    %rdx,_a1+0x00000008(%rip)
    movq    %rax,_a1+0x00000018(%rip)
    ret

Reply via email to