http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50713
Bug #: 50713
Summary: SLP vs loop: code generated differs
Classification: Unclassified
Product: gcc
Version: 4.7.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
AssignedTo: unassig...@gcc.gnu.org
ReportedBy: vincenzo.innoce...@cern.ch

In the following code, for "float" the code generated for "dosum" and
"dosuml" differs (dosum is better). For "complex", "sum" does not vectorize
at all (a problem in itself), "dosum" vectorizes excellently, and "dosuml"
shows the same issue as with floats.

If you have time, please have a look at what happens with aligned(32),
SSE vs AVX, float vs double... (not an urgent use case at the moment;
see the sketch at the end of this report).

Compiled with:
c++ -Ofast -ftree-vectorizer-verbose=2 -c slp2.cc -mtune=corei7 -msse4.2

// typedef __complex__ float Value;
typedef float Value;
// typedef double Value;

struct A {
  Value a[4];
} __attribute__ ((aligned(16)));

A a1, a2, a3;

A sum(A a, A b) {
  a.a[0] += b.a[0];
  a.a[1] += b.a[1];
  a.a[2] += b.a[2];
  a.a[3] += b.a[3];
  return a;
}

A suml(A a, A b) {
  for (int i = 0; i != 4; ++i) a.a[i] += b.a[i];
  return a;
}

void dosum()  { a1 = sum(a2, a3); }
void dosuml() { a1 = suml(a2, a3); }

float
========

sum(A, A):
    movq   %xmm2,0xc8(%rsp)
    movq   %xmm3,0xd0(%rsp)
    movq   %xmm0,0xd8(%rsp)
    movaps 0xc8(%rsp),%xmm0
    movq   %xmm1,0xe0(%rsp)
    addps  0xd8(%rsp),%xmm0
    movaps %xmm0,0xa8(%rsp)
    movq   0xa8(%rsp),%rax
    movaps %xmm0,0xe8(%rsp)
    movq   0xf0(%rsp),%xmm1
    movd   %rax,%xmm0
    ret
    nopl   (%rax)

suml(A, A):
    movq   %xmm2,0xc8(%rsp)
    movq   %xmm3,0xd0(%rsp)
    movq   %xmm0,0xd8(%rsp)
    movaps 0xc8(%rsp),%xmm0
    movq   %xmm1,0xe0(%rsp)
    addps  0xd8(%rsp),%xmm0
    movaps %xmm0,0xa8(%rsp)
    movq   0xa8(%rsp),%rax
    movaps %xmm0,0xd8(%rsp)
    movq   0xe0(%rsp),%xmm1
    movd   %rax,%xmm0
    ret
    nopl   (%rax)

dosum():
    movaps _a2(%rip),%xmm0
    addps  _a3(%rip),%xmm0
    movaps %xmm0,_a1(%rip)
    ret
    nopw   %cs:sum(A, A)(%rax,%rax)

dosuml():
    movq   _a2(%rip),%rax
    movq   %rax,0xb8(%rsp)
    movq   _a2+0x00000008(%rip),%rax
    movq   %rax,0xc0(%rsp)
    movq   _a3(%rip),%rax
    movq   %rax,0xc8(%rsp)
    movq   _a3+0x00000008(%rip),%rax
    movq   %rax,0xd0(%rsp)
    movaps 0xc8(%rsp),%xmm0
    addps  0xb8(%rsp),%xmm0
    movaps %xmm0,0xa8(%rsp)
    movq   0xa8(%rsp),%rax
    movaps %xmm0,0xb8(%rsp)
    movq   %rax,_a1(%rip)
    movq   0xc0(%rsp),%rax
    movq   %rax,_a1+0x00000008(%rip)
    ret

complex
==========

sum(A, A):
    movss  0x28(%rsp),%xmm7
    movq   %rdi,%rax
    movss  0x2c(%rsp),%xmm6
    movss  0x30(%rsp),%xmm5
    movss  0x34(%rsp),%xmm4
    movss  0x38(%rsp),%xmm3
    movss  0x3c(%rsp),%xmm2
    movss  0x40(%rsp),%xmm1
    movss  0x44(%rsp),%xmm0
    addss  0x08(%rsp),%xmm7
    addss  0x0c(%rsp),%xmm6
    addss  0x10(%rsp),%xmm5
    addss  0x14(%rsp),%xmm4
    movss  %xmm7,(%rdi)
    addss  0x18(%rsp),%xmm3
    movss  %xmm6,0x04(%rdi)
    addss  0x1c(%rsp),%xmm2
    movss  %xmm5,0x08(%rdi)
    addss  0x20(%rsp),%xmm1
    movss  %xmm4,0x0c(%rdi)
    addss  0x24(%rsp),%xmm0
    movss  %xmm3,0x10(%rdi)
    movss  %xmm2,0x14(%rdi)
    movss  %xmm1,0x18(%rdi)
    movss  %xmm0,0x1c(%rdi)
    ret
    nopl   sum(A, A)(%rax,%rax)

suml(A, A):
    movaps 0x08(%rsp),%xmm0
    movq   %rdi,%rax
    addps  0x28(%rsp),%xmm0
    movaps %xmm0,0xe8(%rsp)
    movaps %xmm0,0x08(%rsp)
    movaps 0x18(%rsp),%xmm0
    movq   0xe8(%rsp),%rcx
    addps  0x38(%rsp),%xmm0
    movaps %xmm0,0xe8(%rsp)
    movq   0xe8(%rsp),%rdx
    movaps %xmm0,0x18(%rsp)
    movq   %rcx,(%rdi)
    movq   0x10(%rsp),%rcx
    movq   %rdx,0x10(%rdi)
    movq   0x20(%rsp),%rdx
    movq   %rcx,0x08(%rdi)
    movq   %rdx,0x18(%rdi)
    ret
    nop

dosum():
    movaps _a2+0x00000010(%rip),%xmm0
    movaps _a2(%rip),%xmm1
    addps  _a3+0x00000010(%rip),%xmm0
    addps  _a3(%rip),%xmm1
    movaps %xmm0,_a1+0x00000010(%rip)
    movaps %xmm1,_a1(%rip)
    ret
    nopl   sum(A, A)(%rax,%rax)

dosuml():
    movq   _a2(%rip),%rax
    movq   %rax,0x98(%rsp)
    movq   _a2+0x00000008(%rip),%rax
    movq   %rax,0xa0(%rsp)
    movq   _a2+0x00000010(%rip),%rax
    movq   %rax,0xa8(%rsp)
    movq   _a2+0x00000018(%rip),%rax
    movq   %rax,0xb0(%rsp)
    movq   _a3(%rip),%rax
    movq   %rax,0xb8(%rsp)
    movq   _a3+0x00000008(%rip),%rax
    movq   %rax,0xc0(%rsp)
    movq   _a3+0x00000010(%rip),%rax
    movaps 0xb8(%rsp),%xmm0
    addps  0x98(%rsp),%xmm0
    movq   %rax,0xc8(%rsp)
    movq   _a3+0x00000018(%rip),%rax
    movaps %xmm0,0x88(%rsp)
    movaps %xmm0,0x98(%rsp)
    movaps 0xa8(%rsp),%xmm0
    movq   %rax,0xd0(%rsp)
    movq   0x88(%rsp),%rdx
    addps  0xc8(%rsp),%xmm0
    movaps %xmm0,0x88(%rsp)
    movq   0x88(%rsp),%rax
    movaps %xmm0,0xa8(%rsp)
    movq   %rdx,_a1(%rip)
    movq   0xa0(%rsp),%rdx
    movq   %rax,_a1+0x00000010(%rip)
    movq   0xb0(%rsp),%rax
    movq   %rdx,_a1+0x00000008(%rip)
    movq   %rax,_a1+0x00000018(%rip)
    ret
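
Below is a rough sketch of the aligned(32) variant mentioned at the top of
this report, for comparing SSE vs AVX (and float vs double) code generation.
The struct name B, the functions sum8/dosum8, and the file name slp32.cc are
hypothetical, added only for illustration; they are not part of the original
testcase.

typedef float Value;   // also try: typedef double Value;

struct B {
  Value a[8];          // 32 bytes when Value is float (hypothetical variant)
} __attribute__ ((aligned(32)));

B b1, b2, b3;

B sum8(B a, B b) {
  for (int i = 0; i != 8; ++i) a.a[i] += b.a[i];
  return a;
}

void dosum8() { b1 = sum8(b2, b3); }

// Compile once for SSE and once for AVX and compare the assembly, e.g.:
//   c++ -Ofast -ftree-vectorizer-verbose=2 -c slp32.cc -mtune=corei7 -msse4.2
//   c++ -Ofast -ftree-vectorizer-verbose=2 -c slp32.cc -mavx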