http://gcc.gnu.org/bugzilla/show_bug.cgi?id=55071
Bug #: 55071
Summary: "Horizontal sum" of builtin vectors
Classification: Unclassified
Product: gcc
Version: 4.8.0
Status: UNCONFIRMED
Severity: enhancement
Priority: P3
Component: tree-optimization
AssignedTo: [email protected]
ReportedBy: [email protected]
This code:
/* 16-byte vector of floats, declared with GCC's vector_size attribute;
   the function below indexes its lanes 0..3. */
typedef float __attribute__( ( vector_size( 16 ) ) ) float32x4_t;
/* Horizontal sum: returns the four lanes of x added together,
   evaluated left to right as ((x[0]+x[1])+x[2])+x[3]. */
float sum(float32x4_t x) {
return x[0]+x[1]+x[2]+x[3];
}
does not use hadd.
On corei7 it produces quite inefficient code:
# SSE code generated for sum(); AT&T syntax (source operand first, destination last).
# Assumes xmm0 holds the input vector x on entry and the scalar result is
# returned in the low lane of xmm0 -- matches the SysV AMD64 convention; confirm for the target.
# Copy x into xmm2 and xmm3 because the two-operand shuffles below overwrite their destination.
movaps %xmm0, %xmm2
movaps %xmm0, %xmm3
# Immediate 85 = 0b01010101: every lane of xmm2 becomes x[1].
shufps $85, %xmm0, %xmm2
# Register-to-register copies: xmm1 = broadcast x[1], xmm2 = x again.
movaps %xmm2, %xmm1
movaps %xmm0, %xmm2
# Scalar add on the low lane: xmm1[0] = x[1] + x[0].
addss %xmm2, %xmm1
# Interleave the high halves of xmm3 (= x) with x: the low lane of xmm3 becomes x[2].
unpckhps %xmm0, %xmm3
movaps %xmm3, %xmm2
# Immediate 255 = 0b11111111: every lane of xmm0 becomes x[3].
shufps $255, %xmm0, %xmm0
# xmm1[0] += x[2]
addss %xmm2, %xmm1
# xmm1[0] += x[3]
addss %xmm0, %xmm1
# Move the accumulated sum into xmm0 before returning.
movaps %xmm1, %xmm0
ret
For AVX the code is a bit better (only thanks to the three-operand instruction forms):
vshufps $85, %xmm0, %xmm0, %xmm2
vaddss %xmm0, %xmm2, %xmm1
vunpckhps %xmm0, %xmm0, %xmm2
vshufps $255, %xmm0, %xmm0, %xmm0
vaddss %xmm2, %xmm1, %xmm1
vaddss %xmm0, %xmm1, %xmm0