Consider this example: #include <xmmintrin.h>
typedef union { __m128 vec; float data[4]; struct { float x,y,z,w; }; } vec4f_t; static inline float __attribute__((__always_inline__)) acc(vec4f_t src) { float a; src.vec = _mm_add_ps(src.vec, _mm_movehl_ps(src.vec, src.vec,)); _mm_store_ss(&a, _mm_add_ss(src.vec, _mm_shuffle_ps(src.vec, src.vec, _MM_SHUFFLE(3,2,1,1)))); return a; } int main(int argc, char *argv[]) { vec4f_t b; printf("%f\n", acc(b)); return 0; } This gets compiled to: .section .rodata.str1.1,"aMS",@progbits,1 .LC0: .string "%f\n" .text .p2align 4,,15 .globl main .type main, @function main: .LFB506: subq $40, %rsp .LCFI0: movl $.LC0, %edi movq 16(%rsp), %rax movq %rax, (%rsp) movq 24(%rsp), %rax movq %rax, 8(%rsp) movl $1, %eax movaps (%rsp), %xmm1 movaps %xmm1, %xmm0 movhlps %xmm1, %xmm0 addps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $229, %xmm0, %xmm1 addss %xmm1, %xmm0 cvtss2sd %xmm0, %xmm0 call printf xorl %eax, %eax addq $40, %rsp ret As we can see, the union is passed on the stack instead of as a value in %xmm0. This would make sense if this were not an inline function and members other than the __m128 were accessed.
Using the same code as above but passing __m128 directly instead of the union gets compiled to: .section .rodata.str1.1,"aMS",@progbits,1 .LC0: .string "%f\n" .text .p2align 4,,15 .globl main .type main, @function main: .LFB506: movhlps %xmm0, %xmm0 subq $8, %rsp .LCFI0: movl $.LC0, %edi movl $1, %eax addps %xmm0, %xmm0 movaps %xmm0, %xmm1 shufps $229, %xmm0, %xmm1 addss %xmm1, %xmm0 cvtss2sd %xmm0, %xmm0 call printf xorl %eax, %eax addq $8, %rsp ret -- Summary: Passing unions of _vector_ types and struct or array of the same size by value to inline functions causes unnecessary loads/stores on the stack even if no member except the _vector_ is accessed Product: gcc Version: 4.1.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: other AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: j_daniel at rbg dot informatik dot tu-darmstadt dot de GCC build triplet: x86_64-pc-linux-gnu GCC host triplet: x86_64-pc-linux-gnu GCC target triplet: x86_64-pc-linux-gnu http://gcc.gnu.org/bugzilla/show_bug.cgi?id=26546