Apologies if this is the wrong list. I'm afraid I'm not much of an assembly programmer, but I was wondering whether the code generated below is 'correct': judging from the descriptions of SSE that I've read, it looks inefficient.
The C code:

    float *vector_add4f(float va[4], const float vb[4])
    {
      va[0] += vb[0];
      va[1] += vb[1];
      va[2] += vb[2];
      va[3] += vb[3];
      return va;
    }

Now unless my understanding is totally off, the processor should be able to do those four additions in one instruction by using the SSE extensions. The standard code generated (without SSE) is as expected:

     88: 55                      push   %ebp
     89: 89 e5                   mov    %esp,%ebp
     8b: 8b 45 08                mov    0x8(%ebp),%eax
     8e: 8b 55 0c                mov    0xc(%ebp),%edx
     91: d9 00                   flds   (%eax)
     93: d8 02                   fadds  (%edx)
     95: d9 18                   fstps  (%eax)
     97: d9 40 04                flds   0x4(%eax)
     9a: d8 42 04                fadds  0x4(%edx)
     9d: d9 58 04                fstps  0x4(%eax)
     a0: d9 40 08                flds   0x8(%eax)
     a3: d8 42 08                fadds  0x8(%edx)
     a6: d9 58 08                fstps  0x8(%eax)
     a9: d9 40 0c                flds   0xc(%eax)
     ac: d8 42 0c                fadds  0xc(%edx)
     af: d9 58 0c                fstps  0xc(%eax)
     b2: c9                      leave
     b3: c3                      ret

Using -march=pentium3 -mtune=pentium3m -mfpmath=sse, the following is generated:

    140: 55                      push   %ebp
    141: 89 e5                   mov    %esp,%ebp
    143: 8b 4d 08                mov    0x8(%ebp),%ecx
    146: 8b 45 08                mov    0x8(%ebp),%eax
    149: 8b 55 0c                mov    0xc(%ebp),%edx
    14c: f3 0f 10 00             movss  (%eax),%xmm0
    150: f3 0f 58 02             addss  (%edx),%xmm0
    154: f3 0f 11 01             movss  %xmm0,(%ecx)
    158: 8b 4d 08                mov    0x8(%ebp),%ecx
    15b: 83 c1 04                add    $0x4,%ecx
    15e: 8b 45 08                mov    0x8(%ebp),%eax
    161: 83 c0 04                add    $0x4,%eax
    164: 8b 55 0c                mov    0xc(%ebp),%edx
    167: 83 c2 04                add    $0x4,%edx
    16a: f3 0f 10 00             movss  (%eax),%xmm0
    16e: f3 0f 58 02             addss  (%edx),%xmm0
    172: f3 0f 11 01             movss  %xmm0,(%ecx)
    176: 8b 4d 08                mov    0x8(%ebp),%ecx
    179: 83 c1 08                add    $0x8,%ecx
    17c: 8b 45 08                mov    0x8(%ebp),%eax
    17f: 83 c0 08                add    $0x8,%eax
    182: 8b 55 0c                mov    0xc(%ebp),%edx
    185: 83 c2 08                add    $0x8,%edx
    188: f3 0f 10 00             movss  (%eax),%xmm0
    18c: f3 0f 58 02             addss  (%edx),%xmm0
    190: f3 0f 11 01             movss  %xmm0,(%ecx)
    194: 8b 4d 08                mov    0x8(%ebp),%ecx
    197: 83 c1 0c                add    $0xc,%ecx
    19a: 8b 45 08                mov    0x8(%ebp),%eax
    19d: 83 c0 0c                add    $0xc,%eax
    1a0: 8b 55 0c                mov    0xc(%ebp),%edx
    1a3: 83 c2 0c                add    $0xc,%edx
    1a6: f3 0f 10 00             movss  (%eax),%xmm0
    1aa: f3 0f 58 02             addss  (%edx),%xmm0
    1ae: f3 0f 11 01             movss  %xmm0,(%ecx)
    1b2: 8b 45 08                mov    0x8(%ebp),%eax
    1b5: 5d                      pop    %ebp
    1b6: c3                      ret
    1b7: 89 f6                   mov    %esi,%esi
    1b9: 8d bc 27 00 00 00 00    lea    0x0(%edi),%edi

Now, uh, isn't that still four separate additions? Do I need to do something gcc-specific to get it to use the 'add-packed-single' instruction to turn those four additions into one?

MC