Apologies if this is the wrong list.

I'm afraid I'm not much of an assembly programmer, but I was wondering
whether the generated code shown below is 'correct', because from the
descriptions of SSE that I've read, it looks inefficient.

The C code:

/*
 * Adds the four floats of vb element-wise into va, in place.
 *
 * va: array of four floats; each element va[i] is overwritten with
 *     va[i] + vb[i] for i = 0..3.
 * vb: array of four floats; only read.
 *
 * Returns va, the same pointer that was passed in.
 */
float *vector_add4f(float va[4], const float vb[4])
{
 va[0] += vb[0];
 va[1] += vb[1];
 va[2] += vb[2];
 va[3] += vb[3];
 return va;
}

Now unless my understanding is totally off, the processor should be able
to do those four additions in a single instruction by using the SSE
extensions. The standard code generated (without SSE) is as expected:

 88:   55                      push   %ebp
 89:   89 e5                   mov    %esp,%ebp
 8b:   8b 45 08                mov    0x8(%ebp),%eax
 8e:   8b 55 0c                mov    0xc(%ebp),%edx
 91:   d9 00                   flds   (%eax)
 93:   d8 02                   fadds  (%edx)
 95:   d9 18                   fstps  (%eax)
 97:   d9 40 04                flds   0x4(%eax)
 9a:   d8 42 04                fadds  0x4(%edx)
 9d:   d9 58 04                fstps  0x4(%eax)
 a0:   d9 40 08                flds   0x8(%eax)
 a3:   d8 42 08                fadds  0x8(%edx)
 a6:   d9 58 08                fstps  0x8(%eax)
 a9:   d9 40 0c                flds   0xc(%eax)
 ac:   d8 42 0c                fadds  0xc(%edx)
 af:   d9 58 0c                fstps  0xc(%eax)
 b2:   c9                      leave
 b3:   c3                      ret

Using -march=pentium3 -mtune=pentium3m -mfpmath=sse, the following
is generated:

140:   55                      push   %ebp
141:   89 e5                   mov    %esp,%ebp
143:   8b 4d 08                mov    0x8(%ebp),%ecx
146:   8b 45 08                mov    0x8(%ebp),%eax
149:   8b 55 0c                mov    0xc(%ebp),%edx
14c:   f3 0f 10 00             movss  (%eax),%xmm0
150:   f3 0f 58 02             addss  (%edx),%xmm0
154:   f3 0f 11 01             movss  %xmm0,(%ecx)
158:   8b 4d 08                mov    0x8(%ebp),%ecx
15b:   83 c1 04                add    $0x4,%ecx
15e:   8b 45 08                mov    0x8(%ebp),%eax
161:   83 c0 04                add    $0x4,%eax
164:   8b 55 0c                mov    0xc(%ebp),%edx
167:   83 c2 04                add    $0x4,%edx
16a:   f3 0f 10 00             movss  (%eax),%xmm0
16e:   f3 0f 58 02             addss  (%edx),%xmm0
172:   f3 0f 11 01             movss  %xmm0,(%ecx)
176:   8b 4d 08                mov    0x8(%ebp),%ecx
179:   83 c1 08                add    $0x8,%ecx
17c:   8b 45 08                mov    0x8(%ebp),%eax
17f:   83 c0 08                add    $0x8,%eax
182:   8b 55 0c                mov    0xc(%ebp),%edx
185:   83 c2 08                add    $0x8,%edx
188:   f3 0f 10 00             movss  (%eax),%xmm0
18c:   f3 0f 58 02             addss  (%edx),%xmm0
190:   f3 0f 11 01             movss  %xmm0,(%ecx)
194:   8b 4d 08                mov    0x8(%ebp),%ecx
197:   83 c1 0c                add    $0xc,%ecx
19a:   8b 45 08                mov    0x8(%ebp),%eax
19d:   83 c0 0c                add    $0xc,%eax
1a0:   8b 55 0c                mov    0xc(%ebp),%edx
1a3:   83 c2 0c                add    $0xc,%edx
1a6:   f3 0f 10 00             movss  (%eax),%xmm0
1aa:   f3 0f 58 02             addss  (%edx),%xmm0
1ae:   f3 0f 11 01             movss  %xmm0,(%ecx)
1b2:   8b 45 08                mov    0x8(%ebp),%eax
1b5:   5d                      pop    %ebp
1b6:   c3                      ret
1b7:   89 f6                   mov    %esi,%esi
1b9:   8d bc 27 00 00 00 00    lea    0x0(%edi),%edi

Now, uh, isn't that still four separate additions? Do I need to do something
gcc-specific to get it to use the 'add-packed-single' instruction (addps) to
turn those four additions into one?

MC

Reply via email to