Hi,
I noticed something strange when I use GCC's builtins for MMX:
I defined some unions:
/* Vector and wide-integer types, selected through GCC's machine-mode attribute. */
typedef int v4hi __attribute__ ((__mode__(__V4HI__)));
typedef int v2si __attribute__ ((__mode__(__V2SI__)));
typedef int di   __attribute__ ((__mode__(__DI__)));

/* A v4hi vector overlaid with short[4] and int[2] views of the same storage. */
typedef union
{
    v4hi  v;
    short s[4];
    int   i[2];
} _v4hi;

/* A v2si vector overlaid with an int[2] view of the same storage. */
typedef union
{
    v2si v;
    int  i[2];
} _v2si;
And now the strange thing: if I use those unions (e.g. _v4hi var) in my code and pass the vector member to the MMX builtin (e.g. var.v), gcc produces faster code than if I use the v4hi type directly. In my tests the latter case was 10% slower. I'd expect identical results, and even identical object files as far as the scheduling of the assembler is concerned, but that was not the case.
Is this a known issue with gcc-3.4.3? I compiled the code using -O2 -march=athlon-xp -g3. If you want a smaller test case, I could try to make one. Right now I just didn't want to waste my time in case this is a known issue or I did something stupid...
I also tried using (Intel style?) intrinsics via mmintrin.h and here the times are nearly the same using unions or vectors, but both as slow as above using vectors.
The function I used was (using the unions):
/* Code for use in OpenAL; LGPL license; Copyright 2005 by Prakash Punnor */

/*
 * Prepare sign-extension from 16 bit to 32 bit for stream ST.
 *
 * Loads one v4hi from byte position `offset` of entries[ST].data into
 * indata, ANDs it with the mask that `m` points to, and compares the result
 * word-wise against that same mask; the comparison result ends up in
 * signmask.  Expands to three statements and relies on indata, signmask,
 * entries, offset and m being in scope at the point of use.
 */
#define GET_SIGNMASK(ST)                                                  \
    indata.v   = *(v4hi*)(entries[ST].data + offset);                     \
    signmask.v = (v4hi)__builtin_ia32_pand((di)indata.v, (di)m->v);       \
    signmask.v = (v4hi)__builtin_ia32_pcmpeqw(signmask.v, m->v);

/* mix stream 0 */
/*
 * Seed both accumulators from stream 0: after GET_SIGNMASK(0), the low and
 * high word pairs of indata are interleaved with signmask and stored
 * (not added) into loout and hiout respectively.
 */
#define MIX_ST0                                                           \
    GET_SIGNMASK (0);                                                     \
    loout.v = (v2si)__builtin_ia32_punpcklwd(indata.v, signmask.v);       \
    hiout.v = (v2si)__builtin_ia32_punpckhwd(indata.v, signmask.v);
/*
 * sign-extension and mix stream ST
 *
 * Runs GET_SIGNMASK(ST), then interleaves the low and high word pairs of
 * indata with signmask and adds them onto the running accumulators loout
 * and hiout.  temp is scratch.  Expands to several statements, so it must
 * be used where a statement sequence is valid.
 */
#define MIX(ST) \
    GET_SIGNMASK(ST) \
    temp.v = (v2si)__builtin_ia32_punpcklwd(indata.v, signmask.v); \
    loout.v = __builtin_ia32_paddd(loout.v, temp.v); \
    temp.v = (v2si)__builtin_ia32_punpckhwd(indata.v, signmask.v); \
    hiout.v = __builtin_ia32_paddd(hiout.v, temp.v);
/*
 * manual saturation to dst
 *
 * Stores `sample` into dst[OFFSET] when it survives a round trip through
 * short; otherwise stores max_audioval for positive samples and
 * min_audioval for negative ones.  Relies on sample, dst, max_audioval and
 * min_audioval being in scope.  Wrapped in do { } while (0) so that
 * `SATURATE(x);` is a single statement, also inside an unbraced if/else.
 */
#define SATURATE(OFFSET) \
    do { \
        if (sample == (short)sample) \
            dst[OFFSET] = sample; \
        else if (sample > 0) \
            dst[OFFSET] = max_audioval; \
        else \
            dst[OFFSET] = min_audioval; \
    } while (0)

/* manually mix samples of mod_len */
/*
 * Scalar mixing path: for every sample index in [0, mod_len), sum that
 * sample (read as ALshort) over all `streams` entries into an int and hand
 * the sum to SATURATE, which writes dst.  Uses offset and st from the
 * enclosing scope as loop counters.
 */
#define MIX_MOD                                                        \
    for (offset = 0; offset < mod_len; offset++) {                     \
        int sample = 0;                                                \
        for (st = 0; st < streams; st++) {                             \
            sample += ((ALshort *)entries[st].data)[offset];           \
        }                                                              \
        SATURATE(offset);                                              \
    }

/* Mix all remaining and write to dst */
/*
 * Adds streams st, st+1, st+2 and st+3 onto the accumulators via MIX until
 * st reaches `streams`, then packs loout/hiout with packssdw and stores the
 * resulting v4hi at byte position `offset` of dst.
 *
 * NOTE(review): every pass reads entries[st] .. entries[st+3] and only st is
 * compared against `streams`, so the caller has to guarantee that the
 * remaining stream count is a multiple of four.
 */
#define LOOP_MIX \
    while (st<streams) { \
        MIX (st); \
        MIX (st+1); \
        MIX (st+2); \
        MIX (st+3); \
        st+=4; \
    } \
    \
    *(v4hi*)((void*)dst + offset) = __builtin_ia32_packssdw(loout.v, hiout.v);

/* Four 16-bit words of 0x8000; reached through `m` as the mask in GET_SIGNMASK. */
__attribute__((aligned(16))) static const short sm[4] =
{0x8000,0x8000,0x8000,0x8000};
__attribute__((aligned(16))) static const _v4hi *m = (_v4hi*)sm;

/* One input stream: sample buffer and its length in bytes. */
typedef struct _alMixEntry {
    ALvoid *data;
    ALint bytes;
} alMixEntry;

/*
 * Mix `streams` 16-bit streams from `entries` into `dst`.
 *
 * The length is taken from entries[0].bytes only.  MIX_MOD first mixes
 * sample indices [0, mod_len) in scalar code; the vector loop then walks
 * byte offsets [0, len) in steps of 4 * sizeof(ALshort), seeding the
 * accumulators from stream 0 (MIX_ST0) and adding the other streams with
 * LOOP_MIX.  emms is issued before returning.
 *
 * NOTE(review): mod_len is a byte remainder but is used by MIX_MOD as a
 * sample count starting at index 0, and the vector loop starts at offset 0
 * again and runs up to len, so a len that is not a multiple of 8 is not
 * handled as a separate tail -- confirm callers only pass multiples of 8.
 */
void MixAudio16_MMX_MOD0(ALshort *dst, alMixEntry *entries, int streams)
{
    int len = entries[0].bytes;
    int mod_len = len % (4 * sizeof(ALshort));
    int offset;
    int st;

    _v4hi indata;
    _v4hi signmask;

    _v2si loout;
    _v2si hiout;
    _v2si temp;

    MIX_MOD;

    for (offset=0; offset<len; offset+=4*sizeof(ALshort)) {
        MIX_ST0;
        st = 1;
        LOOP_MIX;
    }

    __builtin_ia32_emms();
    return;
}

I attached the objdumps:
old.dump - using unions -> fast
n3.dump - using vectors directly -> 10% slower on my athlon-xp, even though the generated asm seems to be shorter
BTW, the buffers were 16-byte aligned.
-- Prakash Punnoor
formerly known as Prakash K. Cheemplavam
mixaudio16.o: file format elf32-i386
Disassembly of section .text:
00000000 <MixAudio16_MMX_MOD0>:
__attribute__((aligned(16))) static const short sm[4] =
{0x8000,0x8000,0x8000,0x8000};
__attribute__((aligned(16))) static const v4hi *m = (v4hi*)sm;
void MixAudio16_MMX_MOD0(ALshort *dst, alMixEntry *entries, int streams)
{
0: 55 push %ebp
1: 89 e5 mov %esp,%ebp
3: 57 push %edi
4: 56 push %esi
5: 53 push %ebx
6: 83 ec 0c sub $0xc,%esp
int len = entries[0].bytes;
int mod_len = len % (4 * sizeof(ALshort));
int offset;
int st;
v4hi indata;
v4hi signmask;
v2si loout;
v2si hiout;
v2si temp;
MIX_MOD;
9: 31 db xor %ebx,%ebx
b: 8b 75 0c mov 0xc(%ebp),%esi
e: 8b 7d 10 mov 0x10(%ebp),%edi
11: 8b 46 04 mov 0x4(%esi),%eax
14: 89 45 f0 mov %eax,0xfffffff0(%ebp)
17: 83 e0 07 and $0x7,%eax
1a: 39 c3 cmp %eax,%ebx
1c: 89 45 ec mov %eax,0xffffffec(%ebp)
1f: 7d 47 jge 68 <MixAudio16_MMX_MOD0+0x68>
21: eb 0d jmp 30 <MixAudio16_MMX_MOD0+0x30>
23: 90 nop
24: 90 nop
25: 90 nop
26: 90 nop
27: 90 nop
28: 90 nop
29: 90 nop
2a: 90 nop
2b: 90 nop
2c: 90 nop
2d: 90 nop
2e: 90 nop
2f: 90 nop
30: 31 c9 xor %ecx,%ecx
32: 31 d2 xor %edx,%edx
34: eb 0a jmp 40 <MixAudio16_MMX_MOD0+0x40>
36: 8b 04 d6 mov (%esi,%edx,8),%eax
39: 42 inc %edx
3a: 0f bf 04 58 movswl (%eax,%ebx,2),%eax
3e: 01 c1 add %eax,%ecx
40: 39 fa cmp %edi,%edx
42: 7c f2 jl 36 <MixAudio16_MMX_MOD0+0x36>
44: 0f bf c1 movswl %cx,%eax
47: 39 c8 cmp %ecx,%eax
49: 0f 84 2a 01 00 00 je 179 <MixAudio16_MMX_MOD0+0x179>
4f: 31 c0 xor %eax,%eax
51: 8b 55 08 mov 0x8(%ebp),%edx
54: 85 c9 test %ecx,%ecx
56: 0f 9e c0 setle %al
59: 05 ff 7f 00 00 add $0x7fff,%eax
5e: 66 89 04 5a mov %ax,(%edx,%ebx,2)
62: 43 inc %ebx
63: 3b 5d ec cmp 0xffffffec(%ebp),%ebx
66: 7c c8 jl 30 <MixAudio16_MMX_MOD0+0x30>
for (offset=0; offset<len; offset+=4*sizeof(ALshort)) {
68: 31 db xor %ebx,%ebx
6a: 3b 5d f0 cmp 0xfffffff0(%ebp),%ebx
6d: 0f 8d fc 00 00 00 jge 16f <MixAudio16_MMX_MOD0+0x16f>
73: 8b 06 mov (%esi),%eax
75: 8b 0d 00 00 00 00 mov 0x0,%ecx
7b: 89 45 e8 mov %eax,0xffffffe8(%ebp)
7e: 89 f6 mov %esi,%esi
MIX_ST0;
80: 8b 55 e8 mov 0xffffffe8(%ebp),%edx
83: 0f 6f 14 1a movq (%edx,%ebx,1),%mm2
st = 1;
87: ba 01 00 00 00 mov $0x1,%edx
LOOP_MIX;
8c: 39 fa cmp %edi,%edx
8e: 0f 6f c2 movq %mm2,%mm0
91: 0f db 01 pand (%ecx),%mm0
94: 0f 6f c8 movq %mm0,%mm1
97: 0f 75 09 pcmpeqw (%ecx),%mm1
9a: 0f 6f c2 movq %mm2,%mm0
9d: 0f 61 c1 punpcklwd %mm1,%mm0
a0: 0f 69 d1 punpckhwd %mm1,%mm2
a3: 0f 6f e0 movq %mm0,%mm4
a6: 0f 6f da movq %mm2,%mm3
a9: 0f 8d a7 00 00 00 jge 156 <MixAudio16_MMX_MOD0+0x156>
af: 0f 6f 31 movq (%ecx),%mm6
b2: 0f 6f 29 movq (%ecx),%mm5
b5: 8d 74 26 00 lea 0x0(%esi),%esi
b9: 8d bc 27 00 00 00 00 lea 0x0(%edi),%edi
c0: 8b 04 d6 mov (%esi,%edx,8),%eax
c3: 0f 6f 14 18 movq (%eax,%ebx,1),%mm2
c7: 8b 44 d6 08 mov 0x8(%esi,%edx,8),%eax
cb: 0f 6f c2 movq %mm2,%mm0
ce: 0f db c6 pand %mm6,%mm0
d1: 0f 6f c8 movq %mm0,%mm1
d4: 0f 75 cd pcmpeqw %mm5,%mm1
d7: 0f 6f c2 movq %mm2,%mm0
da: 0f 69 d1 punpckhwd %mm1,%mm2
dd: 0f 61 c1 punpcklwd %mm1,%mm0
e0: 0f fe da paddd %mm2,%mm3
e3: 0f 6f 14 18 movq (%eax,%ebx,1),%mm2
e7: 0f fe e0 paddd %mm0,%mm4
ea: 8b 44 d6 10 mov 0x10(%esi,%edx,8),%eax
ee: 0f 6f c2 movq %mm2,%mm0
f1: 0f db c6 pand %mm6,%mm0
f4: 0f 6f c8 movq %mm0,%mm1
f7: 0f 75 cd pcmpeqw %mm5,%mm1
fa: 0f 6f c2 movq %mm2,%mm0
fd: 0f 69 d1 punpckhwd %mm1,%mm2
100: 0f fe da paddd %mm2,%mm3
103: 0f 6f 14 18 movq (%eax,%ebx,1),%mm2
107: 0f 61 c1 punpcklwd %mm1,%mm0
10a: 8b 44 d6 18 mov 0x18(%esi,%edx,8),%eax
10e: 0f fe e0 paddd %mm0,%mm4
111: 83 c2 04 add $0x4,%edx
114: 39 fa cmp %edi,%edx
116: 0f 6f c2 movq %mm2,%mm0
119: 0f db c6 pand %mm6,%mm0
11c: 0f 6f c8 movq %mm0,%mm1
11f: 0f 75 cd pcmpeqw %mm5,%mm1
122: 0f 6f c2 movq %mm2,%mm0
125: 0f 69 d1 punpckhwd %mm1,%mm2
128: 0f fe da paddd %mm2,%mm3
12b: 0f 6f 14 18 movq (%eax,%ebx,1),%mm2
12f: 0f 61 c1 punpcklwd %mm1,%mm0
132: 0f fe e0 paddd %mm0,%mm4
135: 0f 6f c2 movq %mm2,%mm0
138: 0f db c6 pand %mm6,%mm0
13b: 0f 6f c8 movq %mm0,%mm1
13e: 0f 75 cd pcmpeqw %mm5,%mm1
141: 0f 6f c2 movq %mm2,%mm0
144: 0f 61 c1 punpcklwd %mm1,%mm0
147: 0f 69 d1 punpckhwd %mm1,%mm2
14a: 0f fe e0 paddd %mm0,%mm4
14d: 0f fe da paddd %mm2,%mm3
150: 0f 8c 6a ff ff ff jl c0 <MixAudio16_MMX_MOD0+0xc0>
156: 8b 45 08 mov 0x8(%ebp),%eax
159: 0f 6f c4 movq %mm4,%mm0
15c: 0f 6b c3 packssdw %mm3,%mm0
15f: 0f 7f 04 18 movq %mm0,(%eax,%ebx,1)
163: 83 c3 08 add $0x8,%ebx
166: 3b 5d f0 cmp 0xfffffff0(%ebp),%ebx
169: 0f 8c 11 ff ff ff jl 80 <MixAudio16_MMX_MOD0+0x80>
}
__builtin_ia32_emms();
16f: 0f 77 emms
return;
171: 83 c4 0c add $0xc,%esp
174: 5b pop %ebx
175: 5e pop %esi
176: 5f pop %edi
177: 5d pop %ebp
178: c3 ret
179: 8b 55 08 mov 0x8(%ebp),%edx
17c: 66 89 0c 5a mov %cx,(%edx,%ebx,2)
180: 43 inc %ebx
181: 3b 5d ec cmp 0xffffffec(%ebp),%ebx
184: e9 dd fe ff ff jmp 66 <MixAudio16_MMX_MOD0+0x66>
mixaudio16.o: file format elf32-i386
Disassembly of section .text:
00000000 <MixAudio16_MMX_MOD0>:
__attribute__((aligned(16))) static const short sm[4] =
{0x8000,0x8000,0x8000,0x8000};
__attribute__((aligned(16))) static const _v4hi *m = (_v4hi*)sm;
void MixAudio16_MMX_MOD0(ALshort *dst, alMixEntry *entries, int streams)
{
0: 55 push %ebp
1: 89 e5 mov %esp,%ebp
3: 57 push %edi
4: 56 push %esi
5: 53 push %ebx
6: 83 ec 0c sub $0xc,%esp
int len = entries[0].bytes;
int mod_len = len % (4 * sizeof(ALshort));
int offset;
int st;
_v4hi indata;
_v4hi signmask;
_v2si loout;
_v2si hiout;
_v2si temp;
MIX_MOD;
9: 31 db xor %ebx,%ebx
b: 8b 75 0c mov 0xc(%ebp),%esi
e: 8b 7d 10 mov 0x10(%ebp),%edi
11: 8b 46 04 mov 0x4(%esi),%eax
14: 89 45 f0 mov %eax,0xfffffff0(%ebp)
17: 83 e0 07 and $0x7,%eax
1a: 39 c3 cmp %eax,%ebx
1c: 89 45 ec mov %eax,0xffffffec(%ebp)
1f: 7d 47 jge 68 <MixAudio16_MMX_MOD0+0x68>
21: eb 0d jmp 30 <MixAudio16_MMX_MOD0+0x30>
23: 90 nop
24: 90 nop
25: 90 nop
26: 90 nop
27: 90 nop
28: 90 nop
29: 90 nop
2a: 90 nop
2b: 90 nop
2c: 90 nop
2d: 90 nop
2e: 90 nop
2f: 90 nop
30: 31 c9 xor %ecx,%ecx
32: 31 d2 xor %edx,%edx
34: eb 0a jmp 40 <MixAudio16_MMX_MOD0+0x40>
36: 8b 04 d6 mov (%esi,%edx,8),%eax
39: 42 inc %edx
3a: 0f bf 04 58 movswl (%eax,%ebx,2),%eax
3e: 01 c1 add %eax,%ecx
40: 39 fa cmp %edi,%edx
42: 7c f2 jl 36 <MixAudio16_MMX_MOD0+0x36>
44: 0f bf c1 movswl %cx,%eax
47: 39 c8 cmp %ecx,%eax
49: 0f 84 36 01 00 00 je 185 <MixAudio16_MMX_MOD0+0x185>
4f: 31 c0 xor %eax,%eax
51: 8b 55 08 mov 0x8(%ebp),%edx
54: 85 c9 test %ecx,%ecx
56: 0f 9e c0 setle %al
59: 05 ff 7f 00 00 add $0x7fff,%eax
5e: 66 89 04 5a mov %ax,(%edx,%ebx,2)
62: 43 inc %ebx
63: 3b 5d ec cmp 0xffffffec(%ebp),%ebx
66: 7c c8 jl 30 <MixAudio16_MMX_MOD0+0x30>
for (offset=0; offset<len; offset+=4*sizeof(ALshort)) {
68: 31 db xor %ebx,%ebx
6a: 3b 5d f0 cmp 0xfffffff0(%ebp),%ebx
6d: 0f 8d 08 01 00 00 jge 17b <MixAudio16_MMX_MOD0+0x17b>
73: 8b 06 mov (%esi),%eax
75: 8b 0d 00 00 00 00 mov 0x0,%ecx
7b: 89 45 e8 mov %eax,0xffffffe8(%ebp)
7e: 89 f6 mov %esi,%esi
MIX_ST0;
80: 8b 55 e8 mov 0xffffffe8(%ebp),%edx
83: 0f 6f 04 1a movq (%edx,%ebx,1),%mm0
st = 1;
87: ba 01 00 00 00 mov $0x1,%edx
LOOP_MIX;
8c: 39 fa cmp %edi,%edx
8e: 0f 6f d0 movq %mm0,%mm2
91: 0f db 01 pand (%ecx),%mm0
94: 0f 6f c8 movq %mm0,%mm1
97: 0f 75 09 pcmpeqw (%ecx),%mm1
9a: 0f 6f c2 movq %mm2,%mm0
9d: 0f 61 c1 punpcklwd %mm1,%mm0
a0: 0f 6f e8 movq %mm0,%mm5
a3: 0f 6f c2 movq %mm2,%mm0
a6: 0f 69 c1 punpckhwd %mm1,%mm0
a9: 0f 6f e0 movq %mm0,%mm4
ac: 0f 8d b0 00 00 00 jge 162 <MixAudio16_MMX_MOD0+0x162>
b2: 0f 6f 19 movq (%ecx),%mm3
b5: 8d 74 26 00 lea 0x0(%esi),%esi
b9: 8d bc 27 00 00 00 00 lea 0x0(%edi),%edi
c0: 8b 04 d6 mov (%esi,%edx,8),%eax
c3: 0f 6f cb movq %mm3,%mm1
c6: 0f 6f 04 18 movq (%eax,%ebx,1),%mm0
ca: 8b 44 d6 08 mov 0x8(%esi,%edx,8),%eax
ce: 0f 6f d0 movq %mm0,%mm2
d1: 0f db c3 pand %mm3,%mm0
d4: 0f 75 c8 pcmpeqw %mm0,%mm1
d7: 0f 6f c2 movq %mm2,%mm0
da: 0f 61 c1 punpcklwd %mm1,%mm0
dd: 0f fe e8 paddd %mm0,%mm5
e0: 0f 6f c2 movq %mm2,%mm0
e3: 0f 69 c1 punpckhwd %mm1,%mm0
e6: 0f fe e0 paddd %mm0,%mm4
e9: 0f 6f 04 18 movq (%eax,%ebx,1),%mm0
ed: 0f 6f cb movq %mm3,%mm1
f0: 8b 44 d6 10 mov 0x10(%esi,%edx,8),%eax
f4: 0f 6f d0 movq %mm0,%mm2
f7: 0f db c3 pand %mm3,%mm0
fa: 0f 75 c8 pcmpeqw %mm0,%mm1
fd: 0f 6f c2 movq %mm2,%mm0
100: 0f 61 c1 punpcklwd %mm1,%mm0
103: 0f fe e8 paddd %mm0,%mm5
106: 0f 6f c2 movq %mm2,%mm0
109: 0f 69 c1 punpckhwd %mm1,%mm0
10c: 0f fe e0 paddd %mm0,%mm4
10f: 0f 6f 04 18 movq (%eax,%ebx,1),%mm0
113: 0f 6f cb movq %mm3,%mm1
116: 8b 44 d6 18 mov 0x18(%esi,%edx,8),%eax
11a: 83 c2 04 add $0x4,%edx
11d: 39 fa cmp %edi,%edx
11f: 0f 6f d0 movq %mm0,%mm2
122: 0f db c3 pand %mm3,%mm0
125: 0f 75 c8 pcmpeqw %mm0,%mm1
128: 0f 6f c2 movq %mm2,%mm0
12b: 0f 61 c1 punpcklwd %mm1,%mm0
12e: 0f fe e8 paddd %mm0,%mm5
131: 0f 6f c2 movq %mm2,%mm0
134: 0f 69 c1 punpckhwd %mm1,%mm0
137: 0f fe e0 paddd %mm0,%mm4
13a: 0f 6f 04 18 movq (%eax,%ebx,1),%mm0
13e: 0f 6f cb movq %mm3,%mm1
141: 0f 6f d0 movq %mm0,%mm2
144: 0f db c3 pand %mm3,%mm0
147: 0f 75 c8 pcmpeqw %mm0,%mm1
14a: 0f 6f c2 movq %mm2,%mm0
14d: 0f 61 c1 punpcklwd %mm1,%mm0
150: 0f fe e8 paddd %mm0,%mm5
153: 0f 6f c2 movq %mm2,%mm0
156: 0f 69 c1 punpckhwd %mm1,%mm0
159: 0f fe e0 paddd %mm0,%mm4
15c: 0f 8c 5e ff ff ff jl c0 <MixAudio16_MMX_MOD0+0xc0>
162: 8b 45 08 mov 0x8(%ebp),%eax
165: 0f 6f c5 movq %mm5,%mm0
168: 0f 6b c4 packssdw %mm4,%mm0
16b: 0f 7f 04 18 movq %mm0,(%eax,%ebx,1)
16f: 83 c3 08 add $0x8,%ebx
172: 3b 5d f0 cmp 0xfffffff0(%ebp),%ebx
175: 0f 8c 05 ff ff ff jl 80 <MixAudio16_MMX_MOD0+0x80>
}
__builtin_ia32_emms();
17b: 0f 77 emms
return;
17d: 83 c4 0c add $0xc,%esp
180: 5b pop %ebx
181: 5e pop %esi
182: 5f pop %edi
183: 5d pop %ebp
184: c3 ret
185: 8b 55 08 mov 0x8(%ebp),%edx
188: 66 89 0c 5a mov %cx,(%edx,%ebx,2)
18c: 43 inc %ebx
18d: 3b 5d ec cmp 0xffffffec(%ebp),%ebx
190: e9 d1 fe ff ff jmp 66 <MixAudio16_MMX_MOD0+0x66>
signature.asc
Description: OpenPGP digital signature
