Hi,
I noticed something strange when I use GCC's builtins for MMX:
I defined some unions:
/* MMX vector type in GCC machine mode V4HI (four 16-bit integer lanes). */
typedef int v4hi __attribute__ ((__mode__(__V4HI__)));
/* MMX vector type in GCC machine mode V2SI (two 32-bit integer lanes). */
typedef int v2si __attribute__ ((__mode__(__V2SI__)));
/* Integer type in GCC machine mode DI; the code below casts vectors to this
 * type before handing them to __builtin_ia32_pand. */
typedef int di __attribute__ ((__mode__(__DI__)));
/* Overlay of a v4hi vector with scalar views of the same storage. */
typedef union {
    v4hi v;     /* vector view; this member is what gets passed to the MMX builtins */
    short s[4]; /* the same storage viewed as four shorts */
    int i[2];   /* the same storage viewed as two ints */
} _v4hi;
/* Overlay of a v2si vector with a scalar view of the same storage. */
typedef union {
    v2si v;   /* vector view; this member is what gets passed to the MMX builtins */
    int i[2]; /* the same storage viewed as two ints */
} _v2si;
The strange thing is this: if I use those unions (e.g. `_v4hi var`) in my code and pass the vector member to the MMX builtin (e.g. `var.v`), GCC produces faster code than if I use the v4hi type directly. In my tests the latter case was 10% slower. I'd expect identical results, and even identical object files down to the scheduling of the generated assembly, but that was not the case.
Is this a known issue with gcc-3.4.3? I compiled the code using -O2 -march=athlon-xp -g3. If you want a smaller test case, I could try to make one. Right now I just didn't want to waste my time in case this is a known issue or I did something stupid...
I also tried using the (Intel-style?) intrinsics via mmintrin.h. With those, the timings are nearly the same whether I use unions or vectors, but both are as slow as the builtin version above that uses vectors directly.
The function I used, was (using the unions):
/* Code for use in OpenAL; LGPL license; Copyright 2005 by Prakash Punnor */
/*
 * Prepare sign-extension from 16 bit to 32 bit for stream ST.
 *
 * Loads one v4hi from stream ST at byte position `offset` into indata, then
 * builds signmask in two steps: pand with m->v keeps only the 0x8000 bit of
 * each 16-bit lane, and pcmpeqw against m->v turns every lane whose 0x8000
 * bit was set into all ones and every other lane into zero.
 *
 * Expands to several statements and uses the caller's `entries`, `offset`,
 * `indata` and `signmask` plus the file-scope pointer `m`.
 * NOTE(review): `entries[ST].data + offset` is arithmetic on an ALvoid
 * pointer; presumably ALvoid is void, which makes this a GNU extension --
 * confirm.
 */
#define GET_SIGNMASK(ST) \
    indata.v = *(v4hi*)(entries[ST].data + offset); \
    signmask.v = (v4hi)__builtin_ia32_pand((di)indata.v, (di)m->v); \
    signmask.v = (v4hi)__builtin_ia32_pcmpeqw(signmask.v, m->v);
/*
 * Mix stream 0: start the two 32-bit accumulators from stream 0.
 *
 * After GET_SIGNMASK(0), interleaving each sample word with its sign-mask
 * word widens the samples to 32 bits: punpcklwd produces the two low lanes
 * (stored in loout), punpckhwd the two high lanes (stored in hiout).
 * Overwrites loout and hiout, so it must run before any MIX() for the same
 * offset. Expands to several statements.
 */
#define MIX_ST0 \
    GET_SIGNMASK (0);\
    \
    loout.v = (v2si)__builtin_ia32_punpcklwd(indata.v, signmask.v);\
    hiout.v = (v2si)__builtin_ia32_punpckhwd(indata.v, signmask.v);
/*
 * Sign-extension and mix of stream ST: widen the four samples of stream ST
 * at the current offset to 32 bits (same interleave as MIX_ST0) and add them
 * to the running sums in loout (low two lanes) and hiout (high two lanes),
 * using `temp` as scratch.
 *
 * GET_SIGNMASK(ST) is written without a trailing semicolon here because that
 * macro already ends in one. Expands to several statements.
 */
#define MIX(ST) \
    GET_SIGNMASK(ST) \
    temp.v = (v2si)__builtin_ia32_punpcklwd(indata.v, signmask.v); \
    loout.v = __builtin_ia32_paddd(loout.v, temp.v); \
    temp.v = (v2si)__builtin_ia32_punpckhwd(indata.v, signmask.v); \
    hiout.v = __builtin_ia32_paddd(hiout.v, temp.v);
/*
 * Manual saturation to dst: store the caller's `sample` into dst[OFFSET],
 * clamped to the 16-bit range.
 *
 * If `sample` survives a round trip through short it is stored unchanged;
 * otherwise positive values are replaced by max_audioval and all others by
 * min_audioval. Uses `sample`, `dst`, `max_audioval` and `min_audioval` from
 * the expansion site.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement (safe
 * after an unbraced if/else), and the definition no longer ends in a line
 * continuation that would splice the following source line into the macro.
 */
#define SATURATE(OFFSET) \
    do { \
        if (sample == (short)sample) \
            dst[(OFFSET)] = sample; \
        else if (sample > 0) \
            dst[(OFFSET)] = max_audioval; \
        else \
            dst[(OFFSET)] = min_audioval; \
    } while (0)
/*
 * Manually mix samples of mod_len: scalar path.
 *
 * For every `offset` in [0, mod_len) it sums sample number `offset` of all
 * `streams` entries into an int and stores the clamped result to dst[offset]
 * through SATURATE. Here `offset` indexes ALshort elements, not bytes.
 * Uses the caller's `offset`, `st`, `mod_len`, `streams`, `entries` and `dst`.
 */
#define MIX_MOD \
    for (offset=0; offset<mod_len; ++offset) { \
        int sample = 0; \
        \
        for (st=0; st<streams; ++st) \
            sample += ((ALshort*)entries[st ].data)[offset]; \
        \
        SATURATE(offset); \
    }
/*
 * Mix all remaining streams and write to dst.
 *
 * Starting from the caller's current `st`, adds streams to loout/hiout four
 * at a time until st >= streams, then packs the two 32-bit accumulators into
 * four 16-bit values with packssdw and stores them at byte position `offset`
 * in dst.
 *
 * NOTE(review): each pass touches entries[st] .. entries[st+3] after testing
 * only st < streams, so the number of remaining streams presumably has to be
 * a multiple of four -- confirm against the callers.
 */
#define LOOP_MIX \
    while (st<streams) { \
        MIX (st); \
        MIX (st+1); \
        MIX (st+2); \
        MIX (st+3); \
        st+=4; \
    } \
    \
    *(v4hi*)((void*)dst + offset) = __builtin_ia32_packssdw(loout.v, hiout.v);
/*
 * Sign-bit pattern 0x8000 for each of the four 16-bit lanes.
 * NOTE(review): where short is 16 bits, 0x8000 does not fit in a signed
 * short, so these initializers rely on the implementation's conversion.
 */
__attribute__((aligned(16))) static const short sm[4] = {0x8000,0x8000,0x8000,0x8000};
/*
 * The same four words viewed as a _v4hi; the macros read the mask as m->v.
 * NOTE(review): only the pointee is const, m itself is a modifiable pointer,
 * and the aligned attribute here applies to the pointer variable, not to sm.
 */
__attribute__((aligned(16))) static const _v4hi *m = (_v4hi*)sm;
/* One input stream handed to the mixer. */
typedef struct _alMixEntry {
    ALvoid *data; /* the stream's sample data; the mixer reads it as ALshort values */
    ALint bytes;  /* MixAudio16_MMX_MOD0 uses entries[0].bytes as the length to mix, counted in bytes */
} alMixEntry;
/*
 * Mix `streams` 16-bit input streams into dst with signed saturation, using
 * the MMX builtins.
 *
 * dst      destination buffer, written as ALshort values
 * entries  array of input streams; entries[0].bytes gives the length to mix
 * streams  number of elements of `entries` to mix
 *
 * NOTE(review): the vector loop always seeds from stream 0 and then consumes
 * the remaining streams in groups of four (see LOOP_MIX), so (streams - 1)
 * presumably must be a multiple of four; the _MOD0 suffix may refer to
 * this -- confirm.
 */
void MixAudio16_MMX_MOD0(ALshort *dst, alMixEntry *entries, int streams)
{
    int len = entries[0].bytes;
    /* remainder of len after whole groups of four ALshort values */
    int mod_len = len % (4 * sizeof(ALshort));
    int offset;
    int st;

    /* scratch registers shared by the GET_SIGNMASK / MIX* macros */
    _v4hi indata;
    _v4hi signmask;

    /* 32-bit accumulators for the low and high pair of samples */
    _v2si loout;
    _v2si hiout;

    _v2si temp;

    /*
     * Scalar pass over sample indices [0, mod_len).
     * NOTE(review): mod_len is a byte remainder but is used here as a sample
     * count, and the vector loop below starts again at offset 0 and so covers
     * the same leading part of dst -- confirm this is intended.
     */
    MIX_MOD;

    /*
     * Vector pass: `offset` is a byte offset and advances by one group of
     * four ALshort values per iteration.
     * NOTE(review): when len is not a multiple of that step, the last
     * iteration reads and writes a full group that extends past len.
     */
    for (offset=0; offset<len; offset+=4*sizeof(ALshort)) {
        MIX_ST0;
        st = 1;
        LOOP_MIX;
    }
    /* leave MMX state so that x87 floating-point code can run afterwards */
    __builtin_ia32_emms();
    return;
}
I attached the objdumps:
old.dump - using unions -> fast
n3.dump - using vectors directly -> 10% slower on my athlon-xp, even though the generated asm seems to be shorter
BTW, the buffers were 16-byte aligned.
-- Prakash Punnoor
formerly known as Prakash K. Cheemplavam
mixaudio16.o: file format elf32-i386
Disassembly of section .text: 00000000 <MixAudio16_MMX_MOD0>: __attribute__((aligned(16))) static const short sm[4] = {0x8000,0x8000,0x8000,0x8000}; __attribute__((aligned(16))) static const v4hi *m = (v4hi*)sm; void MixAudio16_MMX_MOD0(ALshort *dst, alMixEntry *entries, int streams) { 0: 55 push %ebp 1: 89 e5 mov %esp,%ebp 3: 57 push %edi 4: 56 push %esi 5: 53 push %ebx 6: 83 ec 0c sub $0xc,%esp int len = entries[0].bytes; int mod_len = len % (4 * sizeof(ALshort)); int offset; int st; v4hi indata; v4hi signmask; v2si loout; v2si hiout; v2si temp; MIX_MOD; 9: 31 db xor %ebx,%ebx b: 8b 75 0c mov 0xc(%ebp),%esi e: 8b 7d 10 mov 0x10(%ebp),%edi 11: 8b 46 04 mov 0x4(%esi),%eax 14: 89 45 f0 mov %eax,0xfffffff0(%ebp) 17: 83 e0 07 and $0x7,%eax 1a: 39 c3 cmp %eax,%ebx 1c: 89 45 ec mov %eax,0xffffffec(%ebp) 1f: 7d 47 jge 68 <MixAudio16_MMX_MOD0+0x68> 21: eb 0d jmp 30 <MixAudio16_MMX_MOD0+0x30> 23: 90 nop 24: 90 nop 25: 90 nop 26: 90 nop 27: 90 nop 28: 90 nop 29: 90 nop 2a: 90 nop 2b: 90 nop 2c: 90 nop 2d: 90 nop 2e: 90 nop 2f: 90 nop 30: 31 c9 xor %ecx,%ecx 32: 31 d2 xor %edx,%edx 34: eb 0a jmp 40 <MixAudio16_MMX_MOD0+0x40> 36: 8b 04 d6 mov (%esi,%edx,8),%eax 39: 42 inc %edx 3a: 0f bf 04 58 movswl (%eax,%ebx,2),%eax 3e: 01 c1 add %eax,%ecx 40: 39 fa cmp %edi,%edx 42: 7c f2 jl 36 <MixAudio16_MMX_MOD0+0x36> 44: 0f bf c1 movswl %cx,%eax 47: 39 c8 cmp %ecx,%eax 49: 0f 84 2a 01 00 00 je 179 <MixAudio16_MMX_MOD0+0x179> 4f: 31 c0 xor %eax,%eax 51: 8b 55 08 mov 0x8(%ebp),%edx 54: 85 c9 test %ecx,%ecx 56: 0f 9e c0 setle %al 59: 05 ff 7f 00 00 add $0x7fff,%eax 5e: 66 89 04 5a mov %ax,(%edx,%ebx,2) 62: 43 inc %ebx 63: 3b 5d ec cmp 0xffffffec(%ebp),%ebx 66: 7c c8 jl 30 <MixAudio16_MMX_MOD0+0x30> for (offset=0; offset<len; offset+=4*sizeof(ALshort)) { 68: 31 db xor %ebx,%ebx 6a: 3b 5d f0 cmp 0xfffffff0(%ebp),%ebx 6d: 0f 8d fc 00 00 00 jge 16f <MixAudio16_MMX_MOD0+0x16f> 73: 8b 06 mov (%esi),%eax 75: 8b 0d 00 00 00 00 mov 0x0,%ecx 7b: 89 45 e8 mov %eax,0xffffffe8(%ebp) 7e: 89 f6 mov 
%esi,%esi MIX_ST0; 80: 8b 55 e8 mov 0xffffffe8(%ebp),%edx 83: 0f 6f 14 1a movq (%edx,%ebx,1),%mm2 st = 1; 87: ba 01 00 00 00 mov $0x1,%edx LOOP_MIX; 8c: 39 fa cmp %edi,%edx 8e: 0f 6f c2 movq %mm2,%mm0 91: 0f db 01 pand (%ecx),%mm0 94: 0f 6f c8 movq %mm0,%mm1 97: 0f 75 09 pcmpeqw (%ecx),%mm1 9a: 0f 6f c2 movq %mm2,%mm0 9d: 0f 61 c1 punpcklwd %mm1,%mm0 a0: 0f 69 d1 punpckhwd %mm1,%mm2 a3: 0f 6f e0 movq %mm0,%mm4 a6: 0f 6f da movq %mm2,%mm3 a9: 0f 8d a7 00 00 00 jge 156 <MixAudio16_MMX_MOD0+0x156> af: 0f 6f 31 movq (%ecx),%mm6 b2: 0f 6f 29 movq (%ecx),%mm5 b5: 8d 74 26 00 lea 0x0(%esi),%esi b9: 8d bc 27 00 00 00 00 lea 0x0(%edi),%edi c0: 8b 04 d6 mov (%esi,%edx,8),%eax c3: 0f 6f 14 18 movq (%eax,%ebx,1),%mm2 c7: 8b 44 d6 08 mov 0x8(%esi,%edx,8),%eax cb: 0f 6f c2 movq %mm2,%mm0 ce: 0f db c6 pand %mm6,%mm0 d1: 0f 6f c8 movq %mm0,%mm1 d4: 0f 75 cd pcmpeqw %mm5,%mm1 d7: 0f 6f c2 movq %mm2,%mm0 da: 0f 69 d1 punpckhwd %mm1,%mm2 dd: 0f 61 c1 punpcklwd %mm1,%mm0 e0: 0f fe da paddd %mm2,%mm3 e3: 0f 6f 14 18 movq (%eax,%ebx,1),%mm2 e7: 0f fe e0 paddd %mm0,%mm4 ea: 8b 44 d6 10 mov 0x10(%esi,%edx,8),%eax ee: 0f 6f c2 movq %mm2,%mm0 f1: 0f db c6 pand %mm6,%mm0 f4: 0f 6f c8 movq %mm0,%mm1 f7: 0f 75 cd pcmpeqw %mm5,%mm1 fa: 0f 6f c2 movq %mm2,%mm0 fd: 0f 69 d1 punpckhwd %mm1,%mm2 100: 0f fe da paddd %mm2,%mm3 103: 0f 6f 14 18 movq (%eax,%ebx,1),%mm2 107: 0f 61 c1 punpcklwd %mm1,%mm0 10a: 8b 44 d6 18 mov 0x18(%esi,%edx,8),%eax 10e: 0f fe e0 paddd %mm0,%mm4 111: 83 c2 04 add $0x4,%edx 114: 39 fa cmp %edi,%edx 116: 0f 6f c2 movq %mm2,%mm0 119: 0f db c6 pand %mm6,%mm0 11c: 0f 6f c8 movq %mm0,%mm1 11f: 0f 75 cd pcmpeqw %mm5,%mm1 122: 0f 6f c2 movq %mm2,%mm0 125: 0f 69 d1 punpckhwd %mm1,%mm2 128: 0f fe da paddd %mm2,%mm3 12b: 0f 6f 14 18 movq (%eax,%ebx,1),%mm2 12f: 0f 61 c1 punpcklwd %mm1,%mm0 132: 0f fe e0 paddd %mm0,%mm4 135: 0f 6f c2 movq %mm2,%mm0 138: 0f db c6 pand %mm6,%mm0 13b: 0f 6f c8 movq %mm0,%mm1 13e: 0f 75 cd pcmpeqw %mm5,%mm1 141: 0f 6f c2 movq %mm2,%mm0 144: 0f 61 c1 
punpcklwd %mm1,%mm0 147: 0f 69 d1 punpckhwd %mm1,%mm2 14a: 0f fe e0 paddd %mm0,%mm4 14d: 0f fe da paddd %mm2,%mm3 150: 0f 8c 6a ff ff ff jl c0 <MixAudio16_MMX_MOD0+0xc0> 156: 8b 45 08 mov 0x8(%ebp),%eax 159: 0f 6f c4 movq %mm4,%mm0 15c: 0f 6b c3 packssdw %mm3,%mm0 15f: 0f 7f 04 18 movq %mm0,(%eax,%ebx,1) 163: 83 c3 08 add $0x8,%ebx 166: 3b 5d f0 cmp 0xfffffff0(%ebp),%ebx 169: 0f 8c 11 ff ff ff jl 80 <MixAudio16_MMX_MOD0+0x80> } __builtin_ia32_emms(); 16f: 0f 77 emms return; 171: 83 c4 0c add $0xc,%esp 174: 5b pop %ebx 175: 5e pop %esi 176: 5f pop %edi 177: 5d pop %ebp 178: c3 ret 179: 8b 55 08 mov 0x8(%ebp),%edx 17c: 66 89 0c 5a mov %cx,(%edx,%ebx,2) 180: 43 inc %ebx 181: 3b 5d ec cmp 0xffffffec(%ebp),%ebx 184: e9 dd fe ff ff jmp 66 <MixAudio16_MMX_MOD0+0x66>
mixaudio16.o: file format elf32-i386 Disassembly of section .text: 00000000 <MixAudio16_MMX_MOD0>: __attribute__((aligned(16))) static const short sm[4] = {0x8000,0x8000,0x8000,0x8000}; __attribute__((aligned(16))) static const _v4hi *m = (_v4hi*)sm; void MixAudio16_MMX_MOD0(ALshort *dst, alMixEntry *entries, int streams) { 0: 55 push %ebp 1: 89 e5 mov %esp,%ebp 3: 57 push %edi 4: 56 push %esi 5: 53 push %ebx 6: 83 ec 0c sub $0xc,%esp int len = entries[0].bytes; int mod_len = len % (4 * sizeof(ALshort)); int offset; int st; _v4hi indata; _v4hi signmask; _v2si loout; _v2si hiout; _v2si temp; MIX_MOD; 9: 31 db xor %ebx,%ebx b: 8b 75 0c mov 0xc(%ebp),%esi e: 8b 7d 10 mov 0x10(%ebp),%edi 11: 8b 46 04 mov 0x4(%esi),%eax 14: 89 45 f0 mov %eax,0xfffffff0(%ebp) 17: 83 e0 07 and $0x7,%eax 1a: 39 c3 cmp %eax,%ebx 1c: 89 45 ec mov %eax,0xffffffec(%ebp) 1f: 7d 47 jge 68 <MixAudio16_MMX_MOD0+0x68> 21: eb 0d jmp 30 <MixAudio16_MMX_MOD0+0x30> 23: 90 nop 24: 90 nop 25: 90 nop 26: 90 nop 27: 90 nop 28: 90 nop 29: 90 nop 2a: 90 nop 2b: 90 nop 2c: 90 nop 2d: 90 nop 2e: 90 nop 2f: 90 nop 30: 31 c9 xor %ecx,%ecx 32: 31 d2 xor %edx,%edx 34: eb 0a jmp 40 <MixAudio16_MMX_MOD0+0x40> 36: 8b 04 d6 mov (%esi,%edx,8),%eax 39: 42 inc %edx 3a: 0f bf 04 58 movswl (%eax,%ebx,2),%eax 3e: 01 c1 add %eax,%ecx 40: 39 fa cmp %edi,%edx 42: 7c f2 jl 36 <MixAudio16_MMX_MOD0+0x36> 44: 0f bf c1 movswl %cx,%eax 47: 39 c8 cmp %ecx,%eax 49: 0f 84 36 01 00 00 je 185 <MixAudio16_MMX_MOD0+0x185> 4f: 31 c0 xor %eax,%eax 51: 8b 55 08 mov 0x8(%ebp),%edx 54: 85 c9 test %ecx,%ecx 56: 0f 9e c0 setle %al 59: 05 ff 7f 00 00 add $0x7fff,%eax 5e: 66 89 04 5a mov %ax,(%edx,%ebx,2) 62: 43 inc %ebx 63: 3b 5d ec cmp 0xffffffec(%ebp),%ebx 66: 7c c8 jl 30 <MixAudio16_MMX_MOD0+0x30> for (offset=0; offset<len; offset+=4*sizeof(ALshort)) { 68: 31 db xor %ebx,%ebx 6a: 3b 5d f0 cmp 0xfffffff0(%ebp),%ebx 6d: 0f 8d 08 01 00 00 jge 17b <MixAudio16_MMX_MOD0+0x17b> 73: 8b 06 mov (%esi),%eax 75: 8b 0d 00 00 00 00 mov 0x0,%ecx 7b: 89 45 e8 
mov %eax,0xffffffe8(%ebp) 7e: 89 f6 mov %esi,%esi MIX_ST0; 80: 8b 55 e8 mov 0xffffffe8(%ebp),%edx 83: 0f 6f 04 1a movq (%edx,%ebx,1),%mm0 st = 1; 87: ba 01 00 00 00 mov $0x1,%edx LOOP_MIX; 8c: 39 fa cmp %edi,%edx 8e: 0f 6f d0 movq %mm0,%mm2 91: 0f db 01 pand (%ecx),%mm0 94: 0f 6f c8 movq %mm0,%mm1 97: 0f 75 09 pcmpeqw (%ecx),%mm1 9a: 0f 6f c2 movq %mm2,%mm0 9d: 0f 61 c1 punpcklwd %mm1,%mm0 a0: 0f 6f e8 movq %mm0,%mm5 a3: 0f 6f c2 movq %mm2,%mm0 a6: 0f 69 c1 punpckhwd %mm1,%mm0 a9: 0f 6f e0 movq %mm0,%mm4 ac: 0f 8d b0 00 00 00 jge 162 <MixAudio16_MMX_MOD0+0x162> b2: 0f 6f 19 movq (%ecx),%mm3 b5: 8d 74 26 00 lea 0x0(%esi),%esi b9: 8d bc 27 00 00 00 00 lea 0x0(%edi),%edi c0: 8b 04 d6 mov (%esi,%edx,8),%eax c3: 0f 6f cb movq %mm3,%mm1 c6: 0f 6f 04 18 movq (%eax,%ebx,1),%mm0 ca: 8b 44 d6 08 mov 0x8(%esi,%edx,8),%eax ce: 0f 6f d0 movq %mm0,%mm2 d1: 0f db c3 pand %mm3,%mm0 d4: 0f 75 c8 pcmpeqw %mm0,%mm1 d7: 0f 6f c2 movq %mm2,%mm0 da: 0f 61 c1 punpcklwd %mm1,%mm0 dd: 0f fe e8 paddd %mm0,%mm5 e0: 0f 6f c2 movq %mm2,%mm0 e3: 0f 69 c1 punpckhwd %mm1,%mm0 e6: 0f fe e0 paddd %mm0,%mm4 e9: 0f 6f 04 18 movq (%eax,%ebx,1),%mm0 ed: 0f 6f cb movq %mm3,%mm1 f0: 8b 44 d6 10 mov 0x10(%esi,%edx,8),%eax f4: 0f 6f d0 movq %mm0,%mm2 f7: 0f db c3 pand %mm3,%mm0 fa: 0f 75 c8 pcmpeqw %mm0,%mm1 fd: 0f 6f c2 movq %mm2,%mm0 100: 0f 61 c1 punpcklwd %mm1,%mm0 103: 0f fe e8 paddd %mm0,%mm5 106: 0f 6f c2 movq %mm2,%mm0 109: 0f 69 c1 punpckhwd %mm1,%mm0 10c: 0f fe e0 paddd %mm0,%mm4 10f: 0f 6f 04 18 movq (%eax,%ebx,1),%mm0 113: 0f 6f cb movq %mm3,%mm1 116: 8b 44 d6 18 mov 0x18(%esi,%edx,8),%eax 11a: 83 c2 04 add $0x4,%edx 11d: 39 fa cmp %edi,%edx 11f: 0f 6f d0 movq %mm0,%mm2 122: 0f db c3 pand %mm3,%mm0 125: 0f 75 c8 pcmpeqw %mm0,%mm1 128: 0f 6f c2 movq %mm2,%mm0 12b: 0f 61 c1 punpcklwd %mm1,%mm0 12e: 0f fe e8 paddd %mm0,%mm5 131: 0f 6f c2 movq %mm2,%mm0 134: 0f 69 c1 punpckhwd %mm1,%mm0 137: 0f fe e0 paddd %mm0,%mm4 13a: 0f 6f 04 18 movq (%eax,%ebx,1),%mm0 13e: 0f 6f cb movq %mm3,%mm1 141: 0f 6f d0 
movq %mm0,%mm2 144: 0f db c3 pand %mm3,%mm0 147: 0f 75 c8 pcmpeqw %mm0,%mm1 14a: 0f 6f c2 movq %mm2,%mm0 14d: 0f 61 c1 punpcklwd %mm1,%mm0 150: 0f fe e8 paddd %mm0,%mm5 153: 0f 6f c2 movq %mm2,%mm0 156: 0f 69 c1 punpckhwd %mm1,%mm0 159: 0f fe e0 paddd %mm0,%mm4 15c: 0f 8c 5e ff ff ff jl c0 <MixAudio16_MMX_MOD0+0xc0> 162: 8b 45 08 mov 0x8(%ebp),%eax 165: 0f 6f c5 movq %mm5,%mm0 168: 0f 6b c4 packssdw %mm4,%mm0 16b: 0f 7f 04 18 movq %mm0,(%eax,%ebx,1) 16f: 83 c3 08 add $0x8,%ebx 172: 3b 5d f0 cmp 0xfffffff0(%ebp),%ebx 175: 0f 8c 05 ff ff ff jl 80 <MixAudio16_MMX_MOD0+0x80> } __builtin_ia32_emms(); 17b: 0f 77 emms return; 17d: 83 c4 0c add $0xc,%esp 180: 5b pop %ebx 181: 5e pop %esi 182: 5f pop %edi 183: 5d pop %ebp 184: c3 ret 185: 8b 55 08 mov 0x8(%ebp),%edx 188: 66 89 0c 5a mov %cx,(%edx,%ebx,2) 18c: 43 inc %ebx 18d: 3b 5d ec cmp 0xffffffec(%ebp),%ebx 190: e9 d1 fe ff ff jmp 66 <MixAudio16_MMX_MOD0+0x66>
signature.asc
Description: OpenPGP digital signature