Hi all, gcc-4.2.2 seems to be generating wrong/misaligned code for movapd. I have used the same test case mentioned here (for a very similar bug): http://gcc.gnu.org/bugzilla/attachment.cgi?id=6012
The relevant information about the version and the files is as follows: The version of gcc: $ gcc -v Using built-in specs. Target: i386-redhat-linux Configured with: ../../src/gcc-4.2.2/configure --prefix=/depot/gcc-4.2.2-static --disable-shared --enable-threads=posix --disable-checking --with-system-zlib --enable-__cxa_atexit --disable-libunwind-exceptions --enable-languages=c,c++,objc,fortran --with-cpu=generic --host=i386-redhat-linux Thread model: posix $ gcc -g -O2 -funsigned-bitfields -fsigned-char -ffloat-store -Wformat -msse2 -mfpmath=sse -c sse.c $ objdump -Sd sse.o > sse_asm.txt $ grep movapd sse_asm.txt a3: 66 0f 28 c2 movapd %xmm2,%xmm0 cb: 66 0f 28 c1 movapd %xmm1,%xmm0 f6: 66 0f 28 c4 movapd %xmm4,%xmm0 165: 66 0f 29 9d 38 fe ff movapd %xmm3,0xfffffe38(%ebp) #<============ 185: 66 0f 28 9d 38 fe ff movapd 0xfffffe38(%ebp),%xmm3 Is this a known issue? If so, are there any suggested workarounds (other than upgrading to a later version :-) )? Regards, Gowri Kumar
/*
 * Test case attached to the movapd report above.
 *
 * sin and cos are declared by hand here instead of through <math.h>.
 */
double sin(double x);
double cos(double x);

/* A vector of three double components. */
typedef struct { double x, y, z; } DVECTOR;

/* Four DVECTOR rows, m[0] through m[3]. */
typedef struct { DVECTOR m[4]; } DMATRIX;

/*
 * Fill *xfp from lat and lon.
 *
 * Rows m[0]..m[2] are written from the sines and cosines of lat and lon.
 * Row m[3] is then derived from those rows and from the vector that
 * geo_lla_xyz() produces for (lat, lon, 0.0).
 *
 * xfp  destination; all twelve components are overwritten
 * lat  passed to sin()/cos() and stored as lla.x
 * lon  passed to sin()/cos() and stored as lla.y
 *
 * NOTE(review): the name suggests a geocentric-to-topocentric transform
 * with angles in radians, but nothing visible here states that - confirm.
 */
void create_geo_to_topo(DMATRIX *xfp, double lat, double lon)
{
    double cos_lat;
    {
        double sin_lon, cos_lon, sin_lat;

        /* Four libm calls; their results are reused for every row below. */
        sin_lon = sin(lon);
        cos_lon = cos(lon);
        sin_lat = sin(lat);
        cos_lat = cos(lat);

        /* x components of rows 0..2 */
        xfp->m[0].x = -sin_lon;
        xfp->m[1].x = cos_lon;
        xfp->m[2].x = 0.0;

        /* y components of rows 0..2 */
        xfp->m[0].y = -cos_lon * sin_lat;
        xfp->m[1].y = -sin_lon * sin_lat;
        xfp->m[2].y = cos_lat;

        /* z components of rows 0..2 */
        xfp->m[0].z = cos_lon * cos_lat;
        xfp->m[1].z = sin_lon * cos_lat;
        xfp->m[2].z = sin_lat;
        {
            DVECTOR geo;
            DVECTOR lla;

            lla.x = lat;
            lla.y = lon;
            lla.z = 0.0;

            /*
             * NOTE(review): geo_lla_xyz has no declaration in the visible
             * source, so this is an implicit function declaration (int
             * return, unchecked arguments). It is presumably expected to
             * fill geo from lla - confirm against its definition.
             */
            geo_lla_xyz( &lla, &geo);

            /*
             * For each component c of row 3:
             *   m[3].c = -geo.x * m[0].c - geo.y * m[1].c - geo.z * m[2].c
             */
            xfp->m[3].x = -geo.x * xfp->m[0].x -
                geo.y * xfp->m[1].x -
                geo.z * xfp->m[2].x;
            xfp->m[3].y = -geo.x * xfp->m[0].y -
                geo.y * xfp->m[1].y -
                geo.z * xfp->m[2].y;
            xfp->m[3].z = -geo.x * xfp->m[0].z -
                geo.y * xfp->m[1].z -
                geo.z * xfp->m[2].z;
        }
    }
}
sse.i
Description: Binary data
sse.s
Description: Binary data
sse.o: file format elf32-i386 Disassembly of section .text: 00000000 <create_geo_to_topo>: void create_geo_to_topo(DMATRIX *xfp, double lat, double lon) { 0: 55 push %ebp 1: 89 e5 mov %esp,%ebp 3: 53 push %ebx 4: 81 ec d4 01 00 00 sub $0x1d4,%esp a: 8b 45 0c mov 0xc(%ebp),%eax d: 8b 5d 08 mov 0x8(%ebp),%ebx 10: 89 85 50 fe ff ff mov %eax,0xfffffe50(%ebp) 16: 8b 45 10 mov 0x10(%ebp),%eax 19: 89 85 54 fe ff ff mov %eax,0xfffffe54(%ebp) 1f: 8b 45 14 mov 0x14(%ebp),%eax 22: 89 85 48 fe ff ff mov %eax,0xfffffe48(%ebp) 28: 8b 45 18 mov 0x18(%ebp),%eax 2b: 89 85 4c fe ff ff mov %eax,0xfffffe4c(%ebp) double cos_lat; { double sin_lon, cos_lon, sin_lat; sin_lon = sin(lon); 31: f2 0f 10 85 48 fe ff movsd 0xfffffe48(%ebp),%xmm0 38: ff 39: f2 0f 11 04 24 movsd %xmm0,(%esp) 3e: e8 fc ff ff ff call 3f <create_geo_to_topo+0x3f> cos_lon = cos(lon); 43: f2 0f 10 85 48 fe ff movsd 0xfffffe48(%ebp),%xmm0 4a: ff 4b: f2 0f 11 04 24 movsd %xmm0,(%esp) 50: dd 5d e8 fstpl 0xffffffe8(%ebp) 53: e8 fc ff ff ff call 54 <create_geo_to_topo+0x54> sin_lat = sin(lat); 58: f2 0f 10 85 50 fe ff movsd 0xfffffe50(%ebp),%xmm0 5f: ff 60: f2 0f 11 04 24 movsd %xmm0,(%esp) 65: dd 5d e0 fstpl 0xffffffe0(%ebp) 68: e8 fc ff ff ff call 69 <create_geo_to_topo+0x69> cos_lat = cos(lat); 6d: f2 0f 10 85 50 fe ff movsd 0xfffffe50(%ebp),%xmm0 74: ff 75: f2 0f 11 04 24 movsd %xmm0,(%esp) 7a: dd 5d d8 fstpl 0xffffffd8(%ebp) 7d: e8 fc ff ff ff call 7e <create_geo_to_topo+0x7e> xfp->m[0].x = -sin_lon; xfp->m[1].x = cos_lon; xfp->m[2].x = 0.0; xfp->m[0].y = -cos_lon * sin_lat; xfp->m[1].y = -sin_lon * sin_lat; xfp->m[2].y = cos_lat; xfp->m[0].z = cos_lon * cos_lat; xfp->m[1].z = sin_lon * cos_lat; xfp->m[2].z = sin_lat; { DVECTOR geo; DVECTOR lla; lla.x = lat; lla.y = lon; lla.z = 0.0; geo_lla_xyz( &lla, &geo); 82: 8d 85 70 fe ff ff lea 0xfffffe70(%ebp),%eax 88: f2 0f 10 55 e8 movsd 0xffffffe8(%ebp),%xmm2 8d: f2 0f 10 4d e0 movsd 0xffffffe0(%ebp),%xmm1 92: f2 0f 10 65 d8 movsd 0xffffffd8(%ebp),%xmm4 97: f2 0f 10 1d 00 
00 00 movsd 0x0,%xmm3 9e: 00 9f: 66 0f 57 ed xorpd %xmm5,%xmm5 a3: 66 0f 28 c2 movapd %xmm2,%xmm0 a7: 66 0f 57 c3 xorpd %xmm3,%xmm0 ab: f2 0f 11 45 d0 movsd %xmm0,0xffffffd0(%ebp) b0: f2 0f 10 45 d0 movsd 0xffffffd0(%ebp),%xmm0 b5: dd 5d f0 fstpl 0xfffffff0(%ebp) b8: f2 0f 11 03 movsd %xmm0,(%ebx) bc: f2 0f 11 4b 18 movsd %xmm1,0x18(%ebx) c1: f2 0f 11 6b 30 movsd %xmm5,0x30(%ebx) c6: f2 0f 59 55 f0 mulsd 0xfffffff0(%ebp),%xmm2 cb: 66 0f 28 c1 movapd %xmm1,%xmm0 cf: f2 0f 59 4d f0 mulsd 0xfffffff0(%ebp),%xmm1 d4: 66 0f 57 c3 xorpd %xmm3,%xmm0 d8: f2 0f 11 45 c8 movsd %xmm0,0xffffffc8(%ebp) dd: f2 0f 10 45 c8 movsd 0xffffffc8(%ebp),%xmm0 e2: f2 0f 59 45 d8 mulsd 0xffffffd8(%ebp),%xmm0 e7: f2 0f 11 45 c0 movsd %xmm0,0xffffffc0(%ebp) ec: f2 0f 10 45 c0 movsd 0xffffffc0(%ebp),%xmm0 f1: f2 0f 11 43 08 movsd %xmm0,0x8(%ebx) f6: 66 0f 28 c4 movapd %xmm4,%xmm0 fa: f2 0f 59 45 d0 mulsd 0xffffffd0(%ebp),%xmm0 ff: f2 0f 11 45 b8 movsd %xmm0,0xffffffb8(%ebp) 104: f2 0f 10 45 b8 movsd 0xffffffb8(%ebp),%xmm0 109: f2 0f 11 43 20 movsd %xmm0,0x20(%ebx) 10e: f2 0f 10 45 f0 movsd 0xfffffff0(%ebp),%xmm0 113: f2 0f 11 43 38 movsd %xmm0,0x38(%ebx) 118: f2 0f 11 4d b0 movsd %xmm1,0xffffffb0(%ebp) 11d: f2 0f 10 45 b0 movsd 0xffffffb0(%ebp),%xmm0 122: f2 0f 11 43 10 movsd %xmm0,0x10(%ebx) 127: f2 0f 11 55 a8 movsd %xmm2,0xffffffa8(%ebp) 12c: f2 0f 10 45 a8 movsd 0xffffffa8(%ebp),%xmm0 131: f2 0f 11 63 40 movsd %xmm4,0x40(%ebx) 136: f2 0f 11 43 28 movsd %xmm0,0x28(%ebx) 13b: f2 0f 10 85 50 fe ff movsd 0xfffffe50(%ebp),%xmm0 142: ff 143: f2 0f 11 85 58 fe ff movsd %xmm0,0xfffffe58(%ebp) 14a: ff 14b: f2 0f 10 85 48 fe ff movsd 0xfffffe48(%ebp),%xmm0 152: ff 153: f2 0f 11 85 60 fe ff movsd %xmm0,0xfffffe60(%ebp) 15a: ff 15b: 89 44 24 04 mov %eax,0x4(%esp) 15f: 8d 85 58 fe ff ff lea 0xfffffe58(%ebp),%eax 165: 66 0f 29 9d 38 fe ff movapd %xmm3,0xfffffe38(%ebp) 16c: ff 16d: f2 0f 11 ad 68 fe ff movsd %xmm5,0xfffffe68(%ebp) 174: ff 175: 89 04 24 mov %eax,(%esp) 178: e8 fc ff ff ff call 179 
<create_geo_to_topo+0x179> xfp->m[3].x = -geo.x * xfp->m[0].x - 17d: f2 0f 10 a5 70 fe ff movsd 0xfffffe70(%ebp),%xmm4 184: ff 185: 66 0f 28 9d 38 fe ff movapd 0xfffffe38(%ebp),%xmm3 18c: ff 18d: f2 0f 10 95 78 fe ff movsd 0xfffffe78(%ebp),%xmm2 194: ff 195: f2 0f 10 8d 80 fe ff movsd 0xfffffe80(%ebp),%xmm1 19c: ff 19d: f2 0f 11 65 a0 movsd %xmm4,0xffffffa0(%ebp) 1a2: f2 0f 10 45 a0 movsd 0xffffffa0(%ebp),%xmm0 1a7: 66 0f 57 c3 xorpd %xmm3,%xmm0 1ab: f2 0f 11 45 98 movsd %xmm0,0xffffff98(%ebp) 1b0: f2 0f 10 03 movsd (%ebx),%xmm0 1b4: f2 0f 11 55 80 movsd %xmm2,0xffffff80(%ebp) 1b9: f2 0f 11 45 90 movsd %xmm0,0xffffff90(%ebp) 1be: f2 0f 10 45 98 movsd 0xffffff98(%ebp),%xmm0 1c3: f2 0f 59 45 90 mulsd 0xffffff90(%ebp),%xmm0 1c8: f2 0f 11 45 88 movsd %xmm0,0xffffff88(%ebp) 1cd: f2 0f 10 43 18 movsd 0x18(%ebx),%xmm0 1d2: f2 0f 11 8d 60 ff ff movsd %xmm1,0xffffff60(%ebp) 1d9: ff 1da: f2 0f 11 85 78 ff ff movsd %xmm0,0xffffff78(%ebp) 1e1: ff 1e2: f2 0f 10 45 80 movsd 0xffffff80(%ebp),%xmm0 1e7: f2 0f 59 85 78 ff ff mulsd 0xffffff78(%ebp),%xmm0 1ee: ff 1ef: f2 0f 11 85 70 ff ff movsd %xmm0,0xffffff70(%ebp) 1f6: ff 1f7: f2 0f 10 45 88 movsd 0xffffff88(%ebp),%xmm0 1fc: f2 0f 5c 85 70 ff ff subsd 0xffffff70(%ebp),%xmm0 203: ff 204: f2 0f 11 85 68 ff ff movsd %xmm0,0xffffff68(%ebp) 20b: ff 20c: f2 0f 10 43 30 movsd 0x30(%ebx),%xmm0 211: f2 0f 11 85 58 ff ff movsd %xmm0,0xffffff58(%ebp) 218: ff 219: f2 0f 10 85 60 ff ff movsd 0xffffff60(%ebp),%xmm0 220: ff 221: f2 0f 59 85 58 ff ff mulsd 0xffffff58(%ebp),%xmm0 228: ff 229: f2 0f 11 85 50 ff ff movsd %xmm0,0xffffff50(%ebp) 230: ff 231: f2 0f 10 85 68 ff ff movsd 0xffffff68(%ebp),%xmm0 238: ff 239: f2 0f 5c 85 50 ff ff subsd 0xffffff50(%ebp),%xmm0 240: ff 241: f2 0f 11 85 48 ff ff movsd %xmm0,0xffffff48(%ebp) 248: ff 249: f2 0f 10 85 48 ff ff movsd 0xffffff48(%ebp),%xmm0 250: ff 251: f2 0f 11 43 48 movsd %xmm0,0x48(%ebx) geo.y * xfp->m[1].x - geo.z * xfp->m[2].x; xfp->m[3].y = -geo.x * xfp->m[0].y - 256: f2 0f 11 a5 90 fe ff 
movsd %xmm4,0xfffffe90(%ebp) 25d: ff 25e: f2 0f 10 85 90 fe ff movsd 0xfffffe90(%ebp),%xmm0 265: ff 266: 66 0f 57 c3 xorpd %xmm3,%xmm0 26a: f2 0f 11 85 a0 fe ff movsd %xmm0,0xfffffea0(%ebp) 271: ff 272: f2 0f 10 43 08 movsd 0x8(%ebx),%xmm0 277: f2 0f 11 95 b0 fe ff movsd %xmm2,0xfffffeb0(%ebp) 27e: ff 27f: f2 0f 11 85 40 ff ff movsd %xmm0,0xffffff40(%ebp) 286: ff 287: f2 0f 10 85 a0 fe ff movsd 0xfffffea0(%ebp),%xmm0 28e: ff 28f: f2 0f 59 85 40 ff ff mulsd 0xffffff40(%ebp),%xmm0 296: ff 297: f2 0f 11 85 38 ff ff movsd %xmm0,0xffffff38(%ebp) 29e: ff 29f: f2 0f 10 43 20 movsd 0x20(%ebx),%xmm0 2a4: f2 0f 11 8d c0 fe ff movsd %xmm1,0xfffffec0(%ebp) 2ab: ff 2ac: f2 0f 11 85 30 ff ff movsd %xmm0,0xffffff30(%ebp) 2b3: ff 2b4: f2 0f 10 85 b0 fe ff movsd 0xfffffeb0(%ebp),%xmm0 2bb: ff 2bc: f2 0f 59 85 30 ff ff mulsd 0xffffff30(%ebp),%xmm0 2c3: ff 2c4: f2 0f 11 85 28 ff ff movsd %xmm0,0xffffff28(%ebp) 2cb: ff 2cc: f2 0f 10 85 38 ff ff movsd 0xffffff38(%ebp),%xmm0 2d3: ff 2d4: f2 0f 5c 85 28 ff ff subsd 0xffffff28(%ebp),%xmm0 2db: ff 2dc: f2 0f 11 85 20 ff ff movsd %xmm0,0xffffff20(%ebp) 2e3: ff 2e4: f2 0f 10 43 38 movsd 0x38(%ebx),%xmm0 2e9: f2 0f 11 85 18 ff ff movsd %xmm0,0xffffff18(%ebp) 2f0: ff 2f1: f2 0f 10 85 c0 fe ff movsd 0xfffffec0(%ebp),%xmm0 2f8: ff 2f9: f2 0f 59 85 18 ff ff mulsd 0xffffff18(%ebp),%xmm0 300: ff 301: f2 0f 11 85 10 ff ff movsd %xmm0,0xffffff10(%ebp) 308: ff 309: f2 0f 10 85 20 ff ff movsd 0xffffff20(%ebp),%xmm0 310: ff 311: f2 0f 5c 85 10 ff ff subsd 0xffffff10(%ebp),%xmm0 318: ff 319: f2 0f 11 85 08 ff ff movsd %xmm0,0xffffff08(%ebp) 320: ff 321: f2 0f 10 85 08 ff ff movsd 0xffffff08(%ebp),%xmm0 328: ff 329: f2 0f 11 43 50 movsd %xmm0,0x50(%ebx) geo.y * xfp->m[1].y - geo.z * xfp->m[2].y; xfp->m[3].z = -geo.x * xfp->m[0].z - 32e: f2 0f 11 a5 88 fe ff movsd %xmm4,0xfffffe88(%ebp) 335: ff 336: f2 0f 10 85 88 fe ff movsd 0xfffffe88(%ebp),%xmm0 33d: ff 33e: 66 0f 57 c3 xorpd %xmm3,%xmm0 342: f2 0f 11 85 98 fe ff movsd %xmm0,0xfffffe98(%ebp) 349: ff 
34a: f2 0f 10 43 10 movsd 0x10(%ebx),%xmm0 34f: f2 0f 11 85 00 ff ff movsd %xmm0,0xffffff00(%ebp) 356: ff 357: f2 0f 10 85 98 fe ff movsd 0xfffffe98(%ebp),%xmm0 35e: ff 35f: f2 0f 59 85 00 ff ff mulsd 0xffffff00(%ebp),%xmm0 366: ff 367: f2 0f 11 85 f8 fe ff movsd %xmm0,0xfffffef8(%ebp) 36e: ff 36f: f2 0f 11 95 a8 fe ff movsd %xmm2,0xfffffea8(%ebp) 376: ff 377: f2 0f 10 43 28 movsd 0x28(%ebx),%xmm0 37c: f2 0f 11 8d b8 fe ff movsd %xmm1,0xfffffeb8(%ebp) 383: ff 384: f2 0f 11 85 f0 fe ff movsd %xmm0,0xfffffef0(%ebp) 38b: ff 38c: f2 0f 10 85 a8 fe ff movsd 0xfffffea8(%ebp),%xmm0 393: ff 394: f2 0f 59 85 f0 fe ff mulsd 0xfffffef0(%ebp),%xmm0 39b: ff 39c: f2 0f 11 85 e8 fe ff movsd %xmm0,0xfffffee8(%ebp) 3a3: ff 3a4: f2 0f 10 85 f8 fe ff movsd 0xfffffef8(%ebp),%xmm0 3ab: ff 3ac: f2 0f 5c 85 e8 fe ff subsd 0xfffffee8(%ebp),%xmm0 3b3: ff 3b4: f2 0f 11 85 e0 fe ff movsd %xmm0,0xfffffee0(%ebp) 3bb: ff 3bc: f2 0f 10 43 40 movsd 0x40(%ebx),%xmm0 3c1: f2 0f 11 85 d8 fe ff movsd %xmm0,0xfffffed8(%ebp) 3c8: ff 3c9: f2 0f 10 85 b8 fe ff movsd 0xfffffeb8(%ebp),%xmm0 3d0: ff 3d1: f2 0f 59 85 d8 fe ff mulsd 0xfffffed8(%ebp),%xmm0 3d8: ff 3d9: f2 0f 11 85 d0 fe ff movsd %xmm0,0xfffffed0(%ebp) 3e0: ff 3e1: f2 0f 10 85 e0 fe ff movsd 0xfffffee0(%ebp),%xmm0 3e8: ff 3e9: f2 0f 5c 85 d0 fe ff subsd 0xfffffed0(%ebp),%xmm0 3f0: ff 3f1: f2 0f 11 85 c8 fe ff movsd %xmm0,0xfffffec8(%ebp) 3f8: ff 3f9: f2 0f 10 85 c8 fe ff movsd 0xfffffec8(%ebp),%xmm0 400: ff 401: f2 0f 11 43 58 movsd %xmm0,0x58(%ebx) geo.y * xfp->m[1].z - geo.z * xfp->m[2].z; } } } 406: 81 c4 d4 01 00 00 add $0x1d4,%esp 40c: 5b pop %ebx 40d: 5d pop %ebp 40e: c3 ret