Why does GCC copy vectorized buffers to and from the stack? Am I doing something wrong?
=== Compiler: === gcc -v Using built-in specs. Target: x86_64-redhat-linux Configured with: ../configure --prefix=/usr --mandir=/usr/share/man --infodir=/usr/share/info --enable-shared --enable-threads=posix --enable-checking=release --with-system-zlib --enable-__cxa_atexit --disable-libunwind-exceptions --enable-libgcj-multifile --enable-languages=c,c++,objc,obj-c++,java,fortran,ada --enable-java-awt=gtk --disable-dssi --enable-plugin --with-java-home=/usr/lib/jvm/java-1.4.2-gcj-1.4.2.0/jre --with-cpu=generic --host=x86_64-redhat-linux Thread model: posix gcc version 4.1.2 20080704 (Red Hat 4.1.2-46) === Source a.c: === typedef int BLOCK512 __attribute__((__vector_size__(512))); void f (BLOCK512 *d, const BLOCK512 *s0, const BLOCK512 *s1) { *d = *s0 ^ *s1; } === Command: === gcc -O3 a.c -c -o a.o === Result (note 3 calls to memcpy): === Disassembly of section .text: 0000000000000000 <f>: 0: 41 54 push %r12 2: 49 89 fc mov %rdi,%r12 5: 53 push %rbx 6: 48 89 d3 mov %rdx,%rbx 9: ba 00 02 00 00 mov $0x200,%edx e: 48 81 ec 08 06 00 00 sub $0x608,%rsp 15: 48 8d bc 24 00 02 00 lea 0x200(%rsp),%rdi 1c: 00 1d: e8 00 00 00 00 callq 22 <f+0x22> 1e: R_X86_64_PC32 memcpy+0xfffffffffffffffc 22: 48 8d bc 24 00 04 00 lea 0x400(%rsp),%rdi 29: 00 2a: 48 89 de mov %rbx,%rsi 2d: ba 00 02 00 00 mov $0x200,%edx 32: e8 00 00 00 00 callq 37 <f+0x37> 33: R_X86_64_PC32 memcpy+0xfffffffffffffffc 37: 66 0f 6f 84 24 00 04 movdqa 0x400(%rsp),%xmm0 3e: 00 00 40: 48 89 e6 mov %rsp,%rsi 43: 4c 89 e7 mov %r12,%rdi 46: ba 00 02 00 00 mov $0x200,%edx 4b: 66 0f ef 84 24 00 02 pxor 0x200(%rsp),%xmm0 52: 00 00 54: 66 0f 7f 04 24 movdqa %xmm0,(%rsp) 59: 66 0f 6f 84 24 10 04 movdqa 0x410(%rsp),%xmm0 60: 00 00 62: 66 0f ef 84 24 10 02 pxor 0x210(%rsp),%xmm0 69: 00 00 6b: 66 0f 7f 44 24 10 movdqa %xmm0,0x10(%rsp) 71: 66 0f 6f 84 24 20 04 movdqa 0x420(%rsp),%xmm0 78: 00 00 7a: 66 0f ef 84 24 20 02 pxor 0x220(%rsp),%xmm0 81: 00 00 83: 66 0f 7f 44 24 20 movdqa %xmm0,0x20(%rsp) 89: 66 0f 6f 84 24 30 04 movdqa 
0x430(%rsp),%xmm0 90: 00 00 92: 66 0f ef 84 24 30 02 pxor 0x230(%rsp),%xmm0 99: 00 00 9b: 66 0f 7f 44 24 30 movdqa %xmm0,0x30(%rsp) a1: 66 0f 6f 84 24 40 04 movdqa 0x440(%rsp),%xmm0 a8: 00 00 aa: 66 0f ef 84 24 40 02 pxor 0x240(%rsp),%xmm0 b1: 00 00 b3: 66 0f 7f 44 24 40 movdqa %xmm0,0x40(%rsp) b9: 66 0f 6f 84 24 50 04 movdqa 0x450(%rsp),%xmm0 c0: 00 00 c2: 66 0f ef 84 24 50 02 pxor 0x250(%rsp),%xmm0 c9: 00 00 cb: 66 0f 7f 44 24 50 movdqa %xmm0,0x50(%rsp) d1: 66 0f 6f 84 24 60 04 movdqa 0x460(%rsp),%xmm0 d8: 00 00 da: 66 0f ef 84 24 60 02 pxor 0x260(%rsp),%xmm0 e1: 00 00 e3: 66 0f 7f 44 24 60 movdqa %xmm0,0x60(%rsp) e9: 66 0f 6f 84 24 70 04 movdqa 0x470(%rsp),%xmm0 f0: 00 00 f2: 66 0f ef 84 24 70 02 pxor 0x270(%rsp),%xmm0 f9: 00 00 fb: 66 0f 7f 44 24 70 movdqa %xmm0,0x70(%rsp) 101: 66 0f 6f 84 24 80 04 movdqa 0x480(%rsp),%xmm0 108: 00 00 10a: 66 0f ef 84 24 80 02 pxor 0x280(%rsp),%xmm0 111: 00 00 113: 66 0f 7f 84 24 80 00 movdqa %xmm0,0x80(%rsp) 11a: 00 00 11c: 66 0f 6f 84 24 90 04 movdqa 0x490(%rsp),%xmm0 123: 00 00 125: 66 0f ef 84 24 90 02 pxor 0x290(%rsp),%xmm0 12c: 00 00 12e: 66 0f 7f 84 24 90 00 movdqa %xmm0,0x90(%rsp) 135: 00 00 137: 66 0f 6f 84 24 a0 04 movdqa 0x4a0(%rsp),%xmm0 13e: 00 00 140: 66 0f ef 84 24 a0 02 pxor 0x2a0(%rsp),%xmm0 147: 00 00 149: 66 0f 7f 84 24 a0 00 movdqa %xmm0,0xa0(%rsp) 150: 00 00 152: 66 0f 6f 84 24 b0 04 movdqa 0x4b0(%rsp),%xmm0 159: 00 00 15b: 66 0f ef 84 24 b0 02 pxor 0x2b0(%rsp),%xmm0 162: 00 00 164: 66 0f 7f 84 24 b0 00 movdqa %xmm0,0xb0(%rsp) 16b: 00 00 16d: 66 0f 6f 84 24 c0 04 movdqa 0x4c0(%rsp),%xmm0 174: 00 00 176: 66 0f ef 84 24 c0 02 pxor 0x2c0(%rsp),%xmm0 17d: 00 00 17f: 66 0f 7f 84 24 c0 00 movdqa %xmm0,0xc0(%rsp) 186: 00 00 188: 66 0f 6f 84 24 d0 04 movdqa 0x4d0(%rsp),%xmm0 18f: 00 00 191: 66 0f ef 84 24 d0 02 pxor 0x2d0(%rsp),%xmm0 198: 00 00 19a: 66 0f 7f 84 24 d0 00 movdqa %xmm0,0xd0(%rsp) 1a1: 00 00 1a3: 66 0f 6f 84 24 e0 04 movdqa 0x4e0(%rsp),%xmm0 1aa: 00 00 1ac: 66 0f ef 84 24 e0 02 pxor 0x2e0(%rsp),%xmm0 
1b3: 00 00 1b5: 66 0f 7f 84 24 e0 00 movdqa %xmm0,0xe0(%rsp) 1bc: 00 00 1be: 66 0f 6f 84 24 f0 04 movdqa 0x4f0(%rsp),%xmm0 1c5: 00 00 1c7: 66 0f ef 84 24 f0 02 pxor 0x2f0(%rsp),%xmm0 1ce: 00 00 1d0: 66 0f 7f 84 24 f0 00 movdqa %xmm0,0xf0(%rsp) 1d7: 00 00 1d9: 66 0f 6f 84 24 00 05 movdqa 0x500(%rsp),%xmm0 1e0: 00 00 1e2: 66 0f ef 84 24 00 03 pxor 0x300(%rsp),%xmm0 1e9: 00 00 1eb: 66 0f 7f 84 24 00 01 movdqa %xmm0,0x100(%rsp) 1f2: 00 00 1f4: 66 0f 6f 84 24 10 05 movdqa 0x510(%rsp),%xmm0 1fb: 00 00 1fd: 66 0f ef 84 24 10 03 pxor 0x310(%rsp),%xmm0 204: 00 00 206: 66 0f 7f 84 24 10 01 movdqa %xmm0,0x110(%rsp) 20d: 00 00 20f: 66 0f 6f 84 24 20 05 movdqa 0x520(%rsp),%xmm0 216: 00 00 218: 66 0f ef 84 24 20 03 pxor 0x320(%rsp),%xmm0 21f: 00 00 221: 66 0f 7f 84 24 20 01 movdqa %xmm0,0x120(%rsp) 228: 00 00 22a: 66 0f 6f 84 24 30 05 movdqa 0x530(%rsp),%xmm0 231: 00 00 233: 66 0f ef 84 24 30 03 pxor 0x330(%rsp),%xmm0 23a: 00 00 23c: 66 0f 7f 84 24 30 01 movdqa %xmm0,0x130(%rsp) 243: 00 00 245: 66 0f 6f 84 24 40 05 movdqa 0x540(%rsp),%xmm0 24c: 00 00 24e: 66 0f ef 84 24 40 03 pxor 0x340(%rsp),%xmm0 255: 00 00 257: 66 0f 7f 84 24 40 01 movdqa %xmm0,0x140(%rsp) 25e: 00 00 260: 66 0f 6f 84 24 50 05 movdqa 0x550(%rsp),%xmm0 267: 00 00 269: 66 0f ef 84 24 50 03 pxor 0x350(%rsp),%xmm0 270: 00 00 272: 66 0f 7f 84 24 50 01 movdqa %xmm0,0x150(%rsp) 279: 00 00 27b: 66 0f 6f 84 24 60 05 movdqa 0x560(%rsp),%xmm0 282: 00 00 284: 66 0f ef 84 24 60 03 pxor 0x360(%rsp),%xmm0 28b: 00 00 28d: 66 0f 7f 84 24 60 01 movdqa %xmm0,0x160(%rsp) 294: 00 00 296: 66 0f 6f 84 24 70 05 movdqa 0x570(%rsp),%xmm0 29d: 00 00 29f: 66 0f ef 84 24 70 03 pxor 0x370(%rsp),%xmm0 2a6: 00 00 2a8: 66 0f 7f 84 24 70 01 movdqa %xmm0,0x170(%rsp) 2af: 00 00 2b1: 66 0f 6f 84 24 80 05 movdqa 0x580(%rsp),%xmm0 2b8: 00 00 2ba: 66 0f ef 84 24 80 03 pxor 0x380(%rsp),%xmm0 2c1: 00 00 2c3: 66 0f 7f 84 24 80 01 movdqa %xmm0,0x180(%rsp) 2ca: 00 00 2cc: 66 0f 6f 84 24 90 05 movdqa 0x590(%rsp),%xmm0 2d3: 00 00 2d5: 66 0f ef 84 24 90 03 
pxor 0x390(%rsp),%xmm0 2dc: 00 00 2de: 66 0f 7f 84 24 90 01 movdqa %xmm0,0x190(%rsp) 2e5: 00 00 2e7: 66 0f 6f 84 24 a0 05 movdqa 0x5a0(%rsp),%xmm0 2ee: 00 00 2f0: 66 0f ef 84 24 a0 03 pxor 0x3a0(%rsp),%xmm0 2f7: 00 00 2f9: 66 0f 7f 84 24 a0 01 movdqa %xmm0,0x1a0(%rsp) 300: 00 00 302: 66 0f 6f 84 24 b0 05 movdqa 0x5b0(%rsp),%xmm0 309: 00 00 30b: 66 0f ef 84 24 b0 03 pxor 0x3b0(%rsp),%xmm0 312: 00 00 314: 66 0f 7f 84 24 b0 01 movdqa %xmm0,0x1b0(%rsp) 31b: 00 00 31d: 66 0f 6f 84 24 c0 05 movdqa 0x5c0(%rsp),%xmm0 324: 00 00 326: 66 0f ef 84 24 c0 03 pxor 0x3c0(%rsp),%xmm0 32d: 00 00 32f: 66 0f 7f 84 24 c0 01 movdqa %xmm0,0x1c0(%rsp) 336: 00 00 338: 66 0f 6f 84 24 d0 05 movdqa 0x5d0(%rsp),%xmm0 33f: 00 00 341: 66 0f ef 84 24 d0 03 pxor 0x3d0(%rsp),%xmm0 348: 00 00 34a: 66 0f 7f 84 24 d0 01 movdqa %xmm0,0x1d0(%rsp) 351: 00 00 353: 66 0f 6f 84 24 e0 05 movdqa 0x5e0(%rsp),%xmm0 35a: 00 00 35c: 66 0f ef 84 24 e0 03 pxor 0x3e0(%rsp),%xmm0 363: 00 00 365: 66 0f 7f 84 24 e0 01 movdqa %xmm0,0x1e0(%rsp) 36c: 00 00 36e: 66 0f 6f 84 24 f0 05 movdqa 0x5f0(%rsp),%xmm0 375: 00 00 377: 66 0f ef 84 24 f0 03 pxor 0x3f0(%rsp),%xmm0 37e: 00 00 380: 66 0f 7f 84 24 f0 01 movdqa %xmm0,0x1f0(%rsp) 387: 00 00 389: e8 00 00 00 00 callq 38e <f+0x38e> 38a: R_X86_64_PC32 memcpy+0xfffffffffffffffc 38e: 48 81 c4 08 06 00 00 add $0x608,%rsp 395: 5b pop %rbx 396: 41 5c pop %r12 398: c3 -- Summary: long vector operation causes gcc to copy arguments Product: gcc Version: 4.1.2 Status: UNCONFIRMED Severity: normal Priority: P3 Component: c AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: Shvaiger_Felix at emc dot com GCC build triplet: x86_64-redhat-linux GCC host triplet: x86_64-redhat-linux GCC target triplet: x86_64-redhat-linux http://gcc.gnu.org/bugzilla/show_bug.cgi?id=42367