The vectorizer produces horrible code with this testcase:

$ cat dotproduct.c
#include "inttypes.h"
int64_t dotproduct(int32_t *v1, int32_t *v2, int order) { int64_t accum = 0; while (order--) accum += (int64_t) *v1++ * *v2++; return accum; } int64_t dotproduct_order4(int32_t *v1, int32_t *v2, int order) { return dotproduct(v1, v2, 4); } $ gcc-4.4rc1 -o dotproduct.o -c dotproduct.c -O3 $ gcc-4.4rc1 -o dotproduct-no-vectorize.o -c dotproduct.c -O3 -fno-tree-vectorize $ objdump -d dotproduct.o dotproduct.o: file format elf64-x86-64 Disassembly of section .text: 0000000000000000 <dotproduct>: 0: 31 c0 xor %eax,%eax 2: 85 d2 test %edx,%edx 4: 0f 84 4e 01 00 00 je 158 <dotproduct+0x158> a: 41 89 d0 mov %edx,%r8d d: 44 8d 52 ff lea -0x1(%rdx),%r10d 11: 41 c1 e8 02 shr $0x2,%r8d 15: 83 fa 03 cmp $0x3,%edx 18: 46 8d 0c 85 00 00 00 lea 0x0(,%r8,4),%r9d 1f: 00 20: 76 05 jbe 27 <dotproduct+0x27> 22: 45 85 c9 test %r9d,%r9d 25: 75 09 jne 30 <dotproduct+0x30> 27: 31 c0 xor %eax,%eax 29: e9 fc 00 00 00 jmpq 12a <dotproduct+0x12a> 2e: 66 90 xchg %ax,%ax 30: 66 0f ef c0 pxor %xmm0,%xmm0 34: 31 c0 xor %eax,%eax 36: 66 45 0f ef c9 pxor %xmm9,%xmm9 3b: 31 c9 xor %ecx,%ecx 3d: 0f 1f 00 nopl (%rax) 40: f3 0f 6f 14 07 movdqu (%rdi,%rax,1),%xmm2 45: 83 c1 01 add $0x1,%ecx 48: 66 41 0f 6f d9 movdqa %xmm9,%xmm3 4d: f3 0f 6f 24 06 movdqu (%rsi,%rax,1),%xmm4 52: 66 45 0f 6f c1 movdqa %xmm9,%xmm8 57: 66 0f 6f ea movdqa %xmm2,%xmm5 5b: 48 83 c0 10 add $0x10,%rax 5f: 66 0f 66 dc pcmpgtd %xmm4,%xmm3 63: 66 0f 6f fc movdqa %xmm4,%xmm7 67: 66 44 0f 66 c2 pcmpgtd %xmm2,%xmm8 6c: 41 39 c8 cmp %ecx,%r8d 6f: 66 0f 62 fb punpckldq %xmm3,%xmm7 73: 66 41 0f 62 e8 punpckldq %xmm8,%xmm5 78: 66 0f 6a e3 punpckhdq %xmm3,%xmm4 7c: 66 41 0f 6a d0 punpckhdq %xmm8,%xmm2 81: 66 0f 6f cf movdqa %xmm7,%xmm1 85: 66 0f 6f f5 movdqa %xmm5,%xmm6 89: 66 44 0f 6f d7 movdqa %xmm7,%xmm10 8e: 66 0f f4 cd pmuludq %xmm5,%xmm1 92: 66 0f 6f da movdqa %xmm2,%xmm3 96: 66 0f 73 d6 20 psrlq $0x20,%xmm6 9b: 66 0f f4 f7 pmuludq %xmm7,%xmm6 9f: 66 41 0f 73 d2 20 psrlq $0x20,%xmm10 a5: 66 0f 73 f6 20 psllq $0x20,%xmm6 aa: 66 41 0f f4 
ea pmuludq %xmm10,%xmm5 af: 66 0f d4 ce paddq %xmm6,%xmm1 b3: 66 0f 73 f5 20 psllq $0x20,%xmm5 b8: 66 0f d4 cd paddq %xmm5,%xmm1 bc: 66 0f 6f ec movdqa %xmm4,%xmm5 c0: 66 0f d4 c8 paddq %xmm0,%xmm1 c4: 66 0f 73 d3 20 psrlq $0x20,%xmm3 c9: 66 0f 6f c4 movdqa %xmm4,%xmm0 cd: 66 0f f4 dc pmuludq %xmm4,%xmm3 d1: 66 0f 73 f3 20 psllq $0x20,%xmm3 d6: 66 0f 73 d5 20 psrlq $0x20,%xmm5 db: 66 0f f4 c2 pmuludq %xmm2,%xmm0 df: 66 0f f4 d5 pmuludq %xmm5,%xmm2 e3: 66 0f d4 c3 paddq %xmm3,%xmm0 e7: 66 0f 73 f2 20 psllq $0x20,%xmm2 ec: 66 0f d4 c2 paddq %xmm2,%xmm0 f0: 66 0f d4 c1 paddq %xmm1,%xmm0 f4: 0f 87 46 ff ff ff ja 40 <dotproduct+0x40> fa: 42 8d 0c 8d 00 00 00 lea 0x0(,%r9,4),%ecx 101: 00 102: 66 0f 6f c8 movdqa %xmm0,%xmm1 106: 45 29 ca sub %r9d,%r10d 109: 89 c9 mov %ecx,%ecx 10b: 66 0f 73 d9 08 psrldq $0x8,%xmm1 110: 66 0f d4 c1 paddq %xmm1,%xmm0 114: 48 01 cf add %rcx,%rdi 117: 48 01 ce add %rcx,%rsi 11a: 44 39 ca cmp %r9d,%edx 11d: 66 0f d6 44 24 f8 movq %xmm0,-0x8(%rsp) 123: 48 8b 44 24 f8 mov -0x8(%rsp),%rax 128: 74 2e je 158 <dotproduct+0x158> 12a: 45 89 d2 mov %r10d,%r10d 12d: 31 d2 xor %edx,%edx 12f: 4e 8d 0c 95 04 00 00 lea 0x4(,%r10,4),%r9 136: 00 137: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1) 13e: 00 00 140: 48 63 0c 16 movslq (%rsi,%rdx,1),%rcx 144: 4c 63 04 17 movslq (%rdi,%rdx,1),%r8 148: 48 83 c2 04 add $0x4,%rdx 14c: 49 0f af c8 imul %r8,%rcx 150: 48 01 c8 add %rcx,%rax 153: 4c 39 ca cmp %r9,%rdx 156: 75 e8 jne 140 <dotproduct+0x140> 158: f3 c3 repz retq 15a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1) 0000000000000160 <dotproduct_order4>: 160: 66 0f ef c0 pxor %xmm0,%xmm0 164: f3 0f 6f 0f movdqu (%rdi),%xmm1 168: f3 0f 6f 1e movdqu (%rsi),%xmm3 16c: 66 0f 6f d0 movdqa %xmm0,%xmm2 170: 66 0f 6f f1 movdqa %xmm1,%xmm6 174: 66 0f 66 c1 pcmpgtd %xmm1,%xmm0 178: 66 0f 6f fb movdqa %xmm3,%xmm7 17c: 66 0f 66 d3 pcmpgtd %xmm3,%xmm2 180: 66 0f 62 f0 punpckldq %xmm0,%xmm6 184: 66 0f 62 fa punpckldq %xmm2,%xmm7 188: 66 0f 6a da punpckhdq %xmm2,%xmm3 18c: 66 0f 6a c8 
punpckhdq %xmm0,%xmm1 190: 66 0f 6f ee movdqa %xmm6,%xmm5 194: 66 44 0f 6f c7 movdqa %xmm7,%xmm8 199: 66 0f 6f e7 movdqa %xmm7,%xmm4 19d: 66 0f 6f c3 movdqa %xmm3,%xmm0 1a1: 66 0f 73 d5 20 psrlq $0x20,%xmm5 1a6: 66 44 0f f4 c6 pmuludq %xmm6,%xmm8 1ab: 66 0f f4 ef pmuludq %xmm7,%xmm5 1af: 66 0f 6f d1 movdqa %xmm1,%xmm2 1b3: 66 0f 73 d4 20 psrlq $0x20,%xmm4 1b8: 66 0f 73 f5 20 psllq $0x20,%xmm5 1bd: 66 0f f4 e6 pmuludq %xmm6,%xmm4 1c1: 66 41 0f d4 e8 paddq %xmm8,%xmm5 1c6: 66 0f 73 f4 20 psllq $0x20,%xmm4 1cb: 66 0f d4 e5 paddq %xmm5,%xmm4 1cf: 66 0f 6f eb movdqa %xmm3,%xmm5 1d3: 66 0f f4 c1 pmuludq %xmm1,%xmm0 1d7: 66 0f 73 d2 20 psrlq $0x20,%xmm2 1dc: 66 0f f4 d3 pmuludq %xmm3,%xmm2 1e0: 66 0f 73 f2 20 psllq $0x20,%xmm2 1e5: 66 0f d4 c2 paddq %xmm2,%xmm0 1e9: 66 0f 73 d5 20 psrlq $0x20,%xmm5 1ee: 66 0f f4 cd pmuludq %xmm5,%xmm1 1f2: 66 0f 73 f1 20 psllq $0x20,%xmm1 1f7: 66 0f d4 c1 paddq %xmm1,%xmm0 1fb: 66 0f d4 c4 paddq %xmm4,%xmm0 1ff: 66 0f 6f c8 movdqa %xmm0,%xmm1 203: 66 0f 73 d9 08 psrldq $0x8,%xmm1 208: 66 0f d4 c1 paddq %xmm1,%xmm0 20c: 66 0f d6 44 24 f8 movq %xmm0,-0x8(%rsp) 212: 48 8b 44 24 f8 mov -0x8(%rsp),%rax 217: c3 retq $ objdump -d dotproduct-no-vectorize.o dotproduct-no-vectorize.o: file format elf64-x86-64 Disassembly of section .text: 0000000000000000 <dotproduct>: 0: 31 c0 xor %eax,%eax 2: 85 d2 test %edx,%edx 4: 74 2a je 30 <dotproduct+0x30> 6: 83 ea 01 sub $0x1,%edx 9: 4c 8d 0c 95 04 00 00 lea 0x4(,%rdx,4),%r9 10: 00 11: 31 d2 xor %edx,%edx 13: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1) 18: 48 63 0c 16 movslq (%rsi,%rdx,1),%rcx 1c: 4c 63 04 17 movslq (%rdi,%rdx,1),%r8 20: 48 83 c2 04 add $0x4,%rdx 24: 49 0f af c8 imul %r8,%rcx 28: 48 01 c8 add %rcx,%rax 2b: 4c 39 ca cmp %r9,%rdx 2e: 75 e8 jne 18 <dotproduct+0x18> 30: f3 c3 repz retq 32: 66 66 66 66 66 2e 0f nopw %cs:0x0(%rax,%rax,1) 39: 1f 84 00 00 00 00 00 0000000000000040 <dotproduct_order4>: 40: 48 63 07 movslq (%rdi),%rax 43: 48 63 16 movslq (%rsi),%rdx 46: 48 63 4f 04 movslq 0x4(%rdi),%rcx 
4a: 48 0f af d0 imul %rax,%rdx
4e: 48 63 46 04 movslq 0x4(%rsi),%rax
52: 48 0f af c1 imul %rcx,%rax
56: 48 63 4f 08 movslq 0x8(%rdi),%rcx
5a: 48 01 c2 add %rax,%rdx
5d: 48 63 46 08 movslq 0x8(%rsi),%rax
61: 48 0f af c1 imul %rcx,%rax
65: 48 63 4f 0c movslq 0xc(%rdi),%rcx
69: 48 01 c2 add %rax,%rdx
6c: 48 63 46 0c movslq 0xc(%rsi),%rax
70: 48 0f af c1 imul %rcx,%rax
74: 48 01 d0 add %rdx,%rax
77: c3 retq

--
Summary: 120% slowdown with vectorizer
Product: gcc
Version: 4.4.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: ramiro86 at hotmail dot com
GCC target triplet: x86_64-linux-gnu

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39821