6 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
0160 :
160: 66 0f ef c0 pxor %xmm0,%xmm0
164: f3 0f 6f 0f movdqu (%rdi),%xmm1
168: f3 0f 6f 1e movdqu (%rsi),%xmm3
16c: 66 0f 6f d0 movdqa %xmm0,%xmm2
170: 66 0f 6f f1 movdqa %xmm1,%xmm6
174: 66 0f 66 c1 pcmpgtd %xmm1,%xmm0
178: 66 0f 6f fb movdqa %xmm3,%xmm7
17c: 66 0f 66 d3 pcmpgtd %xmm3,%xmm2
180: 66 0f 62 f0 punpckldq %xmm0,%xmm6
184: 66 0f 62 fa punpckldq %xmm2,%xmm7
188: 66 0f 6a da punpckhdq %xmm2,%xmm3
18c: 66 0f 6a c8 punpckhdq %xmm0,%xmm1
190: 66 0f 6f ee movdqa %xmm6,%xmm5
194: 66 44 0f 6f c7 movdqa %xmm7,%xmm8
199: 66 0f 6f e7 movdqa %xmm7,%xmm4
19d: 66 0f 6f c3 movdqa %xmm3,%xmm0
1a1: 66 0f 73 d5 20 psrlq $0x20,%xmm5
1a6: 66 44 0f f4 c6 pmuludq %xmm6,%xmm8
1ab: 66 0f f4 ef pmuludq %xmm7,%xmm5
1af: 66 0f 6f d1 movdqa %xmm1,%xmm2
1b3: 66 0f 73 d4 20 psrlq $0x20,%xmm4
1b8: 66 0f 73 f5 20 psllq $0x20,%xmm5
1bd: 66 0f f4 e6 pmuludq %xmm6,%xmm4
1c1: 66 41 0f d4 e8 paddq %xmm8,%xmm5
1c6: 66 0f 73 f4 20 psllq $0x20,%xmm4
1cb: 66 0f d4 e5 paddq %xmm5,%xmm4
1cf: 66 0f 6f eb movdqa %xmm3,%xmm5
1d3: 66 0f f4 c1 pmuludq %xmm1,%xmm0
1d7: 66 0f 73 d2 20 psrlq $0x20,%xmm2
1dc: 66 0f f4 d3 pmuludq %xmm3,%xmm2
1e0: 66 0f 73 f2 20 psllq $0x20,%xmm2
1e5: 66 0f d4 c2 paddq %xmm2,%xmm0
1e9: 66 0f 73 d5 20 psrlq $0x20,%xmm5
1ee: 66 0f f4 cd pmuludq %xmm5,%xmm1
1f2: 66 0f 73 f1 20 psllq $0x20,%xmm1
1f7: 66 0f d4 c1 paddq %xmm1,%xmm0
1fb: 66 0f d4 c4 paddq %xmm4,%xmm0
1ff: 66 0f 6f c8 movdqa %xmm0,%xmm1
203: 66 0f 73 d9 08 psrldq $0x8,%xmm1
208: 66 0f d4 c1 paddq %xmm1,%xmm0
20c: 66 0f d6 44 24 f8 movq %xmm0,-0x8(%rsp)
212: 48 8b 44 24 f8 mov -0x8(%rsp),%rax
217: c3 retq
$ objdump -d dotproduct-no-vectorize.o
dotproduct-no-vectorize.o: file format elf64-x86-64
Disassembly of section .text:
:
0: 31 c0 xor %eax,%eax
2: 85 d2 test %edx,%edx
4: 74 2a je 30
6: 83 ea 01 sub $0x1,%edx
9: 4c 8d 0c 95 04 00 00 lea 0x4(,%rdx,4),%r9
10: 00
11: 31 d2 xor %edx,%edx
13: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
18: 48 63 0c 16 movslq (%rsi,%rdx,1),%rcx
1c: 4c 63 04 17 movslq (%rdi,%rdx,1),%r8
20: 48 83 c2 04 add $0x4,%rdx
24: 49 0f af c8 imul %r8,%rcx
28: 48 01 c8 add %rcx,%rax
2b: 4c 39 ca cmp %r9,%rdx
2e: 75 e8 jne 18
30: f3 c3 repz retq
32: 66 66 66 66 66 2e 0f nopw %cs:0x0(%rax,%rax,1)
39: 1f 84 00 00 00 00 00
0040 :
40: 48 63 07 movslq (%rdi),%rax
43: 48 63 16 movslq (%rsi),%rdx
46: 48 63 4f 04 movslq 0x4(%rdi),%rcx
4a: 48 0f af d0 imul %rax,%rdx
4e: 48 63 46 04 movslq 0x4(%rsi),%rax
52: 48 0f af c1 imul %rcx,%rax
56: 48 63 4f 08 movslq 0x8(%rdi),%rcx
5a: 48 01 c2 add %rax,%rdx
5d: 48 63 46 08 movslq 0x8(%rsi),%rax
61: 48 0f af c1 imul %rcx,%rax
65: 48 63 4f 0c movslq 0xc(%rdi),%rcx
69: 48 01 c2 add %rax,%rdx
6c: 48 63 46 0c movslq 0xc(%rsi),%rax
70: 48 0f af c1 imul %rcx,%rax
74: 48 01 d0 add %rdx,%rax
77: c3 retq
--
Summary: 120% slowdown with vectorizer
Product: gcc
Version: 4.4.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: tree-optimization
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: ramiro86 at hotmail dot com
GCC target triplet: x86_64-linux-gnu
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=39821