------- Comment #3 from dominiq at lps dot ens dot fr 2010-08-23 17:24 -------
> Can't reproduce on x86_64-linux.
My timings were on an Intel Core2Duo at 2.53 GHz.
> Please try to pinpoint the codegen difference that causes the slowdown.
I don't know if this is what you asked for, but comparing the assembly (fast -, slow +)
I see the following kind of pattern in several places:
L36:
- leaq 1(%rsi), %r9
- movq %rsi, %r10
+ movq %rdi, %r10
+ leaq 1(%rdi), %r9
salq $4, %r10
+ movsd (%rsi,%r10), %xmm14
salq $4, %r9
- movapd (%rdi,%r10), %xmm5
- leaq 2(%rsi), %r10
- movapd (%rdi,%r9), %xmm4
- leaq 3(%rsi), %r9
+ movhpd 8(%rsi,%r10), %xmm14
+ leaq 2(%rdi), %r10
+ movapd %xmm14, %xmm13
salq $4, %r10
- andpd %xmm0, %xmm5
+ andpd %xmm0, %xmm13
+ movlpd %xmm13, (%rcx)
+ movhpd %xmm13, 8(%rcx)
+ movsd (%rsi,%r9), %xmm12
+ movhpd 8(%rsi,%r9), %xmm12
+ leaq 3(%rdi), %r9
+ movapd %xmm12, %xmm11
salq $4, %r9
- movapd (%rdi,%r10), %xmm3
- leaq 4(%rsi), %r10
- andpd %xmm0, %xmm4
- movapd (%rdi,%r9), %xmm2
- leaq 5(%rsi), %r9
+ andpd %xmm0, %xmm11
+ movlpd %xmm11, 16(%rcx)
+ movhpd %xmm11, 24(%rcx)
+ movsd (%rsi,%r10), %xmm10
+ movhpd 8(%rsi,%r10), %xmm10
+ leaq 4(%rdi), %r10
+ movapd %xmm10, %xmm9
salq $4, %r10
- andpd %xmm0, %xmm3
+ andpd %xmm0, %xmm9
+ movlpd %xmm9, 32(%rcx)
+ movhpd %xmm9, 40(%rcx)
+ movsd (%rsi,%r9), %xmm8
+ movhpd 8(%rsi,%r9), %xmm8
+ leaq 5(%rdi), %r9
+ movapd %xmm8, %xmm7
salq $4, %r9
- movapd (%rdi,%r10), %xmm1
- leaq 6(%rsi), %r10
- andpd %xmm0, %xmm2
- movapd (%rdi,%r9), %xmm15
- leaq 7(%rsi), %r9
+ andpd %xmm0, %xmm7
+ movlpd %xmm7, 48(%rcx)
+ movhpd %xmm7, 56(%rcx)
+ movsd (%rsi,%r10), %xmm6
+ movhpd 8(%rsi,%r10), %xmm6
+ leaq 6(%rdi), %r10
+ movapd %xmm6, %xmm5
salq $4, %r10
- andpd %xmm0, %xmm1
+ andpd %xmm0, %xmm5
+ movlpd %xmm5, 64(%rcx)
+ movhpd %xmm5, 72(%rcx)
+ movsd (%rsi,%r9), %xmm4
+ movhpd 8(%rsi,%r9), %xmm4
+ leaq 7(%rdi), %r9
+ addq $8, %rdi
+ movapd %xmm4, %xmm3
salq $4, %r9
- movapd (%rdi,%r10), %xmm14
- andpd %xmm0, %xmm15
- addq $8, %rsi
- movapd (%rdi,%r9), %xmm13
+ andpd %xmm0, %xmm3
+ movlpd %xmm3, 80(%rcx)
+ movhpd %xmm3, 88(%rcx)
+ movsd (%rsi,%r10), %xmm2
+ movhpd 8(%rsi,%r10), %xmm2
+ movapd %xmm2, %xmm1
+ andpd %xmm0, %xmm1
+ movlpd %xmm1, 96(%rcx)
+ movhpd %xmm1, 104(%rcx)
+ movsd (%rsi,%r9), %xmm15
+ movhpd 8(%rsi,%r9), %xmm15
+ movapd %xmm15, %xmm14
andpd %xmm0, %xmm14
- andpd %xmm0, %xmm13
- movlpd %xmm5, (%rcx)
- movhpd %xmm5, 8(%rcx)
- movlpd %xmm4, 16(%rcx)
- movhpd %xmm4, 24(%rcx)
- movlpd %xmm3, 32(%rcx)
- movhpd %xmm3, 40(%rcx)
- movlpd %xmm2, 48(%rcx)
- movhpd %xmm2, 56(%rcx)
- movlpd %xmm1, 64(%rcx)
- movhpd %xmm1, 72(%rcx)
- movlpd %xmm15, 80(%rcx)
- movhpd %xmm15, 88(%rcx)
- movlpd %xmm14, 96(%rcx)
- movhpd %xmm14, 104(%rcx)
- movlpd %xmm13, 112(%rcx)
- movhpd %xmm13, 120(%rcx)
+ movlpd %xmm14, 112(%rcx)
+ movhpd %xmm14, 120(%rcx)
subq $-128, %rcx
- cmpq %r11, %rsi
+ cmpq %r11, %rdi
jb L36
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=45379