------- Comment #3 from dominiq at lps dot ens dot fr 2010-08-23 17:24 ------- > Can't reproduce on x86_64-linux.
My timings were on an Intel Core2Duo 2.53 GHz. > Please try to pinpoint the codegen difference that causes the slowdown. I don't know if this is what you asked for, but comparing assembly (fast -, slow +) I see in several places the following kind of patterns: L36: - leaq 1(%rsi), %r9 - movq %rsi, %r10 + movq %rdi, %r10 + leaq 1(%rdi), %r9 salq $4, %r10 + movsd (%rsi,%r10), %xmm14 salq $4, %r9 - movapd (%rdi,%r10), %xmm5 - leaq 2(%rsi), %r10 - movapd (%rdi,%r9), %xmm4 - leaq 3(%rsi), %r9 + movhpd 8(%rsi,%r10), %xmm14 + leaq 2(%rdi), %r10 + movapd %xmm14, %xmm13 salq $4, %r10 - andpd %xmm0, %xmm5 + andpd %xmm0, %xmm13 + movlpd %xmm13, (%rcx) + movhpd %xmm13, 8(%rcx) + movsd (%rsi,%r9), %xmm12 + movhpd 8(%rsi,%r9), %xmm12 + leaq 3(%rdi), %r9 + movapd %xmm12, %xmm11 salq $4, %r9 - movapd (%rdi,%r10), %xmm3 - leaq 4(%rsi), %r10 - andpd %xmm0, %xmm4 - movapd (%rdi,%r9), %xmm2 - leaq 5(%rsi), %r9 + andpd %xmm0, %xmm11 + movlpd %xmm11, 16(%rcx) + movhpd %xmm11, 24(%rcx) + movsd (%rsi,%r10), %xmm10 + movhpd 8(%rsi,%r10), %xmm10 + leaq 4(%rdi), %r10 + movapd %xmm10, %xmm9 salq $4, %r10 - andpd %xmm0, %xmm3 + andpd %xmm0, %xmm9 + movlpd %xmm9, 32(%rcx) + movhpd %xmm9, 40(%rcx) + movsd (%rsi,%r9), %xmm8 + movhpd 8(%rsi,%r9), %xmm8 + leaq 5(%rdi), %r9 + movapd %xmm8, %xmm7 salq $4, %r9 - movapd (%rdi,%r10), %xmm1 - leaq 6(%rsi), %r10 - andpd %xmm0, %xmm2 - movapd (%rdi,%r9), %xmm15 - leaq 7(%rsi), %r9 + andpd %xmm0, %xmm7 + movlpd %xmm7, 48(%rcx) + movhpd %xmm7, 56(%rcx) + movsd (%rsi,%r10), %xmm6 + movhpd 8(%rsi,%r10), %xmm6 + leaq 6(%rdi), %r10 + movapd %xmm6, %xmm5 salq $4, %r10 - andpd %xmm0, %xmm1 + andpd %xmm0, %xmm5 + movlpd %xmm5, 64(%rcx) + movhpd %xmm5, 72(%rcx) + movsd (%rsi,%r9), %xmm4 + movhpd 8(%rsi,%r9), %xmm4 + leaq 7(%rdi), %r9 + addq $8, %rdi + movapd %xmm4, %xmm3 salq $4, %r9 - movapd (%rdi,%r10), %xmm14 - andpd %xmm0, %xmm15 - addq $8, %rsi - movapd (%rdi,%r9), %xmm13 + andpd %xmm0, %xmm3 + movlpd %xmm3, 80(%rcx) + movhpd %xmm3, 88(%rcx) + movsd (%rsi,%r10), 
%xmm2 + movhpd 8(%rsi,%r10), %xmm2 + movapd %xmm2, %xmm1 + andpd %xmm0, %xmm1 + movlpd %xmm1, 96(%rcx) + movhpd %xmm1, 104(%rcx) + movsd (%rsi,%r9), %xmm15 + movhpd 8(%rsi,%r9), %xmm15 + movapd %xmm15, %xmm14 andpd %xmm0, %xmm14 - andpd %xmm0, %xmm13 - movlpd %xmm5, (%rcx) - movhpd %xmm5, 8(%rcx) - movlpd %xmm4, 16(%rcx) - movhpd %xmm4, 24(%rcx) - movlpd %xmm3, 32(%rcx) - movhpd %xmm3, 40(%rcx) - movlpd %xmm2, 48(%rcx) - movhpd %xmm2, 56(%rcx) - movlpd %xmm1, 64(%rcx) - movhpd %xmm1, 72(%rcx) - movlpd %xmm15, 80(%rcx) - movhpd %xmm15, 88(%rcx) - movlpd %xmm14, 96(%rcx) - movhpd %xmm14, 104(%rcx) - movlpd %xmm13, 112(%rcx) - movhpd %xmm13, 120(%rcx) + movlpd %xmm14, 112(%rcx) + movhpd %xmm14, 120(%rcx) subq $-128, %rcx - cmpq %r11, %rsi + cmpq %r11, %rdi jb L36 -- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=45379