------- Comment #3 from dominiq at lps dot ens dot fr  2010-08-23 17:24 -------
> Can't reproduce on x86_64-linux.

My timings were on an Intel Core 2 Duo at 2.53 GHz.

> Please try to pinpoint the codegen difference that causes the slowdown.

I don't know if this is what you asked for, but comparing the assembly (fast -,
slow +) I see the following kind of pattern in several places:

 L36:
-       leaq    1(%rsi), %r9
-       movq    %rsi, %r10
+       movq    %rdi, %r10
+       leaq    1(%rdi), %r9
        salq    $4, %r10
+       movsd   (%rsi,%r10), %xmm14
        salq    $4, %r9
-       movapd  (%rdi,%r10), %xmm5
-       leaq    2(%rsi), %r10
-       movapd  (%rdi,%r9), %xmm4
-       leaq    3(%rsi), %r9
+       movhpd  8(%rsi,%r10), %xmm14
+       leaq    2(%rdi), %r10
+       movapd  %xmm14, %xmm13
        salq    $4, %r10
-       andpd   %xmm0, %xmm5
+       andpd   %xmm0, %xmm13
+       movlpd  %xmm13, (%rcx)
+       movhpd  %xmm13, 8(%rcx)
+       movsd   (%rsi,%r9), %xmm12
+       movhpd  8(%rsi,%r9), %xmm12
+       leaq    3(%rdi), %r9
+       movapd  %xmm12, %xmm11
        salq    $4, %r9
-       movapd  (%rdi,%r10), %xmm3
-       leaq    4(%rsi), %r10
-       andpd   %xmm0, %xmm4
-       movapd  (%rdi,%r9), %xmm2
-       leaq    5(%rsi), %r9
+       andpd   %xmm0, %xmm11
+       movlpd  %xmm11, 16(%rcx)
+       movhpd  %xmm11, 24(%rcx)
+       movsd   (%rsi,%r10), %xmm10
+       movhpd  8(%rsi,%r10), %xmm10
+       leaq    4(%rdi), %r10
+       movapd  %xmm10, %xmm9
        salq    $4, %r10
-       andpd   %xmm0, %xmm3
+       andpd   %xmm0, %xmm9
+       movlpd  %xmm9, 32(%rcx)
+       movhpd  %xmm9, 40(%rcx)
+       movsd   (%rsi,%r9), %xmm8
+       movhpd  8(%rsi,%r9), %xmm8
+       leaq    5(%rdi), %r9
+       movapd  %xmm8, %xmm7
        salq    $4, %r9
-       movapd  (%rdi,%r10), %xmm1
-       leaq    6(%rsi), %r10
-       andpd   %xmm0, %xmm2
-       movapd  (%rdi,%r9), %xmm15
-       leaq    7(%rsi), %r9
+       andpd   %xmm0, %xmm7
+       movlpd  %xmm7, 48(%rcx)
+       movhpd  %xmm7, 56(%rcx)
+       movsd   (%rsi,%r10), %xmm6
+       movhpd  8(%rsi,%r10), %xmm6
+       leaq    6(%rdi), %r10
+       movapd  %xmm6, %xmm5
        salq    $4, %r10
-       andpd   %xmm0, %xmm1
+       andpd   %xmm0, %xmm5
+       movlpd  %xmm5, 64(%rcx)
+       movhpd  %xmm5, 72(%rcx)
+       movsd   (%rsi,%r9), %xmm4
+       movhpd  8(%rsi,%r9), %xmm4
+       leaq    7(%rdi), %r9
+       addq    $8, %rdi
+       movapd  %xmm4, %xmm3
        salq    $4, %r9
-       movapd  (%rdi,%r10), %xmm14
-       andpd   %xmm0, %xmm15
-       addq    $8, %rsi
-       movapd  (%rdi,%r9), %xmm13
+       andpd   %xmm0, %xmm3
+       movlpd  %xmm3, 80(%rcx)
+       movhpd  %xmm3, 88(%rcx)
+       movsd   (%rsi,%r10), %xmm2
+       movhpd  8(%rsi,%r10), %xmm2
+       movapd  %xmm2, %xmm1
+       andpd   %xmm0, %xmm1
+       movlpd  %xmm1, 96(%rcx)
+       movhpd  %xmm1, 104(%rcx)
+       movsd   (%rsi,%r9), %xmm15
+       movhpd  8(%rsi,%r9), %xmm15
+       movapd  %xmm15, %xmm14
        andpd   %xmm0, %xmm14
-       andpd   %xmm0, %xmm13
-       movlpd  %xmm5, (%rcx)
-       movhpd  %xmm5, 8(%rcx)
-       movlpd  %xmm4, 16(%rcx)
-       movhpd  %xmm4, 24(%rcx)
-       movlpd  %xmm3, 32(%rcx)
-       movhpd  %xmm3, 40(%rcx)
-       movlpd  %xmm2, 48(%rcx)
-       movhpd  %xmm2, 56(%rcx)
-       movlpd  %xmm1, 64(%rcx)
-       movhpd  %xmm1, 72(%rcx)
-       movlpd  %xmm15, 80(%rcx)
-       movhpd  %xmm15, 88(%rcx)
-       movlpd  %xmm14, 96(%rcx)
-       movhpd  %xmm14, 104(%rcx)
-       movlpd  %xmm13, 112(%rcx)
-       movhpd  %xmm13, 120(%rcx)
+       movlpd  %xmm14, 112(%rcx)
+       movhpd  %xmm14, 120(%rcx)
        subq    $-128, %rcx
-       cmpq    %r11, %rsi
+       cmpq    %r11, %rdi
        jb      L36

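If I read the diff correctly, the main change is that the fast (-) version
loads each 16-byte element with a single aligned movapd, while the slow (+)
version splits the same load into a movsd/movhpd pair (the stores are split
into movlpd/movhpd in both versions). In SSE2 intrinsics terms the two load
patterns are roughly the following (illustration only, not the actual source
of the loop):

#include <emmintrin.h>

/* fast (-) pattern: one aligned 128-bit load per 16-byte element */
static inline __m128d load_fast(const double *p)
{
    return _mm_load_pd(p);              /* movapd (%rdi,%r10), %xmmN */
}

/* slow (+) pattern: the same 16 bytes read as two 64-bit halves */
static inline __m128d load_slow(const double *p)
{
    __m128d v = _mm_load_sd(p);         /* movsd  (%rsi,%r10), %xmmN  */
    return _mm_loadh_pd(v, p + 1);      /* movhpd 8(%rsi,%r10), %xmmN */
}

In both versions the loaded value is then masked with andpd %xmm0 (a constant
mask presumably set up outside the loop) before being stored.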

-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=45379
