------- Comment #5 from ubizjak at gmail dot com  2009-02-09 08:28 -------
On alpha, we generate (-O3 -fno-tree-vectorize -funroll-loops):

$L2:
        lda $19,4($1)
        addq $17,$1,$21
        lda $2,8($1)
        cpys $f31,$f31,$f31
        addq $17,$19,$20
        ldl $22,0($21)
        lda $0,12($1)
        cpys $f31,$f31,$f31
        addq $17,$2,$5
        ldl $23,0($20)
        addq $17,$0,$4
        cpys $f31,$f31,$f31
        lda $27,16($1)
        ldl $25,0($5)
        addq $16,$1,$8
        ldl $24,0($4)
        lda $20,20($1)
        cpys $f31,$f31,$f31
        addq $16,$19,$18
        stl $22,0($8)
        addq $17,$27,$7
        cpys $f31,$f31,$f31
        lda $19,24($1)
        stl $23,0($18)
        addq $16,$2,$6
        ldl $22,0($7)
        addq $17,$20,$3
        cpys $f31,$f31,$f31
        lda $18,28($1)
        stl $25,0($6)
        addq $16,$0,$21
        ldl $8,0($3)
        addq $17,$19,$28
        cpys $f31,$f31,$f31
        addq $17,$18,$23
        stl $24,0($21)
        addq $16,$27,$5
        ldl $7,0($28)
        addq $16,$20,$4
        ldl $6,0($23)
        stl $22,0($5)
        addq $16,$19,$3
        stl $8,0($4)
        addq $16,$18,$0
        stl $7,0($3)
        lda $1,32($1)
        stl $6,0($0)
        lda $2,-4096($1)
        bne $2,$L2

However, on x86_64 we compile the loop in a similar way:

.L2:
        leaq    1(%rax), %rdx
        leaq    2(%rax), %r10
        movl    (%rsi,%rax,4), %r8d
        movl    (%rsi,%rdx,4), %ecx
        movl    %r8d, (%rdi,%rax,4)
        movl    (%rsi,%r10,4), %r11d
        movl    %ecx, (%rdi,%rdx,4)
        leaq    3(%rax), %r8
        movl    %r11d, (%rdi,%r10,4)
        leaq    4(%rax), %rdx
        movl    (%rsi,%r8,4), %r9d
        movl    (%rsi,%rdx,4), %ecx
        movl    %r9d, (%rdi,%r8,4)
        leaq    5(%rax), %r10
        movl    %ecx, (%rdi,%rdx,4)
        leaq    6(%rax), %r8
        leaq    7(%rax), %rdx
        movl    (%rsi,%r10,4), %r11d
        movl    (%rsi,%r8,4), %r9d
        movl    %r11d, (%rdi,%r10,4)
        movl    (%rsi,%rdx,4), %ecx
        movl    %r9d, (%rdi,%r8,4)
        addq    $8, %rax
        movl    %ecx, (%rdi,%rdx,4)
        cmpq    $1024, %rax
        jne     .L2

It looks like the RTL loop optimizer prefers calculating each address into a
separate register instead of using offsetted addressing.
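
For reference, the loop being compiled is not quoted in this comment; judging
from the generated code (word-sized loads from the second argument, stores
through the first, 1024 iterations, unrolled 8x), it is presumably a plain
copy loop along the lines of the sketch below.  This is a guess reconstructed
from the dumps, not the actual testcase from the PR:

/* Hypothetical reproducer, reconstructed from the assembly above;
   dst arrives in $16/%rdi, src in $17/%rsi.  */
void
copy (int *dst, const int *src)
{
  long i;

  for (i = 0; i < 1024; i++)
    dst[i] = src[i];
}

With offsetted addressing the unrolled body would only need the induction
variable itself, e.g. movl 4(%rsi,%rax,4), %ecx / movl %ecx, 4(%rdi,%rax,4)
for the second copy, instead of materializing every address with a separate
leaq (or lda plus addq on alpha), which costs extra instructions and an extra
register for each unrolled copy.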


-- 

ubizjak at gmail dot com changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
                 CC|uros at gcc dot gnu dot org |ubizjak at gmail dot com
   Last reconfirmed|0000-00-00 00:00:00         |2009-02-09 08:28:40
               date|                            |


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=22031
