------- Comment #2 from amylaar at gcc dot gnu dot org  2005-11-11 21:40 -------
Created an attachment (id=10222)
 --> (http://gcc.gnu.org/bugzilla/attachment.cgi?id=10222&action=view)
test case

This test case, compiled with -O2 -funroll-loops, uses numerous reg+index
addresses in the inner loop, which requires additional addition
instructions.  (And since INDEX_REG_CLASS is CLASS_LIKELY_SPILLED_P, it also
requires extra reg-reg copies.)
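
For reference, a loop of roughly the following shape produces this kind of
addressing.  This is only a hypothetical sketch; the function and variable
names (f, dst, tab, idx, n) are made up and the actual attached test case
may differ in detail:

/* Hypothetical sketch of the kind of loop involved.  The load of
   tab[idx[i]] needs reg+index addressing, and so does the store to
   dst[i] unless the sum of the store base and the offset giv is
   strength-reduced.  */
void
f (int *dst, const int *tab, const int *idx, int n)
{
  int i;

  for (i = 0; i < n; i++)
    dst[i] = tab[idx[i]];
}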

When there is a reg+index address giv with one of the registers being loop
invariant, unrolling should perform strength reduction on the sum so that
reg+offset addressing can be used.

E.g.:

L2:
        mov     r6,r3
        mov.l   @r3+,r0
        shll2   r0
        mov.l   @(r0,r8),r9
        mov     r7,r0
        mov.l   r9,@(r0,r5)
        mov     r7,r2
        add     #4,r2
        mov.l   @(4,r6),r0
        shll2   r0
        mov.l   @(r0,r8),r1
        mov     r2,r0
        mov.l   r1,@(r0,r5)
        add     #4,r2
        mov.l   @(4,r3),r0
        shll2   r0
        mov.l   @(r0,r8),r9
        mov     r2,r0
        mov.l   r9,@(r0,r5)
        add     #12,r7
        mov.l   @(12,r6),r0
        shll2   r0
        mov.l   @(r0,r8),r3
        mov     r7,r0
        mov.l   r3,@(r0,r5)
        add     #4,r7
        mov.l   @(16,r6),r0
        shll2   r0
        mov.l   @(r0,r8),r1
        mov     r7,r0
        mov.l   r1,@(r0,r5)
        add     #4,r7
        mov.l   @(20,r6),r0
        shll2   r0
        mov.l   @(r0,r8),r2
        mov     r7,r0
        mov.l   r2,@(r0,r5)
        add     #4,r7
        mov.l   @(24,r6),r0
        shll2   r0
        mov.l   @(r0,r8),r9
        mov     r7,r0
        mov.l   r9,@(r0,r5)
        add     #4,r7
        mov.l   @(28,r6),r0
        shll2   r0
        mov.l   @(r0,r8),r3
        mov     r7,r0
        mov.l   r3,@(r0,r5)
        add     #4,r7
        mov.l   @(32,r6),r0
        shll2   r0
        mov.l   @(r0,r8),r1
        mov     r7,r0
        mov.l   r1,@(r0,r5)
        add     #4,r7
        mov.l   @(36,r6),r0
        shll2   r0
        mov.l   @(r0,r8),r2
        mov     r7,r0
        mov.l   r2,@(r0,r5)
        add     #4,r7
        mov.l   @(40,r6),r0
        shll2   r0
        mov.l   @(r0,r8),r9
        mov     r7,r0
        mov.l   r9,@(r0,r5)
        add     #4,r7
        mov.l   @(44,r6),r0
        shll2   r0
        mov.l   @(r0,r8),r3
        mov     r7,r0
        mov.l   r3,@(r0,r5)
        add     #48,r6
        add     #4,r7

can be changed into:

        add r5,r7
L2:
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r9
        mov.l   r9,@r7
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r1
        mov.l   r1,@(4,r7)
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r9
        mov.l   r9,@(8,r7)
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r3
        mov.l   r3,@(12,r7)
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r1
        mov.l   r1,@(16,r7)
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r2
        mov.l   r2,@(20,r7)
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r9
        mov.l   r9,@(24,r7)
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r3
        mov.l   r3,@(28,r7)
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r1
        mov.l   r1,@(32,r7)
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r2
        mov.l   r2,@(36,r7)
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r9
        mov.l   r9,@(40,r7)
        mov.l   @r6+,r0
        shll2   r0
        mov.l   @(r0,r8),r3
        mov.l   r3,@(44,r7)
        add     #48,r7
Beyond the reduced instruction count, the reduced use of r0 also makes this
code easier to schedule.
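
At the source level, the transformation corresponds to materializing the
loop-invariant part of the store address once, outside the unrolled body.
The following is only a hypothetical sketch with made-up names (f_reduced,
p, etc.) and an arbitrary unroll factor, not the actual unroller output;
the real change would be on the RTL givs during unrolling:

/* Sketch of the strength-reduced form: the invariant base of the
   store address is added once up front (cf. the "add r5,r7" before
   L2 above), and each unrolled copy then uses a small constant
   displacement, i.e. reg+offset addressing.  */
void
f_reduced (int *dst, const int *tab, const int *idx, int n)
{
  int *p = dst;
  int i;

  for (i = 0; i + 3 < n; i += 4, p += 4)
    {
      p[0] = tab[idx[i]];       /* mov.l rX,@r7      */
      p[1] = tab[idx[i + 1]];   /* mov.l rX,@(4,r7)  */
      p[2] = tab[idx[i + 2]];   /* mov.l rX,@(8,r7)  */
      p[3] = tab[idx[i + 3]];   /* mov.l rX,@(12,r7) */
    }
  for (; i < n; i++)            /* remaining iterations */
    dst[i] = tab[idx[i]];
}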


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=24815