------- Comment #2 from amylaar at gcc dot gnu dot org 2005-11-11 21:40 -------
Created an attachment (id=10222)
--> (http://gcc.gnu.org/bugzilla/attachment.cgi?id=10222&action=view)
test case
This testcase, compiled with -O2 -funroll-loops, shows numerous reg+index
addressing being used in the inner loop, which requires additional addition
instructions. (And since INDEX_REG_CLASS is CLASS_LIKELY_SPILLED_P, it also
requires extra reg-reg copies).
When there is a reg+index address giv, with one of the registers being loop
invariant, unrolling should perform strength reduction on the sum, so that
reg+offset addressing can be used.
E.g.:
L2:
mov r6,r3
mov.l @r3+,r0
shll2 r0
mov.l @(r0,r8),r9
mov r7,r0
mov.l r9,@(r0,r5)
mov r7,r2
add #4,r2
mov.l @(4,r6),r0
shll2 r0
mov.l @(r0,r8),r1
mov r2,r0
mov.l r1,@(r0,r5)
add #4,r2
mov.l @(4,r3),r0
shll2 r0
mov.l @(r0,r8),r9
mov r2,r0
mov.l r9,@(r0,r5)
add #12,r7
mov.l @(12,r6),r0
shll2 r0
mov.l @(r0,r8),r3
mov r7,r0
mov.l r3,@(r0,r5)
add #4,r7
mov.l @(16,r6),r0
shll2 r0
mov.l @(r0,r8),r1
mov r7,r0
mov.l r1,@(r0,r5)
add #4,r7
mov.l @(20,r6),r0
shll2 r0
mov.l @(r0,r8),r2
mov r7,r0
mov.l r2,@(r0,r5)
add #4,r7
mov.l @(24,r6),r0
shll2 r0
mov.l @(r0,r8),r9
mov r7,r0
mov.l r9,@(r0,r5)
add #4,r7
mov.l @(28,r6),r0
shll2 r0
mov.l @(r0,r8),r3
mov r7,r0
mov.l r3,@(r0,r5)
add #4,r7
mov.l @(32,r6),r0
shll2 r0
mov.l @(r0,r8),r1
mov r7,r0
mov.l r1,@(r0,r5)
add #4,r7
mov.l @(36,r6),r0
shll2 r0
mov.l @(r0,r8),r2
mov r7,r0
mov.l r2,@(r0,r5)
add #4,r7
mov.l @(40,r6),r0
shll2 r0
mov.l @(r0,r8),r9
mov r7,r0
mov.l r9,@(r0,r5)
add #4,r7
mov.l @(44,r6),r0
shll2 r0
mov.l @(r0,r8),r3
mov r7,r0
mov.l r3,@(r0,r5)
add #48,r6
add #4,r7
can be changed into:
add r5,r7
L2:
mov.l @r6+,r0
shll2 r0
mov.l @(r0,r8),r9
mov.l r9,@r7
mov.l @r6+,r0
shll2 r0
mov.l @(r0,r8),r1
mov.l r1,@(4,r7)
mov.l @r6+,r0
shll2 r0
mov.l @(r0,r8),r9
mov.l r9,@(8,r7)
mov.l @r6+,r0
shll2 r0
mov.l @(r0,r8),r3
mov.l r3,@(12,r7)
mov.l @r6+,r0
shll2 r0
mov.l @(r0,r8),r1
mov.l r1,@(16,r7)
mov.l @r6+,r0
shll2 r0
mov.l @(r0,r8),r2
mov.l r2,@(20,r7)
mov.l @r6+,r0
shll2 r0
mov.l @(r0,r8),r9
mov.l r9,@(24,r7)
mov.l @r6+,r0
shll2 r0
mov.l @(r0,r8),r3
mov.l r3,@(28,r7)
mov.l @r6+,r0
shll2 r0
mov.l @(r0,r8),r1
mov.l r1,@(32,r7)
mov.l @r6+,r0
shll2 r0
mov.l @(r0,r8),r2
mov.l r2,@(36,r7)
mov.l @r6+,r0
shll2 r0
mov.l @(r0,r8),r9
mov.l r9,@(40,r7)
mov.l @r6+,r0
shll2 r0
mov.l @(r0,r8),r3
mov.l r3,@(44,r7)
add #48,r7
, and beyond the reduced instruction count, the reduction of r0 usage also
makes this code simpler to schedule.
--
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=24815