For this code:

/*
 * Reduced testcase: copy 1024 ints from s to t, one element per iteration.
 *
 * Both pointers are __restrict-qualified, i.e. the caller promises that the
 * source and destination arrays do not overlap, so the loads from s and the
 * stores to t may legally be reordered with respect to each other.
 *
 *   t  destination array, written at indices 0..1023
 *   s  source array, read at indices 0..1023
 */
void f(int *__restrict t, const int *__restrict s) {
    unsigned long i;
    /* Constant trip count (1024), known at compile time. */
    for (i = 0; i < 1024; i++)
        t[i] = s[i];
}

With -O3 -funroll-loops, gcc-3.3 produces:

0000000000000000 <f>:
   0:   17 04 ff 47     clr     t9
   4:   ff 03 1f 23     lda     t10,1023
   8:   1f 04 ff 47     nop     
   c:   00 00 fe 2f     unop    
  10:   00 00 51 a0     ldl     t1,0(a1)
  14:   04 00 91 a0     ldl     t3,4(a1)
  18:   08 00 b1 a0     ldl     t4,8(a1)
  1c:   0c 00 d1 a0     ldl     t5,12(a1)
  20:   10 00 f1 a0     ldl     t6,16(a1)
  24:   14 00 11 a1     ldl     t7,20(a1)
  28:   18 00 71 a0     ldl     t2,24(a1)
  2c:   08 00 f7 22     lda     t9,8(t9)
  30:   1c 00 51 a2     ldl     a2,28(a1)
  34:   a0 07 f8 42     cmpule  t9,t10,v0
  38:   00 00 50 b0     stl     t1,0(a0)
  3c:   04 00 90 b0     stl     t3,4(a0)
  40:   20 00 31 22     lda     a1,32(a1)
  44:   08 00 b0 b0     stl     t4,8(a0)
  48:   0c 00 d0 b0     stl     t5,12(a0)
  4c:   10 00 f0 b0     stl     t6,16(a0)
  50:   14 00 10 b1     stl     t7,20(a0)
  54:   18 00 70 b0     stl     t2,24(a0)
  58:   1c 00 50 b2     stl     a2,28(a0)
  5c:   20 00 10 22     lda     a0,32(a0)
  60:   eb ff 1f f4     bne     v0,10 <f+0x10>
  64:   01 80 fa 6b     ret

However, mainline (4.1.0) produces:

0000000000000000 <f>:
   0:   06 04 ff 47     clr     t5
   4:   00 00 fe 2f     unop    
   8:   1f 04 ff 47     nop     
   c:   00 00 fe 2f     unop    
  10:   05 04 26 42     addq    a1,t5,t4
  14:   04 04 06 42     addq    a0,t5,t3
  18:   04 00 c6 20     lda     t5,4(t5)
  1c:   00 00 65 a0     ldl     t2,0(t4)
  20:   05 04 26 42     addq    a1,t5,t4
  24:   01 04 e6 47     mov     t5,t0
  28:   1c f0 46 20     lda     t1,-4068(t5)
  2c:   00 00 64 b0     stl     t2,0(t3)
  30:   04 04 06 42     addq    a0,t5,t3
  34:   04 00 c6 20     lda     t5,4(t5)
  38:   00 00 65 a0     ldl     t2,0(t4)
  3c:   05 04 26 42     addq    a1,t5,t4
  40:   00 00 64 b0     stl     t2,0(t3)
  44:   04 04 06 42     addq    a0,t5,t3
  48:   08 00 c1 20     lda     t5,8(t0)
  4c:   00 00 65 a0     ldl     t2,0(t4)
  50:   05 04 26 42     addq    a1,t5,t4
  54:   00 00 64 b0     stl     t2,0(t3)
  58:   04 04 06 42     addq    a0,t5,t3
  5c:   0c 00 c1 20     lda     t5,12(t0)
  60:   00 00 65 a0     ldl     t2,0(t4)
  64:   05 04 26 42     addq    a1,t5,t4
  68:   00 00 64 b0     stl     t2,0(t3)
  6c:   04 04 06 42     addq    a0,t5,t3
  70:   10 00 c1 20     lda     t5,16(t0)
  74:   00 00 65 a0     ldl     t2,0(t4)
  78:   05 04 26 42     addq    a1,t5,t4
  7c:   00 00 64 b0     stl     t2,0(t3)
  80:   04 04 06 42     addq    a0,t5,t3
  84:   14 00 c1 20     lda     t5,20(t0)
  88:   00 00 65 a0     ldl     t2,0(t4)
  8c:   05 04 26 42     addq    a1,t5,t4
  90:   00 00 64 b0     stl     t2,0(t3)
  94:   04 04 06 42     addq    a0,t5,t3
  98:   18 00 c1 20     lda     t5,24(t0)
  9c:   00 00 65 a0     ldl     t2,0(t4)
  a0:   05 04 26 42     addq    a1,t5,t4
  a4:   00 00 64 b0     stl     t2,0(t3)
  a8:   04 04 06 42     addq    a0,t5,t3
  ac:   1c 00 c1 20     lda     t5,28(t0)
  b0:   00 00 65 a0     ldl     t2,0(t4)
  b4:   00 00 64 b0     stl     t2,0(t3)
  b8:   d5 ff 5f f4     bne     t1,10 <f+0x10>
  bc:   01 80 fa 6b     ret

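That is, mainline no longer reorders the loads and stores: each load is
followed by its store before the next load is issued, whereas gcc-3.3 issues
all eight loads before the first store. It also apparently introduces lots of
silly induction variables (IVs): the load and store addresses are recomputed
with addq for every element instead of using constant offsets from a1 and a0.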

-- 
           Summary: Poor code from unrolled simple loop
           Product: gcc
           Version: 4.1.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P2
         Component: tree-optimization
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: falk at debian dot org
                CC: gcc-bugs at gcc dot gnu dot org
 GCC build triplet: alphaev68-unknown-linux-gnu
  GCC host triplet: alphaev68-unknown-linux-gnu
GCC target triplet: alphaev68-unknown-linux-gnu


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=22031

Reply via email to