https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65447
Bug ID: 65447 Summary: AArch64: iv-opt causes bad addressing Product: gcc Version: 5.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: tree-optimization Assignee: unassigned at gcc dot gnu.org Reporter: amker at gcc dot gnu.org Hi, For the case below, extracted from SPEC2006 (and even worse in the real case), loops containing a significant number of memory accesses are compiled to very inefficient code. This is due to iv-opt hitting a limit and choosing the wrong induction variable, resulting in addressing modes with huge offsets and all loads/stores expanded into 2 or 3 instructions. The source code looks like this: void foo (double *p) { int i; for (i = -20000; i < 200000; i+= 40) { p[i+0] = 1.0; p[i+1] = 1.0; p[i+2] = 1.0; p[i+3] = 1.0; p[i+4] = 1.0; p[i+5] = 1.0; p[i+6] = 1.0; p[i+7] = 1.0; p[i+8] = 1.0; p[i+9] = 1.0; p[i+10] = 1.0; p[i+11] = 1.0; p[i+12] = 1.0; p[i+13] = 1.0; p[i+14] = 1.0; p[i+15] = 1.0; p[i+16] = 1.0; p[i+17] = 1.0; p[i+18] = 1.0; p[i+19] = 1.0; p[i+20] = 1.0; p[i+21] = 1.0; p[i+22] = 1.0; p[i+23] = 1.0; p[i+24] = 1.0; p[i+25] = 1.0; p[i+26] = 1.0; p[i+27] = 1.0; p[i+28] = 1.0; p[i+29] = 1.0; p[i+30] = 1.0; p[i+31] = 1.0; p[i+32] = 1.0; p[i+33] = 1.0; p[i+34] = 1.0; p[i+35] = 1.0; p[i+36] = 1.0; p[i+37] = 1.0; p[i+38] = 1.0; p[i+39] = 1.0; } } And here is a comparison of the generated assembly and the optimal one: *** test.S 2015-03-17 17:04:41.677033862 +0800 --- ../../../trunk-orig/target/bin/test.S 2015-03-17 17:03:45.377033869 +0800 *************** *** 7,40 **** .type foo, %function foo: fmov d0, 1.0e+0 ! sub x1, x0, #159744 ! add x2, x0, 1597440 ! sub x0, x1, #256 ! add x1, x2, 2560 .p2align 2 .L2: ! stp d0, d0, [x0] ! stp d0, d0, [x0, 16] ! stp d0, d0, [x0, 32] ! stp d0, d0, [x0, 48] ! stp d0, d0, [x0, 64] ! stp d0, d0, [x0, 80] ! stp d0, d0, [x0, 96] ! stp d0, d0, [x0, 112] ! stp d0, d0, [x0, 128] ! stp d0, d0, [x0, 144] ! stp d0, d0, [x0, 160] ! stp d0, d0, [x0, 176] ! stp d0, d0, [x0, 192] ! stp d0, d0, [x0, 208] ! stp d0, d0, [x0, 224] !
stp d0, d0, [x0, 240] ! add x0, x0, 320 ! cmp x1, x0 ! stp d0, d0, [x0, -64] ! stp d0, d0, [x0, -48] ! stp d0, d0, [x0, -32] ! stp d0, d0, [x0, -16] bne .L2 ret .size foo, .-foo --- 7,53 ---- .type foo, %function foo: fmov d0, 1.0e+0 ! mov x8, 56064 ! movk x8, 0x1a, lsl 16 ! mov x3, 0 .p2align 2 .L2: ! add x2, x0, x3 ! add x3, x3, 320 ! sub x1, x2, #159744 ! sub x2, x2, #155648 ! sub x4, x2, #4088 ! sub x7, x2, #4080 ! stp d0, d0, [x1, -256] ! sub x6, x2, #4072 ! sub x5, x2, #4064 ! stp d0, d0, [x1, -240] ! cmp x3, x8 ! stp d0, d0, [x1, -224] ! stp d0, d0, [x1, -208] ! stp d0, d0, [x1, -192] ! stp d0, d0, [x1, -176] ! stp d0, d0, [x1, -160] ! stp d0, d0, [x1, -144] ! stp d0, d0, [x1, -128] ! stp d0, d0, [x1, -112] ! stp d0, d0, [x1, -96] ! stp d0, d0, [x1, -80] ! stp d0, d0, [x1, -64] ! stp d0, d0, [x1, -48] ! stp d0, d0, [x1, -32] ! stp d0, d0, [x1, -16] ! str d0, [x1] ! sub x1, x2, #4048 ! str d0, [x4] ! sub x4, x2, #4056 ! sub x2, x2, #4040 ! str d0, [x7] ! str d0, [x6] ! str d0, [x5] ! str d0, [x4] ! str d0, [x1] ! str d0, [x2] bne .L2 ret .size foo, .-foo Actually, in this case most IVs differ from each other only by a constant offset from the base address; they point to the same memory object and have the same step. These address-type IVs should be categorized into a single group, as if they were ONE IV use. As a result, the number of IV uses decreases, so we can run the expensive IV selection algorithm and make a better choice. I can see this only hits architectures like arm/aarch64, because they have more addressing modes than the simple register-direct one, and they also don't support arbitrary constant offsets in memory references. But, anyway, this should be handled as a target-independent issue. I am working on this.