------- Comment #2 from rguenth at gcc dot gnu dot org 2008-02-27 15:19 ------- You probably hit some inliner size limits which are not connected to -ftemplate-depth-20. (That option should never change code generation).
g++ 4.3.0 seems to work like a charm btw. With g++ 4.2 you can use __attribute__((flatten)) to force recursively inlining all callees of a function. In your case there is no separate expression kernel evaluator, but the following will do it: int __attribute__((flatten)) evaluate () { int i, s = 0; for(i = 0; i < 1000000000; i++) s += Int(i).add(i).add(i).add(i).add(i).add(i).add(i).add(i).add(i) .add(i).add(i).add(i).add(i).add(i).add(i).add(i).add(i) #ifdef DEPTH17 .add(i) #endif #ifdef DEPTH18 .add(i).add(i) #endif .value(); return s; } int main() { cout << evaluate() << endl; } With 4.3 it is still way faster because it vectorizes the reduction by default which 4.2 can't do. Actually for this reason it may be that with depth18 the code gets slower because of register pressure and bad code generation and not missed inlining. Indeed. With DEPTH17 you get _Z8evaluatev: .LFB1471: xorl %edx, %edx xorl %eax, %eax .p2align 4,,7 .L2: leal (%rax,%rdx,8), %eax leal (%rax,%rdx,8), %eax addl %edx, %eax addl $1, %edx cmpl $1000000000, %edx jne .L2 rep ; ret while with DEPTH18 the following is generated: _Z8evaluatev: .LFB1471: subq $48, %rsp .LCFI0: xorl %edx, %edx xorl %ecx, %ecx .p2align 4,,7 .L2: movl %edx, -36(%rsp) movl %edx, -40(%rsp) movq -40(%rsp), %rax movl %edx, -28(%rsp) movl %edx, -32(%rsp) movl %edx, -20(%rsp) movl %edx, -24(%rsp) movl %edx, -12(%rsp) movq %rax, -120(%rsp) movq -32(%rsp), %rax movl %edx, -16(%rsp) movl %edx, -4(%rsp) movl %edx, -8(%rsp) movl %edx, 4(%rsp) movq %rax, -112(%rsp) movq -24(%rsp), %rax movl %edx, (%rsp) movl %edx, 12(%rsp) movl %edx, 8(%rsp) movl %edx, 20(%rsp) movq %rax, -104(%rsp) movq -16(%rsp), %rax movl %edx, 16(%rsp) movl %edx, 28(%rsp) movl %edx, 24(%rsp) movq %rax, -96(%rsp) movq -8(%rsp), %rax movq %rax, -88(%rsp) movq (%rsp), %rax movq %rax, -80(%rsp) movq 8(%rsp), %rax movq %rax, -72(%rsp) movq 16(%rsp), %rax movq %rax, -64(%rsp) movq 24(%rsp), %rax movq %rax, -56(%rsp) movl -52(%rsp), %eax addl -56(%rsp), %eax addl 
-60(%rsp), %eax addl -64(%rsp), %eax addl -68(%rsp), %eax addl -72(%rsp), %eax addl -76(%rsp), %eax addl -80(%rsp), %eax addl -84(%rsp), %eax addl -88(%rsp), %eax addl -92(%rsp), %eax addl -96(%rsp), %eax addl -100(%rsp), %eax addl -104(%rsp), %eax addl -108(%rsp), %eax addl -112(%rsp), %eax addl -116(%rsp), %eax addl -120(%rsp), %eax addl %edx, %eax addl $1, %edx addl %eax, %ecx cmpl $1000000000, %edx jne .L2 movl %ecx, %eax addq $48, %rsp ret scalarization seems to give up here (not for 4.3 again). Fixed in 4.3.0. -- rguenth at gcc dot gnu dot org changed: What |Removed |Added ---------------------------------------------------------------------------- Status|UNCONFIRMED |RESOLVED Keywords| |missed-optimization Resolution| |FIXED Target Milestone|--- |4.3.0 http://gcc.gnu.org/bugzilla/show_bug.cgi?id=35393