------- Comment #27 from ubizjak at gmail dot com 2008-01-22 12:20 -------
As already noted by Richi in Comment #9, the difference is in the usage of %rax.
gcc-4.2 generates:

...
        addq    $7, %rax
        leaq    (%rax,%rbp,2), %r10
        leaq    (%rax,%rdx,2), %rdx
        leaq    (%rax,%rdi,2), %rdi
        movq    (%rcx), %rsi
        movq    (%r13), %rcx
        leaq    (%rax,%r9,2), %r9
        leaq    (%rax,%r8,2), %r8
        leaq    (%rax,%r14,2), %r11
        addq    $8, %rbp
        movsd   (%rdx), %xmm3
        leaq    (%rax,%rsi,2), %rsi
        leaq    (%rax,%rcx,2), %rcx
...
        movsd   %xmm7, (%rcx)
        subsd   %xmm1, %xmm10
        addsd   %xmm1, %xmm0
        movsd   %xmm8, (%rsi)
        movsd   %xmm0, (%rdi)
        movapd  %xmm12, %xmm0
        subsd   %xmm3, %xmm12
        addsd   %xmm3, %xmm0
        movsd   %xmm0, (%r8)
        movsd   %xmm10, (%r9)
        movsd   %xmm12, (%rdx)
        jg      .L26

whereas gcc-4.3 limps along with:

...
        leaq    7(%rax), %r9
        movq    %rbx, -64(%rsp)
        movq    -56(%rsp), %rcx
        addq    %r10, %r10
        movsd   7(%rax,%rdx), %xmm3
        movsd   (%r9,%rbx,2), %xmm8
        movq    (%r11), %rbx
        movsd   7(%rax,%r10), %xmm5
        addq    %r8, %r8
        addq    %rdi, %rdi
        movsd   7(%rax,%r8), %xmm12
        movsd   15(%rbx), %xmm2
        leaq    (%r9,%rbp,2), %r9
        movsd   7(%rbx), %xmm1
...
        movsd   %xmm0, 7(%rax,%r9,2)
        movapd  %xmm10, %xmm0
        movsd   %xmm7, 7(%rax,%rcx)
        subsd   %xmm1, %xmm10
        addsd   %xmm1, %xmm0
        movsd   %xmm8, 7(%rax,%rsi)
        movsd   %xmm0, 7(%rax,%rdi)
        movapd  %xmm12, %xmm0
        subsd   %xmm3, %xmm12
        addsd   %xmm3, %xmm0
        movsd   %xmm0, 7(%rax,%r8)
        movsd   %xmm10, 7(%rax,%r10)
        movsd   %xmm12, 7(%rax,%rdx)
        jg      .L17

The difference is in the offset addresses. Looking at the tree dumps, it is
obvious that the problem is in the fre pass. At the end of the loop (line 685+
in _.034.fre) gcc-4.2 transforms every sequence of:

  D.2013_432 = ___fp_256 + 40B;
  D.2014_433 = *D.2013_432;
  D.2068_434 = (long int *) D.2014_433;
  D.2069_435 = D.2068_434 + 7B;
  D.2070_436 = (long int) D.2069_435;
  D.2094_437 = ___r3_35 << 1;
  D.2095_438 = D.2070_436 + D.2094_437;
  D.2096_439 = (double *) D.2095_438;
  *D.2096_439 = ___F64V53_431;
  D.2013_440 = ___fp_256 + 40B;
  D.2014_441 = *D.2013_440;
  D.2068_442 = (long int *) D.2014_441;
  D.2069_443 = D.2068_442 + 7B;
  D.2070_444 = (long int) D.2069_443;
  D.2091_445 = ___r4_257 << 1;
  D.2092_446 = D.2070_444 + D.2091_445;
  D.2093_447 = (double *) D.2092_446;
  *D.2093_447 = ___F64V52_430;
  D.2013_448 = ___fp_256 + 40B;
  D.2014_449 = *D.2013_448;
  D.2068_450 = (long int *) D.2014_449;
  D.2069_451 = D.2068_450 + 7B;
  D.2070_452 = (long int) D.2069_451;
  ...

into:

  D.2013_432 = D.2013_286;
  D.2014_433 = D.2014_287;
  D.2068_434 = D.2068_288;
  D.2069_435 = D.2069_289;
  D.2070_436 = D.2070_290;
  D.2094_437 = D.2094_366;
  D.2095_438 = D.2095_367;
  D.2096_439 = D.2096_368;
  *D.2096_439 = ___F64V53_431;
  D.2013_440 = D.2013_286;
  D.2014_441 = D.2014_287;
  D.2068_442 = D.2068_288;
  D.2069_443 = D.2069_289;
  D.2070_444 = D.2070_290;
  D.2091_445 = D.2091_357;
  D.2092_446 = D.2092_358;
  D.2093_447 = D.2093_359;
  *D.2093_447 = ___F64V52_430;
  D.2013_448 = D.2013_286;
  D.2014_449 = D.2014_287;
  D.2068_450 = D.2068_288;
  D.2069_451 = D.2069_289;
  D.2070_452 = D.2070_290;
  D.1994_453 = D.1994_258;
  D.2040_454 = D.2040_347;
  D.2041_455 = D.2041_348;
  D.2089_456 = D.2089_349;
  D.2090_457 = D.2090_350;
  ...
and this is optimized in further passes into:

  *D.2096 = ___F64V32 + ___F64V45;
  *D.2093 = ___F64V31 + ___F64V42;
  *D.2090 = ___F64V32 - ___F64V45;
  *D.2088 = ___F64V31 - ___F64V42;
  *D.2084 = ___F64V28 + ___F64V39;
  *D.2081 = ___F64V27 + ___F64V36;
  *D.2077 = ___F64V28 - ___F64V39;
  *D.2074 = ___F64V27 - ___F64V36;

However, for some reason gcc-4.3 transforms only _some_ of the statements
(line 708+ in the _.085t.fre dump), creating:

  D.1683_428 = D.1683_282;
  D.1684_429 = D.1684_283;
  D.1738_430 = D.1738_284;
  D.1739_431 = D.1739_285;
  D.1740_432 = D.1740_286;
  D.1764_433 = D.1764_362;
  D.1765_434 = D.1765_363;
  D.1766_435 = D.1766_364;
  *D.1766_435 = ___F64V53_427;
  D.1683_436 = D.1683_282;
  D.1684_437 = *D.1683_436;
  D.1738_438 = (long unsigned int) D.1684_437;
  D.1739_439 = D.1738_438 + 7;
  D.1740_440 = (long int) D.1739_439;
  D.1761_441 = D.1761_353;
  D.1762_442 = D.1740_440 + D.1761_441;
  D.1763_443 = (double *) D.1762_442;
  *D.1763_443 = ___F64V52_426;
  D.1683_444 = D.1683_282;
  D.1684_445 = *D.1683_444;
  D.1738_446 = (long unsigned int) D.1684_445;
  D.1739_447 = D.1738_446 + 7;
  D.1740_448 = (long int) D.1739_447;
  ...

which leaves us with:

  *D.1766 = ___F64V32 + ___F64V45;
  *(double *) (D.1761 + (long int) ((long unsigned int) *pretmp.33 + 7)) = ___F64V31 + ___F64V42;
  *(double *) ((long int) ((long unsigned int) *pretmp.33 + 7) + (*temp.65 << 1)) = ___F64V32 - ___F64V45;
  *(double *) ((long int) ((long unsigned int) *pretmp.33 + 7) + (*D.1685 << 1)) = ___F64V31 - ___F64V42;
  *(double *) ((long int) ((long unsigned int) *pretmp.33 + 7) + (*temp.61 << 1)) = ___F64V28 + ___F64V39;
  *(double *) ((long int) ((long unsigned int) *pretmp.33 + 7) + (*pretmp.152 << 1)) = ___F64V27 + ___F64V36;
  *(double *) ((long int) ((long unsigned int) *pretmp.33 + 7) + (*pretmp.147 << 1)) = ___F64V28 - ___F64V39;
  *(double *) ((long int) ((long unsigned int) *pretmp.33 + 7) + (*___fp.47 << 1)) = ___F64V27 - ___F64V36;

and so produces the suboptimal asm shown above.

-- 
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=33928
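P.S. For readers not staring at the dumps, here is a minimal C sketch of the
redundancy involved; the function and variable names are illustrative only and
are not taken from the testcase. The base address (the pointer loaded through
___fp, converted to an integer and offset by 7) is recomputed before every
store. FRE is expected to value-number all of those recomputations to a single
name, so the backend can keep the result in one register (the addq $7, %rax of
the gcc-4.2 loop) instead of folding the "+ 7" back into every address, which
is what produces the 7(%rax,...) operands in the gcc-4.3 loop:

  #include <stdint.h>

  /* Illustrative sketch only; names do not come from the testcase.
     Each store recomputes the same subexpression: load the pointer,
     convert it to an integer, add the constant 7.  */
  void
  stores_as_written (long **fp, long r3, long r4, double v52, double v53)
  {
    *(double *) ((uintptr_t) *fp + 7 + (r3 << 1)) = v53;
    *(double *) ((uintptr_t) *fp + 7 + (r4 << 1)) = v52;
  }

  /* What FRE is expected to turn it into: the base is computed once
     and reused, so the backend can keep it in a single register and
     address the stores as (%base,%index,2).  */
  void
  stores_after_fre (long **fp, long r3, long r4, double v52, double v53)
  {
    uintptr_t base = (uintptr_t) *fp + 7;

    *(double *) (base + (r3 << 1)) = v53;
    *(double *) (base + (r4 << 1)) = v52;
  }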