I did some analysis of why gfortran performs poorly on the gas_dyn benchmark of the Polyhedron benchmark suite. See my analysis at
http://gcc.gnu.org/ml/fortran/2007-04/msg00494.html In short, GCC should use reciprocal and reciprocal square root instructions (available in single precision for SSE and AltiVec) when possible. These instructions are very fast: a few cycles vs. dozens or hundreds of cycles for normal division and square root instructions. However, as these instructions are accurate only to 12 bits, they should be enabled only with -ffast-math (or some separate option that gets included with -ffast-math). The following C program demonstrates the issue; for each of its functions it should be possible to use reciprocal and/or reciprocal square root instructions instead of normal div and sqrt: #include <math.h> float recip1 (float a) { return 1.0f/a; } float recip2 (float a, float b) { return a/b; } float rsqrt1 (float a) { return 1.0f/sqrtf(a); } float rsqrt2 (float a, float b) { /* Mathematically equivalent to 1/sqrt(b*(1/a)) */ return sqrtf(a/b); } asm output (compiled with -std=c99 -O3 -c -Wall -pedantic -march=k8 -mfpmath=sse -ffast-math -S): .file "recip.c" .text .p2align 4,,15 .globl recip1 .type recip1, @function recip1: pushl %ebp movl %esp, %ebp subl $4, %esp movss .LC0, %xmm0 divss 8(%ebp), %xmm0 movss %xmm0, -4(%ebp) flds -4(%ebp) leave ret .size recip1, .-recip1 .p2align 4,,15 .globl recip2 .type recip2, @function recip2: pushl %ebp movl %esp, %ebp movss 8(%ebp), %xmm0 divss 12(%ebp), %xmm0 movss %xmm0, 8(%ebp) flds 8(%ebp) leave ret .size recip2, .-recip2 .p2align 4,,15 .globl rsqrt2 .type rsqrt2, @function rsqrt2: pushl %ebp movl %esp, %ebp subl $4, %esp movss 8(%ebp), %xmm0 divss 12(%ebp), %xmm0 sqrtss %xmm0, %xmm0 movss %xmm0, -4(%ebp) flds -4(%ebp) leave ret .size rsqrt2, .-rsqrt2 .p2align 4,,15 .globl rsqrt1 .type rsqrt1, @function rsqrt1: pushl %ebp movl %esp, %ebp subl $4, %esp movss .LC0, %xmm0 sqrtss 8(%ebp), %xmm1 divss %xmm1, %xmm0 movss %xmm0, -4(%ebp) flds -4(%ebp) leave ret .size rsqrt1, .-rsqrt1 .section .rodata.cst4,"aM",@progbits,4 .align 4 .LC0: .long
1065353216 .ident "GCC: (GNU) 4.3.0 20070426 (experimental)" .section .note.GNU-stack,"",@progbits As can be seen, GCC uses divss and sqrtss instead of rcpss and rsqrtss. Of course, there are vectorized versions of these instructions too, rcpps and rsqrtps, that should be used when appropriate (vectorization is important, e.g., for gas_dyn). -- Summary: Use reciprocal and reciprocal square root with -ffast-math Product: gcc Version: 4.3.0 Status: UNCONFIRMED Keywords: missed-optimization Severity: normal Priority: P3 Component: middle-end AssignedTo: unassigned at gcc dot gnu dot org ReportedBy: jb at gcc dot gnu dot org GCC target triplet: i686-pc-linux-gnu http://gcc.gnu.org/bugzilla/show_bug.cgi?id=31723