On 3/13/06, Andrew Pinski <[EMAIL PROTECTED]> wrote:
> Actually the best way of improving the inline heuristics is to get
> a real testcase (and not some benchmark) where the inline heuristics
> is messed up.

Ah, you mean a brand new testcase because PR-21195 wasn't good enough?

$ /usr/local/gcc-4.1.0/bin/g++ -v
Using built-in specs.
Target: i686-pc-cygwin
Configured with: ../configure --prefix=/usr/local/gcc-4.1.0 --enable-languages=c,c++ --enable-threads=posix --with-system-zlib --disable-checking --disable-nls --disable-shared --disable-win32-registry --verbose --enable-bootstrap --with-gcc --with-gnu-ld --with-gnu-as --with-cpu=k8
Thread model: posix
gcc version 4.1.0

/usr/local/gcc-4.1.0/bin/g++ -g -O3 -march=k8 -msse2 -o pr-inline.o pr-inline.cc

#include <xmmintrin.h>

static __m128 mm_max_ps(const __m128 a, const __m128 b) { return _mm_max_ps(a,b); }
static __m128 mm_min_ps(const __m128 a, const __m128 b) { return _mm_min_ps(a,b); }
static __m128 mm_mul_ps(const __m128 a, const __m128 b) { return _mm_mul_ps(a,b); }
static __m128 mm_div_ps(const __m128 a, const __m128 b) { return _mm_div_ps(a,b); }
static __m128 mm_or_ps(const __m128 a, const __m128 b) { return _mm_or_ps(a,b); }
static int mm_movemask_ps(const __m128 a) { return _mm_movemask_ps(a); }

static __attribute__ ((always_inline)) bool bloatit(const __m128 a, const __m128 b) {
    const __m128
        v0 = mm_max_ps(a,b),
        v1 = mm_min_ps(a,b),
        v2 = mm_mul_ps(a,b),
        v3 = mm_div_ps(a,b),
        g0 = mm_or_ps(_mm_or_ps(_mm_or_ps(v0,v1), v2), v3),
        v4 = mm_min_ps(mm_or_ps(a,b),mm_div_ps(b,a)),
        v5 = mm_max_ps(mm_min_ps(a,mm_div_ps(b,a)), mm_or_ps(b, mm_div_ps(b,g0))),
        g1 = mm_or_ps(g0,mm_or_ps(v4,v5));
    return mm_movemask_ps(g1);
}

bool finalblow(const __m128 a, const __m128 b, const __m128 c, const __m128 d, const __m128 e, const __m128 f) {
    return
        bloatit(a,b) & bloatit(c,d) & bloatit(e,f) & bloatit(a,c) & bloatit(b,d) & bloatit(c,e) & bloatit(d,f) &
        bloatit(b,a) & bloatit(d,c) & bloatit(f,e) & bloatit(c,a) & bloatit(d,b) & bloatit(e,c) & bloatit(f,d);
}

int main() { return 0; }

00401080 <mm_mul_ps(float __vector, float __vector)>:
  401080: push %ebp
  401081: mulps %xmm1,%xmm0
  401084: mov %esp,%ebp
  401086: sub $0x8,%esp
  401089: leave
  40108a: ret
  40108b: nop
  40108c: lea 0x0(%esi),%esi

00401090 <mm_or_ps(float __vector, float __vector)>:
  401090: push %ebp
  401091: orps %xmm1,%xmm0
  401094: mov %esp,%ebp
  401096: sub $0x8,%esp
  401099: leave
  40109a: ret
  40109b: nop
  40109c: lea 0x0(%esi),%esi

004010a0 <mm_div_ps(float __vector, float __vector)>:
  4010a0: divps %xmm1,%xmm0
  4010a3: push %ebp
  4010a4: mov %esp,%ebp
  4010a6: sub $0x8,%esp
  4010a9: leave
  4010aa: ret
  4010ab: nop
  ...

004010e0 <finalblow(float __vector, float __vector, float __vector, float __vector, float __vector, float __vector)>:
  ...
  401101: call 4010c0 <mm_max_ps(float __vector, float __vector)>
  401106: movaps %xmm0,0xfffff958(%ebp)
  40110d: movaps 0xfffff8f8(%ebp),%xmm1
  401114: movaps 0xfffff908(%ebp),%xmm0
  40111b: call 4010b0 <mm_min_ps(float __vector, float __vector)>
  401120: movaps 0xfffff8f8(%ebp),%xmm1
  401127: movaps %xmm0,0xfffff948(%ebp)
  40112e: movaps 0xfffff908(%ebp),%xmm0
  401135: call 401080 <mm_mul_ps(float __vector, float __vector)>
  40113a: movaps 0xfffff8f8(%ebp),%xmm1
  401141: movaps %xmm0,0xfffff938(%ebp)
  401148: movaps 0xfffff908(%ebp),%xmm0
  40114f: call 4010a0 <mm_div_ps(float __vector, float __vector)>
  401154: movaps 0xfffff958(%ebp),%xmm1
  40115b: orps 0xfffff948(%ebp),%xmm1
  401162: movaps %xmm1,0xfffff958(%ebp)
  401169: movaps %xmm0,%xmm1
  40116c: movaps 0xfffff958(%ebp),%xmm0
  401173: orps 0xfffff938(%ebp),%xmm0
  40117a: call 401090 <mm_or_ps(float __vector, float __vector)>
  40117f: movaps 0xfffff908(%ebp),%xmm1
  401186: movaps %xmm0,0xfffff928(%ebp)
  40118d: movaps 0xfffff8f8(%ebp),%xmm0
  401194: call 4010a0 <mm_div_ps(float __vector, float __vector)>
  401199: movaps 0xfffff8f8(%ebp),%xmm1