Basically, consider the following case (shortened, full testcase will be
attached):
/* Intrinsic-style wrapper: casts both __m128 operands to __v4sf, passes them
   to __builtin_ia32_maxps and casts the result back to __m128.  Declared
   with the __always_inline__ attribute.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_max_ps (__m128 __A, __m128 __B)
{ return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B); }
/* Plain static forwarding wrapper around _mm_max_ps.  Unlike the function it
   calls, it carries no always_inline attribute, so whether it gets inlined
   is left to the compiler.  */
static __m128 mm_max_ps(const __m128 a, const __m128 b)
{ return _mm_max_ps(a,b); }
... more wrappers ...
/* always_inline helper that issues many calls to the mm_* wrappers on a and b,
   ORs the intermediate vectors together and returns mm_movemask_ps of the
   combined value, converted to bool.
   NOTE(review): only mm_max_ps is shown in this shortened testcase; the other
   mm_* wrappers are presumably defined analogously -- confirm against the
   full attachment.  */
static bool __attribute__((always_inline)) bloatit(const __m128 a, const __m128
b)
{
const __m128
v0 = mm_max_ps(a,b),
v1 = mm_min_ps(a,b),
v2 = mm_mul_ps(a,b),
v3 = mm_div_ps(a,b),
/* The two inner ORs call _mm_or_ps directly; only the outermost OR goes
   through the mm_or_ps wrapper.  */
g0 = mm_or_ps(_mm_or_ps(_mm_or_ps(v0,v1), v2), v3),
v4 = mm_min_ps(mm_or_ps(a,b),mm_div_ps(b,a)),
v5 = mm_max_ps(mm_min_ps(a,mm_div_ps(b,a)), mm_or_ps(b,
mm_div_ps(b,g0))),
g1 = mm_or_ps(g0,mm_or_ps(v4,v5));
return mm_movemask_ps(g1);
}
/* Externally visible entry point of the testcase: combines the results of
   14 bloatit calls, each on a different ordered pair of the six vector
   arguments, with bitwise AND and returns the result as bool.  */
bool finalblow(const __m128 a, const __m128 b, const __m128 c, const __m128 d,
const __m128 e, const __m128 f)
{
return
bloatit(a,b) & bloatit(c,d) & bloatit(e,f) & bloatit(a,c) &
bloatit(b,d) & bloatit(c,e) & bloatit(d,f) &
bloatit(b,a) & bloatit(d,c) & bloatit(f,e) & bloatit(c,a) &
bloatit(d,b) & bloatit(e,c) & bloatit(f,d);
}
What happens is that, as a first pass, all always_inline functions are inlined,
so bloatit is inlined into finalblow, which makes the size of finalblow after
inlining greater than the max-function-growth limit. After that, we look at
the mm_* routines used in bloatit and decide whether we can inline them into
finalblow - which we do _not_ do, because finalblow is already bigger than
the function-growth limit allows it to get. This happens even when we
correctly figure out that inlining the mm_* functions would _decrease_ the
size of finalblow.
Bad.
We also incorrectly count the number of calls to mm_* in finalblow: the count
we arrive at is zero (0).
--
Summary: Inlining always_inline functions causes further inlining
that reduces function size to fail
Product: gcc
Version: 4.1.0
Status: UNCONFIRMED
Keywords: missed-optimization
Severity: normal
Priority: P3
Component: tree-optimization
AssignedTo: unassigned at gcc dot gnu dot org
ReportedBy: rguenth at gcc dot gnu dot org
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=26667