Basically, consider the following case (shortened, full testcase will be
attached):

/* Always-inline helper: casts both __m128 operands to __v4sf, passes them
   to __builtin_ia32_maxps, and casts the builtin's result back to __m128.  */
static __inline __m128 __attribute__((__always_inline__))
_mm_max_ps (__m128 __A, __m128 __B)
{ return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B); }

/* Plain static wrapper that forwards a and b to _mm_max_ps.  Unlike
   _mm_max_ps it carries no always_inline attribute, so whether it gets
   inlined is left to the compiler.  */
static __m128 mm_max_ps(const __m128 a, const __m128 b)
{ return _mm_max_ps(a,b); }

... more wrappers ...

/* Always-inline function that feeds a and b through a chain of mm_* wrapper
   calls (max, min, mul, div, or), ORs the intermediate vectors together into
   g1, and returns mm_movemask_ps(g1) converted to bool.
   NOTE(review): apart from mm_max_ps, the mm_* helpers and _mm_or_ps are not
   shown in this shortened excerpt; presumably they follow the same pattern
   as mm_max_ps / _mm_max_ps -- confirm against the full testcase.  */
static bool __attribute__((always_inline)) bloatit(const __m128 a, const __m128
b)
{
    const __m128
        v0 = mm_max_ps(a,b),
        v1 = mm_min_ps(a,b),
        v2 = mm_mul_ps(a,b),
        v3 = mm_div_ps(a,b),
        /* g0 ORs v0..v3 together; note the mix of mm_or_ps and _mm_or_ps.  */
        g0 = mm_or_ps(_mm_or_ps(_mm_or_ps(v0,v1), v2), v3),
        v4 = mm_min_ps(mm_or_ps(a,b),mm_div_ps(b,a)),
        v5 = mm_max_ps(mm_min_ps(a,mm_div_ps(b,a)), mm_or_ps(b,
mm_div_ps(b,g0))),
        g1 = mm_or_ps(g0,mm_or_ps(v4,v5));
    return mm_movemask_ps(g1);
}

/* Externally visible entry point of the testcase: calls bloatit on 14
   ordered pairs drawn from the six vector arguments and combines the
   results with bitwise AND.  Because bloatit is always_inline, each of
   these calls is a site where its body is expanded into this function.  */
bool finalblow(const __m128 a, const __m128 b, const __m128 c, const __m128 d,
               const __m128 e, const __m128 f)
{
    return
        bloatit(a,b) & bloatit(c,d) & bloatit(e,f) & bloatit(a,c) &
        bloatit(b,d) & bloatit(c,e) & bloatit(d,f) &
        bloatit(b,a) & bloatit(d,c) & bloatit(f,e) & bloatit(c,a) &
        bloatit(d,b) & bloatit(e,c) & bloatit(f,d);
}

What happens is that, as a first pass, all always_inline functions are inlined,
so bloatit is inlined into finalblow, causing the size of finalblow after
inlining to be greater than the max-function-growth limit.  After that, we
look at the mm_* routines used in bloatit and decide whether we can inline
them into finalblow - which we do _not_ do, because finalblow is already
bigger than the function-growth limit allows.  This happens even if we
correctly figure out that inlining the mm_* functions will _decrease_ the
size of finalblow.

Bad.

We also incorrectly count the number of calls to mm_* in finalblow, which we
count to be zero (0).


-- 
           Summary: Inlining always_inline functions causes further inlining
                    that reduces function size to fail
           Product: gcc
           Version: 4.1.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: tree-optimization
        AssignedTo: unassigned at gcc dot gnu dot org
        ReportedBy: rguenth at gcc dot gnu dot org


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=26667

Reply via email to