The combine function was store8888'ing the result, and all consumers were immediately load8888'ing it, causing lots of unnecessary pack and unpack instructions.
It's a very straight forward conversion, except for mmx_combine_over_u and mmx_combine_saturate_u. mmx_combine_over_u was testing the integer result to skip pixels, so we use the is_equal/is_zero functions to test the __m64 data directly without loading it into an integer register. For mmx_combine_saturate_u there's not a lot we can do, since it uses DIV_UN8. Signed-off-by: Matt Turner <[email protected]> --- pixman/pixman-mmx.c | 83 ++++++++++++++++++++++----------------------------- 1 files changed, 36 insertions(+), 47 deletions(-) diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c index 63edf18..a9dffd8 100644 --- a/pixman/pixman-mmx.c +++ b/pixman/pixman-mmx.c @@ -534,23 +534,20 @@ pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b) /* --------------- MMX code patch for fbcompose.c --------------------- */ -static force_inline uint32_t +static force_inline __m64 combine (const uint32_t *src, const uint32_t *mask) { - uint32_t ssrc = *src; + __m64 vsrc = load8888 (src); if (mask) { __m64 m = load8888 (mask); - __m64 s = load8888 (&ssrc); m = expand_alpha (m); - s = pix_multiply (s, m); - - store8888 (&ssrc, s); + vsrc = pix_multiply (vsrc, m); } - return ssrc; + return vsrc; } static void @@ -565,19 +562,17 @@ mmx_combine_over_u (pixman_implementation_t *imp, while (dest < end) { - uint32_t ssrc = combine (src, mask); - uint32_t a = ssrc >> 24; + __m64 vsrc = combine (src, mask); - if (a == 0xff) + /* if (vsrc & full_alpha) == full_alpha */ + if (is_equal (_mm_and_si64 (vsrc, MC (full_alpha)), MC (full_alpha))) { - *dest = ssrc; + store8888 (dest, vsrc); } - else if (ssrc) + else if (!is_zero (vsrc)) { - __m64 s, sa; - s = load8888 (&ssrc); - sa = expand_alpha (s); - store8888 (dest, over (s, sa, load8888 (dest))); + __m64 sa = expand_alpha (vsrc); + store8888 (dest, over (vsrc, sa, load8888 (dest))); } ++dest; @@ -601,11 +596,11 @@ mmx_combine_over_reverse_u (pixman_implementation_t *imp, while (dest < end) { __m64 d, da; - uint32_t s = combine (src, mask); + __m64 s = combine (src, mask); d = load8888 (dest); da = expand_alpha (d); - store8888 (dest, over (d, da, load8888 (&s))); + store8888 (dest, over (d, da, s)); ++dest; ++src; @@ -627,10 +622,9 @@ mmx_combine_in_u (pixman_implementation_t *imp, while (dest < end) { - __m64 x, a; - uint32_t ssrc = combine (src, mask); + __m64 a; + __m64 x = combine (src, mask); - x = load8888 (&ssrc); a = load8888 (dest); a = expand_alpha (a); x = pix_multiply (x, a); @@ -657,11 +651,10 @@ mmx_combine_in_reverse_u (pixman_implementation_t *imp, while (dest < end) { - __m64 x, a; - uint32_t ssrc = combine (src, mask); + __m64 a = combine (src, mask); + __m64 x; x = load8888 (dest); - a = load8888 (&ssrc); a = expand_alpha (a); x = pix_multiply (x, a); store8888 (dest, x); @@ -686,10 +679,9 @@ mmx_combine_out_u (pixman_implementation_t *imp, while (dest < end) { - __m64 x, a; - uint32_t ssrc = combine (src, mask); + __m64 a; + __m64 x = combine (src, mask); - x = load8888 (&ssrc); a = load8888 (dest); a = expand_alpha (a); a = negate (a); @@ -716,11 +708,10 @@ mmx_combine_out_reverse_u (pixman_implementation_t *imp, while (dest < end) { - __m64 x, a; - uint32_t ssrc = combine (src, mask); + __m64 a = combine (src, mask); + __m64 x; x = load8888 (dest); - a = load8888 (&ssrc); a = expand_alpha (a); a = negate (a); x = pix_multiply (x, a); @@ -747,10 +738,9 @@ mmx_combine_atop_u (pixman_implementation_t *imp, while (dest < end) { - __m64 s, da, d, sia; - uint32_t ssrc = combine (src, mask); + __m64 da, d, sia; + __m64 s = combine (src, mask); - s = load8888 (&ssrc); d = load8888 (dest); sia = expand_alpha (s); sia = negate (sia); @@ -780,10 +770,9 @@ mmx_combine_atop_reverse_u (pixman_implementation_t *imp, while (dest < end) { - __m64 s, dia, d, sa; - uint32_t ssrc = combine (src, mask); + __m64 dia, d, sa; + __m64 s = combine (src, mask); - s = load8888 (&ssrc); d = load8888 (dest); sa = expand_alpha (s); dia = expand_alpha (d); @@ -811,10 +800,9 @@ mmx_combine_xor_u (pixman_implementation_t *imp, while (dest < end) { - __m64 s, dia, d, sia; - uint32_t ssrc = combine (src, mask); + __m64 dia, d, sia; + __m64 s = combine (src, mask); - s = load8888 (&ssrc); d = load8888 (dest); sia = expand_alpha (s); dia = expand_alpha (d); @@ -843,10 +831,9 @@ mmx_combine_add_u (pixman_implementation_t *imp, while (dest < end) { - __m64 s, d; - uint32_t ssrc = combine (src, mask); + __m64 d; + __m64 s = combine (src, mask); - s = load8888 (&ssrc); d = load8888 (dest); s = pix_add (s, d); store8888 (dest, s); @@ -871,12 +858,14 @@ mmx_combine_saturate_u (pixman_implementation_t *imp, while (dest < end) { - uint32_t s = combine (src, mask); + uint32_t s, sa, da; uint32_t d = *dest; - __m64 ms = load8888 (&s); - __m64 md = load8888 (&d); - uint32_t sa = s >> 24; - uint32_t da = ~d >> 24; + __m64 ms = combine (src, mask); + __m64 md = load8888 (dest); + + store8888(&s, ms); + da = ~d >> 24; + sa = s >> 24; if (sa > da) { -- 1.7.3.4 _______________________________________________ Pixman mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/pixman
