The old code calculated the horizontal weights for right pixels
in the following way (for simplicity, assume 8-bit interpolation
precision):
Start with "x = vx" and increment with "x += ux" after each pixel.
In this case, the right pixel weight for interpolation can be calculated
as "((x >> 8) ^ 0xFF) + 1", which is the same as "256 - (x >> 8)".
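The new code instead starts with "x = -(vx + 1)", performs the
increment "x += -ux" after each pixel, and calculates the right
weights as just "(x >> 8) + 1", eliminating the need for the XOR
operation in the inner loop. This works because "-(vx + 1)" equals
"~vx" in two's complement arithmetic, so the bit inversion is folded
into the initial value and the step.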
So we have one fewer instruction on the critical path. Benchmarks
with "lowlevel-blt-bench -b src_8888_8888" using GCC 4.7.2 on an
x86-64 system with default optimizations:
Intel Core i7 860 (2.8GHz):
before: src_8888_8888 = L1: 359.00 L2: 354.78 M:348.82
after: src_8888_8888 = L1: 402.24 L2: 391.12 M:386.51
Intel Core2 T7300 (2GHz):
before: src_8888_8888 = L1: 121.95 L2: 118.38 M:118.52
after: src_8888_8888 = L1: 128.82 L2: 125.12 M:124.88
Intel Atom N450 (1.67GHz):
before: src_8888_8888 = L1: 64.25 L2: 62.37 M: 61.80
after: src_8888_8888 = L1: 64.23 L2: 62.37 M: 61.82
Inspired by the "sse2_bilinear_interpolation" function (single
pixel interpolation) from:
http://lists.freedesktop.org/archives/pixman/2013-January/002575.html
---
pixman/pixman-sse2.c | 34 +++++++++++++++++++++-------------
1 file changed, 21 insertions(+), 13 deletions(-)
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index fc873cc..2d74401 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5554,19 +5554,27 @@ FAST_NEAREST_MAINLOOP_COMMON
(sse2_8888_n_8888_normal_OVER,
scaled_nearest_scanline_sse2_8888_n_8888_OVER,
uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
-#define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1)
-
-#define BILINEAR_DECLARE_VARIABLES
\
+#if BILINEAR_INTERPOLATION_BITS < 8
+# define BILINEAR_DECLARE_VARIABLES
\
+ const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);
\
+ const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);
\
+ const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);
\
+ const __m128i xmm_ux = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,
\
+ unit_x, -unit_x, unit_x, -unit_x);
\
+ const __m128i xmm_zero = _mm_setzero_si128 ();
\
+ __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1),
\
+ vx, -(vx + 1), vx, -(vx + 1))
+#else
+# define BILINEAR_DECLARE_VARIABLES
\
const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);
\
const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);
\
- const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK,
BMSK);\
- const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);
\
- const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0,
BMSK);\
- const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);
\
+ const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);
\
const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,
\
- unit_x, unit_x, unit_x, unit_x);
\
+ -unit_x, -unit_x, -unit_x, -unit_x);
\
const __m128i xmm_zero = _mm_setzero_si128 ();
\
- __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx)
+ __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx,
\
+ -(vx + 1), -(vx + 1), -(vx + 1), -(vx + 1))
+#endif
#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)
\
do {
\
@@ -5585,8 +5593,8 @@ do {
\
if (BILINEAR_INTERPOLATION_BITS < 8)
\
{
\
/* calculate horizontal weights */
\
- xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7,
\
- _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));
\
+ xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,
\
+ 16 - BILINEAR_INTERPOLATION_BITS));
\
xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);
\
/* horizontal interpolation */
\
a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (
\
@@ -5595,8 +5603,8 @@ do {
\
else
\
{
\
/* calculate horizontal weights */
\
- xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8,
\
- _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));
\
+ xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,
\
+ 16 - BILINEAR_INTERPOLATION_BITS));
\
xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);
\
/* horizontal interpolation */
\
xmm_lo = _mm_mullo_epi16 (a, xmm_wh);
\
--
1.7.12.4
_______________________________________________
Pixman mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/pixman