[Pixman] [PATCH] sse2: faster bilinear interpolation (get rid of XOR instruction)

Siarhei Siamashka Sun, 27 Jan 2013 23:22:21 -0800

The old code calculated the horizontal weights for the right pixels
in the following way (assuming 8-bit interpolation precision for
simplicity):


  Start with "x = vx" and increment "x += ux" after each pixel.
  In this case the right pixel weight for interpolation can be
  calculated as "((x >> 8) ^ 0xFF) + 1", which is the same as
  "256 - (x >> 8)".

The new code instead:

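  Starts with "x = -(vx + 1)", performs the increment "x += -ux" after
  each pixel and calculates the right weights as just "(x >> 8) + 1",
  eliminating the need for the XOR operation in the inner loop. This
  works because "-(vx + 1)" equals "~vx" in two's complement, so the
  bitwise complement is already folded into x.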

So we have one fewer instruction on the critical path. Benchmarks
with "lowlevel-blt-bench -b src_8888_8888" using GCC 4.7.2 on an
x86-64 system with default optimizations:

Intel Core i7 860 (2.8GHz):
    before: src_8888_8888 =  L1: 359.00  L2: 354.78  M:348.82
    after:  src_8888_8888 =  L1: 402.24  L2: 391.12  M:386.51

Intel Core2 T7300 (2GHz):
    before: src_8888_8888 =  L1: 121.95  L2: 118.38  M:118.52
    after:  src_8888_8888 =  L1: 128.82  L2: 125.12  M:124.88

Intel Atom N450 (1.67GHz):
    before: src_8888_8888 =  L1:  64.25  L2:  62.37  M: 61.80
    after:  src_8888_8888 =  L1:  64.23  L2:  62.37  M: 61.82

Inspired by the "sse2_bilinear_interpolation" function (single
pixel interpolation) from:
    http://lists.freedesktop.org/archives/pixman/2013-January/002575.html
---
 pixman/pixman-sse2.c | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index fc873cc..2d74401 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5554,19 +5554,27 @@ FAST_NEAREST_MAINLOOP_COMMON 
(sse2_8888_n_8888_normal_OVER,
                              scaled_nearest_scanline_sse2_8888_n_8888_OVER,
                              uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
 
-#define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1)
-
-#define BILINEAR_DECLARE_VARIABLES                                             
\
+#if BILINEAR_INTERPOLATION_BITS < 8
+# define BILINEAR_DECLARE_VARIABLES                                            
\
+    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);     
\
+    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);     
\
+    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);           
\
+    const __m128i xmm_ux = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,    
\
+                                         unit_x, -unit_x, unit_x, -unit_x);    
\
+    const __m128i xmm_zero = _mm_setzero_si128 ();                             
\
+    __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1),               
\
+                                  vx, -(vx + 1), vx, -(vx + 1))
+#else
+# define BILINEAR_DECLARE_VARIABLES                                            
\
     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);     
\
     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);     
\
-    const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, 
BMSK);\
-    const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);          
\
-    const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, 
BMSK);\
-    const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);          
\
+    const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);           
\
     const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,      
\
-                                         unit_x, unit_x, unit_x, unit_x);      
\
+                                         -unit_x, -unit_x, -unit_x, -unit_x);  
\
     const __m128i xmm_zero = _mm_setzero_si128 ();                             
\
-    __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx)
+    __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx,                             
\
+                                  -(vx + 1), -(vx + 1), -(vx + 1), -(vx + 1))
+#endif
 
 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)                                    
\
 do {                                                                           
\
@@ -5585,8 +5593,8 @@ do {                                                      
                        \
     if (BILINEAR_INTERPOLATION_BITS < 8)                                       
\
     {                                                                          
\
        /* calculate horizontal weights */                                      
\
-       xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7,            
\
-                  _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));  
\
+       xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,                
\
+                                       16 - BILINEAR_INTERPOLATION_BITS));     
\
        xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);                                  
\
        /* horizontal interpolation */                                          
\
        a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (             
\
@@ -5595,8 +5603,8 @@ do {                                                      
                        \
     else                                                                       
\
     {                                                                          
\
        /* calculate horizontal weights */                                      
\
-       xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8,            
\
-               _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));     
\
+       xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,                
\
+                                       16 - BILINEAR_INTERPOLATION_BITS));     
\
        xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);                                  
\
        /* horizontal interpolation */                                          
\
        xmm_lo = _mm_mullo_epi16 (a, xmm_wh);                                   
\
-- 
1.7.12.4

_______________________________________________
Pixman mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/pixman

[Pixman] [PATCH] sse2: faster bilinear interpolation (get rid of XOR instruction)

Reply via email to