[Pixman] [PATCH 2/2] sse2: faster bilinear scaling with 7-bit precision (use _mm_madd_epi16)

Siarhei Siamashka Mon, 25 Jun 2012 16:46:03 -0700

Reducing the bilinear interpolation precision from 8 bits to 7 bits allows the
use of the PMADDWD instruction (_mm_madd_epi16) for horizontal interpolation.
It is much faster:


8-bit: image             firefox-fishtank   57.584   58.349   0.74%    3/3
7-bit: image             firefox-fishtank   51.139   51.229   0.30%    3/3

8-bit: src_8888_8888 =  L1: 228.71  L2: 226.52  M:224.82 ( 14.95%)  HT:183.22  VT:154.02  R:171.72  RT:109.36
7-bit: src_8888_8888 =  L1: 320.45  L2: 317.43  M:314.38 ( 20.77%)  HT:215.13  VT:177.35  R:204.46  RT:121.93
---
 pixman/pixman-private.h |    2 +-
 pixman/pixman-sse2.c    |   35 +++++++++++++++++++++++++----------
 2 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 020e026..18aa523 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -1,5 +1,5 @@
 /* bilinear interpolation precision (must be <= 8) */
-#define BILINEAR_INTERPOLATION_BITS 8
+#define BILINEAR_INTERPOLATION_BITS 7
 #define BILINEAR_INTERPOLATION_RANGE (1 << BILINEAR_INTERPOLATION_BITS)
 
 #ifndef __ASSEMBLER__
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 301bce5..f665b37 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5369,8 +5369,10 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
 #define BILINEAR_DECLARE_VARIABLES                                             
\
     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);     
\
     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);     
\
-    const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, 
BMSK);\
-    const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);           
\
+    const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, 
BMSK);\
+    const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);          
\
+    const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, 
BMSK);\
+    const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);          
\
     const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,      
\
                                          unit_x, unit_x, unit_x, unit_x);      
\
     const __m128i xmm_zero = _mm_setzero_si128 ();                             
\
@@ -5390,15 +5392,28 @@ do {                                                    
                        \
                                        xmm_wt),                                
\
                       _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero),     
\
                                        xmm_wb));                               
\
-    /* calculate horizontal weights */                                         
\
-    xmm_wh = _mm_add_epi16 (xmm_addc, _mm_xor_si128 (xmm_xorc,                 
\
+    if (BILINEAR_INTERPOLATION_BITS < 8)                                       
\
+    {                                                                          
\
+       /* calculate horizontal weights */                                      
\
+       xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7,            
\
                   _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));  
\
-    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);                                     
\
-    /* horizontal interpolation */                                             
\
-    xmm_lo = _mm_mullo_epi16 (a, xmm_wh);                                      
\
-    xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);                                      
\
-    a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),                    
\
-                      _mm_unpackhi_epi16 (xmm_lo, xmm_hi));                    
\
+       xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);                                  
\
+       /* horizontal interpolation */                                          
\
+       a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (             
\
+               a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh);                      
\
+    }                                                                          
\
+    else                                                                       
\
+    {                                                                          
\
+       /* calculate horizontal weights */                                      
\
+       xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8,            
\
+               _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));     
\
+       xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);                                  
\
+       /* horizontal interpolation */                                          
\
+       xmm_lo = _mm_mullo_epi16 (a, xmm_wh);                                   
\
+       xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);                                   
\
+       a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),                 
\
+                          _mm_unpackhi_epi16 (xmm_lo, xmm_hi));                
\
+    }                                                                          
\
     /* shift and pack the result */                                            
\
     a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2);                   
\
     a = _mm_packs_epi32 (a, a);                                                
        \
-- 
1.7.3.4

_______________________________________________
Pixman mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/pixman

[Pixman] [PATCH 2/2] sse2: faster bilinear scaling with 7-bit precision (use _mm_madd_epi16)

Reply via email to