[Pixman] [PATCH] mmx: compile on MIPS for Loongson-3A MMI optimizations

xianjudiao Mon, 24 Sep 2018 18:58:25 -0700

From: Xianju Diao <[email protected]>

make check:
        When I enable USE_OPENMP, the 'glyph-test' and 'cover-test' tests
fail on Loongson-3A3000.
        Neither of the two tests passes even when the optimized code is
not used. This may be a multi-core synchronization
        bug in the CPU; I will continue to debug this problem.


benchmark:
        Running cairo-perf-trace benchmark on Loongson-3A.
                                              image             image16
        gvim                              5.425 -> 5.069     5.531 -> 5.236
        poppler-reseau                    2.149 -> 2.13      2.152 -> 2.139
        swfdec-giant-steps-full          18.672 -> 8.215    33.167 -> 18.28
        swfdec-giant-steps                7.014 -> 2.455    12.48  -> 5.982
        xfce4-terminal-al                13.695 -> 5.241    15.703 -> 5.859
        gnome-system-monitor             12.783 -> 7.058    12.780 -> 7.104
        grads-heat-map                    0.482 -> 0.486     0.516 -> 0.514
        firefox-talos-svg               141.138 -> 134.621 152.495 -> 159.069
        firefox-talos-gfx                23.119 -> 14.437   24.870 -> 15.161
        firefox-world-map                32.018 -> 27.139   33.817 -> 28.085
        firefox-periodic-table           12.305 -> 12.443   12.876 -> 12.913
        evolution                         7.071 -> 3.564     8.550 -> 3.784
        firefox-planet-gnome             77.926 -> 67.526   81.554 -> 65.840
        ocitysmap                         4.934 -> 1.702     4.937 -> 1.701
---
 configure.ac               |    7 +-
 pixman/loongson-mmintrin.h |   12 +
 pixman/pixman-combine32.h  |    6 +
 pixman/pixman-mmx.c        | 1085 +++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 1105 insertions(+), 5 deletions(-)

diff --git a/configure.ac b/configure.ac
index e833e45..3e3dde5 100644
--- a/configure.ac
+++ b/configure.ac
@@ -154,9 +154,9 @@ AC_CHECK_DECL([__amd64], [AMD64_ABI="yes"], 
[AMD64_ABI="no"])
 # has set CFLAGS.
 if test $SUNCC = yes &&                        \
    test "x$test_CFLAGS" = "x" &&       \
-   test "$CFLAGS" = "-g"
+   test "$CFLAGS" = "-g -mabi=n64"
 then
-  CFLAGS="-O -g"
+  CFLAGS="-O -g -mabi=n64"
 fi
 
 # 
@@ -183,6 +183,7 @@ AC_SUBST(LT_VERSION_INFO)
 # Check for dependencies
 
 PIXMAN_CHECK_CFLAG([-Wall])
+PIXMAN_CHECK_CFLAG([-mabi=n64])
 PIXMAN_CHECK_CFLAG([-Wdeclaration-after-statement])
 PIXMAN_CHECK_CFLAG([-Wno-unused-local-typedefs])
 PIXMAN_CHECK_CFLAG([-fno-strict-aliasing])
@@ -273,7 +274,7 @@ dnl 
===========================================================================
 dnl Check for Loongson Multimedia Instructions
 
 if test "x$LS_CFLAGS" = "x" ; then
-    LS_CFLAGS="-march=loongson2f"
+    LS_CFLAGS="-march=loongson3a"
 fi
 
 have_loongson_mmi=no
diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
index 086c6e0..7c0177e 100644
--- a/pixman/loongson-mmintrin.h
+++ b/pixman/loongson-mmintrin.h
@@ -110,6 +110,18 @@ _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
        return ret;
 }
 
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
+{
+       __m64 ret;
+       asm("pcmpgth %0, %1, %2\n\t"
+          : "=f" (ret)
+          : "f" (__m1), "f" (__m2)
+       );
+       return ret;
+}
+
+
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_empty (void)
 {
diff --git a/pixman/pixman-combine32.h b/pixman/pixman-combine32.h
index cdd56a6..27f62d9 100644
--- a/pixman/pixman-combine32.h
+++ b/pixman/pixman-combine32.h
@@ -14,6 +14,12 @@
 #define RB_ONE_HALF 0x800080
 #define RB_MASK_PLUS_ONE 0x10000100
 
+#define RGB_MASK 0xffffff
+#define RGB_DMASK 0xffffffffffffULL
+#define R_DMASK 0x0000ffff00000000ULL
+#define G_DMASK 0x00000000ffff0000ULL
+#define B_DMASK 0x000000000000ffffULL
+
 #define ALPHA_8(x) ((x) >> A_SHIFT)
 #define RED_8(x) (((x) >> R_SHIFT) & MASK)
 #define GREEN_8(x) (((x) >> G_SHIFT) & MASK)
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index dec3974..e17a65e 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -59,6 +59,71 @@ _mm_empty (void)
 }
 #endif
 
+#define COMBINE_A_OUT 1
+#define COMBINE_A_IN  2
+#define COMBINE_B_OUT 4
+#define COMBINE_B_IN  8
+
+#define COMBINE_CLEAR   0
+#define COMBINE_A       (COMBINE_A_OUT | COMBINE_A_IN)
+#define COMBINE_B       (COMBINE_B_OUT | COMBINE_B_IN)
+#define COMBINE_A_OVER  (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_A_IN)
+#define COMBINE_B_OVER  (COMBINE_A_OUT | COMBINE_B_OUT | COMBINE_B_IN)
+#define COMBINE_A_ATOP  (COMBINE_B_OUT | COMBINE_A_IN)
+#define COMBINE_B_ATOP  (COMBINE_A_OUT | COMBINE_B_IN)
+#define COMBINE_XOR     (COMBINE_A_OUT | COMBINE_B_OUT)
+
+/* There is no SIMD division instruction, so these helpers stay scalar.
+ * Portion covered by a but not b:
+ * min (1, (1-b) / a)
+ */
+static uint8_t
+combine_disjoint_out_part (uint8_t a, uint8_t b)
+{
+
+    b = ~b;                
+    if (b >= a)            
+       return MASK;       
+    return DIV_UN8 (b, a);    
+}
+
+/* portion covered by both a and b 
+ * max (1-(1-b)/a, 0)
+ */
+static uint8_t
+combine_disjoint_in_part (uint8_t a, uint8_t b)
+{
+
+    b = ~b;                 
+    if (b >= a)            
+       return 0;          
+    return ~DIV_UN8(b, a);   
+}
+
+/* portion covered by a but not b
+ * max (1-b/a, 0)
+ */
+static uint8_t
+combine_conjoint_out_part (uint8_t a, uint8_t b)
+{
+
+    if (b >= a)             
+       return 0x00;       
+    return ~DIV_UN8(b, a);   
+}
+
+/* portion covered by both a and b 
+ * min (1, b/a)
+ */
+static uint8_t
+combine_conjoint_in_part (uint8_t a, uint8_t b)
+{
+
+    if (b >= a)            
+       return MASK;       
+    return DIV_UN8 (b, a);    
+}
+
 #ifdef USE_X86_MMX
 # if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
 #  include <xmmintrin.h>
@@ -79,6 +144,8 @@ _mm_movemask_pi8 (__m64 __A)
     return ret;
 }
 
+#define __OPTIMIZE__
+#ifdef  __OPTIMIZE__
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
 _mm_mulhi_pu16 (__m64 __A, __m64 __B)
 {
@@ -89,6 +156,7 @@ _mm_mulhi_pu16 (__m64 __A, __m64 __B)
     return __A;
 }
 
+#else
 # define _mm_shuffle_pi16(A, N)                                                
\
     ({                                                                 \
        __m64 ret;                                                      \
@@ -103,6 +171,7 @@ _mm_mulhi_pu16 (__m64 __A, __m64 __B)
 # endif
 #endif
 
+#endif
 #ifndef _MSC_VER
 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
@@ -710,6 +779,34 @@ combine (const uint32_t *src, const uint32_t *mask)
     return vsrc;
 }
 
+static force_inline void
+mmx_combine_mask_ca(const uint32_t *src, const uint32_t *mask, __m64 *s64, 
__m64 *m64)
+{
+    __m64 res, tmp;
+    
+    if(!(*mask))
+       {
+           *s64 = 0;
+           *m64 = 0;
+           return;
+       }
+    
+    *s64 = load8888(src);
+    
+    if (*mask == ~0)
+       {
+           *m64 = expand_alpha(*s64);
+           return;
+       }
+    
+    *m64 = load8888(mask);
+    
+    res = pix_multiply(*s64, *m64);
+    tmp = expand_alpha(*s64);
+    *s64 = res;
+    *m64 = pix_multiply(*m64, tmp);
+}
+
 static force_inline __m64
 core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
 {
@@ -729,6 +826,39 @@ core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
 }
 
 static void
+mmx_combine_disjoint_over_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                         uint32_t *                dest,
+                         const uint32_t *          src,
+                         const uint32_t *          mask,
+                         int                      width)
+{
+    uint32_t *end = dest + width;
+    uint32_t s32;
+    uint64_t sa64;
+    __m64 s64, d64;
+
+    while (dest < end)
+       {
+           s64 = combine (src, mask);
+           
+           if (s64)
+               {
+                   store8888(&s32, s64);
+                   sa64 = combine_disjoint_out_part (*dest >> A_SHIFT, s32 >> 
A_SHIFT);
+                   d64 = pix_add (pix_multiply (load8888 
(dest),expand_alpha_rev ((*(__m64*)&sa64))), s64);
+                   store8888 (dest, d64);
+               }
+           
+           ++dest;
+           ++src;
+           if (mask)
+               ++mask;
+           
+       }
+}
+
+static void
 mmx_combine_over_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
@@ -1062,8 +1192,295 @@ mmx_combine_saturate_u (pixman_implementation_t *imp,
     }
     _mm_empty ();
 }
+/* In functions such as 'combine_conjoint_general_u' there are multiple
+ * branches selected by the parameter 'combine'.  That value does not change
+ * while the function runs, so it is not necessary to test it for every
+ * pixel as the original code does.  Instead it is tested once at function
+ * entry and the corresponding function pointer is stored, so the handler
+ * can be called directly afterwards.
+ */
+#define DEF_FUNC_ZERO_MASK(type, zm, suffix, res)                              
         \
+    static type inline combine_joint_ ##zm## _ ##suffix( type sa, type da, 
type io_flag) \
+    {                                                                          
         \
+       return res;                                                             
         \
+    }
+                                                                       
+/* 'conjoint' has the same code structure as 'disjoint'; only the function
+ * name differs, so this macro generates the corresponding function for
+ * either family.  'io_flag' selects the argument order passed to the
+ * underlying *_part helper: 0 passes (sa, da) and 1 passes (da, sa).
+ */
+#define DEF_FUNC_COMBINE_JOINT_U(cd, io)                                       
                         \
+    static uint8_t inline combine_ ##cd## joint_ ##io## _part_u(uint8_t sa, 
uint8_t da, uint8_t io_flag) \
+    {                                                                          
                         \
+       uint8_t parm[2];                                                        
                         \
+       parm[0] = sa * (io_flag ^ 0x1) + da * (io_flag ^ 0x0);                  
                         \
+       parm[1] = sa * (io_flag ^ 0x0) + da * (io_flag ^ 0x1);                  
                         \
+       return combine_ ##cd## joint_ ##io## _part (parm[0], parm[1]);          
                         \
+    }
+/* Defines the four-entry handler table for one operator family.  The index
+ * is the two-bit OUT/IN selector taken from the 'comb' flags:
+ *   0 = zero factor, 1 = out_part, 2 = in_part, 3 = full (mask) factor.
+ * The table is only used inside this file, so it is declared static to keep
+ * the symbol out of the library's global namespace.
+ */
+#define DEF_COMB_FUNC_ARR(cd,SUFFIX,suffix)                                    \
+    static COMBINE_JOINT_FUNC_##SUFFIX combine_ ##cd## joint_ ##suffix[4] ={   \
+       combine_joint_zero_ ##suffix,                                           \
+       combine_ ##cd## joint_out_part_ ##suffix,                               \
+       combine_ ##cd## joint_in_part_ ##suffix,                                \
+       combine_joint_mask_ ##suffix                                            \
+    };
+
+typedef  uint8_t (*COMBINE_JOINT_FUNC_U)(uint8_t a, uint8_t b, uint8_t 
io_flag);
+
+DEF_FUNC_ZERO_MASK(uint8_t,zero,u, 0x0)
+DEF_FUNC_ZERO_MASK(uint8_t,mask,u, ~0x0)
+
+DEF_FUNC_COMBINE_JOINT_U(dis, in);
+DEF_FUNC_COMBINE_JOINT_U(dis, out);
+DEF_COMB_FUNC_ARR(dis,U,u)
+
+DEF_FUNC_COMBINE_JOINT_U(con, in);
+DEF_FUNC_COMBINE_JOINT_U(con, out);
+DEF_COMB_FUNC_ARR(con, U, u)
+/*
+ * Shared per-pixel loop behind every unified-alpha 'conjoint' and
+ * 'disjoint' operator:
+ *     dest = src * Fa + dest * Fb
+ *
+ * dest, src, mask: pixel rows of 'width' entries; mask may be NULL.
+ * comb:            COMBINE_* flags; the COMBINE_A bits pick the handler
+ *                  for Fa and the COMBINE_B bits pick the handler for Fb.
+ * cjf:             four-entry handler table of the operator family
+ *                  (combine_disjoint_u or combine_conjoint_u).
+ */
+static void
+mmx_combine_joint_general_u (uint32_t * dest,
+                             const uint32_t *src,
+                             const uint32_t *mask,
+                             int            width,
+                             uint8_t        comb,
+                             COMBINE_JOINT_FUNC_U *cjf)
+{
+    /* All declarations come before the first statement: configure.ac
+     * enables -Wdeclaration-after-statement. */
+    uint32_t *end = dest + width;
+    COMBINE_JOINT_FUNC_U combine_joint_u[2];
+
+    /* Resolve the two handlers once instead of branching per pixel. */
+    combine_joint_u[0] = cjf[comb & COMBINE_A];        /* computes Fa */
+    combine_joint_u[1] = cjf[(comb & COMBINE_B) >> 2]; /* computes Fb */
+
+    while (dest < end)
+    {
+        __m64 s64 = combine (src, mask);
+        __m64 d64, sa64, da64;
+        uint8_t sa, da;
+        uint32_t tmp;
+        uint64_t Fa, Fb;
+
+        /* The factor helpers contain a division, so they are evaluated
+         * with scalar code rather than multimedia instructions.
+         */
+        store8888 (&tmp, s64);
+        sa = tmp >> A_SHIFT;
+        da = *dest >> A_SHIFT;
+
+        /* io_flag 0 passes (sa, da) to the helper, io_flag 1 passes (da, sa). */
+        Fa = combine_joint_u[0] (sa, da, 0);
+        Fb = combine_joint_u[1] (sa, da, 1);
+
+        d64 = load8888 (dest);
+        sa64 = expand_alpha_rev (*(__m64 *)&Fa);
+        da64 = expand_alpha_rev (*(__m64 *)&Fb);
+
+        d64 = pix_add_mul (s64, sa64, d64, da64);
+
+        store8888 (dest, d64);
+
+        ++dest;
+        ++src;
+        if (mask)
+            ++mask;
+    }
+    /* Finish with _mm_empty () like the other combiners in this file. */
+    _mm_empty ();
+}
+
+
+static void
+mmx_combine_disjoint_general_u (uint32_t * dest,
+                               const uint32_t *src,
+                               const uint32_t *mask,
+                               int            width,
+                               uint8_t        comb)
+{
+    mmx_combine_joint_general_u (dest, src, mask, width, comb, 
combine_disjoint_u);
+}
+
+static void
+mmx_combine_disjoint_in_u (pixman_implementation_t *imp,
+                          pixman_op_t              op,
+                          uint32_t *                dest,
+                          const uint32_t *          src,
+                          const uint32_t *          mask,
+                          int                      width)
+{
+    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+mmx_combine_disjoint_in_reverse_u (pixman_implementation_t *imp,
+                                  pixman_op_t              op,
+                                  uint32_t *                dest,
+                                  const uint32_t *          src,
+                                  const uint32_t *          mask,
+                                  int                      width)
+{
+    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+mmx_combine_disjoint_out_u (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *                dest,
+                           const uint32_t *          src,
+                           const uint32_t *          mask,
+                           int                      width)
+{
+    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+mmx_combine_disjoint_out_reverse_u (pixman_implementation_t *imp,
+                                   pixman_op_t              op,
+                                   uint32_t *                dest,
+                                   const uint32_t *          src,
+                                   const uint32_t *          mask,
+                                   int                      width)
+{
+    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_OUT);
+}
 
 static void
+mmx_combine_disjoint_atop_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *                dest,
+                            const uint32_t *          src,
+                            const uint32_t *          mask,
+                            int                      width)
+{
+    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+mmx_combine_disjoint_atop_reverse_u (pixman_implementation_t *imp,
+                                    pixman_op_t              op,
+                                    uint32_t *                dest,
+                                    const uint32_t *          src,
+                                    const uint32_t *          mask,
+                                    int                      width)
+{
+    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+mmx_combine_disjoint_xor_u (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *                dest,
+                           const uint32_t *          src,
+                           const uint32_t *          mask,
+                           int                      width)
+{
+    mmx_combine_disjoint_general_u (dest, src, mask, width, COMBINE_XOR);
+}
+
+/* Conjoint */
+static void
+mmx_combine_conjoint_general_u(uint32_t * dest,
+                              const uint32_t *src,
+                              const uint32_t *mask,
+                              int            width,
+                              uint8_t        comb)
+{
+    mmx_combine_joint_general_u (dest, src, mask, width, comb, 
combine_conjoint_u);
+}
+
+static void
+mmx_combine_conjoint_over_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *                dest,
+                            const uint32_t *          src,
+                            const uint32_t *          mask,
+                            int                      width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OVER);
+}
+
+static void
+mmx_combine_conjoint_over_reverse_u (pixman_implementation_t *imp,
+                                    pixman_op_t              op,
+                                    uint32_t *                dest,
+                                    const uint32_t *          src,
+                                    const uint32_t *          mask,
+                                    int                      width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OVER);
+}
+
+static void
+mmx_combine_conjoint_in_u (pixman_implementation_t *imp,
+                          pixman_op_t              op,
+                          uint32_t *                dest,
+                          const uint32_t *          src,
+                          const uint32_t *          mask,
+                          int                      width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+mmx_combine_conjoint_in_reverse_u (pixman_implementation_t *imp,
+                                  pixman_op_t              op,
+                                  uint32_t *                dest,
+                                  const uint32_t *          src,
+                                  const uint32_t *          mask,
+                                  int                      width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+mmx_combine_conjoint_out_u (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *                dest,
+                           const uint32_t *          src,
+                           const uint32_t *          mask,
+                           int                      width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+mmx_combine_conjoint_out_reverse_u (pixman_implementation_t *imp,
+                                   pixman_op_t              op,
+                                   uint32_t *                dest,
+                                   const uint32_t *          src,
+                                   const uint32_t *          mask,
+                                   int                      width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+mmx_combine_conjoint_atop_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *                dest,
+                            const uint32_t *          src,
+                            const uint32_t *          mask,
+                            int                      width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+mmx_combine_conjoint_atop_reverse_u (pixman_implementation_t *imp,
+                                    pixman_op_t              op,
+                                    uint32_t *                dest,
+                                    const uint32_t *          src,
+                                    const uint32_t *          mask,
+                                    int                      width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+mmx_combine_conjoint_xor_u (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *                dest,
+                           const uint32_t *          src,
+                           const uint32_t *          mask,
+                           int                      width)
+{
+    mmx_combine_conjoint_general_u (dest, src, mask, width, COMBINE_XOR);
+}
+
+/* Component alpha combiners */
+static void
 mmx_combine_src_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
@@ -1089,6 +1506,410 @@ mmx_combine_src_ca (pixman_implementation_t *imp,
 }
 
 static void
+mmx_combine_saturate_ca (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        uint32_t *                dest,
+                        const uint32_t *          src,
+                        const uint32_t *          mask,
+                        int                      width)
+{
+    uint32_t *end = dest + width;
+    while (dest < end)
+       {
+           uint16_t sa, sr, sg, sb;
+           uint32_t sa32, m32;
+           __m64 m64, s64, d64, sa64, da64, cmpf, res;
+           
+           mmx_combine_mask_ca (src, mask, &s64, &m64);
+           
+           d64 = load8888 (dest);
+           da64 = expand_alpha (negate(d64));
+           cmpf = _mm_cmpgt_pi16 (m64, da64);
+           if (cmpf)
+               {
+                   store8888 (&m32, m64);
+                   sa = (m32 >> (A_SHIFT));
+                   sr = (m32 >> (R_SHIFT)) & MASK;
+                   sg = (m32 >> (G_SHIFT)) & MASK;
+                   sb =  m32               & MASK;
+                   sa32 = (~(*dest) >> A_SHIFT) & MASK;
+                   
+                   sa = (sa) ? sa : 0x1;
+                   sr = (sr) ? sr : 0x1;
+                   sg = (sg) ? sg : 0x1;
+                   sb = (sb) ? sb : 0x1;
+                   
+                   sa32 = ((sa32 << G_SHIFT) / sb & MASK) |
+                       ((((sa32 << G_SHIFT) / sg) & MASK) << G_SHIFT) |
+                       ((((sa32 << G_SHIFT) / sr) & MASK) << R_SHIFT) |
+                       ((((sa32 << G_SHIFT) / sa) & MASK) << A_SHIFT);
+                   sa64 = load8888 (&sa32);
+                   da64 = MC (4x00ff);
+                   res = pix_multiply (s64, sa64);
+                   s64 = _mm_or_si64 (_mm_and_si64 (res, cmpf), _mm_and_si64 
(s64, negate (cmpf)));
+                   res = pix_multiply (d64, da64);
+                   d64 = _mm_or_si64 (_mm_and_si64 (res, cmpf), _mm_and_si64 
(d64, negate (cmpf)));
+               }
+           res = _mm_adds_pu8 (s64, d64);
+           store8888 (dest, res);
+           
+           ++dest;
+           ++src;
+           if (mask)
+               ++mask;
+       }
+}
+
+#define DEF_FUNC_COMBINE_JOINT_CA(cd, io)                              \
+    static uint32_t inline combine_ ##cd## joint_ ##io## _part_ca(uint32_t sa, 
uint32_t da, uint32_t io_flag) \
+    {                                                                  \
+       uint8_t da8 = da >> A_SHIFT;                                    \
+       uint32_t m, n, o, p, res;                                       \
+       uint8_t i, parm[2][4], shift=0;                                 \
+       for (i=0; i<4; i++)                                             \
+           {                                                           \
+               parm[0][i] = (uint8_t)(sa>>shift) * (io_flag ^ 0x1) + da8 * 
(io_flag ^ 0x0); \
+               parm[1][i] = (uint8_t)(sa>>shift) * (io_flag ^ 0x0) + da8 * 
(io_flag ^ 0x1); \
+               shift += G_SHIFT;                                       \
+           }                                                           \
+       m = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][0], 
parm[1][0]); \
+       n = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][1], 
parm[1][1]) << G_SHIFT; \
+       o = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][2], 
parm[1][2]) << R_SHIFT; \
+       p = (uint32_t)combine_ ##cd## joint_ ##io## _part (parm[0][3], 
parm[1][3]) << A_SHIFT; \
+       res = m | n | o | p;                                            \
+       return res;                                                     \
+    }
+
+typedef  uint32_t (*COMBINE_JOINT_FUNC_CA)(uint32_t sa, uint32_t da, uint32_t 
io_flag);
+
+DEF_FUNC_ZERO_MASK(uint32_t, zero, ca, 0x0)
+DEF_FUNC_ZERO_MASK(uint32_t, mask, ca, ~0x0)
+
+DEF_FUNC_COMBINE_JOINT_CA(dis, in);
+DEF_FUNC_COMBINE_JOINT_CA(dis, out);
+DEF_COMB_FUNC_ARR(dis, CA, ca)
+
+DEF_FUNC_COMBINE_JOINT_CA(con, in);
+DEF_FUNC_COMBINE_JOINT_CA(con, out);
+DEF_COMB_FUNC_ARR(con, CA, ca)
+
+/*
+ * Shared per-pixel loop behind every component-alpha 'conjoint' and
+ * 'disjoint' operator:
+ *     dest = src * Fa + dest * Fb
+ * where Fa and Fb are packed 8888 factors, one byte per channel.
+ *
+ * dest, src, mask: pixel rows of 'width' entries.
+ * comb:            COMBINE_* flags; the COMBINE_A bits pick the handler
+ *                  for Fa and the COMBINE_B bits pick the handler for Fb.
+ * cjf:             four-entry handler table of the operator family
+ *                  (combine_disjoint_ca or combine_conjoint_ca).
+ */
+static void
+mmx_combine_joint_general_ca (uint32_t * dest,
+                              const uint32_t *src,
+                              const uint32_t *mask,
+                              int            width,
+                              uint8_t        comb,
+                              COMBINE_JOINT_FUNC_CA *cjf)
+{
+    /* All declarations come before the first statement: configure.ac
+     * enables -Wdeclaration-after-statement. */
+    uint32_t *end = dest + width;
+    COMBINE_JOINT_FUNC_CA combine_joint_ca[2];
+
+    /* Resolve the two handlers once instead of branching per pixel. */
+    combine_joint_ca[0] = cjf[comb & COMBINE_A];        /* computes Fa */
+    combine_joint_ca[1] = cjf[(comb & COMBINE_B) >> 2]; /* computes Fb */
+
+    while (dest < end)
+    {
+        __m64 m64, s64, sa64, da64, d64;
+        uint32_t m32, Fa, Fb;
+
+        /* s64 and m64 receive the source and mask after mask combination. */
+        mmx_combine_mask_ca (src, mask, &s64, &m64);
+        store8888 (&m32, m64);
+
+        Fa = combine_joint_ca[0] (m32, *dest, 0);
+        Fb = combine_joint_ca[1] (m32, *dest, 1);
+
+        sa64 = load8888 (&Fa);
+        da64 = load8888 (&Fb);
+
+        d64 = load8888 (dest);
+        d64 = pix_add_mul (s64, sa64, d64, da64);
+
+        store8888 (dest, d64);
+
+        ++dest;
+        ++src;
+        if (mask)
+            ++mask;
+    }
+    /* Finish with _mm_empty () like the other combiners in this file. */
+    _mm_empty ();
+}
+
+static void
+mmx_combine_disjoint_general_ca (uint32_t * dest,
+                                const uint32_t *src,
+                                const uint32_t *mask,
+                                int            width,
+                                uint8_t        comb)
+{
+    mmx_combine_joint_general_ca (dest, src, mask, width, comb, 
combine_disjoint_ca);
+}
+
+static void
+mmx_combine_disjoint_over_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *                dest,
+                             const uint32_t *          src,
+                             const uint32_t *          mask,
+                             int                      width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OVER);
+}
+
+static void
+mmx_combine_disjoint_in_ca (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *                dest,
+                           const uint32_t *          src,
+                           const uint32_t *          mask,
+                           int                      width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_IN);
+}
+
+static void
+mmx_combine_disjoint_in_reverse_ca (pixman_implementation_t *imp,
+                                   pixman_op_t              op,
+                                   uint32_t *                dest,
+                                   const uint32_t *          src,
+                                   const uint32_t *          mask,
+                                   int                      width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_IN);
+}
+
+static void
+mmx_combine_disjoint_out_ca (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *                dest,
+                            const uint32_t *          src,
+                            const uint32_t *          mask,
+                            int                      width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_OUT);
+}
+
+static void
+mmx_combine_disjoint_out_reverse_ca (pixman_implementation_t *imp,
+                                    pixman_op_t              op,
+                                    uint32_t *                dest,
+                                    const uint32_t *          src,
+                                    const uint32_t *          mask,
+                                    int                      width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_OUT);
+}
+
+static void
+mmx_combine_disjoint_atop_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *                dest,
+                             const uint32_t *          src,
+                             const uint32_t *          mask,
+                             int                      width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+static void
+mmx_combine_disjoint_atop_reverse_ca (pixman_implementation_t *imp,
+                                     pixman_op_t              op,
+                                     uint32_t *                dest,
+                                     const uint32_t *          src,
+                                     const uint32_t *          mask,
+                                     int                      width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+static void
+mmx_combine_disjoint_xor_ca (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *                dest,
+                            const uint32_t *          src,
+                            const uint32_t *          mask,
+                            int                      width)
+{
+    mmx_combine_disjoint_general_ca (dest, src, mask, width, COMBINE_XOR);
+}
+
+static void
+mmx_combine_conjoint_general_ca(uint32_t * dest,
+                               const uint32_t *src,
+                               const uint32_t *mask,
+                               int            width,
+                               uint8_t        comb)
+{
+    mmx_combine_joint_general_ca(dest,src,mask,width,comb,combine_conjoint_ca);
+}
+
+/*
+ * Multiply
+ * B(Dca, ad, Sca, as) = Dca.Sca
+ *
+ * Unified-alpha combiner.  For each of the width pixels, the source returned
+ * by combine (src, mask) and the destination are blended and the result is
+ * written back to dest.  mask may be NULL; it is only advanced when non-NULL.
+ * imp and op are not used.
+ */
+static void
+mmx_combine_multiply_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        uint32_t *               dest,
+                        const uint32_t *         src,
+                        const uint32_t *         mask,
+                        int                      width)
+{
+    const uint32_t *last = dest + width;
+
+    for (; dest < last; ++dest, ++src)
+    {
+        __m64 src_px = combine (src, mask);
+        __m64 dst_px = load8888 (dest);
+        __m64 inv_sa = negate (expand_alpha (src_px));
+        __m64 inv_da = negate (expand_alpha (dst_px));
+        /* source weighted by ~ad plus destination weighted by ~as */
+        __m64 outside = pix_add_mul (src_px, inv_da, dst_px, inv_sa);
+        /* the Dca.Sca term */
+        __m64 product = pix_multiply (dst_px, src_px);
+
+        store8888 (dest, pix_add (product, outside));
+
+        if (mask)
+            mask++;
+    }
+    _mm_empty ();
+}
+ 
+/*
+ * Component-alpha variant of the Multiply combiner.  Every pixel reads one
+ * word each from mask, src and dest (mask is dereferenced unconditionally)
+ * and writes the blended result back to dest.  imp and op are not used.
+ */
+static void
+mmx_combine_multiply_ca (pixman_implementation_t *imp,
+                         pixman_op_t              op,
+                         uint32_t *               dest,
+                         const uint32_t *         src,
+                         const uint32_t *         mask,
+                         int                      width)
+{
+    const uint32_t *last = dest + width;
+
+    for (; dest < last; ++src, ++dest, ++mask)
+    {
+        __m64 m = load8888 (mask);
+        __m64 s = load8888 (src);
+        __m64 d = load8888 (dest);
+        __m64 inv_da = negate (expand_alpha (d));
+        __m64 src_alpha = expand_alpha (s);
+        /* source scaled per channel by the mask */
+        __m64 masked_src = pix_multiply (s, m);
+        /* negated (mask * source alpha), per channel */
+        __m64 inv_ma = negate (pix_multiply (m, src_alpha));
+        __m64 acc = pix_add_mul (d, inv_ma, masked_src, inv_da);
+
+        acc = pix_add (acc, pix_multiply (d, masked_src));
+        store8888 (dest, acc);
+    }
+    _mm_empty ();
+}
+
+/*
+ * Component-alpha combiner registered for PIXMAN_OP_CONJOINT_OVER.
+ * Forwards dest/src/mask/width to mmx_combine_conjoint_general_ca () with
+ * the COMBINE_A_OVER selector; imp and op are not used.
+ */
+static void
+mmx_combine_conjoint_over_ca (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              uint32_t *               dest,
+                              const uint32_t *         src,
+                              const uint32_t *         mask,
+                              int                      width)
+{
+    mmx_combine_conjoint_general_ca (
+        dest, src, mask, width, COMBINE_A_OVER);
+}
+
+/*
+ * Component-alpha combiner registered for PIXMAN_OP_CONJOINT_OVER_REVERSE.
+ * Forwards dest/src/mask/width to mmx_combine_conjoint_general_ca () with
+ * the COMBINE_B_OVER selector; imp and op are not used.
+ */
+static void
+mmx_combine_conjoint_over_reverse_ca (pixman_implementation_t *imp,
+                                      pixman_op_t              op,
+                                      uint32_t *               dest,
+                                      const uint32_t *         src,
+                                      const uint32_t *         mask,
+                                      int                      width)
+{
+    mmx_combine_conjoint_general_ca (
+        dest, src, mask, width, COMBINE_B_OVER);
+}
+
+/*
+ * Component-alpha combiner registered for PIXMAN_OP_CONJOINT_IN.
+ * Forwards dest/src/mask/width to mmx_combine_conjoint_general_ca () with
+ * the COMBINE_A_IN selector; imp and op are not used.
+ */
+static void
+mmx_combine_conjoint_in_ca (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    mmx_combine_conjoint_general_ca (
+        dest, src, mask, width, COMBINE_A_IN);
+}
+
+/*
+ * Component-alpha combiner registered for PIXMAN_OP_CONJOINT_IN_REVERSE.
+ * Forwards dest/src/mask/width to mmx_combine_conjoint_general_ca () with
+ * the COMBINE_B_IN selector; imp and op are not used.
+ */
+static void
+mmx_combine_conjoint_in_reverse_ca (pixman_implementation_t *imp,
+                                    pixman_op_t              op,
+                                    uint32_t *               dest,
+                                    const uint32_t *         src,
+                                    const uint32_t *         mask,
+                                    int                      width)
+{
+    mmx_combine_conjoint_general_ca (
+        dest, src, mask, width, COMBINE_B_IN);
+}
+
+/*
+ * Component-alpha combiner registered for PIXMAN_OP_CONJOINT_OUT.
+ * Forwards dest/src/mask/width to mmx_combine_conjoint_general_ca () with
+ * the COMBINE_A_OUT selector; imp and op are not used.
+ */
+static void
+mmx_combine_conjoint_out_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               dest,
+                             const uint32_t *         src,
+                             const uint32_t *         mask,
+                             int                      width)
+{
+    mmx_combine_conjoint_general_ca (
+        dest, src, mask, width, COMBINE_A_OUT);
+}
+
+/*
+ * Component-alpha combiner registered for PIXMAN_OP_CONJOINT_OUT_REVERSE.
+ * Forwards dest/src/mask/width to mmx_combine_conjoint_general_ca () with
+ * the COMBINE_B_OUT selector; imp and op are not used.
+ */
+static void
+mmx_combine_conjoint_out_reverse_ca (pixman_implementation_t *imp,
+                                     pixman_op_t              op,
+                                     uint32_t *               dest,
+                                     const uint32_t *         src,
+                                     const uint32_t *         mask,
+                                     int                      width)
+{
+    mmx_combine_conjoint_general_ca (
+        dest, src, mask, width, COMBINE_B_OUT);
+}
+
+/*
+ * Component-alpha combiner registered for PIXMAN_OP_CONJOINT_ATOP.
+ * Forwards dest/src/mask/width to mmx_combine_conjoint_general_ca () with
+ * the COMBINE_A_ATOP selector; imp and op are not used.
+ */
+static void
+mmx_combine_conjoint_atop_ca (pixman_implementation_t *imp,
+                              pixman_op_t              op,
+                              uint32_t *               dest,
+                              const uint32_t *         src,
+                              const uint32_t *         mask,
+                              int                      width)
+{
+    mmx_combine_conjoint_general_ca (
+        dest, src, mask, width, COMBINE_A_ATOP);
+}
+
+/*
+ * Component-alpha combiner registered for PIXMAN_OP_CONJOINT_ATOP_REVERSE.
+ * Forwards dest/src/mask/width to mmx_combine_conjoint_general_ca () with
+ * the COMBINE_B_ATOP selector; imp and op are not used.
+ */
+static void
+mmx_combine_conjoint_atop_reverse_ca (pixman_implementation_t *imp,
+                                      pixman_op_t              op,
+                                      uint32_t *               dest,
+                                      const uint32_t *         src,
+                                      const uint32_t *         mask,
+                                      int                      width)
+{
+    mmx_combine_conjoint_general_ca (
+        dest, src, mask, width, COMBINE_B_ATOP);
+}
+
+/*
+ * Component-alpha combiner registered for PIXMAN_OP_CONJOINT_XOR.
+ * Forwards dest/src/mask/width to mmx_combine_conjoint_general_ca () with
+ * the COMBINE_XOR selector; imp and op are not used.
+ */
+static void
+mmx_combine_conjoint_xor_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               dest,
+                             const uint32_t *         src,
+                             const uint32_t *         mask,
+                             int                      width)
+{
+    mmx_combine_conjoint_general_ca (
+        dest, src, mask, width, COMBINE_XOR);
+}
+
+static void
 mmx_combine_over_ca (pixman_implementation_t *imp,
                      pixman_op_t              op,
                      uint32_t *               dest,
@@ -2089,23 +2910,34 @@ mmx_fill (pixman_implementation_t *imp,
        stride = stride * (int) sizeof (uint32_t) / 1;
        byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
        byte_width = width;
-       stride *= 1;
+/* Not necessary: multiplying the stride by 1 is a no-op. */
+/*     stride *= 1; */
         filler = (filler & 0xff) * 0x01010101;
     }
     else if (bpp == 16)
     {
        stride = stride * (int) sizeof (uint32_t) / 2;
        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+#if 0
        byte_width = 2 * width;
        stride *= 2;
+#else
+       byte_width = width << 1;
+       stride <<= 1;
+#endif
         filler = (filler & 0xffff) * 0x00010001;
     }
     else
     {
        stride = stride * (int) sizeof (uint32_t) / 4;
        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+#if 0
        byte_width = 4 * width;
        stride *= 4;
+#else
+       byte_width = width << 2;
+       stride <<= 2;
+#endif
     }
 
     fill = ((uint64_t)filler << 32) | filler;
@@ -3274,9 +4106,15 @@ mmx_blt (pixman_implementation_t *imp,
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
        src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + 
(src_x));
        dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) 
+ (dest_x));
+#if 0
        byte_width = 2 * width;
        src_stride *= 2;
        dst_stride *= 2;
+#else
+       byte_width = width << 1;
+       src_stride <<= 1;
+       dst_stride <<= 1;
+#endif
     }
     else if (src_bpp == 32)
     {
@@ -3284,9 +4122,15 @@ mmx_blt (pixman_implementation_t *imp,
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
        src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + 
(src_x));
        dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) 
+ (dest_x));
+#if 0
        byte_width = 4 * width;
        src_stride *= 4;
        dst_stride *= 4;
+#else
+       byte_width = width << 2;
+       src_stride <<= 2;
+       dst_stride <<= 2;
+#endif
     }
     else
     {
@@ -4003,6 +4847,186 @@ static const pixman_iter_info_t mmx_iters[] =
     { PIXMAN_null },
 };
 
+/*
+ * Expands to the unified (mmx_combine_<name>_u) and component-alpha
+ * (mmx_combine_<name>_ca) combiners for a separable blend mode.  A function
+ * mmx_blend_<name> (dca, da, sca, sa) must be visible at the point of
+ * expansion.
+ *
+ * Per pixel, three terms are packed to 32 bits and summed with pix_add ():
+ *   - pix_add_mul (d, ~sa, s, ~da)   (~m instead of ~sa in the _ca variant)
+ *   - pix_multiply (sa, da), alpha channel only (A_MASK)
+ *   - mmx_blend_<name> (...), colour channels only (RGB_MASK)
+ *
+ * In the _u variant mask may be NULL, in which case the source is used
+ * unmodified; the _ca variant reads mask unconditionally.
+ *
+ * Both combiners finish with _mm_empty (), like the other MMX combiners in
+ * this file, and the macro's last line carries no continuation backslash so
+ * it cannot swallow the line that follows the definition.
+ */
+#define MMX_PDF_SEPARABLE_BLEND_MODE(name)                                  \
+static void                                                                 \
+mmx_combine_ ## name ## _u (pixman_implementation_t *imp,                   \
+                            pixman_op_t              op,                    \
+                            uint32_t *               dest,                  \
+                            const uint32_t *         src,                   \
+                            const uint32_t *         mask,                  \
+                            int                      width)                 \
+{                                                                           \
+    int i;                                                                  \
+                                                                            \
+    for (i = 0; i < width; ++i)                                             \
+    {                                                                       \
+        __m64 s = load8888 (src + i);                                       \
+        __m64 d = load8888 (dest + i);                                      \
+        __m64 da = expand_alpha (d);                                        \
+        __m64 sa, isa, ida, temp;                                           \
+        uint32_t result, sada, res;                                         \
+                                                                            \
+        if (mask)                                                           \
+        {                                                                   \
+            __m64 m = load8888 (mask + i);                                  \
+            __m64 ma = expand_alpha (m);                                    \
+            s = pix_multiply (s, ma);                                       \
+        }                                                                   \
+        sa = expand_alpha (s);                                              \
+        isa = negate (sa);                                                  \
+        ida = negate (da);                                                  \
+                                                                            \
+        store8888 (&result, pix_add_mul (d, isa, s, ida));                  \
+        store8888 (&sada, pix_multiply (sa, da));                           \
+        store8888 (&res, mmx_blend_ ## name (d, da, s, sa));                \
+                                                                            \
+        sada &= A_MASK;                                                     \
+        res &= RGB_MASK;                                                    \
+        temp = pix_add (pix_add (load8888 (&result), load8888 (&sada)),     \
+                        load8888 (&res));                                   \
+        store8888 (dest + i, temp);                                         \
+    }                                                                       \
+    _mm_empty ();                                                           \
+}                                                                           \
+                                                                            \
+static void                                                                 \
+mmx_combine_ ## name ## _ca (pixman_implementation_t *imp,                  \
+                             pixman_op_t              op,                   \
+                             uint32_t *               dest,                 \
+                             const uint32_t *         src,                  \
+                             const uint32_t *         mask,                 \
+                             int                      width)                \
+{                                                                           \
+    int i;                                                                  \
+                                                                            \
+    for (i = 0; i < width; ++i)                                             \
+    {                                                                       \
+        __m64 m = load8888 (mask + i);                                      \
+        __m64 s = load8888 (src + i);                                       \
+        __m64 d = load8888 (dest + i);                                      \
+        __m64 sa = expand_alpha (s);                                        \
+        __m64 da = expand_alpha (d);                                        \
+        __m64 ida = negate (da);                                            \
+        __m64 im, ma, temp;                                                 \
+        uint32_t result, mada, res;                                         \
+                                                                            \
+        s = pix_multiply (s, m);                                            \
+        m = pix_multiply (m, sa);                                           \
+        im = negate (m);                                                    \
+        ma = expand_alpha (m);                                              \
+                                                                            \
+        store8888 (&result, pix_add_mul (d, im, s, ida));                   \
+        store8888 (&mada, pix_multiply (ma, da));                           \
+        store8888 (&res, mmx_blend_ ## name (d, da, s, m));                 \
+                                                                            \
+        mada &= A_MASK;                                                     \
+        res &= RGB_MASK;                                                    \
+        temp = pix_add (pix_add (load8888 (&result), load8888 (&mada)),     \
+                        load8888 (&res));                                   \
+        store8888 (dest + i, temp);                                         \
+    }                                                                       \
+    _mm_empty ();                                                           \
+}
+
+/*
+ * Field-wise unsigned minimum of s and d.  The R, G and B fields selected by
+ * R_DMASK, G_DMASK and B_DMASK are compared independently; any bit outside
+ * those three masks is zero in the result.
+ */
+static inline __m64
+_emulate_pminuh (__m64 s, __m64 d)
+{
+    const uint64_t sv = to_uint64 (s);
+    const uint64_t dv = to_uint64 (d);
+    const uint64_t r = MIN ((sv & R_DMASK), (dv & R_DMASK));
+    const uint64_t g = MIN ((sv & G_DMASK), (dv & G_DMASK));
+    const uint64_t b = MIN ((sv & B_DMASK), (dv & B_DMASK));
+
+    return to_m64 (r | g | b);
+}
+
+/*
+ * Field-wise unsigned maximum of s and d.  The R, G and B fields selected by
+ * R_DMASK, G_DMASK and B_DMASK are compared independently; any bit outside
+ * those three masks is zero in the result.
+ */
+static inline __m64
+_emulate_pmaxuh (__m64 s, __m64 d)
+{
+    const uint64_t sv = to_uint64 (s);
+    const uint64_t dv = to_uint64 (d);
+    const uint64_t r = MAX ((sv & R_DMASK), (dv & R_DMASK));
+    const uint64_t g = MAX ((sv & G_DMASK), (dv & G_DMASK));
+    const uint64_t b = MAX ((sv & B_DMASK), (dv & B_DMASK));
+
+    return to_m64 (r | g | b);
+}
+
+/*
+ * Field-wise "greater than" helpers: each evaluates to an all-ones 16-bit
+ * field in the R, G or B position when a > b, and to 0 otherwise.
+ * Both parameters are parenthesized so that arguments built from
+ * low-precedence operators (for example x & MASK) are compared as a whole
+ * instead of being re-associated around the '>' operator.
+ */
+#define R_GREATER(a, b) (((a) > (b)) ? 0x0000ffff00000000ULL : 0ULL)
+#define G_GREATER(a, b) (((a) > (b)) ? 0x00000000ffff0000ULL : 0ULL)
+#define B_GREATER(a, b) (((a) > (b)) ? 0x000000000000ffffULL : 0ULL)
+
+/*
+ * Field-wise unsigned "s > d" comparison.  For each of the R, G and B fields
+ * (selected by R_DMASK, G_DMASK, B_DMASK) the result holds the value produced
+ * by R_GREATER / G_GREATER / B_GREATER: all ones in that field when the
+ * field of s is greater than the field of d, otherwise zero.
+ */
+static inline __m64
+_emulate_pcmpgtuh (__m64 s, __m64 d)
+{
+    const uint64_t sv = to_uint64 (s);
+    const uint64_t dv = to_uint64 (d);
+    const uint64_t r = R_GREATER ((sv & R_DMASK), (dv & R_DMASK));
+    const uint64_t g = G_GREATER ((sv & G_DMASK), (dv & G_DMASK));
+    const uint64_t b = B_GREATER ((sv & B_DMASK), (dv & B_DMASK));
+
+    return to_m64 (r | g | b);
+}
+
+/*
+ * Field-wise unsigned "s > d1 + d2" comparison.  Each masked field of d1 and
+ * d2 is summed in 64-bit arithmetic before being compared with the matching
+ * field of s; the per-field outcome is the value produced by R_GREATER /
+ * G_GREATER / B_GREATER.
+ */
+static inline __m64
+_emulate_paddcmpgtuh (__m64 s, __m64 d1, __m64 d2)
+{
+    const uint64_t sv = to_uint64 (s);
+    const uint64_t d1v = to_uint64 (d1);
+    const uint64_t d2v = to_uint64 (d2);
+    const uint64_t r_sum = (d1v & R_DMASK) + (d2v & R_DMASK);
+    const uint64_t g_sum = (d1v & G_DMASK) + (d2v & G_DMASK);
+    const uint64_t b_sum = (d1v & B_DMASK) + (d2v & B_DMASK);
+    const uint64_t r = R_GREATER ((sv & R_DMASK), (r_sum));
+    const uint64_t g = G_GREATER ((sv & G_DMASK), (g_sum));
+    const uint64_t b = B_GREATER ((sv & B_DMASK), (b_sum));
+
+    return to_m64 (r | g | b);
+}
+
+
+/*
+ * Darken
+ * B(Dca, Da, Sca, Sa) = min (Sca.Da, Dca.Sa)
+ *
+ * The two products are formed with _mm_mullo_pi16 (), the smaller one is
+ * picked per colour field by _emulate_pminuh (), and the result is scaled
+ * back with the MC (4x0080) / MC (4x0101) add-and-multiply-high step
+ * (presumably a rounded division by 255 -- confirm against the MC table).
+ */
+static inline __m64
+mmx_blend_darken (__m64 dca, __m64 da, __m64 sca, __m64 sa)
+{
+    const __m64 sca_da = _mm_mullo_pi16 (sca, da);
+    const __m64 dca_sa = _mm_mullo_pi16 (dca, sa);
+    __m64 blended = _emulate_pminuh (sca_da, dca_sa);
+
+    blended = _mm_adds_pu16 (blended, MC (4x0080));
+    return _mm_mulhi_pu16 (blended, MC (4x0101));
+}
+
+MMX_PDF_SEPARABLE_BLEND_MODE (darken)
+
+/*
+ * Lighten
+ * B(Dca, Da, Sca, Sa) = max (Sca.Da, Dca.Sa)
+ *
+ * The two products are formed with _mm_mullo_pi16 (), the larger one is
+ * picked per colour field by _emulate_pmaxuh (), and the result is scaled
+ * back with the MC (4x0080) / MC (4x0101) add-and-multiply-high step
+ * (presumably a rounded division by 255 -- confirm against the MC table).
+ */
+static inline __m64
+mmx_blend_lighten (__m64 dca, __m64 da, __m64 sca, __m64 sa)
+{
+    const __m64 sca_da = _mm_mullo_pi16 (sca, da);
+    const __m64 dca_sa = _mm_mullo_pi16 (dca, sa);
+    __m64 blended = _emulate_pmaxuh (sca_da, dca_sa);
+
+    blended = _mm_adds_pu16 (blended, MC (4x0080));
+    return _mm_mulhi_pu16 (blended, MC (4x0101));
+}
+
+MMX_PDF_SEPARABLE_BLEND_MODE (lighten)
+
+
+#undef MMX_PDF_SEPARABLE_BLEND_MODE
+
+
 static const pixman_fast_path_t mmx_fast_paths[] =
 {
     PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   
mmx_composite_over_n_8_0565       ),
@@ -4114,8 +5138,37 @@ _pixman_implementation_create_mmx 
(pixman_implementation_t *fallback)
 {
     pixman_implementation_t *imp = _pixman_implementation_create (fallback, 
mmx_fast_paths);
 
+    /* Unified alpha */
     imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
+    /* Disjoint, unified */
+    imp->combine_32[PIXMAN_OP_DISJOINT_OVER] = mmx_combine_disjoint_over_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_OVER_REVERSE] = mmx_combine_saturate_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_IN] = mmx_combine_disjoint_in_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_IN_REVERSE] = 
mmx_combine_disjoint_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_OUT] = mmx_combine_disjoint_out_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_OUT_REVERSE] = 
mmx_combine_disjoint_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_ATOP] = mmx_combine_disjoint_atop_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = 
mmx_combine_disjoint_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_DISJOINT_XOR] = mmx_combine_disjoint_xor_u;
+
+    /* Conjoint, unified */
+    imp->combine_32[PIXMAN_OP_CONJOINT_OVER] = mmx_combine_conjoint_over_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_OVER_REVERSE] = 
mmx_combine_conjoint_over_reverse_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_IN] = mmx_combine_conjoint_in_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_IN_REVERSE] = 
mmx_combine_conjoint_in_reverse_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_OUT] = mmx_combine_conjoint_out_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_OUT_REVERSE] = 
mmx_combine_conjoint_out_reverse_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_ATOP] = mmx_combine_conjoint_atop_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = 
mmx_combine_conjoint_atop_reverse_u;
+    imp->combine_32[PIXMAN_OP_CONJOINT_XOR] = mmx_combine_conjoint_xor_u;
+
+    /* Multiply, darken and lighten; unified alpha */
+    imp->combine_32[PIXMAN_OP_MULTIPLY] = mmx_combine_multiply_u;
+    imp->combine_32[PIXMAN_OP_DARKEN] = mmx_combine_darken_u;
+    imp->combine_32[PIXMAN_OP_LIGHTEN] = mmx_combine_lighten_u;
+    
+    /* Remaining unified-alpha combiners */
     imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
     imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
     imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
@@ -4137,7 +5190,35 @@ _pixman_implementation_create_mmx 
(pixman_implementation_t *fallback)
     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
     imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
     imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
-
+    imp->combine_32_ca[PIXMAN_OP_SATURATE] = mmx_combine_saturate_ca;
+
+    /* Disjoint CA */
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_OVER] = mmx_combine_disjoint_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_OVER_REVERSE] = 
mmx_combine_saturate_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_IN] = mmx_combine_disjoint_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_IN_REVERSE] = 
mmx_combine_disjoint_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_OUT] = mmx_combine_disjoint_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_OUT_REVERSE] = 
mmx_combine_disjoint_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_ATOP] = mmx_combine_disjoint_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_ATOP_REVERSE] = 
mmx_combine_disjoint_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_DISJOINT_XOR] = mmx_combine_disjoint_xor_ca;
+
+    /* Conjoint CA */
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_OVER] = mmx_combine_conjoint_over_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_OVER_REVERSE] = 
mmx_combine_conjoint_over_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_IN] = mmx_combine_conjoint_in_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_IN_REVERSE] = 
mmx_combine_conjoint_in_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_OUT] = mmx_combine_conjoint_out_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_OUT_REVERSE] = 
mmx_combine_conjoint_out_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_ATOP] = mmx_combine_conjoint_atop_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_ATOP_REVERSE] = 
mmx_combine_conjoint_atop_reverse_ca;
+    imp->combine_32_ca[PIXMAN_OP_CONJOINT_XOR] = mmx_combine_conjoint_xor_ca;
+
+    /* Multiply, darken and lighten; component alpha */
+    imp->combine_32_ca[PIXMAN_OP_MULTIPLY] = mmx_combine_multiply_ca;
+    imp->combine_32_ca[PIXMAN_OP_DARKEN] = mmx_combine_darken_ca;
+    imp->combine_32_ca[PIXMAN_OP_LIGHTEN] = mmx_combine_lighten_ca;
+    
     imp->blt = mmx_blt;
     imp->fill = mmx_fill;
 
-- 
2.1.0

_______________________________________________
Pixman mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/pixman

[Pixman] [PATCH] mmx: compile on MIPS for Loongson-3A MMI optimizations

Reply via email to