On an SNB i5-2500 using cairo-image:

firefox-canvas        17.8 -> 10.3:  1.72x speedup
firefox-tron          46.3 -> 28.4:  1.63x speedup
swfdec-youtube         1.7 ->  1.4:  1.22x speedup
firefox-fishbowl      64.6 -> 53.7:  1.20x speedup
firefox-paintball     40.8 -> 36.8:  1.11x speedup
firefox-canvas-alpha  27.3 -> 25.4:  1.07x speedup
---
 pixman/pixman-sse2.c |  719 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 719 insertions(+)

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index fc873cc..9558e9c 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -6346,6 +6346,709 @@ static const fetcher_info_t fetchers[] =
     { PIXMAN_null }
 };
 
+/* Fetches pixel number x of the scanline that starts at row and returns
+ * it as a 32-bit value.  One converter exists per supported source format.
+ */
+typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x);
+
+/* Blend two 32-bit pixels channel by channel.
+ *
+ * w is a bilinear weight with BILINEAR_INTERPOLATION_BITS bits of
+ * precision; a weight of 0 yields a.  The four 8-bit channels are
+ * processed as two pairs so each product has 8 spare bits to grow into.
+ */
+static force_inline uint32_t
+linear_interpolation (const uint32_t a, const uint32_t b, int w)
+{
+    /* Rescale the weight to 8 bits so that both weights sum to 256 */
+    const int      weight_b = w << (8 - BILINEAR_INTERPOLATION_BITS);
+    const int      weight_a = 256 - weight_b;
+
+    /* red and blue stay in place; the result is in the upper byte of
+     * each 16-bit half
+     */
+    const uint32_t rb = weight_b * (b & 0x00ff00ff) +
+                        weight_a * (a & 0x00ff00ff);
+
+    /* alpha and green are shifted down first, so the result is already
+     * in its final position
+     */
+    const uint32_t ag = weight_b * ((b & 0xff00ff00) >> 8) +
+                        weight_a * ((a & 0xff00ff00) >> 8);
+
+    return ((rb & 0xff00ff00) >> 8) | (ag & 0xff00ff00);
+}
+
+/* Bilinearly interpolate one pixel from a 2x2 block of 32-bit pixels
+ * that each hold four 8-bit channels.
+ *
+ * src_top:    two horizontally adjacent pixels (left, right), upper row
+ * src_bottom: two horizontally adjacent pixels (left, right), lower row
+ * dx:         horizontal weight of the right-hand pixels
+ * dy:         vertical weight of the bottom row
+ *
+ * Both weights carry BILINEAR_INTERPOLATION_BITS bits of precision.
+ * Exactly 8 bytes are read through each pointer.  Returns the blended
+ * pixel.
+ */
+static force_inline uint32_t
+sse2_bilinear_interpolation (const uint32_t *src_top,
+                             const uint32_t *src_bottom,
+                             int dx, int dy)
+{
+    const int wb = dy;
+    const int wt = BILINEAR_INTERPOLATION_RANGE - dy;
+
+    const __m128i xmm_zero = _mm_setzero_si128 ();
+    const __m128i xmm_x = _mm_set1_epi16 (dx);
+
+    /* fetch 2x2 pixel block into sse2 registers */
+    const __m128i tltr = _mm_loadl_epi64 ((const __m128i *)src_top);
+    const __m128i blbr = _mm_loadl_epi64 ((const __m128i *)src_bottom);
+
+    __m128i xmm_wh, xmm_lo, xmm_hi, a;
+
+    /* vertical interpolation: widen the channels to 16 bits and blend
+     * the two rows; lanes 0-3 hold the left pixel, lanes 4-7 the right
+     */
+    a = _mm_add_epi16 (
+        _mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero),
+                         _mm_set1_epi16 (wt)),
+        _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero),
+                         _mm_set1_epi16 (wb)));
+
+    if (BILINEAR_INTERPOLATION_BITS < 8)
+    {
+        const __m128i xmm_xorc7 =
+            _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, BMSK);
+        const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);
+
+        /* calculate horizontal weights: even lanes get
+         * (dx ^ BMSK) + 1, odd lanes get dx
+         */
+        xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7, xmm_x));
+
+        /* horizontal interpolation: interleave (left, right) channel
+         * pairs and let madd form left * w_left + right * w_right
+         */
+        a = _mm_madd_epi16 (
+            _mm_unpackhi_epi16 (
+                _mm_shuffle_epi32 (a, _MM_SHUFFLE (1, 0, 3, 2)), a),
+            xmm_wh);
+    }
+    else
+    {
+        const __m128i xmm_xorc8 =
+            _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);
+        const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);
+
+        /* calculate horizontal weights: lanes 0-3 (left pixel) get
+         * (dx ^ BMSK) + 1, lanes 4-7 (right pixel) get dx
+         */
+        xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8, xmm_x));
+
+        /* horizontal interpolation: full 32-bit products are needed
+         * here, so build them from the low and high 16-bit halves
+         */
+        xmm_lo = _mm_mullo_epi16 (a, xmm_wh);
+        xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);
+        a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),
+                           _mm_unpackhi_epi16 (xmm_lo, xmm_hi));
+    }
+
+    /* shift and pack the result */
+    a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2);
+    a = _mm_packs_epi32 (a, a);
+    a = _mm_packus_epi16 (a, a);
+    return _mm_cvtsi128_si32 (a);
+}
+
+/* All-zero stand-in for a source row that lies outside the image.
+ * Eight bytes cover the two adjacent pixels the fetchers read from it,
+ * even at 32 bits per pixel.
+ */
+static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+/* Returns how many of the next count samples, taken at x, x + ux,
+ * x + 2 * ux, ..., lie strictly below limit.
+ *
+ * The caller guarantees x < limit.  The arithmetic is done in 64 bits
+ * because count * ux (a pixel count times a 16.16 step) does not
+ * generally fit in 32 bits.
+ */
+static force_inline int
+sse2_bilinear_samples_below (int            count,
+                             pixman_fixed_t x,
+                             pixman_fixed_t ux,
+                             pixman_fixed_t limit)
+{
+    const int64_t span = (int64_t) limit - x;
+
+    if ((int64_t) count * ux > span)
+        count = (int) ((span + ux - 1) / ux);
+
+    return count;
+}
+
+/* Fetch one bilinearly filtered scanline for transforms that only scale
+ * and translate: a single source y is computed for the whole scanline
+ * and x advances by a constant step.
+ *
+ * iter:          supplies the image, destination buffer, start position
+ *                and width; iter->y is advanced by one
+ * mask:          optional; where non-NULL, pixels whose mask value is 0
+ *                may be left unwritten
+ * convert_pixel: reads one pixel of the source format as 32 bits
+ * format:        source format; formats without alpha get 0xff000000
+ *                ORed into every pixel fetched from inside the image
+ * repeat:        PIXMAN_REPEAT_NONE selects the zero-border path, any
+ *                other value is handled as PAD
+ *
+ * Returns iter->buffer.  If the transform cannot be applied, the buffer
+ * is returned untouched.
+ */
+static force_inline uint32_t *
+sse2_fetch_bilinear (pixman_iter_t *iter,
+                     const uint32_t *mask,
+                     convert_pixel_t    convert_pixel,
+                     pixman_format_code_t       format,
+                     pixman_repeat_t repeat)
+{
+    pixman_image_t * ima = iter->image;
+    int              offset = iter->x;
+    int              line = iter->y++;
+    int              width = iter->width;
+    uint32_t *       buffer = iter->buffer;
+    uint32_t * const end = buffer + width;
+
+    bits_image_t *bits = &ima->bits;
+    pixman_fixed_t x_top, x_bottom, x;
+    pixman_fixed_t ux_top, ux_bottom, ux;
+    pixman_vector_t v;
+    const uint8_t *top_row;
+    const uint8_t *bottom_row;
+    uint32_t one = 1;
+    int y, y1, y2;
+    int disty;
+    int mask_inc;
+    int w;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (bits->common.transform, &v))
+        return iter->buffer;
+
+    ux = ux_top = ux_bottom = bits->common.transform->matrix[0][0];
+    x = x_top = x_bottom = v.vector[0] - pixman_fixed_1/2;
+
+    y = v.vector[1] - pixman_fixed_1/2;
+    disty = pixman_fixed_to_bilinear_weight (y);
+
+    /* Work out the two source rows the filter reads.
+     *
+     * For REPEAT_NONE, a row that falls outside the image is replaced
+     * by the zero[] dummy row.  Because that dummy holds only two
+     * pixels, the row's private x position and step (x_top/ux_top or
+     * x_bottom/ux_bottom) are forced to 0 so that the index into it
+     * never moves while the loops below advance.
+     */
+    y1 = pixman_fixed_to_int (y);
+    y2 = y1 + 1;
+
+    /* Instead of checking whether the operation uses the mask in
+     * each loop iteration, verify this only once and prepare the
+     * variables to make the code smaller inside the loop.
+     */
+    if (!mask)
+    {
+        /* No mask: point at a constant non-zero value and never advance */
+        mask_inc = 0;
+        mask = &one;
+    }
+    else
+    {
+        /* If have a mask, prepare the variables to check it */
+        mask_inc = 1;
+    }
+
+    if (repeat == PIXMAN_REPEAT_NONE)
+    {
+        uint32_t top_mask, bottom_mask;
+
+        if (y1 < 0 || y1 >= bits->height)
+        {
+            top_row = zero;
+            x_top = 0;
+            ux_top = 0;
+        }
+        else
+        {
+            top_row = (uint8_t *)(bits->bits + y1 * bits->rowstride);
+            x_top = x;
+            ux_top = ux;
+        }
+
+        if (y2 < 0 || y2 >= bits->height)
+        {
+            bottom_row = zero;
+            x_bottom = 0;
+            ux_bottom = 0;
+        }
+        else
+        {
+            bottom_row = (uint8_t *)(bits->bits + y2 * bits->rowstride);
+            x_bottom = x;
+            ux_bottom = ux;
+        }
+
+        /* If both are zero, then the whole thing is zero */
+        if (top_row == zero && bottom_row == zero)
+        {
+            return memset (buffer, 0, width * sizeof (uint32_t));
+        }
+        else if (PIXMAN_FORMAT_A(format) == 0)
+        {
+            /* Formats without alpha are opaque inside the image and
+             * transparent outside of it
+             */
+            if (top_row == zero)
+            {
+                top_mask = 0;
+                bottom_mask = 0xff000000;
+            }
+            else if (bottom_row == zero)
+            {
+                top_mask = 0xff000000;
+                bottom_mask = 0;
+            }
+            else
+            {
+                top_mask = 0xff000000;
+                bottom_mask = 0xff000000;
+            }
+        }
+        else
+        {
+            top_mask = 0;
+            bottom_mask = 0;
+        }
+
+        /* Zero fill to the left of the image */
+        while (buffer < end && x < pixman_fixed_minus_1)
+        {
+            *buffer++ = 0;
+            x += ux;
+            x_top += ux_top;
+            x_bottom += ux_bottom;
+            mask += mask_inc;
+        }
+
+        /* Left edge: the left-hand sample is outside the image, the
+         * right-hand sample is column 0
+         */
+        while (buffer < end && x < 0)
+        {
+            uint32_t top[2] = {0, convert_pixel (top_row, 0) | top_mask};
+            uint32_t bot[2] = {0, convert_pixel (bottom_row, 0) | bottom_mask};
+            int32_t distx = pixman_fixed_to_bilinear_weight (x);
+
+            *buffer++ = sse2_bilinear_interpolation (top, bot, distx, disty);
+
+            x += ux;
+            x_top += ux_top;
+            x_bottom += ux_bottom;
+            mask += mask_inc;
+        }
+
+        /* Main part: both samples are inside the image */
+        w = pixman_int_to_fixed (bits->width - 1);
+        if (format == PIXMAN_a8r8g8b8 &&
+            ux_top == ux && ux_bottom == ux && x < w)
+        {
+            /* Both rows are real image rows, so the existing a8r8g8b8
+             * scanline routine can produce the whole run at once.
+             */
+            int n = sse2_bilinear_samples_below (end - buffer, x, ux, w);
+
+            scaled_bilinear_scanline_sse2_8888_8888_SRC (
+                buffer, NULL,
+                (uint32_t *)top_row,
+                (uint32_t *)bottom_row,
+                n,
+                BILINEAR_INTERPOLATION_RANGE - disty, disty,
+                x, ux,
+                0, 0);
+
+            buffer += n;
+            x_bottom = x_top = x += ux * n;
+            mask += mask_inc * n;
+        }
+        else
+        {
+            while (buffer < end && x < w)
+            {
+                if (*mask)
+                {
+                    int32_t distx = pixman_fixed_to_bilinear_weight (x);
+                    uint32_t top[2] = {
+                        convert_pixel (top_row, pixman_fixed_to_int (x_top)) | top_mask,
+                        convert_pixel (top_row, pixman_fixed_to_int (x_top) + 1) | top_mask,
+                    };
+                    uint32_t bot[2] = {
+                        convert_pixel (bottom_row, pixman_fixed_to_int (x_bottom)) | bottom_mask,
+                        convert_pixel (bottom_row, pixman_fixed_to_int (x_bottom) + 1) | bottom_mask,
+                    };
+
+                    *buffer = sse2_bilinear_interpolation (top, bot, distx, disty);
+                }
+
+                buffer++;
+                x += ux;
+                x_top += ux_top;
+                x_bottom += ux_bottom;
+                mask += mask_inc;
+            }
+        }
+
+        /* Right edge: the right-hand sample is outside the image */
+        w = pixman_int_to_fixed (bits->width);
+        while (buffer < end && x < w)
+        {
+            if (*mask)
+            {
+                uint32_t top[2] = { convert_pixel (top_row, pixman_fixed_to_int (x_top)) | top_mask, 0};
+                uint32_t bot[2] = { convert_pixel (bottom_row, pixman_fixed_to_int (x_bottom)) | bottom_mask, 0};
+                int32_t distx = pixman_fixed_to_bilinear_weight (x);
+
+                *buffer = sse2_bilinear_interpolation (top, bot, distx, disty);
+            }
+
+            buffer++;
+            x += ux;
+            x_top += ux_top;
+            x_bottom += ux_bottom;
+            mask += mask_inc;
+        }
+
+        /* Zero fill to the right of the image */
+        while (buffer < end)
+            *buffer++ = 0;
+    }
+    else
+    {
+        uint32_t alpha = PIXMAN_FORMAT_A (format) ? 0 : 0xff000000;
+
+        /* Clamp both rows to the image */
+        if (y1 <= 0)
+        {
+            top_row = (uint8_t *)(bits->bits);
+        }
+        else if (y1 >= bits->height)
+        {
+            top_row = (uint8_t *)(bits->bits + (bits->height-1) * bits->rowstride);
+        }
+        else
+        {
+            top_row = (uint8_t *)(bits->bits + y1 * bits->rowstride);
+        }
+
+        if (y2 <= 0)
+        {
+            bottom_row = (uint8_t *)(bits->bits);
+        }
+        else if (y2 >= bits->height)
+        {
+            bottom_row = (uint8_t *)(bits->bits + (bits->height-1) * bits->rowstride);
+        }
+        else
+        {
+            bottom_row = (uint8_t *)(bits->bits + y2 * bits->rowstride);
+        }
+
+        /* Left edge: everything at or left of column 0 is the same
+         * vertical blend of the first column
+         */
+        if (x <= 0)
+        {
+            uint32_t top = convert_pixel (top_row, 0) | alpha;
+            uint32_t bot = convert_pixel (bottom_row, 0) | alpha;
+            uint32_t p = linear_interpolation (top, bot, disty);
+            while (buffer < end && x <= 0)
+            {
+                *buffer++ = p;
+                x += ux;
+                mask += mask_inc;
+            }
+        }
+
+        /* Main part */
+        w = pixman_int_to_fixed (bits->width - 1);
+        if (format == PIXMAN_a8r8g8b8 && x < w)
+        {
+            int n = sse2_bilinear_samples_below (end - buffer, x, ux, w);
+
+            scaled_bilinear_scanline_sse2_8888_8888_SRC (
+                buffer, NULL,
+                (uint32_t *)top_row,
+                (uint32_t *)bottom_row,
+                n,
+                BILINEAR_INTERPOLATION_RANGE - disty, disty,
+                x, ux,
+                0, 0);
+
+            buffer += n;
+            x += ux * n;
+            mask += mask_inc * n;
+        }
+        else
+        {
+            while (buffer < end && x < w)
+            {
+                if (*mask)
+                {
+                    int32_t distx = pixman_fixed_to_bilinear_weight (x);
+                    uint32_t top[2] = {
+                        convert_pixel (top_row, pixman_fixed_to_int (x)) | alpha,
+                        convert_pixel (top_row, pixman_fixed_to_int (x) + 1) | alpha,
+                    };
+                    uint32_t bot[2] = {
+                        convert_pixel (bottom_row, pixman_fixed_to_int (x)) | alpha,
+                        convert_pixel (bottom_row, pixman_fixed_to_int (x) + 1) | alpha,
+                    };
+
+                    *buffer = sse2_bilinear_interpolation (top, bot, distx, disty);
+                }
+
+                buffer++;
+                x += ux;
+                mask += mask_inc;
+            }
+        }
+
+        /* Right edge: everything that is left is the same vertical
+         * blend of the last column
+         */
+        if (buffer < end)
+        {
+            uint32_t top = convert_pixel (top_row, bits->width-1) | alpha;
+            uint32_t bot = convert_pixel (bottom_row, bits->width-1) | alpha;
+            uint32_t p = linear_interpolation (top, bot, disty);
+            while (buffer < end)
+            {
+                *buffer++ = p;
+            }
+        }
+    }
+
+    return iter->buffer;
+}
+
+/* Fetch one bilinearly filtered scanline under a general affine
+ * transform: both source coordinates advance by a constant step per
+ * destination pixel and the 2x2 block is looked up pixel by pixel.
+ *
+ * iter:          supplies the image, destination buffer, start position
+ *                and width; iter->y is advanced by one
+ * mask:          optional; destination pixels whose mask value is 0 are
+ *                skipped and left unwritten
+ * convert_pixel: reads one pixel of the source format as 32 bits
+ * format:        source format; formats without alpha get 0xff000000
+ *                ORed into every pixel fetched from inside the image
+ * repeat_mode:   how coordinates outside the image are treated
+ *
+ * Returns iter->buffer.  If the transform cannot be applied, the buffer
+ * is returned untouched.
+ */
+static force_inline uint32_t *
+sse2_fetch_bilinear_affine (pixman_iter_t *iter,
+                            const uint32_t * mask,
+                            convert_pixel_t     convert_pixel,
+                            pixman_format_code_t        format,
+                            pixman_repeat_t     repeat_mode)
+{
+    pixman_image_t *image = iter->image;
+    int offset = iter->x;
+    int line = iter->y++;
+    int width = iter->width;
+    uint32_t *buffer = iter->buffer;
+    bits_image_t *bits = &image->bits;
+
+    /* Source dimensions; kept apart from the scanline width above */
+    const int src_width = bits->width;
+    const int src_height = bits->height;
+
+    /* Alpha forced onto pixels of formats that carry none */
+    const uint32_t opaque = PIXMAN_FORMAT_A (format) ? 0 : 0xff000000;
+
+    /* Note: PIXMAN_FORMAT_BPP() returns an unsigned value,
+     * which means if you use it in expressions, those
+     * expressions become unsigned themselves. Since
+     * the variables below can be negative in some cases,
+     * that will lead to crashes on 64 bit architectures.
+     *
+     * So this line makes sure bpp is signed
+     */
+    const int bpp = PIXMAN_FORMAT_BPP (format);
+
+    pixman_fixed_t x, y;
+    pixman_fixed_t ux, uy;
+    pixman_vector_t v;
+    int i;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (image->common.transform, &v))
+        return iter->buffer;
+
+    ux = image->common.transform->matrix[0][0];
+    uy = image->common.transform->matrix[1][0];
+
+    x = v.vector[0];
+    y = v.vector[1];
+
+    for (i = 0; i < width; ++i, x += ux, y += uy)
+    {
+        int x1, y1, x2, y2;
+        uint32_t top[2], bot[2];
+        int32_t distx, disty;
+        const uint8_t *top_row;
+        const uint8_t *bot_row;
+
+        if (mask && !mask[i])
+            continue;
+
+        x1 = x - pixman_fixed_1 / 2;
+        y1 = y - pixman_fixed_1 / 2;
+
+        distx = pixman_fixed_to_bilinear_weight (x1);
+        disty = pixman_fixed_to_bilinear_weight (y1);
+
+        y1 = pixman_fixed_to_int (y1);
+        y2 = y1 + 1;
+        x1 = pixman_fixed_to_int (x1);
+        x2 = x1 + 1;
+
+        if (repeat_mode != PIXMAN_REPEAT_NONE)
+        {
+            /* Map all four coordinates back into the image */
+            repeat (repeat_mode, &x1, src_width);
+            repeat (repeat_mode, &y1, src_height);
+            repeat (repeat_mode, &x2, src_width);
+            repeat (repeat_mode, &y2, src_height);
+
+            top_row = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
+            bot_row = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
+
+            top[0] = convert_pixel (top_row, x1) | opaque;
+            top[1] = convert_pixel (top_row, x2) | opaque;
+            bot[0] = convert_pixel (bot_row, x1) | opaque;
+            bot[1] = convert_pixel (bot_row, x2) | opaque;
+        }
+        else
+        {
+            uint32_t top_alpha, bot_alpha;
+
+            /* The whole 2x2 block is outside the image */
+            if (x1 >= src_width || x2 < 0 || y1 >= src_height || y2 < 0)
+            {
+                buffer[i] = 0;
+                continue;
+            }
+
+            /* y2 == 0 means the top row is just above the image */
+            if (y2 == 0)
+            {
+                top_row = zero;
+                top_alpha = 0;
+            }
+            else
+            {
+                top_row = (uint8_t *)bits->bits + bits->rowstride * 4 * y1;
+                top_row += bpp / 8 * x1;
+                top_alpha = opaque;
+            }
+
+            /* y1 on the last row means the bottom row is just below */
+            if (y1 == src_height - 1)
+            {
+                bot_row = zero;
+                bot_alpha = 0;
+            }
+            else
+            {
+                bot_row = (uint8_t *)bits->bits + bits->rowstride * 4 * y2;
+                bot_row += bpp / 8 * x1;
+                bot_alpha = opaque;
+            }
+
+            /* The row pointers already include x1, so the left and
+             * right samples are at index 0 and 1
+             */
+            if (x2 == 0)
+            {
+                top[0] = 0;
+                bot[0] = 0;
+            }
+            else
+            {
+                top[0] = convert_pixel (top_row, 0) | top_alpha;
+                bot[0] = convert_pixel (bot_row, 0) | bot_alpha;
+            }
+
+            if (x1 == src_width - 1)
+            {
+                top[1] = 0;
+                bot[1] = 0;
+            }
+            else
+            {
+                top[1] = convert_pixel (top_row, 1) | top_alpha;
+                bot[1] = convert_pixel (bot_row, 1) | bot_alpha;
+            }
+        }
+
+        buffer[i] = sse2_bilinear_interpolation (top, bot, distx, disty);
+    }
+
+    return iter->buffer;
+}
+
+/* Read pixel x of an a8r8g8b8 row; the stored value is used as is. */
+static force_inline uint32_t
+convert_a8r8g8b8 (const uint8_t *row, int x)
+{
+    const uint32_t *pixels = (const uint32_t *)row;
+
+    return pixels[x];
+}
+
+/* Read pixel x of an x8r8g8b8 row.  The unused top byte is returned
+ * unchanged; callers OR in the alpha they need.
+ */
+static force_inline uint32_t
+convert_x8r8g8b8 (const uint8_t *row, int x)
+{
+    return ((const uint32_t *)row)[x];
+}
+
+/* Read pixel x of an a8 row and return it with the alpha value in the
+ * top byte and all colour channels zero.
+ */
+static force_inline uint32_t
+convert_a8 (const uint8_t *row, int x)
+{
+    /* Widen before shifting: the byte would otherwise be promoted to a
+     * signed int, and shifting a value >= 0x80 into the sign bit is
+     * undefined behaviour.
+     */
+    return (uint32_t) row[x] << 24;
+}
+
+/* Read pixel x of an r5g6b5 row and expand it with
+ * convert_0565_to_0888 ().
+ */
+static force_inline uint32_t
+convert_r5g6b5 (const uint8_t *row, int x)
+{
+    const uint16_t packed = ((uint16_t *)row)[x];
+
+    return convert_0565_to_0888 (packed);
+}
+
+/* Defines sse2_fetch_bilinear_<fmt>_<rep> (), a scanline fetcher that
+ * calls sse2_fetch_bilinear () with the converter, format code and
+ * repeat mode fixed for one format/repeat combination.
+ */
+#define MAKE_SIMPLE_BILINEAR_FETCHER(fmt, rep)                          \
+    static uint32_t *                                                   \
+    sse2_fetch_bilinear_ ## fmt ## _ ## rep (pixman_iter_t  *iter,      \
+                                             const uint32_t *mask)      \
+    {                                                                   \
+        return sse2_fetch_bilinear (iter, mask,                         \
+                                    convert_ ## fmt,                    \
+                                    PIXMAN_ ## fmt,                     \
+                                    PIXMAN_REPEAT_ ## rep);             \
+    }
+
+MAKE_SIMPLE_BILINEAR_FETCHER (a8r8g8b8, NONE)
+MAKE_SIMPLE_BILINEAR_FETCHER (a8r8g8b8, PAD)
+MAKE_SIMPLE_BILINEAR_FETCHER (x8r8g8b8, NONE)
+MAKE_SIMPLE_BILINEAR_FETCHER (x8r8g8b8, PAD)
+MAKE_SIMPLE_BILINEAR_FETCHER (a8,       NONE)
+MAKE_SIMPLE_BILINEAR_FETCHER (a8,       PAD)
+MAKE_SIMPLE_BILINEAR_FETCHER (r5g6b5,   NONE)
+MAKE_SIMPLE_BILINEAR_FETCHER (r5g6b5,   PAD)
+
+/* Defines sse2_fetch_bilinear_affine_<suffix> (), a scanline fetcher
+ * that calls sse2_fetch_bilinear_affine () with the converter, format
+ * code and repeat mode fixed for one format/repeat combination.
+ */
+#define MAKE_BILINEAR_FETCHER(suffix, fmt, rep)                         \
+    static uint32_t *                                                   \
+    sse2_fetch_bilinear_affine_ ## suffix (pixman_iter_t  *iter,        \
+                                           const uint32_t *mask)        \
+    {                                                                   \
+        return sse2_fetch_bilinear_affine (iter, mask,                  \
+                                           convert_ ## fmt,             \
+                                           PIXMAN_ ## fmt,              \
+                                           PIXMAN_REPEAT_ ## rep);      \
+    }
+
+MAKE_BILINEAR_FETCHER (pad_a8r8g8b8,     a8r8g8b8, PAD)
+MAKE_BILINEAR_FETCHER (none_a8r8g8b8,    a8r8g8b8, NONE)
+MAKE_BILINEAR_FETCHER (reflect_a8r8g8b8, a8r8g8b8, REFLECT)
+MAKE_BILINEAR_FETCHER (normal_a8r8g8b8,  a8r8g8b8, NORMAL)
+MAKE_BILINEAR_FETCHER (pad_x8r8g8b8,     x8r8g8b8, PAD)
+MAKE_BILINEAR_FETCHER (none_x8r8g8b8,    x8r8g8b8, NONE)
+MAKE_BILINEAR_FETCHER (reflect_x8r8g8b8, x8r8g8b8, REFLECT)
+MAKE_BILINEAR_FETCHER (normal_x8r8g8b8,  x8r8g8b8, NORMAL)
+MAKE_BILINEAR_FETCHER (pad_a8,           a8,       PAD)
+MAKE_BILINEAR_FETCHER (none_a8,          a8,       NONE)
+MAKE_BILINEAR_FETCHER (reflect_a8,       a8,       REFLECT)
+MAKE_BILINEAR_FETCHER (normal_a8,        a8,       NORMAL)
+MAKE_BILINEAR_FETCHER (pad_r5g6b5,       r5g6b5,   PAD)
+MAKE_BILINEAR_FETCHER (none_r5g6b5,      r5g6b5,   NONE)
+MAKE_BILINEAR_FETCHER (reflect_r5g6b5,   r5g6b5,   REFLECT)
+MAKE_BILINEAR_FETCHER (normal_r5g6b5,    r5g6b5,   NORMAL)
+
+/* One entry of the bilinear fetcher table */
+typedef struct
+{
+    pixman_format_code_t       format;       /* source format handled */
+    uint32_t                   flags;        /* image flags that must all be set */
+    pixman_iter_get_scanline_t get_scanline; /* fetcher to install on a match */
+} bilinear_fetcher_info_t;
+
+/* Flags every bilinear fetcher in the table requires */
+#define GENERAL_BILINEAR_FLAGS                                          \
+    (FAST_PATH_STANDARD_FLAGS           |                               \
+     FAST_PATH_HAS_TRANSFORM            |                               \
+     FAST_PATH_AFFINE_TRANSFORM         |                               \
+     FAST_PATH_BILINEAR_FILTER)
+
+/* Extra requirements of the sse2_fetch_bilinear () based fetchers */
+#define FAST_BILINEAR_FLAGS                                             \
+    (GENERAL_BILINEAR_FLAGS             |                               \
+     FAST_PATH_X_UNIT_POSITIVE          |                               \
+     FAST_PATH_Y_UNIT_ZERO)
+
+#define BILINEAR_SIMPLE_FAST_PATH(fmt, rep)                             \
+    { PIXMAN_ ## fmt,                                                   \
+      FAST_BILINEAR_FLAGS | FAST_PATH_ ## rep ## _REPEAT,               \
+      sse2_fetch_bilinear_ ## fmt ## _ ## rep,                          \
+    }
+
+#define BILINEAR_AFFINE_FAST_PATH(suffix, fmt, rep)                     \
+    { PIXMAN_ ## fmt,                                                   \
+      GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## rep ## _REPEAT,            \
+      sse2_fetch_bilinear_affine_ ## suffix,                            \
+    }
+
+/* Searched front to back; the first entry whose format matches and
+ * whose flags are all present wins, so the more specialised entries
+ * come first.  Terminated by a PIXMAN_null entry.
+ */
+static const bilinear_fetcher_info_t bilinear_fetcher_info[] =
+{
+    /* Scale-and-translate transforms with a positive x step */
+    BILINEAR_SIMPLE_FAST_PATH (a8r8g8b8, NONE),
+    BILINEAR_SIMPLE_FAST_PATH (a8r8g8b8, PAD),
+    BILINEAR_SIMPLE_FAST_PATH (x8r8g8b8, NONE),
+    BILINEAR_SIMPLE_FAST_PATH (x8r8g8b8, PAD),
+    BILINEAR_SIMPLE_FAST_PATH (a8, NONE),
+    BILINEAR_SIMPLE_FAST_PATH (a8, PAD),
+    BILINEAR_SIMPLE_FAST_PATH (r5g6b5, NONE),
+    BILINEAR_SIMPLE_FAST_PATH (r5g6b5, PAD),
+
+    /* Any affine transform */
+    BILINEAR_AFFINE_FAST_PATH (pad_a8r8g8b8, a8r8g8b8, PAD),
+    BILINEAR_AFFINE_FAST_PATH (none_a8r8g8b8, a8r8g8b8, NONE),
+    BILINEAR_AFFINE_FAST_PATH (reflect_a8r8g8b8, a8r8g8b8, REFLECT),
+    BILINEAR_AFFINE_FAST_PATH (normal_a8r8g8b8, a8r8g8b8, NORMAL),
+    BILINEAR_AFFINE_FAST_PATH (pad_x8r8g8b8, x8r8g8b8, PAD),
+    BILINEAR_AFFINE_FAST_PATH (none_x8r8g8b8, x8r8g8b8, NONE),
+    BILINEAR_AFFINE_FAST_PATH (reflect_x8r8g8b8, x8r8g8b8, REFLECT),
+    BILINEAR_AFFINE_FAST_PATH (normal_x8r8g8b8, x8r8g8b8, NORMAL),
+    BILINEAR_AFFINE_FAST_PATH (pad_a8, a8, PAD),
+    BILINEAR_AFFINE_FAST_PATH (none_a8, a8, NONE),
+    BILINEAR_AFFINE_FAST_PATH (reflect_a8, a8, REFLECT),
+    BILINEAR_AFFINE_FAST_PATH (normal_a8, a8, NORMAL),
+    BILINEAR_AFFINE_FAST_PATH (pad_r5g6b5, r5g6b5, PAD),
+    BILINEAR_AFFINE_FAST_PATH (none_r5g6b5, r5g6b5, NONE),
+    BILINEAR_AFFINE_FAST_PATH (reflect_r5g6b5, r5g6b5, REFLECT),
+    BILINEAR_AFFINE_FAST_PATH (normal_r5g6b5, r5g6b5, NORMAL),
+
+    { PIXMAN_null },
+};
+
 static pixman_bool_t
 sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
 {
@@ -6376,6 +7079,22 @@ sse2_src_iter_init (pixman_implementation_t *imp, 
pixman_iter_t *iter)
        }
     }
 
+    if ((iter->iter_flags & ITER_NARROW) &&
+       (iter->image_flags & GENERAL_BILINEAR_FLAGS) == GENERAL_BILINEAR_FLAGS)
+    {
+       const bilinear_fetcher_info_t *f;
+
+       for (f = bilinear_fetcher_info; f->format != PIXMAN_null; ++f)
+       {
+           if ((f->flags & iter->image_flags) == f->flags &&
+               f->format == image->common.extended_format_code)
+           {
+               iter->get_scanline = f->get_scanline;
+               return TRUE;
+           }
+       }
+    }
+
     return FALSE;
 }
 
-- 
1.7.10.4

_______________________________________________
Pixman mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/pixman

Reply via email to