/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 
/**
 * \file read_rgba_span_MMX.S
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

	.file	"read_rgba_span_MMX.S"
	.section	.rodata
	.align 16
	.type	mask, @object

/* Two 64-bit constants used by the MMX conversion loop, each covering two
 * 32-bit pixels:
 *
 *    mask     - 0xff00ff00 per pixel: selects bytes 1 and 3
 *    mask + 8 - 0x00ff0000 per pixel: selects byte 2
 *
 * The symbol size is computed from the data actually emitted so that it
 * cannot drift out of sync with the table.
 */
mask:
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0x00ff0000
	.long	0x00ff0000
	.size	mask, .-mask

/*
 * Copy a span of 32-bit pixels from a source buffer to a destination
 * buffer, exchanging byte 0 and byte 2 of every pixel.  Bytes 1 and 3 are
 * copied unchanged.
 *
 * Arguments are read from the stack, in this order:
 *    1. source pointer
 *    2. destination pointer
 *    3. number of pixels to copy (signed; nothing is copied when <= 0)
 *
 * %ebx and %esi are saved and restored.  %eax, %ecx, %edx, the flags and
 * %mm0-%mm4 are clobbered.  emms is executed before returning on every
 * path that touches an MMX register.
 *
 * The code must live in an executable section, so switch to .text
 * explicitly instead of inheriting whatever section was active before.
 */
	.text
	.align 16
.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
	.type	_generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%esi
	pushl	%ebx

	emms

	/* Two registers were pushed, so the first argument is at 12(%esp). */

	movl	12(%esp), %ebx	/* source pointer */
	movl	16(%esp), %ecx	/* destination pointer */
	movl	20(%esp), %edx	/* number of pixels to copy */

	/* Bail if there's nothing to do.  A negative count would otherwise
	 * be treated as a huge loop count below.  No MMX register has been
	 * written yet, so no additional emms is needed on this path.
	 */

	testl	%edx, %edx
	jle	.Lmmx_done

	movq	mask, %mm1	/* 0xff00ff00 per pixel: bytes 1 and 3 */
	movq	mask+8, %mm2	/* 0x00ff0000 per pixel: byte 2 */

	/* %eax = number of bytes from the source pointer up to the next
	 * 8-byte boundary.  Zero means the source is already aligned and
	 * the MMX loop can start right away.
	 */

	movl	%ebx, %eax
	negl	%eax
	andl	$7, %eax
	je	.Lmmx_pairs

	/* %esi = min(%eax / 4, %edx): the number of whole pixels to convert
	 * one at a time before entering the MMX loop.  %edx keeps the
	 * number of pixels left after that.
	 */

	sarl	$2, %eax
	movl	%edx, %esi
	cmpl	%edx, %eax
	jge	.Lmmx_head_count
	movl	%eax, %esi
.Lmmx_head_count:
	subl	%esi, %edx
	testl	%esi, %esi	/* sets ZF for the loop test below */
	jmp	.Lmmx_head_test
.Lmmx_head_pixel:
	movb	2(%ebx), %al
	movb	%al, (%ecx)
	movb	1(%ebx), %al
	movb	%al, 1(%ecx)
	movb	(%ebx), %al
	movb	%al, 2(%ecx)
	movb	3(%ebx), %al
	movb	%al, 3(%ecx)
	addl	$4, %ebx
	addl	$4, %ecx
	subl	$1, %esi
.Lmmx_head_test:
	jne	.Lmmx_head_pixel
.Lmmx_pairs:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax		/* pixel pairs; ZF is set when there are none */
	jmp	.Lmmx_pair_test
.Lmmx_pair:
	movq	(%ebx), %mm0	/* two source pixels */
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	/* %mm3 = byte 2 of each pixel, moved down to byte 0. */
	pand	%mm2, %mm3
	psrlq	$16, %mm3

	/* %mm4 = byte 0 of each pixel, moved up to byte 2.  The 64-bit shift
	 * carries bits across the pixel boundary; the mask discards them.
	 */
	psllq	$16, %mm4
	pand	%mm2, %mm4

	/* Keep bytes 1 and 3 in place and merge in the two swapped bytes. */
	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.Lmmx_pair_test:
	jne	.Lmmx_pair

	emms

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	andl	$1, %edx
	je	.Lmmx_done

	movb	2(%ebx), %al
	movb	%al, (%ecx)
	movb	1(%ebx), %al
	movb	%al, 1(%ecx)
	movb	(%ebx), %al
	movb	%al, 2(%ecx)
	movb	3(%ebx), %al
	movb	%al, 3(%ecx)

.Lmmx_done:
	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX
