/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 
/**
 * \file read_rgba_span_x86.S
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

	.file	"read_rgba_span_x86.S"
	.section	.rodata
	.align 16
	.type	mask, @object
	.size	mask, 32

/* Two 16-byte AND masks used by the span conversion routines in this file.
 * The table is 16-byte aligned so each half can be fetched with an aligned
 * 128-bit load; the MMX code only reads the first 8 bytes of each half.
 */
mask:
	/* mask + 0: keeps bits 8-15 and 24-31 of every 32-bit pixel, i.e. the
	 * two bytes that stay in place during the conversion.
	 */
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0xff00ff00
	.long	0xff00ff00

	/* mask + 16: keeps bits 16-23 of every 32-bit pixel.  It is applied
	 * both before a 16-bit right shift and after a 16-bit left shift to
	 * isolate the two bytes that trade places.
	 */
	.long	0x00ff0000
	.long	0x00ff0000
	.long	0x00ff0000
	.long	0x00ff0000

/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * For every 4-byte pixel the bytes at offsets 0 and 2 are exchanged while
 * the bytes at offsets 1 and 3 are copied unchanged.
 *
 * Arguments are read from the stack (the routine returns with a plain
 * \c ret, so the caller removes them):
 *   - 1st: source pointer
 *   - 2nd: destination pointer
 *   - 3rd: number of pixels to copy
 *
 * Register roles inside the routine:
 *   - \c %ebx  running source pointer      (saved / restored)
 *   - \c %ecx  running destination pointer
 *   - \c %edx  pixels still to be converted
 *   - \c %esi  pixels handled by the alignment loop (saved / restored)
 *   - \c %eax  scratch / loop counter
 *   - \c %mm0 - \c %mm4 are overwritten.
 *
 * The routine lives in \c .text explicitly; without the directive it would
 * be assembled into whatever section was selected last (the read-only data
 * section that holds \c mask), which is not executable.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.  If \c USE_INNER_EMMS is defined, EMMS is issued
 * on entry and again after the MMX loop.
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
	.type	_generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%esi
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
	movq	mask, %mm1		/* bytes that stay in place */
	movq	mask+16, %mm2		/* byte lane used for the swap */

	/* Two registers were pushed, so the first argument is at 12(%esp). */
	movl	12(%esp), %ebx	/* source pointer */
	movl	20(%esp), %edx	/* number of pixels to copy */
	movl	16(%esp), %ecx	/* destination pointer */

	movl	%ebx, %eax
	movl	%edx, %esi

	/* eax = number of bytes up to the next 8-byte boundary of the source.
	 * Nothing to do here if the source is already 8-byte aligned.
	 */
	negl	%eax
	andl	$7, %eax
	je	.L17

	/* Convert the byte distance to whole pixels and clamp it to the
	 * number of pixels requested: esi = min(eax, edx) (signed compare).
	 */
	sarl	$2, %eax
	cmpl	%edx, %eax
	jge	.L14
	movl	%eax, %esi
.L14:
	subl	%esi, %edx
	testl	%esi, %esi	/* sets ZF for the loop test at .L15 */
	jmp	.L15

	/* Alignment loop: one pixel per pass, one byte at a time. */
.L16:
	movb	2(%ebx), %al
	movb	%al, (%ecx)
	movb	1(%ebx), %al
	movb	%al, 1(%ecx)
	movb	(%ebx), %al
	movb	%al, 2(%ecx)
	movb	3(%ebx), %al
	addl	$4, %ebx
	movb	%al, 3(%ecx)
	addl	$4, %ecx
	subl	$1, %esi
.L15:
	jne	.L16
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	/* eax = number of 2-pixel groups; shrl leaves ZF set when it is 0. */
	movl	%edx, %eax
	shrl	%eax
	jmp	.L18
.L19:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	/* mm3: byte 2 of each pixel moved down to byte 0. */
	pand	%mm2, %mm3
	psrlq	$16, %mm3

	/* mm4: byte 0 of each pixel moved up to byte 2.  The mask is applied
	 * after the shift to discard whatever crossed into other lanes.
	 */
	psllq	$16, %mm4
	pand	%mm2, %mm4

	/* mm0: bytes 1 and 3 untouched, then merge the swapped bytes. */
	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	jne	.L19

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	andl	$1, %edx
	je	.L20

	movb	2(%ebx), %al
	movb	%al, (%ecx)
	movb	1(%ebx), %al
	movb	%al, 1(%ecx)
	movb	(%ebx), %al
	movb	%al, 2(%ecx)
	movb	3(%ebx), %al
	movb	%al, 3(%ecx)

.L20:
	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * For every 4-byte pixel the bytes at offsets 0 and 2 are exchanged while
 * the bytes at offsets 1 and 3 are copied unchanged.
 *
 * Arguments are read from the stack (the routine returns with a plain
 * \c ret, so the caller removes them):
 *   - 1st: source pointer
 *   - 2nd: destination pointer
 *   - 3rd: number of pixels to copy
 *
 * Register roles inside the routine:
 *   - \c %ebx  running source pointer      (saved / restored)
 *   - \c %ecx  running destination pointer
 *   - \c %edx  pixels still to be converted
 *   - \c %esi  pixels handled by the alignment loop (saved / restored)
 *   - \c %ebp  copy of the stack pointer taken before the 16-byte aligned
 *              scratch slot is carved out     (saved / restored)
 *   - \c %eax  scratch / loop counter
 *   - \c %xmm0 and \c %mm0 - \c %mm7 are overwritten.
 *
 * The routine lives in \c .text explicitly; without the directive it would
 * be assembled into whatever section was selected last, which may be a
 * non-executable data section.
 *
 * \note
 * The alignment loop advances the source four bytes at a time, so the
 * aligned 16-byte load in the main loop presumably relies on the source
 * pointer being a multiple of 4 - confirm against the callers.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.  If \c USE_INNER_EMMS is defined, EMMS is issued
 * on entry and again after the main loop.
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif
	movq	mask, %mm1		/* bytes that stay in place */
	movq	mask+16, %mm2		/* byte lane used for the swap */

	/* Three registers were pushed, so the first argument is at 16(%esp). */
	movl	16(%esp), %ebx	/* source pointer */
	movl	24(%esp), %edx	/* number of pixels to copy */
	movl	20(%esp), %ecx	/* destination pointer */

	/* Reserve a 16-byte aligned scratch slot on the stack; ebp remembers
	 * the original stack pointer so it can be restored later.
	 */
	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

	movl	%ebx, %eax
	movl	%edx, %esi

	/* eax = number of bytes up to the next 16-byte boundary of the
	 * source.  Nothing to do here if the source is already aligned.
	 */
	negl	%eax
	andl	$15, %eax
	je	.L30

	/* Convert the byte distance to whole pixels and clamp it to the
	 * number of pixels requested: esi = min(eax, edx) (signed compare).
	 */
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx
	testl	%esi, %esi	/* sets ZF for the loop test at .L31 */
	jmp	.L31

	/* Alignment loop: one pixel per pass, one byte at a time. */
.L32:
	movzbl	2(%ebx), %eax
	movb	%al, (%ecx)
	movzbl	1(%ebx), %eax
	movb	%al, 1(%ecx)
	movzbl	(%ebx), %eax
	movb	%al, 2(%ecx)
	movzbl	3(%ebx), %eax
	addl	$4, %ebx
	movb	%al, 3(%ecx)
	addl	$4, %ecx
	subl	$1, %esi
.L31:
	jne	.L32

.L30:
	/* eax = number of 4-pixel groups; shrl leaves ZF set when it is 0. */
	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L33
.L34:
	movaps	(%ebx), %xmm0
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2.
	 */
	movaps	%xmm0, (%esp)
	movq	(%esp), %mm0		/* pixels 0 and 1 */
	movq	8(%esp), %mm5		/* pixels 2 and 3 */

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	/* mm3 / mm6: byte 2 of each pixel, moved down to byte 0 below. */
	pand	%mm2, %mm3
	pand	%mm2, %mm6

	/* mm4 / mm7: byte 0 of each pixel moved up to byte 2; masked below
	 * to discard whatever crossed into other lanes.
	 */
	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	/* mm0 / mm5: bytes 1 and 3 untouched. */
	pand	%mm1, %mm0
	pand	%mm1, %mm5

	/* Merge the three pieces of each pixel pair. */
	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.L33:
	jne	.L34

#ifdef USE_INNER_EMMS
	emms
#endif
	/* Release the scratch slot. */
	movl	%ebp, %esp

	/* At this point there are either [0, 3] pixels remaining to be
	 * converted.
	 */

	andl	$3, %edx	/* sets ZF for the loop test at .L35 */
	jmp	.L35

.L36:
	movzbl	2(%ebx), %eax
	movb	%al, (%ecx)
	movzbl	1(%ebx), %eax
	movb	%al, 1(%ecx)
	movzbl	(%ebx), %eax
	movb	%al, 2(%ecx)
	movzbl	3(%ebx), %eax
	addl	$4, %ebx
	movb	%al, 3(%ecx)
	addl	$4, %ecx
	subl	$1, %edx
.L35:
	jne	.L36

	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


	.text

/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.  Four
 * pixels are fetched, converted and stored per pass using XMM registers
 * only; no MMX register is touched.
 *
 * For every 4-byte pixel the bytes at offsets 0 and 2 are exchanged while
 * the bytes at offsets 1 and 3 are copied unchanged.
 *
 * Arguments are read from the stack (the routine returns with a plain
 * \c ret, so the caller removes them):
 *   - 1st: source pointer
 *   - 2nd: destination pointer
 *   - 3rd: number of pixels to copy
 *
 * Register roles inside the routine:
 *   - \c %ebx  running source pointer      (saved / restored)
 *   - \c %ecx  running destination pointer
 *   - \c %edx  pixels still to be converted
 *   - \c %esi  pixels handled by the alignment loop (saved / restored)
 *   - \c %eax  scratch / loop counter
 *   - \c %xmm0 - \c %xmm4 are overwritten.
 *
 * \note
 * The alignment loop advances the source four bytes at a time, so the
 * aligned 16-byte load in the main loop presumably relies on the source
 * pointer being a multiple of 4 - confirm against the callers.
 */
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	pushl	%esi
	pushl	%ebx

	movdqa	mask, %xmm1		/* bytes that stay in place */
	movdqa	mask+16, %xmm2		/* byte lane used for the swap */

	/* Two registers were pushed, so the first argument is at 12(%esp). */
	movl	12(%esp), %ebx	/* source pointer */
	movl	20(%esp), %edx	/* number of pixels to copy */
	movl	16(%esp), %ecx	/* destination pointer */
	movl	%ebx, %eax
	movl	%edx, %esi

	/* If the source pointer isn't a multiple of 16 we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 */

	negl	%eax
	andl	$15, %eax
	je	.L40

	/* Convert the byte distance to whole pixels and clamp it to the
	 * number of pixels requested: esi = min(eax, edx) (unsigned compare).
	 */
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovbe	%eax, %esi
	subl	%esi, %edx

	/* The loop below tests its counter only after the first pass, so it
	 * must not be entered with esi == 0 (zero pixels requested, or fewer
	 * than four bytes to the boundary): the counter would wrap around
	 * and the loop would run far past the end of both buffers.
	 */
	testl	%esi, %esi
	je	.L40

	/* Alignment loop: one pixel per pass, one byte at a time. */
.L41:
	movzbl	2(%ebx), %eax
	movb	%al, (%ecx)
	movzbl	1(%ebx), %eax
	movb	%al, 1(%ecx)
	movzbl	(%ebx), %eax
	movb	%al, 2(%ecx)
	movzbl	3(%ebx), %eax
	addl	$4, %ebx
	movb	%al, 3(%ecx)
	addl	$4, %ecx
	subl	$1, %esi
	jne	.L41

.L40:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu.
	 */

	/* eax = number of 4-pixel groups; shrl leaves ZF set when it is 0. */
	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L42
.L43:
	movdqa	(%ebx), %xmm0
	addl	$16, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0	/* xmm0: bytes 1 and 3 untouched */

	/* xmm3: byte 2 of each pixel moved down to byte 0.
	 * xmm4: byte 0 of each pixel moved up to byte 2; the mask is applied
	 * after the whole-register byte shift to discard whatever crossed
	 * into other lanes.
	 */
	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	/* Merge the three pieces. */
	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movdqu	%xmm0, (%ecx)
	addl	$16, %ecx
	subl	$1, %eax
.L42:
	jne	.L43

	/* Between 0 and 3 pixels remain; andl sets ZF for the test at .L44. */
	andl	$3, %edx
	jmp	.L44

.L45:
	movzbl	2(%ebx), %eax
	movb	%al, (%ecx)
	movzbl	1(%ebx), %eax
	movb	%al, 1(%ecx)
	movzbl	(%ebx), %eax
	movb	%al, 2(%ecx)
	movzbl	3(%ebx), %eax
	addl	$4, %ebx
	movb	%al, 3(%ecx)

	addl	$4, %ecx
	subl	$1, %edx
.L44:
	jne	.L45

	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2
