Over the last few days I have been working on the Mesa software blending
code and the existing MMX bug. I've made some progress.
I made a small test program which calls the relevant functions directly as
Alex suggested. In the process I added comments to the assembly code
(which had none). The error is due to the fact that the inner loop blends
two pixels at a time but only tests the mask of the first one, so if the
mask of the first element is zero then both are skipped. I also spotted
some errors in the run-in section, e.g., it ANDs with 4 and compares the
result with 8, which can never be equal...
I still have to study x86 optimization a little further to know how best
to fix both of these situations.
I also made two optimizations in blend_transparency (s_blend.c) which have
no effect on the precision of the result but achieve an overall speedup of
30% in the function. These optimizations are in the C code and benefit all
architectures.
The first was to avoid repeating the input expression inside the DIV255
macro. At least my version of gcc (2.96) wasn't factoring out the common
subexpression, so evaluating it once into a temporary yields a 17% speedup.
The second was to factor the blending equation, halving the number of
multiplications: DIV255(src * t + dest * s) becomes
DIV255((src - dest) * t) + dest. This optimization can be applied in other
places in this file as well.
A third optimization that I'll try is the "double blend" trick (make two
8-bit multiplications at the same time in a 32-bit register) as documented
by Michael Herf (http://www.stereopsis.com/doubleblend.html - a quite
interesting site referred to me by Brian).
I would like to keep improving Mesa's software rendering performance. I know
that, due to its versatility and power, Mesa will never rival a
dedicated and non-conformant software 3D engine such as Unreal's.
Nevertheless, I think that it's possible to make it useful for simple
realtime rendering. Regards,
José Fonseca
Index: swrast/s_blend.c
===================================================================
RCS file: /cvsroot/mesa3d/Mesa/src/swrast/s_blend.c,v
retrieving revision 1.14
diff -u -r1.14 s_blend.c
--- swrast/s_blend.c 27 Mar 2002 15:49:27 -0000 1.14
+++ swrast/s_blend.c 1 Apr 2002 00:34:20 -0000
@@ -132,12 +132,24 @@
#if CHAN_BITS == 8
/* This satisfies Glean and should be reasonably fast */
/* Contributed by Nathan Hand */
+#if 0
#define DIV255(X) (((X) << 8) + (X) + 256) >> 16
+#else
+ const GLint temp;
+#define DIV255(X) (temp = (X), ((temp << 8) + temp + 256) >> 16)
+#endif
+#if 0
const GLint s = CHAN_MAX - t;
const GLint r = DIV255(rgba[i][RCOMP] * t + dest[i][RCOMP] * s);
const GLint g = DIV255(rgba[i][GCOMP] * t + dest[i][GCOMP] * s);
const GLint b = DIV255(rgba[i][BCOMP] * t + dest[i][BCOMP] * s);
const GLint a = DIV255(rgba[i][ACOMP] * t + dest[i][ACOMP] * s);
+#else
+ const GLint r = DIV255((rgba[i][RCOMP] - dest[i][RCOMP]) * t) +
+dest[i][RCOMP];
+ const GLint g = DIV255((rgba[i][GCOMP] - dest[i][GCOMP]) * t) +
+dest[i][GCOMP];
+ const GLint b = DIV255((rgba[i][BCOMP] - dest[i][BCOMP]) * t) +
+dest[i][BCOMP];
+ const GLint a = DIV255((rgba[i][ACOMP] - dest[i][ACOMP]) * t) +
+dest[i][ACOMP];
+#endif
#undef DIV255
#elif CHAN_BITS == 16
const GLfloat tt = (GLfloat) t / CHAN_MAXF;
Index: X86/mmx_blend.S
===================================================================
RCS file: /cvsroot/mesa3d/Mesa/src/X86/mmx_blend.S,v
retrieving revision 1.5
diff -u -r1.5 mmx_blend.S
--- X86/mmx_blend.S 28 Mar 2001 20:44:44 -0000 1.5
+++ X86/mmx_blend.S 1 Apr 2002 00:35:13 -0000
@@ -7,25 +7,35 @@
ALIGNTEXT16
GLOBL GLNAME(_mesa_mmx_blend_transparency)
+/*
+ * void blend_transparency( GLcontext *ctx,
+ * GLuint n,
+ * const GLubyte mask[],
+ * GLchan rgba[][4],
+ * CONST GLchan dest[][4] )
+ *
+ * Common transparency blending mode.
+ */
GLNAME( _mesa_mmx_blend_transparency ):
PUSH_L ( EBP )
MOV_L ( ESP, EBP )
SUB_L ( CONST(52), ESP )
PUSH_L ( EBX )
+
MOV_L ( CONST(16711680), REGOFF(-8, EBP) )
MOV_L ( CONST(16711680), REGOFF(-4, EBP) )
MOV_L ( CONST(0), REGOFF(-16, EBP) )
MOV_L ( CONST(-1), REGOFF(-12, EBP) )
MOV_L ( CONST(-1), REGOFF(-24, EBP) )
MOV_L ( CONST(0), REGOFF(-20, EBP) )
- MOV_L ( REGOFF(24, EBP), EAX )
+ MOV_L ( REGOFF(24, EBP), EAX ) /* rgba */
ADD_L ( CONST(4), EAX )
MOV_L ( EAX, EDX )
- AND_L ( REGOFF(20, EBP), EDX )
+ AND_L ( REGOFF(20, EBP), EDX ) /* mask */
MOV_L ( EDX, EAX )
AND_L ( CONST(4), EAX )
CMP_L ( CONST(8), EAX )
- JNE ( LLBL(GMBT_2) )
+ JNE ( LLBL(GMBT_no_align) )
MOV_L ( REGOFF(20, EBP), EAX )
ADD_L ( CONST(3), EAX )
XOR_L ( EDX, EDX )
@@ -116,28 +126,28 @@
ADD_L ( CONST(4), REGOFF(20, EBP) )
ADD_L ( CONST(4), REGOFF(24, EBP) )
DEC_L ( REGOFF(12, EBP) )
-LLBL(GMBT_2):
+LLBL(GMBT_skip_runin):
- CMP_L ( CONST(0), REGOFF(12, EBP) )
- JE ( LLBL(GMBT_3) )
- MOV_L ( CONST(0), REGOFF(-28, EBP) )
+ CMP_L ( CONST(0), REGOFF(12, EBP) ) /* n == 0 */
+ JE ( LLBL(GMBT_zero_length) )
+ MOV_L ( CONST(0), REGOFF(-28, EBP) )
ALIGNTEXT4
-LLBL(GMBT_4):
+LLBL(GMBT_main_loop):
MOV_L ( REGOFF(12, EBP), EDX )
MOV_L ( EDX, EAX )
- SHR_L ( CONST(1), EAX )
+ SHR_L ( CONST(1), EAX ) /* eax = n/2 */
CMP_L ( EAX, REGOFF(-28, EBP) )
- JB ( LLBL(GMBT_7) )
- JMP ( LLBL(GMBT_5) )
+ JB ( LLBL(GMBT_no_jump) )
+ JMP ( LLBL(GMBT_end_loop) )
ALIGNTEXT16
-LLBL(GMBT_7):
+LLBL(GMBT_nojump):
MOV_L ( REGOFF(-28, EBP), EAX )
LEA_L ( REGDIS(0,EAX,2), EDX )
- MOV_L ( REGOFF(16, EBP), EAX )
+ MOV_L ( REGOFF(16, EBP), EAX ) /* mask */
CMP_B ( CONST(0), REGBI(EAX,EDX) )
- JE ( LLBL(GMBT_6) )
+ JE ( LLBL(GMBT_masked) )
MOV_L ( REGOFF(-28, EBP), EAX )
MOV_L ( EAX, EDX )
LEA_L ( REGDIS(0,EDX,8), ECX )
@@ -195,25 +205,22 @@
POR ( MM1, MM4 )
MOVQ ( MM4, REGIND(EAX) )
-
-LLBL(GMBT_8):
-
-LLBL(GMBT_6):
+LLBL(GMBT_masked):
INC_L ( REGOFF(-28, EBP) )
- JMP ( LLBL(GMBT_4) )
+ JMP ( LLBL(GMBT_main_loop) )
ALIGNTEXT16
-LLBL(GMBT_5):
+LLBL(GMBT_end_loop):
EMMS
-LLBL(GMBT_3):
+LLBL(GMBT_runout):
MOV_L ( REGOFF(12, EBP), EAX )
AND_L ( CONST(1), EAX )
TEST_L ( EAX, EAX )
- JE ( LLBL(GMBT_9) )
+ JE ( LLBL(GMBT_skip_runout) )
MOV_L ( REGOFF(12, EBP), EAX )
LEA_L ( REGDIS(0,EAX,4), EDX )
MOV_L ( EDX, EAX )
@@ -342,9 +349,7 @@
LEA_L ( REGOFF(-1, EAX), EDX )
MOV_B ( REGOFF(-32, EBP), AL )
MOV_B ( AL, REGIND(EDX) )
-LLBL(GMBT_9):
-
-LLBL(GMBT_1):
+LLBL(GMBT_skip_runout):
MOV_L ( REGOFF(-56, EBP), EBX )
MOV_L ( EBP, ESP )