I've looked at that crossbar patch for r200 again and improved it a bit.
It will now
- disable texture sampling of units if the result is not used
- reorder tex env instructions to be always in-order on the gpu (according to earlier tests, this can make a performance difference, http://marc.theaimsgroup.com/?l=dri-devel&m=112308244205670&w=2, though I've yet to find an app which doesn't enable the units in-order; the only thing in the real world I've found which doesn't was a marbleblast demo, and it only doesn't because it fails the texture completeness test, not because it actually doesn't enable the unit...)
- try to optimize away env instructions. This is not a general optimizer, which would be very hard to do anyway and more or less impossible due to the requirement of OpenGL to clamp the results after each stage, but it will try to ditch the tex env if it is GL_REPLACE (for both rgb and alpha) by replacing the args in the next tex env.

Seems to work, for instance ut2003 sometimes uses tex envs with 4 units enabled, and the optimizer reduces this to 3 sampled textures, and 2 env instructions. Impressive, isn't it? Unfortunately this makes absolutely no difference in performance... (ut2003 is horribly limited by vertex throughput with the current state of the driver, and anything which causes more cpu cycles to be used will probably make it slower, no matter how many gpu cycles this might save, plus I believe these tex envs which can be optimized are only used for small parts of the screen (powerups maybe).) It MIGHT make more of a performance difference with radeon 8500/9100, as those can sample more textures per pass (at least under some circumstances afaik), but have the same amount of arithmetic resources (afaik).

Does this look somewhat reasonable? The code is a bit ugly (especially the GL_REPLACE env optimize stuff), I don't like that the env args have to be parsed two times, and it does cause some more cpu cycles spent (roughly 2.5 times as much as previously in the driver's tex env functions according to some quick profiling, it was still only 0.2 percent or so however).
But there doesn't seem to be a good way to clean it up (without making it quite a bit slower at least).

Roland


Index: r200_context.c
===================================================================
RCS file: /cvs/mesa/Mesa/src/mesa/drivers/dri/r200/r200_context.c,v
retrieving revision 1.47
diff -u -r1.47 r200_context.c
--- r200_context.c      11 Aug 2005 19:47:06 -0000      1.47
+++ r200_context.c      25 Aug 2005 14:45:57 -0000
@@ -140,6 +140,7 @@
     { "GL_ARB_texture_env_add",            NULL },
     { "GL_ARB_texture_env_combine",        NULL },
     { "GL_ARB_texture_env_dot3",           NULL },
+    { "GL_ARB_texture_env_crossbar",       NULL },
     { "GL_ARB_texture_mirrored_repeat",    NULL },
     { "GL_ARB_vertex_buffer_object",       
GL_ARB_vertex_buffer_object_functions },
     { "GL_EXT_blend_minmax",               GL_EXT_blend_minmax_functions },
Index: r200_context.h
===================================================================
RCS file: /cvs/mesa/Mesa/src/mesa/drivers/dri/r200/r200_context.h,v
retrieving revision 1.30
diff -u -r1.30 r200_context.h
--- r200_context.h      26 Jul 2005 02:44:02 -0000      1.30
+++ r200_context.h      25 Aug 2005 14:45:57 -0000
@@ -172,8 +172,8 @@
 
 struct r200_texture_env_state {
    r200TexObjPtr texobj;
-   GLenum format;
-   GLenum envMode;
+   GLuint outputreg;
+   GLuint unitneeded;
 };
 
 #define R200_MAX_TEXTURE_UNITS 6
@@ -544,6 +544,7 @@
    struct r200_stencilbuffer_state stencil;
    struct r200_stipple_state stipple;
    struct r200_texture_state texture;
+   GLuint envneeded;
 };
 
 /* Need refcounting on dma buffers:
Index: r200_reg.h
===================================================================
RCS file: /cvs/mesa/Mesa/src/mesa/drivers/dri/r200/r200_reg.h,v
retrieving revision 1.12
diff -u -r1.12 r200_reg.h
--- r200_reg.h  15 Mar 2005 22:23:29 -0000      1.12
+++ r200_reg.h  25 Aug 2005 14:45:58 -0000
@@ -172,6 +172,8 @@
 #define     R200_TEX_BLEND_4_ENABLE                   0x00010000
 #define     R200_TEX_BLEND_5_ENABLE                   0x00020000
 #define     R200_TEX_BLEND_6_ENABLE                   0x00040000
+#define     R200_TEX_BLEND_ENABLE_MASK                0x0007f800
+#define     R200_TEX_BLEND_0_ENABLE_SHIFT             (12)
 #define     R200_MULTI_PASS_ENABLE                    0x00080000
 #define     R200_SPECULAR_ENABLE                      0x00200000
 #define     R200_FOG_ENABLE                           0x00400000
@@ -1146,6 +1148,7 @@
 #define     R200_TXC_CLAMP_WRAP                    (0 << 12)
 #define     R200_TXC_CLAMP_0_1                     (1 << 12)
 #define     R200_TXC_CLAMP_8_8                     (2 << 12)
+#define     R200_TXC_OUTPUT_REG_SHIFT              16
 #define     R200_TXC_OUTPUT_REG_MASK               (7 << 16)
 #define     R200_TXC_OUTPUT_REG_NONE               (0 << 16)
 #define     R200_TXC_OUTPUT_REG_R0                 (1 << 16)
@@ -1288,6 +1291,7 @@
 #define     R200_TXA_CLAMP_WRAP                    (0 << 12)
 #define     R200_TXA_CLAMP_0_1                     (1 << 12)
 #define     R200_TXA_CLAMP_8_8                     (2 << 12)
+#define     R200_TXA_OUTPUT_REG_SHIFT              16
 #define     R200_TXA_OUTPUT_REG_MASK               (7 << 16)
 #define     R200_TXA_OUTPUT_REG_NONE               (0 << 16)
 #define     R200_TXA_OUTPUT_REG_R0                 (1 << 16)
Index: r200_state_init.c
===================================================================
RCS file: /cvs/mesa/Mesa/src/mesa/drivers/dri/r200/r200_state_init.c,v
retrieving revision 1.21
diff -u -r1.21 r200_state_init.c
--- r200_state_init.c   15 Mar 2005 22:23:29 -0000      1.21
+++ r200_state_init.c   25 Aug 2005 14:46:00 -0000
@@ -118,7 +118,9 @@
 #define CHECK( NM, FLAG )                              \
 static GLboolean check_##NM( GLcontext *ctx, int idx ) \
 {                                                      \
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);           \
    (void) idx;                                         \
+   (void) rmesa;                                       \
    return FLAG;                                                \
 }
 
@@ -135,12 +137,17 @@
 CHECK( always, GL_TRUE )
 CHECK( never, GL_FALSE )
 CHECK( tex_any, ctx->Texture._EnabledUnits )
-CHECK( tex_pair, (ctx->Texture.Unit[idx]._ReallyEnabled | 
ctx->Texture.Unit[idx & ~1]._ReallyEnabled))
-CHECK( tex, ctx->Texture.Unit[idx]._ReallyEnabled )
-CHECK( tex_cube, ctx->Texture.Unit[idx]._ReallyEnabled & TEXTURE_CUBE_BIT)
+/*CHECK( tex_pair, (ctx->Texture.Unit[idx]._ReallyEnabled | 
ctx->Texture.Unit[idx & ~1]._ReallyEnabled))*/
+CHECK( tex_pair, (rmesa->state.texture.unit[idx].unitneeded | 
rmesa->state.texture.unit[idx & ~1].unitneeded))
+/*CHECK( tex, ctx->Texture.Unit[idx]._ReallyEnabled )*/
+CHECK( tex, rmesa->state.texture.unit[idx].unitneeded )
+CHECK( texenv, rmesa->state.envneeded & (1 << idx) )
+/*CHECK( tex_cube, ctx->Texture.Unit[idx]._ReallyEnabled & TEXTURE_CUBE_BIT)*/
+CHECK( tex_cube, rmesa->state.texture.unit[idx].unitneeded & TEXTURE_CUBE_BIT )
 CHECK( fog, ctx->Fog.Enabled )
 TCL_CHECK( tcl, GL_TRUE )
-TCL_CHECK( tcl_tex, ctx->Texture.Unit[idx]._ReallyEnabled )
+/*TCL_CHECK( tcl_tex, ctx->Texture.Unit[idx]._ReallyEnabled )*/
+TCL_CHECK( tcl_tex, rmesa->state.texture.unit[idx].unitneeded )
 TCL_CHECK( tcl_lighting, ctx->Light.Enabled )
 TCL_CHECK( tcl_light, ctx->Light.Enabled && ctx->Light.Light[idx].Enabled )
 TCL_CHECK( tcl_ucp, (ctx->Transform.ClipPlanesEnabled & (1 << idx)) )
@@ -300,11 +307,11 @@
    ALLOC_STATE( lit[6], tcl_light, LIT_STATE_SIZE, "LIT/light-6", 6 );
    ALLOC_STATE( lit[7], tcl_light, LIT_STATE_SIZE, "LIT/light-7", 7 );
    ALLOC_STATE( pix[0], always, PIX_STATE_SIZE, "PIX/pixstage-0", 0 );
-   ALLOC_STATE( pix[1], tex, PIX_STATE_SIZE, "PIX/pixstage-1", 1 );
-   ALLOC_STATE( pix[2], tex, PIX_STATE_SIZE, "PIX/pixstage-2", 2 );
-   ALLOC_STATE( pix[3], tex, PIX_STATE_SIZE, "PIX/pixstage-3", 3 );
-   ALLOC_STATE( pix[4], tex, PIX_STATE_SIZE, "PIX/pixstage-4", 4 );
-   ALLOC_STATE( pix[5], tex, PIX_STATE_SIZE, "PIX/pixstage-5", 5 );
+   ALLOC_STATE( pix[1], texenv, PIX_STATE_SIZE, "PIX/pixstage-1", 1 );
+   ALLOC_STATE( pix[2], texenv, PIX_STATE_SIZE, "PIX/pixstage-2", 2 );
+   ALLOC_STATE( pix[3], texenv, PIX_STATE_SIZE, "PIX/pixstage-3", 3 );
+   ALLOC_STATE( pix[4], texenv, PIX_STATE_SIZE, "PIX/pixstage-4", 4 );
+   ALLOC_STATE( pix[5], texenv, PIX_STATE_SIZE, "PIX/pixstage-5", 5 );
    if (rmesa->r200Screen->drmSupportsTriPerf) {
       ALLOC_STATE( prf, always, PRF_STATE_SIZE, "PRF/performance-tri", 0 );
    }
Index: r200_texstate.c
===================================================================
RCS file: /cvs/mesa/Mesa/src/mesa/drivers/dri/r200/r200_texstate.c,v
retrieving revision 1.20
diff -u -r1.20 r200_texstate.c
--- r200_texstate.c     31 May 2005 23:32:47 -0000      1.20
+++ r200_texstate.c     25 Aug 2005 14:46:01 -0000
@@ -351,7 +351,26 @@
    /* FYI: r200UploadTexImages( rmesa, t ) used to be called here */
 }
 
-
+static void print_texenv( GLcontext *ctx, GLuint k ) {
+   int j;
+   const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[k];
+   const GLuint numColorArgs = texUnit->_CurrentCombine->_NumArgsRGB;
+   const GLuint numAlphaArgs = texUnit->_CurrentCombine->_NumArgsA;
+   fprintf(stderr, "%d %sabled ", k, texUnit->_ReallyEnabled ? "en" : "dis");
+   fprintf(stderr, "comb RGB %x ", texUnit->_CurrentCombine->ModeRGB);
+   fprintf(stderr, "comb A %x ", texUnit->_CurrentCombine->ModeA);
+   fprintf(stderr, "RGB args/ops ");
+   for (j = 0; j < numColorArgs; j++) {
+      fprintf(stderr, "%x ", texUnit->_CurrentCombine->SourceRGB[j]);
+      fprintf(stderr, "%d ", texUnit->_CurrentCombine->OperandRGB[j] - 
GL_SRC_COLOR);
+   }
+   fprintf(stderr, "A args/ops ");
+   for (j = 0; j < numAlphaArgs; j++) {
+      fprintf(stderr, "%x ", texUnit->_CurrentCombine->SourceA[j]);
+      fprintf(stderr, "%d ", texUnit->_CurrentCombine->OperandA[j] - 
GL_SRC_ALPHA);
+   }
+   fprintf(stderr, "\n");
+}
 
 /* ================================================================
  * Texture combine functions
@@ -407,6 +426,14 @@
    R200_TXC_ARG_A_TFACTOR_ALPHA | R200_TXC_COMP_ARG_A
 };
 
+static GLuint r200_tfactor1_color[] =
+{
+   R200_TXC_ARG_A_TFACTOR1_COLOR,
+   R200_TXC_ARG_A_TFACTOR1_COLOR | R200_TXC_COMP_ARG_A,
+   R200_TXC_ARG_A_TFACTOR1_ALPHA,
+   R200_TXC_ARG_A_TFACTOR1_ALPHA | R200_TXC_COMP_ARG_A
+};
+
 static GLuint r200_primary_color[] =
 {
    R200_TXC_ARG_A_DIFFUSE_COLOR,
@@ -455,6 +482,12 @@
    R200_TXA_ARG_A_TFACTOR_ALPHA | R200_TXA_COMP_ARG_A
 };
 
+static GLuint r200_tfactor1_alpha[] =
+{
+   R200_TXA_ARG_A_TFACTOR1_ALPHA,
+   R200_TXA_ARG_A_TFACTOR1_ALPHA | R200_TXA_COMP_ARG_A
+};
+
 static GLuint r200_primary_alpha[] =
 {
    R200_TXA_ARG_A_DIFFUSE_ALPHA,
@@ -500,15 +533,17 @@
  * Texture unit state management
  */
 
-static GLboolean r200UpdateTextureEnv( GLcontext *ctx, int unit )
+static GLboolean r200UpdateTextureEnv( GLcontext *ctx, int unit, int slot, 
GLuint replaceargs )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
    GLuint color_combine, alpha_combine;
-   GLuint color_scale = rmesa->hw.pix[unit].cmd[PIX_PP_TXCBLEND2] &
-      ~(R200_TXC_SCALE_MASK);
-   GLuint alpha_scale = rmesa->hw.pix[unit].cmd[PIX_PP_TXABLEND2] &
-      ~(R200_TXA_DOT_ALPHA | R200_TXA_SCALE_MASK);
+   GLuint color_scale = rmesa->hw.pix[slot].cmd[PIX_PP_TXCBLEND2] &
+      ~(R200_TXC_SCALE_MASK | R200_TXC_OUTPUT_REG_MASK | 
R200_TXC_TFACTOR_SEL_MASK |
+       R200_TXC_TFACTOR1_SEL_MASK);
+   GLuint alpha_scale = rmesa->hw.pix[slot].cmd[PIX_PP_TXABLEND2] &
+      ~(R200_TXA_DOT_ALPHA | R200_TXA_SCALE_MASK | R200_TXA_OUTPUT_REG_MASK |
+       R200_TXA_TFACTOR_SEL_MASK | R200_TXA_TFACTOR1_SEL_MASK);
 
    /* texUnit->_Current can be NULL if and only if the texture unit is
     * not actually enabled.
@@ -526,12 +561,16 @@
     * reduces the amount of special-casing we have to do, alpha-only
     * textures being a notable exception.
     */
-   /* Don't cache these results.
-    */
-   rmesa->state.texture.unit[unit].format = 0;
-   rmesa->state.texture.unit[unit].envMode = 0;
+
+   color_scale |= ((rmesa->state.texture.unit[unit].outputreg + 1) << 
R200_TXC_OUTPUT_REG_SHIFT) |
+                       (unit << R200_TXC_TFACTOR_SEL_SHIFT) |
+                       (replaceargs << R200_TXC_TFACTOR1_SEL_SHIFT);
+   alpha_scale |= ((rmesa->state.texture.unit[unit].outputreg + 1) << 
R200_TXA_OUTPUT_REG_SHIFT) |
+                       (unit << R200_TXA_TFACTOR_SEL_SHIFT) |
+                       (replaceargs << R200_TXA_TFACTOR1_SEL_SHIFT);
 
    if ( !texUnit->_ReallyEnabled ) {
+      assert( unit == 0);
       if ( unit == 0 ) {
         color_combine = R200_TXC_ARG_A_ZERO | R200_TXC_ARG_B_ZERO
             | R200_TXC_ARG_C_DIFFUSE_COLOR | R200_TXC_OP_MADD;
@@ -554,14 +593,20 @@
       GLuint Ashift = texUnit->_CurrentCombine->ScaleShiftA;
 
 
+      const GLint replaceoprgb =
+        ctx->Texture.Unit[replaceargs]._CurrentCombine->OperandRGB[0] - 
GL_SRC_COLOR;
+      const GLint replaceopa =
+        ctx->Texture.Unit[replaceargs]._CurrentCombine->OperandA[0] - 
GL_SRC_ALPHA;
+
       /* Step 1:
        * Extract the color and alpha combine function arguments.
        */
       for ( i = 0 ; i < numColorArgs ; i++ ) {
-        const GLint op = texUnit->_CurrentCombine->OperandRGB[i] - 
GL_SRC_COLOR;
+        GLint op = texUnit->_CurrentCombine->OperandRGB[i] - GL_SRC_COLOR;
+        const GLint srcRGBi = texUnit->_CurrentCombine->SourceRGB[i];
         assert(op >= 0);
         assert(op <= 3);
-        switch ( texUnit->_CurrentCombine->SourceRGB[i] ) {
+        switch ( srcRGBi ) {
         case GL_TEXTURE:
            color_arg[i] = r200_register_color[op][unit];
            break;
@@ -572,10 +617,54 @@
            color_arg[i] = r200_primary_color[op];
            break;
         case GL_PREVIOUS:
-           if (unit == 0)
-               color_arg[i] = r200_primary_color[op];
-           else
-               color_arg[i] = r200_register_color[op][0];
+           if (replaceargs != unit) {
+              const GLint srcRGBreplace = 
ctx->Texture.Unit[replaceargs]._CurrentCombine->SourceRGB[0];
+              if (op >= 2) {
+                 op = op ^ replaceopa;
+              }
+              else {
+                 op = op ^ replaceoprgb;
+              }
+              switch (srcRGBreplace) {
+              case GL_TEXTURE:
+                 color_arg[i] = r200_register_color[op][replaceargs];
+                 break;
+              case GL_CONSTANT:
+                 color_arg[i] = r200_tfactor1_color[op];
+                 break;
+              case GL_PRIMARY_COLOR:
+                 color_arg[i] = r200_primary_color[op];
+                 break;
+              case GL_PREVIOUS:
+                 if (slot == 0)
+                    color_arg[i] = r200_primary_color[op];
+                 else
+                    color_arg[i] = 
r200_register_color[op][rmesa->state.texture.unit[replaceargs - 1].outputreg];
+                 break;
+              case GL_ZERO:
+                 color_arg[i] = r200_zero_color[op];
+                 break;
+              case GL_ONE:
+                 color_arg[i] = r200_zero_color[op+1];
+                 break;
+              case GL_TEXTURE0:
+              case GL_TEXTURE1:
+              case GL_TEXTURE2:
+              case GL_TEXTURE3:
+              case GL_TEXTURE4:
+              case GL_TEXTURE5:
+                 color_arg[i] = r200_register_color[op][srcRGBreplace - 
GL_TEXTURE0];
+                 break;
+              default:
+              return GL_FALSE;
+              }
+           }
+           else {
+              if (slot == 0)
+                 color_arg[i] = r200_primary_color[op];
+              else
+                 color_arg[i] = 
r200_register_color[op][rmesa->state.texture.unit[unit - 1].outputreg];
+            }
            break;
         case GL_ZERO:
            color_arg[i] = r200_zero_color[op];
@@ -583,16 +672,25 @@
         case GL_ONE:
            color_arg[i] = r200_zero_color[op+1];
            break;
+        case GL_TEXTURE0:
+        case GL_TEXTURE1:
+        case GL_TEXTURE2:
+        case GL_TEXTURE3:
+        case GL_TEXTURE4:
+        case GL_TEXTURE5:
+           color_arg[i] = r200_register_color[op][srcRGBi - GL_TEXTURE0];
+           break;
         default:
            return GL_FALSE;
         }
       }
 
       for ( i = 0 ; i < numAlphaArgs ; i++ ) {
-        const GLint op = texUnit->_CurrentCombine->OperandA[i] - GL_SRC_ALPHA;
+        GLint op = texUnit->_CurrentCombine->OperandA[i] - GL_SRC_ALPHA;
+        const GLint srcAi = texUnit->_CurrentCombine->SourceA[i];
         assert(op >= 0);
         assert(op <= 1);
-        switch ( texUnit->_CurrentCombine->SourceA[i] ) {
+        switch ( srcAi ) {
         case GL_TEXTURE:
            alpha_arg[i] = r200_register_alpha[op][unit];
            break;
@@ -603,10 +701,49 @@
            alpha_arg[i] = r200_primary_alpha[op];
            break;
         case GL_PREVIOUS:
-           if (unit == 0)
-               alpha_arg[i] = r200_primary_alpha[op];
-           else
-               alpha_arg[i] = r200_register_alpha[op][0];
+           if (replaceargs != unit) {
+              const GLint srcAreplace = 
ctx->Texture.Unit[replaceargs]._CurrentCombine->SourceA[0];
+              op = op ^ replaceopa;
+              switch (srcAreplace) {
+              case GL_TEXTURE:
+                 alpha_arg[i] = r200_register_alpha[op][replaceargs];
+                 break;
+              case GL_CONSTANT:
+                 alpha_arg[i] = r200_tfactor1_alpha[op];
+                 break;
+              case GL_PRIMARY_COLOR:
+                 alpha_arg[i] = r200_primary_alpha[op];
+                 break;
+              case GL_PREVIOUS:
+                 if (slot == 0)
+                    alpha_arg[i] = r200_primary_alpha[op];
+                 else
+                    alpha_arg[i] = 
r200_register_alpha[op][rmesa->state.texture.unit[replaceargs - 1].outputreg];
+                 break;
+              case GL_ZERO:
+                 alpha_arg[i] = r200_zero_alpha[op];
+                 break;
+              case GL_ONE:
+                 alpha_arg[i] = r200_zero_alpha[op+1];
+                 break;
+              case GL_TEXTURE0:
+              case GL_TEXTURE1:
+              case GL_TEXTURE2:
+              case GL_TEXTURE3:
+              case GL_TEXTURE4:
+              case GL_TEXTURE5:
+                 alpha_arg[i] = r200_register_alpha[op][srcAreplace - 
GL_TEXTURE0];
+                 break;
+              default:
+              return GL_FALSE;
+              }
+           }
+           else {
+              if (slot == 0)
+                 alpha_arg[i] = r200_primary_alpha[op];
+              else
+                 alpha_arg[i] = 
r200_register_alpha[op][rmesa->state.texture.unit[unit - 1].outputreg];
+            }
            break;
         case GL_ZERO:
            alpha_arg[i] = r200_zero_alpha[op];
@@ -614,6 +751,14 @@
         case GL_ONE:
            alpha_arg[i] = r200_zero_alpha[op+1];
            break;
+        case GL_TEXTURE0:
+        case GL_TEXTURE1:
+        case GL_TEXTURE2:
+        case GL_TEXTURE3:
+        case GL_TEXTURE4:
+        case GL_TEXTURE5:
+           alpha_arg[i] = r200_register_alpha[op][srcAi - GL_TEXTURE0];
+           break;
         default:
            return GL_FALSE;
         }
@@ -801,20 +946,220 @@
        */
    }
 
-   if ( rmesa->hw.pix[unit].cmd[PIX_PP_TXCBLEND] != color_combine ||
-       rmesa->hw.pix[unit].cmd[PIX_PP_TXABLEND] != alpha_combine ||
-       rmesa->hw.pix[unit].cmd[PIX_PP_TXCBLEND2] != color_scale ||
-       rmesa->hw.pix[unit].cmd[PIX_PP_TXABLEND2] != alpha_scale) {
-      R200_STATECHANGE( rmesa, pix[unit] );
-      rmesa->hw.pix[unit].cmd[PIX_PP_TXCBLEND] = color_combine;
-      rmesa->hw.pix[unit].cmd[PIX_PP_TXABLEND] = alpha_combine;
-      rmesa->hw.pix[unit].cmd[PIX_PP_TXCBLEND2] = color_scale;
-      rmesa->hw.pix[unit].cmd[PIX_PP_TXABLEND2] = alpha_scale;
+   if ( rmesa->hw.pix[slot].cmd[PIX_PP_TXCBLEND] != color_combine ||
+       rmesa->hw.pix[slot].cmd[PIX_PP_TXABLEND] != alpha_combine ||
+       rmesa->hw.pix[slot].cmd[PIX_PP_TXCBLEND2] != color_scale ||
+       rmesa->hw.pix[slot].cmd[PIX_PP_TXABLEND2] != alpha_scale) {
+      R200_STATECHANGE( rmesa, pix[slot] );
+      rmesa->hw.pix[slot].cmd[PIX_PP_TXCBLEND] = color_combine;
+      rmesa->hw.pix[slot].cmd[PIX_PP_TXABLEND] = alpha_combine;
+      rmesa->hw.pix[slot].cmd[PIX_PP_TXCBLEND2] = color_scale;
+      rmesa->hw.pix[slot].cmd[PIX_PP_TXABLEND2] = alpha_scale;
    }
 
    return GL_TRUE;
 }
 
+#define REF_COLOR 1
+#define REF_ALPHA 2
+
+static GLboolean r200UpdateAllTexEnv( GLcontext *ctx )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   GLint i, j, currslot;
+   GLint maxunitused = -1;
+   GLboolean texregfree[6] = {GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE, 
GL_TRUE};
+   GLubyte stageref[7] = {0, 0, 0, 0, 0, 0, 0};
+   GLint nextunit[R200_MAX_TEXTURE_UNITS] = {0, 0, 0, 0, 0, 0};
+   GLint currentnext = -1;
+   GLboolean ok;
+
+   /* find highest used unit */
+   for ( j = 0; j < R200_MAX_TEXTURE_UNITS; j++) {
+      if (ctx->Texture.Unit[j]._ReallyEnabled) {
+        maxunitused = j;
+      }
+   }
+   stageref[maxunitused + 1] = REF_COLOR | REF_ALPHA;
+
+   for ( j = maxunitused; j >= 0; j-- ) {
+      const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[j];
+
+      rmesa->state.texture.unit[j].outputreg = -1;
+
+      if (stageref[j + 1]) {
+
+        /* use the lowest available reg. That gets us automatically reg0 for 
the last stage.
+           need this even for disabled units, as it may get referenced due to 
the replace
+           optimization */
+        for ( i = 0 ; i < R200_MAX_TEXTURE_UNITS; i++ ) {
+           if (texregfree[i]) {
+              rmesa->state.texture.unit[j].outputreg = i;
+              break;
+           }
+        }
+        if (rmesa->state.texture.unit[j].outputreg == -1) {
+           /* no more free regs we can use. Need a fallback :-( */
+           return GL_FALSE;
+         }
+
+         nextunit[j] = currentnext;
+
+         if (!texUnit->_ReallyEnabled) {
+        /* the not enabled stages are referenced "indirectly",
+            must not cut off the lower stages */
+           stageref[j] = REF_COLOR | REF_ALPHA;
+           continue;
+         }
+        currentnext = j;
+ 
+        const GLuint numColorArgs = texUnit->_CurrentCombine->_NumArgsRGB;
+        const GLuint numAlphaArgs = texUnit->_CurrentCombine->_NumArgsA;
+        const GLboolean isdot3rgba = (texUnit->_CurrentCombine->ModeRGB == 
GL_DOT3_RGBA) ||
+                                     (texUnit->_CurrentCombine->ModeRGB == 
GL_DOT3_RGBA_EXT);
+
+
+        /* check if we need the color part, special case for dot3_rgba
+           as if only the alpha part is referenced later on it still is using 
the color part */
+        if ((stageref[j + 1] & REF_COLOR) || isdot3rgba) {
+           for ( i = 0 ; i < numColorArgs ; i++ ) {
+              const GLuint srcRGBi = texUnit->_CurrentCombine->SourceRGB[i];
+              const GLuint op = texUnit->_CurrentCombine->OperandRGB[i];
+              switch ( srcRGBi ) {
+              case GL_PREVIOUS:
+                 /* op 0/1 are referencing color, op 2/3 alpha */
+                 stageref[j] |= (op >> 1) + 1;
+                 break;
+              case GL_TEXTURE:
+                 texregfree[j] = GL_FALSE;
+                 break;
+              case GL_TEXTURE0:
+              case GL_TEXTURE1:
+              case GL_TEXTURE2:
+              case GL_TEXTURE3:
+              case GL_TEXTURE4:
+              case GL_TEXTURE5:
+                 texregfree[srcRGBi - GL_TEXTURE0] = GL_FALSE;
+                 break;
+              default: /* don't care about other sources here */
+                 break;
+              }
+           }
+        }
+
+        /* alpha args are ignored for dot3_rgba */
+        if ((stageref[j + 1] & REF_ALPHA) && !isdot3rgba) {
+
+           for ( i = 0 ; i < numAlphaArgs ; i++ ) {
+              const GLuint srcAi = texUnit->_CurrentCombine->SourceA[i];
+              switch ( srcAi ) {
+              case GL_PREVIOUS:
+                 stageref[j] |= REF_ALPHA;
+                 break;
+              case GL_TEXTURE:
+                 texregfree[j] = GL_FALSE;
+                 break;
+              case GL_TEXTURE0:
+              case GL_TEXTURE1:
+              case GL_TEXTURE2:
+              case GL_TEXTURE3:
+              case GL_TEXTURE4:
+              case GL_TEXTURE5:
+                 texregfree[srcAi - GL_TEXTURE0] = GL_FALSE;
+                 break;
+              default: /* don't care about other sources here */
+                 break;
+              }
+           }
+        }
+      }
+   }
+
+   /* don't enable texture sampling for units if the result is not used */
+   for (i = 0; i < R200_MAX_TEXTURE_UNITS; i++) {
+      if (ctx->Texture.Unit[i]._ReallyEnabled && !texregfree[i])
+        rmesa->state.texture.unit[i].unitneeded = 
ctx->Texture.Unit[i]._ReallyEnabled;
+      else rmesa->state.texture.unit[i].unitneeded = 0;
+   }
+#if 0
+   {
+      for (i = 0; i <= maxunitused; i++) {
+        if ((ctx->Texture.Unit[i]._ReallyEnabled) && texregfree[i]) {
+           int k;
+           fprintf(stderr, "unused tex %d\n", i);
+           for (k = 0; k <= maxunitused; k++) {
+              print_texenv( ctx, k );
+           }
+        }
+        else if (texregfree[i]) {
+           fprintf(stderr, "texture units not enabled in-order, disabled unit 
%d\n", i);
+        }
+      }
+   }
+#endif
+
+   ok = GL_TRUE;
+   currslot = 0;
+   rmesa->state.envneeded = 1;
+
+   i = 0;
+   while ((i <= maxunitused) && (i >= 0)) {
+      /* only output instruction if the results are referenced */
+      if (ctx->Texture.Unit[i]._ReallyEnabled && stageref[i+1]) {
+         GLuint replaceunit = i;
+        /* try to optimize GL_REPLACE away (only one level deep though) */
+        if (   (ctx->Texture.Unit[i]._CurrentCombine->ModeRGB == GL_REPLACE) &&
+               (ctx->Texture.Unit[i]._CurrentCombine->ModeA == GL_REPLACE) &&
+               (ctx->Texture.Unit[i]._CurrentCombine->ScaleShiftRGB == 0) &&
+               (ctx->Texture.Unit[i]._CurrentCombine->ScaleShiftA == 0) &&
+               (nextunit[i] > 0) ) {
+           /* yippie! can optimize it away! */
+#if 0
+           fprintf(stderr, "removing env %d replacing with env %d\n", i, 
nextunit[i]);
+           for (j = 0; j <= maxunitused; j++)
+              print_texenv( ctx, j );
+#endif
+           replaceunit = i;
+           i = nextunit[i];
+        }
+
+        /* need env instruction slot */
+        rmesa->state.envneeded |= 1 << currslot;
+        ok = r200UpdateTextureEnv( ctx, i, currslot, replaceunit );
+        if (!ok) return GL_FALSE;
+#if 0
+        fprintf(stderr, "enabling slot %d unit %d output reg %d\n", currslot, 
i,
+               rmesa->state.texture.unit[i].outputreg);
+#endif
+        currslot++;
+      }
+      i = i + 1;
+   }
+
+   if (currslot == 0) {
+      /* need one stage at least? */
+      /* need this if texturing is disabled, for the very simple case when 
texturing
+         is enabled only for unit 0, combine mode is GL_REPLACE, Scale is 1, 
and
+         operands are GL_SRC_COLOR and GL_SRC_ALPHA respectively could omit it 
I guess
+         but it probably wouldn't make a performance difference anyway */
+#if 0
+      fprintf(stderr, "add texenv stage 0\n");
+#endif
+      rmesa->state.texture.unit[0].outputreg = 0;
+      ok = r200UpdateTextureEnv( ctx, 0, 0, 0 );
+   }
+
+   R200_STATECHANGE( rmesa, ctx );
+   rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~R200_TEX_BLEND_ENABLE_MASK;
+   rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= rmesa->state.envneeded << 
R200_TEX_BLEND_0_ENABLE_SHIFT;
+
+   return ok;
+}
+
+#undef REF_COLOR
+#undef REF_ALPHA
+
+
 #define TEXOBJ_TXFILTER_MASK (R200_MAX_MIP_LEVEL_MASK |                \
                              R200_MIN_FILTER_MASK |            \
                              R200_MAG_FILTER_MASK |            \
@@ -1016,6 +1361,8 @@
    case GL_SPHERE_MAP:
       rmesa->TexGenNeedNormals[unit] = GL_TRUE;
       tgi |= R200_TEXGEN_INPUT_SPHERE<<inputshift;
+      /* GL_SPHERE_MAP doesn't appear to work.
+      return GL_FALSE;*/
       break;
 
    case 0:
@@ -1063,9 +1410,7 @@
       }
 
       R200_STATECHANGE( rmesa, ctx );
-      rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~((R200_TEX_0_ENABLE |
-                                          R200_TEX_BLEND_0_ENABLE) << unit);
-      rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= R200_TEX_BLEND_0_ENABLE; 
+      rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~(R200_TEX_0_ENABLE << unit);
         
       R200_STATECHANGE( rmesa, vtx );
       rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] &= ~(7 << (unit * 3));
@@ -1077,10 +1422,6 @@
       /* Actually want to keep all units less than max active texture
        * enabled, right?  Fix this for >2 texunits.
        */
-      /* FIXME: What should happen here if r200UpdateTextureEnv fails? */
-      if (unit == 0) 
-        r200UpdateTextureEnv( ctx, unit ); 
-
 
       {
         GLuint tmp = rmesa->TexGenEnabled;
@@ -1258,7 +1599,6 @@
    struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
    struct gl_texture_object *tObj = texUnit->_Current;
    r200TexObjPtr t = (r200TexObjPtr) tObj->DriverData;
-   GLenum format;
 
    /* Fallback if there's a texture border */
    if ( tObj->Image[0][tObj->BaseLevel]->Border > 0 )
@@ -1288,8 +1628,7 @@
     */
    if ( 1|| !(rmesa->hw.ctx.cmd[CTX_PP_CNTL] & (R200_TEX_0_ENABLE<<unit))) {
       R200_STATECHANGE( rmesa, ctx );
-      rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= (R200_TEX_0_ENABLE | 
-                                        R200_TEX_BLEND_0_ENABLE) << unit;
+      rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= R200_TEX_0_ENABLE << unit;
 
       R200_STATECHANGE( rmesa, vtx );
       rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] &= ~(7 << (unit * 3));
@@ -1309,16 +1648,6 @@
       rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
    }
 
-   format = tObj->Image[0][tObj->BaseLevel]->Format;
-   if ( rmesa->state.texture.unit[unit].format != format ||
-       rmesa->state.texture.unit[unit].envMode != texUnit->EnvMode ) {
-      rmesa->state.texture.unit[unit].format = format;
-      rmesa->state.texture.unit[unit].envMode = texUnit->EnvMode;
-      if ( ! r200UpdateTextureEnv( ctx, unit ) ) {
-        return GL_FALSE;
-      }
-   }
-
    FALLBACK( rmesa, R200_FALLBACK_BORDER_MODE, t->border_fallback );
    return !t->border_fallback;
 }
@@ -1327,27 +1656,28 @@
 
 static GLboolean r200UpdateTextureUnit( GLcontext *ctx, int unit )
 {
-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+/*   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];*/
 
-   if ( texUnit->_ReallyEnabled & (TEXTURE_RECT_BIT) ) {
+   if ( rmesa->state.texture.unit[unit].unitneeded & (TEXTURE_RECT_BIT) ) {
       return (enable_tex_rect( ctx, unit ) &&
              update_tex_common( ctx, unit ));
    }
-   else if ( texUnit->_ReallyEnabled & (TEXTURE_1D_BIT | TEXTURE_2D_BIT) ) {
+   else if (  rmesa->state.texture.unit[unit].unitneeded & (TEXTURE_1D_BIT | 
TEXTURE_2D_BIT) ) {
       return (enable_tex_2d( ctx, unit ) &&
              update_tex_common( ctx, unit ));
    }
 #if ENABLE_HW_3D_TEXTURE
-   else if ( texUnit->_ReallyEnabled & (TEXTURE_3D_BIT) ) {
+   else if ( rmesa->state.texture.unit[unit].unitneeded & (TEXTURE_3D_BIT) ) {
       return (enable_tex_3d( ctx, unit ) &&
              update_tex_common( ctx, unit ));
    }
 #endif
-   else if ( texUnit->_ReallyEnabled & (TEXTURE_CUBE_BIT) ) {
+   else if ( rmesa->state.texture.unit[unit].unitneeded & (TEXTURE_CUBE_BIT) ) 
{
       return (enable_tex_cube( ctx, unit ) &&
              update_tex_common( ctx, unit ));
    }
-   else if ( texUnit->_ReallyEnabled ) {
+   else if ( rmesa->state.texture.unit[unit].unitneeded ) {
       return GL_FALSE;
    }
    else {
@@ -1363,12 +1693,16 @@
    GLboolean ok;
    GLuint dbg;
 
-   ok = (r200UpdateTextureUnit( ctx, 0 ) &&
+   ok = r200UpdateAllTexEnv( ctx );
+
+   if (ok) {
+      ok = (r200UpdateTextureUnit( ctx, 0 ) &&
         r200UpdateTextureUnit( ctx, 1 ) &&
         r200UpdateTextureUnit( ctx, 2 ) &&
         r200UpdateTextureUnit( ctx, 3 ) &&
         r200UpdateTextureUnit( ctx, 4 ) &&
         r200UpdateTextureUnit( ctx, 5 ));
+   }
 
    FALLBACK( rmesa, R200_FALLBACK_TEXTURE, !ok );
 

Reply via email to