Module: Mesa
Branch: master
Commit: 6eade342eb223313242c1c2a7615b6bd75036087
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=6eade342eb223313242c1c2a7615b6bd75036087

Author: Marek Olšák <[email protected]>
Date:   Tue Sep  5 13:40:59 2017 +0200

radeonsi: optimize TCS epilog when invocation 0 writes tess factors

This removes the barrier and the LDS stores and loads for tess factors
when possible. The removal of the barrier seems more important to me,
though.

In one shader, it removes 17 * 4 bytes from the shader binary.

Reviewed-by: Nicolai Hähnle <[email protected]>

---

 src/gallium/auxiliary/tgsi/tgsi_scan.c            |   2 -
 src/gallium/drivers/radeonsi/si_shader.c          | 111 ++++++++++++++++------
 src/gallium/drivers/radeonsi/si_shader.h          |   2 +
 src/gallium/drivers/radeonsi/si_shader_internal.h |   1 +
 src/gallium/drivers/radeonsi/si_state_shaders.c   |   3 +
 5 files changed, 89 insertions(+), 30 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index b8932891e4..212d1bb95a 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -1139,7 +1139,6 @@ tgsi_scan_tess_ctrl(const struct tgsi_token *tokens,
          if (main_block_tf_writemask || cond_block_tf_writemask) {
             /* Accumulate the result: */
             out->tessfactors_are_def_in_all_invocs &=
-               main_block_tf_writemask &&
                !(cond_block_tf_writemask & ~main_block_tf_writemask);
 
             /* Analyze the next code segment from scratch. */
@@ -1155,7 +1154,6 @@ tgsi_scan_tess_ctrl(const struct tgsi_token *tokens,
    /* Accumulate the result for the last code segment separated by a barrier. 
*/
    if (main_block_tf_writemask || cond_block_tf_writemask) {
       out->tessfactors_are_def_in_all_invocs &=
-         main_block_tf_writemask &&
          !(cond_block_tf_writemask & ~main_block_tf_writemask);
    }
 
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index e7888e6012..43619dd329 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1150,7 +1150,7 @@ static void store_output_tcs(struct lp_build_tgsi_context 
*bld_base,
        LLVMValueRef buffer, base, buf_addr;
        LLVMValueRef values[4];
        bool skip_lds_store;
-       bool is_tess_factor = false;
+       bool is_tess_factor = false, is_tess_inner = false;
 
        /* Only handle per-patch and per-vertex outputs here.
         * Vectors will be lowered to scalars and this function will be called 
again.
@@ -1177,8 +1177,11 @@ static void store_output_tcs(struct 
lp_build_tgsi_context *bld_base,
                        /* Always write tess factors into LDS for the TCS 
epilog. */
                        if (name == TGSI_SEMANTIC_TESSINNER ||
                            name == TGSI_SEMANTIC_TESSOUTER) {
-                               skip_lds_store = false;
+                               /* The epilog doesn't read LDS if invocation 0 
defines tess factors. */
+                               skip_lds_store = 
!sh_info->reads_tessfactor_outputs &&
+                                                
ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs;
                                is_tess_factor = true;
+                               is_tess_inner = name == TGSI_SEMANTIC_TESSINNER;
                        }
                }
        }
@@ -1207,6 +1210,18 @@ static void store_output_tcs(struct 
lp_build_tgsi_context *bld_base,
                                                    buf_addr, base,
                                                    4 * chan_index, 1, 0, true, 
false);
                }
+
+               /* Write tess factors into VGPRs for the epilog. */
+               if (is_tess_factor &&
+                   
ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
+                       if (!is_tess_inner) {
+                               LLVMBuildStore(gallivm->builder, value, /* 
outer */
+                                              
ctx->invoc0_tess_factors[chan_index]);
+                       } else if (chan_index < 2) {
+                               LLVMBuildStore(gallivm->builder, value, /* 
inner */
+                                              ctx->invoc0_tess_factors[4 + 
chan_index]);
+                       }
+               }
        }
 
        if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) {
@@ -2671,7 +2686,9 @@ static void si_copy_tcs_inputs(struct 
lp_build_tgsi_context *bld_base)
 static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base,
                                  LLVMValueRef rel_patch_id,
                                  LLVMValueRef invocation_id,
-                                 LLVMValueRef 
tcs_out_current_patch_data_offset)
+                                 LLVMValueRef 
tcs_out_current_patch_data_offset,
+                                 LLVMValueRef invoc0_tf_outer[4],
+                                 LLVMValueRef invoc0_tf_inner[2])
 {
        struct si_shader_context *ctx = si_shader_context(bld_base);
        struct gallivm_state *gallivm = &ctx->gallivm;
@@ -2682,7 +2699,9 @@ static void si_write_tess_factors(struct 
lp_build_tgsi_context *bld_base,
        unsigned stride, outer_comps, inner_comps, i, offset;
        struct lp_build_if_state if_ctx, inner_if_ctx;
 
-       si_llvm_emit_barrier(NULL, bld_base, NULL);
+       /* Add a barrier before loading tess factors from LDS. */
+       if (!shader->key.part.tcs.epilog.invoc0_tess_factors_are_def)
+               si_llvm_emit_barrier(NULL, bld_base, NULL);
 
        /* Do this only for invocation 0, because the tess levels are per-patch,
         * not per-vertex.
@@ -2716,32 +2735,32 @@ static void si_write_tess_factors(struct 
lp_build_tgsi_context *bld_base,
                return;
        }
 
-       /* Load tess_inner and tess_outer from LDS.
-        * Any invocation can write them, so we can't get them from a temporary.
-        */
-       tess_inner_index = 
si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
-       tess_outer_index = 
si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
-
-       lds_base = tcs_out_current_patch_data_offset;
-       lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
-                                LLVMConstInt(ctx->i32,
-                                             tess_inner_index * 4, 0), "");
-       lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
-                                LLVMConstInt(ctx->i32,
-                                             tess_outer_index * 4, 0), "");
-
        for (i = 0; i < 4; i++) {
                inner[i] = LLVMGetUndef(ctx->i32);
                outer[i] = LLVMGetUndef(ctx->i32);
        }
 
-       if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
-               /* For isolines, the hardware expects tess factors in the
-                * reverse order from what GLSL / TGSI specify.
-                */
-               outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, 
lds_outer);
-               outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, 
lds_outer);
+       if (shader->key.part.tcs.epilog.invoc0_tess_factors_are_def) {
+               /* Tess factors are in VGPRs. */
+               for (i = 0; i < outer_comps; i++)
+                       outer[i] = out[i] = invoc0_tf_outer[i];
+               for (i = 0; i < inner_comps; i++)
+                       inner[i] = out[outer_comps+i] = invoc0_tf_inner[i];
        } else {
+               /* Load tess_inner and tess_outer from LDS.
+                * Any invocation can write them, so we can't get them from a 
temporary.
+                */
+               tess_inner_index = 
si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSINNER, 0);
+               tess_outer_index = 
si_shader_io_get_unique_index_patch(TGSI_SEMANTIC_TESSOUTER, 0);
+
+               lds_base = tcs_out_current_patch_data_offset;
+               lds_inner = LLVMBuildAdd(gallivm->builder, lds_base,
+                                        LLVMConstInt(ctx->i32,
+                                                     tess_inner_index * 4, 0), 
"");
+               lds_outer = LLVMBuildAdd(gallivm->builder, lds_base,
+                                        LLVMConstInt(ctx->i32,
+                                                     tess_outer_index * 4, 0), 
"");
+
                for (i = 0; i < outer_comps; i++) {
                        outer[i] = out[i] =
                                lds_load(bld_base, TGSI_TYPE_SIGNED, i, 
lds_outer);
@@ -2752,6 +2771,15 @@ static void si_write_tess_factors(struct 
lp_build_tgsi_context *bld_base,
                }
        }
 
+       if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) {
+               /* For isolines, the hardware expects tess factors in the
+                * reverse order from what GLSL / TGSI specify.
+                */
+               LLVMValueRef tmp = out[0];
+               out[0] = out[1];
+               out[1] = tmp;
+       }
+
        /* Convert the outputs to vectors for stores. */
        vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4));
        vec1 = NULL;
@@ -2946,7 +2974,18 @@ static void si_llvm_emit_tcs_epilogue(struct 
lp_build_tgsi_context *bld_base)
 
        ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, "");
        ret = LLVMBuildInsertValue(builder, ret, invocation_id, vgpr++, "");
-       ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, "");
+
+       if (ctx->shader->selector->tcs_info.tessfactors_are_def_in_all_invocs) {
+               vgpr++; /* skip the tess factor LDS offset */
+               for (unsigned i = 0; i < 6; i++) {
+                       LLVMValueRef value =
+                               LLVMBuildLoad(builder, 
ctx->invoc0_tess_factors[i], "");
+                       value = bitcast(bld_base, TGSI_TYPE_FLOAT, value);
+                       ret = LLVMBuildInsertValue(builder, ret, value, vgpr++, 
"");
+               }
+       } else {
+               ret = LLVMBuildInsertValue(builder, ret, tf_lds_offset, vgpr++, 
"");
+       }
        ctx->return_value = ret;
 }
 
@@ -4330,7 +4369,7 @@ static void create_function(struct si_shader_context *ctx)
                 */
                for (i = 0; i < GFX6_TCS_NUM_USER_SGPR + 2; i++)
                        returns[num_returns++] = ctx->i32; /* SGPRs */
-               for (i = 0; i < 5; i++)
+               for (i = 0; i < 11; i++)
                        returns[num_returns++] = ctx->f32; /* VGPRs */
                break;
 
@@ -4387,7 +4426,7 @@ static void create_function(struct si_shader_context *ctx)
                         */
                        for (i = 0; i <= 8 + GFX9_SGPR_TCS_FACTOR_ADDR_BASE64K; 
i++)
                                returns[num_returns++] = ctx->i32; /* SGPRs */
-                       for (i = 0; i < 5; i++)
+                       for (i = 0; i < 11; i++)
                                returns[num_returns++] = ctx->f32; /* VGPRs */
                }
                break;
@@ -5692,6 +5731,14 @@ static bool si_compile_tgsi_main(struct 
si_shader_context *ctx,
                }
        }
 
+       if (ctx->type == PIPE_SHADER_TESS_CTRL &&
+           sel->tcs_info.tessfactors_are_def_in_all_invocs) {
+               for (unsigned i = 0; i < 6; i++) {
+                       ctx->invoc0_tess_factors[i] =
+                               lp_build_alloca_undef(&ctx->gallivm, ctx->i32, 
"");
+               }
+       }
+
        if (ctx->type == PIPE_SHADER_GEOMETRY) {
                int i;
                for (i = 0; i < 4; i++) {
@@ -6926,16 +6973,24 @@ static void si_build_tcs_epilog_function(struct 
si_shader_context *ctx,
        add_arg(&fninfo, ARG_VGPR, ctx->i32); /* invocation ID within the patch 
*/
        add_arg(&fninfo, ARG_VGPR, ctx->i32); /* LDS offset where tess factors 
should be loaded from */
 
+       for (unsigned i = 0; i < 6; i++)
+               add_arg(&fninfo, ARG_VGPR, ctx->i32); /* tess factors */
+
        /* Create the function. */
        si_create_function(ctx, "tcs_epilog", NULL, 0, &fninfo,
                           ctx->screen->b.chip_class >= CIK ? 128 : 64);
        declare_lds_as_pointer(ctx);
        func = ctx->main_fn;
 
+       LLVMValueRef invoc0_tess_factors[6];
+       for (unsigned i = 0; i < 6; i++)
+               invoc0_tess_factors[i] = LLVMGetParam(func, tess_factors_idx + 
3 + i);
+
        si_write_tess_factors(bld_base,
                              LLVMGetParam(func, tess_factors_idx),
                              LLVMGetParam(func, tess_factors_idx + 1),
-                             LLVMGetParam(func, tess_factors_idx + 2));
+                             LLVMGetParam(func, tess_factors_idx + 2),
+                             invoc0_tess_factors, invoc0_tess_factors + 4);
 
        LLVMBuildRetVoid(gallivm->builder);
 }
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index ee6b0c167f..4592ac551c 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -327,6 +327,7 @@ struct si_shader_selector {
        struct nir_shader       *nir;
        struct pipe_stream_output_info  so;
        struct tgsi_shader_info         info;
+       struct tgsi_tessctrl_info       tcs_info;
 
        /* PIPE_SHADER_[VERTEX|FRAGMENT|...] */
        unsigned        type;
@@ -404,6 +405,7 @@ struct si_vs_prolog_bits {
 /* Common TCS bits between the shader key and the epilog key. */
 struct si_tcs_epilog_bits {
        unsigned        prim_mode:3;
+       unsigned        invoc0_tess_factors_are_def:1;
        unsigned        tes_reads_tess_factors:1;
 };
 
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index ad29ab7e84..023f9a6a09 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -209,6 +209,7 @@ struct si_shader_context {
        LLVMValueRef gsvs_ring[4];
 
        LLVMValueRef lds;
+       LLVMValueRef invoc0_tess_factors[6]; /* outer[4], inner[2] */
        LLVMValueRef gs_next_vertex[4];
        LLVMValueRef postponed_kill;
        LLVMValueRef return_value;
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 9f76551cfb..6398111e5a 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1301,6 +1301,8 @@ static inline void si_shader_selector_key(struct 
pipe_context *ctx,
 
                key->part.tcs.epilog.prim_mode =
                        
sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
+               key->part.tcs.epilog.invoc0_tess_factors_are_def =
+                       sel->tcs_info.tessfactors_are_def_in_all_invocs;
                key->part.tcs.epilog.tes_reads_tess_factors =
                        sctx->tes_shader.cso->info.reads_tess_factors;
 
@@ -2004,6 +2006,7 @@ static void *si_create_shader_selector(struct 
pipe_context *ctx,
                }
 
                tgsi_scan_shader(state->tokens, &sel->info);
+               tgsi_scan_tess_ctrl(state->tokens, &sel->info, &sel->tcs_info);
        } else {
                assert(state->type == PIPE_SHADER_IR_NIR);
 

_______________________________________________
mesa-commit mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/mesa-commit

Reply via email to