radeonsi/gfx9: add support for monolithic merged LS-HS

Marek Olšák Mon, 24 Apr 2017 01:49:13 -0700

From: Marek Olšák <[email protected]>

---
 src/gallium/drivers/radeonsi/si_shader.c | 143 +++++++++++++++++++++++++++----
 src/gallium/drivers/radeonsi/si_shader.h |   2 +
 2 files changed, 128 insertions(+), 17 deletions(-)


diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 823ffff..9c5dd5e 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -7592,34 +7592,37 @@ static void si_build_gs_prolog_function(struct si_shader_context *ctx,
        LLVMBuildRet(builder, ret);
 }
 
 /**
  * Given a list of shader part functions, build a wrapper function that
  * runs them in sequence to form a monolithic shader.
  */
 static void si_build_wrapper_function(struct si_shader_context *ctx,
                                      LLVMValueRef *parts,
                                      unsigned num_parts,
-                                     unsigned main_part)
+                                     unsigned main_part,
+                                     unsigned next_shader_first_part)
 {
        struct gallivm_state *gallivm = &ctx->gallivm;
        LLVMBuilderRef builder = ctx->gallivm.builder;
        /* PS epilog has one arg per color component */
        LLVMTypeRef param_types[48];
-       LLVMValueRef out[48];
+       LLVMValueRef initial[48], out[48];
        LLVMTypeRef function_type;
        unsigned num_params;
-       unsigned num_out;
+       unsigned num_out, initial_num_out;
        MAYBE_UNUSED unsigned num_out_sgpr; /* used in debug checks */
+       MAYBE_UNUSED unsigned initial_num_out_sgpr; /* used in debug checks */
        unsigned num_sgprs, num_vgprs;
        unsigned last_sgpr_param;
        unsigned gprs;
+       struct lp_build_if_state if_state;
 
        for (unsigned i = 0; i < num_parts; ++i) {
                lp_add_function_attr(parts[i], -1, LP_FUNC_ATTR_ALWAYSINLINE);
                LLVMSetLinkage(parts[i], LLVMPrivateLinkage);
        }
 
        /* The parameters of the wrapper function correspond to those of the
         * first part in terms of SGPRs and VGPRs, but we use the types of the
         * main part to get the right types. This is relevant for the
         * dereferenceable attribute on descriptor table pointers.
@@ -7657,20 +7660,27 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
 
                assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
                assert(gprs + size <= num_sgprs + num_vgprs &&
                       (gprs >= num_sgprs || gprs + size <= num_sgprs));
 
                gprs += size;
        }
 
        si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, 
last_sgpr_param);
 
+       if (is_merged_shader(ctx->shader)) {
+               LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0);
+               lp_build_intrinsic(ctx->gallivm.builder,
+                                  "llvm.amdgcn.init.exec", ctx->voidt,
+                                  &full_mask, 1, LP_FUNC_ATTR_CONVERGENT);
+       }
+
        /* Record the arguments of the function as if they were an output of
         * a previous part.
         */
        num_out = 0;
        num_out_sgpr = 0;
 
        for (unsigned i = 0; i < num_params; ++i) {
                LLVMValueRef param = LLVMGetParam(ctx->main_fn, i);
                LLVMTypeRef param_type = LLVMTypeOf(param);
                LLVMTypeRef out_type = i <= last_sgpr_param ? ctx->i32 : 
ctx->f32;
@@ -7693,30 +7703,52 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
 
                        for (unsigned j = 0; j < size; ++j)
                                out[num_out++] = LLVMBuildExtractElement(
                                        builder, param, LLVMConstInt(ctx->i32, 
j, 0), "");
                }
 
                if (i <= last_sgpr_param)
                        num_out_sgpr = num_out;
        }
 
+       memcpy(initial, out, sizeof(out));
+       initial_num_out = num_out;
+       initial_num_out_sgpr = num_out_sgpr;
+
        /* Now chain the parts. */
        for (unsigned part = 0; part < num_parts; ++part) {
                LLVMValueRef in[48];
                LLVMValueRef ret;
                LLVMTypeRef ret_type;
                unsigned out_idx = 0;
 
                num_params = LLVMCountParams(parts[part]);
                assert(num_params <= ARRAY_SIZE(param_types));
 
+               /* Merged shaders are executed conditionally depending
+                * on the number of enabled threads passed in the input SGPRs. */
+               if (is_merged_shader(ctx->shader) &&
+                   (part == 0 || part == next_shader_first_part)) {
+                       LLVMValueRef ena, count = initial[3];
+
+                       /* The thread count for the 2nd shader is at bit-offset 8. */
+                       if (part == next_shader_first_part) {
+                               count = LLVMBuildLShr(builder, count,
+                                                     LLVMConstInt(ctx->i32, 8, 
0), "");
+                       }
+                       count = LLVMBuildAnd(builder, count,
+                                            LLVMConstInt(ctx->i32, 0x7f, 0), 
"");
+                       ena = LLVMBuildICmp(builder, LLVMIntULT,
+                                           ac_get_thread_id(&ctx->ac), count, 
"");
+                       lp_build_if(&if_state, &ctx->gallivm, ena);
+               }
+
                /* Derive arguments for the next part from outputs of the
                 * previous one.
                 */
                for (unsigned param_idx = 0; param_idx < num_params; 
++param_idx) {
                        LLVMValueRef param;
                        LLVMTypeRef param_type;
                        bool is_sgpr;
                        unsigned param_size;
                        LLVMValueRef arg = NULL;
 
@@ -7750,23 +7782,47 @@ static void si_build_wrapper_function(struct si_shader_context *ctx,
                                } else {
                                        arg = LLVMBuildBitCast(builder, arg, 
param_type, "");
                                }
                        }
 
                        in[param_idx] = arg;
                        out_idx += param_size;
                }
 
                ret = LLVMBuildCall(builder, parts[part], in, num_params, "");
-               ret_type = LLVMTypeOf(ret);
+
+               if (is_merged_shader(ctx->shader) &&
+                   (part + 1 == next_shader_first_part ||
+                    part + 1 == num_parts)) {
+                       lp_build_endif(&if_state);
+
+                       if (part + 1 == next_shader_first_part) {
+                               /* A barrier is required between 2 merged shaders. */
+                               si_llvm_emit_barrier(NULL, &ctx->bld_base, 
NULL);
+
+                               /* The second half of the merged shader should use
+                                * the inputs from the toplevel (wrapper) function,
+                                * not the return value from the last call.
+                                *
+                                * That's because the last call was executed condi-
+                                * tionally, so we can't consume it in the main
+                                * block.
+                                */
+                               memcpy(out, initial, sizeof(initial));
+                               num_out = initial_num_out;
+                               num_out_sgpr = initial_num_out_sgpr;
+                       }
+                       continue;
+               }
 
                /* Extract the returned GPRs. */
+               ret_type = LLVMTypeOf(ret);
                num_out = 0;
                num_out_sgpr = 0;
 
                if (LLVMGetTypeKind(ret_type) != LLVMVoidTypeKind) {
                        assert(LLVMGetTypeKind(ret_type) == LLVMStructTypeKind);
 
                        unsigned ret_size = 
LLVMCountStructElementTypes(ret_type);
 
                        for (unsigned i = 0; i < ret_size; ++i) {
                                LLVMValueRef val =
@@ -7840,78 +7896,130 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
                }
 
                if (need_epilog) {
                        union si_shader_part_key epilog_key;
                        si_get_vs_epilog_key(shader, 
&shader->key.part.vs.epilog, &epilog_key);
                        si_build_vs_epilog_function(&ctx, &epilog_key);
                        parts[need_prolog ? 2 : 1] = ctx.main_fn;
                }
 
                si_build_wrapper_function(&ctx, parts, 1 + need_prolog + 
need_epilog,
-                                         need_prolog ? 1 : 0);
+                                         need_prolog ? 1 : 0, 0);
        } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) {
-               LLVMValueRef parts[2];
-               union si_shader_part_key epilog_key;
+               if (sscreen->b.chip_class >= GFX9) {
+                       struct si_shader_selector *ls = shader->key.part.tcs.ls;
+                       LLVMValueRef parts[4];
+
+                       /* TCS main part */
+                       parts[2] = ctx.main_fn;
+
+                       /* TCS epilog */
+                       union si_shader_part_key tcs_epilog_key;
+                       memset(&tcs_epilog_key, 0, sizeof(tcs_epilog_key));
+                       tcs_epilog_key.tcs_epilog.states = 
shader->key.part.tcs.epilog;
+                       si_build_tcs_epilog_function(&ctx, &tcs_epilog_key);
+                       parts[3] = ctx.main_fn;
+
+                       /* VS prolog */
+                       if (ls->vs_needs_prolog) {
+                               union si_shader_part_key vs_prolog_key;
+                               si_get_vs_prolog_key(&ls->info,
+                                                    
shader->info.num_input_sgprs,
+                                                    
&shader->key.part.tcs.ls_prolog,
+                                                    shader, &vs_prolog_key);
+                               vs_prolog_key.vs_prolog.is_monolithic = true;
+                               si_build_vs_prolog_function(&ctx, 
&vs_prolog_key);
+                               parts[0] = ctx.main_fn;
+                       }
 
-               parts[0] = ctx.main_fn;
+                       /* VS as LS main part */
+                       struct si_shader shader_ls = {};
+                       shader_ls.selector = ls;
+                       shader_ls.key.as_ls = 1;
+                       shader_ls.key.mono = shader->key.mono;
+                       shader_ls.key.opt = shader->key.opt;
+                       si_llvm_context_set_tgsi(&ctx, &shader_ls);
 
-               memset(&epilog_key, 0, sizeof(epilog_key));
-               epilog_key.tcs_epilog.states = shader->key.part.tcs.epilog;
-               si_build_tcs_epilog_function(&ctx, &epilog_key);
-               parts[1] = ctx.main_fn;
+                       if (!si_compile_tgsi_main(&ctx, true)) {
+                               si_llvm_dispose(&ctx);
+                               return -1;
+                       }
+                       shader->info.uses_instanceid |= 
ls->info.uses_instanceid;
+                       parts[1] = ctx.main_fn;
+
+                       /* Reset the shader context. */
+                       ctx.shader = shader;
+                       ctx.type = PIPE_SHADER_TESS_CTRL;
 
-               si_build_wrapper_function(&ctx, parts, 2, 0);
+                       si_build_wrapper_function(&ctx,
+                                                 parts + !ls->vs_needs_prolog,
+                                                 4 - !ls->vs_needs_prolog, 0,
+                                                 ls->vs_needs_prolog ? 2 : 1);
+               } else {
+                       LLVMValueRef parts[2];
+                       union si_shader_part_key epilog_key;
+
+                       parts[0] = ctx.main_fn;
+
+                       memset(&epilog_key, 0, sizeof(epilog_key));
+                       epilog_key.tcs_epilog.states = 
shader->key.part.tcs.epilog;
+                       si_build_tcs_epilog_function(&ctx, &epilog_key);
+                       parts[1] = ctx.main_fn;
+
+                       si_build_wrapper_function(&ctx, parts, 2, 0, 0);
+               }
        } else if (is_monolithic && ctx.type == PIPE_SHADER_TESS_EVAL &&
                   !shader->key.as_es) {
                LLVMValueRef parts[2];
                union si_shader_part_key epilog_key;
 
                parts[0] = ctx.main_fn;
 
                si_get_vs_epilog_key(shader, &shader->key.part.tes.epilog, 
&epilog_key);
                si_build_vs_epilog_function(&ctx, &epilog_key);
                parts[1] = ctx.main_fn;
 
-               si_build_wrapper_function(&ctx, parts, 2, 0);
+               si_build_wrapper_function(&ctx, parts, 2, 0, 0);
        } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
                LLVMValueRef parts[2];
                union si_shader_part_key prolog_key;
 
                parts[1] = ctx.main_fn;
 
                memset(&prolog_key, 0, sizeof(prolog_key));
                prolog_key.gs_prolog.states = shader->key.part.gs.prolog;
                si_build_gs_prolog_function(&ctx, &prolog_key);
                parts[0] = ctx.main_fn;
 
-               si_build_wrapper_function(&ctx, parts, 2, 1);
+               si_build_wrapper_function(&ctx, parts, 2, 1, 0);
        } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
                LLVMValueRef parts[3];
                union si_shader_part_key prolog_key;
                union si_shader_part_key epilog_key;
                bool need_prolog;
 
                si_get_ps_prolog_key(shader, &prolog_key, false);
                need_prolog = si_need_ps_prolog(&prolog_key);
 
                parts[need_prolog ? 1 : 0] = ctx.main_fn;
 
                if (need_prolog) {
                        si_build_ps_prolog_function(&ctx, &prolog_key);
                        parts[0] = ctx.main_fn;
                }
 
                si_get_ps_epilog_key(shader, &epilog_key);
                si_build_ps_epilog_function(&ctx, &epilog_key);
                parts[need_prolog ? 2 : 1] = ctx.main_fn;
 
-               si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2, 
need_prolog ? 1 : 0);
+               si_build_wrapper_function(&ctx, parts, need_prolog ? 3 : 2,
+                                         need_prolog ? 1 : 0, 0);
        }
 
        /* Dump LLVM IR before any optimization passes */
        if (sscreen->b.debug_flags & DBG_PREOPT_IR &&
            r600_can_dump_shader(&sscreen->b, ctx.type))
                LLVMDumpModule(ctx.gallivm.module);
 
        si_llvm_finalize_module(&ctx,
                                    r600_extra_shader_checks(&sscreen->b, 
ctx.type));
 
@@ -8157,21 +8265,22 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
 
        /* Vertex load indices. */
        for (i = 0; i <= key->vs_prolog.last_input; i++)
                returns[num_returns++] = ctx->f32;
 
        /* Create the function. */
        si_create_function(ctx, "vs_prolog", returns, num_returns, params,
                           num_params, last_sgpr);
        func = ctx->main_fn;
 
-       if (key->vs_prolog.num_merged_next_stage_vgprs)
+       if (key->vs_prolog.num_merged_next_stage_vgprs &&
+           !key->vs_prolog.is_monolithic)
                si_init_exec_from_input(ctx, 3, 0);
 
        /* Copy inputs to outputs. This should be no-op, as the registers match,
         * but it will prevent the compiler from overwriting them unintentionally.
         */
        ret = ctx->return_value;
        for (i = 0; i < key->vs_prolog.num_input_sgprs; i++) {
                LLVMValueRef p = LLVMGetParam(func, i);
                ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
        }
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index afbe547..e24b8b8 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -348,20 +348,22 @@ struct si_ps_epilog_bits {
        unsigned        clamp_color:1;
 };
 
 union si_shader_part_key {
        struct {
                struct si_vs_prolog_bits states;
                unsigned        num_input_sgprs:6;
                /* For merged stages such as LS-HS, HS input VGPRs are first. */
                unsigned        num_merged_next_stage_vgprs:3;
                unsigned        last_input:4;
+               /* Prologs for monolithic shaders shouldn't set EXEC. */
+               unsigned        is_monolithic:1;
        } vs_prolog;
        struct {
                struct si_vs_epilog_bits states;
                unsigned        prim_id_param_offset:5;
        } vs_epilog;
        struct {
                struct si_tcs_epilog_bits states;
        } tcs_epilog;
        struct {
                struct si_gs_prolog_bits states;
-- 
2.7.4

_______________________________________________
mesa-dev mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 29/61] radeonsi/gfx9: add support for monolithic merged LS-HS

Reply via email to