On 24.04.2017 10:45, Marek Olšák wrote:
From: Marek Olšák <marek.ol...@amd.com>

LLVM 5.0 removes s_barrier instructions if the max-work-group-size
attribute is not set. What a surprise.

One minor comment on patch 56, apart from that patches 54-61:

Reviewed-by: Nicolai Hähnle <nicolai.haeh...@amd.com>


---
 src/gallium/drivers/radeonsi/si_shader.c | 45 +++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c 
b/src/gallium/drivers/radeonsi/si_shader.c
index 3b00bea..086b279 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5674,21 +5674,21 @@ static const struct lp_build_tgsi_action tex_action = {

 static const struct lp_build_tgsi_action interp_action = {
        .fetch_args = interp_fetch_args,
        .emit = build_interp_intrinsic,
 };

 static void si_create_function(struct si_shader_context *ctx,
                               const char *name,
                               LLVMTypeRef *returns, unsigned num_returns,
                               LLVMTypeRef *params, unsigned num_params,
-                              int last_sgpr)
+                              int last_sgpr, unsigned max_workgroup_size)
 {
        int i;

        si_llvm_create_func(ctx, name, returns, num_returns,
                            params, num_params);
        si_llvm_shader_type(ctx->main_fn, ctx->type);
        ctx->return_value = LLVMGetUndef(ctx->return_type);

        for (i = 0; i <= last_sgpr; ++i) {
                LLVMValueRef P = LLVMGetParam(ctx->main_fn, i);
@@ -5701,20 +5701,24 @@ static void si_create_function(struct si_shader_context 
*ctx,
                 * SGPR spilling significantly.
                 */
                if (LLVMGetTypeKind(LLVMTypeOf(P)) == LLVMPointerTypeKind) {
                        lp_add_function_attr(ctx->main_fn, i + 1, 
LP_FUNC_ATTR_BYVAL);
                        lp_add_function_attr(ctx->main_fn, i + 1, 
LP_FUNC_ATTR_NOALIAS);
                        ac_add_attr_dereferenceable(P, UINT64_MAX);
                } else
                        lp_add_function_attr(ctx->main_fn, i + 1, 
LP_FUNC_ATTR_INREG);
        }

+       if (max_workgroup_size) {
+               si_llvm_add_attribute(ctx->main_fn, 
"amdgpu-max-work-group-size",
+                                     max_workgroup_size);
+       }
        LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
                                           "no-signed-zeros-fp-math",
                                           "true");

        if (ctx->screen->b.debug_flags & DBG_UNSAFE_MATH) {
                /* These were copied from some LLVM test. */
                LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
                                                   "less-precise-fpmad",
                                                   "true");
                LLVMAddTargetDependentFunctionAttr(ctx->main_fn,
@@ -5782,20 +5786,36 @@ static void declare_lds_as_pointer(struct 
si_shader_context *ctx)
        struct gallivm_state *gallivm = &ctx->gallivm;

        unsigned lds_size = ctx->screen->b.chip_class >= CIK ? 65536 : 32768;
        ctx->lds = LLVMBuildIntToPtr(gallivm->builder, ctx->i32_0,
                LLVMPointerType(LLVMArrayType(ctx->i32, lds_size / 4), 
LOCAL_ADDR_SPACE),
                "lds");
 }

 static unsigned si_get_max_workgroup_size(struct si_shader *shader)
 {
+       switch (shader->selector->type) {
+       case PIPE_SHADER_TESS_CTRL:
+               /* Return this so that LLVM doesn't remove s_barrier
+                * instructions on chips where we use s_barrier. */
+               return shader->selector->screen->b.chip_class >= CIK ? 128 : 64;
+
+       case PIPE_SHADER_GEOMETRY:
+               return shader->selector->screen->b.chip_class >= GFX9 ? 128 : 
64;
+
+       case PIPE_SHADER_COMPUTE:
+               break; /* see below */
+
+       default:
+               return 0;
+       }
+
        const unsigned *properties = shader->selector->info.properties;
        unsigned max_work_group_size =
                       properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
                       properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
                       properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];

        if (!max_work_group_size) {
                /* This is a variable group size compute shader,
                 * compile it for the maximum possible group size.
                 */
@@ -6172,39 +6192,36 @@ static void create_function(struct si_shader_context 
*ctx)
                num_params = SI_PARAM_THREAD_ID + 1;
                break;
        default:
                assert(0 && "unimplemented shader");
                return;
        }

        assert(num_params <= ARRAY_SIZE(params));

        si_create_function(ctx, "main", returns, num_returns, params,
-                          num_params, last_sgpr);
+                          num_params, last_sgpr,
+                          si_get_max_workgroup_size(shader));

        /* Reserve register locations for VGPR inputs the PS prolog may need. */
        if (ctx->type == PIPE_SHADER_FRAGMENT &&
            ctx->separate_prolog) {
                si_llvm_add_attribute(ctx->main_fn,
                                      "InitialPSInputAddr",
                                      S_0286D0_PERSP_SAMPLE_ENA(1) |
                                      S_0286D0_PERSP_CENTER_ENA(1) |
                                      S_0286D0_PERSP_CENTROID_ENA(1) |
                                      S_0286D0_LINEAR_SAMPLE_ENA(1) |
                                      S_0286D0_LINEAR_CENTER_ENA(1) |
                                      S_0286D0_LINEAR_CENTROID_ENA(1) |
                                      S_0286D0_FRONT_FACE_ENA(1) |
                                      S_0286D0_POS_FIXED_PT_ENA(1));
-       } else if (ctx->type == PIPE_SHADER_COMPUTE) {
-               si_llvm_add_attribute(ctx->main_fn,
-                                     "amdgpu-max-work-group-size",
-                                     si_get_max_workgroup_size(shader));
        }

        shader->info.num_input_sgprs = 0;
        shader->info.num_input_vgprs = 0;

        for (i = 0; i <= last_sgpr; ++i)
                shader->info.num_input_sgprs += llvm_get_type_size(params[i]) / 
4;

        for (; i < num_params; ++i)
                shader->info.num_input_vgprs += llvm_get_type_size(params[i]) / 
4;
@@ -7701,21 +7718,21 @@ static void si_build_gs_prolog_function(struct 
si_shader_context *ctx,
                returns[i] = ctx->i32;
        }

        for (unsigned i = 0; i < num_vgprs; ++i) {
                params[num_sgprs + i] = ctx->i32;
                returns[num_sgprs + i] = ctx->f32;
        }

        /* Create the function. */
        si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
-                          params, num_sgprs + num_vgprs, num_sgprs - 1);
+                          params, num_sgprs + num_vgprs, num_sgprs - 1, 0);
        func = ctx->main_fn;

        /* Set the full EXEC mask for the prolog, because we are only fiddling
         * with registers here. The main shader part will set the correct EXEC
         * mask.
         */
        if (ctx->screen->b.chip_class >= GFX9 && !key->gs_prolog.is_monolithic)
                si_init_exec_full_mask(ctx);

        /* Copy inputs to outputs. This should be no-op, as the registers match,
@@ -7861,21 +7878,23 @@ static void si_build_wrapper_function(struct 
si_shader_context *ctx,
                size = llvm_get_type_size(param_types[num_params]) / 4;
                num_params++;

                assert(ac_is_sgpr_param(param) == (gprs < num_sgprs));
                assert(gprs + size <= num_sgprs + num_vgprs &&
                       (gprs >= num_sgprs || gprs + size <= num_sgprs));

                gprs += size;
        }

-       si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params, 
last_sgpr_param);
+       si_create_function(ctx, "wrapper", NULL, 0, param_types, num_params,
+                          last_sgpr_param,
+                          si_get_max_workgroup_size(ctx->shader));

        if (is_merged_shader(ctx->shader))
                si_init_exec_full_mask(ctx);

        /* Record the arguments of the function as if they were an output of
         * a previous part.
         */
        num_out = 0;
        num_out_sgpr = 0;

@@ -8499,21 +8518,21 @@ static void si_build_vs_prolog_function(struct 
si_shader_context *ctx,
                params[num_params++] = ctx->i32;
                returns[num_returns++] = ctx->f32;
        }

        /* Vertex load indices. */
        for (i = 0; i <= key->vs_prolog.last_input; i++)
                returns[num_returns++] = ctx->f32;

        /* Create the function. */
        si_create_function(ctx, "vs_prolog", returns, num_returns, params,
-                          num_params, last_sgpr);
+                          num_params, last_sgpr, 0);
        func = ctx->main_fn;

        if (key->vs_prolog.num_merged_next_stage_vgprs &&
            !key->vs_prolog.is_monolithic)
                si_init_exec_from_input(ctx, 3, 0);

        /* Copy inputs to outputs. This should be no-op, as the registers match,
         * but it will prevent the compiler from overwriting them 
unintentionally.
         */
        ret = ctx->return_value;
@@ -8643,21 +8662,22 @@ static void si_build_tcs_epilog_function(struct 
si_shader_context *ctx,
                params[ctx->param_tcs_offchip_offset = num_params++] = ctx->i32;
                params[ctx->param_tcs_factor_offset = num_params++] = ctx->i32;
        }
        last_sgpr = num_params - 1;

        params[num_params++] = ctx->i32; /* patch index within the wave 
(REL_PATCH_ID) */
        params[num_params++] = ctx->i32; /* invocation ID within the patch */
        params[num_params++] = ctx->i32; /* LDS offset where tess factors 
should be loaded from */

        /* Create the function. */
-       si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, 
last_sgpr);
+       si_create_function(ctx, "tcs_epilog", NULL, 0, params, num_params, 
last_sgpr,
+                          ctx->screen->b.chip_class >= CIK ? 128 : 64);
        declare_lds_as_pointer(ctx);
        func = ctx->main_fn;

        si_write_tess_factors(bld_base,
                              LLVMGetParam(func, last_sgpr + 1),
                              LLVMGetParam(func, last_sgpr + 2),
                              LLVMGetParam(func, last_sgpr + 3));

        LLVMBuildRetVoid(gallivm->builder);
 }
@@ -8764,21 +8784,21 @@ static void si_build_ps_prolog_function(struct 
si_shader_context *ctx,
                params[num_params++] = ctx->f32;

        /* Declare outputs (same as inputs + add colors if needed) */
        num_returns = num_params;
        num_color_channels = util_bitcount(key->ps_prolog.colors_read);
        for (i = 0; i < num_color_channels; i++)
                params[num_returns++] = ctx->f32;

        /* Create the function. */
        si_create_function(ctx, "ps_prolog", params, num_returns, params,
-                          num_params, last_sgpr);
+                          num_params, last_sgpr, 0);
        func = ctx->main_fn;

        /* Copy inputs to outputs. This should be no-op, as the registers match,
         * but it will prevent the compiler from overwriting them 
unintentionally.
         */
        ret = ctx->return_value;
        for (i = 0; i < num_params; i++) {
                LLVMValueRef p = LLVMGetParam(func, i);
                ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
        }
@@ -9006,21 +9026,22 @@ static void si_build_ps_epilog_function(struct 
si_shader_context *ctx,

        num_params = MAX2(num_params,
                          last_sgpr + 1 + PS_EPILOG_SAMPLEMASK_MIN_LOC + 1);

        assert(num_params <= ARRAY_SIZE(params));

        for (i = last_sgpr + 1; i < num_params; i++)
                params[i] = ctx->f32;

        /* Create the function. */
-       si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params, 
last_sgpr);
+       si_create_function(ctx, "ps_epilog", NULL, 0, params, num_params,
+                          last_sgpr, 0);
        /* Disable elimination of unused inputs. */
        si_llvm_add_attribute(ctx->main_fn,
                                  "InitialPSInputAddr", 0xffffff);

        /* Process colors. */
        unsigned vgpr = last_sgpr + 1;
        unsigned colors_written = key->ps_epilog.colors_written;
        int last_color_export = -1;

        /* Find the last color export. */



--
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to