On Fri, Apr 28, 2017 at 1:25 PM, Nicolai Hähnle <[email protected]> wrote: > On 24.04.2017 10:45, Marek Olšák wrote: >> >> From: Marek Olšák <[email protected]> >> >> --- >> src/gallium/drivers/radeonsi/si_shader.c | 87 >> +++++++++++++++++++++++++------- >> 1 file changed, 70 insertions(+), 17 deletions(-) >> >> diff --git a/src/gallium/drivers/radeonsi/si_shader.c >> b/src/gallium/drivers/radeonsi/si_shader.c >> index a4c2ac0..392f85d 100644 >> --- a/src/gallium/drivers/radeonsi/si_shader.c >> +++ b/src/gallium/drivers/radeonsi/si_shader.c >> @@ -7368,20 +7368,28 @@ static void si_count_scratch_private_memory(struct >> si_shader_context *ctx) >> LLVMTypeRef type = >> LLVMGetElementType(LLVMTypeOf(inst)); >> /* No idea why LLVM aligns allocas to 4 elements. >> */ >> unsigned alignment = LLVMGetAlignment(inst); >> unsigned dw_size = align(llvm_get_type_size(type) >> / 4, alignment); >> ctx->shader->config.private_mem_vgprs += dw_size; >> } >> bb = LLVMGetNextBasicBlock(bb); >> } >> } >> >> +static void si_init_exec_full_mask(struct si_shader_context *ctx) >> +{ >> + LLVMValueRef full_mask = LLVMConstInt(ctx->i64, ~0ull, 0); >> + lp_build_intrinsic(ctx->gallivm.builder, >> + "llvm.amdgcn.init.exec", ctx->voidt, >> + &full_mask, 1, LP_FUNC_ATTR_CONVERGENT); >> +} >> + >> static void si_init_exec_from_input(struct si_shader_context *ctx, >> unsigned param, unsigned bitoffset) >> { >> LLVMValueRef args[] = { >> LLVMGetParam(ctx->main_fn, param), >> LLVMConstInt(ctx->i32, bitoffset, 0), >> }; >> lp_build_intrinsic(ctx->gallivm.builder, >> "llvm.amdgcn.init.exec.from.input", >> ctx->voidt, args, 2, LP_FUNC_ATTR_CONVERGENT); >> @@ -7681,79 +7689,128 @@ static void si_get_ps_epilog_key(struct si_shader >> *shader, >> key->ps_epilog.states = shader->key.part.ps.epilog; >> } >> >> /** >> * Build the GS prolog function. Rotate the input vertices for triangle >> strips >> * with adjacency. 
>> */ >> static void si_build_gs_prolog_function(struct si_shader_context *ctx, >> union si_shader_part_key *key) >> { >> - const unsigned num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; >> - const unsigned num_vgprs = 8; >> + unsigned num_sgprs, num_vgprs; >> struct gallivm_state *gallivm = &ctx->gallivm; >> LLVMBuilderRef builder = gallivm->builder; >> - LLVMTypeRef params[32]; >> - LLVMTypeRef returns[32]; >> + LLVMTypeRef params[48]; /* 40 SGPRs (maximum) + some VGPRs */ >> + LLVMTypeRef returns[48]; >> LLVMValueRef func, ret; >> >> + if (ctx->screen->b.chip_class >= GFX9) { >> + num_sgprs = 8 + GFX9_GS_NUM_USER_SGPR; >> + num_vgprs = 5; /* ES inputs are not needed by GS */ >> + } else { >> + num_sgprs = GFX6_GS_NUM_USER_SGPR + 2; >> + num_vgprs = 8; >> + } >> + >> for (unsigned i = 0; i < num_sgprs; ++i) { >> params[i] = ctx->i32; >> returns[i] = ctx->i32; >> } >> >> for (unsigned i = 0; i < num_vgprs; ++i) { >> params[num_sgprs + i] = ctx->i32; >> returns[num_sgprs + i] = ctx->f32; >> } >> >> /* Create the function. */ >> si_create_function(ctx, "gs_prolog", returns, num_sgprs + >> num_vgprs, >> params, num_sgprs + num_vgprs, num_sgprs - 1); >> func = ctx->main_fn; >> >> + /* Set the full EXEC mask for the prolog, because we are only >> fiddling >> + * with registers here. The main shader part will set the correct >> EXEC >> + * mask. >> + */ >> + if (ctx->screen->b.chip_class >= GFX9) >> + si_init_exec_full_mask(ctx); >> + >> /* Copy inputs to outputs. This should be no-op, as the registers >> match, >> * but it will prevent the compiler from overwriting them >> unintentionally. 
>> */ >> ret = ctx->return_value; >> for (unsigned i = 0; i < num_sgprs; i++) { >> LLVMValueRef p = LLVMGetParam(func, i); >> ret = LLVMBuildInsertValue(builder, ret, p, i, ""); >> } >> for (unsigned i = 0; i < num_vgprs; i++) { >> LLVMValueRef p = LLVMGetParam(func, num_sgprs + i); >> p = LLVMBuildBitCast(builder, p, ctx->f32, ""); >> ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, >> ""); >> } >> >> if (key->gs_prolog.states.tri_strip_adj_fix) { >> /* Remap the input vertices for every other primitive. */ >> - const unsigned vtx_params[6] = { >> + const unsigned gfx6_vtx_params[6] = { >> num_sgprs, >> num_sgprs + 1, >> num_sgprs + 3, >> num_sgprs + 4, >> num_sgprs + 5, >> num_sgprs + 6 >> }; >> + const unsigned gfx9_vtx_params[3] = { >> + num_sgprs, >> + num_sgprs + 1, >> + num_sgprs + 4, >> + }; >> + LLVMValueRef vtx_in[6], vtx_out[6]; >> LLVMValueRef prim_id, rotate; >> >> + if (ctx->screen->b.chip_class >= GFX9) { >> + for (unsigned i = 0; i < 3; i++) { >> + vtx_in[i*2] = unpack_param(ctx, >> gfx9_vtx_params[i], 0, 16); >> + vtx_in[i*2+1] = unpack_param(ctx, >> gfx9_vtx_params[i], 16, 16); >> + } >> + } else { >> + for (unsigned i = 0; i < 6; i++) >> + vtx_in[i] = LLVMGetParam(func, >> gfx6_vtx_params[i]); >> + } >> + >> prim_id = LLVMGetParam(func, num_sgprs + 2); >> rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, ""); >> >> for (unsigned i = 0; i < 6; ++i) { >> - LLVMValueRef base, rotated, actual; >> - base = LLVMGetParam(func, vtx_params[i]); >> - rotated = LLVMGetParam(func, vtx_params[(i + 4) % >> 6]); >> - actual = LLVMBuildSelect(builder, rotate, rotated, >> base, ""); >> - actual = LLVMBuildBitCast(builder, actual, >> ctx->f32, ""); >> - ret = LLVMBuildInsertValue(builder, ret, actual, >> vtx_params[i], ""); >> + LLVMValueRef base, rotated; >> + base = vtx_in[i]; >> + rotated = vtx_in[(i + 4) % 6]; >> + vtx_out[i] = LLVMBuildSelect(builder, rotate, >> rotated, base, ""); >> + } >> + >> + if (ctx->screen->b.chip_class >= GFX9) { >> + for 
(unsigned i = 0; i < 3; i++) { >> + LLVMValueRef hi, out; >> + >> + hi = LLVMBuildShl(builder, vtx_out[i*2+1], >> + LLVMConstInt(ctx->i32, >> 16, 0), ""); >> + out = LLVMBuildOr(builder, vtx_out[i*2], >> hi, ""); >> + out = LLVMBuildBitCast(builder, out, >> ctx->f32, ""); >> + ret = LLVMBuildInsertValue(builder, ret, >> out, >> + >> gfx9_vtx_params[i], ""); >> + } >> + } else { >> + for (unsigned i = 0; i < 6; i++) { >> + LLVMValueRef out; >> + >> + out = LLVMBuildBitCast(builder, >> vtx_out[i], ctx->f32, ""); >> + ret = LLVMBuildInsertValue(builder, ret, >> out, >> + >> gfx6_vtx_params[i], ""); >> + } >> } > > > I believe this could be simplified quite a bit, since the vertex indices are > rotated by a multiple of 2. So there's no need to unpack the bits and pack > them again; instead, just rotate the 3 input registers by 2 instead of > rotating 6 input registers by 4. > > I'm fine with it if you want to do that in a follow-up patch.
To be honest with you, I'm really not into optimizing for a GS workaround while hardly any app uses GS. Marek _______________________________________________ mesa-dev mailing list [email protected] https://lists.freedesktop.org/mailman/listinfo/mesa-dev
