On Wed, Apr 15, 2015 at 06:58:02PM +0100, Neil Roberts wrote: > Commit 5a06ee738 added a step to the generator to set up the message > header when generating the VS_OPCODE_PULL_CONSTANT_LOAD_GEN7 > instruction. That pseudo opcode is implemented in terms of multiple > actual opcodes, one of which writes to one of the source registers in > order to set up the message header. This causes problems because the > scheduler isn't aware that the source register is written to and it > can end up reorganising the instructions incorrectly such that the > write to the source register overwrites a needed value from a previous > instruction. This problem was presenting itself as a rendering error > in the weapon in Enemy Territory: Quake Wars. > > Since commit 588859e1 there is an additional problem that the double > register allocated to include the message header would end up being > split into two. This wasn't happening previously because the code to > split registers was explicitly avoided for instructions that are > sending from the GRF. > > This patch fixes both problems by splitting the code to set up the > message header into a new pseudo opcode so that it will be done > outside of the generator. This new opcode has the header register as a > destination so the scheduler can recognise that the register is > written to. This has the additional benefit that the scheduler can > optimise the message header slightly better by moving the mov > instructions further away from the send instructions. > > On Skylake it appears to fix the following three Piglit tests without > causing any regressions: > > gs-float-array-variable-index > gs-mat3x4-row-major > gs-mat4x3-row-major > > I think we actually may need to do something similar for the fs > backend and possibly for message headers from regular texture sampling > but I'm not entirely sure. > > v2: Make sure the exec-size is retained as 8 for the mov instruction > to initialise the header from g0. This was accidentally lost > during a rebase on top of 07c571a39fa1. > Split the patch into two so that the helper function is a separate > change. > Fix emitting the MOV instruction on Gen7. > > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=89058
Very nice fix. Reviewed-by: Ben Widawsky <[email protected]> > --- > src/mesa/drivers/dri/i965/brw_defines.h | 1 + > src/mesa/drivers/dri/i965/brw_shader.cpp | 4 ++ > src/mesa/drivers/dri/i965/brw_vec4.h | 2 + > .../dri/i965/brw_vec4_dead_code_eliminate.cpp | 1 + > src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 52 > +++++++++++----------- > src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 38 ++++++++++++---- > 6 files changed, 63 insertions(+), 35 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_defines.h > b/src/mesa/drivers/dri/i965/brw_defines.h > index da6ed5b..a97a944 100644 > --- a/src/mesa/drivers/dri/i965/brw_defines.h > +++ b/src/mesa/drivers/dri/i965/brw_defines.h > @@ -948,6 +948,7 @@ enum opcode { > VS_OPCODE_URB_WRITE, > VS_OPCODE_PULL_CONSTANT_LOAD, > VS_OPCODE_PULL_CONSTANT_LOAD_GEN7, > + VS_OPCODE_SET_SIMD4X2_HEADER_GEN9, > VS_OPCODE_UNPACK_FLAGS_SIMD4X2, > > /** > diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp > b/src/mesa/drivers/dri/i965/brw_shader.cpp > index 335a800..0d6ac0c 100644 > --- a/src/mesa/drivers/dri/i965/brw_shader.cpp > +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp > @@ -568,6 +568,10 @@ brw_instruction_name(enum opcode op) > return "pull_constant_load"; > case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: > return "pull_constant_load_gen7"; > + > + case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9: > + return "set_simd4x2_header_gen9"; > + > case VS_OPCODE_UNPACK_FLAGS_SIMD4X2: > return "unpack_flags_simd4x2"; > > diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h > b/src/mesa/drivers/dri/i965/brw_vec4.h > index 0363924..a0ee2cc 100644 > --- a/src/mesa/drivers/dri/i965/brw_vec4.h > +++ b/src/mesa/drivers/dri/i965/brw_vec4.h > @@ -500,6 +500,8 @@ private: > struct brw_reg dst, > struct brw_reg surf_index, > struct brw_reg offset); > + void generate_set_simd4x2_header_gen9(vec4_instruction *inst, > + struct brw_reg dst); > void generate_unpack_flags(struct brw_reg dst); > > void generate_untyped_atomic(vec4_instruction *inst, > diff --git a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp > b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp > index 980e266..70d2af5 100644 > --- a/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp > +++ b/src/mesa/drivers/dri/i965/brw_vec4_dead_code_eliminate.cpp > @@ -44,6 +44,7 @@ can_do_writemask(const struct brw_context *brw, > case SHADER_OPCODE_GEN4_SCRATCH_READ: > case VS_OPCODE_PULL_CONSTANT_LOAD: > case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: > + case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9: > return false; > default: > /* The MATH instruction on Gen6 only executes in align1 mode, which > does > diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp > b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp > index e4addf7..b22a555 100644 > --- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp > +++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp > @@ -1039,38 +1039,18 @@ > vec4_generator::generate_pull_constant_load_gen7(vec4_instruction *inst, > { > assert(surf_index.type == BRW_REGISTER_TYPE_UD); > > - struct brw_reg src = offset; > - bool header_present = false; > - int mlen = 1; > - > - if (brw->gen >= 9) { > - /* Skylake requires a message header in order to use SIMD4x2 mode. */ > - src = retype(brw_vec4_grf(offset.nr - 1, 0), BRW_REGISTER_TYPE_UD); > - mlen = 2; > - header_present = true; > - > - brw_push_insn_state(p); > - brw_set_default_mask_control(p, BRW_MASK_DISABLE); > - brw_MOV(p, vec8(src), retype(brw_vec8_grf(0, 0), > BRW_REGISTER_TYPE_UD)); > - brw_set_default_access_mode(p, BRW_ALIGN_1); > - > - brw_MOV(p, get_element_ud(src, 2), > - brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2)); > - brw_pop_insn_state(p); > - } > - > if (surf_index.file == BRW_IMMEDIATE_VALUE) { > > brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND); > brw_set_dest(p, insn, dst); > - brw_set_src0(p, insn, src); > + brw_set_src0(p, insn, offset); > brw_set_sampler_message(p, insn, > surf_index.dw1.ud, > 0, /* LD message ignores sampler unit */ > GEN5_SAMPLER_MESSAGE_SAMPLE_LD, > 1, /* rlen */ > - mlen, > - header_present, > + inst->mlen, > + inst->header_present, > BRW_SAMPLER_SIMD_MODE_SIMD4X2, > 0); > > @@ -1095,14 +1075,14 @@ > vec4_generator::generate_pull_constant_load_gen7(vec4_instruction *inst, > > /* dst = send(offset, a0.0 | <descriptor>) */ > brw_inst *insn = brw_send_indirect_message( > - p, BRW_SFID_SAMPLER, dst, src, addr); > + p, BRW_SFID_SAMPLER, dst, offset, addr); > brw_set_sampler_message(p, insn, > 0 /* surface */, > 0 /* sampler */, > GEN5_SAMPLER_MESSAGE_SAMPLE_LD, > 1 /* rlen */, > - mlen /* mlen */, > - header_present /* header */, > + inst->mlen, > + inst->header_present, > BRW_SAMPLER_SIMD_MODE_SIMD4X2, > 0); > > @@ -1113,6 +1093,22 @@ > vec4_generator::generate_pull_constant_load_gen7(vec4_instruction *inst, > } > > void > +vec4_generator::generate_set_simd4x2_header_gen9(vec4_instruction *inst, > + struct brw_reg dst) > +{ > + brw_push_insn_state(p); > + brw_set_default_mask_control(p, BRW_MASK_DISABLE); > + > + brw_MOV(p, vec8(dst), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); > + > + brw_set_default_access_mode(p, BRW_ALIGN_1); > + brw_MOV(p, get_element_ud(dst, 2), > + brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2)); > + > + brw_pop_insn_state(p); > +} > + > +void > vec4_generator::generate_untyped_atomic(vec4_instruction *inst, > struct brw_reg dst, > struct brw_reg atomic_op, > @@ -1435,6 +1431,10 @@ vec4_generator::generate_code(const cfg_t *cfg) > generate_pull_constant_load_gen7(inst, dst, src[0], src[1]); > break; > > + case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9: > + generate_set_simd4x2_header_gen9(inst, dst); > + break; > + > case GS_OPCODE_URB_WRITE: > generate_gs_urb_write(inst); > break; > diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp > b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp > index f7d542b..3d16caa 100644 > --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp > +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp > @@ -1313,16 +1313,36 @@ vec4_visitor::emit_pull_constant_load_reg(dst_reg dst, > > vec4_instruction *pull; > > - if (brw->gen >= 7) { > - dst_reg grf_offset = dst_reg(this, glsl_type::int_type); > + if (brw->gen >= 9) { > + /* Gen9+ needs a message header in order to use SIMD4x2 mode */ > + src_reg header(this, glsl_type::uvec4_type, 2); > > - /* We have to use a message header on Skylake to get SIMD4x2 mode. > - * Reserve space for the register. > - */ > - if (brw->gen >= 9) { > - grf_offset.reg_offset++; > - alloc.sizes[grf_offset.reg] = 2; > - } > + pull = new(mem_ctx) > + vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9, > + dst_reg(header)); > + > + if (before_inst) > + emit_before(before_block, before_inst, pull); > + else > + emit(pull); > + > + dst_reg index_reg = retype(offset(dst_reg(header), 1), > + offset_reg.type); > + pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg); > + > + if (before_inst) > + emit_before(before_block, before_inst, pull); > + else > + emit(pull); > + > + pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7, > + dst, > + surf_index, > + header); > + pull->mlen = 2; > + pull->header_present = true; > + } else if (brw->gen >= 7) { > + dst_reg grf_offset = dst_reg(this, glsl_type::int_type); > > grf_offset.type = offset_reg.type; > > -- > 1.9.3 > -- Ben Widawsky, Intel Open Source Technology Center _______________________________________________ mesa-dev mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/mesa-dev
