From: Chia-I Wu <[email protected]> Add fs_visitor::emit_dual_texture_gen7 that emulates SIMD16 sample_d with dual SIMD8 sample_d on gen7+. Fix fs_generator::generate_tex to send SIMD8 messages when force_uncompressed or force_sechalf is set.
No piglit quick.tests regression on Ivy Bridge and Haswell. With this change, I am seeing 6.76479% +/- 0.619064% (at 95.0% confidence) improvement on Xonotic with Ultra effects. Signed-off-by: Chia-I Wu <[email protected]> --- src/mesa/drivers/dri/i965/brw_fs.h | 3 + src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 137 ++++++++++++++++++++++++++- 2 files changed, 138 insertions(+), 2 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index c161e7d..82a0a7d 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -335,6 +335,9 @@ public: fs_inst *emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate, fs_reg shadow_comp, fs_reg lod, fs_reg lod2, fs_reg sample_index, int sampler); + void emit_dual_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate, + fs_reg shadow_comp, fs_reg lod, fs_reg lod2, + fs_reg sample_index, int sampler); fs_inst *emit_texture(ir_texture *ir, fs_reg dst, int base_mrf, int mlen, bool header_present, int regs_written, int sampler); diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 6435a17..b9f97b6 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -1334,6 +1334,133 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate, return emit_texture(ir, dst, base_mrf, mlen, header_present, 4, sampler); } +/* Emulate a SIMD16 sampler message with dual SIMD8 sampler messages. For + * now, and for practical reasons, only ir_txd is supported. 
+ */ +void +fs_visitor::emit_dual_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate, + fs_reg shadow_c, fs_reg lod, fs_reg lod2, + fs_reg sample_index, int sampler) +{ + /* no need to emit dual SIMD8 messages */ + if (dispatch_width != 16 || ir->op != ir_txd) { + emit_texture_gen7(ir, dst, coordinate, shadow_c, + lod, lod2, sample_index, sampler); + return; + } + + const int reg_width = 1; + int mlen = 0; + int base_mrf = 2; + bool header_present = false; + fs_reg temp = fs_reg(GRF, virtual_grf_alloc(4), + brw_type_for_base_type(ir->type)); + + emit(FS_OPCODE_OVERWRITE_DST, dst); + emit(FS_OPCODE_OVERWRITE_DST, temp); + + for (int msg = 0; msg < 2; msg++) { + if (msg == 0) + push_force_uncompressed(); + else + push_force_sechalf(); + + /* only txd is supported for now */ + assert(ir->op == ir_txd); + + if (ir->offset) { + /* The offsets set up by the ir_texture visitor are in the + * m1 header, so we can't go headerless. + */ + header_present = true; + mlen++; + base_mrf--; + } + + if (ir->shadow_comparitor) { + emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c)); + mlen += reg_width; + } + + /* Load dPdx and the coordinate together: + * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z + */ + fs_reg coord = coordinate, ddx = lod, ddy = lod2; + for (int i = 0; i < ir->coordinate->type->vector_elements; i++) { + emit(MOV(fs_reg(MRF, base_mrf + mlen), coord)); + coord.reg_offset++; + mlen += reg_width; + + /* For cube map array, the coordinate is (u,v,r,ai) but there are + * only derivatives for (u, v, r). 
+ */ + if (i < ir->lod_info.grad.dPdx->type->vector_elements) { + emit(MOV(fs_reg(MRF, base_mrf + mlen), ddx)); + ddx.reg_offset++; + mlen += reg_width; + + emit(MOV(fs_reg(MRF, base_mrf + mlen), ddy)); + ddy.reg_offset++; + mlen += reg_width; + } + } + + if (mlen > 11) { + fail("Message length >11 disallowed by hardware\n"); + break; + } + + /* response length is 4, which are 2 vgrf */ + emit_texture(ir, temp, base_mrf, mlen, header_present, 2, sampler); + + if (msg == 0) { + /* move from temp to dst */ + for (int i = 0; i < 4; i++) { + fs_reg d = dst; + d.reg_offset += i; + + fs_reg s = temp; + s.reg_offset += i / 2; + s.sechalf = (i % 2); + + emit(MOV(d, s)); + } + + pop_force_uncompressed(); + + /* use non-overlapping MRF range if possible */ + if (base_mrf + mlen * 2 < BRW_MAX_MRF) + base_mrf += mlen; + + mlen = 0; + + temp.reg_offset += 2; + + coordinate.sechalf = true; + shadow_c.sechalf = true; + lod.sechalf = true; + lod2.sechalf = true; + sample_index.sechalf = true; + } + else { + /* move from temp to dst */ + for (int i = 0; i < 4; i++) { + fs_reg d = dst; + d.reg_offset += i; + d.sechalf = true; + + fs_reg s = temp; + s.reg_offset += i / 2; + s.sechalf = (i % 2); + + emit(MOV(d, s)); + } + + pop_force_sechalf(); + } + } +} + fs_reg fs_visitor::rescale_texcoord(ir_texture *ir, fs_reg coordinate, bool is_rect, int sampler, int texunit) @@ -1503,8 +1630,14 @@ fs_visitor::visit(ir_texture *ir) fs_reg dst = fs_reg(this, glsl_type::get_instance(ir->type->base_type, 4, 1)); if (brw->gen >= 7) { - emit_texture_gen7(ir, dst, coordinate, shadow_comparitor, - lod, lod2, sample_index, sampler); + if (dispatch_width == 16 && ir->op == ir_txd) { + emit_dual_texture_gen7(ir, dst, coordinate, shadow_comparitor, + lod, lod2, sample_index, sampler); + } + else { + emit_texture_gen7(ir, dst, coordinate, shadow_comparitor, + lod, lod2, sample_index, sampler); + } } else if (brw->gen >= 5) { emit_texture_gen5(ir, dst, coordinate, shadow_comparitor, lod, lod2, sample_index, 
sampler); -- 1.8.3.1 _______________________________________________ mesa-dev mailing list [email protected] http://lists.freedesktop.org/mailman/listinfo/mesa-dev
