From: Iago Toral Quiroga <ito...@igalia.com> There are a few places where we need to shuffle the result of a 32-bit load into valid 64-bit data, so extract this logic into a separate helper that we can reuse.
Also, the shuffling needs to operate with WE_all set, which we were missing before, because we are changing the layout of the data across the various channels. Otherwise we will run into problems in non-uniform control-flow scenarios. --- src/mesa/drivers/dri/i965/brw_fs.cpp | 95 +++++++++++++++++++++----------- src/mesa/drivers/dri/i965/brw_fs.h | 5 ++ src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 46 ++-------------- 3 files changed, 73 insertions(+), 73 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index dff13ea..709e4b8 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -216,39 +216,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld, vec4_result.type = dst.type; - /* Our VARYING_PULL_CONSTANT_LOAD reads a vector of 32-bit elements. If we - * are reading doubles this means that we get this: - * - * r0: x0 x0 x0 x0 x0 x0 x0 x0 - * r1: x1 x1 x1 x1 x1 x1 x1 x1 - * r2: y0 y0 y0 y0 y0 y0 y0 y0 - * r3: y1 y1 y1 y1 y1 y1 y1 y1 - * - * Fix this up so we return valid double elements: - * - * r0: x0 x1 x0 x1 x0 x1 x0 x1 - * r1: x0 x1 x0 x1 x0 x1 x0 x1 - * r2: y0 y1 y0 y1 y0 y1 y0 y1 - * r3: y0 y1 y0 y1 y0 y1 y0 y1 - */ - if (type_sz(dst.type) == 8) { - int multiplier = bld.dispatch_width() / 8; - fs_reg fixed_res = - fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F); - /* We only have 2 doubles in a 32-bit vec4 */ - for (int i = 0; i < 2; i++) { - fs_reg vec4_float = - horiz_offset(retype(vec4_result, BRW_REGISTER_TYPE_F), - multiplier * 16 * i); - - bld.MOV(stride(fixed_res, 2), vec4_float); - bld.MOV(stride(horiz_offset(fixed_res, 1), 2), - horiz_offset(vec4_float, 8 * multiplier)); - - bld.MOV(horiz_offset(vec4_result, multiplier * 8 * i), - retype(fixed_res, BRW_REGISTER_TYPE_DF)); - } - } + if (type_sz(dst.type) == 8) + SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(bld, vec4_result, vec4_result, 2); int type_slots = MAX2(type_sz(dst.type) / 4, 1); bld.MOV(dst, offset(vec4_result, bld, @@ -256,6 +225,66 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld, } /** + * This helper takes the result of a load operation that reads 32-bit elements + * in this format: + * + * x x x x x x x x + * y y y y y y y y + * z z z z z z z z + * w w w w w w w w + * + * and shuffles the data to get this: + * + * x y x y x y x y + * x y x y x y x y + * z w z w z w z w + * z w z w z w z w + * + * Which is exactly what we want if the load is reading 64-bit components + * like doubles, where x represents the low 32-bit of the x double component + * and y represents the high 32-bit of the x double component (likewise with + * z and w for double component y). The parameter @components represents + * the number of 64-bit components present in @src. This would typically be + * 2 at most, since we can only fit 2 double elements in the result of a + * vec4 load. + * + * Notice that @dst and @src can be the same register. + */ +void +fs_visitor::SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(const fs_builder &bld, + const fs_reg dst, + const fs_reg src, + uint32_t components) +{ + int multiplier = bld.dispatch_width() / 8; + + /* A temporary that we will use to shuffle the 32-bit data of each + * component in the vector into valid 64-bit data + */ + fs_reg tmp = + fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F); + + /* We are going to manipulate the data in elements of 32-bit */ + fs_reg src_data = retype(src, BRW_REGISTER_TYPE_F); + + /* We are going to manipulate the dst in elements of 64-bit */ + fs_reg dst_data = retype(dst, BRW_REGISTER_TYPE_DF); + + /* Shuffle the data */ + for (unsigned i = 0; i < components; i++) { + fs_reg component_i = horiz_offset(src_data, multiplier * 16 * i); + + bld.MOV(stride(tmp, 2), component_i)->force_writemask_all = true; + bld.MOV(stride(horiz_offset(tmp, 1), 2), + horiz_offset(component_i, 8 * multiplier)) + ->force_writemask_all = true; + + bld.MOV(horiz_offset(dst_data, multiplier * 8 * i), + retype(tmp, BRW_REGISTER_TYPE_DF))->force_writemask_all = true; + } +} + +/** * A helper for MOV generation for fixing up broken hardware SEND dependency * handling. */ diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 1f18112..c95b30a 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -106,6 +106,11 @@ public: uint32_t const_offset); void DEP_RESOLVE_MOV(const brw::fs_builder &bld, int grf); + void SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(const brw::fs_builder &bld, + const fs_reg dst, + const fs_reg src, + uint32_t components); + bool run_fs(bool do_rep_send); bool run_vs(gl_clip_plane *clip_planes); bool run_tes(); diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 5a12d63..9cd751b 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -3082,44 +3082,19 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr for (int i = 0; i < instr->num_components; i++) bld.MOV(offset(dest, bld, i), offset(read_result, bld, i)); } else { - /* Reading a dvec, then the untyped read below gives us this: + /* Reading a dvec, so we need to: * - * x0 x0 x0 x0 x0 x0 x0 x0 - * x1 x1 x1 x1 x1 x1 x1 x1 - * y0 y0 y0 y0 y0 y0 y0 y0 - * y1 y1 y1 y1 y1 y1 y1 y1 - * - * But that is not valid 64-bit data, instead we want: - * - * x0 x1 x0 x1 x0 x1 x0 x1 - * x0 x1 x0 x1 x0 x1 x0 x1 - * y0 y1 y0 y1 y0 y1 y0 y1 - * y0 y1 y0 y1 y0 y1 y0 y1 - * - * Also, that would only load half of a dvec4. So, we have to: - * - * 1. Multiply num_components by 2, to account for the fact that we need - * to read 64-bit components. + * 1. Multiply num_components by 2, to account for the fact that we + * need to read 64-bit components. * 2. Shuffle the result of the load to form valid 64-bit elements * 3. Emit a second load (for components z/w) if needed. - * - * FIXME: extract the shuffling logic and share it with - * varying_pull_constant_load */ - int multiplier = bld.dispatch_width() / 8; - fs_reg read_offset = vgrf(glsl_type::uint_type); bld.MOV(read_offset, offset_reg); int num_components = instr->num_components; int iters = num_components <= 2 ? 1 : 2; - /* A temporary that we will use to shuffle the 32-bit data of each - * component in the vector into valid 64-bit data - */ - fs_reg tmp = - fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F); - /* Load the dvec, the first iteration loads components x/y, the second * iteration, if needed, loads components z/w */ @@ -3138,18 +3113,9 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr read_result.type = BRW_REGISTER_TYPE_F; /* Shuffle the 32-bit load result into valid 64-bit data */ - int multiplier = bld.dispatch_width() / 8; - for (int i = 0; i < iter_components; i++) { - fs_reg component_i = - horiz_offset(read_result, multiplier * 16 * i); - - bld.MOV(stride(tmp, 2), component_i); - bld.MOV(stride(horiz_offset(tmp, 1), 2), - horiz_offset(component_i, 8 * multiplier)); - - bld.MOV(horiz_offset(dest, multiplier * 8 * (i + it * 2)), - retype(tmp, BRW_REGISTER_TYPE_DF)); - } + SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(bld, + offset(dest, bld, it * 2), + read_result, iter_components); bld.ADD(read_offset, read_offset, brw_imm_ud(16)); } -- 2.5.0 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev