Module: Mesa
Branch: main
Commit: dbbf566588cedc72062f3d3640a0cf1bebd40af9
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=dbbf566588cedc72062f3d3640a0cf1bebd40af9
Author: Qiang Yu <[email protected]> Date: Wed Nov 8 10:54:55 2023 +0800 aco,ac/llvm,radeonsi: lower f2f16 to f2f16_rtz in nir No need to handle f2f16 specially for OpenGL, and we can vectorize f2f16 when using ACO. Reviewed-by: Marek Olšák <[email protected]> Signed-off-by: Qiang Yu <[email protected]> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25990> --- src/amd/compiler/aco_interface.cpp | 3 +- src/amd/llvm/ac_nir_to_llvm.c | 54 +++++++++++++---------------------- src/gallium/drivers/radeonsi/si_get.c | 9 ++++++ 3 files changed, 31 insertions(+), 35 deletions(-) diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp index 76d4b13b308..7ef3fcc7714 100644 --- a/src/amd/compiler/aco_interface.cpp +++ b/src/amd/compiler/aco_interface.cpp @@ -444,7 +444,8 @@ aco_nir_op_supports_packed_math_16bit(const nir_alu_instr* alu) case nir_op_f2f16: { nir_shader* shader = nir_cf_node_get_function(&alu->instr.block->cf_node)->function->shader; unsigned execution_mode = shader->info.float_controls_execution_mode; - return nir_is_rounding_mode_rtz(execution_mode, 16); + return (shader->options->force_f2f16_rtz && !nir_is_rounding_mode_rtne(execution_mode, 16)) || + nir_is_rounding_mode_rtz(execution_mode, 16); } case nir_op_fadd: case nir_op_fsub: diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index d82dde258eb..273194ee7ca 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -928,45 +928,31 @@ static bool visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr) case nir_op_u2f64: result = LLVMBuildUIToFP(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); break; - case nir_op_f2f16_rtz: - case nir_op_f2f16: + case nir_op_f2f16_rtz: { src[0] = ac_to_float(&ctx->ac, src[0]); - /* For OpenGL, we want fast packing with v_cvt_pkrtz_f16, but if we use it, - * all f32->f16 conversions have to round towards zero, because both scalar - * and vec2 
down-conversions have to round equally. - */ - if (ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL || instr->op == nir_op_f2f16_rtz) { - src[0] = ac_to_float(&ctx->ac, src[0]); - - if (LLVMTypeOf(src[0]) == ctx->ac.f64) - src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, ""); - - /* Fast path conversion. This only works if NIR is vectorized - * to vec2 16. - */ - if (LLVMTypeOf(src[0]) == ctx->ac.v2f32) { - LLVMValueRef args[] = { - ac_llvm_extract_elem(&ctx->ac, src[0], 0), - ac_llvm_extract_elem(&ctx->ac, src[0], 1), - }; - result = ac_build_cvt_pkrtz_f16(&ctx->ac, args); - break; - } + if (LLVMTypeOf(src[0]) == ctx->ac.f64) + src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, ""); - assert(ac_get_llvm_num_components(src[0]) == 1); - LLVMValueRef param[2] = {src[0], LLVMGetUndef(ctx->ac.f32)}; - result = ac_build_cvt_pkrtz_f16(&ctx->ac, param); - result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""); - } else { - if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < ac_get_elem_bits(&ctx->ac, def_type)) - result = - LLVMBuildFPExt(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); - else - result = - LLVMBuildFPTrunc(ctx->ac.builder, src[0], ac_to_float_type(&ctx->ac, def_type), ""); + /* Fast path conversion. This only works if NIR is vectorized + * to vec2 16. 
+ */ + if (LLVMTypeOf(src[0]) == ctx->ac.v2f32) { + LLVMValueRef args[] = { + ac_llvm_extract_elem(&ctx->ac, src[0], 0), + ac_llvm_extract_elem(&ctx->ac, src[0], 1), + }; + result = ac_build_cvt_pkrtz_f16(&ctx->ac, args); + break; } + + assert(ac_get_llvm_num_components(src[0]) == 1); + LLVMValueRef param[2] = {src[0], LLVMGetUndef(ctx->ac.f32)}; + result = ac_build_cvt_pkrtz_f16(&ctx->ac, param); + result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, ""); break; + } + case nir_op_f2f16: case nir_op_f2f16_rtne: case nir_op_f2f32: case nir_op_f2f64: diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c index 5c9d7ab1073..e15ab779966 100644 --- a/src/gallium/drivers/radeonsi/si_get.c +++ b/src/gallium/drivers/radeonsi/si_get.c @@ -1381,6 +1381,15 @@ void si_init_screen_get_functions(struct si_screen *sscreen) nir_lower_imul64 | nir_lower_imul_high64 | nir_lower_imul_2x32_64 | nir_lower_divmod64 | nir_lower_minmax64 | nir_lower_iabs64 | nir_lower_iadd_sat64 | nir_lower_conv64, + + /* For OpenGL, rounding mode is undefined. We want fast packing with v_cvt_pkrtz_f16, + * but if we use it, all f32->f16 conversions have to round towards zero, + * because both scalar and vec2 down-conversions have to round equally. + * + * For OpenCL, rounding mode is explicit. This will only lower f2f16 to f2f16_rtz + * when execution mode is rtz instead of rtne. + */ + .force_f2f16_rtz = true, }; *sscreen->nir_options = nir_options; }
