Module: Mesa
Branch: main
Commit: dbbf566588cedc72062f3d3640a0cf1bebd40af9
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=dbbf566588cedc72062f3d3640a0cf1bebd40af9

Author: Qiang Yu <[email protected]>
Date:   Wed Nov  8 10:54:55 2023 +0800

aco,ac/llvm,radeonsi: lower f2f16 to f2f16_rtz in nir

No need to handle f2f16 specially for OpenGL, and we can vectorize
f2f16 when using ACO.

Reviewed-by: Marek Olšák <[email protected]>
Signed-off-by: Qiang Yu <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25990>

---

 src/amd/compiler/aco_interface.cpp    |  3 +-
 src/amd/llvm/ac_nir_to_llvm.c         | 54 +++++++++++++----------------------
 src/gallium/drivers/radeonsi/si_get.c |  9 ++++++
 3 files changed, 31 insertions(+), 35 deletions(-)

diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp
index 76d4b13b308..7ef3fcc7714 100644
--- a/src/amd/compiler/aco_interface.cpp
+++ b/src/amd/compiler/aco_interface.cpp
@@ -444,7 +444,8 @@ aco_nir_op_supports_packed_math_16bit(const nir_alu_instr* alu)
    case nir_op_f2f16: {
       nir_shader* shader = 
nir_cf_node_get_function(&alu->instr.block->cf_node)->function->shader;
       unsigned execution_mode = shader->info.float_controls_execution_mode;
-      return nir_is_rounding_mode_rtz(execution_mode, 16);
+      return (shader->options->force_f2f16_rtz && 
!nir_is_rounding_mode_rtne(execution_mode, 16)) ||
+             nir_is_rounding_mode_rtz(execution_mode, 16);
    }
    case nir_op_fadd:
    case nir_op_fsub:
diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c
index d82dde258eb..273194ee7ca 100644
--- a/src/amd/llvm/ac_nir_to_llvm.c
+++ b/src/amd/llvm/ac_nir_to_llvm.c
@@ -928,45 +928,31 @@ static bool visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
    case nir_op_u2f64:
       result = LLVMBuildUIToFP(ctx->ac.builder, src[0], 
ac_to_float_type(&ctx->ac, def_type), "");
       break;
-   case nir_op_f2f16_rtz:
-   case nir_op_f2f16:
+   case nir_op_f2f16_rtz: {
       src[0] = ac_to_float(&ctx->ac, src[0]);
 
-      /* For OpenGL, we want fast packing with v_cvt_pkrtz_f16, but if we use 
it,
-       * all f32->f16 conversions have to round towards zero, because both 
scalar
-       * and vec2 down-conversions have to round equally.
-       */
-      if (ctx->ac.float_mode == AC_FLOAT_MODE_DEFAULT_OPENGL || instr->op == 
nir_op_f2f16_rtz) {
-         src[0] = ac_to_float(&ctx->ac, src[0]);
-
-         if (LLVMTypeOf(src[0]) == ctx->ac.f64)
-            src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, 
"");
-
-         /* Fast path conversion. This only works if NIR is vectorized
-          * to vec2 16.
-          */
-         if (LLVMTypeOf(src[0]) == ctx->ac.v2f32) {
-            LLVMValueRef args[] = {
-               ac_llvm_extract_elem(&ctx->ac, src[0], 0),
-               ac_llvm_extract_elem(&ctx->ac, src[0], 1),
-            };
-            result = ac_build_cvt_pkrtz_f16(&ctx->ac, args);
-            break;
-         }
+      if (LLVMTypeOf(src[0]) == ctx->ac.f64)
+         src[0] = LLVMBuildFPTrunc(ctx->ac.builder, src[0], ctx->ac.f32, "");
 
-         assert(ac_get_llvm_num_components(src[0]) == 1);
-         LLVMValueRef param[2] = {src[0], LLVMGetUndef(ctx->ac.f32)};
-         result = ac_build_cvt_pkrtz_f16(&ctx->ac, param);
-         result = LLVMBuildExtractElement(ctx->ac.builder, result, 
ctx->ac.i32_0, "");
-      } else {
-         if (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src[0])) < 
ac_get_elem_bits(&ctx->ac, def_type))
-            result =
-               LLVMBuildFPExt(ctx->ac.builder, src[0], 
ac_to_float_type(&ctx->ac, def_type), "");
-         else
-            result =
-               LLVMBuildFPTrunc(ctx->ac.builder, src[0], 
ac_to_float_type(&ctx->ac, def_type), "");
+      /* Fast path conversion. This only works if NIR is vectorized
+       * to vec2 16.
+       */
+      if (LLVMTypeOf(src[0]) == ctx->ac.v2f32) {
+         LLVMValueRef args[] = {
+            ac_llvm_extract_elem(&ctx->ac, src[0], 0),
+            ac_llvm_extract_elem(&ctx->ac, src[0], 1),
+         };
+         result = ac_build_cvt_pkrtz_f16(&ctx->ac, args);
+         break;
       }
+
+      assert(ac_get_llvm_num_components(src[0]) == 1);
+      LLVMValueRef param[2] = {src[0], LLVMGetUndef(ctx->ac.f32)};
+      result = ac_build_cvt_pkrtz_f16(&ctx->ac, param);
+      result = LLVMBuildExtractElement(ctx->ac.builder, result, ctx->ac.i32_0, 
"");
       break;
+   }
+   case nir_op_f2f16:
    case nir_op_f2f16_rtne:
    case nir_op_f2f32:
    case nir_op_f2f64:
diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c
index 5c9d7ab1073..e15ab779966 100644
--- a/src/gallium/drivers/radeonsi/si_get.c
+++ b/src/gallium/drivers/radeonsi/si_get.c
@@ -1381,6 +1381,15 @@ void si_init_screen_get_functions(struct si_screen *sscreen)
          nir_lower_imul64 | nir_lower_imul_high64 | nir_lower_imul_2x32_64 |
          nir_lower_divmod64 | nir_lower_minmax64 | nir_lower_iabs64 |
          nir_lower_iadd_sat64 | nir_lower_conv64,
+
+      /* For OpenGL, rounding mode is undefined. We want fast packing with 
v_cvt_pkrtz_f16,
+       * but if we use it, all f32->f16 conversions have to round towards zero,
+       * because both scalar and vec2 down-conversions have to round equally.
+       *
+       * For OpenCL, rounding mode is explicit. This will only lower f2f16 to 
f2f16_rtz
+       * when execution mode is rtz instead of rtne.
+       */
+      .force_f2f16_rtz = true,
    };
    *sscreen->nir_options = nir_options;
 }

Reply via email to