llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-llvm-ir Author: Srinivasa Ravi (Wolfram70) <details> <summary>Changes</summary> This change adds NVVM intrinsics and clang builtins for the cvt instruction variants of types `.e2m3x2`, `.e3m2x2`, and `.ue8m0x2` introduced in PTX 8.6 for `sm_100a`, `sm_101a`, and `sm_120a`. Tests are added in `NVPTX/convert-sm100a.ll` and `clang/test/CodeGen/builtins-nvptx.c` and verified through ptxas 12.8.0. PTX Spec Reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt --- Patch is 36.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/134345.diff 6 Files Affected: - (modified) clang/include/clang/Basic/BuiltinsNVPTX.td (+31) - (modified) clang/test/CodeGen/builtins-nvptx.c (+187-4) - (modified) llvm/include/llvm/IR/IntrinsicsNVVM.td (+39) - (modified) llvm/lib/Target/NVPTX/NVPTXInstrInfo.td (+47) - (modified) llvm/lib/Target/NVPTX/NVPTXIntrinsics.td (+56) - (added) llvm/test/CodeGen/NVPTX/convert-sm100a.ll (+290) ``````````diff diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td b/clang/include/clang/Basic/BuiltinsNVPTX.td index 61e48b31c244b..d240b1a8d0d16 100644 --- a/clang/include/clang/Basic/BuiltinsNVPTX.td +++ b/clang/include/clang/Basic/BuiltinsNVPTX.td @@ -580,6 +580,15 @@ def __nvvm_f2bf16_rz : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>; def __nvvm_f2bf16_rz_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>; def __nvvm_f2tf32_rna : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_80, PTX70>; +def __nvvm_f2tf32_rna_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_89, PTX81>; +def __nvvm_f2tf32_rn : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>; +def __nvvm_f2tf32_rn_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>; +def __nvvm_f2tf32_rn_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_100, PTX86>; +def __nvvm_f2tf32_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_100, PTX86>; +def __nvvm_f2tf32_rz : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>; +def __nvvm_f2tf32_rz_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>; +def __nvvm_f2tf32_rz_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_100, PTX86>; +def __nvvm_f2tf32_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_100, PTX86>; def __nvvm_ff_to_e4m3x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", SM_89, PTX81>; def __nvvm_ff_to_e4m3x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", SM_89, PTX81>; @@ -596,6 +605,28 @@ def __nvvm_e4m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh def __nvvm_e5m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM_89, PTX81>; def __nvvm_e5m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM_89, PTX81>; +def __nvvm_ff_to_e2m3x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_ff_to_e2m3x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_ff_to_e3m2x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_ff_to_e3m2x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; + +def __nvvm_e2m3x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_e2m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_e3m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_e3m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; + +def __nvvm_ff_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_ff_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_ff_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_ff_to_ue8m0x2_rp_satfinite : NVPTXBuiltinSMAndPTX<"short(float, float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; + +def __nvvm_bf16x2_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_bf16x2_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_bf16x2_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; +def __nvvm_bf16x2_to_ue8m0x2_rp_satfinite : NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; + +def __nvvm_ue8m0x2_to_bf16x2 : NVPTXBuiltinSMAndPTX<"_Vector<2, __bf16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>; + // FNS let Attributes = [NoThrow] in { def __nvvm_fns : NVPTXBuiltinPTX<"unsigned int(unsigned int, unsigned int, int)", PTX60>; diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index ffa41c85c2734..f74ae332ecf23 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -25,14 +25,29 @@ // RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \ // RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX72_SM86 -check-prefix=LP64 %s -// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 \ +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_89 -target-feature +ptx81 -DPTX=81\ // RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \ // RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_90 -target-feature +ptx78 -DPTX=78 \ +// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX78_SM90 %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100 -target-feature +ptx86 -DPTX=86 \ +// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM100 %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx86 -DPTX=86 \ +// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM100a %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_101a -target-feature +ptx86 -DPTX=86 \ +// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM101a %s +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx86 -DPTX=86 \ +// RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \ +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s // ### The last run to check with the highest SM and PTX version available // ### to make sure target builtins are still accepted. -// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_100a -target-feature +ptx87 \ +// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_120a -target-feature +ptx87 -DPTX=87 \ // RUN: -fcuda-is-device -emit-llvm -o - -x cuda %s \ -// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX81_SM89 %s +// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX86_SM120a %s #define __device__ __attribute__((device)) #define __global__ __attribute__((global)) @@ -994,7 +1009,7 @@ __device__ void nvvm_cvt_sm80() { // CHECK-LABEL: nvvm_cvt_sm89 __device__ void nvvm_cvt_sm89() { -#if __CUDA_ARCH__ >= 890 +#if (PTX >= 81) && (__CUDA_ARCH__ >= 890) // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn(float 1.000000e+00, float 1.000000e+00) __nvvm_ff_to_e4m3x2_rn(1.0f, 1.0f); // CHECK_PTX81_SM89: call i16 @llvm.nvvm.ff.to.e4m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00) @@ -1021,6 +1036,174 @@ __device__ void nvvm_cvt_sm89() { __nvvm_e5m2x2_to_f16x2_rn(0x4c4c); // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 19532) __nvvm_e5m2x2_to_f16x2_rn_relu(0x4c4c); + + // CHECK_PTX81_SM89: call i32 @llvm.nvvm.f2tf32.rna.satfinite(float 1.000000e+00) + __nvvm_f2tf32_rna_satfinite(1.0f); +#endif + // CHECK: ret void +} + +// CHECK-LABEL: nvvm_cvt_sm90 +__device__ void nvvm_cvt_sm90() { +#if (PTX >= 78) && (__CUDA_ARCH__ >= 900) + // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rn(float 1.000000e+00) + __nvvm_f2tf32_rn(1.0f); + // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rn.relu(float 1.000000e+00) + __nvvm_f2tf32_rn_relu(1.0f); + // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rz(float 1.000000e+00) + __nvvm_f2tf32_rz(1.0f); + // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rz.relu(float 1.000000e+00) + __nvvm_f2tf32_rz_relu(1.0f); +#endif + // CHECK: ret void +} + +// CHECK-LABEL: nvvm_cvt_sm100 +__device__ void nvvm_cvt_sm100() { +#if (PTX >= 86) && (__CUDA_ARCH__ >= 1000) + // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rn.satfinite(float 1.000000e+00) + __nvvm_f2tf32_rn_satfinite(1.0f); + // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float 1.000000e+00) + __nvvm_f2tf32_rn_relu_satfinite(1.0f); + // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rz.satfinite(float 1.000000e+00) + __nvvm_f2tf32_rz_satfinite(1.0f); + // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rz.relu.satfinite(float 1.000000e+00) + __nvvm_f2tf32_rz_relu_satfinite(1.0f); +#endif + // CHECK: ret void +} + +// CHECK-LABEL: nvvm_cvt_sm100a +__device__ void nvvm_cvt_sm100a() { +#if (PTX >= 86) && __CUDA_ARCH_FEAT_SM100_ALL + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_e2m3x2_rn(1.0f, 1.0f); + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_e2m3x2_rn_relu(1.0f, 1.0f); + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_e3m2x2_rn(1.0f, 1.0f); + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_e3m2x2_rn_relu(1.0f, 1.0f); + + // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532) + __nvvm_e2m3x2_to_f16x2_rn(0x4C4C); + // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504) + __nvvm_e2m3x2_to_f16x2_rn_relu(0x4848); + // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504) + __nvvm_e3m2x2_to_f16x2_rn(0x4848); + // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532) + __nvvm_e3m2x2_to_f16x2_rn_relu(0x4C4C); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_ue8m0x2_rz(1.0f, 1.0f); + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_ue8m0x2_rz_satfinite(1.0f, 1.0f); + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_ue8m0x2_rp(1.0f, 1.0f); + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_ue8m0x2_rp_satfinite(1.0f, 1.0f); + + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD) + __nvvm_bf16x2_to_ue8m0x2_rz({(__bf16)0.1f, (__bf16)0.1f}); + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD) + __nvvm_bf16x2_to_ue8m0x2_rz_satfinite({(__bf16)0.1f, (__bf16)0.1f}); + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD) + __nvvm_bf16x2_to_ue8m0x2_rp({(__bf16)0.1f, (__bf16)0.1f}); + // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD) + __nvvm_bf16x2_to_ue8m0x2_rp_satfinite({(__bf16)0.1f, (__bf16)0.1f}); + + // CHECK_PTX86_SM100a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532) + __nvvm_ue8m0x2_to_bf16x2(0x4C4C); +#endif + // CHECK: ret void +} + +// CHECK-LABEL: nvvm_cvt_sm101a +__device__ void nvvm_cvt_sm101a() { +#if (PTX >= 86) && __CUDA_ARCH_FEAT_SM101_ALL + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_e2m3x2_rn(1.0f, 1.0f); + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_e2m3x2_rn_relu(1.0f, 1.0f); + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_e3m2x2_rn(1.0f, 1.0f); + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_e3m2x2_rn_relu(1.0f, 1.0f); + + // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532) + __nvvm_e2m3x2_to_f16x2_rn(0x4C4C); + // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504) + __nvvm_e2m3x2_to_f16x2_rn_relu(0x4848); + // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504) + __nvvm_e3m2x2_to_f16x2_rn(0x4848); + // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532) + __nvvm_e3m2x2_to_f16x2_rn_relu(0x4C4C); + + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_ue8m0x2_rz(1.0f, 1.0f); + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_ue8m0x2_rz_satfinite(1.0f, 1.0f); + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_ue8m0x2_rp(1.0f, 1.0f); + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_ue8m0x2_rp_satfinite(1.0f, 1.0f); + + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD) + __nvvm_bf16x2_to_ue8m0x2_rz({(__bf16)0.1f, (__bf16)0.1f}); + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD) + __nvvm_bf16x2_to_ue8m0x2_rz_satfinite({(__bf16)0.1f, (__bf16)0.1f}); + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD) + __nvvm_bf16x2_to_ue8m0x2_rp({(__bf16)0.1f, (__bf16)0.1f}); + // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD) + __nvvm_bf16x2_to_ue8m0x2_rp_satfinite({(__bf16)0.1f, (__bf16)0.1f}); + + // CHECK_PTX86_SM101a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532) + __nvvm_ue8m0x2_to_bf16x2(0x4C4C); +#endif + // CHECK: ret void +} + +// CHECK-LABEL: nvvm_cvt_sm120a +__device__ void nvvm_cvt_sm120a() { +#if (PTX >= 86) && __CUDA_ARCH_FEAT_SM120_ALL + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_e2m3x2_rn(1.0f, 1.0f); + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m3x2.rn.relu(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_e2m3x2_rn_relu(1.0f, 1.0f); + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_e3m2x2_rn(1.0f, 1.0f); + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e3m2x2.rn.relu(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_e3m2x2_rn_relu(1.0f, 1.0f); + + // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn(i16 19532) + __nvvm_e2m3x2_to_f16x2_rn(0x4C4C); + // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m3x2.to.f16x2.rn.relu(i16 18504) + __nvvm_e2m3x2_to_f16x2_rn_relu(0x4848); + // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn(i16 18504) + __nvvm_e3m2x2_to_f16x2_rn(0x4848); + // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532) + __nvvm_e3m2x2_to_f16x2_rn_relu(0x4C4C); + + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_ue8m0x2_rz(1.0f, 1.0f); + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_ue8m0x2_rz_satfinite(1.0f, 1.0f); + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_ue8m0x2_rp(1.0f, 1.0f); + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rp.satfinite(float 1.000000e+00, float 1.000000e+00) + __nvvm_ff_to_ue8m0x2_rp_satfinite(1.0f, 1.0f); + + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz(<2 x bfloat> splat (bfloat 0xR3DCD) + __nvvm_bf16x2_to_ue8m0x2_rz({(__bf16)0.1f, (__bf16)0.1f}); + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rz.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD) + __nvvm_bf16x2_to_ue8m0x2_rz_satfinite({(__bf16)0.1f, (__bf16)0.1f}); + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp(<2 x bfloat> splat (bfloat 0xR3DCD) + __nvvm_bf16x2_to_ue8m0x2_rp({(__bf16)0.1f, (__bf16)0.1f}); + // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.bf16x2.to.ue8m0x2.rp.satfinite(<2 x bfloat> splat (bfloat 0xR3DCD) + __nvvm_bf16x2_to_ue8m0x2_rp_satfinite({(__bf16)0.1f, (__bf16)0.1f}); + + // CHECK_PTX86_SM120a: call <2 x bfloat> @llvm.nvvm.ue8m0x2.to.bf16x2(i16 19532) + __nvvm_ue8m0x2_to_bf16x2(0x4C4C); #endif // CHECK: ret void } diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 80e10f33b770d..e764284d7fb7f 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1548,6 +1548,45 @@ let TargetPrefix = "nvvm" in { Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>; def int_nvvm_e5m2x2_to_f16x2_rn_relu : ClangBuiltin<"__nvvm_e5m2x2_to_f16x2_rn_relu">, Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>; + + def int_nvvm_ff_to_e2m3x2_rn : ClangBuiltin<"__nvvm_ff_to_e2m3x2_rn">, + Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_ff_to_e2m3x2_rn_relu : ClangBuiltin<"__nvvm_ff_to_e2m3x2_rn_relu">, + Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_ff_to_e3m2x2_rn : ClangBuiltin<"__nvvm_ff_to_e3m2x2_rn">, + Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_ff_to_e3m2x2_rn_relu : ClangBuiltin<"__nvvm_ff_to_e3m2x2_rn_relu">, + Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + + def int_nvvm_e2m3x2_to_f16x2_rn : ClangBuiltin<"__nvvm_e2m3x2_to_f16x2_rn">, + Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_e2m3x2_to_f16x2_rn_relu : ClangBuiltin<"__nvvm_e2m3x2_to_f16x2_rn_relu">, + Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_e3m2x2_to_f16x2_rn : ClangBuiltin<"__nvvm_e3m2x2_to_f16x2_rn">, + Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_e3m2x2_to_f16x2_rn_relu : ClangBuiltin<"__nvvm_e3m2x2_to_f16x2_rn_relu">, + Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>; + + def int_nvvm_ff_to_ue8m0x2_rz : ClangBuiltin<"__nvvm_ff_to_ue8m0x2_rz">, + Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_ff_to_ue8m0x2_rz_satfinite : ClangBuiltin<"__nvvm_ff_to_ue8m0x2_rz_satfinite">, + Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_ff_to_ue8m0x2_rp : ClangBuiltin<"__nvvm_ff_to_ue8m0x2_rp">, + Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_ff_to_ue8m0x2_rp_satfinite : ClangBuiltin<"__nvvm_ff_to_ue8m0x2_rp_satfinite">, + Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, IntrNoCallback]>; + + def int_nvvm_bf16x2_to_ue8m0x2_rz : ClangBuiltin<"__nvvm_bf16x2_to_ue8m0x2_rz">, + Intrinsic<[llvm_i16_ty], [llvm_v2bf16_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_bf16x2_to_ue8m0x2_rz_satfinite : ClangBuiltin<"__nvvm_bf16x2_to_ue8m0x2_rz_satfinite">, + Intrinsic<[llvm_i16_ty], [llvm_v2bf16_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_bf16x2_to_ue8m0x2_rp : ClangBuiltin<"__nvvm_bf16x2_to_ue8m0x2_rp">, + Intrinsic<[llvm_i16_ty], [llvm_v2bf16_ty], [IntrNoMem, IntrNoCallback]>; + def int_nvvm_bf16x2_to_ue8m0x2_rp_satfinite : ClangBuiltin<"__nvvm_bf16x2_to_ue8m0x2_rp_satfinite">, + Intrinsic<[llvm_i16_ty], [llvm_v2bf16_ty], [IntrNoMem, IntrNoCallback]>; + + def int_nvvm_ue8m0x2_to_bf16x2 : ClangBuiltin<"__nvvm_ue8m0x2_to_bf16x2">, + Intrinsic<[llvm_v2bf16_ty], ... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/134345 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits