[clang] [llvm] [NVPTX] Add intrinsics for redux.sync f32 instructions (PR #126664)

2025-02-11 Thread Srinivasa Ravi via cfe-commits

https://github.com/Wolfram70 updated 
https://github.com/llvm/llvm-project/pull/126664

>From 88e076bb9af3b1bc63d76feef1ba842d88fbd95f Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi 
Date: Mon, 10 Feb 2025 14:13:42 +0530
Subject: [PATCH] [NVPTX] Add intrinsics for redux.sync f32 instructions

Adds NVVM intrinsics and NVPTX codegen for redux.sync f32 instructions
introduced in PTX 8.6 for sm_100a. Tests added in
CodeGen/NVPTX/redux-sync-f32.ll and verified through ptxas 12.8.0.

PTX Spec Reference:
https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-redux-sync
---
 clang/include/clang/Basic/BuiltinsNVPTX.td   |   8 ++
 clang/test/CodeGenCUDA/redux-f32-builtins.cu |  34 +
 llvm/include/llvm/IR/IntrinsicsNVVM.td   |  12 ++
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td |  19 +++
 llvm/test/CodeGen/NVPTX/redux-sync-f32.ll| 139 +++
 5 files changed, 212 insertions(+)
 create mode 100644 clang/test/CodeGenCUDA/redux-f32-builtins.cu
 create mode 100644 llvm/test/CodeGen/NVPTX/redux-sync-f32.ll

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td 
b/clang/include/clang/Basic/BuiltinsNVPTX.td
index 9d24a992563a450..327dc88cffdb4e6 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -669,6 +669,14 @@ def __nvvm_redux_sync_umax : 
NVPTXBuiltinSMAndPTX<"unsigned int(unsigned int, in
 def __nvvm_redux_sync_and : NVPTXBuiltinSMAndPTX<"int(int, int)", SM_80, 
PTX70>;
 def __nvvm_redux_sync_xor : NVPTXBuiltinSMAndPTX<"int(int, int)", SM_80, 
PTX70>;
 def __nvvm_redux_sync_or : NVPTXBuiltinSMAndPTX<"int(int, int)", SM_80, PTX70>;
+def __nvvm_redux_sync_fmin : NVPTXBuiltinSMAndPTX<"float(float, int)", 
SM_100a, PTX86>;
+def __nvvm_redux_sync_fmin_abs : NVPTXBuiltinSMAndPTX<"float(float, int)", 
SM_100a, PTX86>;
+def __nvvm_redux_sync_fmin_NaN : NVPTXBuiltinSMAndPTX<"float(float, int)", 
SM_100a, PTX86>;
+def __nvvm_redux_sync_fmin_abs_NaN : NVPTXBuiltinSMAndPTX<"float(float, int)", 
SM_100a, PTX86>;
+def __nvvm_redux_sync_fmax : NVPTXBuiltinSMAndPTX<"float(float, int)", 
SM_100a, PTX86>;
+def __nvvm_redux_sync_fmax_abs : NVPTXBuiltinSMAndPTX<"float(float, int)", 
SM_100a, PTX86>;
+def __nvvm_redux_sync_fmax_NaN : NVPTXBuiltinSMAndPTX<"float(float, int)", 
SM_100a, PTX86>;
+def __nvvm_redux_sync_fmax_abs_NaN : NVPTXBuiltinSMAndPTX<"float(float, int)", 
SM_100a, PTX86>;
 
 // Membar
 
diff --git a/clang/test/CodeGenCUDA/redux-f32-builtins.cu 
b/clang/test/CodeGenCUDA/redux-f32-builtins.cu
new file mode 100644
index 000..7359fb000699169
--- /dev/null
+++ b/clang/test/CodeGenCUDA/redux-f32-builtins.cu
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 "-triple" "nvptx-nvidia-cuda" "-target-feature" "+ptx86" 
"-target-cpu" "sm_100a" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" "-target-feature" "+ptx86" 
"-target-cpu" "sm_100a" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+// CHECK: define{{.*}} void @_Z6kernelPf(ptr noundef %out_f)
+__attribute__((global)) void kernel(float* out_f) {
+  float a = 3.0;
+  int i = 0;
+
+  out_f[i++] = __nvvm_redux_sync_fmin(a, 0xFF);
+  // CHECK: call contract float @llvm.nvvm.redux.sync.fmin
+
+  out_f[i++] = __nvvm_redux_sync_fmin_abs(a, 0xFF);
+  // CHECK: call contract float @llvm.nvvm.redux.sync.fmin.abs
+
+  out_f[i++] = __nvvm_redux_sync_fmin_NaN(a, 0xF0);
+  // CHECK: call contract float @llvm.nvvm.redux.sync.fmin.NaN
+
+  out_f[i++] = __nvvm_redux_sync_fmin_abs_NaN(a, 0x0F);
+  // CHECK: call contract float @llvm.nvvm.redux.sync.fmin.abs.NaN
+
+  out_f[i++] = __nvvm_redux_sync_fmax(a, 0xFF);
+  // CHECK: call contract float @llvm.nvvm.redux.sync.fmax
+
+  out_f[i++] = __nvvm_redux_sync_fmax_abs(a, 0x01);
+  // CHECK: call contract float @llvm.nvvm.redux.sync.fmax.abs
+
+  out_f[i++] = __nvvm_redux_sync_fmax_NaN(a, 0xF1);
+  // CHECK: call contract float @llvm.nvvm.redux.sync.fmax.NaN
+
+  out_f[i++] = __nvvm_redux_sync_fmax_abs_NaN(a, 0x10);
+  // CHECK: call contract float @llvm.nvvm.redux.sync.fmax.abs.NaN
+
+  // CHECK: ret void
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td 
b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index f299a145ac73b12..5aa7ebf48a6e5e2 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -4823,6 +4823,18 @@ def int_nvvm_redux_sync_xor : 
ClangBuiltin<"__nvvm_redux_sync_xor">,
 def int_nvvm_redux_sync_or : ClangBuiltin<"__nvvm_redux_sync_or">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
 [IntrConvergent, IntrInaccessibleMemOnly, IntrNoCallback]>;
+
+// redux.sync.op.{abs}.{NaN}.f32 dst, src, membermask;
+foreach binOp = ["min", "max"] in {
+  foreach abs = ["", "_abs"] in {
+foreach NaN = ["", "_NaN"] in {
+  def int_nvvm_redux_sync_f # binOp # abs # NaN : 
+ClangBuiltin,
+Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty],
+  [Int

[clang] [llvm] [NVPTX] Add intrinsics for redux.sync f32 instructions (PR #126664)

2025-02-11 Thread Srinivasa Ravi via cfe-commits

https://github.com/Wolfram70 updated 
https://github.com/llvm/llvm-project/pull/126664

>From 062a48e73ea1434f3c00ab3c0e717db66aa0f15e Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi 
Date: Mon, 10 Feb 2025 14:13:42 +0530
Subject: [PATCH] [NVPTX] Add intrinsics for redux.sync f32 instructions

Adds NVVM intrinsics and NVPTX codegen for redux.sync f32 instructions
introduced in PTX 8.6 for sm_100a. Tests added in
CodeGen/NVPTX/redux-sync-f32.ll and verified through ptxas 12.8.0.

PTX Spec Reference:
https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-redux-sync
---
 clang/include/clang/Basic/BuiltinsNVPTX.td   |   8 ++
 clang/test/CodeGenCUDA/redux-f32-builtins.cu |  34 +
 llvm/include/llvm/IR/IntrinsicsNVVM.td   |  12 ++
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td |  18 +++
 llvm/test/CodeGen/NVPTX/redux-sync-f32.ll| 139 +++
 5 files changed, 211 insertions(+)
 create mode 100644 clang/test/CodeGenCUDA/redux-f32-builtins.cu
 create mode 100644 llvm/test/CodeGen/NVPTX/redux-sync-f32.ll

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td 
b/clang/include/clang/Basic/BuiltinsNVPTX.td
index 9d24a992563a450..327dc88cffdb4e6 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -669,6 +669,14 @@ def __nvvm_redux_sync_umax : 
NVPTXBuiltinSMAndPTX<"unsigned int(unsigned int, in
 def __nvvm_redux_sync_and : NVPTXBuiltinSMAndPTX<"int(int, int)", SM_80, 
PTX70>;
 def __nvvm_redux_sync_xor : NVPTXBuiltinSMAndPTX<"int(int, int)", SM_80, 
PTX70>;
 def __nvvm_redux_sync_or : NVPTXBuiltinSMAndPTX<"int(int, int)", SM_80, PTX70>;
+def __nvvm_redux_sync_fmin : NVPTXBuiltinSMAndPTX<"float(float, int)", 
SM_100a, PTX86>;
+def __nvvm_redux_sync_fmin_abs : NVPTXBuiltinSMAndPTX<"float(float, int)", 
SM_100a, PTX86>;
+def __nvvm_redux_sync_fmin_NaN : NVPTXBuiltinSMAndPTX<"float(float, int)", 
SM_100a, PTX86>;
+def __nvvm_redux_sync_fmin_abs_NaN : NVPTXBuiltinSMAndPTX<"float(float, int)", 
SM_100a, PTX86>;
+def __nvvm_redux_sync_fmax : NVPTXBuiltinSMAndPTX<"float(float, int)", 
SM_100a, PTX86>;
+def __nvvm_redux_sync_fmax_abs : NVPTXBuiltinSMAndPTX<"float(float, int)", 
SM_100a, PTX86>;
+def __nvvm_redux_sync_fmax_NaN : NVPTXBuiltinSMAndPTX<"float(float, int)", 
SM_100a, PTX86>;
+def __nvvm_redux_sync_fmax_abs_NaN : NVPTXBuiltinSMAndPTX<"float(float, int)", 
SM_100a, PTX86>;
 
 // Membar
 
diff --git a/clang/test/CodeGenCUDA/redux-f32-builtins.cu 
b/clang/test/CodeGenCUDA/redux-f32-builtins.cu
new file mode 100644
index 000..7359fb000699169
--- /dev/null
+++ b/clang/test/CodeGenCUDA/redux-f32-builtins.cu
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 "-triple" "nvptx-nvidia-cuda" "-target-feature" "+ptx86" 
"-target-cpu" "sm_100a" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" "-target-feature" "+ptx86" 
"-target-cpu" "sm_100a" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+// CHECK: define{{.*}} void @_Z6kernelPf(ptr noundef %out_f)
+__attribute__((global)) void kernel(float* out_f) {
+  float a = 3.0;
+  int i = 0;
+
+  out_f[i++] = __nvvm_redux_sync_fmin(a, 0xFF);
+  // CHECK: call contract float @llvm.nvvm.redux.sync.fmin
+
+  out_f[i++] = __nvvm_redux_sync_fmin_abs(a, 0xFF);
+  // CHECK: call contract float @llvm.nvvm.redux.sync.fmin.abs
+
+  out_f[i++] = __nvvm_redux_sync_fmin_NaN(a, 0xF0);
+  // CHECK: call contract float @llvm.nvvm.redux.sync.fmin.NaN
+
+  out_f[i++] = __nvvm_redux_sync_fmin_abs_NaN(a, 0x0F);
+  // CHECK: call contract float @llvm.nvvm.redux.sync.fmin.abs.NaN
+
+  out_f[i++] = __nvvm_redux_sync_fmax(a, 0xFF);
+  // CHECK: call contract float @llvm.nvvm.redux.sync.fmax
+
+  out_f[i++] = __nvvm_redux_sync_fmax_abs(a, 0x01);
+  // CHECK: call contract float @llvm.nvvm.redux.sync.fmax.abs
+
+  out_f[i++] = __nvvm_redux_sync_fmax_NaN(a, 0xF1);
+  // CHECK: call contract float @llvm.nvvm.redux.sync.fmax.NaN
+
+  out_f[i++] = __nvvm_redux_sync_fmax_abs_NaN(a, 0x10);
+  // CHECK: call contract float @llvm.nvvm.redux.sync.fmax.abs.NaN
+
+  // CHECK: ret void
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td 
b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index f299a145ac73b12..0ceb64d506243c5 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -4823,6 +4823,18 @@ def int_nvvm_redux_sync_xor : 
ClangBuiltin<"__nvvm_redux_sync_xor">,
 def int_nvvm_redux_sync_or : ClangBuiltin<"__nvvm_redux_sync_or">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
 [IntrConvergent, IntrInaccessibleMemOnly, IntrNoCallback]>;
+
+// redux.sync.{min/max}.{abs}.{nan}.f32 dst, src, membermask;
+foreach binOp = ["min", "max"] in {
+  foreach abs = ["", "_abs"] in {
+foreach nan = ["", "_NaN"] in {
+  def int_nvvm_redux_sync_f # binOp # abs # nan : 
+ClangBuiltin,
+Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty],
+   

[clang] [llvm] [NVPTX] Add builtins and intrinsics for conversions of new FP types (PR #134345)

2025-04-07 Thread Srinivasa Ravi via cfe-commits


@@ -580,6 +580,15 @@ def __nvvm_f2bf16_rz : 
NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>;
 def __nvvm_f2bf16_rz_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, 
PTX70>;
 
 def __nvvm_f2tf32_rna : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_80, PTX70>;
+def __nvvm_f2tf32_rna_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_89, PTX81>;

Wolfram70 wrote:

Updated the summary and description, thanks!

https://github.com/llvm/llvm-project/pull/134345
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [NVPTX] Add builtins and intrinsics for conversions of new FP types (PR #134345)

2025-04-07 Thread Srinivasa Ravi via cfe-commits

https://github.com/Wolfram70 edited 
https://github.com/llvm/llvm-project/pull/134345
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [NVPTX] Add builtins and intrinsics for conversions of new FP types (PR #134345)

2025-04-07 Thread Srinivasa Ravi via cfe-commits


@@ -703,6 +703,53 @@ let hasSideEffects = false in {
   defm CVT_to_tf32_rz_satf : CVT_TO_TF32<"rz.satfinite", [hasPTX<86>, 
hasSM<100>]>;
   defm CVT_to_tf32_rn_relu_satf  : CVT_TO_TF32<"rn.relu.satfinite", 
[hasPTX<86>, hasSM<100>]>;
   defm CVT_to_tf32_rz_relu_satf  : CVT_TO_TF32<"rz.relu.satfinite", 
[hasPTX<86>, hasSM<100>]>;
+
+  // FP6 conversions.
+  multiclass CVT_TO_F6X2 {
+def _f32 :
+  NVPTXInst<(outs Int16Regs:$dst),
+(ins Float32Regs:$src1, Float32Regs:$src2, CvtMode:$mode),
+!strconcat("cvt${mode:base}.satfinite${mode:relu}.",
+F6Name, "x2.f32 \t$dst, $src1, $src2;"), []>;
+  }
+
+  defm CVT_e2m3x2 : CVT_TO_F6X2<"e2m3">;
+  defm CVT_e3m2x2 : CVT_TO_F6X2<"e3m2">;
+
+  class CVT_f16x2_fp6 :
+NVPTXInst<(outs Int32Regs:$dst),
+  (ins Int16Regs:$src, CvtMode:$mode),
+  !strconcat("cvt${mode:base}${mode:relu}.f16x2.",
+  F6Name, "x2 \t$dst, $src;"), []>;

Wolfram70 wrote:

Fixed in the latest revision.

https://github.com/llvm/llvm-project/pull/134345
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [NVPTX] Add builtins and intrinsics for conversions of new FP types (PR #134345)

2025-04-08 Thread Srinivasa Ravi via cfe-commits


@@ -1021,6 +1036,174 @@ __device__ void nvvm_cvt_sm89() {
   __nvvm_e5m2x2_to_f16x2_rn(0x4c4c);
   // CHECK_PTX81_SM89: call <2 x half> @llvm.nvvm.e5m2x2.to.f16x2.rn.relu(i16 
19532)
   __nvvm_e5m2x2_to_f16x2_rn_relu(0x4c4c);
+
+  // CHECK_PTX81_SM89: call i32 @llvm.nvvm.f2tf32.rna.satfinite(float 
1.00e+00)
+  __nvvm_f2tf32_rna_satfinite(1.0f);
+#endif
+  // CHECK: ret void
+}
+
+// CHECK-LABEL: nvvm_cvt_sm90
+__device__ void nvvm_cvt_sm90() {
+#if (PTX >= 78) && (__CUDA_ARCH__ >= 900)
+  // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rn(float 1.00e+00)
+  __nvvm_f2tf32_rn(1.0f); 
+  // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rn.relu(float 1.00e+00)
+  __nvvm_f2tf32_rn_relu(1.0f); 
+  // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rz(float 1.00e+00)
+  __nvvm_f2tf32_rz(1.0f); 
+  // CHECK_PTX78_SM90: call i32 @llvm.nvvm.f2tf32.rz.relu(float 1.00e+00)
+  __nvvm_f2tf32_rz_relu(1.0f); 
+#endif
+  // CHECK: ret void
+}
+
+// CHECK-LABEL: nvvm_cvt_sm100
+__device__ void nvvm_cvt_sm100() {
+#if (PTX >= 86) && (__CUDA_ARCH__ >= 1000)
+  // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rn.satfinite(float 
1.00e+00)
+  __nvvm_f2tf32_rn_satfinite(1.0f); 
+  // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rn.relu.satfinite(float 
1.00e+00)
+  __nvvm_f2tf32_rn_relu_satfinite(1.0f); 
+  // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rz.satfinite(float 
1.00e+00)
+  __nvvm_f2tf32_rz_satfinite(1.0f); 
+  // CHECK_PTX86_SM100: call i32 @llvm.nvvm.f2tf32.rz.relu.satfinite(float 
1.00e+00)
+  __nvvm_f2tf32_rz_relu_satfinite(1.0f); 
+#endif
+  // CHECK: ret void
+}
+
+// CHECK-LABEL: nvvm_cvt_sm100a
+__device__ void nvvm_cvt_sm100a() {
+#if (PTX >= 86) && __CUDA_ARCH_FEAT_SM100_ALL

Wolfram70 wrote:

Oh yes, that’s better. I combined all these checks into a single function in 
the latest revision. Thanks!

https://github.com/llvm/llvm-project/pull/134345
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [NVPTX] Add builtins and intrinsics for conversions of new FP types (PR #134345)

2025-04-08 Thread Srinivasa Ravi via cfe-commits

https://github.com/Wolfram70 edited 
https://github.com/llvm/llvm-project/pull/134345
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [NVPTX] Add builtins and intrinsics for conversions of new FP types (PR #134345)

2025-04-09 Thread Srinivasa Ravi via cfe-commits

https://github.com/Wolfram70 updated 
https://github.com/llvm/llvm-project/pull/134345

>From c5f843152d035f4671d132d3844ea1f18be703fe Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi 
Date: Wed, 5 Mar 2025 12:35:39 +0530
Subject: [PATCH] [NVPTX] Add intrinsics for cvt .f6x2 and .ue8m0x2 variants

This change adds NVVM intrinsics and clang builtins for the cvt
instruction variants of types .e2m3x2, .e3m2x2, and .ue8m0x2 introduced
in PTX 8.6 for `sm_100a`, `sm_101a`, and `sm_120a`.

Tests are added in `NVPTX/convert-sm100a.ll` and
`clang/test/CodeGen/builtins-nvptx.c` and verified through ptxas 12.8.0.

PTX Spec Reference: 
https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt
---
 clang/include/clang/Basic/BuiltinsNVPTX.td |  31 +++
 clang/test/CodeGen/builtins-nvptx.c| 150 ++-
 llvm/include/llvm/IR/IntrinsicsNVVM.td |  30 +++
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td|  40 +++
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td   |  56 
 llvm/test/CodeGen/NVPTX/convert-sm100a.ll  | 290 +
 6 files changed, 593 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm100a.ll

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td 
b/clang/include/clang/Basic/BuiltinsNVPTX.td
index 61e48b31c244b..d240b1a8d0d16 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -580,6 +580,15 @@ def __nvvm_f2bf16_rz : 
NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>;
 def __nvvm_f2bf16_rz_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, 
PTX70>;
 
 def __nvvm_f2tf32_rna : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_80, PTX70>;
+def __nvvm_f2tf32_rna_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_89, PTX81>;
+def __nvvm_f2tf32_rn : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>;
+def __nvvm_f2tf32_rn_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, 
PTX78>;
+def __nvvm_f2tf32_rn_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
+def __nvvm_f2tf32_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
+def __nvvm_f2tf32_rz : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>;
+def __nvvm_f2tf32_rz_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, 
PTX78>;
+def __nvvm_f2tf32_rz_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
+def __nvvm_f2tf32_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
 
 def __nvvm_ff_to_e4m3x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM_89, PTX81>;
 def __nvvm_ff_to_e4m3x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM_89, PTX81>;
@@ -596,6 +605,28 @@ def __nvvm_e4m3x2_to_f16x2_rn_relu : 
NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh
 def __nvvm_e5m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM_89, PTX81>;
 def __nvvm_e5m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM_89, PTX81>;
 
+def __nvvm_ff_to_e2m3x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e2m3x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e3m2x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e3m2x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_e2m3x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e2m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e3m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e3m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_ff_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rp_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_bf16x2_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(_Vector<2, 
__bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_bf16x2_to_ue8m0x2_rz_satfinite : 
NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, 
SM_120a]>, PTX86>;
+def __nvvm_bf16x2_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(_Vector<2, 
__bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_bf16x2_to_ue8m0x2_rp_satfinite : 
NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, 
SM_120a]>, PTX86>;
+
+def __nvvm_ue8m0x2_

[clang] [llvm] [NVPTX] Add builtins and intrinsics for conversions of new FP types (PR #134345)

2025-04-09 Thread Srinivasa Ravi via cfe-commits


@@ -1548,6 +1548,45 @@ let TargetPrefix = "nvvm" in {
   Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
   def int_nvvm_e5m2x2_to_f16x2_rn_relu : 
ClangBuiltin<"__nvvm_e5m2x2_to_f16x2_rn_relu">,
   Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
+  
+  def int_nvvm_ff_to_e2m3x2_rn : ClangBuiltin<"__nvvm_ff_to_e2m3x2_rn">,

Wolfram70 wrote:

Used foreach loops to simplify these definitions in the latest revision. 
Perhaps we could do this for the other CVT intrinsics too since many appear to 
be very similar in terms of their signatures and properties.

https://github.com/llvm/llvm-project/pull/134345
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [NVPTX] Add builtins and intrinsics for conversions of new FP types (PR #134345)

2025-04-09 Thread Srinivasa Ravi via cfe-commits

https://github.com/Wolfram70 edited 
https://github.com/llvm/llvm-project/pull/134345
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [NVPTX] Add builtins and intrinsics for conversions of new FP types (PR #134345)

2025-04-09 Thread Srinivasa Ravi via cfe-commits


@@ -1548,6 +1548,45 @@ let TargetPrefix = "nvvm" in {
   Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
   def int_nvvm_e5m2x2_to_f16x2_rn_relu : 
ClangBuiltin<"__nvvm_e5m2x2_to_f16x2_rn_relu">,
   Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
+  
+  def int_nvvm_ff_to_e2m3x2_rn : ClangBuiltin<"__nvvm_ff_to_e2m3x2_rn">,
+  Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, 
IntrNoCallback]>;

Wolfram70 wrote:

Converted all intrinsics to `DefaultAttrsIntrinsic` in the latest revision. 

https://github.com/llvm/llvm-project/pull/134345
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [NVPTX] Add builtins and intrinsics for conversions of new FP types (PR #134345)

2025-04-09 Thread Srinivasa Ravi via cfe-commits


@@ -1944,6 +1944,62 @@ def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn Int16Regs:$a),
 def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn_relu Int16Regs:$a),
   (CVT_f16x2_e5m2x2 $a, CvtRN_RELU)>;
 
+def : Pat<(int_nvvm_ff_to_e2m3x2_rn f32:$a, f32:$b),
+  (CVT_e2m3x2_f32 $a, $b, CvtRN)>,
+  Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
+def : Pat<(int_nvvm_ff_to_e2m3x2_rn_relu f32:$a, f32:$b),
+  (CVT_e2m3x2_f32 $a, $b, CvtRN_RELU)>,
+  Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
+def : Pat<(int_nvvm_ff_to_e3m2x2_rn f32:$a, f32:$b),
+  (CVT_e3m2x2_f32 $a, $b, CvtRN)>,
+  Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
+def : Pat<(int_nvvm_ff_to_e3m2x2_rn_relu f32:$a, f32:$b),
+  (CVT_e3m2x2_f32 $a, $b, CvtRN_RELU)>,
+  Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>;
+
+def : Pat<(int_nvvm_e2m3x2_to_f16x2_rn Int16Regs:$a),

Wolfram70 wrote:

Changed the input pattern to `i16` in the latest revision, thanks!

https://github.com/llvm/llvm-project/pull/134345
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang][NVPTX] Add builtins and intrinsics for conversions of new FP types (PR #134345)

2025-04-11 Thread Srinivasa Ravi via cfe-commits

https://github.com/Wolfram70 edited 
https://github.com/llvm/llvm-project/pull/134345
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [NVPTX] Add builtins and intrinsics for conversions of new FP types (PR #134345)

2025-04-10 Thread Srinivasa Ravi via cfe-commits

Wolfram70 wrote:

@Artem-B and @AlexMaclean , could you please take another look to see if this 
change is good to go?

https://github.com/llvm/llvm-project/pull/134345
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [NVPTX] Add intrinsics for cvt .f6x2 and .ue8m0x2 variants (PR #134345)

2025-04-10 Thread Srinivasa Ravi via cfe-commits

https://github.com/Wolfram70 updated 
https://github.com/llvm/llvm-project/pull/134345

>From fdaf84496ae8f6b5213be8d568010b7f3c1d1aee Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi 
Date: Wed, 5 Mar 2025 12:35:39 +0530
Subject: [PATCH] [NVPTX] Add intrinsics for cvt .f6x2 and .ue8m0x2 variants

This change adds NVVM intrinsics and clang builtins for the cvt
instruction variants of types .e2m3x2, .e3m2x2, and .ue8m0x2 introduced
in PTX 8.6 for `sm_100a`, `sm_101a`, and `sm_120a`.

Tests are added in `NVPTX/convert-sm100a.ll` and
`clang/test/CodeGen/builtins-nvptx.c` and verified through ptxas 12.8.0.

PTX Spec Reference: 
https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt
---
 clang/include/clang/Basic/BuiltinsNVPTX.td |  31 +++
 clang/test/CodeGen/builtins-nvptx.c| 150 ++-
 llvm/include/llvm/IR/IntrinsicsNVVM.td |  39 +++
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td|  47 
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td   |  56 
 llvm/test/CodeGen/NVPTX/convert-sm100a.ll  | 290 +
 6 files changed, 609 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm100a.ll

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td 
b/clang/include/clang/Basic/BuiltinsNVPTX.td
index 61e48b31c244b..d240b1a8d0d16 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -580,6 +580,15 @@ def __nvvm_f2bf16_rz : 
NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>;
 def __nvvm_f2bf16_rz_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, 
PTX70>;
 
 def __nvvm_f2tf32_rna : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_80, PTX70>;
+def __nvvm_f2tf32_rna_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_89, PTX81>;
+def __nvvm_f2tf32_rn : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>;
+def __nvvm_f2tf32_rn_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, 
PTX78>;
+def __nvvm_f2tf32_rn_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
+def __nvvm_f2tf32_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
+def __nvvm_f2tf32_rz : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>;
+def __nvvm_f2tf32_rz_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, 
PTX78>;
+def __nvvm_f2tf32_rz_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
+def __nvvm_f2tf32_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
 
 def __nvvm_ff_to_e4m3x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM_89, PTX81>;
 def __nvvm_ff_to_e4m3x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM_89, PTX81>;
@@ -596,6 +605,28 @@ def __nvvm_e4m3x2_to_f16x2_rn_relu : 
NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh
 def __nvvm_e5m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM_89, PTX81>;
 def __nvvm_e5m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM_89, PTX81>;
 
+def __nvvm_ff_to_e2m3x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e2m3x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e3m2x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e3m2x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_e2m3x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e2m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e3m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e3m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_ff_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rp_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_bf16x2_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(_Vector<2, 
__bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_bf16x2_to_ue8m0x2_rz_satfinite : 
NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, 
SM_120a]>, PTX86>;
+def __nvvm_bf16x2_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(_Vector<2, 
__bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_bf16x2_to_ue8m0x2_rp_satfinite : 
NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, 
SM_120a]>, PTX86>;
+
+def __nvvm_ue8m0x2

[clang] [llvm] [clang][NVPTX] Add builtins and intrinsics for conversions of new FP types (PR #134345)

2025-04-10 Thread Srinivasa Ravi via cfe-commits

Wolfram70 wrote:

Added `-disable-llvm-optzns` to the new RUN lines in `builtins-nvptx.c` since I 
think some tests are failing due to changes from #134416 

https://github.com/llvm/llvm-project/pull/134345
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [NVPTX] Add builtins and intrinsics for conversions of new FP types (PR #134345)

2025-04-10 Thread Srinivasa Ravi via cfe-commits

https://github.com/Wolfram70 updated 
https://github.com/llvm/llvm-project/pull/134345

>From 93757e155b631f4287b96d0b2ad7091371faba83 Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi 
Date: Wed, 5 Mar 2025 12:35:39 +0530
Subject: [PATCH] [NVPTX] Add intrinsics for cvt .f6x2 and .ue8m0x2 variants

This change adds NVVM intrinsics and clang builtins for the cvt
instruction variants of types .e2m3x2, .e3m2x2, and .ue8m0x2 introduced
in PTX 8.6 for `sm_100a`, `sm_101a`, and `sm_120a`.

Tests are added in `NVPTX/convert-sm100a.ll` and
`clang/test/CodeGen/builtins-nvptx.c` and verified through ptxas 12.8.0.

PTX Spec Reference: 
https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt
---
 clang/include/clang/Basic/BuiltinsNVPTX.td |  31 +++
 clang/test/CodeGen/builtins-nvptx.c| 150 ++-
 llvm/include/llvm/IR/IntrinsicsNVVM.td |  31 +++
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td|  35 +++
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td   |  56 
 llvm/test/CodeGen/NVPTX/convert-sm100a.ll  | 290 +
 6 files changed, 589 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm100a.ll

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td 
b/clang/include/clang/Basic/BuiltinsNVPTX.td
index 61e48b31c244b..d240b1a8d0d16 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -580,6 +580,15 @@ def __nvvm_f2bf16_rz : 
NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>;
 def __nvvm_f2bf16_rz_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, 
PTX70>;
 
 def __nvvm_f2tf32_rna : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_80, PTX70>;
+def __nvvm_f2tf32_rna_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_89, PTX81>;
+def __nvvm_f2tf32_rn : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>;
+def __nvvm_f2tf32_rn_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, 
PTX78>;
+def __nvvm_f2tf32_rn_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
+def __nvvm_f2tf32_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
+def __nvvm_f2tf32_rz : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>;
+def __nvvm_f2tf32_rz_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, 
PTX78>;
+def __nvvm_f2tf32_rz_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
+def __nvvm_f2tf32_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
 
 def __nvvm_ff_to_e4m3x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM_89, PTX81>;
 def __nvvm_ff_to_e4m3x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM_89, PTX81>;
@@ -596,6 +605,28 @@ def __nvvm_e4m3x2_to_f16x2_rn_relu : 
NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh
 def __nvvm_e5m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM_89, PTX81>;
 def __nvvm_e5m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM_89, PTX81>;
 
+def __nvvm_ff_to_e2m3x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e2m3x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e3m2x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e3m2x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_e2m3x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e2m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e3m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e3m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_ff_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rp_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_bf16x2_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(_Vector<2, 
__bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_bf16x2_to_ue8m0x2_rz_satfinite : 
NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, 
SM_120a]>, PTX86>;
+def __nvvm_bf16x2_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(_Vector<2, 
__bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_bf16x2_to_ue8m0x2_rp_satfinite : 
NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, 
SM_120a]>, PTX86>;
+
+def __nvvm_ue8m0x2_

[clang] [llvm] [NVPTX] Add builtins and intrinsics for conversions of new FP types (PR #134345)

2025-04-10 Thread Srinivasa Ravi via cfe-commits


@@ -703,6 +703,46 @@ let hasSideEffects = false in {
   defm CVT_to_tf32_rz_satf : CVT_TO_TF32<"rz.satfinite", [hasPTX<86>, 
hasSM<100>]>;
   defm CVT_to_tf32_rn_relu_satf  : CVT_TO_TF32<"rn.relu.satfinite", 
[hasPTX<86>, hasSM<100>]>;
   defm CVT_to_tf32_rz_relu_satf  : CVT_TO_TF32<"rz.relu.satfinite", 
[hasPTX<86>, hasSM<100>]>;
+
+  // FP6 conversions.
+  class CVT_f32_to_f6x2

Wolfram70 wrote:

Oh yes, that makes sense. Removed these classes in the latest revision.

https://github.com/llvm/llvm-project/pull/134345
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [clang][NVPTX] Add builtins and intrinsics for conversions of new FP types (PR #134345)

2025-04-10 Thread Srinivasa Ravi via cfe-commits

https://github.com/Wolfram70 updated 
https://github.com/llvm/llvm-project/pull/134345

>From 18c6641b676ac90bfe300df48d0b8719df1f1add Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi 
Date: Wed, 5 Mar 2025 12:35:39 +0530
Subject: [PATCH] [NVPTX] Add intrinsics for cvt .f6x2 and .ue8m0x2 variants

This change adds NVVM intrinsics and clang builtins for the cvt
instruction variants of types .e2m3x2, .e3m2x2, and .ue8m0x2 introduced
in PTX 8.6 for `sm_100a`, `sm_101a`, and `sm_120a`.

Tests are added in `NVPTX/convert-sm1XXa.ll` and
`clang/test/CodeGen/builtins-nvptx.c` and verified through ptxas 12.8.0.

PTX Spec Reference: 
https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt
---
 clang/include/clang/Basic/BuiltinsNVPTX.td |  31 +++
 clang/test/CodeGen/builtins-nvptx.c| 150 ++-
 llvm/include/llvm/IR/IntrinsicsNVVM.td |  31 +++
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td|  35 +++
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td   |  56 
 llvm/test/CodeGen/NVPTX/convert-sm100a.ll  | 290 +
 6 files changed, 589 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm100a.ll

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td 
b/clang/include/clang/Basic/BuiltinsNVPTX.td
index 61e48b31c244b..d240b1a8d0d16 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -580,6 +580,15 @@ def __nvvm_f2bf16_rz : 
NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>;
 def __nvvm_f2bf16_rz_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, 
PTX70>;
 
 def __nvvm_f2tf32_rna : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_80, PTX70>;
+def __nvvm_f2tf32_rna_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_89, PTX81>;
+def __nvvm_f2tf32_rn : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>;
+def __nvvm_f2tf32_rn_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, 
PTX78>;
+def __nvvm_f2tf32_rn_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
+def __nvvm_f2tf32_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
+def __nvvm_f2tf32_rz : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>;
+def __nvvm_f2tf32_rz_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, 
PTX78>;
+def __nvvm_f2tf32_rz_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
+def __nvvm_f2tf32_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
 
 def __nvvm_ff_to_e4m3x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM_89, PTX81>;
 def __nvvm_ff_to_e4m3x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM_89, PTX81>;
@@ -596,6 +605,28 @@ def __nvvm_e4m3x2_to_f16x2_rn_relu : 
NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh
 def __nvvm_e5m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM_89, PTX81>;
 def __nvvm_e5m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM_89, PTX81>;
 
+def __nvvm_ff_to_e2m3x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e2m3x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e3m2x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e3m2x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_e2m3x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e2m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e3m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e3m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_ff_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rp_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_bf16x2_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(_Vector<2, 
__bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_bf16x2_to_ue8m0x2_rz_satfinite : 
NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, 
SM_120a]>, PTX86>;
+def __nvvm_bf16x2_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(_Vector<2, 
__bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_bf16x2_to_ue8m0x2_rp_satfinite : 
NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, 
SM_120a]>, PTX86>;
+
+def __nvvm_ue8m0x2_

[clang] [llvm] [NVPTX] Add builtins and intrinsics for conversions of new FP types (PR #134345)

2025-04-10 Thread Srinivasa Ravi via cfe-commits


@@ -703,6 +703,53 @@ let hasSideEffects = false in {
   defm CVT_to_tf32_rz_satf : CVT_TO_TF32<"rz.satfinite", [hasPTX<86>, 
hasSM<100>]>;
   defm CVT_to_tf32_rn_relu_satf  : CVT_TO_TF32<"rn.relu.satfinite", 
[hasPTX<86>, hasSM<100>]>;
   defm CVT_to_tf32_rz_relu_satf  : CVT_TO_TF32<"rz.relu.satfinite", 
[hasPTX<86>, hasSM<100>]>;
+
+  // FP6 conversions.
+  multiclass CVT_TO_F6X2 {

Wolfram70 wrote:

Simplified these definitions in the latest revision.

https://github.com/llvm/llvm-project/pull/134345
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [NVPTX] Add builtins and intrinsics for conversions of new FP types (PR #134345)

2025-04-10 Thread Srinivasa Ravi via cfe-commits


@@ -1548,6 +1548,45 @@ let TargetPrefix = "nvvm" in {
   Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
   def int_nvvm_e5m2x2_to_f16x2_rn_relu : 
ClangBuiltin<"__nvvm_e5m2x2_to_f16x2_rn_relu">,
   Intrinsic<[llvm_v2f16_ty], [llvm_i16_ty], [IntrNoMem, IntrNoCallback]>;
+  
+  def int_nvvm_ff_to_e2m3x2_rn : ClangBuiltin<"__nvvm_ff_to_e2m3x2_rn">,
+  Intrinsic<[llvm_i16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem, 
IntrNoCallback]>;
+  def int_nvvm_ff_to_e2m3x2_rn_relu : 
ClangBuiltin<"__nvvm_ff_to_e2m3x2_rn_relu">,

Wolfram70 wrote:

I chose this name for the intrinsics to maintain consistency since all other 
CVT intrinsics with two `llvm_float_ty` as inputs (like the `f32` to `e4m3x2`, 
`e5m2x2`, `f16x2` and `bf16x2` conversions) appear to use `ff` in the name 
instead of `f32x2`.

https://github.com/llvm/llvm-project/pull/134345
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [NVPTX] Add builtins and intrinsics for conversions of new FP types (PR #134345)

2025-04-10 Thread Srinivasa Ravi via cfe-commits

https://github.com/Wolfram70 updated 
https://github.com/llvm/llvm-project/pull/134345

>From 648bfd77d48c395ae81a1d28fa6d925e1e75c02a Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi 
Date: Wed, 5 Mar 2025 12:35:39 +0530
Subject: [PATCH] [NVPTX] Add intrinsics for cvt .f6x2 and .ue8m0x2 variants

This change adds NVVM intrinsics and clang builtins for the cvt
instruction variants of types .e2m3x2, .e3m2x2, and .ue8m0x2 introduced
in PTX 8.6 for `sm_100a`, `sm_101a`, and `sm_120a`.

Tests are added in `NVPTX/convert-sm1XXa.ll` and
`clang/test/CodeGen/builtins-nvptx.c` and verified through ptxas 12.8.0.

PTX Spec Reference: 
https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt
---
 clang/include/clang/Basic/BuiltinsNVPTX.td |  31 +++
 clang/test/CodeGen/builtins-nvptx.c| 150 ++-
 llvm/include/llvm/IR/IntrinsicsNVVM.td |  31 +++
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td|  40 +++
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td   |  56 
 llvm/test/CodeGen/NVPTX/convert-sm100a.ll  | 290 +
 6 files changed, 594 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm100a.ll

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td 
b/clang/include/clang/Basic/BuiltinsNVPTX.td
index 61e48b31c244b..d240b1a8d0d16 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -580,6 +580,15 @@ def __nvvm_f2bf16_rz : 
NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>;
 def __nvvm_f2bf16_rz_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, 
PTX70>;
 
 def __nvvm_f2tf32_rna : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_80, PTX70>;
+def __nvvm_f2tf32_rna_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_89, PTX81>;
+def __nvvm_f2tf32_rn : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>;
+def __nvvm_f2tf32_rn_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, 
PTX78>;
+def __nvvm_f2tf32_rn_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
+def __nvvm_f2tf32_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
+def __nvvm_f2tf32_rz : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>;
+def __nvvm_f2tf32_rz_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, 
PTX78>;
+def __nvvm_f2tf32_rz_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
+def __nvvm_f2tf32_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
 
 def __nvvm_ff_to_e4m3x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM_89, PTX81>;
 def __nvvm_ff_to_e4m3x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM_89, PTX81>;
@@ -596,6 +605,28 @@ def __nvvm_e4m3x2_to_f16x2_rn_relu : 
NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh
 def __nvvm_e5m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM_89, PTX81>;
 def __nvvm_e5m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM_89, PTX81>;
 
+def __nvvm_ff_to_e2m3x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e2m3x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e3m2x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e3m2x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_e2m3x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e2m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e3m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e3m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_ff_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rp_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_bf16x2_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(_Vector<2, 
__bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_bf16x2_to_ue8m0x2_rz_satfinite : 
NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, 
SM_120a]>, PTX86>;
+def __nvvm_bf16x2_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(_Vector<2, 
__bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_bf16x2_to_ue8m0x2_rp_satfinite : 
NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, 
SM_120a]>, PTX86>;
+
+def __nvvm_ue8m0x2_

[clang] [llvm] [NVPTX] Add builtins and intrinsics for conversions of new FP types (PR #134345)

2025-04-10 Thread Srinivasa Ravi via cfe-commits

https://github.com/Wolfram70 updated 
https://github.com/llvm/llvm-project/pull/134345

>From 1f59834f760c7880bd958f82853f253816b1e653 Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi 
Date: Wed, 5 Mar 2025 12:35:39 +0530
Subject: [PATCH] [NVPTX] Add intrinsics for cvt .f6x2 and .ue8m0x2 variants

This change adds NVVM intrinsics and clang builtins for the cvt
instruction variants of types .e2m3x2, .e3m2x2, and .ue8m0x2 introduced
in PTX 8.6 for `sm_100a`, `sm_101a`, and `sm_120a`.

Tests are added in `NVPTX/convert-sm1XXa.ll` and
`clang/test/CodeGen/builtins-nvptx.c` and verified through ptxas 12.8.0.

PTX Spec Reference: 
https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt
---
 clang/include/clang/Basic/BuiltinsNVPTX.td |  31 +++
 clang/test/CodeGen/builtins-nvptx.c| 150 ++-
 llvm/include/llvm/IR/IntrinsicsNVVM.td |  27 ++
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td|  40 +++
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td   |  56 
 llvm/test/CodeGen/NVPTX/convert-sm100a.ll  | 290 +
 6 files changed, 590 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm100a.ll

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td 
b/clang/include/clang/Basic/BuiltinsNVPTX.td
index 61e48b31c244b..d240b1a8d0d16 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -580,6 +580,15 @@ def __nvvm_f2bf16_rz : 
NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>;
 def __nvvm_f2bf16_rz_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, 
PTX70>;
 
 def __nvvm_f2tf32_rna : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_80, PTX70>;
+def __nvvm_f2tf32_rna_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_89, PTX81>;
+def __nvvm_f2tf32_rn : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>;
+def __nvvm_f2tf32_rn_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, 
PTX78>;
+def __nvvm_f2tf32_rn_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
+def __nvvm_f2tf32_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
+def __nvvm_f2tf32_rz : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>;
+def __nvvm_f2tf32_rz_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, 
PTX78>;
+def __nvvm_f2tf32_rz_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
+def __nvvm_f2tf32_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
 
 def __nvvm_ff_to_e4m3x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM_89, PTX81>;
 def __nvvm_ff_to_e4m3x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM_89, PTX81>;
@@ -596,6 +605,28 @@ def __nvvm_e4m3x2_to_f16x2_rn_relu : 
NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh
 def __nvvm_e5m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM_89, PTX81>;
 def __nvvm_e5m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM_89, PTX81>;
 
+def __nvvm_ff_to_e2m3x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e2m3x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e3m2x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e3m2x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_e2m3x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e2m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e3m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e3m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_ff_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rp_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_bf16x2_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(_Vector<2, 
__bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_bf16x2_to_ue8m0x2_rz_satfinite : 
NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, 
SM_120a]>, PTX86>;
+def __nvvm_bf16x2_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(_Vector<2, 
__bf16>)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_bf16x2_to_ue8m0x2_rp_satfinite : 
NVPTXBuiltinSMAndPTX<"short(_Vector<2, __bf16>)", SM<"100a", [SM_101a, 
SM_120a]>, PTX86>;
+
+def __nvvm_ue8m0x2_t

[clang] [llvm] [clang][NVPTX] Add builtins and intrinsics for conversions of new FP types (PR #134345)

2025-04-15 Thread Srinivasa Ravi via cfe-commits


@@ -703,6 +703,41 @@ let hasSideEffects = false in {
   defm CVT_to_tf32_rz_satf : CVT_TO_TF32<"rz.satfinite", [hasPTX<86>, 
hasSM<100>]>;
   defm CVT_to_tf32_rn_relu_satf  : CVT_TO_TF32<"rn.relu.satfinite", 
[hasPTX<86>, hasSM<100>]>;
   defm CVT_to_tf32_rz_relu_satf  : CVT_TO_TF32<"rz.relu.satfinite", 
[hasPTX<86>, hasSM<100>]>;
+
+  // FP6 conversions.
+  foreach type = ["e2m3x2", "e3m2x2"] in {
+def CVT_ # type # _f32 : NVPTXInst<(outs Int16Regs:$dst),

Wolfram70 wrote:

I had initially omitted the suffix since we appear to do that for the only 
other case with the `.satfinite` modifier mandatorily required (the `f32` to 
`f8x2` conversions). But I agree that it would be better to have all the 
modifiers as a suffix in the intrinsic and builtin names since we do that for 
everything else. I have updated the names of those intrinsics and builtins now. 
Thanks!

https://github.com/llvm/llvm-project/pull/134345
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [NVPTX] Add intrinsics for cvt .f6x2 and .ue8m0x2 variants (PR #134345)

2025-04-03 Thread Srinivasa Ravi via cfe-commits

https://github.com/Wolfram70 created 
https://github.com/llvm/llvm-project/pull/134345

This change adds NVVM intrinsics and clang builtins for the cvt instruction 
variants of types `.e2m3x2`, `.e3m2x2`, and `.ue8m0x2` introduced in PTX 8.6 
for `sm_100a`, `sm_101a`, and `sm_120a`.

Tests are added in `NVPTX/convert-sm100a.ll` and
`clang/test/CodeGen/builtins-nvptx.c` and verified through ptxas 12.8.0.

PTX Spec Reference: 
https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt

>From 184b9a4e591f562fcc75f341500eb74e39ec9105 Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi 
Date: Wed, 5 Mar 2025 12:35:39 +0530
Subject: [PATCH] [NVPTX] Add intrinsics for cvt .f6x2 and .ue8m0x2 variants

This change adds NVVM intrinsics and clang builtins for the cvt
instruction variants of types .e2m3x2, .e3m2x2, and .ue8m0x2 introduced
in PTX 8.6 for `sm_100a`, `sm_101a`, and `sm_120a`.

Tests are added in `NVPTX/convert-sm1XXa.ll` and
`clang/test/CodeGen/builtins-nvptx.c` and verified through ptxas 12.8.0.

PTX Spec Reference: 
https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt
---
 clang/include/clang/Basic/BuiltinsNVPTX.td |  31 +++
 clang/test/CodeGen/builtins-nvptx.c| 191 +-
 llvm/include/llvm/IR/IntrinsicsNVVM.td |  39 +++
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td|  47 
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td   |  56 
 llvm/test/CodeGen/NVPTX/convert-sm100a.ll  | 290 +
 6 files changed, 650 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/convert-sm100a.ll

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td 
b/clang/include/clang/Basic/BuiltinsNVPTX.td
index 61e48b31c244b..d240b1a8d0d16 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -580,6 +580,15 @@ def __nvvm_f2bf16_rz : 
NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, PTX70>;
 def __nvvm_f2bf16_rz_relu : NVPTXBuiltinSMAndPTX<"__bf16(float)", SM_80, 
PTX70>;
 
 def __nvvm_f2tf32_rna : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_80, PTX70>;
+def __nvvm_f2tf32_rna_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_89, PTX81>;
+def __nvvm_f2tf32_rn : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>;
+def __nvvm_f2tf32_rn_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, 
PTX78>;
+def __nvvm_f2tf32_rn_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
+def __nvvm_f2tf32_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
+def __nvvm_f2tf32_rz : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, PTX78>;
+def __nvvm_f2tf32_rz_relu : NVPTXBuiltinSMAndPTX<"int32_t(float)", SM_90, 
PTX78>;
+def __nvvm_f2tf32_rz_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
+def __nvvm_f2tf32_rz_relu_satfinite : NVPTXBuiltinSMAndPTX<"int32_t(float)", 
SM_100, PTX86>;
 
 def __nvvm_ff_to_e4m3x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM_89, PTX81>;
 def __nvvm_ff_to_e4m3x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM_89, PTX81>;
@@ -596,6 +605,28 @@ def __nvvm_e4m3x2_to_f16x2_rn_relu : 
NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh
 def __nvvm_e5m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM_89, PTX81>;
 def __nvvm_e5m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM_89, PTX81>;
 
+def __nvvm_ff_to_e2m3x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e2m3x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e3m2x2_rn : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e3m2x2_rn_relu : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_e2m3x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e2m3x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e3m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e3m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_ff_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_ue8m0x2_rp_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_bf16x2_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(_Vector<2, 
__bf16>)", SM<"100a", [SM

[clang] [llvm] [NVPTX] Add intrinsics and clang builtins for conversions of f4x2 type (PR #139244)

2025-05-09 Thread Srinivasa Ravi via cfe-commits

https://github.com/Wolfram70 edited 
https://github.com/llvm/llvm-project/pull/139244
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [NVPTX] Add intrinsics and clang builtins for conversions of f4x2 type (PR #139244)

2025-05-09 Thread Srinivasa Ravi via cfe-commits

https://github.com/Wolfram70 created 
https://github.com/llvm/llvm-project/pull/139244

This change adds intrinsics and clang builtins for the cvt instruction 
variants of type (FP4) `.e2m1x2` introduced in PTX 8.6 for `sm_100a`, 
`sm_101a`, and `sm_120a`.

Tests are added in `NVPTX/convert-sm100a.ll` and
`clang/test/CodeGen/builtins-nvptx.c` and verified through ptxas 12.8.0.

PTX Spec Reference: 
https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt

>From 63c35e8a005cd038c96005a23b93b8d9a503d6f8 Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi 
Date: Wed, 7 May 2025 14:41:48 +0530
Subject: [PATCH] [NVPTX] Add intrinsics and clang builtins for conversions of
 f4x2 type

This change adds intrinsics and clang builtins for the cvt instruction
variants of type (FP4) `.e2m1x2` introduced in PTX 8.6 for `sm_100a`,
`sm_101a`, and `sm_120a`.

Tests are added in `NVPTX/convert-sm100a.ll` and
`clang/test/CodeGen/builtins-nvptx.c` and verified through ptxas 12.8.0.

PTX Spec Reference: 
https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt
---
 clang/include/clang/Basic/BuiltinsNVPTX.td |  6 ++
 clang/test/CodeGen/builtins-nvptx.c| 20 ++
 llvm/include/llvm/IR/IntrinsicsNVVM.td |  7 ++
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td| 17 +
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td   | 14 
 llvm/test/CodeGen/NVPTX/convert-sm100a.ll  | 82 ++
 6 files changed, 146 insertions(+)

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td 
b/clang/include/clang/Basic/BuiltinsNVPTX.td
index f797e29fe66a3..2cea44e224674 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -620,6 +620,12 @@ def __nvvm_e2m3x2_to_f16x2_rn_relu : 
NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh
 def __nvvm_e3m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
 def __nvvm_e3m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
 
+def __nvvm_ff_to_e2m1x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e2m1x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_e2m1x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e2m1x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
 def __nvvm_ff_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
 def __nvvm_ff_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
 def __nvvm_ff_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
diff --git a/clang/test/CodeGen/builtins-nvptx.c 
b/clang/test/CodeGen/builtins-nvptx.c
index 639c18190f436..7904762709df6 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -1127,6 +1127,26 @@ __device__ void nvvm_cvt_sm100a_sm101a_sm120a() {
   // CHECK_PTX86_SM120a: call <2 x half> 
@llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532)
   __nvvm_e3m2x2_to_f16x2_rn_relu(0x4C4C);
 
+  // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m1x2.rn.satfinite(float 
1.00e+00, float 1.00e+00)
+  // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m1x2.rn.satfinite(float 
1.00e+00, float 1.00e+00)
+  // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m1x2.rn.satfinite(float 
1.00e+00, float 1.00e+00)
+  __nvvm_ff_to_e2m1x2_rn_satfinite(1.0f, 1.0f);
+
+  // CHECK_PTX86_SM100a: call i16 
@llvm.nvvm.ff.to.e2m1x2.rn.relu.satfinite(float 1.00e+00, float 
1.00e+00)
+  // CHECK_PTX86_SM101a: call i16 
@llvm.nvvm.ff.to.e2m1x2.rn.relu.satfinite(float 1.00e+00, float 
1.00e+00)
+  // CHECK_PTX86_SM120a: call i16 
@llvm.nvvm.ff.to.e2m1x2.rn.relu.satfinite(float 1.00e+00, float 
1.00e+00)
+  __nvvm_ff_to_e2m1x2_rn_relu_satfinite(1.0f, 1.0f);
+  
+  // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn(i16 76)
+  // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn(i16 76)
+  // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn(i16 76)
+  __nvvm_e2m1x2_to_f16x2_rn(0x004C);
+  
+  // CHECK_PTX86_SM100a: call <2 x half> 
@llvm.nvvm.e2m1x2.to.f16x2.rn.relu(i16 76)
+  // CHECK_PTX86_SM101a: call <2 x half> 
@llvm.nvvm.e2m1x2.to.f16x2.rn.relu(i16 76)
+  // CHECK_PTX86_SM120a: call <2 x half> 
@llvm.nvvm.e2m1x2.to.f16x2.rn.relu(i16 76)
+  __nvvm_e2m1x2_to_f16x2_rn_relu(0x004C);
+
   // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 
1.00e+00, float 1.00e+00)
   // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 
1.00e+00, f

[clang] [llvm] [NVPTX] Add intrinsics and clang builtins for conversions of f4x2 type (PR #139244)

2025-05-11 Thread Srinivasa Ravi via cfe-commits

https://github.com/Wolfram70 updated 
https://github.com/llvm/llvm-project/pull/139244

>From 0f236de49493d7fb7c1ebee69065b15c9bc07eca Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi 
Date: Wed, 7 May 2025 14:41:48 +0530
Subject: [PATCH] [NVPTX] Add intrinsics and clang builtins for conversions of
 f4x2 type

This change adds intrinsics and clang builtins for the cvt instruction
variants of type (FP4) `.e2m1x2`, introduced in PTX 8.6 for `sm_100a`,
`sm_101a`, and `sm_120a`.

Tests are added in `NVPTX/convert-sm100a.ll` and
`clang/test/CodeGen/builtins-nvptx.c` and verified through ptxas 12.8.0.

PTX Spec Reference: 
https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt
---
 clang/include/clang/Basic/BuiltinsNVPTX.td |  6 ++
 clang/test/CodeGen/builtins-nvptx.c| 20 ++
 llvm/include/llvm/IR/IntrinsicsNVVM.td |  9 ++-
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td| 17 +
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td   | 14 
 llvm/test/CodeGen/NVPTX/convert-sm100a.ll  | 82 ++
 6 files changed, 147 insertions(+), 1 deletion(-)

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td 
b/clang/include/clang/Basic/BuiltinsNVPTX.td
index f797e29fe66a3..2cea44e224674 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -620,6 +620,12 @@ def __nvvm_e2m3x2_to_f16x2_rn_relu : 
NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh
 def __nvvm_e3m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
 def __nvvm_e3m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
 
+def __nvvm_ff_to_e2m1x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e2m1x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_e2m1x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e2m1x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
 def __nvvm_ff_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
 def __nvvm_ff_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
 def __nvvm_ff_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
diff --git a/clang/test/CodeGen/builtins-nvptx.c 
b/clang/test/CodeGen/builtins-nvptx.c
index 639c18190f436..7904762709df6 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -1127,6 +1127,26 @@ __device__ void nvvm_cvt_sm100a_sm101a_sm120a() {
   // CHECK_PTX86_SM120a: call <2 x half> 
@llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532)
   __nvvm_e3m2x2_to_f16x2_rn_relu(0x4C4C);
 
+  // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m1x2.rn.satfinite(float 
1.00e+00, float 1.00e+00)
+  // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m1x2.rn.satfinite(float 
1.00e+00, float 1.00e+00)
+  // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m1x2.rn.satfinite(float 
1.00e+00, float 1.00e+00)
+  __nvvm_ff_to_e2m1x2_rn_satfinite(1.0f, 1.0f);
+
+  // CHECK_PTX86_SM100a: call i16 
@llvm.nvvm.ff.to.e2m1x2.rn.relu.satfinite(float 1.00e+00, float 
1.00e+00)
+  // CHECK_PTX86_SM101a: call i16 
@llvm.nvvm.ff.to.e2m1x2.rn.relu.satfinite(float 1.00e+00, float 
1.00e+00)
+  // CHECK_PTX86_SM120a: call i16 
@llvm.nvvm.ff.to.e2m1x2.rn.relu.satfinite(float 1.00e+00, float 
1.00e+00)
+  __nvvm_ff_to_e2m1x2_rn_relu_satfinite(1.0f, 1.0f);
+  
+  // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn(i16 76)
+  // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn(i16 76)
+  // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn(i16 76)
+  __nvvm_e2m1x2_to_f16x2_rn(0x004C);
+  
+  // CHECK_PTX86_SM100a: call <2 x half> 
@llvm.nvvm.e2m1x2.to.f16x2.rn.relu(i16 76)
+  // CHECK_PTX86_SM101a: call <2 x half> 
@llvm.nvvm.e2m1x2.to.f16x2.rn.relu(i16 76)
+  // CHECK_PTX86_SM120a: call <2 x half> 
@llvm.nvvm.e2m1x2.to.f16x2.rn.relu(i16 76)
+  __nvvm_e2m1x2_to_f16x2_rn_relu(0x004C);
+
   // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 
1.00e+00, float 1.00e+00)
   // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 
1.00e+00, float 1.00e+00)
   // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 
1.00e+00, float 1.00e+00)
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td 
b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 8b87822d3fdda..c5f3a35f1f901 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1663,11 +1663,18 @@ let TargetPrefix = "nvvm" in {

[clang] [llvm] [NVPTX] Add intrinsics and clang builtins for conversions of f4x2 type (PR #139244)

2025-05-13 Thread Srinivasa Ravi via cfe-commits

https://github.com/Wolfram70 updated 
https://github.com/llvm/llvm-project/pull/139244



  



Rate limit · GitHub


  body {
background-color: #f6f8fa;
color: #24292e;
font-family: -apple-system,BlinkMacSystemFont,Segoe 
UI,Helvetica,Arial,sans-serif,Apple Color Emoji,Segoe UI Emoji,Segoe UI Symbol;
font-size: 14px;
line-height: 1.5;
margin: 0;
  }

  .container { margin: 50px auto; max-width: 600px; text-align: center; 
padding: 0 24px; }

  a { color: #0366d6; text-decoration: none; }
  a:hover { text-decoration: underline; }

  h1 { line-height: 60px; font-size: 48px; font-weight: 300; margin: 0px; 
text-shadow: 0 1px 0 #fff; }
  p { color: rgba(0, 0, 0, 0.5); margin: 20px 0 40px; }

  ul { list-style: none; margin: 25px 0; padding: 0; }
  li { display: table-cell; font-weight: bold; width: 1%; }

  .logo { display: inline-block; margin-top: 35px; }
  .logo-img-2x { display: none; }
  @media
  only screen and (-webkit-min-device-pixel-ratio: 2),
  only screen and (   min--moz-device-pixel-ratio: 2),
  only screen and ( -o-min-device-pixel-ratio: 2/1),
  only screen and (min-device-pixel-ratio: 2),
  only screen and (min-resolution: 192dpi),
  only screen and (min-resolution: 2dppx) {
.logo-img-1x { display: none; }
.logo-img-2x { display: inline-block; }
  }

  #suggestions {
margin-top: 35px;
color: #ccc;
  }
  #suggestions a {
color: #666;
font-weight: 200;
font-size: 14px;
margin: 0 10px;
  }


  
  



  Whoa there!
  You have exceeded a secondary rate limit.
Please wait a few minutes before you try again;
in some cases this may take up to an hour.
  
  
https://support.github.com/contact";>Contact Support —
https://githubstatus.com";>GitHub Status —
https://twitter.com/githubstatus";>@githubstatus
  

  

  

  

  

  


___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [NVPTX] Add intrinsics and clang builtins for conversions of f4x2 type (PR #139244)

2025-05-14 Thread Srinivasa Ravi via cfe-commits

https://github.com/Wolfram70 closed 
https://github.com/llvm/llvm-project/pull/139244
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [NVPTX] Add intrinsics and clang builtins for conversions of f4x2 type (PR #139244)

2025-05-14 Thread Srinivasa Ravi via cfe-commits

https://github.com/Wolfram70 updated 
https://github.com/llvm/llvm-project/pull/139244



  



Rate limit · GitHub


  body {
background-color: #f6f8fa;
color: #24292e;
font-family: -apple-system,BlinkMacSystemFont,Segoe 
UI,Helvetica,Arial,sans-serif,Apple Color Emoji,Segoe UI Emoji,Segoe UI Symbol;
font-size: 14px;
line-height: 1.5;
margin: 0;
  }

  .container { margin: 50px auto; max-width: 600px; text-align: center; 
padding: 0 24px; }

  a { color: #0366d6; text-decoration: none; }
  a:hover { text-decoration: underline; }

  h1 { line-height: 60px; font-size: 48px; font-weight: 300; margin: 0px; 
text-shadow: 0 1px 0 #fff; }
  p { color: rgba(0, 0, 0, 0.5); margin: 20px 0 40px; }

  ul { list-style: none; margin: 25px 0; padding: 0; }
  li { display: table-cell; font-weight: bold; width: 1%; }

  .logo { display: inline-block; margin-top: 35px; }
  .logo-img-2x { display: none; }
  @media
  only screen and (-webkit-min-device-pixel-ratio: 2),
  only screen and (   min--moz-device-pixel-ratio: 2),
  only screen and ( -o-min-device-pixel-ratio: 2/1),
  only screen and (min-device-pixel-ratio: 2),
  only screen and (min-resolution: 192dpi),
  only screen and (min-resolution: 2dppx) {
.logo-img-1x { display: none; }
.logo-img-2x { display: inline-block; }
  }

  #suggestions {
margin-top: 35px;
color: #ccc;
  }
  #suggestions a {
color: #666;
font-weight: 200;
font-size: 14px;
margin: 0 10px;
  }


  
  



  Whoa there!
  You have exceeded a secondary rate limit.
Please wait a few minutes before you try again;
in some cases this may take up to an hour.
  
  
https://support.github.com/contact";>Contact Support —
https://githubstatus.com";>GitHub Status —
https://twitter.com/githubstatus";>@githubstatus
  

  

  

  

  

  


___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[clang] [llvm] [NVPTX] Add intrinsics and clang builtins for conversions of f4x2 type (PR #139244)

2025-05-14 Thread Srinivasa Ravi via cfe-commits

https://github.com/Wolfram70 updated 
https://github.com/llvm/llvm-project/pull/139244

>From 63b1be08a72815c3fc195ac2356a29b61ab6d1c7 Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi 
Date: Wed, 7 May 2025 14:41:48 +0530
Subject: [PATCH] [NVPTX] Add intrinsics and clang builtins for conversions of
 f4x2 type

This change adds intrinsics and clang builtins for the cvt instruction
variants of type (FP4) `.e2m1x2`, introduced in PTX 8.6 for `sm_100a`,
`sm_101a`, and `sm_120a`.

Tests are added in `NVPTX/convert-sm100a.ll` and
`clang/test/CodeGen/builtins-nvptx.c` and verified through ptxas 12.8.0.

PTX Spec Reference: 
https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt
---
 clang/include/clang/Basic/BuiltinsNVPTX.td |  6 ++
 clang/test/CodeGen/builtins-nvptx.c| 20 ++
 llvm/include/llvm/IR/IntrinsicsNVVM.td | 11 ++-
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td| 17 +
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td   | 14 
 llvm/test/CodeGen/NVPTX/convert-sm100a.ll  | 82 ++
 6 files changed, 149 insertions(+), 1 deletion(-)

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td 
b/clang/include/clang/Basic/BuiltinsNVPTX.td
index f797e29fe66a3..2cea44e224674 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -620,6 +620,12 @@ def __nvvm_e2m3x2_to_f16x2_rn_relu : 
NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh
 def __nvvm_e3m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
 def __nvvm_e3m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
 
+def __nvvm_ff_to_e2m1x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e2m1x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_e2m1x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e2m1x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
 def __nvvm_ff_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
 def __nvvm_ff_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
 def __nvvm_ff_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
diff --git a/clang/test/CodeGen/builtins-nvptx.c 
b/clang/test/CodeGen/builtins-nvptx.c
index 639c18190f436..7904762709df6 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -1127,6 +1127,26 @@ __device__ void nvvm_cvt_sm100a_sm101a_sm120a() {
   // CHECK_PTX86_SM120a: call <2 x half> 
@llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532)
   __nvvm_e3m2x2_to_f16x2_rn_relu(0x4C4C);
 
+  // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m1x2.rn.satfinite(float 
1.00e+00, float 1.00e+00)
+  // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m1x2.rn.satfinite(float 
1.00e+00, float 1.00e+00)
+  // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m1x2.rn.satfinite(float 
1.00e+00, float 1.00e+00)
+  __nvvm_ff_to_e2m1x2_rn_satfinite(1.0f, 1.0f);
+
+  // CHECK_PTX86_SM100a: call i16 
@llvm.nvvm.ff.to.e2m1x2.rn.relu.satfinite(float 1.00e+00, float 
1.00e+00)
+  // CHECK_PTX86_SM101a: call i16 
@llvm.nvvm.ff.to.e2m1x2.rn.relu.satfinite(float 1.00e+00, float 
1.00e+00)
+  // CHECK_PTX86_SM120a: call i16 
@llvm.nvvm.ff.to.e2m1x2.rn.relu.satfinite(float 1.00e+00, float 
1.00e+00)
+  __nvvm_ff_to_e2m1x2_rn_relu_satfinite(1.0f, 1.0f);
+  
+  // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn(i16 76)
+  // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn(i16 76)
+  // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn(i16 76)
+  __nvvm_e2m1x2_to_f16x2_rn(0x004C);
+  
+  // CHECK_PTX86_SM100a: call <2 x half> 
@llvm.nvvm.e2m1x2.to.f16x2.rn.relu(i16 76)
+  // CHECK_PTX86_SM101a: call <2 x half> 
@llvm.nvvm.e2m1x2.to.f16x2.rn.relu(i16 76)
+  // CHECK_PTX86_SM120a: call <2 x half> 
@llvm.nvvm.e2m1x2.to.f16x2.rn.relu(i16 76)
+  __nvvm_e2m1x2_to_f16x2_rn_relu(0x004C);
+
   // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 
1.00e+00, float 1.00e+00)
   // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 
1.00e+00, float 1.00e+00)
   // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 
1.00e+00, float 1.00e+00)
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td 
b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 5be1a915a06a7..0b26bb9829005 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1293,10 +1293,19 @@ let TargetPrefix = "nvvm" in {

[clang] [llvm] [NVPTX] Add intrinsics and clang builtins for conversions of f4x2 type (PR #139244)

2025-05-13 Thread Srinivasa Ravi via cfe-commits

https://github.com/Wolfram70 updated 
https://github.com/llvm/llvm-project/pull/139244

>From a9588758d03a80ce055725d7d7db0b0deb685f96 Mon Sep 17 00:00:00 2001
From: Srinivasa Ravi 
Date: Wed, 7 May 2025 14:41:48 +0530
Subject: [PATCH] [NVPTX] Add intrinsics and clang builtins for conversions of
 f4x2 type

This change adds intrinsics and clang builtins for the cvt instruction
variants of type (FP4) `.e2m1x2`, introduced in PTX 8.6 for `sm_100a`,
`sm_101a`, and `sm_120a`.

Tests are added in `NVPTX/convert-sm100a.ll` and
`clang/test/CodeGen/builtins-nvptx.c` and verified through ptxas 12.8.0.

PTX Spec Reference: 
https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cvt
---
 clang/include/clang/Basic/BuiltinsNVPTX.td |  6 ++
 clang/test/CodeGen/builtins-nvptx.c| 20 ++
 llvm/include/llvm/IR/IntrinsicsNVVM.td |  9 ++-
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td| 17 +
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td   | 14 
 llvm/test/CodeGen/NVPTX/convert-sm100a.ll  | 82 ++
 6 files changed, 147 insertions(+), 1 deletion(-)

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.td 
b/clang/include/clang/Basic/BuiltinsNVPTX.td
index f797e29fe66a3..2cea44e224674 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.td
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.td
@@ -620,6 +620,12 @@ def __nvvm_e2m3x2_to_f16x2_rn_relu : 
NVPTXBuiltinSMAndPTX<"_Vector<2, __fp16>(sh
 def __nvvm_e3m2x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
 def __nvvm_e3m2x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
 
+def __nvvm_ff_to_e2m1x2_rn_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_ff_to_e2m1x2_rn_relu_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
+def __nvvm_e2m1x2_to_f16x2_rn : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+def __nvvm_e2m1x2_to_f16x2_rn_relu : NVPTXBuiltinSMAndPTX<"_Vector<2, 
__fp16>(short)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
+
 def __nvvm_ff_to_ue8m0x2_rz : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
 def __nvvm_ff_to_ue8m0x2_rz_satfinite : NVPTXBuiltinSMAndPTX<"short(float, 
float)", SM<"100a", [SM_101a, SM_120a]>, PTX86>;
 def __nvvm_ff_to_ue8m0x2_rp : NVPTXBuiltinSMAndPTX<"short(float, float)", 
SM<"100a", [SM_101a, SM_120a]>, PTX86>;
diff --git a/clang/test/CodeGen/builtins-nvptx.c 
b/clang/test/CodeGen/builtins-nvptx.c
index 639c18190f436..7904762709df6 100644
--- a/clang/test/CodeGen/builtins-nvptx.c
+++ b/clang/test/CodeGen/builtins-nvptx.c
@@ -1127,6 +1127,26 @@ __device__ void nvvm_cvt_sm100a_sm101a_sm120a() {
   // CHECK_PTX86_SM120a: call <2 x half> 
@llvm.nvvm.e3m2x2.to.f16x2.rn.relu(i16 19532)
   __nvvm_e3m2x2_to_f16x2_rn_relu(0x4C4C);
 
+  // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.e2m1x2.rn.satfinite(float 
1.00e+00, float 1.00e+00)
+  // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.e2m1x2.rn.satfinite(float 
1.00e+00, float 1.00e+00)
+  // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.e2m1x2.rn.satfinite(float 
1.00e+00, float 1.00e+00)
+  __nvvm_ff_to_e2m1x2_rn_satfinite(1.0f, 1.0f);
+
+  // CHECK_PTX86_SM100a: call i16 
@llvm.nvvm.ff.to.e2m1x2.rn.relu.satfinite(float 1.00e+00, float 
1.00e+00)
+  // CHECK_PTX86_SM101a: call i16 
@llvm.nvvm.ff.to.e2m1x2.rn.relu.satfinite(float 1.00e+00, float 
1.00e+00)
+  // CHECK_PTX86_SM120a: call i16 
@llvm.nvvm.ff.to.e2m1x2.rn.relu.satfinite(float 1.00e+00, float 
1.00e+00)
+  __nvvm_ff_to_e2m1x2_rn_relu_satfinite(1.0f, 1.0f);
+  
+  // CHECK_PTX86_SM100a: call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn(i16 76)
+  // CHECK_PTX86_SM101a: call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn(i16 76)
+  // CHECK_PTX86_SM120a: call <2 x half> @llvm.nvvm.e2m1x2.to.f16x2.rn(i16 76)
+  __nvvm_e2m1x2_to_f16x2_rn(0x004C);
+  
+  // CHECK_PTX86_SM100a: call <2 x half> 
@llvm.nvvm.e2m1x2.to.f16x2.rn.relu(i16 76)
+  // CHECK_PTX86_SM101a: call <2 x half> 
@llvm.nvvm.e2m1x2.to.f16x2.rn.relu(i16 76)
+  // CHECK_PTX86_SM120a: call <2 x half> 
@llvm.nvvm.e2m1x2.to.f16x2.rn.relu(i16 76)
+  __nvvm_e2m1x2_to_f16x2_rn_relu(0x004C);
+
   // CHECK_PTX86_SM100a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 
1.00e+00, float 1.00e+00)
   // CHECK_PTX86_SM101a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 
1.00e+00, float 1.00e+00)
   // CHECK_PTX86_SM120a: call i16 @llvm.nvvm.ff.to.ue8m0x2.rz(float 
1.00e+00, float 1.00e+00)
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td 
b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 5be1a915a06a7..f1df7be0f88ef 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1293,10 +1293,17 @@ let TargetPrefix = "nvvm" in {