[PATCH] D147732: [AMDGPU] Add f32 permlane{16, x16} builtin variants

Jeffrey Byrnes via Phabricator via cfe-commits Thu, 06 Apr 2023 11:58:35 -0700

jrbyrnes created this revision.
jrbyrnes added reviewers: rampitec, arsenm.
Herald added subscribers: kosarev, foad, kerbowa, hiraditya, tpr, dstuttard, 
yaxunl, jvesely, kzhuravl.
Herald added a project: All.
jrbyrnes requested review of this revision.
Herald added subscribers: llvm-commits, cfe-commits, wdng.
Herald added projects: clang, LLVM.


Add f32 variants of the permlane16 and permlanex16 builtins and intrinsics. A
user has requested permlane builtins that accept floats directly, without the
need for casts.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D147732

Files:
  clang/include/clang/Basic/BuiltinsAMDGPU.def
  clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
  clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10-param.cl
  llvm/include/llvm/IR/IntrinsicsAMDGPU.td
  llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
  llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
  llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
  llvm/lib/Target/AMDGPU/VOP3Instructions.td

Index: llvm/lib/Target/AMDGPU/VOP3Instructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -663,7 +663,9 @@
 let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in
   defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>;
 
-def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {
+
+
+class VOP3_PERMLANE_Profile<ValueType vt> : VOP3_Profile<VOPProfile <[vt, vt, i32, i32]>, VOP3_OPSEL> {
   let InsVOP3OpSel = (ins IntOpSelMods:$src0_modifiers, VRegSrc_32:$src0,
                           IntOpSelMods:$src1_modifiers, SSrc_b32:$src1,
                           IntOpSelMods:$src2_modifiers, SSrc_b32:$src2,
@@ -679,9 +681,9 @@
 def gi_opsel_i1timm : GICustomOperandRenderer<"renderOpSelTImm">,
   GISDNodeXFormEquiv<opsel_i1timm>;
 
-class PermlanePat<SDPatternOperator permlane,
+class PermlanePat<ValueType vt, SDPatternOperator permlane,
   Instruction inst> : GCNPat<
-  (permlane i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2,
+  (permlane vt:$vdst_in, vt:$src0, i32:$src1, i32:$src2,
             timm:$fi, timm:$bc),
   (inst (opsel_i1timm $fi), VGPR_32:$src0, (opsel_i1timm $bc),
         SCSrc_b32:$src1, 0, SCSrc_b32:$src2, VGPR_32:$vdst_in)
@@ -695,12 +697,17 @@
   def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32_e64>;
 
   let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
-    defm V_PERMLANE16_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile>;
-    defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>;
+    defm V_PERMLANE16_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile<i32>>;
+    defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile<i32>>;
+    defm V_PERMLANE16_F32_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile<f32>>;
+    defm V_PERMLANEX16_F32_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile<f32>>;
   } // End $vdst = $vdst_in, DisableEncoding $vdst_in
 
-  def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32_e64>;
-  def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64>;
+  def : PermlanePat<i32, int_amdgcn_permlane16, V_PERMLANE16_B32_e64>;
+  def : PermlanePat<i32, int_amdgcn_permlanex16, V_PERMLANEX16_B32_e64>;
+  def : PermlanePat<f32, int_amdgcn_permlane16_f32, V_PERMLANE16_F32_B32_e64>;
+  def : PermlanePat<f32, int_amdgcn_permlanex16_f32, V_PERMLANEX16_F32_B32_e64>;
+
 
   defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, add>;
   defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>, sub>;
Index: llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -303,6 +303,8 @@
 def : SourceOfDivergence<int_amdgcn_ds_ordered_swap>;
 def : SourceOfDivergence<int_amdgcn_permlane16>;
 def : SourceOfDivergence<int_amdgcn_permlanex16>;
+def : SourceOfDivergence<int_amdgcn_permlane16_f32>;
+def : SourceOfDivergence<int_amdgcn_permlanex16_f32>;
 def : SourceOfDivergence<int_amdgcn_mov_dpp>;
 def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
 def : SourceOfDivergence<int_amdgcn_update_dpp>;
Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2990,7 +2990,9 @@
       applyDefaultMapping(OpdMapper);
       return;
     case Intrinsic::amdgcn_permlane16:
-    case Intrinsic::amdgcn_permlanex16: {
+    case Intrinsic::amdgcn_permlanex16:
+    case Intrinsic::amdgcn_permlane16_f32:
+    case Intrinsic::amdgcn_permlanex16_f32: {
       // Doing a waterfall loop over these wouldn't make any sense.
       substituteSimpleCopyRegs(OpdMapper, 2);
       substituteSimpleCopyRegs(OpdMapper, 3);
@@ -4367,7 +4369,9 @@
       break;
     }
     case Intrinsic::amdgcn_permlane16:
-    case Intrinsic::amdgcn_permlanex16: {
+    case Intrinsic::amdgcn_permlanex16:
+    case Intrinsic::amdgcn_permlane16_f32:
+    case Intrinsic::amdgcn_permlanex16_f32: {
       unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
       OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
Index: llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -892,7 +892,9 @@
     return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
   }
   case Intrinsic::amdgcn_permlane16:
-  case Intrinsic::amdgcn_permlanex16: {
+  case Intrinsic::amdgcn_permlanex16:
+  case Intrinsic::amdgcn_permlane16_f32:
+  case Intrinsic::amdgcn_permlanex16_f32: {
     // Discard vdst_in if it's not going to be read.
     Value *VDstIn = II.getArgOperand(0);
     if (isa<UndefValue>(VDstIn))
Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1950,6 +1950,24 @@
             [IntrNoMem, IntrConvergent, IntrWillReturn,
              ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;
 
+
+// llvm.amdgcn.permlane16.f32 <old> <src0> <src1> <src2> <fi> <bound_control>
+def int_amdgcn_permlane16_f32 : ClangBuiltin<"__builtin_amdgcn_permlane16_f32">,
+  Intrinsic<[llvm_float_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
+            [IntrNoMem, IntrConvergent, IntrWillReturn,
+             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;
+
+// llvm.amdgcn.permlanex16.f32 <old> <src0> <src1> <src2> <fi> <bound_control>
+def int_amdgcn_permlanex16_f32 : ClangBuiltin<"__builtin_amdgcn_permlanex16_f32">,
+  Intrinsic<[llvm_float_ty],
+            [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
+            [IntrNoMem, IntrConvergent, IntrWillReturn,
+             ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;
+
+
+
+
 // llvm.amdgcn.mov.dpp8.i32 <src> <sel>
 // <sel> is a 32-bit constant whose high 8 bits must be zero which selects
 // the lanes to read from.
Index: clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10-param.cl
===================================================================
--- clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10-param.cl
+++ clang/test/SemaOpenCL/builtins-amdgcn-error-gfx10-param.cl
@@ -13,6 +13,18 @@
   *out = __builtin_amdgcn_permlanex16(a, b, c, d, 1, e); // expected-error{{argument to '__builtin_amdgcn_permlanex16' must be a constant integer}}
 }
 
+
+void test_permlane16_f32(global float* out, float a, float b, uint c, uint d, uint e) {
+  *out = __builtin_amdgcn_permlane16_f32(a, b, c, d, e, 1); // expected-error{{argument to '__builtin_amdgcn_permlane16_f32' must be a constant integer}}
+  *out = __builtin_amdgcn_permlane16_f32(a, b, c, d, 1, e); // expected-error{{argument to '__builtin_amdgcn_permlane16_f32' must be a constant integer}}
+}
+
+void test_permlanex16_f32(global float* out, float a, float b, uint c, uint d, uint e) {
+  *out = __builtin_amdgcn_permlanex16_f32(a, b, c, d, e, 1); // expected-error{{argument to '__builtin_amdgcn_permlanex16_f32' must be a constant integer}}
+  *out = __builtin_amdgcn_permlanex16_f32(a, b, c, d, 1, e); // expected-error{{argument to '__builtin_amdgcn_permlanex16_f32' must be a constant integer}}
+}
+
+
 void test_mov_dpp8(global uint* out, uint a, uint b) {
   *out = __builtin_amdgcn_mov_dpp8(a, b); // expected-error{{argument to '__builtin_amdgcn_mov_dpp8' must be a constant integer}}
 }
Index: clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
===================================================================
--- clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
+++ clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl
@@ -18,6 +18,19 @@
   *out = __builtin_amdgcn_permlanex16(a, b, c, d, 0, 0);
 }
 
+// CHECK-LABEL: @test_permlane16_f32(
+// CHECK: call float @llvm.amdgcn.permlane16.f32(float %a, float %b, i32 %c, i32 %d, i1 false, i1 false)
+void test_permlane16_f32(global float* out, float a, float b, uint c, uint d) {
+  *out = __builtin_amdgcn_permlane16_f32(a, b, c, d, 0, 0);
+}
+
+// CHECK-LABEL: @test_permlanex16_f32(
+// CHECK: call float @llvm.amdgcn.permlanex16.f32(float %a, float %b, i32 %c, i32 %d, i1 false, i1 false)
+void test_permlanex16_f32(global float* out, float a, float b, uint c, uint d) {
+  *out = __builtin_amdgcn_permlanex16_f32(a, b, c, d, 0, 0);
+}
+
+
 // CHECK-LABEL: @test_mov_dpp8(
 // CHECK: call i32 @llvm.amdgcn.mov.dpp8.i32(i32 %a, i32 1)
 void test_mov_dpp8(global uint* out, uint a) {
Index: clang/include/clang/Basic/BuiltinsAMDGPU.def
===================================================================
--- clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -255,6 +255,8 @@
 //===----------------------------------------------------------------------===//
 TARGET_BUILTIN(__builtin_amdgcn_permlane16, "UiUiUiUiUiIbIb", "nc", "gfx10-insts")
 TARGET_BUILTIN(__builtin_amdgcn_permlanex16, "UiUiUiUiUiIbIb", "nc", "gfx10-insts")
+TARGET_BUILTIN(__builtin_amdgcn_permlane16_f32, "fffUiUiIbIb", "nc", "gfx10-insts")
+TARGET_BUILTIN(__builtin_amdgcn_permlanex16_f32, "fffUiUiIbIb", "nc", "gfx10-insts")
 TARGET_BUILTIN(__builtin_amdgcn_mov_dpp8, "UiUiIUi", "nc", "gfx10-insts")
 
 //===----------------------------------------------------------------------===//

_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[PATCH] D147732: [AMDGPU] Add f32 permlane{16, x16} builtin variants

Reply via email to