================
@@ -0,0 +1,501 @@
+//===- LowerGPUIntrinsic.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower the llvm.gpu intrinsics to target-specific code sequences.
+// Can be called from clang if building for a specific GPU or from the backend
+// as part of a SPIRV lowering pipeline. The initial pass can lower to amdgcn
+// or nvptx; adding further architectures means adding a column to the lookup
+// table, and adding further intrinsics means adding a row.
+//
+// The idea is for the intrinsics to represent a thin abstraction over the
+// different GPU architectures. In particular, code compiled to spirv-- without
+// specifying a specific target can be specialised at JIT time, at which point
+// this pass will rewrite those intrinsics to ones that the current backend
+// knows.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerGPUIntrinsic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "lower-gpu-intrinsic"
+
+using namespace llvm;
+
+namespace {
+
+// For each intrinsic, specify what function to call to lower it
+typedef bool (*lowerFunction)(Module &M, IRBuilder<> &, Intrinsic::ID from,
+                              CallBase *CI);
+
+// Simple lowering: directly replace the intrinsic with a different one
+// with the same type, and optionally refine range metadata on the return value
+template <Intrinsic::ID To>
+bool S(Module &M, IRBuilder<> &, Intrinsic::ID from, CallBase *CI) {
+
+  static_assert(To != Intrinsic::not_intrinsic);
+  Intrinsic::ID GenericID = from;
+  Intrinsic::ID SpecificID = To;
+
+  bool Changed = false;
+  Function *Generic = Intrinsic::getDeclarationIfExists(&M, GenericID);
+  auto *Specific = Intrinsic::getOrInsertDeclaration(&M, SpecificID);
+
+  if ((Generic->getType() != Specific->getType()) ||
+      (Generic->getReturnType() != Specific->getReturnType()))
+    report_fatal_error("LowerGPUIntrinsic: Inconsistent types between "
+                       "intrinsics in lookup table");
+
+  CI->setCalledFunction(Specific);
+  Changed = true;
+
+  return Changed;
+}
+
+// Replace intrinsic call with a linear sequence of instructions
+typedef Value *(*builder)(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                          CallBase *CI);
+
+template <builder F>
+bool B(Module &M, IRBuilder<> &Builder, Intrinsic::ID from, CallBase *CI) {
+  bool Changed = false;
+
+  Builder.SetInsertPoint(CI);
+
+  Value *replacement = F(M, Builder, from, CI);
+  if (replacement) {
+    CI->replaceAllUsesWith(replacement);
+    CI->eraseFromParent();
+    Changed = true;
+  }
+
+  return Changed;
+}
+
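+// Illustrative example (names taken from the lookup table further down):
+// S<> rewrites a generic call in place, so on nvptx roughly
+//   %tid = call i32 @llvm.gpu.thread.id.x()
+// becomes
+//   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// while B<> expands the call into a short instruction sequence and erases it.
+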
+template <Intrinsic::ID Numerator, Intrinsic::ID Denominator>
+Value *intrinsicRatio(Module &M, IRBuilder<> &Builder, Intrinsic::ID,
+                      CallBase *) {
+  Value *N = Builder.CreateIntrinsic(Numerator, {}, {});
+  Value *D = Builder.CreateIntrinsic(Denominator, {}, {});
+  return Builder.CreateUDiv(N, D);
+}
+
+namespace amdgpu {
+Value *lane_mask(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                 CallBase *CI) {
+  auto &Ctx = M.getContext();
+  return Builder.CreateIntrinsic(
+      Intrinsic::amdgcn_ballot, {Type::getInt64Ty(Ctx)},
+      {ConstantInt::get(Type::getInt1Ty(Ctx), true)});
+}
+
+Value *lane_id(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+               CallBase *CI) {
+  auto &Ctx = M.getContext();
+  Constant *M1 = ConstantInt::get(Type::getInt32Ty(Ctx), -1);
+  Constant *Z = ConstantInt::get(Type::getInt32Ty(Ctx), 0);
+
+  CallInst *Lo =
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {}, {M1, Z});
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {M1, Lo});
+}
+
+Value *first_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                  CallBase *CI) {
+  auto &Ctx = M.getContext();
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane,
+                                 {Type::getInt32Ty(Ctx)},
+                                 {CI->getArgOperand(1)});
+}
+
+Value *shuffle_idx(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                   CallBase *CI) {
+  auto &Ctx = M.getContext();
+
+  Value *idx = CI->getArgOperand(1);
+  Value *x = CI->getArgOperand(2);
+  Value *width = CI->getArgOperand(3);
+
+  Value *id = Builder.CreateIntrinsic(Intrinsic::gpu_lane_id, {}, {});
+
+  Value *n = Builder.CreateSub(ConstantInt::get(Type::getInt32Ty(Ctx), 0),
+                               width, "not");
+  Value *a = Builder.CreateAnd(id, n, "and");
+  Value *add = Builder.CreateAdd(a, idx, "add");
+  Value *shl =
+      Builder.CreateShl(add, ConstantInt::get(Type::getInt32Ty(Ctx), 2), "shl");
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_ds_bpermute, {}, {shl, x});
+}
+
+Value *ballot(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+              CallBase *CI) {
+  auto &Ctx = M.getContext();
+
+  Value *C =
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
+                              {Type::getInt64Ty(Ctx)}, {CI->getArgOperand(1)});
+
+  return Builder.CreateAnd(C, CI->getArgOperand(0));
+}
+
+Value *sync_threads(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                    CallBase *CI) {
+  auto &Ctx = M.getContext();
+  Builder.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {});
+
+  Value *F = Builder.CreateFence(AtomicOrdering::SequentiallyConsistent,
+                                 Ctx.getOrInsertSyncScopeID("workgroup"));
+
+  return F;
+}
+
+Value *sync_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                 CallBase *CI) {
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_wave_barrier, {}, {});
+}
+
+Value *thread_suspend(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                      CallBase *CI) {
+
+  auto &Ctx = M.getContext();
+  return Builder.CreateIntrinsic(Intrinsic::amdgcn_s_sleep, {},
+                                 {ConstantInt::get(Type::getInt32Ty(Ctx), 2)});
+}
+
+Value *dispatch_ptr(IRBuilder<> &Builder) {
+  CallInst *Call =
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {}, {});
+  Call->addRetAttr(
+      Attribute::getWithDereferenceableBytes(Call->getContext(), 64));
+  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(4)));
+  return Call;
+}
+
+Value *implicit_arg_ptr(IRBuilder<> &Builder) {
+  CallInst *Call =
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {}, {});
+  Call->addRetAttr(
+      Attribute::getWithDereferenceableBytes(Call->getContext(), 256));
+  Call->addRetAttr(Attribute::getWithAlignment(Call->getContext(), Align(8)));
+  return Call;
+}
+
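+// Offsets into the HSA kernel_dispatch_packet struct used below:
+// workgroup_size_{x,y,z} are uint16_t fields at byte offsets 4, 6 and 8;
+// grid_size_{x,y,z} are uint32_t fields at byte offsets 12, 16 and 20.
+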
+template <unsigned Index>
+Value *grid_size(Module &M, IRBuilder<> &Builder, Intrinsic::ID, CallBase *) {
+  auto &Ctx = M.getContext();
+  const unsigned XOffset = 12;
+  auto *DP = dispatch_ptr(Builder);
+
+  // Indexing the HSA kernel_dispatch_packet struct.
+  auto *Offset = ConstantInt::get(Type::getInt32Ty(Ctx), XOffset + Index * 4);
+  auto *GEP = Builder.CreateGEP(Type::getInt8Ty(Ctx), DP, Offset);
+  auto *LD = Builder.CreateLoad(Type::getInt32Ty(Ctx), GEP);
+  llvm::MDBuilder MDB(Ctx);
+  // Known non-zero.
+  LD->setMetadata(llvm::LLVMContext::MD_range,
+                  MDB.createRange(APInt(32, 1), APInt::getZero(32)));
+  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
+                  llvm::MDNode::get(Ctx, {}));
+  return LD;
+}
+
+template <int Index>
+Value *WGSize(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+              CallBase *CI) {
+
+  // Note: "__oclc_ABI_version" is supposed to be emitted and initialized by
+  // clang during compilation of user code.
+  StringRef Name = "__oclc_ABI_version";
+  auto *ABIVersionC = M.getNamedGlobal(Name);
+  if (!ABIVersionC) {
+    // In CGBuiltin, we'd have to create an extern variable and emit the load
+    // from it. Here, we can leave the intrinsic in place and it'll get
+    // lowered later.
+    return nullptr;
+  }
+  auto &Ctx = M.getContext();
+
+  Value *ABIVersion = Builder.CreateLoad(Type::getInt32Ty(Ctx), ABIVersionC);
+
+  Value *IsCOV5 = Builder.CreateICmpSGE(
+      ABIVersion,
+      ConstantInt::get(Type::getInt32Ty(Ctx), CodeObjectVersionKind::COV_5));
+
+  Value *ImplicitGEP = Builder.CreateConstGEP1_32(
+      Type::getInt8Ty(Ctx), implicit_arg_ptr(Builder), 12 + Index * 2);
+
+  // Indexing the HSA kernel_dispatch_packet struct.
+  Value *DispatchGEP = Builder.CreateConstGEP1_32(
+      Type::getInt8Ty(Ctx), dispatch_ptr(Builder), 4 + Index * 2);
+
+  auto Result = Builder.CreateSelect(IsCOV5, ImplicitGEP, DispatchGEP);
+  LoadInst *LD = Builder.CreateLoad(Type::getInt16Ty(Ctx), Result);
+
+  // TODO: CGBuiltin digs MaxOpenCLWorkGroupSize out of targetinfo and limits
+  // the range on the load based on that (MD_range)
+
+  LD->setMetadata(llvm::LLVMContext::MD_noundef, llvm::MDNode::get(Ctx, {}));
+  LD->setMetadata(llvm::LLVMContext::MD_invariant_load,
+                  llvm::MDNode::get(Ctx, {}));
+
+  // The workgroup size is a uint16_t but gpu_num_threads returns a uint32_t
+  return Builder.CreateZExt(LD, Type::getInt32Ty(Ctx));
+}
+
+} // namespace amdgpu
+
+namespace nvptx {
+Value *lane_mask(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                 CallBase *CI) {
+  auto &Ctx = M.getContext();
+  CallInst *C = Builder.CreateIntrinsic(Intrinsic::nvvm_activemask, {}, {});
+  return Builder.CreateZExt(C, Type::getInt64Ty(Ctx), "conv");
+}
+
+Value *first_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                  CallBase *CI) {
+  auto &Ctx = M.getContext();
+  Value *conv =
+      Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(Ctx), "conv");
+  Value *C = Builder.CreateIntrinsic(
+      Intrinsic::cttz, {Type::getInt32Ty(Ctx)},
+      {conv, ConstantInt::get(Type::getInt1Ty(Ctx), true)});
+  Value *iszero = Builder.CreateICmpEQ(
+      conv, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "iszero");
+  Value *sub = Builder.CreateSelect(
+      iszero, ConstantInt::get(Type::getInt32Ty(Ctx), -1), C, "sub");
+
+  return Builder.CreateIntrinsic(Intrinsic::nvvm_shfl_sync_idx_i32, {},
+                                 {conv, CI->getArgOperand(1), sub,
+                                  ConstantInt::get(Type::getInt32Ty(Ctx), 31)});
+}
+
+Value *shuffle_idx(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                   CallBase *CI) {
+  auto &Ctx = M.getContext();
+
+  Value *lane_mask = CI->getArgOperand(0);
+  Value *idx = CI->getArgOperand(1);
+  Value *x = CI->getArgOperand(2);
+  Value *width = CI->getArgOperand(3);
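+
+  // The last shfl.sync operand packs the clamp and segment-mask fields;
+  // 8223 - (width << 8) below equals ((32 - width) << 8) | 0x1f, which keeps
+  // the shuffle within each width-sized segment of the warp.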
+  Value *Conv = Builder.CreateTrunc(lane_mask, Type::getInt32Ty(Ctx), "conv");
+
+  Value *sh_prom = Builder.CreateZExt(idx, Type::getInt64Ty(Ctx), "sh_prom");
+  Value *shl0 =
+      Builder.CreateShl(width, ConstantInt::get(Type::getInt32Ty(Ctx), 8));
+  Value *or0 = Builder.CreateSub(ConstantInt::get(Type::getInt32Ty(Ctx), 8223),
+                                 shl0, "or");
+
+  Value *core = Builder.CreateIntrinsic(Intrinsic::nvvm_shfl_sync_idx_i32, {},
+                                        {Conv, x, idx, or0});
+
+  Value *shl1 =
+      Builder.CreateShl(ConstantInt::get(Type::getInt64Ty(Ctx), 1), sh_prom);
+  Value *and0 = Builder.CreateAnd(shl1, lane_mask);
+  Value *cmp =
+      Builder.CreateICmpEQ(and0, ConstantInt::get(Type::getInt64Ty(Ctx), 0));
+  Value *and4 = Builder.CreateSelect(
+      cmp, ConstantInt::get(Type::getInt32Ty(Ctx), 0), core, "and4");
+
+  return and4;
+}
+
+Value *ballot(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+              CallBase *CI) {
+  auto &Ctx = M.getContext();
+  Value *Conv =
+      Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(Ctx), "conv");
+  Value *C = Builder.CreateIntrinsic(Intrinsic::nvvm_vote_ballot_sync, {},
+                                     {Conv, CI->getArgOperand(1)});
+
+  return Builder.CreateZExt(C, Type::getInt64Ty(Ctx), "conv");
+}
+
+Value *sync_lane(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                 CallBase *CI) {
+
+  auto &Ctx = M.getContext();
+  Value *X = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(Ctx));
+  return Builder.CreateIntrinsic(Intrinsic::nvvm_bar_warp_sync, {}, {X});
+}
+
+Value *thread_suspend(Module &M, IRBuilder<> &Builder, Intrinsic::ID from,
+                      CallBase *CI) {
+
+  auto &Ctx = M.getContext();
+  const DataLayout &DL = M.getDataLayout();
+
+  Value *str = Builder.CreateGlobalString(
+      "__CUDA_ARCH", "", DL.getDefaultGlobalsAddressSpace(), &M);
+
+  Builder.SetInsertPoint(CI);
+  Value *Reflect = Builder.CreateIntrinsic(Intrinsic::nvvm_reflect, {}, {str});
+  Value *Cmp = Builder.CreateICmpUGT(
+      Reflect, ConstantInt::get(Type::getInt32Ty(Ctx), 699));
+
+  Builder.SetInsertPoint(SplitBlockAndInsertIfThen(Cmp, CI, false));
+
+  Builder.CreateIntrinsic(Intrinsic::nvvm_nanosleep, {},
+                          {ConstantInt::get(Type::getInt32Ty(Ctx), 64)});
+
+  CI->eraseFromParent();
+  return nullptr; // All done
+}
+
+} // namespace nvptx
+
+struct IntrinsicMap {
+  Intrinsic::ID Generic;
+  lowerFunction AMDGPU;
+  lowerFunction NVPTX;
+};
+
+using namespace Intrinsic;
+
+static const IntrinsicMap ls[] = {
+    // This table of intrinsic => what to do with it is walked in order.
+    // A row can create calls to intrinsics that are expanded in subsequent
+    // rows, which means the order of rows is somewhat significant.
+    // S<intrinsic> is a simple lowering to an existing intrinsic
+    // B<function> involves building a short sequence of instructions
+
+    // amdgpu defines num_blocks as an integer ratio of two other intrinsics.
+    // amdgcn_grid_size and gpu_num_threads are expanded further down.
+    {
+        gpu_num_blocks_x,
+        B<intrinsicRatio<amdgcn_grid_size_x, gpu_num_threads_x>>,
+        S<nvvm_read_ptx_sreg_nctaid_x>,
+    },
+    {
+        gpu_num_blocks_y,
+        B<intrinsicRatio<amdgcn_grid_size_y, gpu_num_threads_y>>,
+        S<nvvm_read_ptx_sreg_nctaid_y>,
+    },
+    {
+        gpu_num_blocks_z,
+        B<intrinsicRatio<amdgcn_grid_size_z, gpu_num_threads_z>>,
+        S<nvvm_read_ptx_sreg_nctaid_z>,
+    },
+
+    // Note: Could canonicalise in favour of the target agnostic one without
+    // breaking existing users of builtin or intrinsic:
+    // {amdgcn_workgroup_id_x, S<gpu_block_id_x>, nullptr},
+    // {gpu_block_id_x, nullptr, S<nvvm_read_ptx_sreg_ctaid_x>},
+    // Using the target agnostic one throughout the rest of the backend would
+    // work fine, and amdgpu-no-workgroup-id-x attribute and similar may be
+    // applicable to other targets.
+    // Map {block,thread}_id onto existing intrinsics for the time being.
+    {gpu_block_id_x, S<amdgcn_workgroup_id_x>, S<nvvm_read_ptx_sreg_ctaid_x>},
+    {gpu_block_id_y, S<amdgcn_workgroup_id_y>, S<nvvm_read_ptx_sreg_ctaid_y>},
+    {gpu_block_id_z, S<amdgcn_workgroup_id_z>, S<nvvm_read_ptx_sreg_ctaid_z>},
+    {gpu_thread_id_x, S<amdgcn_workitem_id_x>, S<nvvm_read_ptx_sreg_tid_x>},
+    {gpu_thread_id_y, S<amdgcn_workitem_id_y>, S<nvvm_read_ptx_sreg_tid_y>},
+    {gpu_thread_id_z, S<amdgcn_workitem_id_z>, S<nvvm_read_ptx_sreg_tid_z>},
+
+    // CGBuiltin maps builtin_amdgcn_workgroup_size onto gpu_num_threads
+    {gpu_num_threads_x, B<amdgpu::WGSize<0>>, S<nvvm_read_ptx_sreg_ntid_x>},
+    {gpu_num_threads_y, B<amdgpu::WGSize<1>>, S<nvvm_read_ptx_sreg_ntid_y>},
+    {gpu_num_threads_z, B<amdgpu::WGSize<2>>, S<nvvm_read_ptx_sreg_ntid_z>},
+
+    // Some of the following intrinsics need minor impedance matching
+    {gpu_num_lanes, S<amdgcn_wavefrontsize>, S<nvvm_read_ptx_sreg_warpsize>},
+    {gpu_lane_mask, B<amdgpu::lane_mask>, B<nvptx::lane_mask>},
+
+    {gpu_read_first_lane_u32, B<amdgpu::first_lane>, B<nvptx::first_lane>},
+    {gpu_shuffle_idx_u32, B<amdgpu::shuffle_idx>, B<nvptx::shuffle_idx>},
+
+    // shuffle sometimes emits a call to lane_id, so lane_id is lowered after
+    // shuffle
+    {gpu_lane_id, B<amdgpu::lane_id>, S<nvvm_read_ptx_sreg_laneid>},
+
+    {gpu_ballot, B<amdgpu::ballot>, B<nvptx::ballot>},
+
+    {gpu_sync_threads, B<amdgpu::sync_threads>, S<nvvm_barrier0>},
+    {gpu_sync_lane, B<amdgpu::sync_lane>, B<nvptx::sync_lane>},
+
+    {gpu_thread_suspend, B<amdgpu::thread_suspend>, B<nvptx::thread_suspend>},
+    {gpu_exit, S<amdgcn_endpgm>, S<nvvm_exit>},
+
+    // These aren't generic intrinsics but lowering them here instead of
+    // in CGBuiltin allows the above to be implemented partly in terms of
+    // amdgcn_grid_size.
+    {amdgcn_grid_size_x, B<amdgpu::grid_size<0>>, nullptr},
+    {amdgcn_grid_size_y, B<amdgpu::grid_size<1>>, nullptr},
+    {amdgcn_grid_size_z, B<amdgpu::grid_size<2>>, nullptr},
+};
+
+class LowerGPUIntrinsic : public ModulePass {
+public:
+  static char ID;
+
+  LowerGPUIntrinsic() : ModulePass(ID) {}
+
+  bool runOnModule(Module &M) override;
+};
+
+bool LowerGPUIntrinsic::runOnModule(Module &M) {
+  bool Changed = false;
+
+  Triple TT(M.getTargetTriple());
+
+  if (!TT.isAMDGPU() && !TT.isNVPTX()) {
+    return Changed;
+  }
+
+  auto &Ctx = M.getContext();
+  IRBuilder<> Builder(Ctx);
+
+  for (const IntrinsicMap &I : ls) {
+    auto *Intr = Intrinsic::getDeclarationIfExists(&M, I.Generic);
+    if (!Intr)
+      continue;
+
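+    // Pick the lowering for the current target: one column of the table.
+    // B<> lowerings may erase the handled call, hence the
+    // make_early_inc_range over the users below.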
+    lowerFunction maybeLowering = TT.isAMDGPU() ? I.AMDGPU : I.NVPTX;
+    if (maybeLowering == nullptr)
+      continue;
+
+    for (auto *U : make_early_inc_range(Intr->users())) {
+      if (auto *CI = dyn_cast<CallBase>(U)) {

----------------
arsenm wrote:
```suggestion
      if (auto *CI = dyn_cast<CallInst>(U)) {
```

https://github.com/llvm/llvm-project/pull/131190