cdevadas created this revision.
cdevadas added reviewers: rjmccall, Anastasia, yaxunl, arsenm.
Herald added subscribers: Naghasan, ldrumm, kerbowa, t-tye, tpr, dstuttard, 
jvesely, kzhuravl.
cdevadas requested review of this revision.
Herald added subscribers: cfe-commits, wdng.
Herald added a project: clang.

In OpenCL, a kernel is allowed to call other kernels as if
they were regular functions. To support this, clang emits the
amdgpu_kernel calling convention for both caller and callee.
A backend pass in our downstream compiler alters such calls
by introducing regular function bodies which are clones of
the callee kernels. This implementation currently limits us
in certain ways; for instance, the byref attribute cannot be
used for callee kernels.

To avoid such limitations, this patch introduces those
cloned functions early on and prevents clang from generating
amdgpu_kernel call sites. A new function body is added for
each kernel in the compilation unit, with the expectation
that unused clones will be removed at link time.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D120566

Files:
  clang/lib/CodeGen/CodeGenModule.cpp
  clang/lib/CodeGen/TargetInfo.cpp
  clang/lib/CodeGen/TargetInfo.h
  clang/test/CodeGenOpenCL/amdgpu-kernel-calls.cl
  clang/test/CodeGenOpenCL/visibility.cl

Index: clang/test/CodeGenOpenCL/visibility.cl
===================================================================
--- clang/test/CodeGenOpenCL/visibility.cl
+++ clang/test/CodeGenOpenCL/visibility.cl
@@ -94,23 +94,6 @@
     ext_func_default();
 }
 
-// FVIS-DEFAULT: declare amdgpu_kernel void @ext_kern()
-// FVIS-PROTECTED: declare protected amdgpu_kernel void @ext_kern()
-// FVIS-HIDDEN: declare protected amdgpu_kernel void @ext_kern()
-
-// FVIS-DEFAULT: declare protected amdgpu_kernel void @ext_kern_hidden()
-// FVIS-PROTECTED: declare protected amdgpu_kernel void @ext_kern_hidden()
-// FVIS-HIDDEN: declare protected amdgpu_kernel void @ext_kern_hidden()
-
-// FVIS-DEFAULT: declare protected amdgpu_kernel void @ext_kern_protected()
-// FVIS-PROTECTED: declare protected amdgpu_kernel void @ext_kern_protected()
-// FVIS-HIDDEN: declare protected amdgpu_kernel void @ext_kern_protected()
-
-// FVIS-DEFAULT: declare amdgpu_kernel void @ext_kern_default()
-// FVIS-PROTECTED: declare amdgpu_kernel void @ext_kern_default()
-// FVIS-HIDDEN: declare amdgpu_kernel void @ext_kern_default()
-
-
 // FVIS-DEFAULT: declare void @ext_func()
 // FVIS-PROTECTED: declare protected void @ext_func()
 // FVIS-HIDDEN: declare hidden void @ext_func()
@@ -126,3 +109,21 @@
 // FVIS-DEFAULT: declare void @ext_func_default()
 // FVIS-PROTECTED: declare void @ext_func_default()
 // FVIS-HIDDEN: declare void @ext_func_default()
+
+// A kernel call will be emitted as a call to its cloned function
+// of non-kernel convention.
+// FVIS-DEFAULT: declare void @__amdgpu_ext_kern_kernel_body()
+// FVIS-PROTECTED: declare void @__amdgpu_ext_kern_kernel_body()
+// FVIS-HIDDEN: declare void @__amdgpu_ext_kern_kernel_body()
+
+// FVIS-DEFAULT: declare void @__amdgpu_ext_kern_hidden_kernel_body()
+// FVIS-PROTECTED: declare void @__amdgpu_ext_kern_hidden_kernel_body()
+// FVIS-HIDDEN: declare void @__amdgpu_ext_kern_hidden_kernel_body()
+
+// FVIS-DEFAULT: declare void @__amdgpu_ext_kern_protected_kernel_body()
+// FVIS-PROTECTED: declare void @__amdgpu_ext_kern_protected_kernel_body()
+// FVIS-HIDDEN: declare void @__amdgpu_ext_kern_protected_kernel_body()
+
+// FVIS-DEFAULT: declare void @__amdgpu_ext_kern_default_kernel_body()
+// FVIS-PROTECTED: declare void @__amdgpu_ext_kern_default_kernel_body()
+// FVIS-HIDDEN: declare void @__amdgpu_ext_kern_default_kernel_body()
Index: clang/test/CodeGenOpenCL/amdgpu-kernel-calls.cl
===================================================================
--- /dev/null
+++ clang/test/CodeGenOpenCL/amdgpu-kernel-calls.cl
@@ -0,0 +1,60 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -S -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s
+
+// AMDGPU disallows kernel call sites inside other kernels. For each kernel, clang codegen introduces
+// a cloned function body with a non-kernel calling convention, and amdgpu_kernel call sites are
+// rewritten to call the appropriate clones.
+
+extern kernel void test_extern_kernel_callee(global int *in);
+
+// CHECK: define dso_local amdgpu_kernel void @test_kernel_callee(i32 addrspace(1)* noundef align 4 %in)
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[IN_ADDR:%.*]] = alloca i32 addrspace(1)*, align 8, addrspace(5)
+// CHECK-NEXT:    store i32 addrspace(1)* [[IN:%.*]], i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8
+// CHECK-NEXT:    store i32 10, i32 addrspace(1)* [[TMP0]], align 4
+// CHECK-NEXT:    ret void
+//
+kernel void test_kernel_callee(global int *in) {
+  *in = (int)(10);
+}
+
+// CHECK: define dso_local amdgpu_kernel void @test_kernel_caller(i32 addrspace(1)* noundef align 4 %in)
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[IN_ADDR:%.*]] = alloca i32 addrspace(1)*, align 8, addrspace(5)
+// CHECK-NEXT:    store i32 addrspace(1)* [[IN:%.*]], i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8
+// CHECK-NEXT:    %{{.*}} = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8
+// CHECK-NEXT:    call void @__amdgpu_test_kernel_callee_kernel_body(
+// CHECK-NOT:     call amdgpu_kernel void @test_kernel_callee(
+// CHECK-NEXT:    %{{.*}} = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8
+// CHECK-NEXT:    call void @__amdgpu_test_extern_kernel_callee_kernel_body(
+// CHECK-NOT:     call amdgpu_kernel void @test_kernel_callee(
+// CHECK-NEXT:    ret void
+//
+kernel void test_kernel_caller(global int *in) {
+  test_kernel_callee(in);
+  test_extern_kernel_callee(in);
+}
+
+// CHECK: declare amdgpu_kernel void @test_extern_kernel_callee(i32 addrspace(1)* noundef align 4)
+
+// CHECK:   define dso_local void @__amdgpu_test_kernel_callee_kernel_body(i32 addrspace(1)* noundef align 4 %in)
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:     [[IN_ADDR:%.*]] = alloca i32 addrspace(1)*, align 8, addrspace(5)
+// CHECK-NEXT:     store i32 addrspace(1)* %in, i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8
+// CHECK-NEXT:     [[TMP0:%.*]] = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8
+// CHECK-NEXT:     store i32 10, i32 addrspace(1)* [[TMP0]], align 4
+// CHECK-NEXT:     ret void
+
+// CHECK:   define dso_local void @__amdgpu_test_kernel_caller_kernel_body(i32 addrspace(1)* noundef align 4 %in)
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[IN_ADDR:%.*]] = alloca i32 addrspace(1)*, align 8, addrspace(5)
+// CHECK-NEXT:    store i32 addrspace(1)* [[IN:%.*]], i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8
+// CHECK-NEXT:    %{{.*}} = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8
+// CHECK-NEXT:    call void @__amdgpu_test_kernel_callee_kernel_body(
+// CHECK-NEXT:    %{{.*}} = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* [[IN_ADDR]], align 8
+// CHECK-NEXT:    call void @__amdgpu_test_extern_kernel_callee_kernel_body(
+// CHECK-NEXT:    ret void
+//
+
+// CHECK:  declare void @__amdgpu_test_extern_kernel_callee_kernel_body(i32 addrspace(1)*)
Index: clang/lib/CodeGen/TargetInfo.h
===================================================================
--- clang/lib/CodeGen/TargetInfo.h
+++ clang/lib/CodeGen/TargetInfo.h
@@ -247,6 +247,10 @@
                                        llvm::StringRef Value,
                                        llvm::SmallString<32> &Opt) const {}
 
+  /// Performs clean-up and other special handling at the end, once all
+  /// functions have been code-generated.
+  virtual void finalizeModule(llvm::Module &M) const {}
+
   /// Get LLVM calling convention for OpenCL kernel.
   virtual unsigned getOpenCLKernelCallingConv() const;
 
Index: clang/lib/CodeGen/TargetInfo.cpp
===================================================================
--- clang/lib/CodeGen/TargetInfo.cpp
+++ clang/lib/CodeGen/TargetInfo.cpp
@@ -19,9 +19,9 @@
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
 #include "clang/AST/RecordLayout.h"
+#include "clang/Basic/Builtins.h"
 #include "clang/Basic/CodeGenOptions.h"
 #include "clang/Basic/DiagnosticFrontend.h"
-#include "clang/Basic/Builtins.h"
 #include "clang/CodeGen/CGFunctionInfo.h"
 #include "clang/CodeGen/SwiftCallingConv.h"
 #include "llvm/ADT/SmallBitVector.h"
@@ -34,6 +34,7 @@
 #include "llvm/IR/IntrinsicsS390.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include <algorithm> // std::sort
 
 using namespace clang;
@@ -9217,6 +9218,7 @@
                             llvm::Value *BlockLiteral) const override;
   bool shouldEmitStaticExternCAliases() const override;
   void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
+  void finalizeModule(llvm::Module &M) const override;
 };
 }
 
@@ -9233,6 +9235,26 @@
            cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType()));
 }
 
+static llvm::Function *getKernelClone(llvm::Function &F) {
+  llvm::Module *M = F.getParent();
+  SmallString<128> MangledName("__amdgpu_");
+  MangledName.append(F.getName());
+  MangledName.append("_kernel_body");
+  llvm::Function *NewF = M->getFunction(MangledName);
+  if (!NewF) {
+    llvm::ValueToValueMapTy ignored;
+    NewF = F.empty()
+               ? llvm::Function::Create(F.getFunctionType(),
+                                        llvm::GlobalVariable::ExternalLinkage,
+                                        "", M)
+               : CloneFunction(&F, ignored);
+    NewF->setCallingConv(llvm::CallingConv::C);
+    NewF->setName(MangledName);
+  }
+
+  return NewF;
+}
+
 void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
     const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
   const auto *ReqdWGS =
@@ -9435,6 +9457,30 @@
       FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
 }
 
+void AMDGPUTargetCodeGenInfo::finalizeModule(llvm::Module &M) const {
+  // Insert a cloned function body for each kernel and adjust the kernel
+  // callsite to use its equivalent clone function. For extern kernel calls,
+  // insert a declaration node since the body isn't available.
+  if (!getABIInfo().getContext().getLangOpts().OpenCL)
+    return;
+
+  for (auto &F : M) {
+    if (F.getCallingConv() != llvm::CallingConv::AMDGPU_KERNEL)
+      continue;
+
+    llvm::Function *Clone = getKernelClone(F);
+    for (llvm::Function::user_iterator UI = F.user_begin(), UE = F.user_end();
+         UI != UE;) {
+      auto *CI = dyn_cast<llvm::CallInst>(*UI++);
+      if (!CI)
+        continue;
+
+      CI->setCalledFunction(Clone);
+      CI->setCallingConv(llvm::CallingConv::C);
+    }
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // SPARC v8 ABI Implementation.
 // Based on the SPARC Compliance Definition version 2.4.1.
Index: clang/lib/CodeGen/CodeGenModule.cpp
===================================================================
--- clang/lib/CodeGen/CodeGenModule.cpp
+++ clang/lib/CodeGen/CodeGenModule.cpp
@@ -576,6 +576,7 @@
                                 "amdgpu_code_object_version",
                                 getTarget().getTargetOpts().CodeObjectVersion);
     }
+    getTargetCodeGenInfo().finalizeModule(TheModule);
   }
 
   emitLLVMUsed();
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to