yaxunl created this revision. yaxunl added reviewers: tra, rjmccall. yaxunl requested review of this revision.
Currently clang emits the following code for a triple chevron kernel call for HIP: __hipPushCallConfiguration(grids, blocks, shmem, stream); kernel_stub(); whereas for each kernel, clang emits a kernel_stub: void kernel_stub() { __hipPopCallConfiguration(&grids, &blocks, &shmem, &stream); hipLaunchKernel(kernel_stub, grids, blocks, kernel_args, shmem, stream); } This is really unnecessary. In host code, a kernel function is not really a "function" since you cannot "call" it in the generated IR, you can only launch it through the kernel launching API. This patch simplifies the generated code for kernel launching by eliminating the calls of `__hipPushCallConfiguration` and `__hipPopCallConfiguration`. For each triple chevron, a call of `hipLaunchKernel` is directly emitted. The kernel stub function is still emitted as an empty function, for the sole purpose of serving as a shadow symbol that maps to the device symbol in the device binary so that the runtime can use it to find the device symbol. This patch does not change the AST for kernel calls since semantically a triple chevron is like a function call. Keeping it as a function call facilitates overload resolution and function argument type checking. This patch only changes kernel launching codegen for HIP for the new kernel launching API since we are sure there is no other side effect in `__hipPushCallConfiguration` and `__hipPopCallConfiguration`. https://reviews.llvm.org/D86376 Files: clang/lib/CodeGen/CGCUDANV.cpp clang/test/CodeGenCUDA/kernel-call.cu clang/test/CodeGenCUDA/kernel-call.hip clang/test/lit.cfg.py
Index: clang/test/lit.cfg.py =================================================================== --- clang/test/lit.cfg.py +++ clang/test/lit.cfg.py @@ -25,7 +25,7 @@ config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell) # suffixes: A list of file extensions to treat as test files. -config.suffixes = ['.c', '.cpp', '.i', '.cppm', '.m', '.mm', '.cu', +config.suffixes = ['.c', '.cpp', '.i', '.cppm', '.m', '.mm', '.cu', '.hip', '.ll', '.cl', '.s', '.S', '.modulemap', '.test', '.rs', '.ifs'] # excludes: A list of directories to exclude from the testsuite. The 'Inputs' Index: clang/test/CodeGenCUDA/kernel-call.hip =================================================================== --- /dev/null +++ clang/test/CodeGenCUDA/kernel-call.hip @@ -0,0 +1,69 @@ +// RUN: %clang_cc1 -fhip-new-launch-api -triple x86_64-unknown-linux-gnu \ +// RUN: -std=c++11 -emit-llvm %s -o - | FileCheck %s + +#include "Inputs/cuda.h" + +struct A { int a[10]; }; + +__global__ void g1(int x) {} +__global__ void g2(A x) {} +__global__ void g3(A &x) {} +template<typename F> __global__ void g4(F f, int *x) { *x = f(); } +void (*pg1)(int x) = g1; + +// CHECK-LABEL: define{{.*}}test1 +void test1() { + // CHECK: call void @_ZN4dim3C1Ejjj(%struct.dim3* {{.*}}, i32 2, i32 1, i32 1) + // CHECK: call void @_ZN4dim3C1Ejjj(%struct.dim3* {{.*}}, i32 3, i32 1, i32 1) + // CHECK: call i32 @hipLaunchKernel({{.*}}@_Z17__device_stub__g1i{{.*}}, i64 0, %struct.hipStream* null) + g1<<<2, 3>>>(0); + + // CHECK: call void @_ZN4dim3C1Ejjj(%struct.dim3* {{.*}}, i32 4, i32 5, i32 6) + // CHECK: call void @_ZN4dim3C1Ejjj(%struct.dim3* {{.*}}, i32 7, i32 8, i32 9) + // CHECK: call i32 @hipLaunchKernel({{.*}}@_Z17__device_stub__g1i{{.*}}, i64 10, {{.*}}inttoptr (i64 11 + g1<<<dim3(4,5,6), dim3(7,8,9), 10, (hipStream_t)11>>>(0); + + // CHECK: %[[LD:.*]] = load void (i32)*, void (i32)** @pg1 + // CHECK: %[[PTR:.*]] = bitcast void (i32)* %[[LD]] to i8* + // CHECK: call i32 @hipLaunchKernel({{.*}}%[[PTR]]{{.*}}, 
i64 0, %struct.hipStream* null) + pg1<<<1, 1>>>(0); +} + +// CHECK-LABEL: define{{.*}}test2 +void test2() { + A a; + // CHECK: %agg.tmp = alloca %struct.A, align 4 + // CHECK: %kernel_args = alloca i8*, i64 1, align 16 + // CHECK: %[[CAST:.*]] = bitcast %struct.A* %agg.tmp to i8* + // CHECK: %[[GEP:.*]] = getelementptr i8*, i8** %kernel_args, i32 0 + // CHECK: store i8* %[[CAST]], i8** %[[GEP]], align 8 + // CHECK: call i32 @hipLaunchKernel({{.*}}@_Z17__device_stub__g21A{{.*}}, i64 0, %struct.hipStream* null) + g2<<<1, 1>>>(a); +} + +// CHECK-LABEL: define{{.*}}test3 +void test3() { + A a; + // CHECK: %a = alloca %struct.A, align 4 + // CHECK: %kernel_arg = alloca %struct.A*, align 8 + // CHECK: %kernel_args = alloca i8*, i64 1, align 16 + // CHECK: store %struct.A* %a, %struct.A** %kernel_arg, align 8 + // CHECK: %[[CAST:.*]] = bitcast %struct.A** %kernel_arg to i8* + // CHECK: %[[GEP:.*]] = getelementptr i8*, i8** %kernel_args, i32 0 + // CHECK: store i8* %[[CAST]], i8** %[[GEP]], align 8 + // CHECK: call i32 @hipLaunchKernel({{.*}}@_Z17__device_stub__g3R1A{{.*}}, i64 0, %struct.hipStream* null) + g3<<<1, 1>>>(a); +} + +// CHECK-LABEL: define{{.*}}test4 +void test4() { + int x = 123; + int y; + // CHECK: %agg.tmp = alloca %class.anon, align 4 + // CHECK: %kernel_args = alloca i8*, i64 2, align 16 + // CHECK: %[[CAST:.*]] = bitcast %class.anon* %agg.tmp to i8* + // CHECK: %[[GEP:.*]] = getelementptr i8*, i8** %kernel_args, i32 0 + // CHECK: store i8* %[[CAST]], i8** %[[GEP]], align 8 + // CHECK: call i32 @hipLaunchKernel({{.*}}@_Z17__device_stub__g4IZ5test4vEUlvE_EvT_Pi{{.*}}, i64 0, %struct.hipStream* null) + g4<<<1, 1>>>([=]() { return x; }, &y); +} Index: clang/test/CodeGenCUDA/kernel-call.cu =================================================================== --- clang/test/CodeGenCUDA/kernel-call.cu +++ clang/test/CodeGenCUDA/kernel-call.cu @@ -1,19 +1,19 @@ // RUN: %clang_cc1 -target-sdk-version=8.0 -emit-llvm %s -o - \ -// RUN: | FileCheck %s 
--check-prefixes=CUDA-OLD,CHECK +// RUN: | FileCheck %s --check-prefixes=CUDA-OLD,CHECK,COMMON // RUN: %clang_cc1 -target-sdk-version=9.2 -emit-llvm %s -o - \ -// RUN: | FileCheck %s --check-prefixes=CUDA-NEW,CHECK +// RUN: | FileCheck %s --check-prefixes=CUDA-NEW,CHECK,COMMON // RUN: %clang_cc1 -x hip -emit-llvm %s -o - \ -// RUN: | FileCheck %s --check-prefixes=HIP-OLD,CHECK +// RUN: | FileCheck %s --check-prefixes=HIP-OLD,CHECK,COMMON // RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \ -// RUN: | FileCheck %s --check-prefixes=HIP-NEW,CHECK +// RUN: | FileCheck %s --check-prefixes=HIP-NEW #include "Inputs/cuda.h" // CHECK-LABEL: define{{.*}}g1 // HIP-OLD: call{{.*}}hipSetupArgument // HIP-OLD: call{{.*}}hipLaunchByPtr -// HIP-NEW: call{{.*}}__hipPopCallConfiguration -// HIP-NEW: call{{.*}}hipLaunchKernel +// HIP-NEW-NOT: call{{.*}}__hipPopCallConfiguration +// HIP-NEW-NOT: call{{.*}}hipLaunchKernel // CUDA-OLD: call{{.*}}cudaSetupArgument // CUDA-OLD: call{{.*}}cudaLaunch // CUDA-NEW: call{{.*}}__cudaPopCallConfiguration @@ -23,11 +23,12 @@ // CHECK-LABEL: define{{.*}}main int main(void) { // HIP-OLD: call{{.*}}hipConfigureCall - // HIP-NEW: call{{.*}}__hipPushCallConfiguration + // HIP-NEW-NOT: call{{.*}}__hipPushCallConfiguration + // HIP-NEW: call{{.*}}hipLaunchKernel // CUDA-OLD: call{{.*}}cudaConfigureCall // CUDA-NEW: call{{.*}}__cudaPushCallConfiguration - // CHECK: icmp - // CHECK: br - // CHECK: call{{.*}}g1 + // COMMON: icmp + // COMMON: br + // COMMON: call{{.*}}g1 g1<<<1, 1>>>(42); } Index: clang/lib/CodeGen/CGCUDANV.cpp =================================================================== --- clang/lib/CodeGen/CGCUDANV.cpp +++ clang/lib/CodeGen/CGCUDANV.cpp @@ -116,7 +116,9 @@ } void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); - void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); + void emitDeviceStubBodyNew(CodeGenFunction &CGF, llvm::Value *Kernel, + const Address &KernelArgs, + Expr 
const *const *ConfigArgs = nullptr); std::string getDeviceSideName(const NamedDecl *ND) override; public: @@ -149,6 +151,13 @@ llvm::Function *makeModuleCtorFunction() override; /// Creates module destructor function llvm::Function *makeModuleDtorFunction() override; + + RValue EmitCUDAKernelCallExpr(CodeGenFunction &CGF, + const CUDAKernelCallExpr *E, + ReturnValueSlot ReturnValue) override; + + Address createTempVarForKernelArgs(CodeGenFunction &CGF, + FunctionArgList &Args); }; } @@ -241,19 +250,17 @@ FunctionArgList &Args) { EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), - CudaFeature::CUDA_USES_NEW_LAUNCH) || - (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI)) - emitDeviceStubBodyNew(CGF, Args); - else + CudaFeature::CUDA_USES_NEW_LAUNCH)) + emitDeviceStubBodyNew(CGF, CGF.CurFn, + createTempVarForKernelArgs(CGF, Args)); + else if (!CGF.getLangOpts().HIP || !CGF.getLangOpts().HIPUseNewLaunchAPI) emitDeviceStubBodyLegacy(CGF, Args); } -// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local -// array and kernels are launched using cudaLaunchKernel(). -void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, - FunctionArgList &Args) { - // Build the shadow stack entry at the very start of the function. - +// Create a temporary array to hold all kernel arguments for kernel stub. +// \p Args is the kernel argument list of the kernel stub. +Address CGNVCUDARuntime::createTempVarForKernelArgs(CodeGenFunction &CGF, + FunctionArgList &Args) { // Calculate amount of space we will need for all arguments. If we have no // args, allocate a single pointer so we still have a valid pointer to the // argument array that we can pass to runtime, even if it will be unused. 
@@ -267,8 +274,19 @@ CGF.Builder.CreateDefaultAlignedStore( VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i)); } + return KernelArgs; +} - llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); +// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local +// array and kernels are launched using cudaLaunchKernel(). +void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, + llvm::Value *Kernel, + const Address &KernelArgs, + Expr const *const *ConfigArgs) { + // Build the shadow stack entry at the very start of the function. + llvm::BasicBlock *EndBlock = nullptr; + if (!CGF.getLangOpts().HIP) + EndBlock = CGF.createBasicBlock("setup.end"); // Lookup cudaLaunchKernel/hipLaunchKernel function. // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, @@ -296,39 +314,49 @@ // Create temporary dim3 grid_dim, block_dim. ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1); QualType Dim3Ty = GridDimParam->getType(); - Address GridDim = - CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim"); - Address BlockDim = - CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim"); - Address ShmemSize = - CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size"); - Address Stream = - CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream"); - llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction( - llvm::FunctionType::get(IntTy, - {/*gridDim=*/GridDim.getType(), - /*blockDim=*/BlockDim.getType(), - /*ShmemSize=*/ShmemSize.getType(), - /*Stream=*/Stream.getType()}, - /*isVarArg=*/false), - addUnderscoredPrefixToName("PopCallConfiguration")); - - CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, - {GridDim.getPointer(), BlockDim.getPointer(), - ShmemSize.getPointer(), Stream.getPointer()}); + RValue ConfigArgRVals[4]; + if (!CGF.getLangOpts().HIP) { + Address GridDim = + CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim"); + Address BlockDim 
= + CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim"); + Address ShmemSize = + CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size"); + Address Stream = + CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream"); + llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction( + llvm::FunctionType::get(IntTy, + {/*gridDim=*/GridDim.getType(), + /*blockDim=*/BlockDim.getType(), + /*ShmemSize=*/ShmemSize.getType(), + /*Stream=*/Stream.getType()}, + /*isVarArg=*/false), + addUnderscoredPrefixToName("PopCallConfiguration")); + CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, + {GridDim.getPointer(), BlockDim.getPointer(), + ShmemSize.getPointer(), Stream.getPointer()}); + ConfigArgRVals[0] = RValue::getAggregate(GridDim); + ConfigArgRVals[1] = RValue::getAggregate(BlockDim); + ConfigArgRVals[2] = RValue::get(CGF.Builder.CreateLoad(ShmemSize)); + ConfigArgRVals[3] = RValue::get(CGF.Builder.CreateLoad(Stream)); + } else { + assert(ConfigArgs); + for (unsigned I = 0; I < 4; ++I) + ConfigArgRVals[I] = CGF.EmitAnyExprToTemp(ConfigArgs[I]); + } // Emit the call to cudaLaunch - llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy); CallArgList LaunchKernelArgs; - LaunchKernelArgs.add(RValue::get(Kernel), - cudaLaunchKernelFD->getParamDecl(0)->getType()); - LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty); - LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty); + LaunchKernelArgs.add( + RValue::get(CGF.Builder.CreatePointerCast(Kernel, VoidPtrTy)), + cudaLaunchKernelFD->getParamDecl(0)->getType()); + LaunchKernelArgs.add(ConfigArgRVals[0], Dim3Ty); + LaunchKernelArgs.add(ConfigArgRVals[1], Dim3Ty); LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()), cudaLaunchKernelFD->getParamDecl(3)->getType()); - LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)), + LaunchKernelArgs.add(ConfigArgRVals[2], cudaLaunchKernelFD->getParamDecl(4)->getType()); - 
LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)), + LaunchKernelArgs.add(ConfigArgRVals[3], cudaLaunchKernelFD->getParamDecl(5)->getType()); QualType QT = cudaLaunchKernelFD->getType(); @@ -342,9 +370,12 @@ CGM.CreateRuntimeFunction(FTy, LaunchKernelName); CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(), LaunchKernelArgs); - CGF.EmitBranch(EndBlock); - CGF.EmitBlock(EndBlock); + if (!CGF.getLangOpts().HIP) { + assert(EndBlock); + CGF.EmitBranch(EndBlock); + CGF.EmitBlock(EndBlock); + } } void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF, @@ -846,3 +877,50 @@ CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { return new CGNVCUDARuntime(CGM); } + +RValue CGNVCUDARuntime::EmitCUDAKernelCallExpr(CodeGenFunction &CGF, + const CUDAKernelCallExpr *E, + ReturnValueSlot ReturnValue) { + if (!CGF.CGM.getLangOpts().HIP || !CGF.getLangOpts().HIPUseNewLaunchAPI) + return CGCUDARuntime::EmitCUDAKernelCallExpr(CGF, E, ReturnValue); + + CGCallee Callee = CGF.EmitCallee(E->getCallee()); + auto FnType = E->getCallee() + ->getType() + ->getAs<PointerType>() + ->getPointeeType() + ->getAs<FunctionProtoType>(); + CallArgList Args; + CGF.EmitCallArgs(Args, FnType, E->arguments()); + + Address KernelArgs = CGF.CreateTempAlloca( + VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args", + llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size()))); + for (unsigned I = 0; I < Args.size(); ++I) { + auto RV = Args[I].getRValue(CGF); + llvm::Value *VoidArgPtr; + if (RV.isScalar()) { + llvm::Value *Arg = RV.getScalarVal(); + auto Ty = Arg->getType(); + Address ArgPtr = CGF.CreateTempAlloca( + Ty, + CharUnits::fromQuantity( + CGF.CGM.getDataLayout().getPrefTypeAlignment(Ty)), + "kernel_arg"); + CGF.Builder.CreateDefaultAlignedStore(Arg, ArgPtr.getPointer()); + VoidArgPtr = + CGF.Builder.CreatePointerCast(ArgPtr.getPointer(), VoidPtrTy); + } else { + Address ArgPtr = RV.getAggregateAddress(); + VoidArgPtr = + 
CGF.Builder.CreatePointerCast(ArgPtr.getPointer(), VoidPtrTy); + } + CGF.Builder.CreateDefaultAlignedStore( + VoidArgPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), I)); + } + + emitDeviceStubBodyNew(CGF, Callee.getFunctionPointer(), KernelArgs, + E->getConfig()->getArgs()); + + return RValue::get(nullptr); +}
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits