yaxunl created this revision. yaxunl added reviewers: tra, rjmccall. yaxunl requested review of this revision.
Currently clang emits the following code for a triple chevron kernel call for HIP: __hipPushCallConfiguration(grids, blocks, shmem, stream); kernel_stub(); whereas for each kernel, clang emits a kernel_stub: void kernel_stub() { __hipPopCallConfiguration(&grids, &blocks, &shmem, &stream); hipLaunchKernel(kernel_stub, grids, blocks, kernel_args, shmem, stream); } This is really unnecessary. In host code, a kernel function is not really a "function" since you cannot "call" it in the generated IR, you can only launch it through the kernel launching API. This patch simplifies the generated code for kernel launching by eliminating the calls of `__hipPushCallConfiguration` and `__hipPopCallConfiguration`. For each triple chevron, a call of `hipLaunchKernel` is directly emitted. The kernel stub function is still emitted as an empty function, for the sole purpose of serving as a shadow symbol that maps to the device symbol in the device binary so that the runtime can use it to find the device symbol. This patch does not change the AST for kernel calls since semantically a triple chevron is like a function call. Keeping it as a function call facilitates overload resolution and function argument type checking. This patch only changes kernel launching codegen for HIP for the new kernel launching API since we are sure there is no other side effect in `__hipPushCallConfiguration` and `__hipPopCallConfiguration`. https://reviews.llvm.org/D86376 Files: clang/lib/CodeGen/CGCUDANV.cpp clang/test/CodeGenCUDA/kernel-call.cu clang/test/CodeGenCUDA/kernel-call.hip clang/test/lit.cfg.py
Index: clang/test/lit.cfg.py =================================================================== --- clang/test/lit.cfg.py +++ clang/test/lit.cfg.py @@ -25,7 +25,7 @@ config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell) # suffixes: A list of file extensions to treat as test files. -config.suffixes = ['.c', '.cpp', '.i', '.cppm', '.m', '.mm', '.cu', +config.suffixes = ['.c', '.cpp', '.i', '.cppm', '.m', '.mm', '.cu', '.hip', '.ll', '.cl', '.s', '.S', '.modulemap', '.test', '.rs', '.ifs'] # excludes: A list of directories to exclude from the testsuite. The 'Inputs' Index: clang/test/CodeGenCUDA/kernel-call.hip =================================================================== --- /dev/null +++ clang/test/CodeGenCUDA/kernel-call.hip @@ -0,0 +1,69 @@ +// RUN: %clang_cc1 -fhip-new-launch-api -triple x86_64-unknown-linux-gnu \ +// RUN: -std=c++11 -emit-llvm %s -o - | FileCheck %s + +#include "Inputs/cuda.h" + +struct A { int a[10]; }; + +__global__ void g1(int x) {} +__global__ void g2(A x) {} +__global__ void g3(A &x) {} +template<typename F> __global__ void g4(F f, int *x) { *x = f(); } +void (*pg1)(int x) = g1; + +// CHECK-LABEL: define{{.*}}test1 +void test1() { + // CHECK: call void @_ZN4dim3C1Ejjj(%struct.dim3* {{.*}}, i32 2, i32 1, i32 1) + // CHECK: call void @_ZN4dim3C1Ejjj(%struct.dim3* {{.*}}, i32 3, i32 1, i32 1) + // CHECK: call i32 @hipLaunchKernel({{.*}}@_Z17__device_stub__g1i{{.*}}, i64 0, %struct.hipStream* null) + g1<<<2, 3>>>(0); + + // CHECK: call void @_ZN4dim3C1Ejjj(%struct.dim3* {{.*}}, i32 4, i32 5, i32 6) + // CHECK: call void @_ZN4dim3C1Ejjj(%struct.dim3* {{.*}}, i32 7, i32 8, i32 9) + // CHECK: call i32 @hipLaunchKernel({{.*}}@_Z17__device_stub__g1i{{.*}}, i64 10, {{.*}}inttoptr (i64 11 + g1<<<dim3(4,5,6), dim3(7,8,9), 10, (hipStream_t)11>>>(0); + + // CHECK: %[[LD:.*]] = load void (i32)*, void (i32)** @pg1 + // CHECK: %[[PTR:.*]] = bitcast void (i32)* %[[LD]] to i8* + // CHECK: call i32 @hipLaunchKernel({{.*}}%[[PTR]]{{.*}}, 
i64 0, %struct.hipStream* null) + pg1<<<1, 1>>>(0); +} + +// CHECK-LABEL: define{{.*}}test2 +void test2() { + A a; + // CHECK: %agg.tmp = alloca %struct.A, align 4 + // CHECK: %kernel_args = alloca i8*, i64 1, align 16 + // CHECK: %[[CAST:.*]] = bitcast %struct.A* %agg.tmp to i8* + // CHECK: %[[GEP:.*]] = getelementptr i8*, i8** %kernel_args, i32 0 + // CHECK: store i8* %[[CAST]], i8** %[[GEP]], align 8 + // CHECK: call i32 @hipLaunchKernel({{.*}}@_Z17__device_stub__g21A{{.*}}, i64 0, %struct.hipStream* null) + g2<<<1, 1>>>(a); +} + +// CHECK-LABEL: define{{.*}}test3 +void test3() { + A a; + // CHECK: %a = alloca %struct.A, align 4 + // CHECK: %kernel_arg = alloca %struct.A*, align 8 + // CHECK: %kernel_args = alloca i8*, i64 1, align 16 + // CHECK: store %struct.A* %a, %struct.A** %kernel_arg, align 8 + // CHECK: %[[CAST:.*]] = bitcast %struct.A** %kernel_arg to i8* + // CHECK: %[[GEP:.*]] = getelementptr i8*, i8** %kernel_args, i32 0 + // CHECK: store i8* %[[CAST]], i8** %[[GEP]], align 8 + // CHECK: call i32 @hipLaunchKernel({{.*}}@_Z17__device_stub__g3R1A{{.*}}, i64 0, %struct.hipStream* null) + g3<<<1, 1>>>(a); +} + +// CHECK-LABEL: define{{.*}}test4 +void test4() { + int x = 123; + int y; + // CHECK: %agg.tmp = alloca %class.anon, align 4 + // CHECK: %kernel_args = alloca i8*, i64 2, align 16 + // CHECK: %[[CAST:.*]] = bitcast %class.anon* %agg.tmp to i8* + // CHECK: %[[GEP:.*]] = getelementptr i8*, i8** %kernel_args, i32 0 + // CHECK: store i8* %[[CAST]], i8** %[[GEP]], align 8 + // CHECK: call i32 @hipLaunchKernel({{.*}}@_Z17__device_stub__g4IZ5test4vEUlvE_EvT_Pi{{.*}}, i64 0, %struct.hipStream* null) + g4<<<1, 1>>>([=]() { return x; }, &y); +} Index: clang/test/CodeGenCUDA/kernel-call.cu =================================================================== --- clang/test/CodeGenCUDA/kernel-call.cu +++ clang/test/CodeGenCUDA/kernel-call.cu @@ -1,19 +1,19 @@ // RUN: %clang_cc1 -target-sdk-version=8.0 -emit-llvm %s -o - \ -// RUN: | FileCheck %s 
--check-prefixes=CUDA-OLD,CHECK +// RUN: | FileCheck %s --check-prefixes=CUDA-OLD,CHECK,COMMON // RUN: %clang_cc1 -target-sdk-version=9.2 -emit-llvm %s -o - \ -// RUN: | FileCheck %s --check-prefixes=CUDA-NEW,CHECK +// RUN: | FileCheck %s --check-prefixes=CUDA-NEW,CHECK,COMMON // RUN: %clang_cc1 -x hip -emit-llvm %s -o - \ -// RUN: | FileCheck %s --check-prefixes=HIP-OLD,CHECK +// RUN: | FileCheck %s --check-prefixes=HIP-OLD,CHECK,COMMON // RUN: %clang_cc1 -fhip-new-launch-api -x hip -emit-llvm %s -o - \ -// RUN: | FileCheck %s --check-prefixes=HIP-NEW,CHECK +// RUN: | FileCheck %s --check-prefixes=HIP-NEW #include "Inputs/cuda.h" // CHECK-LABEL: define{{.*}}g1 // HIP-OLD: call{{.*}}hipSetupArgument // HIP-OLD: call{{.*}}hipLaunchByPtr -// HIP-NEW: call{{.*}}__hipPopCallConfiguration -// HIP-NEW: call{{.*}}hipLaunchKernel +// HIP-NEW-NOT: call{{.*}}__hipPopCallConfiguration +// HIP-NEW-NOT: call{{.*}}hipLaunchKernel // CUDA-OLD: call{{.*}}cudaSetupArgument // CUDA-OLD: call{{.*}}cudaLaunch // CUDA-NEW: call{{.*}}__cudaPopCallConfiguration @@ -23,11 +23,12 @@ // CHECK-LABEL: define{{.*}}main int main(void) { // HIP-OLD: call{{.*}}hipConfigureCall - // HIP-NEW: call{{.*}}__hipPushCallConfiguration + // HIP-NEW-NOT: call{{.*}}__hipPushCallConfiguration + // HIP-NEW: call{{.*}}hipLaunchKernel // CUDA-OLD: call{{.*}}cudaConfigureCall // CUDA-NEW: call{{.*}}__cudaPushCallConfiguration - // CHECK: icmp - // CHECK: br - // CHECK: call{{.*}}g1 + // COMMON: icmp + // COMMON: br + // COMMON: call{{.*}}g1 g1<<<1, 1>>>(42); } Index: clang/lib/CodeGen/CGCUDANV.cpp =================================================================== --- clang/lib/CodeGen/CGCUDANV.cpp +++ clang/lib/CodeGen/CGCUDANV.cpp @@ -116,7 +116,9 @@ } void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); - void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); + void emitDeviceStubBodyNew(CodeGenFunction &CGF, llvm::Value *Kernel, + const Address &KernelArgs, + Expr 
const *const *ConfigArgs = nullptr); std::string getDeviceSideName(const NamedDecl *ND) override; public: @@ -149,6 +151,13 @@ llvm::Function *makeModuleCtorFunction() override; /// Creates module destructor function llvm::Function *makeModuleDtorFunction() override; + + RValue EmitCUDAKernelCallExpr(CodeGenFunction &CGF, + const CUDAKernelCallExpr *E, + ReturnValueSlot ReturnValue) override; + + Address createTempVarForKernelArgs(CodeGenFunction &CGF, + FunctionArgList &Args); }; } @@ -241,19 +250,17 @@ FunctionArgList &Args) { EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), - CudaFeature::CUDA_USES_NEW_LAUNCH) || - (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI)) - emitDeviceStubBodyNew(CGF, Args); - else + CudaFeature::CUDA_USES_NEW_LAUNCH)) + emitDeviceStubBodyNew(CGF, CGF.CurFn, + createTempVarForKernelArgs(CGF, Args)); + else if (!CGF.getLangOpts().HIP || !CGF.getLangOpts().HIPUseNewLaunchAPI) emitDeviceStubBodyLegacy(CGF, Args); } -// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local -// array and kernels are launched using cudaLaunchKernel(). -void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, - FunctionArgList &Args) { - // Build the shadow stack entry at the very start of the function. - +// Create a temporary array to hold all kernel arguments for kernel stub. +// \p Args is the kernel argument list of the kernel stub. +Address CGNVCUDARuntime::createTempVarForKernelArgs(CodeGenFunction &CGF, + FunctionArgList &Args) { // Calculate amount of space we will need for all arguments. If we have no // args, allocate a single pointer so we still have a valid pointer to the // argument array that we can pass to runtime, even if it will be unused. 
@@ -267,8 +274,19 @@ CGF.Builder.CreateDefaultAlignedStore( VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i)); } + return KernelArgs; +} - llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); +// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local +// array and kernels are launched using cudaLaunchKernel(). +void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, + llvm::Value *Kernel, + const Address &KernelArgs, + Expr const *const *ConfigArgs) { + // Build the shadow stack entry at the very start of the function. + llvm::BasicBlock *EndBlock = nullptr; + if (!CGF.getLangOpts().HIP) + EndBlock = CGF.createBasicBlock("setup.end"); // Lookup cudaLaunchKernel/hipLaunchKernel function. // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, @@ -296,39 +314,49 @@ // Create temporary dim3 grid_dim, block_dim. ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1); QualType Dim3Ty = GridDimParam->getType(); - Address GridDim = - CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim"); - Address BlockDim = - CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim"); - Address ShmemSize = - CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size"); - Address Stream = - CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream"); - llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction( - llvm::FunctionType::get(IntTy, - {/*gridDim=*/GridDim.getType(), - /*blockDim=*/BlockDim.getType(), - /*ShmemSize=*/ShmemSize.getType(), - /*Stream=*/Stream.getType()}, - /*isVarArg=*/false), - addUnderscoredPrefixToName("PopCallConfiguration")); - - CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, - {GridDim.getPointer(), BlockDim.getPointer(), - ShmemSize.getPointer(), Stream.getPointer()}); + RValue ConfigArgRVals[4]; + if (!CGF.getLangOpts().HIP) { + Address GridDim = + CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim"); + Address BlockDim 
= + CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim"); + Address ShmemSize = + CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size"); + Address Stream = + CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream"); + llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction( + llvm::FunctionType::get(IntTy, + {/*gridDim=*/GridDim.getType(), + /*blockDim=*/BlockDim.getType(), + /*ShmemSize=*/ShmemSize.getType(), + /*Stream=*/Stream.getType()}, + /*isVarArg=*/false), + addUnderscoredPrefixToName("PopCallConfiguration")); + CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, + {GridDim.getPointer(), BlockDim.getPointer(), + ShmemSize.getPointer(), Stream.getPointer()}); + ConfigArgRVals[0] = RValue::getAggregate(GridDim); + ConfigArgRVals[1] = RValue::getAggregate(BlockDim); + ConfigArgRVals[2] = RValue::get(CGF.Builder.CreateLoad(ShmemSize)); + ConfigArgRVals[3] = RValue::get(CGF.Builder.CreateLoad(Stream)); + } else { + assert(ConfigArgs); + for (unsigned I = 0; I < 4; ++I) + ConfigArgRVals[I] = CGF.EmitAnyExprToTemp(ConfigArgs[I]); + } // Emit the call to cudaLaunch - llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy); CallArgList LaunchKernelArgs; - LaunchKernelArgs.add(RValue::get(Kernel), - cudaLaunchKernelFD->getParamDecl(0)->getType()); - LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty); - LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty); + LaunchKernelArgs.add( + RValue::get(CGF.Builder.CreatePointerCast(Kernel, VoidPtrTy)), + cudaLaunchKernelFD->getParamDecl(0)->getType()); + LaunchKernelArgs.add(ConfigArgRVals[0], Dim3Ty); + LaunchKernelArgs.add(ConfigArgRVals[1], Dim3Ty); LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()), cudaLaunchKernelFD->getParamDecl(3)->getType()); - LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)), + LaunchKernelArgs.add(ConfigArgRVals[2], cudaLaunchKernelFD->getParamDecl(4)->getType()); - 
LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)), + LaunchKernelArgs.add(ConfigArgRVals[3], cudaLaunchKernelFD->getParamDecl(5)->getType()); QualType QT = cudaLaunchKernelFD->getType(); @@ -342,9 +370,12 @@ CGM.CreateRuntimeFunction(FTy, LaunchKernelName); CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(), LaunchKernelArgs); - CGF.EmitBranch(EndBlock); - CGF.EmitBlock(EndBlock); + if (!CGF.getLangOpts().HIP) { + assert(EndBlock); + CGF.EmitBranch(EndBlock); + CGF.EmitBlock(EndBlock); + } } void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF, @@ -846,3 +877,50 @@ CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { return new CGNVCUDARuntime(CGM); } + +RValue CGNVCUDARuntime::EmitCUDAKernelCallExpr(CodeGenFunction &CGF, + const CUDAKernelCallExpr *E, + ReturnValueSlot ReturnValue) { + if (!CGF.CGM.getLangOpts().HIP || !CGF.getLangOpts().HIPUseNewLaunchAPI) + return CGCUDARuntime::EmitCUDAKernelCallExpr(CGF, E, ReturnValue); + + CGCallee Callee = CGF.EmitCallee(E->getCallee()); + auto FnType = E->getCallee() + ->getType() + ->getAs<PointerType>() + ->getPointeeType() + ->getAs<FunctionProtoType>(); + CallArgList Args; + CGF.EmitCallArgs(Args, FnType, E->arguments()); + + Address KernelArgs = CGF.CreateTempAlloca( + VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args", + llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size()))); + for (unsigned I = 0; I < Args.size(); ++I) { + auto RV = Args[I].getRValue(CGF); + llvm::Value *VoidArgPtr; + if (RV.isScalar()) { + llvm::Value *Arg = RV.getScalarVal(); + auto Ty = Arg->getType(); + Address ArgPtr = CGF.CreateTempAlloca( + Ty, + CharUnits::fromQuantity( + CGF.CGM.getDataLayout().getPrefTypeAlignment(Ty)), + "kernel_arg"); + CGF.Builder.CreateDefaultAlignedStore(Arg, ArgPtr.getPointer()); + VoidArgPtr = + CGF.Builder.CreatePointerCast(ArgPtr.getPointer(), VoidPtrTy); + } else { + Address ArgPtr = RV.getAggregateAddress(); + VoidArgPtr = + 
CGF.Builder.CreatePointerCast(ArgPtr.getPointer(), VoidPtrTy); + } + CGF.Builder.CreateDefaultAlignedStore( + VoidArgPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), I)); + } + + emitDeviceStubBodyNew(CGF, Callee.getFunctionPointer(), KernelArgs, + E->getConfig()->getArgs()); + + return RValue::get(nullptr); +}
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits