https://github.com/lalaniket8 updated https://github.com/llvm/llvm-project/pull/115821
>From 131a8180bc66f7f95e765d98e7d66ce30f03bd40 Mon Sep 17 00:00:00 2001 From: anikelal <anike...@amd.com> Date: Tue, 12 Nov 2024 11:30:45 +0530 Subject: [PATCH] [Clang][AMDGPU] Emit stub version of OpenCL Kernel OpenCL allows a kernel function to call another kernel function. To facilitate this, we emit a stub version of each kernel function with a different name mangling scheme, and replace the kernel callsite appropriately. This commit fixes https://github.com/llvm/llvm-project/issues/60313 (see also https://ontrack-internal.amd.com/browse/SWDEV-245936). D120566 was an earlier effort to upstream a fix for this issue. --- clang/include/clang/AST/GlobalDecl.h | 37 +++++++++++++++++-------- clang/include/clang/AST/Type.h | 5 ++++ clang/lib/AST/Expr.cpp | 3 +- clang/lib/AST/ItaniumMangle.cpp | 14 ++++++++++ clang/lib/AST/Mangle.cpp | 2 +- clang/lib/AST/MicrosoftMangle.cpp | 11 ++++++++ clang/lib/CodeGen/CGCall.cpp | 27 +++++++++++++++--- clang/lib/CodeGen/CGExpr.cpp | 40 +++++++++++++++++++++++++++ clang/lib/CodeGen/CGOpenCLRuntime.cpp | 5 +++- clang/lib/CodeGen/CodeGenModule.cpp | 15 +++++++++- clang/lib/CodeGen/CodeGenTypes.h | 4 ++- clang/lib/CodeGen/TargetInfo.h | 2 ++ clang/lib/CodeGen/Targets/AMDGPU.cpp | 8 ++++++ 13 files changed, 153 insertions(+), 20 deletions(-) diff --git a/clang/include/clang/AST/GlobalDecl.h b/clang/include/clang/AST/GlobalDecl.h index 386693cabb1fbb..39d779ab7aad31 100644 --- a/clang/include/clang/AST/GlobalDecl.h +++ b/clang/include/clang/AST/GlobalDecl.h @@ -71,6 +71,10 @@ class GlobalDecl { GlobalDecl(const FunctionDecl *D, unsigned MVIndex = 0) : MultiVersionIndex(MVIndex) { if (!D->hasAttr<CUDAGlobalAttr>()) { + if (D->hasAttr<OpenCLKernelAttr>()) { + Value.setPointerAndInt(D, unsigned(KernelReferenceKind::Kernel)); + return; + } Init(D); return; } @@ -78,7 +82,8 @@ class GlobalDecl { } GlobalDecl(const FunctionDecl *D, KernelReferenceKind Kind) : Value(D, unsigned(Kind)) { - assert(D->hasAttr<CUDAGlobalAttr>() && "Decl is not a GPU kernel!"); + 
assert((D->hasAttr<CUDAGlobalAttr>() && "Decl is not a GPU kernel!") || + (D->hasAttr<OpenCLKernelAttr>() && "Decl is not a OpenCL kernel!")); } GlobalDecl(const NamedDecl *D) { Init(D); } GlobalDecl(const BlockDecl *D) { Init(D); } @@ -130,13 +135,20 @@ class GlobalDecl { } KernelReferenceKind getKernelReferenceKind() const { - assert(((isa<FunctionDecl>(getDecl()) && - cast<FunctionDecl>(getDecl())->hasAttr<CUDAGlobalAttr>()) || - (isa<FunctionTemplateDecl>(getDecl()) && - cast<FunctionTemplateDecl>(getDecl()) - ->getTemplatedDecl() - ->hasAttr<CUDAGlobalAttr>())) && - "Decl is not a GPU kernel!"); + assert((((isa<FunctionDecl>(getDecl()) && + cast<FunctionDecl>(getDecl())->hasAttr<CUDAGlobalAttr>()) || + (isa<FunctionTemplateDecl>(getDecl()) && + cast<FunctionTemplateDecl>(getDecl()) + ->getTemplatedDecl() + ->hasAttr<CUDAGlobalAttr>())) && + "Decl is not a GPU kernel!") || + (((isa<FunctionDecl>(getDecl()) && + cast<FunctionDecl>(getDecl())->hasAttr<OpenCLKernelAttr>()) || + (isa<FunctionTemplateDecl>(getDecl()) && + cast<FunctionTemplateDecl>(getDecl()) + ->getTemplatedDecl() + ->hasAttr<OpenCLKernelAttr>())) && + "Decl is not a OpenCL kernel!")); return static_cast<KernelReferenceKind>(Value.getInt()); } @@ -196,9 +208,12 @@ class GlobalDecl { } GlobalDecl getWithKernelReferenceKind(KernelReferenceKind Kind) { - assert(isa<FunctionDecl>(getDecl()) && - cast<FunctionDecl>(getDecl())->hasAttr<CUDAGlobalAttr>() && - "Decl is not a GPU kernel!"); + assert((isa<FunctionDecl>(getDecl()) && + cast<FunctionDecl>(getDecl())->hasAttr<CUDAGlobalAttr>() && + "Decl is not a GPU kernel!") || + (isa<FunctionDecl>(getDecl()) && + cast<FunctionDecl>(getDecl())->hasAttr<OpenCLKernelAttr>() && + "Decl is not a OpenCL kernel!")); GlobalDecl Result(*this); Result.Value.setInt(unsigned(Kind)); return Result; diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 8979129017163b..69372b99fd6280 100644 --- a/clang/include/clang/AST/Type.h +++ 
b/clang/include/clang/AST/Type.h @@ -4668,6 +4668,11 @@ class FunctionType : public Type { CallingConv getCallConv() const { return getExtInfo().getCC(); } ExtInfo getExtInfo() const { return ExtInfo(FunctionTypeBits.ExtInfo); } + void setCC(unsigned cc) { + FunctionTypeBits.ExtInfo = + (FunctionTypeBits.ExtInfo & ~ExtInfo::CallConvMask) | cc; + } + static_assert((~Qualifiers::FastMask & Qualifiers::CVRMask) == 0, "Const, volatile and restrict are assumed to be a subset of " "the fast qualifiers."); diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index bf2c1b92fa6b49..84e6eefc4a8a96 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -694,7 +694,8 @@ std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK, GD = GlobalDecl(CD, Ctor_Base); else if (const CXXDestructorDecl *DD = dyn_cast<CXXDestructorDecl>(ND)) GD = GlobalDecl(DD, Dtor_Base); - else if (ND->hasAttr<CUDAGlobalAttr>()) + else if (ND->hasAttr<CUDAGlobalAttr>() || + ND->hasAttr<OpenCLKernelAttr>()) GD = GlobalDecl(cast<FunctionDecl>(ND)); else GD = GlobalDecl(ND); diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 14bc260d0245fb..9a57f7f4c42a48 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -528,6 +528,7 @@ class CXXNameMangler { void mangleSourceName(const IdentifierInfo *II); void mangleRegCallName(const IdentifierInfo *II); void mangleDeviceStubName(const IdentifierInfo *II); + void mangleOCLDeviceStubName(const IdentifierInfo *II); void mangleSourceNameWithAbiTags( const NamedDecl *ND, const AbiTagList *AdditionalAbiTags = nullptr); void mangleLocalName(GlobalDecl GD, @@ -1563,8 +1564,13 @@ void CXXNameMangler::mangleUnqualifiedName( bool IsDeviceStub = FD && FD->hasAttr<CUDAGlobalAttr>() && GD.getKernelReferenceKind() == KernelReferenceKind::Stub; + bool IsOCLDeviceStub = + FD && FD->hasAttr<OpenCLKernelAttr>() && + GD.getKernelReferenceKind() == KernelReferenceKind::Stub; if (IsDeviceStub) 
mangleDeviceStubName(II); + else if (IsOCLDeviceStub) + mangleOCLDeviceStubName(II); else if (IsRegCall) mangleRegCallName(II); else @@ -1782,6 +1788,14 @@ void CXXNameMangler::mangleDeviceStubName(const IdentifierInfo *II) { << II->getName(); } +void CXXNameMangler::mangleOCLDeviceStubName(const IdentifierInfo *II) { + // <source-name> ::= <positive length number> __clang_ocl_kern_imp_ + // <identifier> <number> ::= [n] <non-negative decimal integer> <identifier> + // ::= <unqualified source code identifier> + Out << II->getLength() + sizeof("__clang_ocl_kern_imp_") - 1 + << "__clang_ocl_kern_imp_" << II->getName(); +} + void CXXNameMangler::mangleSourceName(const IdentifierInfo *II) { // <source-name> ::= <positive length number> <identifier> // <number> ::= [n] <non-negative decimal integer> diff --git a/clang/lib/AST/Mangle.cpp b/clang/lib/AST/Mangle.cpp index 4875e8537b3c11..2e3b2a684dd2d7 100644 --- a/clang/lib/AST/Mangle.cpp +++ b/clang/lib/AST/Mangle.cpp @@ -540,7 +540,7 @@ class ASTNameGenerator::Implementation { GD = GlobalDecl(CtorD, Ctor_Complete); else if (const auto *DtorD = dyn_cast<CXXDestructorDecl>(D)) GD = GlobalDecl(DtorD, Dtor_Complete); - else if (D->hasAttr<CUDAGlobalAttr>()) + else if (D->hasAttr<CUDAGlobalAttr>() || D->hasAttr<OpenCLKernelAttr>()) GD = GlobalDecl(cast<FunctionDecl>(D)); else GD = GlobalDecl(D); diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index dbc161347025c0..6de8c531232e0d 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -1163,9 +1163,20 @@ void MicrosoftCXXNameMangler::mangleUnqualifiedName(GlobalDecl GD, ->getTemplatedDecl() ->hasAttr<CUDAGlobalAttr>())) && GD.getKernelReferenceKind() == KernelReferenceKind::Stub; + bool IsOCLDeviceStub = + ND && + ((isa<FunctionDecl>(ND) && ND->hasAttr<OpenCLKernelAttr>()) || + (isa<FunctionTemplateDecl>(ND) && + cast<FunctionTemplateDecl>(ND) + ->getTemplatedDecl() + ->hasAttr<OpenCLKernelAttr>())) && + 
GD.getKernelReferenceKind() == KernelReferenceKind::Stub; if (IsDeviceStub) mangleSourceName( (llvm::Twine("__device_stub__") + II->getName()).str()); + else if (IsOCLDeviceStub) + mangleSourceName( + (llvm::Twine("__clang_ocl_kern_imp_") + II->getName()).str()); else mangleSourceName(II->getName()); break; diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 8f4f5d3ed81601..59c467cc99d6f1 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -300,6 +300,16 @@ static void setCUDAKernelCallingConvention(CanQualType &FTy, CodeGenModule &CGM, } } +static void setOCLKernelStubCallingConvention(CanQualType &FTy, + CodeGenModule &CGM, + const FunctionDecl *FD) { + if (FD->hasAttr<OpenCLKernelAttr>()) { + const FunctionType *FT = FTy->getAs<FunctionType>(); + CGM.getTargetCodeGenInfo().setOCLKernelStubCallingConvention(FT); + FTy = FT->getCanonicalTypeUnqualified(); + } +} + /// Arrange the argument and result information for a declaration or /// definition of the given C++ non-static member function. The /// member function must be an ordinary function, i.e. not a @@ -460,15 +470,19 @@ CodeGenTypes::arrangeCXXConstructorCall(const CallArgList &args, /// Arrange the argument and result information for the declaration or /// definition of the given function. const CGFunctionInfo & -CodeGenTypes::arrangeFunctionDeclaration(const FunctionDecl *FD) { +CodeGenTypes::arrangeFunctionDeclaration(const FunctionDecl *FD, + CanQualType *FTy_ptr /* = nullptr*/) { if (const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(FD)) if (MD->isImplicitObjectMemberFunction()) return arrangeCXXMethodDeclaration(MD); - CanQualType FTy = FD->getType()->getCanonicalTypeUnqualified(); + CanQualType FTy = FTy_ptr == nullptr + ? 
FD->getType()->getCanonicalTypeUnqualified() + : *FTy_ptr; assert(isa<FunctionType>(FTy)); - setCUDAKernelCallingConvention(FTy, CGM, FD); + if (!FD->getLangOpts().OpenCL) + setCUDAKernelCallingConvention(FTy, CGM, FD); // When declaring a function without a prototype, always use a // non-variadic type. @@ -548,7 +562,12 @@ CodeGenTypes::arrangeGlobalDeclaration(GlobalDecl GD) { isa<CXXDestructorDecl>(GD.getDecl())) return arrangeCXXStructorDeclaration(GD); - return arrangeFunctionDeclaration(FD); + CanQualType FTy = FD->getType()->getCanonicalTypeUnqualified(); + if (FD->hasAttr<OpenCLKernelAttr>() && + GD.getKernelReferenceKind() == KernelReferenceKind::Stub) { + setOCLKernelStubCallingConvention(FTy, CGM, FD); + } + return arrangeFunctionDeclaration(FD, &FTy); } /// Arrange a thunk that takes 'this' as the first parameter followed by diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 096f4c4f550435..10c189c97e4cb6 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -5607,6 +5607,40 @@ RValue CodeGenFunction::EmitCallExpr(const CallExpr *E, return EmitCXXPseudoDestructorExpr(callee.getPseudoDestructorExpr()); } + // Change calling convention of callee function at callsite. 
+ const Expr *CalleeExpr = E->getCallee()->IgnoreParens(); + while (auto ICE = dyn_cast<ImplicitCastExpr>(CalleeExpr)) { + if (ICE->getCastKind() != CK_FunctionToPointerDecay && + ICE->getCastKind() != CK_BuiltinFnToFnPtr) + break; + CalleeExpr = ICE->getSubExpr()->IgnoreParens(); + } + + if (auto DRE = dyn_cast<DeclRefExpr>(CalleeExpr)) { + if (auto FD = dyn_cast<FunctionDecl>(DRE->getDecl())) { + if (FD->hasAttr<OpenCLKernelAttr>() && !FD->getBuiltinID()) { + const FunctionType *FT = + cast<FunctionType>(cast<PointerType>(getContext().getCanonicalType( + E->getCallee()->getType())) + ->getPointeeType()); + FunctionType *FTNC = const_cast<FunctionType *>(FT); + FTNC->setCC(CC_C); + } + } + } + if (auto ME = dyn_cast<MemberExpr>(CalleeExpr)) { + if (auto FD = dyn_cast<FunctionDecl>(ME->getMemberDecl())) { + if (FD->hasAttr<OpenCLKernelAttr>() && !FD->getBuiltinID()) { + const FunctionType *FT = + cast<FunctionType>(cast<PointerType>(getContext().getCanonicalType( + E->getCallee()->getType())) + ->getPointeeType()); + FunctionType *FTNC = const_cast<FunctionType *>(FT); + FTNC->setCC(CC_C); + } + } + } + return EmitCall(E->getCallee()->getType(), callee, E, ReturnValue, /*Chain=*/nullptr, CallOrInvoke); } @@ -5695,11 +5729,17 @@ CGCallee CodeGenFunction::EmitCallee(const Expr *E) { // Resolve direct calls. 
} else if (auto DRE = dyn_cast<DeclRefExpr>(E)) { if (auto FD = dyn_cast<FunctionDecl>(DRE->getDecl())) { + if (FD->hasAttr<OpenCLKernelAttr>()) + return EmitDirectCallee(*this, + GlobalDecl(FD, KernelReferenceKind::Stub)); return EmitDirectCallee(*this, FD); } } else if (auto ME = dyn_cast<MemberExpr>(E)) { if (auto FD = dyn_cast<FunctionDecl>(ME->getMemberDecl())) { EmitIgnoredExpr(ME->getBase()); + if (FD->hasAttr<OpenCLKernelAttr>()) + return EmitDirectCallee(*this, + GlobalDecl(FD, KernelReferenceKind::Stub)); return EmitDirectCallee(*this, FD); } diff --git a/clang/lib/CodeGen/CGOpenCLRuntime.cpp b/clang/lib/CodeGen/CGOpenCLRuntime.cpp index 115b618056a445..c77de0701e0e6c 100644 --- a/clang/lib/CodeGen/CGOpenCLRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenCLRuntime.cpp @@ -127,7 +127,10 @@ static const BlockExpr *getBlockExpr(const Expr *E) { void CGOpenCLRuntime::recordBlockInfo(const BlockExpr *E, llvm::Function *InvokeF, llvm::Value *Block, llvm::Type *BlockTy) { - assert(!EnqueuedBlockMap.contains(E) && "Block expression emitted twice"); + + // FIXME: Since OpenCL Kernels are emitted twice (kernel version and stub + // version), its constituent BlockExpr will also be emitted twice. 
+ // assert(!EnqueuedBlockMap.contains(E) && "Block expression emitted twice"); assert(isa<llvm::Function>(InvokeF) && "Invalid invoke function"); assert(Block->getType()->isPointerTy() && "Invalid block literal type"); EnqueuedBlockMap[E].InvokeFunc = InvokeF; diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index ba376f9ecfacde..4454bda69165ac 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -1888,6 +1888,9 @@ static std::string getMangledNameImpl(CodeGenModule &CGM, GlobalDecl GD, } else if (FD && FD->hasAttr<CUDAGlobalAttr>() && GD.getKernelReferenceKind() == KernelReferenceKind::Stub) { Out << "__device_stub__" << II->getName(); + } else if (FD && FD->hasAttr<OpenCLKernelAttr>() && + GD.getKernelReferenceKind() == KernelReferenceKind::Stub) { + Out << "__clang_ocl_kern_imp_" << II->getName(); } else { Out << II->getName(); } @@ -3284,7 +3287,13 @@ void CodeGenModule::EmitDeferred() { continue; // Otherwise, emit the definition and move on to the next one. - EmitGlobalDefinition(D, GV); + // Do not emit definition for a device version of OpenCL kernel that does + // not have a body. + if (!(isa<FunctionDecl>(D.getDecl()) && + (cast<FunctionDecl>(D.getDecl()))->hasAttr<OpenCLKernelAttr>() && + D.getKernelReferenceKind() == KernelReferenceKind::Stub && + !((cast<FunctionDecl>(D.getDecl()))->doesThisDeclarationHaveABody()))) + EmitGlobalDefinition(D, GV); // If we found out that we need to emit more decls, do that recursively. // This has the advantage that the decls are emitted in a DFS and related @@ -3842,6 +3851,10 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) { // Ignore declarations, they will be emitted on their first use. 
if (const auto *FD = dyn_cast<FunctionDecl>(Global)) { + + if (FD->hasAttr<OpenCLKernelAttr>()) + addDeferredDeclToEmit(GlobalDecl(FD, KernelReferenceKind::Stub)); + // Update deferred annotations with the latest declaration if the function // function was already used or defined. if (FD->hasAttr<AnnotateAttr>()) { diff --git a/clang/lib/CodeGen/CodeGenTypes.h b/clang/lib/CodeGen/CodeGenTypes.h index 5aebf9a2122372..fd9f37de67f187 100644 --- a/clang/lib/CodeGen/CodeGenTypes.h +++ b/clang/lib/CodeGen/CodeGenTypes.h @@ -207,7 +207,9 @@ class CodeGenTypes { /// Free functions are functions that are compatible with an ordinary /// C function pointer type. - const CGFunctionInfo &arrangeFunctionDeclaration(const FunctionDecl *FD); + const CGFunctionInfo & + arrangeFunctionDeclaration(const FunctionDecl *FD, + CanQualType *FTy_ptr = nullptr); const CGFunctionInfo &arrangeFreeFunctionCall(const CallArgList &Args, const FunctionType *Ty, bool ChainCall); diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h index 373f8b8a80fdb1..2673b1f7e32c28 100644 --- a/clang/lib/CodeGen/TargetInfo.h +++ b/clang/lib/CodeGen/TargetInfo.h @@ -382,6 +382,8 @@ class TargetCodeGenInfo { virtual bool shouldEmitDWARFBitFieldSeparators() const { return false; } virtual void setCUDAKernelCallingConvention(const FunctionType *&FT) const {} + virtual void + setOCLKernelStubCallingConvention(const FunctionType *&FT) const {} /// Return the device-side type for the CUDA device builtin surface type. 
virtual llvm::Type *getCUDADeviceBuiltinSurfaceDeviceType() const { diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index 56ad0503a11ab2..37e07b16193e5f 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -321,6 +321,8 @@ class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo { bool shouldEmitStaticExternCAliases() const override; bool shouldEmitDWARFBitFieldSeparators() const override; void setCUDAKernelCallingConvention(const FunctionType *&FT) const override; + void + setOCLKernelStubCallingConvention(const FunctionType *&FT) const override; }; } @@ -598,6 +600,12 @@ void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention( FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel)); } +void AMDGPUTargetCodeGenInfo::setOCLKernelStubCallingConvention( + const FunctionType *&FT) const { + FT = getABIInfo().getContext().adjustFunctionType( + FT, FT->getExtInfo().withCallingConv(CC_C)); +} + /// Create an OpenCL kernel for an enqueued block. /// /// The type of the first argument (the block literal) is the struct type _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits