Author: Arthur Eubanks
Date: 2020-12-28T17:52:31-08:00
New Revision: 0e9abcfc1920f25a959eaa08116427b795e10dd8
URL: https://github.com/llvm/llvm-project/commit/0e9abcfc1920f25a959eaa08116427b795e10dd8
DIFF: https://github.com/llvm/llvm-project/commit/0e9abcfc1920f25a959eaa08116427b795e10dd8.diff

LOG: [AMDGPU][NewPM] Port amdgpu-promote-alloca(-to-vector)

And add to AMDGPU opt pipeline.

Don't pin an opt run to the legacy PM when -enable-new-pm=1 if these
passes (or passes introduced in https://reviews.llvm.org/D93863) are in
the list of passes.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D93875

Added:

Modified:
    llvm/lib/Target/AMDGPU/AMDGPU.h
    llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll
    llvm/test/CodeGen/AMDGPU/vector-alloca.ll
    llvm/tools/opt/opt.cpp

Removed:

################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 22d264e2880b..6a0ba20e8026 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -208,6 +208,23 @@ FunctionPass *createAMDGPUPromoteAllocaToVector();
 void initializeAMDGPUPromoteAllocaToVectorPass(PassRegistry&);
 extern char &AMDGPUPromoteAllocaToVectorID;
 
+struct AMDGPUPromoteAllocaPass : PassInfoMixin<AMDGPUPromoteAllocaPass> {
+  AMDGPUPromoteAllocaPass(TargetMachine &TM) : TM(TM) {}
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+  TargetMachine &TM;
+};
+
+struct AMDGPUPromoteAllocaToVectorPass
+    : PassInfoMixin<AMDGPUPromoteAllocaToVectorPass> {
+  AMDGPUPromoteAllocaToVectorPass(TargetMachine &TM) : TM(TM) {}
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+  TargetMachine &TM;
+};
+
 Pass *createAMDGPUStructurizeCFGPass();
 FunctionPass *createAMDGPUISelDag(
     TargetMachine *TM = nullptr,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 4cfe0edfc533..3dc7b1643081 100644
---
a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -42,6 +42,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" #include "llvm/IR/Value.h" @@ -83,8 +84,26 @@ static cl::opt<unsigned> PromoteAllocaToVectorLimit( // FIXME: This can create globals so should be a module pass. class AMDGPUPromoteAlloca : public FunctionPass { +public: + static char ID; + + AMDGPUPromoteAlloca() : FunctionPass(ID) {} + + bool runOnFunction(Function &F) override; + + StringRef getPassName() const override { return "AMDGPU Promote Alloca"; } + + bool handleAlloca(AllocaInst &I, bool SufficientLDS); + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + FunctionPass::getAnalysisUsage(AU); + } +}; + +class AMDGPUPromoteAllocaImpl { private: - const TargetMachine *TM; + const TargetMachine &TM; Module *Mod = nullptr; const DataLayout *DL = nullptr; @@ -116,28 +135,14 @@ class AMDGPUPromoteAlloca : public FunctionPass { /// Check whether we have enough local memory for promotion. 
bool hasSufficientLocalMem(const Function &F); -public: - static char ID; - - AMDGPUPromoteAlloca() : FunctionPass(ID) {} - - bool doInitialization(Module &M) override; - bool runOnFunction(Function &F) override; - - StringRef getPassName() const override { return "AMDGPU Promote Alloca"; } - bool handleAlloca(AllocaInst &I, bool SufficientLDS); - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - FunctionPass::getAnalysisUsage(AU); - } +public: + AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {} + bool run(Function &F); }; class AMDGPUPromoteAllocaToVector : public FunctionPass { -private: - unsigned MaxVGPRs; - public: static char ID; @@ -149,8 +154,6 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass { return "AMDGPU Promote Alloca to vector"; } - bool handleAlloca(AllocaInst &I); - void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); FunctionPass::getAnalysisUsage(AU); @@ -171,32 +174,41 @@ INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector", char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID; char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID; -bool AMDGPUPromoteAlloca::doInitialization(Module &M) { - Mod = &M; - DL = &Mod->getDataLayout(); +bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) { + return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>()).run(F); + } return false; } -bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; +PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F, + FunctionAnalysisManager &AM) { + bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F); + if (Changed) { + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + return PA; + } + return PreservedAnalyses::all(); +} - if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) - TM = 
&TPC->getTM<TargetMachine>(); - else - return false; +bool AMDGPUPromoteAllocaImpl::run(Function &F) { + Mod = F.getParent(); + DL = &Mod->getDataLayout(); - const Triple &TT = TM->getTargetTriple(); + const Triple &TT = TM.getTargetTriple(); IsAMDGCN = TT.getArch() == Triple::amdgcn; IsAMDHSA = TT.getOS() == Triple::AMDHSA; - const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); if (!ST.isPromoteAllocaEnabled()) return false; if (IsAMDGCN) { - const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F); + const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first); } else { MaxVGPRs = 128; @@ -221,9 +233,9 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { } std::pair<Value *, Value *> -AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { +AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) { const Function &F = *Builder.GetInsertBlock()->getParent(); - const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); if (!IsAMDHSA) { Function *LocalSizeYFn @@ -308,9 +320,10 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) { return std::make_pair(Y, LoadZU); } -Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) { +Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder, + unsigned N) { const AMDGPUSubtarget &ST = - AMDGPUSubtarget::get(*TM, *Builder.GetInsertBlock()->getParent()); + AMDGPUSubtarget::get(TM, *Builder.GetInsertBlock()->getParent()); Intrinsic::ID IntrID = Intrinsic::not_intrinsic; switch (N) { @@ -592,11 +605,9 @@ static bool isCallPromotable(CallInst *CI) { } } -bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca, - Value *Val, - Instruction *Inst, - int OpIdx0, - int OpIdx1) const { +bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca( + Value *BaseAlloca, Value *Val, 
Instruction *Inst, int OpIdx0, + int OpIdx1) const { // Figure out which operand is the one we might not be promoting. Value *OtherOp = Inst->getOperand(OpIdx0); if (Val == OtherOp) @@ -624,10 +635,8 @@ bool AMDGPUPromoteAlloca::binaryOpIsDerivedFromSameAlloca(Value *BaseAlloca, return true; } -bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes( - Value *BaseAlloca, - Value *Val, - std::vector<Value*> &WorkList) const { +bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes( + Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const { for (User *User : Val->users()) { if (is_contained(WorkList, User)) @@ -727,10 +736,10 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes( return true; } -bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) { +bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) { FunctionType *FTy = F.getFunctionType(); - const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); // If the function has any arguments in the local address space, then it's // possible these arguments require the entire local memory space, so @@ -863,7 +872,7 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) { } // FIXME: Should try to pick the most likely to be profitable allocas first. -bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { +bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) { // Array allocations are probably not worth handling, since an allocation of // the array type is the canonical form. 
if (!I.isStaticAlloca() || I.isArrayAllocation()) @@ -904,7 +913,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { if (!SufficientLDS) return false; - const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction); unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; Align Alignment = @@ -1083,22 +1092,29 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { return true; } -bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) { - if (skipFunction(F) || DisablePromoteAllocaToVector) +bool handlePromoteAllocaToVector(AllocaInst &I, unsigned MaxVGPRs) { + // Array allocations are probably not worth handling, since an allocation of + // the array type is the canonical form. + if (!I.isStaticAlloca() || I.isArrayAllocation()) return false; - const TargetMachine *TM; - if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) - TM = &TPC->getTM<TargetMachine>(); - else + LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n'); + + Module *Mod = I.getParent()->getParent()->getParent(); + return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs); +} + +bool promoteAllocasToVector(Function &F, TargetMachine &TM) { + if (DisablePromoteAllocaToVector) return false; - const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F); + const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); if (!ST.isPromoteAllocaEnabled()) return false; - if (TM->getTargetTriple().getArch() == Triple::amdgcn) { - const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F); + unsigned MaxVGPRs; + if (TM.getTargetTriple().getArch() == Triple::amdgcn) { + const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F); MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first); } else { MaxVGPRs = 128; @@ -1114,23 +1130,31 @@ bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) { } for (AllocaInst *AI : Allocas) { - if 
(handleAlloca(*AI)) + if (handlePromoteAllocaToVector(*AI, MaxVGPRs)) Changed = true; } return Changed; } -bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) { - // Array allocations are probably not worth handling, since an allocation of - // the array type is the canonical form. - if (!I.isStaticAlloca() || I.isArrayAllocation()) +bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) { + if (skipFunction(F)) return false; + if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) { + return promoteAllocasToVector(F, TPC->getTM<TargetMachine>()); + } + return false; +} - LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n'); - - Module *Mod = I.getParent()->getParent()->getParent(); - return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs); +PreservedAnalyses +AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) { + bool Changed = promoteAllocasToVector(F, TM); + if (Changed) { + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + return PA; + } + return PreservedAnalyses::all(); } FunctionPass *llvm::createAMDGPUPromoteAlloca() { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 6ea99bdf9206..89ae9d8029e0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -29,6 +29,7 @@ #include "SIMachineFunctionInfo.h" #include "SIMachineScheduler.h" #include "TargetInfo/AMDGPUTargetInfo.h" +#include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" @@ -488,8 +489,8 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB, bool DebugPassManager) { PB.registerPipelineParsingCallback( - [](StringRef PassName, FunctionPassManager &PM, - 
ArrayRef<PassBuilder::PipelineElement>) { + [this](StringRef PassName, FunctionPassManager &PM, + ArrayRef<PassBuilder::PipelineElement>) { if (PassName == "amdgpu-simplifylib") { PM.addPass(AMDGPUSimplifyLibCallsPass()); return true; @@ -498,6 +499,14 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB, PM.addPass(AMDGPUUseNativeCallsPass()); return true; } + if (PassName == "amdgpu-promote-alloca") { + PM.addPass(AMDGPUPromoteAllocaPass(*this)); + return true; + } + if (PassName == "amdgpu-promote-alloca-to-vector") { + PM.addPass(AMDGPUPromoteAllocaToVectorPass(*this)); + return true; + } return false; }); @@ -510,6 +519,18 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB, FPM.addPass(AMDGPUSimplifyLibCallsPass()); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); }); + + PB.registerCGSCCOptimizerLateEPCallback( + [this, DebugPassManager](CGSCCPassManager &PM, + PassBuilder::OptimizationLevel Level) { + if (Level != PassBuilder::OptimizationLevel::O0) { + FunctionPassManager FPM(DebugPassManager); + // Promote alloca to vector before SROA and loop unroll. If we manage + // to eliminate allocas before unroll we may choose to unroll less. 
+ FPM.addPass(AMDGPUPromoteAllocaToVectorPass(*this)); + PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM))); + } + }); } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll index b71454c720b1..47839dcb9344 100644 --- a/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/sroa-before-unroll.ll @@ -1,5 +1,7 @@ ; RUN: opt -mtriple=amdgcn-- -O1 -S < %s | FileCheck %s --check-prefixes=FUNC,LOOP +; RUN: opt -mtriple=amdgcn-- -passes='default<O1>' -S < %s | FileCheck %s --check-prefixes=FUNC,LOOP ; RUN: opt -mtriple=amdgcn-- -O1 -S -disable-promote-alloca-to-vector < %s | FileCheck %s --check-prefixes=FUNC,FULL-UNROLL +; RUN: opt -mtriple=amdgcn-- -passes='default<O1>' -S -disable-promote-alloca-to-vector < %s | FileCheck %s --check-prefixes=FUNC,FULL-UNROLL target datalayout = "A5" diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll index 3b6df750ff07..0d37ed60c83b 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll @@ -4,6 +4,7 @@ ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mtriple=r600-- -mcpu=redwood < %s | FileCheck --check-prefix=EG -check-prefix=FUNC %s ; RUN: opt -S -mtriple=amdgcn-- -amdgpu-promote-alloca -sroa -instcombine < %s | FileCheck -check-prefix=OPT %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-promote-alloca,sroa,instcombine < %s | FileCheck -check-prefix=OPT %s target datalayout = "A5" ; OPT-LABEL: @vector_read( diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp index d7a39c911811..6a2c21e80034 100644 --- a/llvm/tools/opt/opt.cpp +++ b/llvm/tools/opt/opt.cpp @@ -462,6 +462,13 @@ struct TimeTracerRAII { // TODO: 
use a codegen version of PassRegistry.def/PassBuilder::is*Pass() once // it exists. static bool shouldPinPassToLegacyPM(StringRef Pass) { + std::vector<StringRef> PassNameExactToIgnore = { + "amdgpu-simplifylib", "amdgpu-usenative", "amdgpu-promote-alloca", + "amdgpu-promote-alloca-to-vector"}; + for (const auto &P : PassNameExactToIgnore) + if (Pass == P) + return false; + std::vector<StringRef> PassNamePrefix = { "x86-", "xcore-", "wasm-", "systemz-", "ppc-", "nvvm-", "nvptx-", "mips-", "lanai-", "hexagon-", "bpf-", "avr-", "thumb2-", "arm-", _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits