https://github.com/optimisan updated https://github.com/llvm/llvm-project/pull/130065
>From 78bcc3a3576cc1f0dba5c9feb5ed781a62877ffe Mon Sep 17 00:00:00 2001 From: Akshat Oke <akshat....@amd.com> Date: Mon, 10 Mar 2025 04:31:20 +0000 Subject: [PATCH 1/9] [AMDGPU][NFC] Format GCNCreateVOPD.cpp --- llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp index d40a1a2a10d9b..798279b279da3 100644 --- a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp +++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp @@ -38,15 +38,15 @@ namespace { class GCNCreateVOPD : public MachineFunctionPass { private: - class VOPDCombineInfo { - public: - VOPDCombineInfo() = default; - VOPDCombineInfo(MachineInstr *First, MachineInstr *Second) - : FirstMI(First), SecondMI(Second) {} - - MachineInstr *FirstMI; - MachineInstr *SecondMI; - }; + class VOPDCombineInfo { + public: + VOPDCombineInfo() = default; + VOPDCombineInfo(MachineInstr *First, MachineInstr *Second) + : FirstMI(First), SecondMI(Second) {} + + MachineInstr *FirstMI; + MachineInstr *SecondMI; + }; public: static char ID; >From ab31097bd24434b6dca9eedae15acda3a50d5fbb Mon Sep 17 00:00:00 2001 From: Akshat Oke <akshat....@amd.com> Date: Wed, 5 Mar 2025 10:52:00 +0000 Subject: [PATCH 2/9] [AMDGPU][NPM] Port GCNCreateVOPD to NPM --- llvm/lib/Target/AMDGPU/AMDGPU.h | 7 ++- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 1 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 +- llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp | 53 ++++++++++++------- 4 files changed, 43 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 57297288eecb4..f208a8bb9964b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -358,6 +358,11 @@ class SIModeRegisterPass : public PassInfoMixin<SIModeRegisterPass> { PreservedAnalyses run(MachineFunction &F, MachineFunctionAnalysisManager &AM); }; +class GCNCreateVOPDPass : public PassInfoMixin<GCNCreateVOPDPass> { +public: + PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &AM); +}; + FunctionPass *createAMDGPUAnnotateUniformValuesLegacy(); ModulePass *createAMDGPUPrintfRuntimeBinding(); @@ -443,7 +448,7 @@ extern char &SIFormMemoryClausesID; void initializeSIPostRABundlerLegacyPass(PassRegistry &); extern char &SIPostRABundlerLegacyID; -void initializeGCNCreateVOPDPass(PassRegistry &); +void initializeGCNCreateVOPDLegacyPass(PassRegistry &); extern char &GCNCreateVOPDID; void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 1050855176c04..0e3dcb4267ede 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -103,6 +103,7 @@ MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUse MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass()) MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass()) MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass()) +MACHINE_FUNCTION_PASS("gcn-create-vopd", GCNCreateVOPDPass()) MACHINE_FUNCTION_PASS("si-fix-sgpr-copies", SIFixSGPRCopiesPass()) MACHINE_FUNCTION_PASS("si-fix-vgpr-copies", SIFixVGPRCopiesPass()) MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index ce3dcd920bce3..73ae9135eb319 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -546,7 +546,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIPreAllocateWWMRegsLegacyPass(*PR); initializeSIFormMemoryClausesLegacyPass(*PR); initializeSIPostRABundlerLegacyPass(*PR); - initializeGCNCreateVOPDPass(*PR); + initializeGCNCreateVOPDLegacyPass(*PR); initializeAMDGPUUnifyDivergentExitNodesPass(*PR); initializeAMDGPUAAWrapperPassPass(*PR); initializeAMDGPUExternalAAWrapperPass(*PR); @@ -2149,7 +2149,7 @@ void AMDGPUCodeGenPassBuilder::addPostRegAlloc(AddMachinePass &addPass) const { void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) { - // TODO: addPass(GCNCreateVOPDPass()); + addPass(GCNCreateVOPDPass()); } // TODO: addPass(SIMemoryLegalizerPass()); // TODO: addPass(SIInsertWaitcntsPass()); diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp index 798279b279da3..32a26469d616b 100644 --- a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp +++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp @@ -27,6 +27,7 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachinePassManager.h" #include "llvm/Support/Debug.h" #define DEBUG_TYPE "gcn-create-vopd" @@ -36,7 +37,7 @@ using namespace llvm; namespace { -class GCNCreateVOPD : public MachineFunctionPass { +class GCNCreateVOPD { private: class VOPDCombineInfo { public: @@ -49,20 +50,8 @@ class GCNCreateVOPD : public MachineFunctionPass { }; public: - static char ID; const GCNSubtarget *ST = nullptr; - GCNCreateVOPD() : MachineFunctionPass(ID) {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } - - StringRef getPassName() const override { - return "GCN Create VOPD Instructions"; - } - bool doReplace(const SIInstrInfo *SII, VOPDCombineInfo &CI) { auto *FirstMI = CI.FirstMI; auto *SecondMI = CI.SecondMI; @@ -112,9 +101,7 @@ class GCNCreateVOPD : public MachineFunctionPass { return true; } - bool runOnMachineFunction(MachineFunction &MF) override { - if (skipFunction(MF.getFunction())) - return false; + bool run(MachineFunction &MF) { ST = &MF.getSubtarget<GCNSubtarget>(); if (!AMDGPU::hasVOPD(*ST) || !ST->isWave32()) return false; @@ -163,11 +150,39 @@ class GCNCreateVOPD : public MachineFunctionPass { } }; +class GCNCreateVOPDLegacy : public MachineFunctionPass { +public: + static char ID; + GCNCreateVOPDLegacy() : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { + return "GCN Create VOPD Instructions"; + } + bool runOnMachineFunction(MachineFunction &MF) override { + if (skipFunction(MF.getFunction())) + return false; + + return GCNCreateVOPD().run(MF); + } +}; + } // namespace -char GCNCreateVOPD::ID = 0; +PreservedAnalyses llvm::GCNCreateVOPDPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &AM) { + if (!GCNCreateVOPD().run(MF)) + return PreservedAnalyses::all(); + return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>(); +} + +char GCNCreateVOPDLegacy::ID = 0; -char &llvm::GCNCreateVOPDID = GCNCreateVOPD::ID; +char &llvm::GCNCreateVOPDID = GCNCreateVOPDLegacy::ID; -INITIALIZE_PASS(GCNCreateVOPD, DEBUG_TYPE, "GCN Create VOPD Instructions", +INITIALIZE_PASS(GCNCreateVOPDLegacy, DEBUG_TYPE, "GCN Create VOPD Instructions", false, false) >From 9d01cd56b51b13083c61b92cf10a97cfa4ac077b Mon Sep 17 00:00:00 2001 From: Akshat Oke <akshat....@amd.com> Date: Mon, 10 Mar 2025 04:27:24 +0000 Subject: [PATCH 3/9] clang format --- llvm/lib/Target/AMDGPU/AMDGPU.h | 3 +- llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp | 186 ++++++++++++----------- 2 files changed, 97 insertions(+), 92 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index f208a8bb9964b..f331f741e3993 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -360,7 +360,8 @@ class SIModeRegisterPass : public PassInfoMixin<SIModeRegisterPass> { class GCNCreateVOPDPass : public PassInfoMixin<GCNCreateVOPDPass> { public: - PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &AM); + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &AM); }; FunctionPass *createAMDGPUAnnotateUniformValuesLegacy(); diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp index 32a26469d616b..22123f738c948 100644 --- a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp +++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp @@ -49,105 +49,108 @@ class GCNCreateVOPD { MachineInstr *SecondMI; }; -public: - const GCNSubtarget *ST = nullptr; - - bool doReplace(const SIInstrInfo *SII, VOPDCombineInfo &CI) { - auto *FirstMI = CI.FirstMI; - auto *SecondMI = CI.SecondMI; - unsigned Opc1 = FirstMI->getOpcode(); - unsigned Opc2 = SecondMI->getOpcode(); - unsigned EncodingFamily = - AMDGPU::getVOPDEncodingFamily(SII->getSubtarget()); - int NewOpcode = - AMDGPU::getVOPDFull(AMDGPU::getVOPDOpcode(Opc1), - AMDGPU::getVOPDOpcode(Opc2), EncodingFamily); - assert(NewOpcode != -1 && - "Should have previously determined this as a possible VOPD\n"); - - auto VOPDInst = BuildMI(*FirstMI->getParent(), FirstMI, - FirstMI->getDebugLoc(), SII->get(NewOpcode)) - .setMIFlags(FirstMI->getFlags() | SecondMI->getFlags()); - - namespace VOPD = AMDGPU::VOPD; - MachineInstr *MI[] = {FirstMI, SecondMI}; - auto InstInfo = - AMDGPU::getVOPDInstInfo(FirstMI->getDesc(), SecondMI->getDesc()); - - for (auto CompIdx : VOPD::COMPONENTS) { - auto MCOprIdx = InstInfo[CompIdx].getIndexOfDstInMCOperands(); - VOPDInst.add(MI[CompIdx]->getOperand(MCOprIdx)); - } - - for (auto CompIdx : VOPD::COMPONENTS) { - auto CompSrcOprNum = InstInfo[CompIdx].getCompSrcOperandsNum(); - for (unsigned CompSrcIdx = 0; CompSrcIdx < CompSrcOprNum; ++CompSrcIdx) { - auto MCOprIdx = InstInfo[CompIdx].getIndexOfSrcInMCOperands(CompSrcIdx); + public: + const GCNSubtarget *ST = nullptr; + + bool doReplace(const SIInstrInfo *SII, VOPDCombineInfo &CI) { + auto *FirstMI = CI.FirstMI; + auto *SecondMI = CI.SecondMI; + unsigned Opc1 = FirstMI->getOpcode(); + unsigned Opc2 = SecondMI->getOpcode(); + unsigned EncodingFamily = + AMDGPU::getVOPDEncodingFamily(SII->getSubtarget()); + int NewOpcode = + AMDGPU::getVOPDFull(AMDGPU::getVOPDOpcode(Opc1), + AMDGPU::getVOPDOpcode(Opc2), EncodingFamily); + assert(NewOpcode != -1 && + "Should have previously determined this as a possible VOPD\n"); + + auto VOPDInst = + BuildMI(*FirstMI->getParent(), FirstMI, FirstMI->getDebugLoc(), + SII->get(NewOpcode)) + .setMIFlags(FirstMI->getFlags() | SecondMI->getFlags()); + + namespace VOPD = AMDGPU::VOPD; + MachineInstr *MI[] = {FirstMI, SecondMI}; + auto InstInfo = + AMDGPU::getVOPDInstInfo(FirstMI->getDesc(), SecondMI->getDesc()); + + for (auto CompIdx : VOPD::COMPONENTS) { + auto MCOprIdx = InstInfo[CompIdx].getIndexOfDstInMCOperands(); VOPDInst.add(MI[CompIdx]->getOperand(MCOprIdx)); } - } - SII->fixImplicitOperands(*VOPDInst); - for (auto CompIdx : VOPD::COMPONENTS) - VOPDInst.copyImplicitOps(*MI[CompIdx]); + for (auto CompIdx : VOPD::COMPONENTS) { + auto CompSrcOprNum = InstInfo[CompIdx].getCompSrcOperandsNum(); + for (unsigned CompSrcIdx = 0; CompSrcIdx < CompSrcOprNum; + ++CompSrcIdx) { + auto MCOprIdx = + InstInfo[CompIdx].getIndexOfSrcInMCOperands(CompSrcIdx); + VOPDInst.add(MI[CompIdx]->getOperand(MCOprIdx)); + } + } - LLVM_DEBUG(dbgs() << "VOPD Fused: " << *VOPDInst << " from\tX: " - << *CI.FirstMI << "\tY: " << *CI.SecondMI << "\n"); + SII->fixImplicitOperands(*VOPDInst); + for (auto CompIdx : VOPD::COMPONENTS) + VOPDInst.copyImplicitOps(*MI[CompIdx]); - for (auto CompIdx : VOPD::COMPONENTS) - MI[CompIdx]->eraseFromParent(); + LLVM_DEBUG(dbgs() << "VOPD Fused: " << *VOPDInst << " from\tX: " + << *CI.FirstMI << "\tY: " << *CI.SecondMI << "\n"); - ++NumVOPDCreated; - return true; - } + for (auto CompIdx : VOPD::COMPONENTS) + MI[CompIdx]->eraseFromParent(); - bool run(MachineFunction &MF) { - ST = &MF.getSubtarget<GCNSubtarget>(); - if (!AMDGPU::hasVOPD(*ST) || !ST->isWave32()) - return false; - LLVM_DEBUG(dbgs() << "CreateVOPD Pass:\n"); - - const SIInstrInfo *SII = ST->getInstrInfo(); - bool Changed = false; - - SmallVector<VOPDCombineInfo> ReplaceCandidates; - - for (auto &MBB : MF) { - auto MII = MBB.begin(), E = MBB.end(); - while (MII != E) { - auto *FirstMI = &*MII; - MII = next_nodbg(MII, MBB.end()); - if (MII == MBB.end()) - break; - if (FirstMI->isDebugInstr()) - continue; - auto *SecondMI = &*MII; - unsigned Opc = FirstMI->getOpcode(); - unsigned Opc2 = SecondMI->getOpcode(); - llvm::AMDGPU::CanBeVOPD FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc); - llvm::AMDGPU::CanBeVOPD SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2); - VOPDCombineInfo CI; - - if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y) - CI = VOPDCombineInfo(FirstMI, SecondMI); - else if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X) - CI = VOPDCombineInfo(SecondMI, FirstMI); - else - continue; - // checkVOPDRegConstraints cares about program order, but doReplace - // cares about X-Y order in the constituted VOPD - if (llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI)) { - ReplaceCandidates.push_back(CI); - ++MII; + ++NumVOPDCreated; + return true; + } + + bool run(MachineFunction &MF) { + ST = &MF.getSubtarget<GCNSubtarget>(); + if (!AMDGPU::hasVOPD(*ST) || !ST->isWave32()) + return false; + LLVM_DEBUG(dbgs() << "CreateVOPD Pass:\n"); + + const SIInstrInfo *SII = ST->getInstrInfo(); + bool Changed = false; + + SmallVector<VOPDCombineInfo> ReplaceCandidates; + + for (auto &MBB : MF) { + auto MII = MBB.begin(), E = MBB.end(); + while (MII != E) { + auto *FirstMI = &*MII; + MII = next_nodbg(MII, MBB.end()); + if (MII == MBB.end()) + break; + if (FirstMI->isDebugInstr()) + continue; + auto *SecondMI = &*MII; + unsigned Opc = FirstMI->getOpcode(); + unsigned Opc2 = SecondMI->getOpcode(); + llvm::AMDGPU::CanBeVOPD FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc); + llvm::AMDGPU::CanBeVOPD SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2); + VOPDCombineInfo CI; + + if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y) + CI = VOPDCombineInfo(FirstMI, SecondMI); + else if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X) + CI = VOPDCombineInfo(SecondMI, FirstMI); + else + continue; + // checkVOPDRegConstraints cares about program order, but doReplace + // cares about X-Y order in the constituted VOPD + if (llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI)) { + ReplaceCandidates.push_back(CI); + ++MII; + } } } - } - for (auto &CI : ReplaceCandidates) { - Changed |= doReplace(SII, CI); - } + for (auto &CI : ReplaceCandidates) { + Changed |= doReplace(SII, CI); + } - return Changed; - } + return Changed; + } }; class GCNCreateVOPDLegacy : public MachineFunctionPass { @@ -173,8 +176,9 @@ class GCNCreateVOPDLegacy : public MachineFunctionPass { } // namespace -PreservedAnalyses llvm::GCNCreateVOPDPass::run(MachineFunction &MF, - MachineFunctionAnalysisManager &AM) { +PreservedAnalyses +llvm::GCNCreateVOPDPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &AM) { if (!GCNCreateVOPD().run(MF)) return PreservedAnalyses::all(); return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>(); >From b1402edb380ddf044af4810a9b7a88c4f874c0ed Mon Sep 17 00:00:00 2001 From: Akshat Oke <akshat....@amd.com> Date: Wed, 5 Mar 2025 11:06:40 +0000 Subject: [PATCH 4/9] [AMDGPU][NPM] Port SIMemoryLegalizer to NPM --- llvm/lib/Target/AMDGPU/AMDGPU.h | 9 +++- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 ++- llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 43 ++++++++++++++----- 4 files changed, 45 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index f331f741e3993..4197a60e77014 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -364,6 +364,13 @@ class GCNCreateVOPDPass : public PassInfoMixin<GCNCreateVOPDPass> { MachineFunctionAnalysisManager &AM); }; +class SIMemoryLegalizerPass : public PassInfoMixin<SIMemoryLegalizerPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); + static bool isRequired() { return true; } +}; + FunctionPass *createAMDGPUAnnotateUniformValuesLegacy(); ModulePass *createAMDGPUPrintfRuntimeBinding(); @@ -428,7 +435,7 @@ class SIAnnotateControlFlowPass void initializeSIAnnotateControlFlowLegacyPass(PassRegistry &); extern char &SIAnnotateControlFlowLegacyPassID; -void initializeSIMemoryLegalizerPass(PassRegistry&); +void initializeSIMemoryLegalizerLegacyPass(PassRegistry &); extern char &SIMemoryLegalizerID; void initializeSIModeRegisterLegacyPass(PassRegistry &); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 0e3dcb4267ede..de959f8a2aa62 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -113,6 +113,7 @@ MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass()) MACHINE_FUNCTION_PASS("si-lower-control-flow", SILowerControlFlowPass()) MACHINE_FUNCTION_PASS("si-lower-sgpr-spills", SILowerSGPRSpillsPass()) MACHINE_FUNCTION_PASS("si-lower-wwm-copies", SILowerWWMCopiesPass()) +MACHINE_FUNCTION_PASS("si-memory-legalizer", SIMemoryLegalizerPass()) MACHINE_FUNCTION_PASS("si-mode-register", SIModeRegisterPass()) MACHINE_FUNCTION_PASS("si-opt-vgpr-liverange", SIOptimizeVGPRLiveRangePass()) MACHINE_FUNCTION_PASS("si-optimize-exec-masking", SIOptimizeExecMaskingPass()) @@ -132,7 +133,6 @@ DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPas DUMMY_MACHINE_FUNCTION_PASS("si-insert-hard-clauses", SIInsertHardClausesPass()) DUMMY_MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass()) DUMMY_MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass()) -DUMMY_MACHINE_FUNCTION_PASS("si-memory-legalizer", SIMemoryLegalizerPass()) DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass()) // TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it // already exists. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 73ae9135eb319..dbe212ad0a216 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -541,7 +541,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSILowerControlFlowLegacyPass(*PR); initializeSIPreEmitPeepholePass(*PR); initializeSILateBranchLoweringPass(*PR); - initializeSIMemoryLegalizerPass(*PR); + initializeSIMemoryLegalizerLegacyPass(*PR); initializeSIOptimizeExecMaskingLegacyPass(*PR); initializeSIPreAllocateWWMRegsLegacyPass(*PR); initializeSIFormMemoryClausesLegacyPass(*PR); @@ -2151,7 +2151,8 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less)) { addPass(GCNCreateVOPDPass()); } - // TODO: addPass(SIMemoryLegalizerPass()); + + addPass(SIMemoryLegalizerPass()); // TODO: addPass(SIInsertWaitcntsPass()); // TODO: addPass(SIModeRegisterPass()); diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 34953f9c08db7..1375ba201ec58 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -21,8 +21,10 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachinePassManager.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/MemoryModelRelaxationAnnotations.h" +#include "llvm/IR/PassManager.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/TargetParser/TargetParser.h" @@ -625,9 +627,9 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { } }; -class SIMemoryLegalizer final : public MachineFunctionPass { +class SIMemoryLegalizer final { private: - + const MachineModuleInfo &MMI; /// Cache Control. std::unique_ptr<SICacheControl> CC = nullptr; @@ -661,10 +663,16 @@ class SIMemoryLegalizer final : public MachineFunctionPass { bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI); +public: + SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {}; + bool run(MachineFunction &MF); +}; + +class SIMemoryLegalizerLegacy final : public MachineFunctionPass { public: static char ID; - SIMemoryLegalizer() : MachineFunctionPass(ID) {} + SIMemoryLegalizerLegacy() : MachineFunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -2767,11 +2775,26 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, return Changed; } -bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { - bool Changed = false; - +bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) { const MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI(); + return SIMemoryLegalizer(MMI).run(MF); +} + +PreservedAnalyses +SIMemoryLegalizerPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + auto *MMI = MFAM.getResult<ModuleAnalysisManagerFunctionProxy>(MF) + .getCachedResult<MachineModuleAnalysis>( + *MF.getFunction().getParent()); + assert(MMI && "MachineModuleAnalysis must be available"); + if (!SIMemoryLegalizer(MMI->getMMI()).run(MF)) + return PreservedAnalyses::all(); + return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>(); +} + +bool SIMemoryLegalizer::run(MachineFunction &MF) { + bool Changed = false; SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>()); CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>()); @@ -2812,11 +2835,11 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { return Changed; } -INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false) +INITIALIZE_PASS(SIMemoryLegalizerLegacy, DEBUG_TYPE, PASS_NAME, false, false) -char SIMemoryLegalizer::ID = 0; -char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID; +char SIMemoryLegalizerLegacy::ID = 0; +char &llvm::SIMemoryLegalizerID = SIMemoryLegalizerLegacy::ID; FunctionPass *llvm::createSIMemoryLegalizerPass() { - return new SIMemoryLegalizer(); + return new SIMemoryLegalizerLegacy(); } >From 11b7833df74f3d2dd933a28b69a5dcf86c041b21 Mon Sep 17 00:00:00 2001 From: Akshat Oke <akshat....@amd.com> Date: Thu, 6 Mar 2025 04:41:08 +0000 Subject: [PATCH 5/9] [AMDGPU][NPM] Port SIInsertWaitcnts to NPM --- llvm/lib/Target/AMDGPU/AMDGPU.h | 9 +- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 4 +- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 91 +++++++++++++------ llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir | 1 + .../CodeGen/AMDGPU/insert-waitcnts-hang.mir | 1 + .../AMDGPU/vccz-corrupt-bug-workaround.mir | 2 + 7 files changed, 76 insertions(+), 34 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 4197a60e77014..4dcfaf9b12b5e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -371,6 +371,13 @@ class SIMemoryLegalizerPass : public PassInfoMixin<SIMemoryLegalizerPass> { static bool isRequired() { return true; } }; +class SIInsertWaitcntsPass : public PassInfoMixin<SIInsertWaitcntsPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); + static bool isRequired() { return true; } +}; + FunctionPass *createAMDGPUAnnotateUniformValuesLegacy(); ModulePass *createAMDGPUPrintfRuntimeBinding(); @@ -447,7 +454,7 @@ extern char &AMDGPUInsertDelayAluID; void initializeSIInsertHardClausesPass(PassRegistry &); extern char &SIInsertHardClausesID; -void initializeSIInsertWaitcntsPass(PassRegistry&); +void initializeSIInsertWaitcntsLegacyPass(PassRegistry &); extern char &SIInsertWaitcntsID; void initializeSIFormMemoryClausesLegacyPass(PassRegistry &); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index de959f8a2aa62..c4641cba60e53 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -109,6 +109,7 @@ MACHINE_FUNCTION_PASS("si-fix-vgpr-copies", SIFixVGPRCopiesPass()) MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass()); MACHINE_FUNCTION_PASS("si-form-memory-clauses", SIFormMemoryClausesPass()) MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass()) +MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass()) MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass()) MACHINE_FUNCTION_PASS("si-lower-control-flow", SILowerControlFlowPass()) MACHINE_FUNCTION_PASS("si-lower-sgpr-spills", SILowerSGPRSpillsPass()) @@ -131,7 +132,6 @@ DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartial DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass()) DUMMY_MACHINE_FUNCTION_PASS("si-insert-hard-clauses", SIInsertHardClausesPass()) -DUMMY_MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass()) DUMMY_MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass()) DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass()) // TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index dbe212ad0a216..c3cc1dc6e495b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -535,7 +535,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIAnnotateControlFlowLegacyPass(*PR); initializeAMDGPUInsertDelayAluLegacyPass(*PR); initializeSIInsertHardClausesPass(*PR); - initializeSIInsertWaitcntsPass(*PR); + initializeSIInsertWaitcntsLegacyPass(*PR); initializeSIModeRegisterLegacyPass(*PR); initializeSIWholeQuadModeLegacyPass(*PR); initializeSILowerControlFlowLegacyPass(*PR); @@ -2153,7 +2153,7 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { } addPass(SIMemoryLegalizerPass()); - // TODO: addPass(SIInsertWaitcntsPass()); + addPass(SIInsertWaitcntsPass()); // TODO: addPass(SIModeRegisterPass()); diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ee263f58bcaf2..8951a4144bd68 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -33,6 +33,7 @@ #include "llvm/ADT/Sequence.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/Support/DebugCounter.h" #include "llvm/TargetParser/TargetParser.h" @@ -594,7 +595,7 @@ class WaitcntGeneratorGFX12Plus : public WaitcntGenerator { AMDGPU::Waitcnt getAllZeroWaitcnt(bool IncludeVSCnt) const override; }; -class SIInsertWaitcnts : public MachineFunctionPass { +class SIInsertWaitcnts { private: const GCNSubtarget *ST = nullptr; const SIInstrInfo *TII = nullptr; @@ -633,9 +634,9 @@ class SIInsertWaitcnts : public MachineFunctionPass { InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS; public: - static char ID; - - SIInsertWaitcnts() : MachineFunctionPass(ID) { + SIInsertWaitcnts(MachineLoopInfo *MLI, MachinePostDominatorTree *PDT, + AliasAnalysis *AA) + : MLI(MLI), PDT(PDT), AA(AA) { (void)ForceExpCounter; (void)ForceLgkmCounter; (void)ForceVMCounter; @@ -645,20 +646,7 @@ class SIInsertWaitcnts : public MachineFunctionPass { bool isPreheaderToFlush(MachineBasicBlock &MBB, WaitcntBrackets &ScoreBrackets); bool isVMEMOrFlatVMEM(const MachineInstr &MI) const; - bool runOnMachineFunction(MachineFunction &MF) override; - - StringRef getPassName() const override { - return "SI insert wait instructions"; - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired<MachineLoopInfoWrapperPass>(); - AU.addRequired<MachinePostDominatorTreeWrapperPass>(); - AU.addUsedIfAvailable<AAResultsWrapperPass>(); - AU.addPreserved<AAResultsWrapperPass>(); - MachineFunctionPass::getAnalysisUsage(AU); - } + bool run(MachineFunction &MF); bool isForceEmitWaitcnt() const { for (auto T : inst_counter_types()) @@ -742,6 +730,36 @@ class SIInsertWaitcnts : public MachineFunctionPass { WaitcntBrackets &ScoreBrackets); }; +class SIInsertWaitcntsLegacy : public MachineFunctionPass { +public: + static char ID; + SIInsertWaitcntsLegacy() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override { + auto *MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI(); + auto *PDT = + &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree(); + AliasAnalysis *AA = nullptr; + if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>()) + AA = &AAR->getAAResults(); + + return SIInsertWaitcnts(MLI, PDT, AA).run(MF); + } + + StringRef getPassName() const override { + return "SI insert wait instructions"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<MachineLoopInfoWrapperPass>(); + AU.addRequired<MachinePostDominatorTreeWrapperPass>(); + AU.addUsedIfAvailable<AAResultsWrapperPass>(); + AU.addPreserved<AAResultsWrapperPass>(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + } // end anonymous namespace RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI, @@ -1124,19 +1142,19 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const { return hasMixedPendingEvents(T); } -INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, - false) +INITIALIZE_PASS_BEGIN(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts", + false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass) -INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false, - false) +INITIALIZE_PASS_END(SIInsertWaitcntsLegacy, DEBUG_TYPE, "SI Insert Waitcnts", + false, false) -char SIInsertWaitcnts::ID = 0; +char SIInsertWaitcntsLegacy::ID = 0; -char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID; +char &llvm::SIInsertWaitcntsID = SIInsertWaitcntsLegacy::ID; FunctionPass *llvm::createSIInsertWaitcntsPass() { - return new SIInsertWaitcnts(); + return new SIInsertWaitcntsLegacy(); } static bool updateOperandIfDifferent(MachineInstr &MI, AMDGPU::OpName OpName, @@ -2406,16 +2424,29 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside && ST->hasVmemWriteVgprInOrder(); } -bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { +PreservedAnalyses +SIInsertWaitcntsPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + auto *MLI = &MFAM.getResult<MachineLoopAnalysis>(MF); + auto *PDT = &MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF); + auto *AA = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF) + .getManager() + .getCachedResult<AAManager>(MF.getFunction()); + + if (!SIInsertWaitcnts(MLI, PDT, AA).run(MF)) + return PreservedAnalyses::all(); + + return getMachineFunctionPassPreservedAnalyses() + .preserveSet<CFGAnalyses>() + .preserve<AAManager>(); +} + +bool SIInsertWaitcnts::run(MachineFunction &MF) { ST = &MF.getSubtarget<GCNSubtarget>(); TII = ST->getInstrInfo(); TRI = &TII->getRegisterInfo(); MRI = &MF.getRegInfo(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI(); - PDT = &getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree(); - if (auto *AAR = getAnalysisIfAvailable<AAResultsWrapperPass>()) - AA = &AAR->getAAResults(); AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU()); diff --git a/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir b/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir index b6dc75db3edc1..0456d5cc463f1 100644 --- a/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/call-waw-waitcnt.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -passes=si-insert-waitcnts %s -o - | FileCheck -check-prefix=GCN %s # $sgpr30_sgpr31 will hold the return address. We need a waitcnt before SI_CALL so # that the return address is not clobbered in the callee by the outstanding load. diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir index 28d79efc00b0d..2834ca5fa6858 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-hang.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass si-insert-waitcnts %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -passes si-insert-waitcnts %s -o - | FileCheck %s --- name: test diff --git a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir index 17e3d93ed393b..f5321591a3c88 100644 --- a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir +++ b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir @@ -2,6 +2,8 @@ # RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx900 -o - %s | FileCheck %s -check-prefixes=CHECK,GFX9 # RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -o - %s | FileCheck %s # RUN: llc -run-pass=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -o - %s | FileCheck %s + +# RUN: llc -passes=si-insert-waitcnts -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -o - %s | FileCheck %s --- # CHECK-LABEL: name: vccz_corrupt_workaround # CHECK: $vcc = V_CMP_EQ_F32 >From 79a590f31ec8f979790ff770790a4f938fe67498 Mon Sep 17 00:00:00 2001 From: Akshat Oke <akshat....@amd.com> Date: Thu, 6 Mar 2025 04:52:38 +0000 Subject: [PATCH 6/9] [AMDGPU][NPM] Port SIInsertHardClauses to NPM --- llvm/lib/Target/AMDGPU/AMDGPU.h | 8 ++- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +- .../lib/Target/AMDGPU/SIInsertHardClauses.cpp | 50 +++++++++++++------ .../CodeGen/AMDGPU/hard-clauses-img-gfx10.mir | 1 + .../CodeGen/AMDGPU/hard-clauses-img-gfx11.mir | 1 + 6 files changed, 46 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 4dcfaf9b12b5e..b434676f85581 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -378,6 +378,12 @@ class SIInsertWaitcntsPass : public PassInfoMixin<SIInsertWaitcntsPass> { static bool isRequired() { return true; } }; +class SIInsertHardClausesPass : public PassInfoMixin<SIInsertHardClausesPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; + FunctionPass *createAMDGPUAnnotateUniformValuesLegacy(); ModulePass *createAMDGPUPrintfRuntimeBinding(); @@ -451,7 +457,7 @@ extern char &SIModeRegisterID; void initializeAMDGPUInsertDelayAluLegacyPass(PassRegistry &); extern char &AMDGPUInsertDelayAluID; -void initializeSIInsertHardClausesPass(PassRegistry &); +void initializeSIInsertHardClausesLegacyPass(PassRegistry &); extern char &SIInsertHardClausesID; void initializeSIInsertWaitcntsLegacyPass(PassRegistry &); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index c4641cba60e53..3eabe087a8a33 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -109,6 +109,7 @@ MACHINE_FUNCTION_PASS("si-fix-vgpr-copies", SIFixVGPRCopiesPass()) MACHINE_FUNCTION_PASS("si-fold-operands", SIFoldOperandsPass()); MACHINE_FUNCTION_PASS("si-form-memory-clauses", SIFormMemoryClausesPass()) MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass()) +MACHINE_FUNCTION_PASS("si-insert-hard-clauses", SIInsertHardClausesPass()) MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass()) MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass()) MACHINE_FUNCTION_PASS("si-lower-control-flow", SILowerControlFlowPass()) @@ -131,7 +132,6 @@ DUMMY_MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizations DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass()) DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass()) -DUMMY_MACHINE_FUNCTION_PASS("si-insert-hard-clauses", SIInsertHardClausesPass()) DUMMY_MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass()) DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass()) // TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index c3cc1dc6e495b..6c24fe5f1441a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -534,7 +534,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowLegacyPass(*PR); initializeAMDGPUInsertDelayAluLegacyPass(*PR); - initializeSIInsertHardClausesPass(*PR); + initializeSIInsertHardClausesLegacyPass(*PR); initializeSIInsertWaitcntsLegacyPass(*PR); initializeSIModeRegisterLegacyPass(*PR); initializeSIWholeQuadModeLegacyPass(*PR); diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp index dcc60765cc203..71b937f23cc3c 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp @@ -36,6 +36,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachinePassManager.h" using namespace llvm; @@ -89,18 +90,10 @@ enum HardClauseType { HARDCLAUSE_ILLEGAL, }; -class SIInsertHardClauses : public MachineFunctionPass { +class SIInsertHardClauses { public: - static char ID; const GCNSubtarget *ST = nullptr; - SIInsertHardClauses() : MachineFunctionPass(ID) {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); - } - HardClauseType getHardClauseType(const MachineInstr &MI) { if (MI.mayLoad() || (MI.mayStore() && ST->shouldClusterStores())) { if (ST->getGeneration() == AMDGPUSubtarget::GFX10) { @@ -189,9 +182,7 @@ class SIInsertHardClauses : public MachineFunctionPass { return true; } - bool runOnMachineFunction(MachineFunction &MF) override { - if (skipFunction(MF.getFunction())) - return false; + bool run(MachineFunction &MF) { ST = &MF.getSubtarget<GCNSubtarget>(); if (!ST->hasHardClauses()) @@ -265,11 +256,40 @@ class SIInsertHardClauses : public MachineFunctionPass { } }; +class SIInsertHardClausesLegacy : public MachineFunctionPass { +public: + static char ID; + SIInsertHardClausesLegacy() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override { + if (skipFunction(MF.getFunction())) + return false; + + return SIInsertHardClauses().run(MF); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + } // namespace -char SIInsertHardClauses::ID = 0; +PreservedAnalyses +llvm::SIInsertHardClausesPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + if (!SIInsertHardClauses().run(MF)) + return PreservedAnalyses::all(); + + auto PA = getMachineFunctionPassPreservedAnalyses(); + PA.preserveSet<CFGAnalyses>(); + return PA; +} + +char SIInsertHardClausesLegacy::ID = 0; -char &llvm::SIInsertHardClausesID = SIInsertHardClauses::ID; +char &llvm::SIInsertHardClausesID = SIInsertHardClausesLegacy::ID; -INITIALIZE_PASS(SIInsertHardClauses, DEBUG_TYPE, "SI Insert Hard Clauses", +INITIALIZE_PASS(SIInsertHardClausesLegacy, DEBUG_TYPE, "SI Insert Hard Clauses", false, false) diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx10.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx10.mir index 50eea4aebd5e9..1baceeef82c92 100644 --- a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx10.mir +++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx10.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -passes si-insert-hard-clauses %s -o - | FileCheck %s --- name: mimg_nsa diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir index b22de06e68a7f..7505fde047782 100644 --- a/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-img-gfx11.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -passes si-insert-hard-clauses %s -o - | FileCheck %s --- name: mimg_nsa >From 70a3b5827ef2a8579ea6218cf9f797ffd3d79404 Mon Sep 17 00:00:00 2001 From: Akshat Oke <akshat....@amd.com> Date: Thu, 6 Mar 2025 05:26:49 +0000 Subject: [PATCH 7/9] [AMDGPU][NPM] Port SILateBranchLowering to NPM --- llvm/lib/Target/AMDGPU/AMDGPU.h | 10 ++++- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 ++- .../Target/AMDGPU/SILateBranchLowering.cpp | 40 ++++++++++++++----- llvm/test/CodeGen/AMDGPU/early-term.mir | 2 + llvm/test/CodeGen/AMDGPU/readlane_exec0.mir | 1 + 6 files changed, 46 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index b434676f85581..d1dc62e9cc526 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -213,7 +213,7 @@ extern char &SILowerControlFlowLegacyID; void initializeSIPreEmitPeepholePass(PassRegistry &); extern char &SIPreEmitPeepholeID; -void initializeSILateBranchLoweringPass(PassRegistry &); +void initializeSILateBranchLoweringLegacyPass(PassRegistry &); extern char &SILateBranchLoweringPassID; void initializeSIOptimizeExecMaskingLegacyPass(PassRegistry &); @@ -384,6 +384,14 @@ class SIInsertHardClausesPass : public PassInfoMixin<SIInsertHardClausesPass> { MachineFunctionAnalysisManager &MFAM); }; +class SILateBranchLoweringPass + : public PassInfoMixin<SILateBranchLoweringPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); + static bool isRequired() { return true; } +}; + FunctionPass *createAMDGPUAnnotateUniformValuesLegacy(); ModulePass *createAMDGPUPrintfRuntimeBinding(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 3eabe087a8a33..318aad5590cda 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -111,6 +111,7 @@ MACHINE_FUNCTION_PASS("si-form-memory-clauses", SIFormMemoryClausesPass()) MACHINE_FUNCTION_PASS("si-i1-copies", SILowerI1CopiesPass()) MACHINE_FUNCTION_PASS("si-insert-hard-clauses", SIInsertHardClausesPass()) MACHINE_FUNCTION_PASS("si-insert-waitcnts", SIInsertWaitcntsPass()) +MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass()) MACHINE_FUNCTION_PASS("si-load-store-opt", SILoadStoreOptimizerPass()) MACHINE_FUNCTION_PASS("si-lower-control-flow", SILowerControlFlowPass()) MACHINE_FUNCTION_PASS("si-lower-sgpr-spills", SILowerSGPRSpillsPass()) @@ -132,7 +133,6 @@ DUMMY_MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizations DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass()) DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass()) -DUMMY_MACHINE_FUNCTION_PASS("si-late-branch-lowering", SILateBranchLoweringPass()) DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass()) // TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it // already exists. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 6c24fe5f1441a..b9d62cc9e4b63 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -540,7 +540,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIWholeQuadModeLegacyPass(*PR); initializeSILowerControlFlowLegacyPass(*PR); initializeSIPreEmitPeepholePass(*PR); - initializeSILateBranchLoweringPass(*PR); + initializeSILateBranchLoweringLegacyPass(*PR); initializeSIMemoryLegalizerLegacyPass(*PR); initializeSIOptimizeExecMaskingLegacyPass(*PR); initializeSIPreAllocateWWMRegsLegacyPass(*PR); @@ -2161,7 +2161,8 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { // TODO: addPass(SIInsertHardClausesPass()); } - // addPass(SILateBranchLoweringPass()); + addPass(SILateBranchLoweringPass()); + if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less)) { // TODO: addPass(AMDGPUSetWavePriorityPass()); } diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp index d02173f57ee37..0f5b6bd9374b0 100644 --- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp @@ -16,6 +16,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachinePassManager.h" using namespace llvm; @@ -23,7 +24,7 @@ using namespace llvm; namespace { -class SILateBranchLowering : public MachineFunctionPass { +class SILateBranchLowering { private: const SIRegisterInfo *TRI = nullptr; const SIInstrInfo *TII = nullptr; @@ -33,14 +34,23 @@ class SILateBranchLowering : public MachineFunctionPass { void earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock); public: - static char ID; + SILateBranchLowering(MachineDominatorTree *MDT) : MDT(MDT) {} + + bool run(MachineFunction &MF); unsigned MovOpc; Register ExecReg; +}; - SILateBranchLowering() : MachineFunctionPass(ID) {} +class SILateBranchLoweringLegacy : public MachineFunctionPass { +public: + static char ID; + SILateBranchLoweringLegacy() : MachineFunctionPass(ID) {} - bool runOnMachineFunction(MachineFunction &MF) override; + bool runOnMachineFunction(MachineFunction &MF) override { + auto *MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); + return SILateBranchLowering(MDT).run(MF); + } StringRef getPassName() const override { return "SI Final Branch Preparation"; @@ -55,15 +65,15 @@ class SILateBranchLowering : public MachineFunctionPass { } // end anonymous namespace -char SILateBranchLowering::ID = 0; +char SILateBranchLoweringLegacy::ID = 0; -INITIALIZE_PASS_BEGIN(SILateBranchLowering, DEBUG_TYPE, +INITIALIZE_PASS_BEGIN(SILateBranchLoweringLegacy, DEBUG_TYPE, "SI insert s_cbranch_execz instructions", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) -INITIALIZE_PASS_END(SILateBranchLowering, DEBUG_TYPE, +INITIALIZE_PASS_END(SILateBranchLoweringLegacy, DEBUG_TYPE, "SI insert s_cbranch_execz instructions", false, false) -char &llvm::SILateBranchLoweringPassID = SILateBranchLowering::ID; +char &llvm::SILateBranchLoweringPassID = SILateBranchLoweringLegacy::ID; static void generateEndPgm(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, @@ -144,11 +154,21 @@ void SILateBranchLowering::earlyTerm(MachineInstr &MI, MDT->insertEdge(&MBB, EarlyExitBlock); } -bool SILateBranchLowering::runOnMachineFunction(MachineFunction &MF) { +PreservedAnalyses +llvm::SILateBranchLoweringPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + auto *MDT = &MFAM.getResult<MachineDominatorTreeAnalysis>(MF); + if (!SILateBranchLowering(MDT).run(MF)) + return PreservedAnalyses::all(); + + return getMachineFunctionPassPreservedAnalyses() + .preserve<MachineDominatorTreeAnalysis>(); +} + +bool SILateBranchLowering::run(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); - MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(); MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; diff --git a/llvm/test/CodeGen/AMDGPU/early-term.mir b/llvm/test/CodeGen/AMDGPU/early-term.mir index 77bc9729ee845..3d75d405a46d3 100644 --- a/llvm/test/CodeGen/AMDGPU/early-term.mir +++ b/llvm/test/CodeGen/AMDGPU/early-term.mir @@ -2,6 +2,8 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -run-pass=si-late-branch-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX10 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -run-pass=si-late-branch-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -passes=si-late-branch-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX11 %s + --- | define amdgpu_ps void @early_term_scc0_end_block() { ret void diff --git a/llvm/test/CodeGen/AMDGPU/readlane_exec0.mir b/llvm/test/CodeGen/AMDGPU/readlane_exec0.mir index 6a286eafa6d58..a4c05aa781df7 100644 --- a/llvm/test/CodeGen/AMDGPU/readlane_exec0.mir +++ b/llvm/test/CodeGen/AMDGPU/readlane_exec0.mir @@ -1,4 +1,5 @@ # RUN: llc -o - %s -mtriple=amdgcn -mcpu=fiji -run-pass=si-late-branch-lowering -verify-machineinstrs | FileCheck -check-prefix=GCN %s +# RUN: llc -o - %s -mtriple=amdgcn -mcpu=fiji -passes=si-late-branch-lowering -verify-machineinstrs | FileCheck -check-prefix=GCN %s # GCN-LABEL: readlane_exec0 # GCN: bb.0 >From 192871bf259c7187ef1b9d2d9daec53470e73264 Mon Sep 17 00:00:00 2001 From: Akshat Oke <akshat....@amd.com> Date: Thu, 6 Mar 2025 06:07:23 +0000 Subject: [PATCH 8/9] [AMDGPU][NPM] Port AMDGPUSetWavePriority to NPM --- llvm/lib/Target/AMDGPU/AMDGPU.h | 9 +++- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +- .../Target/AMDGPU/AMDGPUSetWavePriority.cpp | 44 ++++++++++++++----- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 5 +-- llvm/test/CodeGen/AMDGPU/set-wave-priority.ll | 5 +++ 5 files changed, 48 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index d1dc62e9cc526..27ae6d42ec21d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -392,6 +392,13 @@ class SILateBranchLoweringPass static bool isRequired() { return true; } }; +class AMDGPUSetWavePriorityPass + : public PassInfoMixin<AMDGPUSetWavePriorityPass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); +}; + FunctionPass *createAMDGPUAnnotateUniformValuesLegacy(); ModulePass *createAMDGPUPrintfRuntimeBinding(); @@ -504,7 +511,7 @@ void initializeGCNPreRAOptimizationsLegacyPass(PassRegistry &); extern char &GCNPreRAOptimizationsID; FunctionPass *createAMDGPUSetWavePriorityPass(); -void initializeAMDGPUSetWavePriorityPass(PassRegistry &); +void initializeAMDGPUSetWavePriorityLegacyPass(PassRegistry &); void initializeGCNRewritePartialRegUsesLegacyPass(llvm::PassRegistry &); extern char &GCNRewritePartialRegUsesID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 318aad5590cda..4956897d22fde 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -100,6 +100,7 @@ MACHINE_FUNCTION_PASS("amdgpu-insert-delay-alu", AMDGPUInsertDelayAluPass()) MACHINE_FUNCTION_PASS("amdgpu-isel", AMDGPUISelDAGToDAGPass(*this)) MACHINE_FUNCTION_PASS("amdgpu-pre-ra-long-branch-reg", GCNPreRALongBranchRegPass()) MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass()) +MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass()) MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass()) MACHINE_FUNCTION_PASS("amdgpu-nsa-reassign", GCNNSAReassignPass()) MACHINE_FUNCTION_PASS("gcn-dpp-combine", GCNDPPCombinePass()) @@ -131,7 +132,6 @@ MACHINE_FUNCTION_PASS("si-wqm", SIWholeQuadModePass()) #define DUMMY_MACHINE_FUNCTION_PASS(NAME, CREATE_PASS) DUMMY_MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass()) DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass()) -DUMMY_MACHINE_FUNCTION_PASS("amdgpu-set-wave-priority", AMDGPUSetWavePriorityPass()) DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass()) // TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp index c16d33f1453c0..29aecda82bc4b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSetWavePriority.cpp @@ -19,6 +19,7 @@ #include "SIInstrInfo.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachinePassManager.h" using namespace llvm; @@ -40,15 +41,11 @@ struct MBBInfo { using MBBInfoSet = DenseMap<const MachineBasicBlock *, MBBInfo>; -class AMDGPUSetWavePriority : public MachineFunctionPass { +class AMDGPUSetWavePriority { public: static char ID; - AMDGPUSetWavePriority() : MachineFunctionPass(ID) {} - - StringRef getPassName() const override { return "Set wave priority"; } - - bool runOnMachineFunction(MachineFunction &MF) override; + bool run(MachineFunction &MF); private: MachineInstr *BuildSetprioMI(MachineBasicBlock &MBB, @@ -58,15 +55,30 @@ class AMDGPUSetWavePriority : public MachineFunctionPass { const SIInstrInfo *TII; }; +class AMDGPUSetWavePriorityLegacy : public MachineFunctionPass { +public: + static char ID; + + AMDGPUSetWavePriorityLegacy() : MachineFunctionPass(ID) {} + + StringRef getPassName() const override { return "Set wave priority"; } + + bool runOnMachineFunction(MachineFunction &MF) override { + if (skipFunction(MF.getFunction())) + return false; + return AMDGPUSetWavePriority().run(MF); + } +}; + } // End anonymous namespace. -INITIALIZE_PASS(AMDGPUSetWavePriority, DEBUG_TYPE, "Set wave priority", false, - false) +INITIALIZE_PASS(AMDGPUSetWavePriorityLegacy, DEBUG_TYPE, "Set wave priority", + false, false) -char AMDGPUSetWavePriority::ID = 0; +char AMDGPUSetWavePriorityLegacy::ID = 0; FunctionPass *llvm::createAMDGPUSetWavePriorityPass() { - return new AMDGPUSetWavePriority(); + return new AMDGPUSetWavePriorityLegacy(); } MachineInstr * @@ -96,12 +108,20 @@ static bool isVMEMLoad(const MachineInstr &MI) { return SIInstrInfo::isVMEM(MI) && MI.mayLoad(); } -bool AMDGPUSetWavePriority::runOnMachineFunction(MachineFunction &MF) { +PreservedAnalyses +llvm::AMDGPUSetWavePriorityPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + if (!AMDGPUSetWavePriority().run(MF)) + return PreservedAnalyses::all(); + return getMachineFunctionPassPreservedAnalyses(); +} + +bool AMDGPUSetWavePriority::run(MachineFunction &MF) { const unsigned HighPriority = 3; const unsigned LowPriority = 0; Function &F = MF.getFunction(); - if (skipFunction(F) || !AMDGPU::isEntryFunctionCC(F.getCallingConv())) + if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) return false; const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index b9d62cc9e4b63..857af30b348cb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -2163,9 +2163,8 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { addPass(SILateBranchLoweringPass()); - if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less)) { - // TODO: addPass(AMDGPUSetWavePriorityPass()); - } + if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less)) + addPass(AMDGPUSetWavePriorityPass()); if (TM.getOptLevel() > CodeGenOptLevel::None) { // TODO: addPass(SIPreEmitPeepholePass()); diff --git a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll index ab6877ac4e6d1..a27d1217031ca 100644 --- a/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll +++ b/llvm/test/CodeGen/AMDGPU/set-wave-priority.ll @@ -1,6 +1,11 @@ ; RUN: llc -mtriple=amdgcn -amdgpu-set-wave-priority=true -o - %s | \ ; RUN: FileCheck %s +; RUN: llc -mtriple=amdgcn -stop-after=si-late-branch-lowering -o - %s | \ +; RUN: llc -x mir -mtriple=amdgcn -passes=amdgpu-set-wave-priority -o - | \ +; RUN: llc -x mir -mtriple=amdgcn -start-after=si-late-branch-lowering -o - | \ +; RUN: FileCheck %s + ; CHECK-LABEL: no_setprio: ; CHECK-NOT: s_setprio ; CHECK: ; return to shader part epilog >From bedfcb1378d22a7d58cde2547fba71f7ff3f235f Mon Sep 17 00:00:00 2001 From: Akshat Oke <akshat....@amd.com> Date: Thu, 6 Mar 2025 06:20:13 +0000 Subject: [PATCH 9/9] [AMDGPU][NPM] Port SIPreEmitPeephole to NPM --- llvm/lib/Target/AMDGPU/AMDGPU.h | 9 +++++- llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def | 2 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 7 ++--- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 31 ++++++++++++++----- .../AMDGPU/insert-handle-flat-vmem-ds.mir | 1 + ...ort-exec-branches-special-instructions.mir | 1 + .../CodeGen/AMDGPU/set-gpr-idx-peephole.mir | 1 + 7 files changed, 38 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 27ae6d42ec21d..b8f5d85ef0b9a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -210,7 +210,7 @@ extern char &SIWholeQuadModeID; void initializeSILowerControlFlowLegacyPass(PassRegistry &); extern char &SILowerControlFlowLegacyID; -void initializeSIPreEmitPeepholePass(PassRegistry &); +void initializeSIPreEmitPeepholeLegacyPass(PassRegistry &); extern char &SIPreEmitPeepholeID; void initializeSILateBranchLoweringLegacyPass(PassRegistry &); @@ -392,6 +392,13 @@ class SILateBranchLoweringPass static bool isRequired() { return true; } }; +class SIPreEmitPeepholePass : public PassInfoMixin<SIPreEmitPeepholePass> { +public: + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); + static bool isRequired() { return true; } +}; + class AMDGPUSetWavePriorityPass : public PassInfoMixin<AMDGPUSetWavePriorityPass> { public: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def index 4956897d22fde..f14499d0d3146 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def +++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def @@ -125,6 +125,7 @@ MACHINE_FUNCTION_PASS("si-optimize-exec-masking-pre-ra", SIOptimizeExecMaskingPr MACHINE_FUNCTION_PASS("si-peephole-sdwa", SIPeepholeSDWAPass()) MACHINE_FUNCTION_PASS("si-post-ra-bundler", SIPostRABundlerPass()) MACHINE_FUNCTION_PASS("si-pre-allocate-wwm-regs", SIPreAllocateWWMRegsPass()) +MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass()) MACHINE_FUNCTION_PASS("si-shrink-instructions", SIShrinkInstructionsPass()) MACHINE_FUNCTION_PASS("si-wqm", SIWholeQuadModePass()) #undef MACHINE_FUNCTION_PASS @@ -133,7 +134,6 @@ MACHINE_FUNCTION_PASS("si-wqm", SIWholeQuadModePass()) DUMMY_MACHINE_FUNCTION_PASS("amdgpu-pre-ra-optimizations", GCNPreRAOptimizationsPass()) DUMMY_MACHINE_FUNCTION_PASS("amdgpu-rewrite-partial-reg-uses", GCNRewritePartialRegUsesPass()) -DUMMY_MACHINE_FUNCTION_PASS("si-pre-emit-peephole", SIPreEmitPeepholePass()) // TODO: Move amdgpu-preload-kern-arg-prolog to MACHINE_FUNCTION_PASS since it // already exists. DUMMY_MACHINE_FUNCTION_PASS("amdgpu-preload-kern-arg-prolog", AMDGPUPreloadKernArgPrologPass()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 857af30b348cb..05eb609956199 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -539,7 +539,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIModeRegisterLegacyPass(*PR); initializeSIWholeQuadModeLegacyPass(*PR); initializeSILowerControlFlowLegacyPass(*PR); - initializeSIPreEmitPeepholePass(*PR); + initializeSIPreEmitPeepholeLegacyPass(*PR); initializeSILateBranchLoweringLegacyPass(*PR); initializeSIMemoryLegalizerLegacyPass(*PR); initializeSIOptimizeExecMaskingLegacyPass(*PR); @@ -2166,9 +2166,8 @@ void AMDGPUCodeGenPassBuilder::addPreEmitPass(AddMachinePass &addPass) const { if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less)) addPass(AMDGPUSetWavePriorityPass()); - if (TM.getOptLevel() > CodeGenOptLevel::None) { - // TODO: addPass(SIPreEmitPeepholePass()); - } + if (TM.getOptLevel() > CodeGenOptLevel::None) + addPass(SIPreEmitPeepholePass()); // The hazard recognizer that runs as part of the post-ra scheduler does not // guarantee to be able handle all hazards correctly. This is because if there diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 2bb70c138a50c..9db2118f2997b 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -24,7 +24,7 @@ using namespace llvm; namespace { -class SIPreEmitPeephole : public MachineFunctionPass { +class SIPreEmitPeephole { private: const SIInstrInfo *TII = nullptr; const SIRegisterInfo *TRI = nullptr; @@ -40,24 +40,31 @@ class SIPreEmitPeephole : public MachineFunctionPass { const MachineBasicBlock &To) const; bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB); +public: + bool run(MachineFunction &MF); +}; + +class SIPreEmitPeepholeLegacy : public MachineFunctionPass { public: static char ID; - SIPreEmitPeephole() : MachineFunctionPass(ID) { - initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry()); + SIPreEmitPeepholeLegacy() : MachineFunctionPass(ID) { + initializeSIPreEmitPeepholeLegacyPass(*PassRegistry::getPassRegistry()); } - bool runOnMachineFunction(MachineFunction &MF) override; + bool runOnMachineFunction(MachineFunction &MF) override { + return SIPreEmitPeephole().run(MF); + } }; } // End anonymous namespace. -INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE, +INITIALIZE_PASS(SIPreEmitPeepholeLegacy, DEBUG_TYPE, "SI peephole optimizations", false, false) -char SIPreEmitPeephole::ID = 0; +char SIPreEmitPeepholeLegacy::ID = 0; -char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID; +char &llvm::SIPreEmitPeepholeID = SIPreEmitPeepholeLegacy::ID; bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const { // Match: @@ -410,7 +417,15 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI, return true; } -bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) { +PreservedAnalyses +llvm::SIPreEmitPeepholePass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + if (!SIPreEmitPeephole().run(MF)) + return PreservedAnalyses::all(); + return getMachineFunctionPassPreservedAnalyses(); +} + +bool SIPreEmitPeephole::run(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); diff --git a/llvm/test/CodeGen/AMDGPU/insert-handle-flat-vmem-ds.mir b/llvm/test/CodeGen/AMDGPU/insert-handle-flat-vmem-ds.mir index d89f306c96a36..2e8c8ca9c7a6c 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-handle-flat-vmem-ds.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-handle-flat-vmem-ds.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=polaris10 -run-pass si-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=polaris10 -passes si-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s --- diff --git a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir index 20de119471ba3..92a9a195fc4c7 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir +++ b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=si-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -passes=si-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s # Make sure mandatory skips are not removed around mode defs. --- diff --git a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir index 796a70cfe8a39..1d0a6db36ea3b 100644 --- a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir +++ b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes si-pre-emit-peephole -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX --- name: simple _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits