From: Michel Dänzer <michel.daen...@amd.com>
Signed-off-by: Michel Dänzer <michel.daen...@amd.com> --- This patch applies on top of Christian's SGPR liveness patch. lib/Target/AMDGPU/AMDGPU.h | 2 +- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +- lib/Target/AMDGPU/CMakeLists.txt | 2 +- lib/Target/AMDGPU/SIFixSGPRLiveness.cpp | 2 +- lib/Target/AMDGPU/SILowerControlFlow.cpp | 193 +++++++++++++++++++++++++++++ lib/Target/AMDGPU/SILowerFlowControl.cpp | 193 ----------------------------- 6 files changed, 197 insertions(+), 197 deletions(-) create mode 100644 lib/Target/AMDGPU/SILowerControlFlow.cpp delete mode 100644 lib/Target/AMDGPU/SILowerFlowControl.cpp diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 33a74dc..2a06ade 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -25,7 +25,7 @@ FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm); // SI Passes FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm); -FunctionPass *createSILowerFlowControlPass(TargetMachine &tm); +FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSILowerLiteralConstantsPass(TargetMachine &tm); FunctionPass *createSIFixSGPRLivenessPass(TargetMachine &tm); diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 024ff3d..5c4af91 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -137,7 +137,7 @@ bool AMDGPUPassConfig::addPreEmitPass() { addPass(&FinalizeMachineBundlesID); } else { addPass(createSILowerLiteralConstantsPass(*TM)); - addPass(createSILowerFlowControlPass(*TM)); + addPass(createSILowerControlFlowPass(*TM)); } return false; diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index 5e013f6..cd3f174 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -41,7 +41,7 @@ add_llvm_target(AMDGPUCodeGen 
SIInstrInfo.cpp SIISelLowering.cpp SILowerLiteralConstants.cpp - SILowerFlowControl.cpp + SILowerControlFlow.cpp SIMachineFunctionInfo.cpp SIRegisterInfo.cpp SIPreColorSGPRs.cpp diff --git a/lib/Target/AMDGPU/SIFixSGPRLiveness.cpp b/lib/Target/AMDGPU/SIFixSGPRLiveness.cpp index 028753e..f92eff5 100644 --- a/lib/Target/AMDGPU/SIFixSGPRLiveness.cpp +++ b/lib/Target/AMDGPU/SIFixSGPRLiveness.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// SGPRs are not affected by flow control. This pass adjust SGPR liveness in +// SGPRs are not affected by control flow. This pass adjusts SGPR liveness // so that the register allocator can still correctly allocate them. // //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp new file mode 100644 index 0000000..b43fdeb --- /dev/null +++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -0,0 +1,193 @@ +//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass lowers the pseudo control flow instructions (SI_IF_NZ, ELSE, ENDIF) +// to predicated instructions. +// +// All control flow (except loops) is handled using predicated instructions and +// a predicate stack. Each Scalar ALU controls the operations of 64 Vector +// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs +// by writing to the 64-bit EXEC register (each bit corresponds to a +// single vector ALU). 
Typically, for predicates, a vector ALU will write +// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each +// Vector ALU) and then the Scalar ALU will AND the VCC register with the +// EXEC to update the predicates. +// +// For example: +// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2 +// SI_IF_NZ %VCC +// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 +// ELSE +// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 +// ENDIF +// +// becomes: +// +// %SGPR0 = S_MOV_B64 %EXEC // Save the current exec mask +// %EXEC = S_AND_B64 %VCC, %EXEC // Update the exec mask +// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask +// S_CBRANCH_EXECZ label0 // This instruction is an +// // optimization which allows us to +// // branch if all the bits of +// // EXEC are zero. +// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch +// +// label0: +// %SGPR2 = S_MOV_B64 %EXEC // Save the current exec mask +// %EXEC = S_MOV_B64 %SGPR0 // Restore the exec mask for the Then block +// %SGPR0 = S_MOV_B64 %SGPR2 // Save the exec mask from the If block +// S_CBRANCH_EXECZ label1 // Use our branch optimization +// // instruction again. 
+// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR // Do the THEN block +// label1: +// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +namespace { + +class SILowerControlFlowPass : public MachineFunctionPass { + +private: + static char ID; + const TargetInstrInfo *TII; + std::vector<unsigned> PredicateStack; + std::vector<unsigned> UnusedRegisters; + + void pushExecMask(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); + void popExecMask(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); + +public: + SILowerControlFlowPass(TargetMachine &tm) : + MachineFunctionPass(ID), TII(tm.getInstrInfo()) { } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + const char *getPassName() const { + return "SI Lower control flow instructions"; + } + +}; + +} // End anonymous namespace + +char SILowerControlFlowPass::ID = 0; + +FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) { + return new SILowerControlFlowPass(tm); +} + +bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { + + // Find all the unused registers that can be used for the predicate stack. 
+ for (TargetRegisterClass::iterator I = AMDGPU::SReg_64RegClass.begin(), + S = AMDGPU::SReg_64RegClass.end(); + I != S; ++I) { + unsigned Reg = *I; + if (!MF.getRegInfo().isPhysRegUsed(Reg)) { + UnusedRegisters.insert(UnusedRegisters.begin(), Reg); + } + } + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); + I != MBB.end(); I = Next) { + Next = llvm::next(I); + MachineInstr &MI = *I; + switch (MI.getOpcode()) { + default: break; + case AMDGPU::SI_IF_NZ: + pushExecMask(MBB, I); + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_AND_B64), + AMDGPU::EXEC) + .addOperand(MI.getOperand(0)) // VCC + .addReg(AMDGPU::EXEC); + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64), + PredicateStack.back()) + .addReg(PredicateStack.back()) + .addReg(AMDGPU::EXEC); + MI.eraseFromParent(); + break; + case AMDGPU::ELSE: + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64), + UnusedRegisters.back()) + .addReg(AMDGPU::EXEC); + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64), + AMDGPU::EXEC) + .addReg(PredicateStack.back()); + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64), + PredicateStack.back()) + .addReg(UnusedRegisters.back()); + MI.eraseFromParent(); + break; + case AMDGPU::ENDIF: + popExecMask(MBB, I); + if (MF.getInfo<SIMachineFunctionInfo>()->ShaderType == ShaderType::PIXEL && + PredicateStack.empty()) { + // If the exec mask is non-zero, skip the next two instructions + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addImm(3) + .addReg(AMDGPU::EXEC); + + // Exec mask is zero: Export to NULL target... 
+ BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::EXP)) + .addImm(0) + .addImm(0x09) // V_008DFC_SQ_EXP_NULL + .addImm(0) + .addImm(1) + .addImm(1) + .addReg(AMDGPU::SREG_LIT_0) + .addReg(AMDGPU::SREG_LIT_0) + .addReg(AMDGPU::SREG_LIT_0) + .addReg(AMDGPU::SREG_LIT_0); + + // ... and terminate wavefront + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_ENDPGM)); + } + MI.eraseFromParent(); + break; + } + } + } + return false; +} + +void SILowerControlFlowPass::pushExecMask(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + + assert(!UnusedRegisters.empty() && "Ran out of registers for predicate stack"); + unsigned StackReg = UnusedRegisters.back(); + UnusedRegisters.pop_back(); + PredicateStack.push_back(StackReg); + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64), + StackReg) + .addReg(AMDGPU::EXEC); +} + +void SILowerControlFlowPass::popExecMask(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + unsigned StackReg = PredicateStack.back(); + PredicateStack.pop_back(); + UnusedRegisters.push_back(StackReg); + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_B64), + AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(StackReg); +} diff --git a/lib/Target/AMDGPU/SILowerFlowControl.cpp b/lib/Target/AMDGPU/SILowerFlowControl.cpp deleted file mode 100644 index 0d90c13..0000000 --- a/lib/Target/AMDGPU/SILowerFlowControl.cpp +++ /dev/null @@ -1,193 +0,0 @@ -//===-- SILowerFlowControl.cpp - Use predicates for flow control ----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This pass lowers the pseudo flow control instructions (SI_IF_NZ, ELSE, ENDIF) -// to predicated instructions. -// -// All flow control (except loops) is handled using predicated instructions and -// a predicate stack. 
Each Scalar ALU controls the operations of 64 Vector -// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs -// by writting to the 64-bit EXEC register (each bit corresponds to a -// single vector ALU). Typically, for predicates, a vector ALU will write -// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each -// Vector ALU) and then the ScalarALU will AND the VCC register with the -// EXEC to update the predicates. -// -// For example: -// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2 -// SI_IF_NZ %VCC -// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 -// ELSE -// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 -// ENDIF -// -// becomes: -// -// %SGPR0 = S_MOV_B64 %EXEC // Save the current exec mask -// %EXEC = S_AND_B64 %VCC, %EXEC // Update the exec mask -// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask -// S_CBRANCH_EXECZ label0 // This instruction is an -// // optimization which allows us to -// // branch if all the bits of -// // EXEC are zero. -// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch -// -// label0: -// %SGPR2 = S_MOV_B64 %EXEC // Save the current exec mask -// %EXEC = S_MOV_B64 %SGPR0 // Restore the exec mask for the Then block -// %SGPR0 = S_MOV_B64 %SGPR2 // Save the exec mask from the If block -// S_BRANCH_EXECZ label1 // Use our branch optimization -// // instruction again. 
-// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR // Do the THEN block -// label1: -// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "SIInstrInfo.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" - -using namespace llvm; - -namespace { - -class SILowerFlowControlPass : public MachineFunctionPass { - -private: - static char ID; - const TargetInstrInfo *TII; - std::vector<unsigned> PredicateStack; - std::vector<unsigned> UnusedRegisters; - - void pushExecMask(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); - void popExecMask(MachineBasicBlock &MBB, MachineBasicBlock::iterator I); - -public: - SILowerFlowControlPass(TargetMachine &tm) : - MachineFunctionPass(ID), TII(tm.getInstrInfo()) { } - - virtual bool runOnMachineFunction(MachineFunction &MF); - - const char *getPassName() const { - return "SI Lower flow control instructions"; - } - -}; - -} // End anonymous namespace - -char SILowerFlowControlPass::ID = 0; - -FunctionPass *llvm::createSILowerFlowControlPass(TargetMachine &tm) { - return new SILowerFlowControlPass(tm); -} - -bool SILowerFlowControlPass::runOnMachineFunction(MachineFunction &MF) { - - // Find all the unused registers that can be used for the predicate stack. 
- for (TargetRegisterClass::iterator I = AMDGPU::SReg_64RegClass.begin(), - S = AMDGPU::SReg_64RegClass.end(); - I != S; ++I) { - unsigned Reg = *I; - if (!MF.getRegInfo().isPhysRegUsed(Reg)) { - UnusedRegisters.insert(UnusedRegisters.begin(), Reg); - } - } - - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; - for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); - I != MBB.end(); I = Next) { - Next = llvm::next(I); - MachineInstr &MI = *I; - switch (MI.getOpcode()) { - default: break; - case AMDGPU::SI_IF_NZ: - pushExecMask(MBB, I); - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_AND_B64), - AMDGPU::EXEC) - .addOperand(MI.getOperand(0)) // VCC - .addReg(AMDGPU::EXEC); - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64), - PredicateStack.back()) - .addReg(PredicateStack.back()) - .addReg(AMDGPU::EXEC); - MI.eraseFromParent(); - break; - case AMDGPU::ELSE: - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64), - UnusedRegisters.back()) - .addReg(AMDGPU::EXEC); - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64), - AMDGPU::EXEC) - .addReg(PredicateStack.back()); - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64), - PredicateStack.back()) - .addReg(UnusedRegisters.back()); - MI.eraseFromParent(); - break; - case AMDGPU::ENDIF: - popExecMask(MBB, I); - if (MF.getInfo<SIMachineFunctionInfo>()->ShaderType == ShaderType::PIXEL && - PredicateStack.empty()) { - // If the exec mask is non-zero, skip the next two instructions - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(3) - .addReg(AMDGPU::EXEC); - - // Exec mask is zero: Export to NULL target... 
- BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::EXP)) - .addImm(0) - .addImm(0x09) // V_008DFC_SQ_EXP_NULL - .addImm(0) - .addImm(1) - .addImm(1) - .addReg(AMDGPU::SREG_LIT_0) - .addReg(AMDGPU::SREG_LIT_0) - .addReg(AMDGPU::SREG_LIT_0) - .addReg(AMDGPU::SREG_LIT_0); - - // ... and terminate wavefront - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_ENDPGM)); - } - MI.eraseFromParent(); - break; - } - } - } - return false; -} - -void SILowerFlowControlPass::pushExecMask(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) { - - assert(!UnusedRegisters.empty() && "Ran out of registers for predicate stack"); - unsigned StackReg = UnusedRegisters.back(); - UnusedRegisters.pop_back(); - PredicateStack.push_back(StackReg); - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64), - StackReg) - .addReg(AMDGPU::EXEC); -} - -void SILowerFlowControlPass::popExecMask(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I) { - unsigned StackReg = PredicateStack.back(); - PredicateStack.pop_back(); - UnusedRegisters.push_back(StackReg); - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_B64), - AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(StackReg); -} -- 1.7.10.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev