================
@@ -63,4 +70,189 @@ char &llvm::AMDGPURBSelectID = AMDGPURBSelect::ID;
 
 FunctionPass *llvm::createAMDGPURBSelectPass() { return new AMDGPURBSelect(); }
 
-bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) { return true; }
+bool shouldRBSelect(MachineInstr &MI) {
+  if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode())
+    return false;
+
+  if (MI.getOpcode() == AMDGPU::PHI || MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
+    return false;
+
+  if (MI.isInlineAsm())
+    return false;
+
+  return true;
+}
+
+void setRB(MachineInstr &MI, MachineOperand &DefOP, MachineIRBuilder B,
+           MachineRegisterInfo &MRI, const RegisterBank &RB) {
+  Register Reg = DefOP.getReg();
+  // A register that already has a register class got it during
+  // pre-inst-selection of another instruction. A cross-bank copy may have
+  // been required, so we insert a copy that can be removed later. This
+  // simplifies the post-rb-legalize artifact combiner and avoids the need to
+  // special-case some patterns.
+  if (MRI.getRegClassOrNull(Reg)) {
+    LLT Ty = MRI.getType(Reg);
+    Register NewReg = MRI.createVirtualRegister({&RB, Ty});
+    DefOP.setReg(NewReg);
+
+    auto &MBB = *MI.getParent();
+    B.setInsertPt(MBB, MI.isPHI() ? MBB.getFirstNonPHI()
+                                  : std::next(MI.getIterator()));
+    B.buildCopy(Reg, NewReg);
+
+    // The problem was discovered for a uniform S1 that was used as both a
+    // lane mask (vcc) and a regular sgpr S1.
+    // - The lane-mask (vcc) use was by si_if; this use is divergent and
+    //   requires a non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection
+    //   of si_if sets sreg_64_xexec (S1) on the def of the uniform S1,
+    //   making it a lane mask.
+    // - The regular sgpr S1 (uniform) instruction is now broken since it
+    //   uses sreg_64_xexec (S1), which is divergent.
+
+    // "Clear" register classes from uses on generic instructions and set
+    // register banks instead.
+    for (auto &UseMI : MRI.use_instructions(Reg)) {
+      if (shouldRBSelect(UseMI)) {
+        for (MachineOperand &Op : UseMI.operands()) {
+          if (Op.isReg() && Op.isUse() && Op.getReg() == Reg)
+            Op.setReg(NewReg);
+        }
+      }
+    }
+
+  } else {
+    MRI.setRegBank(Reg, RB);
+  }
+}
+
+void setRBUse(MachineInstr &MI, MachineOperand &UseOP, MachineIRBuilder B,
+              MachineRegisterInfo &MRI, const RegisterBank &RB) {
+  Register Reg = UseOP.getReg();
+
+  LLT Ty = MRI.getType(Reg);
+  Register NewReg = MRI.createVirtualRegister({&RB, Ty});
+  UseOP.setReg(NewReg);
+
+  if (MI.isPHI()) {
+    auto DefMI = MRI.getVRegDef(Reg)->getIterator();
+    MachineBasicBlock *DefMBB = DefMI->getParent();
+    B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
+  } else {
+    B.setInstr(MI);
+  }
+
+  B.buildCopy(NewReg, Reg);
+}
+
+// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside
+// the cycle.
+// Note: uniformity analysis does not consider registers with a vgpr def to
+// be divergent (a vgpr can hold a uniform value).
+// - TODO: implicit use of $exec could be used as an indicator that the
+//   instruction is divergent.
+bool isTemporalDivergenceCopy(Register Reg, MachineRegisterInfo &MRI) {
+  MachineInstr *MI = MRI.getVRegDef(Reg);
+  if (MI->getOpcode() == AMDGPU::COPY) {
+    // Look for an implicit use of $exec on the copy.
+    for (auto &Op : MI->implicit_operands()) {
+      if (!Op.isReg())
+        continue;
+      if (Op.getReg() == AMDGPU::EXEC)
+        return true;
+    }
+  }
+
+  return false;
+}
+
+Register getVReg(MachineOperand &Op) {
+  if (!Op.isReg())
+    return Register();
+
+  Register Reg = Op.getReg();
+  if (!Reg.isVirtual())
+    return Register();
+
+  return Reg;
+}
+
+bool AMDGPURBSelect::runOnMachineFunction(MachineFunction &MF) {
+  MachineUniformityInfo &MUI =
+      getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
+  AMDGPU::IntrinsicLaneMaskAnalyzer ILMA(MF);
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo();
+
+  MachineIRBuilder B(MF);
+
+  // Assign register banks to ALL def registers on G_ instructions.
+  // Same for copies if they have no register bank or class on def.
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (!shouldRBSelect(MI))
+        continue;
+
+      for (MachineOperand &DefOP : MI.defs()) {
+        Register DefReg = getVReg(DefOP);
+        if (!DefReg)
+          continue;
+
+        // Copies can have a register class on def registers.
+        if (MI.isCopy() && MRI.getRegClassOrNull(DefReg)) {
+          continue;
+        }
+
+        if (MUI.isUniform(DefReg) || ILMA.isS32S64LaneMask(DefReg)) {
+          setRB(MI, DefOP, B, MRI, RBI.getRegBank(AMDGPU::SGPRRegBankID));
+        } else {
+          if (MRI.getType(DefReg) == LLT::scalar(1))
+            setRB(MI, DefOP, B, MRI, RBI.getRegBank(AMDGPU::VCCRegBankID));
+          else
+            setRB(MI, DefOP, B, MRI, RBI.getRegBank(AMDGPU::VGPRRegBankID));
+        }
+      }
+    }
+  }
+
+  // At this point all virtual registers have a register class or bank:
+  // - Defs of G_ instructions have register banks.
+  // - Defs and uses of inst-selected instructions have register classes.
+  // - Defs and uses of copies can have either a register class or a bank,
+  //   and most notably
+  // - Uses of G_ instructions can have either a register class or a bank.
+
+  // Reassign uses of G_ instructions to only have register banks.
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (!shouldRBSelect(MI))
+        continue;
+
+      // Copies can have a register class on use registers.
+      if (MI.isCopy())
+        continue;
+
+      for (MachineOperand &UseOP : MI.uses()) {
+        Register UseReg = getVReg(UseOP);

----------------
arsenm wrote:
I think getVReg isn't helping readability; it's more canonical to just have the isReg + isVirtual check directly here.

https://github.com/llvm/llvm-project/pull/112863
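For illustration, a minimal sketch of what the inlined check could look like in the use-loop above (a hypothetical rewrite, not code from the PR):

    for (MachineOperand &UseOP : MI.uses()) {
      // Inline the isReg + isVirtual check instead of calling getVReg().
      if (!UseOP.isReg() || !UseOP.getReg().isVirtual())
        continue;

      Register UseReg = UseOP.getReg();
      // ... assign a register bank to UseReg, as in the def loop above ...
    }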