================ @@ -107,3 +107,183 @@ void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) { S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg()); } } + +MachineInstrBuilder AMDGPU::buildReadAnyLaneB32(MachineIRBuilder &B, + const DstOp &SgprDst, + const SrcOp &VgprSrc, + const RegisterBankInfo &RBI) { + auto RFL = B.buildInstr(AMDGPU::G_READANYLANE, {SgprDst}, {VgprSrc}); + Register Dst = RFL->getOperand(0).getReg(); + Register Src = RFL->getOperand(1).getReg(); + MachineRegisterInfo &MRI = *B.getMRI(); + if (!MRI.getRegBankOrNull(Dst)) + MRI.setRegBank(Dst, RBI.getRegBank(SGPRRegBankID)); + if (!MRI.getRegBankOrNull(Src)) + MRI.setRegBank(Src, RBI.getRegBank(VGPRRegBankID)); + return RFL; +} + +MachineInstrBuilder +AMDGPU::buildReadAnyLaneSequenceOfB32(MachineIRBuilder &B, const DstOp &SgprDst, + const SrcOp &VgprSrc, LLT B32Ty, + const RegisterBankInfo &RBI) { + MachineRegisterInfo &MRI = *B.getMRI(); + SmallVector<Register, 8> SgprDstParts; + auto Unmerge = B.buildUnmerge(B32Ty, VgprSrc); + for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { + SgprDstParts.push_back( + buildReadAnyLaneB32(B, B32Ty, Unmerge.getReg(i), RBI).getReg(0)); + } + + auto Merge = B.buildMergeLikeInstr(SgprDst, SgprDstParts); + MRI.setRegBank(Merge.getReg(0), RBI.getRegBank(AMDGPU::SGPRRegBankID)); + return Merge; +} + +MachineInstrBuilder +AMDGPU::buildReadAnyLaneSequenceOfS64(MachineIRBuilder &B, const DstOp &SgprDst, + const SrcOp &VgprSrc, + const RegisterBankInfo &RBI) { + LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); + MachineRegisterInfo &MRI = *B.getMRI(); + SmallVector<Register, 8> SgprDstParts; + auto Unmerge = B.buildUnmerge(S64, VgprSrc); + + for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) { + MRI.setRegBank(Unmerge.getReg(i), RBI.getRegBank(AMDGPU::VGPRRegBankID)); + auto Unmerge64 = B.buildUnmerge(S32, Unmerge.getReg(i)); + SmallVector<Register, 2> Unmerge64Parts; + Unmerge64Parts.push_back( + buildReadAnyLaneB32(B, S32, 
Unmerge64.getReg(0), RBI).getReg(0)); + Unmerge64Parts.push_back( + buildReadAnyLaneB32(B, S32, Unmerge64.getReg(1), RBI).getReg(0)); + Register MergeReg = B.buildMergeLikeInstr(S64, Unmerge64Parts).getReg(0); + MRI.setRegBank(MergeReg, RBI.getRegBank(AMDGPU::SGPRRegBankID)); + SgprDstParts.push_back(MergeReg); + } + + auto Merge = B.buildMergeLikeInstr(SgprDst, SgprDstParts); + MRI.setRegBank(Merge.getReg(0), RBI.getRegBank(AMDGPU::SGPRRegBankID)); + return Merge; +} + +MachineInstrBuilder AMDGPU::buildReadAnyLane(MachineIRBuilder &B, + const DstOp &SgprDst, + const SrcOp &VgprSrc, + const RegisterBankInfo &RBI) { + MachineRegisterInfo &MRI = *B.getMRI(); + LLT S16 = LLT::scalar(16); + LLT S32 = LLT::scalar(32); + LLT S64 = LLT::scalar(64); + LLT S256 = LLT::scalar(256); + LLT V2S16 = LLT::fixed_vector(2, 16); + LLT Ty = SgprDst.getLLTTy(MRI); + + if (Ty == S16) { + return B.buildTrunc( + SgprDst, buildReadAnyLaneB32(B, S32, B.buildAnyExt(S32, VgprSrc), RBI)); + } + + if (Ty == S32 || Ty == V2S16 || + (Ty.isPointer() && Ty.getSizeInBits() == 32)) { + return buildReadAnyLaneB32(B, SgprDst, VgprSrc, RBI); + } + + if (Ty == S64 || Ty == S256 || (Ty.isPointer() && Ty.getSizeInBits() == 64) || + (Ty.isVector() && Ty.getElementType() == S32)) { + return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, S32, RBI); + } + + if (Ty.isVector() && Ty.getElementType() == S16) { + return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, V2S16, RBI); + } + + if (Ty.isVector() && Ty.getElementType() == S64) { + return buildReadAnyLaneSequenceOfS64(B, SgprDst, VgprSrc, RBI); + } + + llvm_unreachable("Type not supported"); +} + +void AMDGPU::buildReadAnyLaneDst(MachineIRBuilder &B, MachineInstr &MI, + const RegisterBankInfo &RBI) { + MachineRegisterInfo &MRI = *B.getMRI(); + Register Dst = MI.getOperand(0).getReg(); + const RegisterBank *DstBank = MRI.getRegBankOrNull(Dst); + if (DstBank != &RBI.getRegBank(AMDGPU::SGPRRegBankID)) + return; + + Register VgprDst = 
MRI.createGenericVirtualRegister(MRI.getType(Dst)); + MRI.setRegBank(VgprDst, RBI.getRegBank(AMDGPU::VGPRRegBankID)); + + MI.getOperand(0).setReg(VgprDst); + MachineBasicBlock *MBB = MI.getParent(); + B.setInsertPt(*MBB, std::next(MI.getIterator())); + // readAnyLane VgprDst into Dst after MI. + buildReadAnyLane(B, Dst, VgprDst, RBI); + return; +} + +bool AMDGPU::isLaneMask(Register Reg, MachineRegisterInfo &MRI, + const SIRegisterInfo *TRI) { + const RegisterBank *RB = MRI.getRegBankOrNull(Reg); + if (RB && RB->getID() == VCCRegBankID) + return true; + + const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg); + if (RC && TRI->isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1)) + return true; + + return false; +} + +bool AMDGPU::isSgprRB(Register Reg, MachineRegisterInfo &MRI) { ---------------- arsenm wrote:
Avoid the "RB" abbreviation. This is simple enough that I would just expect it to be done inline in the local pass, or in the AMDGPURegBank header. https://github.com/llvm/llvm-project/pull/112864 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits