[llvm-branch-commits] [llvm] a80ebd0 - [AMDGPU] Fix llvm.amdgcn.init.exec and frame materialization
Author: Carl Ritson Date: 2021-01-25T08:31:17+09:00 New Revision: a80ebd01798ca82a4f5ffd6d355c5c9facd83375 URL: https://github.com/llvm/llvm-project/commit/a80ebd01798ca82a4f5ffd6d355c5c9facd83375 DIFF: https://github.com/llvm/llvm-project/commit/a80ebd01798ca82a4f5ffd6d355c5c9facd83375.diff LOG: [AMDGPU] Fix llvm.amdgcn.init.exec and frame materialization Frame-base materialization may insert vector instructions before EXEC is initialised. Fix this by moving lowering of llvm.amdgcn.init.exec later in backend. Also remove SI_INIT_EXEC_LO pseudo as this is not necessary. Reviewed By: ruiling Differential Revision: https://reviews.llvm.org/D94645 Added: Modified: llvm/include/llvm/IR/IntrinsicsAMDGPU.td llvm/lib/Target/AMDGPU/SIISelLowering.cpp llvm/lib/Target/AMDGPU/SIInstructions.td llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.exec.ll Removed: diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 2cab7f38e281..5b8b563df40a 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -182,6 +182,8 @@ def int_amdgcn_init_exec : Intrinsic<[], // Set EXEC according to a thread count packed in an SGPR input: // thread_count = (input >> bitoffset) & 0x7f; // This is always moved to the beginning of the basic block. +// Note: only inreg arguments to the parent function are valid as +// inputs to this intrinsic, computed values cannot be used. 
def int_amdgcn_init_exec_from_input : Intrinsic<[], [llvm_i32_ty, // 32-bit SGPR input llvm_i32_ty], // bit offset of the thread count diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e959c5f0f8d3..839437b5e3f8 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4021,77 +4021,6 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MI.eraseFromParent(); return BB; } - case AMDGPU::SI_INIT_EXEC: -// This should be before all vector instructions. -BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), -AMDGPU::EXEC) -.addImm(MI.getOperand(0).getImm()); -MI.eraseFromParent(); -return BB; - - case AMDGPU::SI_INIT_EXEC_LO: -// This should be before all vector instructions. -BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), -AMDGPU::EXEC_LO) -.addImm(MI.getOperand(0).getImm()); -MI.eraseFromParent(); -return BB; - - case AMDGPU::SI_INIT_EXEC_FROM_INPUT: { -// Extract the thread count from an SGPR input and set EXEC accordingly. -// Since BFM can't shift by 64, handle that case with CMP + CMOV. -// -// S_BFE_U32 count, input, {shift, 7} -// S_BFM_B64 exec, count, 0 -// S_CMP_EQ_U32 count, 64 -// S_CMOV_B64 exec, -1 -MachineInstr *FirstMI = &*BB->begin(); -MachineRegisterInfo &MRI = MF->getRegInfo(); -Register InputReg = MI.getOperand(0).getReg(); -Register CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); -bool Found = false; - -// Move the COPY of the input reg to the beginning, so that we can use it. -for (auto I = BB->begin(); I != &MI; I++) { - if (I->getOpcode() != TargetOpcode::COPY || - I->getOperand(0).getReg() != InputReg) -continue; - - if (I == FirstMI) { -FirstMI = &*++BB->begin(); - } else { -I->removeFromParent(); -BB->insert(FirstMI, &*I); - } - Found = true; - break; -} -assert(Found); -(void)Found; - -// This should be before all vector instructions. 
-unsigned Mask = (getSubtarget()->getWavefrontSize() << 1) - 1; -bool isWave32 = getSubtarget()->isWave32(); -unsigned Exec = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; -BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg) -.addReg(InputReg) -.addImm((MI.getOperand(1).getImm() & Mask) | 0x7); -BuildMI(*BB, FirstMI, DebugLoc(), -TII->get(isWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), -Exec) -.addReg(CountReg) -.addImm(0); -BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32)) -.addReg(CountReg, RegState::Kill) -.addImm(getSubtarget()->getWavefrontSize()); -BuildMI(*BB, FirstMI, DebugLoc(), -TII->get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64), -Exec) -.addImm(-1); -MI.eraseFromParent(); -return BB; - } - case AMDGPU::GET_GROUPSTATICSIZE: { assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA || getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL); diff --git a/llvm/lib/Target/AMDGPU/SIInstr
[llvm-branch-commits] [llvm] 790c75c - [AMDGPU] Add SI_EARLY_TERMINATE_SCC0 for early terminating shader
Author: Carl Ritson Date: 2021-01-13T13:29:05+09:00 New Revision: 790c75c16373d37846c8433a69efd9b0d5e4ad12 URL: https://github.com/llvm/llvm-project/commit/790c75c16373d37846c8433a69efd9b0d5e4ad12 DIFF: https://github.com/llvm/llvm-project/commit/790c75c16373d37846c8433a69efd9b0d5e4ad12.diff LOG: [AMDGPU] Add SI_EARLY_TERMINATE_SCC0 for early terminating shader Add pseudo instruction to allow early termination of pixel shader anywhere based on the value of SCC. The intention is to use this when a mask of live lanes is updated, e.g. live lanes in WQM pass. This facilitates early termination of shaders even when EXEC is incomplete, e.g. in non-uniform control flow. Reviewed By: foad Differential Revision: https://reviews.llvm.org/D88777 Added: llvm/test/CodeGen/AMDGPU/early-term.mir Modified: llvm/lib/Target/AMDGPU/SIInsertSkips.cpp llvm/lib/Target/AMDGPU/SIInstructions.td Removed: diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp index eb2e12f2dcda..e80325bddc43 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp @@ -49,6 +49,7 @@ class SIInsertSkips : public MachineFunctionPass { DebugLoc DL); bool kill(MachineInstr &MI); + void earlyTerm(MachineInstr &MI); bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB); @@ -145,19 +146,22 @@ bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) { return true; } -static void generatePsEndPgm(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, DebugLoc DL, - const SIInstrInfo *TII) { - // Generate "null export; s_endpgm". 
- BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE)) - .addImm(AMDGPU::Exp::ET_NULL) - .addReg(AMDGPU::VGPR0, RegState::Undef) - .addReg(AMDGPU::VGPR0, RegState::Undef) - .addReg(AMDGPU::VGPR0, RegState::Undef) - .addReg(AMDGPU::VGPR0, RegState::Undef) - .addImm(1) // vm - .addImm(0) // compr - .addImm(0); // en +static void generateEndPgm(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, DebugLoc DL, + const SIInstrInfo *TII, bool IsPS) { + // "null export" + if (IsPS) { +BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE)) +.addImm(AMDGPU::Exp::ET_NULL) +.addReg(AMDGPU::VGPR0, RegState::Undef) +.addReg(AMDGPU::VGPR0, RegState::Undef) +.addReg(AMDGPU::VGPR0, RegState::Undef) +.addReg(AMDGPU::VGPR0, RegState::Undef) +.addImm(1) // vm +.addImm(0) // compr +.addImm(0); // en + } + // s_endpgm BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0); } @@ -169,7 +173,9 @@ void SIInsertSkips::ensureEarlyExitBlock(MachineBasicBlock &MBB, if (!EarlyExitBlock) { EarlyExitBlock = MF->CreateMachineBasicBlock(); MF->insert(MF->end(), EarlyExitBlock); -generatePsEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII); +generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII, + MF->getFunction().getCallingConv() == + CallingConv::AMDGPU_PS); EarlyExitClearsExec = false; } @@ -178,7 +184,6 @@ void SIInsertSkips::ensureEarlyExitBlock(MachineBasicBlock &MBB, unsigned Mov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; Register Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; auto ExitI = EarlyExitBlock->getFirstNonPHI(); -assert(ExitI->getOpcode() == AMDGPU::EXP_DONE); BuildMI(*EarlyExitBlock, ExitI, DL, TII->get(Mov), Exec).addImm(0); EarlyExitClearsExec = true; } @@ -224,7 +229,7 @@ void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB, I == MBB.end() && !llvm::is_contained(MBB.successors(), &*NextBBI); if (NoSuccessor) { -generatePsEndPgm(MBB, I, DL, TII); +generateEndPgm(MBB, I, DL, TII, true); } else { ensureEarlyExitBlock(MBB, false); @@ -368,6 +373,23 @@ bool SIInsertSkips::kill(MachineInstr &MI) { } } +void SIInsertSkips::earlyTerm(MachineInstr &MI) { + MachineBasicBlock &MBB = *MI.getParent(); + const DebugLoc DL = MI.getDebugLoc(); + + ensureEarlyExitBlock(MBB, true); + + auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0)) + .addMBB(EarlyExitBlock); + auto Next = std::next(MI.getIterator()); + + if (Next != MBB.end() && !Next->isTerminator()) +splitBlock(MBB, *BranchMI, MDT); + + MBB.addSuccessor(EarlyExitBlock); + MDT->getBase().insertEdge(&MBB, EarlyExitBlock); +} + // Returns true if a branch over the block was inserted. bool SIInsertSkips::skipMaskBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB) { @@ -393,6 +415,7 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { SkipThreshold =
[llvm-branch-commits] [llvm] b58b440 - [AMDGPU][NFC] Document high parameter of f16 interp intrinsics
Author: Carl Ritson Date: 2020-12-18T19:59:13+09:00 New Revision: b58b440d19c84f59aae4679608c55db0d95ff879 URL: https://github.com/llvm/llvm-project/commit/b58b440d19c84f59aae4679608c55db0d95ff879 DIFF: https://github.com/llvm/llvm-project/commit/b58b440d19c84f59aae4679608c55db0d95ff879.diff LOG: [AMDGPU][NFC] Document high parameter of f16 interp intrinsics Added: Modified: llvm/include/llvm/IR/IntrinsicsAMDGPU.td Removed: diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index ea4a93f8bdef..2cab7f38e281 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1330,6 +1330,7 @@ def int_amdgcn_interp_p2 : // See int_amdgcn_v_interp_p1 for why this is IntrNoMem. // __builtin_amdgcn_interp_p1_f16 , , , , +// high selects whether high or low 16-bits are loaded from LDS def int_amdgcn_interp_p1_f16 : GCCBuiltin<"__builtin_amdgcn_interp_p1_f16">, Intrinsic<[llvm_float_ty], @@ -1338,6 +1339,7 @@ def int_amdgcn_interp_p1_f16 : ImmArg>, ImmArg>, ImmArg>]>; // __builtin_amdgcn_interp_p2_f16 , , , , , +// high selects whether high or low 16-bits are loaded from LDS def int_amdgcn_interp_p2_f16 : GCCBuiltin<"__builtin_amdgcn_interp_p2_f16">, Intrinsic<[llvm_half_ty], ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] 7722494 - [AMDGPU][NFC] Remove unused Hi16Elt definition
Author: Carl Ritson Date: 2020-12-18T20:38:54+09:00 New Revision: 7722494834a8357a42d3da70d22f4a9d87c78e2c URL: https://github.com/llvm/llvm-project/commit/7722494834a8357a42d3da70d22f4a9d87c78e2c DIFF: https://github.com/llvm/llvm-project/commit/7722494834a8357a42d3da70d22f4a9d87c78e2c.diff LOG: [AMDGPU][NFC] Remove unused Hi16Elt definition Added: Modified: llvm/lib/Target/AMDGPU/SIInstrInfo.td Removed: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 5a6c81a0c89b..746d08b8ce0e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1331,9 +1331,6 @@ def VOP3OpSelMods : ComplexPattern; def VOP3PMadMixMods : ComplexPattern; - -def Hi16Elt : ComplexPattern; - //===--===// // SI assembler operands //===--===// ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] af4570c - [AMDGPU][NFC] Remove unused VOP3Mods0Clamp
Author: Carl Ritson Date: 2020-12-14T20:00:58+09:00 New Revision: af4570cd3ab94dd52574874b0e9c91a4f6e39272 URL: https://github.com/llvm/llvm-project/commit/af4570cd3ab94dd52574874b0e9c91a4f6e39272 DIFF: https://github.com/llvm/llvm-project/commit/af4570cd3ab94dd52574874b0e9c91a4f6e39272.diff LOG: [AMDGPU][NFC] Remove unused VOP3Mods0Clamp This is unused and the selection function does not exist. Reviewed By: foad Differential Revision: https://reviews.llvm.org/D93188 Added: Modified: llvm/lib/Target/AMDGPU/SIInstrInfo.td Removed: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index b2bc21975a53..295030d80240 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1316,7 +1316,6 @@ def DS128Bit8ByteAligned : ComplexPattern; def MOVRELOffset : ComplexPattern; def VOP3Mods0 : ComplexPattern; -def VOP3Mods0Clamp : ComplexPattern; def VOP3Mods : ComplexPattern; def VOP3NoMods : ComplexPattern; // VOP3Mods, but the input source is known to never be NaN. ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] 62c246e - [AMDGPU][NFC] Rename opsel/opsel_hi/neg_lo/neg_hi with suffix 0
Author: Carl Ritson Date: 2020-12-14T20:01:56+09:00 New Revision: 62c246eda24c362f1aa5a71f2cf11f9df5642460 URL: https://github.com/llvm/llvm-project/commit/62c246eda24c362f1aa5a71f2cf11f9df5642460 DIFF: https://github.com/llvm/llvm-project/commit/62c246eda24c362f1aa5a71f2cf11f9df5642460.diff LOG: [AMDGPU][NFC] Rename opsel/opsel_hi/neg_lo/neg_hi with suffix 0 These parameters set a default value of 0, so I believe they should include a 0 suffix. This allows for versions which do not set a default value in future. Reviewed By: foad Differential Revision: https://reviews.llvm.org/D93187 Added: Modified: llvm/lib/Target/AMDGPU/SIInstrInfo.td llvm/lib/Target/AMDGPU/VOP3Instructions.td llvm/lib/Target/AMDGPU/VOP3PInstructions.td Removed: diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 295030d80240..5a6c81a0c89b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1136,10 +1136,10 @@ def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>; def src1_sel : NamedOperandU32<"SDWASrc1Sel", NamedMatchClass<"SDWASrc1Sel">>; def dst_unused : NamedOperandU32<"SDWADstUnused", NamedMatchClass<"SDWADstUnused">>; -def op_sel : NamedOperandU32Default0<"OpSel", NamedMatchClass<"OpSel">>; -def op_sel_hi : NamedOperandU32Default0<"OpSelHi", NamedMatchClass<"OpSelHi">>; -def neg_lo : NamedOperandU32Default0<"NegLo", NamedMatchClass<"NegLo">>; -def neg_hi : NamedOperandU32Default0<"NegHi", NamedMatchClass<"NegHi">>; +def op_sel0 : NamedOperandU32Default0<"OpSel", NamedMatchClass<"OpSel">>; +def op_sel_hi0 : NamedOperandU32Default0<"OpSelHi", NamedMatchClass<"OpSelHi">>; +def neg_lo0 : NamedOperandU32Default0<"NegLo", NamedMatchClass<"NegLo">>; +def neg_hi0 : NamedOperandU32Default0<"NegHi", NamedMatchClass<"NegHi">>; def blgp : NamedOperandU32<"BLGP", NamedMatchClass<"BLGP">>; def cbsz : NamedOperandU32<"CBSZ", NamedMatchClass<"CBSZ">>; @@ -1677,25 +1677,25 @@ class 
getInsVOP3P , VOP3 let InsVOP3OpSel = (ins IntOpSelMods:$src0_modifiers, VRegSrc_32:$src0, IntOpSelMods:$src1_modifiers, SCSrc_b32:$src1, IntOpSelMods:$src2_modifiers, SCSrc_b32:$src2, - VGPR_32:$vdst_in, op_sel:$op_sel); + VGPR_32:$vdst_in, op_sel0:$op_sel); let HasClamp = 0; let HasOMod = 0; } diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 2a9992087ca9..09346f400d71 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -39,7 +39,7 @@ class VOP3_VOP3PInsthttps://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Disable VALU sinking and hoisting with WWM (PR #123124)
perlfu wrote: I guess my concern is performance regressions if any use of WWM (e.g. atomic optimizer) essentially turns off Machine LICM. https://github.com/llvm/llvm-project/pull/123124 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Allocate scratch space for dVGPRs for CWSR (PR #130055)
@@ -552,6 +552,7 @@ enum Id { // HwRegCode, (6) [5:0] enum Offset : unsigned { // Offset, (5) [10:6] OFFSET_MEM_VIOL = 8, + OFFSET_ME_ID = 8, perlfu wrote: It's slightly confusing that this enumeration of offsets applies to multiple registers. Perhaps comment which register this is for? e.g. `OFFSET_ME_ID = 8, // in HW_ID2` https://github.com/llvm/llvm-project/pull/130055 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Allocate scratch space for dVGPRs for CWSR (PR #130055)
@@ -691,17 +691,61 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, } assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg); - if (hasFP(MF)) { + unsigned Offset = FrameInfo.getStackSize() * getScratchScaleFactor(ST); + if (!mayReserveScratchForCWSR(MF)) { +if (hasFP(MF)) { + Register FPReg = MFI->getFrameOffsetReg(); + assert(FPReg != AMDGPU::FP_REG); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); +} + +if (requiresStackPointerReference(MF)) { + Register SPReg = MFI->getStackPtrOffsetReg(); + assert(SPReg != AMDGPU::SP_REG); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg).addImm(Offset); +} + } else { +// We need to check if we're on a compute queue - if we are, then the CWSR +// trap handler may need to store some VGPRs on the stack. The first VGPR +// block is saved separately, so we only need to allocate space for any +// additional VGPR blocks used. For now, we will make sure there's enough +// room for the theoretical maximum number of VGPRs that can be allocated. +// FIXME: Figure out if the shader uses fewer VGPRs in practice. +assert(hasFP(MF)); Register FPReg = MFI->getFrameOffsetReg(); assert(FPReg != AMDGPU::FP_REG); -BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); - } - - if (requiresStackPointerReference(MF)) { Register SPReg = MFI->getStackPtrOffsetReg(); assert(SPReg != AMDGPU::SP_REG); -BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg) -.addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST)); +unsigned VGPRSize = +llvm::alignTo((ST.getAddressableNumVGPRs() - + AMDGPU::IsaInfo::getVGPRAllocGranule(&ST)) * + 4, + FrameInfo.getMaxAlign()); +MFI->setScratchReservedForDynamicVGPRs(VGPRSize); + +BuildMI(MBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), FPReg) +.addImm(AMDGPU::Hwreg::HwregEncoding::encode( +AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 1)); perlfu wrote: Do you not need to retrieve 2 bits? i.e. 
AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, **2**) https://github.com/llvm/llvm-project/pull/130055 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] AMDGPU: Replace amdgpu-no-agpr with amdgpu-num-agpr (PR #129893)
@@ -603,11 +601,7 @@ SIRegisterInfo::getMaxNumVectorRegs(const MachineFunction &MF) const { if (MinNumAGPRs == DefaultNumAGPR.first) { // Default to splitting half the registers if AGPRs are required. - - if (MFI->mayNeedAGPRs()) -MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2; - else -MinNumAGPRs = 0; perlfu wrote: I guess the removal of the forced minima yields no functional change? https://github.com/llvm/llvm-project/pull/129893 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Allocate scratch space for dVGPRs for CWSR (PR #130055)
https://github.com/perlfu approved this pull request. LGTM nit: can you note somewhere (in a comment) that `ScratchReservedForDynamicVGPRs` is in bytes -- the magic divide by 4 to set `dynamic_vgpr_saved_count` was not entirely obvious. https://github.com/llvm/llvm-project/pull/130055 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
[llvm-branch-commits] [llvm] [AMDGPU] Intrinsic for launching whole wave functions (PR #145859)
https://github.com/perlfu approved this pull request. LGTM But I am unsure if request for tests from @arsenm is fully satisfied. https://github.com/llvm/llvm-project/pull/145859 ___ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits