[llvm-branch-commits] [llvm] 6a19549 - [AMDGPU] Fix failing assert with scratch ST mode
Author: Sebastian Neubauer
Date: 2021-01-12T09:54:02+01:00
New Revision: 6a195491b6028185c7278718ac21bca309a6c4ea

URL: https://github.com/llvm/llvm-project/commit/6a195491b6028185c7278718ac21bca309a6c4ea
DIFF: https://github.com/llvm/llvm-project/commit/6a195491b6028185c7278718ac21bca309a6c4ea.diff

LOG: [AMDGPU] Fix failing assert with scratch ST mode

In ST mode, flat scratch instructions have neither an sgpr nor a vgpr for the
address. This led to an assertion when inserting hard clauses.

Differential Revision: https://reviews.llvm.org/D94406

Added:

Modified:
    llvm/include/llvm/CodeGen/TargetInstrInfo.h
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/test/CodeGen/AMDGPU/memory_clause.ll

Removed:

diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 1cf205f9f5a3..36afdefd27b2 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1298,10 +1298,11 @@ class TargetInstrInfo : public MCInstrInfo {
                                       bool &OffsetIsScalable,
                                       const TargetRegisterInfo *TRI) const;
 
-  /// Get the base operands and byte offset of an instruction that reads/writes
-  /// memory.
+  /// Get zero or more base operands and the byte offset of an instruction that
+  /// reads/writes memory. Note that there may be zero base operands if the
+  /// instruction accesses a constant address.
   /// It returns false if MI does not read/write memory.
-  /// It returns false if no base operands and offset was found.
+  /// It returns false if base operands and offset could not be determined.
   /// It is not guaranteed to always recognize base operands and offsets in all
   /// cases.
   virtual bool getMemOperandsWithOffsetWidth(

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index eebee8e16bc3..6bf9db3f7b2c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -387,7 +387,7 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
   }
 
   if (isFLAT(LdSt)) {
-    // Instructions have either vaddr or saddr or both.
+    // Instructions have either vaddr or saddr or both or none.
     BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
     if (BaseOp)
       BaseOps.push_back(BaseOp);
@@ -443,11 +443,15 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef BaseOps1,
                                       unsigned NumBytes) const {
   // If the mem ops (to be clustered) do not have the same base ptr, then they
   // should not be clustered
-  assert(!BaseOps1.empty() && !BaseOps2.empty());
-  const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
-  const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
-  if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
+  if (!BaseOps1.empty() && !BaseOps2.empty()) {
+    const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
+    const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
+    if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
+      return false;
+  } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
+    // If only one base op is empty, they do not have the same base ptr
     return false;
+  }
 
   // In order to avoid regester pressure, on an average, the number of DWORDS
   // loaded together by all clustered mem ops should not exceed 8.

diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index 2c5931ef57b6..154d8e3320ea 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SCRATCH %s
 
 define amdgpu_kernel void @vector_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) {
 ; GCN-LABEL: vector_clause:
@@ -21,6 +22,31 @@ define amdgpu_kernel void @vector_clause(<4 x i32> addrspace(1)* noalias nocaptu
 ; GCN-NEXT:    s_waitcnt vmcnt(3)
 ; GCN-NEXT:    global_store_dwordx4 v16, v[12:15], s[4:5] offset:48
 ; GCN-NEXT:    s_endpgm
+;
+; GCN-SCRATCH-LABEL: vector_clause:
+; GCN-SCRATCH:       ; %bb.0: ; %bb
+; GCN-SCRATCH-NEXT:    s_add_u32 s2, s2, s5
+; GCN-SCRATCH-NEXT:    s_addc_u32 s3, s3, 0
+; GCN-SCRATCH-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GCN-SCRATCH-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GCN-SCRATCH-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-SCRATCH-NEXT:    v_lshlrev_b32_e32 v16, 4, v0
+; GCN-SCRATCH-NEXT:    s
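A note on the SIInstrInfo::shouldClusterMemOps change above: with scratch ST mode an instruction can legitimately report zero base operands, so the old assert is replaced by an explicit three-way case split. The standalone sketch below restates that control flow only for illustration; the helper name and the simplified operand type are invented here, while the real code operates on the MachineOperand lists exactly as in the patch.

#include <vector>

// Simplified stand-in for the list of base-address operands returned by
// getMemOperandsWithOffsetWidth (empty for ST-mode flat scratch accesses).
struct BaseOp { unsigned Reg; };
using BaseOpList = std::vector<BaseOp>;

// Mirrors the patched guard in shouldClusterMemOps:
//  * both lists non-empty -> cluster only if the base pointers match,
//  * both lists empty     -> clustering stays allowed (no address register),
//  * exactly one empty    -> the accesses cannot share a base pointer.
bool mayClusterByBaseOps(const BaseOpList &Ops1, const BaseOpList &Ops2,
                         bool SameBasePtr) {
  if (!Ops1.empty() && !Ops2.empty())
    return SameBasePtr;
  return Ops1.empty() && Ops2.empty();
}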
[llvm-branch-commits] [llvm] 7898803 - [AMDGPU][NFC] Add more global_atomic_cmpswap tests
Author: Sebastian Neubauer
Date: 2020-12-15T14:47:33+01:00
New Revision: 7898803c638497ad32e2d4a189d5597d4eb4506e

URL: https://github.com/llvm/llvm-project/commit/7898803c638497ad32e2d4a189d5597d4eb4506e
DIFF: https://github.com/llvm/llvm-project/commit/7898803c638497ad32e2d4a189d5597d4eb4506e.diff

LOG: [AMDGPU][NFC] Add more global_atomic_cmpswap tests

Added:

Modified:
    llvm/test/MC/AMDGPU/flat-global.s
    llvm/test/MC/AMDGPU/gfx9_asm_all.s
    llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt
    llvm/test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt

Removed:

diff --git a/llvm/test/MC/AMDGPU/flat-global.s b/llvm/test/MC/AMDGPU/flat-global.s
index e6c25f3f83f6..91c10ae13723 100644
--- a/llvm/test/MC/AMDGPU/flat-global.s
+++ b/llvm/test/MC/AMDGPU/flat-global.s
@@ -232,9 +232,29 @@ global_atomic_cmpswap v[3:4], v[5:6], off
 // GFX9: global_atomic_cmpswap v[3:4], v[5:6], off ; encoding: [0x00,0x80,0x04,0xdd,0x03,0x05,0x7f,0x00]
 // VI-ERR: error: instruction not supported on this GPU
 
-global_atomic_cmpswap_x2 v[3:4], v[5:8], off
-// GFX10: encoding: [0x00,0x80,0x44,0xdd,0x03,0x05,0x7d,0x00]
-// GFX9: global_atomic_cmpswap_x2 v[3:4], v[5:8], off ; encoding: [0x00,0x80,0x84,0xdd,0x03,0x05,0x7f,0x00]
+global_atomic_cmpswap v1, v[3:4], v[5:6], off glc
+// GFX10: encoding: [0x00,0x80,0xc5,0xdc,0x03,0x05,0x7d,0x01]
+// GFX9: global_atomic_cmpswap v1, v[3:4], v[5:6], off glc ; encoding: [0x00,0x80,0x05,0xdd,0x03,0x05,0x7f,0x01]
+// VI-ERR: error: instruction not supported on this GPU
+
+global_atomic_cmpswap v1, v3, v[5:6], s[2:3] glc
+// GFX10: encoding: [0x00,0x80,0xc5,0xdc,0x03,0x05,0x02,0x01]
+// GFX9: global_atomic_cmpswap v1, v3, v[5:6], s[2:3] glc ; encoding: [0x00,0x80,0x05,0xdd,0x03,0x05,0x02,0x01]
+// VI-ERR: error: instruction not supported on this GPU
+
+global_atomic_cmpswap_x2 v[5:6], v[7:10], off
+// GFX10: encoding: [0x00,0x80,0x44,0xdd,0x05,0x07,0x7d,0x00]
+// GFX9: global_atomic_cmpswap_x2 v[5:6], v[7:10], off ; encoding: [0x00,0x80,0x84,0xdd,0x05,0x07,0x7f,0x00]
+// VI-ERR: error: instruction not supported on this GPU
+
+global_atomic_cmpswap_x2 v[1:2], v[5:6], v[7:10], off glc
+// GFX10: encoding: [0x00,0x80,0x45,0xdd,0x05,0x07,0x7d,0x01]
+// GFX9: global_atomic_cmpswap_x2 v[1:2], v[5:6], v[7:10], off glc ; encoding: [0x00,0x80,0x85,0xdd,0x05,0x07,0x7f,0x01]
+// VI-ERR: error: instruction not supported on this GPU
+
+global_atomic_cmpswap_x2 v[1:2], v5, v[7:10], s[2:3] glc
+// GFX10: encoding: [0x00,0x80,0x45,0xdd,0x05,0x07,0x02,0x01]
+// GFX9: global_atomic_cmpswap_x2 v[1:2], v5, v[7:10], s[2:3] glc ; encoding: [0x00,0x80,0x85,0xdd,0x05,0x07,0x02,0x01]
 // VI-ERR: error: instruction not supported on this GPU
 
 global_atomic_swap v[3:4], v5, off
@@ -362,9 +382,29 @@ global_atomic_cmpswap v[3:4], v[5:6], off offset:-16
 // GFX9: global_atomic_cmpswap v[3:4], v[5:6], off offset:-16 ; encoding: [0xf0,0x9f,0x04,0xdd,0x03,0x05,0x7f,0x00]
 // VI-ERR: :1: error: instruction not supported on this GPU
 
-global_atomic_cmpswap_x2 v[3:4], v[5:8], off offset:-16
-// GFX10: encoding: [0xf0,0x8f,0x44,0xdd,0x03,0x05,0x7d,0x00]
-// GFX9: global_atomic_cmpswap_x2 v[3:4], v[5:8], off offset:-16 ; encoding: [0xf0,0x9f,0x84,0xdd,0x03,0x05,0x7f,0x00]
+global_atomic_cmpswap v1, v[3:4], v[5:6], off offset:-16 glc
+// GFX10: encoding: [0xf0,0x8f,0xc5,0xdc,0x03,0x05,0x7d,0x01]
+// GFX9: global_atomic_cmpswap v1, v[3:4], v[5:6], off offset:-16 glc ; encoding: [0xf0,0x9f,0x05,0xdd,0x03,0x05,0x7f,0x01]
+// VI-ERR: :1: error: instruction not supported on this GPU
+
+global_atomic_cmpswap v1, v3, v[5:6], s[2:3] offset:-16 glc
+// GFX10: encoding: [0xf0,0x8f,0xc5,0xdc,0x03,0x05,0x02,0x01]
+// GFX9: global_atomic_cmpswap v1, v3, v[5:6], s[2:3] offset:-16 glc ; encoding: [0xf0,0x9f,0x05,0xdd,0x03,0x05,0x02,0x01]
+// VI-ERR: :1: error: instruction not supported on this GPU
+
+global_atomic_cmpswap_x2 v[5:6], v[7:10], off offset:-16
+// GFX10: encoding: [0xf0,0x8f,0x44,0xdd,0x05,0x07,0x7d,0x00]
+// GFX9: global_atomic_cmpswap_x2 v[5:6], v[7:10], off offset:-16 ; encoding: [0xf0,0x9f,0x84,0xdd,0x05,0x07,0x7f,0x00]
+// VI-ERR: :1: error: instruction not supported on this GPU
+
+global_atomic_cmpswap_x2 v[1:2], v[5:6], v[7:10], off offset:-16 glc
+// GFX10: encoding: [0xf0,0x8f,0x45,0xdd,0x05,0x07,0x7d,0x01]
+// GFX9: global_atomic_cmpswap_x2 v[1:2], v[5:6], v[7:10], off offset:-16 glc ; encoding: [0xf0,0x9f,0x85,0xdd,0x05,0x07,0x7f,0x01]
+// VI-ERR: :1: error: instruction not supported on this GPU
+
+global_atomic_cmpswap_x2 v[1:2], v5, v[7:10], s[2:3] offset:-16 glc
+// GFX10: encoding: [0xf0,0x8f,0x45,0xdd,0x05,0x07,0x02,0x01]
+// GFX9: global_atomic_cmpswap_x2 v[1:2], v5, v[7:10], s[2:3] offset:-16 glc ; encoding: [0xf0,0x9f,0x85,0xdd,0x05,0x07,0x02,0x01]
 // VI-ERR: :1: error: instruction not supported on this GPU
 
 global_atomic_swap v[3:4], v5, off offset:-16

diff --git a/llvm/test/MC/AMDGPU/gfx9_asm
[llvm-branch-commits] [llvm] 9144597 - [AMDGPU] Unify flat offset logic
Author: Sebastian Neubauer
Date: 2020-12-15T14:59:59+01:00
New Revision: 91445979be0a4e6fe4b42005d7fb03fc46c9ee0c

URL: https://github.com/llvm/llvm-project/commit/91445979be0a4e6fe4b42005d7fb03fc46c9ee0c
DIFF: https://github.com/llvm/llvm-project/commit/91445979be0a4e6fe4b42005d7fb03fc46c9ee0c.diff

LOG: [AMDGPU] Unify flat offset logic

Move getNumFlatOffsetBits from AMDGPUAsmParser and SIInstrInfo into
AMDGPUBaseInfo.

Differential Revision: https://reviews.llvm.org/D93287

Added:

Modified:
    llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
    llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.h
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
    llvm/test/MC/AMDGPU/flat-gfx10.s
    llvm/test/MC/AMDGPU/gfx10_err_pos.s

Removed:

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 2ad7fab81427..0a0b993778c7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1928,7 +1928,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *N,
   if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
     int64_t RemainderOffset = COffsetVal;
     int64_t ImmField = 0;
-    const unsigned NumBits = TII->getNumFlatOffsetBits(true);
+    const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(*Subtarget, true);
     // Use signed division by a power of two to truncate towards 0.
     int64_t D = 1LL << (NumBits - 1);
     RemainderOffset = (COffsetVal / D) * D;

diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 22c32400ecbf..1fd6c2cca6df 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -3646,22 +3646,20 @@ bool AMDGPUAsmParser::validateFlatOffset(const MCInst &Inst,
     return false;
   }
 
-  // Address offset is 12-bit signed for GFX10, 13-bit for GFX9.
   // For FLAT segment the offset must be positive;
   // MSB is ignored and forced to zero.
-  unsigned OffsetSize = isGFX9() ? 13 : 12;
   if (TSFlags & (SIInstrFlags::IsFlatGlobal | SIInstrFlags::IsFlatScratch)) {
+    unsigned OffsetSize = AMDGPU::getNumFlatOffsetBits(getSTI(), true);
     if (!isIntN(OffsetSize, Op.getImm())) {
       Error(getFlatOffsetLoc(Operands),
-            isGFX9() ? "expected a 13-bit signed offset" :
-                       "expected a 12-bit signed offset");
+            Twine("expected a ") + Twine(OffsetSize) + "-bit signed offset");
       return false;
     }
   } else {
-    if (!isUIntN(OffsetSize - 1, Op.getImm())) {
+    unsigned OffsetSize = AMDGPU::getNumFlatOffsetBits(getSTI(), false);
+    if (!isUIntN(OffsetSize, Op.getImm())) {
       Error(getFlatOffsetLoc(Operands),
-            isGFX9() ? "expected a 12-bit unsigned offset" :
-                       "expected an 11-bit unsigned offset");
+            Twine("expected a ") + Twine(OffsetSize) + "-bit unsigned offset");
       return false;
     }
   }

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 01721595d551..889908bce905 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7053,13 +7053,6 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
   return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
 }
 
-unsigned SIInstrInfo::getNumFlatOffsetBits(bool Signed) const {
-  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10)
-    return Signed ? 12 : 11;
-
-  return Signed ? 13 : 12;
-}
-
 bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
                                     bool Signed) const {
   // TODO: Should 0 be special cased?
@@ -7069,10 +7062,8 @@ bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
   if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS)
     return false;
 
-  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10)
-    return Signed ? isInt<12>(Offset) : isUInt<11>(Offset);
-
-  return Signed ? isInt<13>(Offset) : isUInt<12>(Offset);
+  unsigned N = AMDGPU::getNumFlatOffsetBits(ST, Signed);
+  return Signed ? isIntN(N, Offset) : isUIntN(N, Offset);
 }
 
 std::pair SIInstrInfo::splitFlatOffset(int64_t COffsetVal,
@@ -7080,7 +7071,7 @@ std::pair SIInstrInfo::splitFlatOffset(int64_t COffsetVal,
                                        bool IsSigned) const {
   int64_t RemainderOffset = COffsetVal;
   int64_t ImmField = 0;
-  const unsigned NumBits = getNumFlatOffsetBits(IsSigned);
+  const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST, IsSigned);
   if (IsSigned) {
     // Use signed division by a power of two to
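Because the unified helper only appears here through the code deleted from SIInstrInfo, it is worth spelling out the widths it encodes. The sketch below is a reconstruction from that removed function, not the verbatim AMDGPUBaseInfo code: the real AMDGPU::getNumFlatOffsetBits takes a subtarget, which is replaced by a plain boolean here.

// Immediate-offset width of flat instructions, per generation.
unsigned getNumFlatOffsetBits(bool IsGFX10Plus, bool Signed) {
  if (IsGFX10Plus)
    return Signed ? 12 : 11; // GFX10: 12-bit signed / 11-bit unsigned offsets
  return Signed ? 13 : 12;   // GFX9 and older flat-offset targets
}

// Callers then use generic width checks, as the patch does:
//   return Signed ? isIntN(NumBits, Offset) : isUIntN(NumBits, Offset);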
[llvm-branch-commits] [llvm] 409a2f0 - [AMDGPU] Allow no saddr for global addtid insts
Author: Sebastian Neubauer
Date: 2020-12-16T10:01:40+01:00
New Revision: 409a2f0f9e4847cd25560bfbddf22ffa11d15237

URL: https://github.com/llvm/llvm-project/commit/409a2f0f9e4847cd25560bfbddf22ffa11d15237
DIFF: https://github.com/llvm/llvm-project/commit/409a2f0f9e4847cd25560bfbddf22ffa11d15237.diff

LOG: [AMDGPU] Allow no saddr for global addtid insts

I think the global_load/store_dword_addtid instructions support switching
off the scalar address. Add assembler and disassembler support for this.

Differential Revision: https://reviews.llvm.org/D93288

Added:

Modified:
    llvm/lib/Target/AMDGPU/FLATInstructions.td
    llvm/test/MC/AMDGPU/gfx1030_new.s
    llvm/test/MC/Disassembler/AMDGPU/gfx1030_dasm_new.txt

Removed:

diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index d47a79414294..57a355a55a02 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -192,24 +192,34 @@ multiclass FLAT_Global_Load_Pseudo : FLAT_Pseudo<
+  bit HasTiedOutput = 0, bit HasSignedOffset = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
   opName,
   (outs regClass:$vdst),
-  !con((ins SReg_64:$saddr, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc),
+  !con(!if(EnableSaddr, (ins SReg_64:$saddr), (ins)),
+       (ins flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc),
        !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))),
-  " $vdst, $saddr$offset$glc$slc$dlc"> {
+  " $vdst, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
   let is_flat_global = 1;
   let has_data = 0;
   let mayLoad = 1;
   let has_vaddr = 0;
   let has_saddr = 1;
-  let enabled_saddr = 1;
+  let enabled_saddr = EnableSaddr;
   let maybeAtomic = 1;
+  let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");
   let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
   let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
 }
 
+multiclass FLAT_Global_Load_AddTid_Pseudo {
+  def "" : FLAT_Global_Load_AddTid_Pseudo,
+    GlobalSaddrTable<0, opName>;
+  def _SADDR : FLAT_Global_Load_AddTid_Pseudo,
+    GlobalSaddrTable<1, opName>;
+}
+
 multiclass FLAT_Global_Store_Pseudo {
   let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
     def "" : FLAT_Store_Pseudo,
@@ -220,21 +230,29 @@ multiclass FLAT_Global_Store_Pseudo {
 }
 
 class FLAT_Global_Store_AddTid_Pseudo : FLAT_Pseudo<
+  bit HasSignedOffset = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
   opName,
   (outs),
-  !con(
-    (ins vdataClass:$vdata, SReg_64:$saddr),
-    (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
-  " $vdata, $saddr$offset$glc$slc$dlc"> {
+  !con(!if(EnableSaddr, (ins vdataClass:$vdata, SReg_64:$saddr), (ins vdataClass:$vdata)),
+       (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
+  " $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
   let is_flat_global = 1;
   let mayLoad = 0;
   let mayStore = 1;
   let has_vdst = 0;
   let has_vaddr = 0;
   let has_saddr = 1;
-  let enabled_saddr = 1;
+  let enabled_saddr = EnableSaddr;
   let maybeAtomic = 1;
+  let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");
+}
+
+multiclass FLAT_Global_Store_AddTid_Pseudo {
+  def "" : FLAT_Global_Store_AddTid_Pseudo,
+    GlobalSaddrTable<0, opName>;
+  def _SADDR : FLAT_Global_Store_AddTid_Pseudo,
+    GlobalSaddrTable<1, opName>;
 }
 
 class FlatScratchInst {
@@ -603,7 +621,7 @@ defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16_
 defm GLOBAL_LOAD_SHORT_D16    : FLAT_Global_Load_Pseudo <"global_load_short_d16", VGPR_32, 1>;
 defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Global_Load_Pseudo <"global_load_short_d16_hi", VGPR_32, 1>;
 let OtherPredicates = [HasGFX10_BEncoding] in
-def GLOBAL_LOAD_DWORD_ADDTID : FLAT_Global_Load_AddTid_Pseudo <"global_load_dword_addtid", VGPR_32>;
+defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Global_Load_AddTid_Pseudo <"global_load_dword_addtid", VGPR_32>;
 
 defm GLOBAL_STORE_BYTE    : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>;
 defm GLOBAL_STORE_SHORT   : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>;
@@ -612,7 +630,7 @@ defm GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VR
 defm GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>;
 defm GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>;
 let OtherPredicates = [HasGFX10_BEncoding] in
-def GLOBAL_STORE_DWORD_ADDTID : FLAT_Global_Store_AddTid_Pseudo <"global_store_dword_addtid", VGPR_32>;
+defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Global_Store_AddTid_Pseudo <"global_store_dword_addtid", VGPR_32>;
 
 defm GLOBAL_STORE_BYTE_D16_HI  : FLAT_Global_Store_Pseudo <"global_store_byte_d16_hi", VGPR_32>;
 defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d16_hi", VGPR_32>;
@@
[llvm-branch-commits] [llvm] edd6756 - [AMDGPU] Emit stack frame size in metadata
Author: Sebastian Neubauer
Date: 2020-11-25T16:30:02+01:00
New Revision: edd675643d5ff49e6ea01af2a2a9b40498b3226c

URL: https://github.com/llvm/llvm-project/commit/edd675643d5ff49e6ea01af2a2a9b40498b3226c
DIFF: https://github.com/llvm/llvm-project/commit/edd675643d5ff49e6ea01af2a2a9b40498b3226c.diff

LOG: [AMDGPU] Emit stack frame size in metadata

Add .shader_functions to pal metadata, which contains the stack frame
size for all non-entry-point functions.

Differential Revision: https://reviews.llvm.org/D90036

Added:

Modified:
    llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
    llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
    llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
    llvm/test/CodeGen/AMDGPU/amdpal-callable.ll

Removed:

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index fc785902843c..8148d0487802 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -456,9 +456,12 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
     Info = analyzeResourceUsage(MF);
   }
 
-  if (STM.isAmdPalOS() && MFI->isEntryFunction())
-    EmitPALMetadata(MF, CurrentProgramInfo);
-  else if (!STM.isAmdHsaOS()) {
+  if (STM.isAmdPalOS()) {
+    if (MFI->isEntryFunction())
+      EmitPALMetadata(MF, CurrentProgramInfo);
+    else
+      emitPALFunctionMetadata(MF);
+  } else if (!STM.isAmdHsaOS()) {
     EmitProgramInfoSI(MF, CurrentProgramInfo);
   }
 
@@ -1260,6 +1263,12 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
   MD->setWave32(MF.getFunction().getCallingConv());
 }
 
+void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
+  auto *MD = getTargetStreamer()->getPALMetadata();
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  MD->setStackFrameSize(MF, MFI.getStackSize());
+}
+
 // This is supposed to be log2(Size)
 static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
   switch (Size) {

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 54e8338ab4b0..907ff2bfc162 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -78,6 +78,7 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
                         const SIProgramInfo &KernelInfo);
   void EmitPALMetadata(const MachineFunction &MF,
                        const SIProgramInfo &KernelInfo);
+  void emitPALFunctionMetadata(const MachineFunction &MF);
   void emitCommonFunctionComments(uint32_t NumVGPR,
                                   Optional NumAGPR,
                                   uint32_t TotalNumVGPR,

diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index 85cba165770f..efabab90422f 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -238,6 +238,14 @@ void AMDGPUPALMetadata::setScratchSize(CallingConv::ID CC, unsigned Val) {
   getHwStage(CC)[".scratch_memory_size"] = MsgPackDoc.getNode(Val);
 }
 
+// Set the scratch size in the metadata.
+void AMDGPUPALMetadata::setStackFrameSize(const MachineFunction &MF,
+                                          unsigned Val) {
+  auto Node = MsgPackDoc.getMapNode();
+  Node[".stack_frame_size_in_bytes"] = MsgPackDoc.getNode(Val);
+  getShaderFunctions()[MF.getFunction().getName()] = Node;
+}
+
 // Set the hardware register bit in PAL metadata to enable wave32 on the
 // shader of the given calling convention.
 void AMDGPUPALMetadata::setWave32(unsigned CC) {
@@ -721,6 +729,24 @@ msgpack::MapDocNode AMDGPUPALMetadata::getRegisters() {
   return Registers.getMap();
 }
 
+// Reference (create if necessary) the node for the shader functions map.
+msgpack::DocNode &AMDGPUPALMetadata::refShaderFunctions() {
+  auto &N =
+      MsgPackDoc.getRoot()
+          .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")]
+          .getArray(/*Convert=*/true)[0]
+          .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".shader_functions")];
+  N.getMap(/*Convert=*/true);
+  return N;
+}
+
+// Get (create if necessary) the shader functions map.
+msgpack::MapDocNode AMDGPUPALMetadata::getShaderFunctions() {
+  if (ShaderFunctions.isEmpty())
+    ShaderFunctions = refShaderFunctions();
+  return ShaderFunctions.getMap();
+}
+
 // Return the PAL metadata hardware shader stage name.
 static const char *getStageName(CallingConv::ID CC) {
   switch (CC) {

diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
index b089f295364c..3b1767bb1f64 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -15,6 +1
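Taken together, refShaderFunctions and setStackFrameSize write one nested map per function under amdpal.pipelines[0][".shader_functions"]. The sketch below collapses the two helpers from the diff into a single free function so the full path is visible in one place; the function name parameter and the 64-byte example in the comment are made up, and the msgpack::Document calls are simply the ones the patch itself uses.

#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/MsgPackDocument.h"

// Writes amdpal.pipelines[0][".shader_functions"][FuncName]
//   [".stack_frame_size_in_bytes"] = SizeInBytes,
// e.g. FuncName = "my_callable", SizeInBytes = 64.
void recordStackFrameSize(llvm::msgpack::Document &MsgPackDoc,
                          llvm::StringRef FuncName, unsigned SizeInBytes) {
  auto &ShaderFunctions =
      MsgPackDoc.getRoot()
          .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")]
          .getArray(/*Convert=*/true)[0]
          .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".shader_functions")];
  auto Entry = MsgPackDoc.getMapNode();
  Entry[".stack_frame_size_in_bytes"] = MsgPackDoc.getNode(SizeInBytes);
  ShaderFunctions.getMap(/*Convert=*/true)[FuncName] = Entry;
}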
[llvm-branch-commits] [llvm] 5733167 - [AMDGPU] Mark amdgpu_gfx functions as module entry function
Author: Sebastian Neubauer
Date: 2020-12-14T10:43:39+01:00
New Revision: 5733167f54a582d52fc06617646c13cd1e0b3362

URL: https://github.com/llvm/llvm-project/commit/5733167f54a582d52fc06617646c13cd1e0b3362
DIFF: https://github.com/llvm/llvm-project/commit/5733167f54a582d52fc06617646c13cd1e0b3362.diff

LOG: [AMDGPU] Mark amdgpu_gfx functions as module entry function

- Allows lds allocations
- Writes resource usage into COMPUTE_PGM_RSRC1 registers in PAL metadata

Differential Revision: https://reviews.llvm.org/D92946

Added:

Modified:
    llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
    llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
    llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
    llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
    llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
    llvm/test/CodeGen/AMDGPU/amdpal-callable.ll

Removed:

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 137f6896c87b..a14f846b76d1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -446,7 +446,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
     OutStreamer->SwitchSection(ConfigSection);
   }
 
-  if (MFI->isEntryFunction()) {
+  if (MFI->isModuleEntryFunction()) {
     getSIProgramInfo(CurrentProgramInfo, MF);
   } else {
     auto I = CallGraphResourceInfo.insert(
@@ -459,7 +459,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   if (STM.isAmdPalOS()) {
     if (MFI->isEntryFunction())
       EmitPALMetadata(MF, CurrentProgramInfo);
-    else
+    else if (MFI->isModuleEntryFunction())
       emitPALFunctionMetadata(MF);
   } else if (!STM.isAmdHsaOS()) {
     EmitProgramInfoSI(MF, CurrentProgramInfo);
@@ -922,7 +922,22 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
             = TII->getNamedOperand(MI, AMDGPU::OpName::callee);
 
         const Function *Callee = getCalleeFunction(*CalleeOp);
-        if (!Callee || Callee->isDeclaration()) {
+        DenseMap::const_iterator I =
+            CallGraphResourceInfo.end();
+        bool IsExternal = !Callee || Callee->isDeclaration();
+        if (!IsExternal)
+          I = CallGraphResourceInfo.find(Callee);
+
+        if (IsExternal || I == CallGraphResourceInfo.end()) {
+          // Avoid crashing on undefined behavior with an illegal call to a
+          // kernel. If a callsite's calling convention doesn't match the
+          // function's, it's undefined behavior. If the callsite calling
+          // convention does match, that would have errored earlier.
+          // FIXME: The verifier shouldn't allow this.
+          if (!IsExternal &&
+              AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
+            report_fatal_error("invalid call to entry function");
+
           // If this is a call to an external function, we can't do much. Make
           // conservative guesses.
 
@@ -943,19 +958,6 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
           // We force CodeGen to run in SCC order, so the callee's register
           // usage etc. should be the cumulative usage of all callees.
 
-          auto I = CallGraphResourceInfo.find(Callee);
-          if (I == CallGraphResourceInfo.end()) {
-            // Avoid crashing on undefined behavior with an illegal call to a
-            // kernel. If a callsite's calling convention doesn't match the
-            // function's, it's undefined behavior. If the callsite calling
-            // convention does match, that would have errored earlier.
-            // FIXME: The verifier shouldn't allow this.
-            if (AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
-              report_fatal_error("invalid call to entry function");
-
-            llvm_unreachable("callee should have been handled before caller");
-          }
-
           MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
           MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
           MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
@@ -1266,7 +1268,11 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
 void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
   auto *MD = getTargetStreamer()->getPALMetadata();
   const MachineFrameInfo &MFI = MF.getFrameInfo();
-  MD->setStackFrameSize(MF, MFI.getStackSize());
+  MD->setFunctionScratchSize(MF, MFI.getStackSize());
+  // Set compute registers
+  MD->setRsrc1(CallingConv::AMDGPU_CS,
+               CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
+  MD->setRsrc2(CallingCo
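The key distinction behind this patch is between entry functions (kernels and shader entry points) and the broader set of module entry functions, which now also covers amdgpu_gfx callables so that they can own LDS and report resource usage. The AMDGPUBaseInfo hunk is not visible in the truncated diff above, so the predicates below are only a hedged sketch of what that split presumably looks like, not the verbatim upstream code.

#include "llvm/IR/CallingConv.h"

using namespace llvm;

// Hedged sketch (not the verbatim AMDGPUBaseInfo code): entry calling
// conventions are kernels plus the fixed-function graphics shader stages.
static bool isEntryFunctionCC(CallingConv::ID CC) {
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_LS:
    return true;
  default:
    return false;
  }
}

// Module entry functions: every entry function plus amdgpu_gfx callables,
// which this patch lets allocate LDS and emit PAL resource registers.
static bool isModuleEntryFunctionCC(CallingConv::ID CC) {
  return CC == CallingConv::AMDGPU_Gfx || isEntryFunctionCC(CC);
}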