[llvm-branch-commits] [llvm] 6a19549 - [AMDGPU] Fix failing assert with scratch ST mode

2021-01-12 Thread Sebastian Neubauer via llvm-branch-commits

Author: Sebastian Neubauer
Date: 2021-01-12T09:54:02+01:00
New Revision: 6a195491b6028185c7278718ac21bca309a6c4ea

URL: 
https://github.com/llvm/llvm-project/commit/6a195491b6028185c7278718ac21bca309a6c4ea
DIFF: 
https://github.com/llvm/llvm-project/commit/6a195491b6028185c7278718ac21bca309a6c4ea.diff

LOG: [AMDGPU] Fix failing assert with scratch ST mode

In ST mode, flat scratch instructions have neither an sgpr nor a vgpr
for the address. This led to an assertion failure when inserting hard clauses.

Differential Revision: https://reviews.llvm.org/D94406

Added: 


Modified: 
llvm/include/llvm/CodeGen/TargetInstrInfo.h
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/test/CodeGen/AMDGPU/memory_clause.ll

Removed: 




diff  --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h 
b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index 1cf205f9f5a3..36afdefd27b2 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1298,10 +1298,11 @@ class TargetInstrInfo : public MCInstrInfo {
bool &OffsetIsScalable,
const TargetRegisterInfo *TRI) const;
 
-  /// Get the base operands and byte offset of an instruction that reads/writes
-  /// memory.
+  /// Get zero or more base operands and the byte offset of an instruction that
+  /// reads/writes memory. Note that there may be zero base operands if the
+  /// instruction accesses a constant address.
   /// It returns false if MI does not read/write memory.
-  /// It returns false if no base operands and offset was found.
+  /// It returns false if base operands and offset could not be determined.
   /// It is not guaranteed to always recognize base operands and offsets in all
   /// cases.
   virtual bool getMemOperandsWithOffsetWidth(

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index eebee8e16bc3..6bf9db3f7b2c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -387,7 +387,7 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
   }
 
   if (isFLAT(LdSt)) {
-// Instructions have either vaddr or saddr or both.
+// Instructions have either vaddr or saddr or both or none.
 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
 if (BaseOp)
   BaseOps.push_back(BaseOp);
@@ -443,11 +443,15 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef BaseOps1,
   unsigned NumBytes) const {
   // If the mem ops (to be clustered) do not have the same base ptr, then they
   // should not be clustered
-  assert(!BaseOps1.empty() && !BaseOps2.empty());
-  const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
-  const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
-  if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
+  if (!BaseOps1.empty() && !BaseOps2.empty()) {
+const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
+const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
+if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
+  return false;
+  } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
+// If only one base op is empty, they do not have the same base ptr
 return false;
+  }
 
   // In order to avoid regester pressure, on an average, the number of DWORDS
   // loaded together by all clustered mem ops should not exceed 8. This is an

diff  --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll 
b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
index 2c5931ef57b6..154d8e3320ea 100644
--- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs < %s | FileCheck 
-check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -amdgpu-enable-flat-scratch 
-verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SCRATCH %s
 
 define amdgpu_kernel void @vector_clause(<4 x i32> addrspace(1)* noalias 
nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) {
 ; GCN-LABEL: vector_clause:
@@ -21,6 +22,31 @@ define amdgpu_kernel void @vector_clause(<4 x i32> 
addrspace(1)* noalias nocaptu
 ; GCN-NEXT:s_waitcnt vmcnt(3)
 ; GCN-NEXT:global_store_dwordx4 v16, v[12:15], s[4:5] offset:48
 ; GCN-NEXT:s_endpgm
+;
+; GCN-SCRATCH-LABEL: vector_clause:
+; GCN-SCRATCH:   ; %bb.0: ; %bb
+; GCN-SCRATCH-NEXT:s_add_u32 s2, s2, s5
+; GCN-SCRATCH-NEXT:s_addc_u32 s3, s3, 0
+; GCN-SCRATCH-NEXT:s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
+; GCN-SCRATCH-NEXT:s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
+; GCN-SCRATCH-NEXT:s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GCN-SCRATCH-NEXT:v_lshlrev_b32_e32 v16, 4, v0
+; GCN-SCRATCH-NEXT:s

[llvm-branch-commits] [llvm] 7898803 - [AMDGPU][NFC] Add more global_atomic_cmpswap tests

2020-12-15 Thread Sebastian Neubauer via llvm-branch-commits

Author: Sebastian Neubauer
Date: 2020-12-15T14:47:33+01:00
New Revision: 7898803c638497ad32e2d4a189d5597d4eb4506e

URL: 
https://github.com/llvm/llvm-project/commit/7898803c638497ad32e2d4a189d5597d4eb4506e
DIFF: 
https://github.com/llvm/llvm-project/commit/7898803c638497ad32e2d4a189d5597d4eb4506e.diff

LOG: [AMDGPU][NFC] Add more global_atomic_cmpswap tests

Added: 


Modified: 
llvm/test/MC/AMDGPU/flat-global.s
llvm/test/MC/AMDGPU/gfx9_asm_all.s
llvm/test/MC/Disassembler/AMDGPU/gfx10_dasm_all.txt
llvm/test/MC/Disassembler/AMDGPU/gfx9_dasm_all.txt

Removed: 




diff  --git a/llvm/test/MC/AMDGPU/flat-global.s 
b/llvm/test/MC/AMDGPU/flat-global.s
index e6c25f3f83f6..91c10ae13723 100644
--- a/llvm/test/MC/AMDGPU/flat-global.s
+++ b/llvm/test/MC/AMDGPU/flat-global.s
@@ -232,9 +232,29 @@ global_atomic_cmpswap v[3:4], v[5:6], off
 // GFX9: global_atomic_cmpswap v[3:4], v[5:6], off ; encoding: 
[0x00,0x80,0x04,0xdd,0x03,0x05,0x7f,0x00]
 // VI-ERR: error: instruction not supported on this GPU
 
-global_atomic_cmpswap_x2 v[3:4], v[5:8], off
-// GFX10: encoding: [0x00,0x80,0x44,0xdd,0x03,0x05,0x7d,0x00]
-// GFX9: global_atomic_cmpswap_x2 v[3:4], v[5:8], off ; encoding: 
[0x00,0x80,0x84,0xdd,0x03,0x05,0x7f,0x00]
+global_atomic_cmpswap v1, v[3:4], v[5:6], off glc
+// GFX10: encoding: [0x00,0x80,0xc5,0xdc,0x03,0x05,0x7d,0x01]
+// GFX9: global_atomic_cmpswap v1, v[3:4], v[5:6], off glc ; encoding: 
[0x00,0x80,0x05,0xdd,0x03,0x05,0x7f,0x01]
+// VI-ERR: error: instruction not supported on this GPU
+
+global_atomic_cmpswap v1, v3, v[5:6], s[2:3] glc
+// GFX10: encoding: [0x00,0x80,0xc5,0xdc,0x03,0x05,0x02,0x01]
+// GFX9: global_atomic_cmpswap v1, v3, v[5:6], s[2:3] glc ; encoding: 
[0x00,0x80,0x05,0xdd,0x03,0x05,0x02,0x01]
+// VI-ERR: error: instruction not supported on this GPU
+
+global_atomic_cmpswap_x2 v[5:6], v[7:10], off
+// GFX10: encoding: [0x00,0x80,0x44,0xdd,0x05,0x07,0x7d,0x00]
+// GFX9: global_atomic_cmpswap_x2 v[5:6], v[7:10], off ; encoding: 
[0x00,0x80,0x84,0xdd,0x05,0x07,0x7f,0x00]
+// VI-ERR: error: instruction not supported on this GPU
+
+global_atomic_cmpswap_x2 v[1:2], v[5:6], v[7:10], off glc
+// GFX10: encoding: [0x00,0x80,0x45,0xdd,0x05,0x07,0x7d,0x01]
+// GFX9: global_atomic_cmpswap_x2 v[1:2], v[5:6], v[7:10], off glc ; encoding: 
[0x00,0x80,0x85,0xdd,0x05,0x07,0x7f,0x01]
+// VI-ERR: error: instruction not supported on this GPU
+
+global_atomic_cmpswap_x2 v[1:2], v5, v[7:10], s[2:3] glc
+// GFX10: encoding: [0x00,0x80,0x45,0xdd,0x05,0x07,0x02,0x01]
+// GFX9: global_atomic_cmpswap_x2 v[1:2], v5, v[7:10], s[2:3] glc ; encoding: 
[0x00,0x80,0x85,0xdd,0x05,0x07,0x02,0x01]
 // VI-ERR: error: instruction not supported on this GPU
 
 global_atomic_swap v[3:4], v5, off
@@ -362,9 +382,29 @@ global_atomic_cmpswap v[3:4], v[5:6], off offset:-16
 // GFX9: global_atomic_cmpswap v[3:4], v[5:6], off offset:-16 ; encoding: 
[0xf0,0x9f,0x04,0xdd,0x03,0x05,0x7f,0x00]
 // VI-ERR: :1: error: instruction not supported on this GPU
 
-global_atomic_cmpswap_x2 v[3:4], v[5:8], off offset:-16
-// GFX10: encoding: [0xf0,0x8f,0x44,0xdd,0x03,0x05,0x7d,0x00]
-// GFX9: global_atomic_cmpswap_x2 v[3:4], v[5:8], off offset:-16 ; encoding: 
[0xf0,0x9f,0x84,0xdd,0x03,0x05,0x7f,0x00]
+global_atomic_cmpswap v1, v[3:4], v[5:6], off offset:-16 glc
+// GFX10: encoding: [0xf0,0x8f,0xc5,0xdc,0x03,0x05,0x7d,0x01]
+// GFX9: global_atomic_cmpswap v1, v[3:4], v[5:6], off offset:-16 glc ; 
encoding: [0xf0,0x9f,0x05,0xdd,0x03,0x05,0x7f,0x01]
+// VI-ERR: :1: error: instruction not supported on this GPU
+
+global_atomic_cmpswap v1, v3, v[5:6], s[2:3] offset:-16 glc
+// GFX10: encoding: [0xf0,0x8f,0xc5,0xdc,0x03,0x05,0x02,0x01]
+// GFX9: global_atomic_cmpswap v1, v3, v[5:6], s[2:3] offset:-16 glc ; 
encoding: [0xf0,0x9f,0x05,0xdd,0x03,0x05,0x02,0x01]
+// VI-ERR: :1: error: instruction not supported on this GPU
+
+global_atomic_cmpswap_x2 v[5:6], v[7:10], off offset:-16
+// GFX10: encoding: [0xf0,0x8f,0x44,0xdd,0x05,0x07,0x7d,0x00]
+// GFX9: global_atomic_cmpswap_x2 v[5:6], v[7:10], off offset:-16 ; encoding: 
[0xf0,0x9f,0x84,0xdd,0x05,0x07,0x7f,0x00]
+// VI-ERR: :1: error: instruction not supported on this GPU
+
+global_atomic_cmpswap_x2 v[1:2], v[5:6], v[7:10], off offset:-16 glc
+// GFX10: encoding: [0xf0,0x8f,0x45,0xdd,0x05,0x07,0x7d,0x01]
+// GFX9: global_atomic_cmpswap_x2 v[1:2], v[5:6], v[7:10], off offset:-16 glc 
; encoding: [0xf0,0x9f,0x85,0xdd,0x05,0x07,0x7f,0x01]
+// VI-ERR: :1: error: instruction not supported on this GPU
+
+global_atomic_cmpswap_x2 v[1:2], v5, v[7:10], s[2:3] offset:-16 glc
+// GFX10: encoding: [0xf0,0x8f,0x45,0xdd,0x05,0x07,0x02,0x01]
+// GFX9: global_atomic_cmpswap_x2 v[1:2], v5, v[7:10], s[2:3] offset:-16 glc ; 
encoding: [0xf0,0x9f,0x85,0xdd,0x05,0x07,0x02,0x01]
 // VI-ERR: :1: error: instruction not supported on this GPU
 
 global_atomic_swap v[3:4], v5, off offset:-16

diff  --git a/llvm/test/MC/AMDGPU/gfx9_asm

[llvm-branch-commits] [llvm] 9144597 - [AMDGPU] Unify flat offset logic

2020-12-15 Thread Sebastian Neubauer via llvm-branch-commits

Author: Sebastian Neubauer
Date: 2020-12-15T14:59:59+01:00
New Revision: 91445979be0a4e6fe4b42005d7fb03fc46c9ee0c

URL: 
https://github.com/llvm/llvm-project/commit/91445979be0a4e6fe4b42005d7fb03fc46c9ee0c
DIFF: 
https://github.com/llvm/llvm-project/commit/91445979be0a4e6fe4b42005d7fb03fc46c9ee0c.diff

LOG: [AMDGPU] Unify flat offset logic

Move getNumFlatOffsetBits from AMDGPUAsmParser and SIInstrInfo into
AMDGPUBaseInfo.

Differential Revision: https://reviews.llvm.org/D93287

Added: 


Modified: 
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/lib/Target/AMDGPU/SIInstrInfo.h
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
llvm/test/MC/AMDGPU/flat-gfx10.s
llvm/test/MC/AMDGPU/gfx10_err_pos.s

Removed: 




diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 2ad7fab81427..0a0b993778c7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1928,7 +1928,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *N,
   if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
 int64_t RemainderOffset = COffsetVal;
 int64_t ImmField = 0;
-const unsigned NumBits = TII->getNumFlatOffsetBits(true);
+const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(*Subtarget, true);
 // Use signed division by a power of two to truncate towards 0.
 int64_t D = 1LL << (NumBits - 1);
 RemainderOffset = (COffsetVal / D) * D;

diff  --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp 
b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 22c32400ecbf..1fd6c2cca6df 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -3646,22 +3646,20 @@ bool AMDGPUAsmParser::validateFlatOffset(const MCInst 
&Inst,
 return false;
   }
 
-  // Address offset is 12-bit signed for GFX10, 13-bit for GFX9.
   // For FLAT segment the offset must be positive;
   // MSB is ignored and forced to zero.
-  unsigned OffsetSize = isGFX9() ? 13 : 12;
   if (TSFlags & (SIInstrFlags::IsFlatGlobal | SIInstrFlags::IsFlatScratch)) {
+unsigned OffsetSize = AMDGPU::getNumFlatOffsetBits(getSTI(), true);
 if (!isIntN(OffsetSize, Op.getImm())) {
   Error(getFlatOffsetLoc(Operands),
-isGFX9() ? "expected a 13-bit signed offset" :
-   "expected a 12-bit signed offset");
+Twine("expected a ") + Twine(OffsetSize) + "-bit signed offset");
   return false;
 }
   } else {
-if (!isUIntN(OffsetSize - 1, Op.getImm())) {
+unsigned OffsetSize = AMDGPU::getNumFlatOffsetBits(getSTI(), false);
+if (!isUIntN(OffsetSize, Op.getImm())) {
   Error(getFlatOffsetLoc(Operands),
-isGFX9() ? "expected a 12-bit unsigned offset" :
-   "expected an 11-bit unsigned offset");
+Twine("expected a ") + Twine(OffsetSize) + "-bit unsigned offset");
   return false;
 }
   }

diff  --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 01721595d551..889908bce905 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7053,13 +7053,6 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) 
const {
   return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
 }
 
-unsigned SIInstrInfo::getNumFlatOffsetBits(bool Signed) const {
-  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10)
-return Signed ? 12 : 11;
-
-  return Signed ? 13 : 12;
-}
-
 bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
 bool Signed) const {
   // TODO: Should 0 be special cased?
@@ -7069,10 +7062,8 @@ bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, 
unsigned AddrSpace,
   if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS)
 return false;
 
-  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10)
-return Signed ? isInt<12>(Offset) : isUInt<11>(Offset);
-
-  return Signed ? isInt<13>(Offset) :isUInt<12>(Offset);
+  unsigned N = AMDGPU::getNumFlatOffsetBits(ST, Signed);
+  return Signed ? isIntN(N, Offset) : isUIntN(N, Offset);
 }
 
 std::pair SIInstrInfo::splitFlatOffset(int64_t COffsetVal,
@@ -7080,7 +7071,7 @@ std::pair 
SIInstrInfo::splitFlatOffset(int64_t COffsetVal,
  bool IsSigned) const {
   int64_t RemainderOffset = COffsetVal;
   int64_t ImmField = 0;
-  const unsigned NumBits = getNumFlatOffsetBits(IsSigned);
+  const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST, IsSigned);
   if (IsSigned) {
 // Use signed division by a power of two to

[llvm-branch-commits] [llvm] 409a2f0 - [AMDGPU] Allow no saddr for global addtid insts

2020-12-16 Thread Sebastian Neubauer via llvm-branch-commits

Author: Sebastian Neubauer
Date: 2020-12-16T10:01:40+01:00
New Revision: 409a2f0f9e4847cd25560bfbddf22ffa11d15237

URL: 
https://github.com/llvm/llvm-project/commit/409a2f0f9e4847cd25560bfbddf22ffa11d15237
DIFF: 
https://github.com/llvm/llvm-project/commit/409a2f0f9e4847cd25560bfbddf22ffa11d15237.diff

LOG: [AMDGPU] Allow no saddr for global addtid insts

I think the global_load/store_dword_addtid instructions support
switching off the scalar address.
Add assembler and disassembler support for this.

Differential Revision: https://reviews.llvm.org/D93288

Added: 


Modified: 
llvm/lib/Target/AMDGPU/FLATInstructions.td
llvm/test/MC/AMDGPU/gfx1030_new.s
llvm/test/MC/Disassembler/AMDGPU/gfx1030_dasm_new.txt

Removed: 




diff  --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td 
b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index d47a79414294..57a355a55a02 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -192,24 +192,34 @@ multiclass FLAT_Global_Load_Pseudo : FLAT_Pseudo<
+  bit HasTiedOutput = 0, bit HasSignedOffset = 0, bit EnableSaddr = 0> : 
FLAT_Pseudo<
   opName,
   (outs regClass:$vdst),
-  !con((ins SReg_64:$saddr, flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, 
DLC_0:$dlc),
+  !con(!if(EnableSaddr, (ins SReg_64:$saddr), (ins)),
+(ins flat_offset:$offset, GLC_0:$glc, SLC_0:$slc, DLC_0:$dlc),
 !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))),
-  " $vdst, $saddr$offset$glc$slc$dlc"> {
+  " $vdst, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
   let is_flat_global = 1;
   let has_data = 0;
   let mayLoad = 1;
   let has_vaddr = 0;
   let has_saddr = 1;
-  let enabled_saddr = 1;
+  let enabled_saddr = EnableSaddr;
   let maybeAtomic = 1;
+  let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");
 
   let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
   let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
 }
 
+multiclass FLAT_Global_Load_AddTid_Pseudo {
+  def "" : FLAT_Global_Load_AddTid_Pseudo,
+GlobalSaddrTable<0, opName>;
+  def _SADDR : FLAT_Global_Load_AddTid_Pseudo,
+GlobalSaddrTable<1, opName>;
+}
+
 multiclass FLAT_Global_Store_Pseudo {
   let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
 def "" : FLAT_Store_Pseudo,
@@ -220,21 +230,29 @@ multiclass FLAT_Global_Store_Pseudo {
 }
 
 class FLAT_Global_Store_AddTid_Pseudo  : FLAT_Pseudo<
+  bit HasSignedOffset = 0, bit EnableSaddr = 0> : FLAT_Pseudo<
   opName,
   (outs),
-  !con(
-(ins vdataClass:$vdata, SReg_64:$saddr),
-  (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
-  " $vdata, $saddr$offset$glc$slc$dlc"> {
+  !con(!if(EnableSaddr, (ins vdataClass:$vdata, SReg_64:$saddr), (ins 
vdataClass:$vdata)),
+(ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
+  " $vdata, "#!if(EnableSaddr, "$saddr", "off")#"$offset$glc$slc$dlc"> {
   let is_flat_global = 1;
   let mayLoad  = 0;
   let mayStore = 1;
   let has_vdst = 0;
   let has_vaddr = 0;
   let has_saddr = 1;
-  let enabled_saddr = 1;
+  let enabled_saddr = EnableSaddr;
   let maybeAtomic = 1;
+  let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");
+}
+
+multiclass FLAT_Global_Store_AddTid_Pseudo {
+  def "" : FLAT_Global_Store_AddTid_Pseudo,
+GlobalSaddrTable<0, opName>;
+  def _SADDR : FLAT_Global_Store_AddTid_Pseudo,
+GlobalSaddrTable<1, opName>;
 }
 
 class FlatScratchInst  {
@@ -603,7 +621,7 @@ defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Global_Load_Pseudo 
<"global_load_sbyte_d16_
 defm GLOBAL_LOAD_SHORT_D16: FLAT_Global_Load_Pseudo 
<"global_load_short_d16", VGPR_32, 1>;
 defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Global_Load_Pseudo 
<"global_load_short_d16_hi", VGPR_32, 1>;
 let OtherPredicates = [HasGFX10_BEncoding] in
-def  GLOBAL_LOAD_DWORD_ADDTID : FLAT_Global_Load_AddTid_Pseudo 
<"global_load_dword_addtid", VGPR_32>;
+defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Global_Load_AddTid_Pseudo 
<"global_load_dword_addtid", VGPR_32>;
 
 defm GLOBAL_STORE_BYTE: FLAT_Global_Store_Pseudo <"global_store_byte", 
VGPR_32>;
 defm GLOBAL_STORE_SHORT   : FLAT_Global_Store_Pseudo <"global_store_short", 
VGPR_32>;
@@ -612,7 +630,7 @@ defm GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo 
<"global_store_dwordx2", VR
 defm GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", 
VReg_96>;
 defm GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", 
VReg_128>;
 let OtherPredicates = [HasGFX10_BEncoding] in
-def  GLOBAL_STORE_DWORD_ADDTID : FLAT_Global_Store_AddTid_Pseudo 
<"global_store_dword_addtid", VGPR_32>;
+defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Global_Store_AddTid_Pseudo 
<"global_store_dword_addtid", VGPR_32>;
 
 defm GLOBAL_STORE_BYTE_D16_HI  : FLAT_Global_Store_Pseudo 
<"global_store_byte_d16_hi", VGPR_32>;
 defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo 
<"global_store_short_d16_hi", VGPR_32>;
@@ 

[llvm-branch-commits] [llvm] edd6756 - [AMDGPU] Emit stack frame size in metadata

2020-11-25 Thread Sebastian Neubauer via llvm-branch-commits

Author: Sebastian Neubauer
Date: 2020-11-25T16:30:02+01:00
New Revision: edd675643d5ff49e6ea01af2a2a9b40498b3226c

URL: 
https://github.com/llvm/llvm-project/commit/edd675643d5ff49e6ea01af2a2a9b40498b3226c
DIFF: 
https://github.com/llvm/llvm-project/commit/edd675643d5ff49e6ea01af2a2a9b40498b3226c.diff

LOG: [AMDGPU] Emit stack frame size in metadata

Add a .shader_functions entry to the PAL metadata; it contains the stack
frame size of every non-entry-point function.

Differential Revision: https://reviews.llvm.org/D90036

Added: 


Modified: 
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
llvm/test/CodeGen/AMDGPU/amdpal-callable.ll

Removed: 




diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index fc785902843c..8148d0487802 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -456,9 +456,12 @@ bool 
AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
 Info = analyzeResourceUsage(MF);
   }
 
-  if (STM.isAmdPalOS() && MFI->isEntryFunction())
-EmitPALMetadata(MF, CurrentProgramInfo);
-  else if (!STM.isAmdHsaOS()) {
+  if (STM.isAmdPalOS()) {
+if (MFI->isEntryFunction())
+  EmitPALMetadata(MF, CurrentProgramInfo);
+else
+  emitPALFunctionMetadata(MF);
+  } else if (!STM.isAmdHsaOS()) {
 EmitProgramInfoSI(MF, CurrentProgramInfo);
   }
 
@@ -1260,6 +1263,12 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const 
MachineFunction &MF,
 MD->setWave32(MF.getFunction().getCallingConv());
 }
 
+void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
+  auto *MD = getTargetStreamer()->getPALMetadata();
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  MD->setStackFrameSize(MF, MFI.getStackSize());
+}
+
 // This is supposed to be log2(Size)
 static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
   switch (Size) {

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h 
b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 54e8338ab4b0..907ff2bfc162 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -78,6 +78,7 @@ class AMDGPUAsmPrinter final : public AsmPrinter {
  const SIProgramInfo &KernelInfo);
   void EmitPALMetadata(const MachineFunction &MF,
const SIProgramInfo &KernelInfo);
+  void emitPALFunctionMetadata(const MachineFunction &MF);
   void emitCommonFunctionComments(uint32_t NumVGPR,
   Optional NumAGPR,
   uint32_t TotalNumVGPR,

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp 
b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index 85cba165770f..efabab90422f 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -238,6 +238,14 @@ void AMDGPUPALMetadata::setScratchSize(CallingConv::ID CC, 
unsigned Val) {
   getHwStage(CC)[".scratch_memory_size"] = MsgPackDoc.getNode(Val);
 }
 
+// Set the scratch size in the metadata.
+void AMDGPUPALMetadata::setStackFrameSize(const MachineFunction &MF,
+  unsigned Val) {
+  auto Node = MsgPackDoc.getMapNode();
+  Node[".stack_frame_size_in_bytes"] = MsgPackDoc.getNode(Val);
+  getShaderFunctions()[MF.getFunction().getName()] = Node;
+}
+
 // Set the hardware register bit in PAL metadata to enable wave32 on the
 // shader of the given calling convention.
 void AMDGPUPALMetadata::setWave32(unsigned CC) {
@@ -721,6 +729,24 @@ msgpack::MapDocNode AMDGPUPALMetadata::getRegisters() {
   return Registers.getMap();
 }
 
+// Reference (create if necessary) the node for the shader functions map.
+msgpack::DocNode &AMDGPUPALMetadata::refShaderFunctions() {
+  auto &N =
+  MsgPackDoc.getRoot()
+  .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")]
+  .getArray(/*Convert=*/true)[0]
+  .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".shader_functions")];
+  N.getMap(/*Convert=*/true);
+  return N;
+}
+
+// Get (create if necessary) the shader functions map.
+msgpack::MapDocNode AMDGPUPALMetadata::getShaderFunctions() {
+  if (ShaderFunctions.isEmpty())
+ShaderFunctions = refShaderFunctions();
+  return ShaderFunctions.getMap();
+}
+
 // Return the PAL metadata hardware shader stage name.
 static const char *getStageName(CallingConv::ID CC) {
   switch (CC) {

diff  --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h 
b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
index b089f295364c..3b1767bb1f64 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -15,6 +1

[llvm-branch-commits] [llvm] 5733167 - [AMDGPU] Mark amdgpu_gfx functions as module entry function

2020-12-14 Thread Sebastian Neubauer via llvm-branch-commits

Author: Sebastian Neubauer
Date: 2020-12-14T10:43:39+01:00
New Revision: 5733167f54a582d52fc06617646c13cd1e0b3362

URL: 
https://github.com/llvm/llvm-project/commit/5733167f54a582d52fc06617646c13cd1e0b3362
DIFF: 
https://github.com/llvm/llvm-project/commit/5733167f54a582d52fc06617646c13cd1e0b3362.diff

LOG: [AMDGPU] Mark amdgpu_gfx functions as module entry function

- Allows LDS allocations
- Writes resource usage into COMPUTE_PGM_RSRC1 registers in PAL metadata

Differential Revision: https://reviews.llvm.org/D92946

Added: 


Modified: 
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
llvm/test/CodeGen/AMDGPU/amdpal-callable.ll

Removed: 




diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp 
b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 137f6896c87b..a14f846b76d1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -446,7 +446,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction 
&MF) {
 OutStreamer->SwitchSection(ConfigSection);
   }
 
-  if (MFI->isEntryFunction()) {
+  if (MFI->isModuleEntryFunction()) {
 getSIProgramInfo(CurrentProgramInfo, MF);
   } else {
 auto I = CallGraphResourceInfo.insert(
@@ -459,7 +459,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction 
&MF) {
   if (STM.isAmdPalOS()) {
 if (MFI->isEntryFunction())
   EmitPALMetadata(MF, CurrentProgramInfo);
-else
+else if (MFI->isModuleEntryFunction())
   emitPALFunctionMetadata(MF);
   } else if (!STM.isAmdHsaOS()) {
 EmitProgramInfoSI(MF, CurrentProgramInfo);
@@ -922,7 +922,22 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo 
AMDGPUAsmPrinter::analyzeResourceUsage(
   = TII->getNamedOperand(MI, AMDGPU::OpName::callee);
 
 const Function *Callee = getCalleeFunction(*CalleeOp);
-if (!Callee || Callee->isDeclaration()) {
+DenseMap::const_iterator I =
+CallGraphResourceInfo.end();
+bool IsExternal = !Callee || Callee->isDeclaration();
+if (!IsExternal)
+  I = CallGraphResourceInfo.find(Callee);
+
+if (IsExternal || I == CallGraphResourceInfo.end()) {
+  // Avoid crashing on undefined behavior with an illegal call to a
+  // kernel. If a callsite's calling convention doesn't match the
+  // function's, it's undefined behavior. If the callsite calling
+  // convention does match, that would have errored earlier.
+  // FIXME: The verifier shouldn't allow this.
+  if (!IsExternal &&
+  AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
+report_fatal_error("invalid call to entry function");
+
   // If this is a call to an external function, we can't do much. Make
   // conservative guesses.
 
@@ -943,19 +958,6 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo 
AMDGPUAsmPrinter::analyzeResourceUsage(
   // We force CodeGen to run in SCC order, so the callee's register
   // usage etc. should be the cumulative usage of all callees.
 
-  auto I = CallGraphResourceInfo.find(Callee);
-  if (I == CallGraphResourceInfo.end()) {
-// Avoid crashing on undefined behavior with an illegal call to a
-// kernel. If a callsite's calling convention doesn't match the
-// function's, it's undefined behavior. If the callsite calling
-// convention does match, that would have errored earlier.
-// FIXME: The verifier shouldn't allow this.
-if (AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
-  report_fatal_error("invalid call to entry function");
-
-llvm_unreachable("callee should have been handled before caller");
-  }
-
   MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
   MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
   MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
@@ -1266,7 +1268,11 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const 
MachineFunction &MF,
 void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
   auto *MD = getTargetStreamer()->getPALMetadata();
   const MachineFrameInfo &MFI = MF.getFrameInfo();
-  MD->setStackFrameSize(MF, MFI.getStackSize());
+  MD->setFunctionScratchSize(MF, MFI.getStackSize());
+  // Set compute registers
+  MD->setRsrc1(CallingConv::AMDGPU_CS,
+   CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
+  MD->setRsrc2(CallingCo