[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/68932 >From 07b3f94e49df221406cf7b83a05c8704e1af1c75 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Thu, 12 Oct 2023 16:45:59 -0500 Subject: [PATCH 1/2] [AMDGPU] Emit a waitcnt instruction after each memory instruction This patch implements a new command-line option for the backend, namely, amdgpu-waitcnt-for-all-mem-op. When this option is specified, a "waitcnt 0" instruction is generated after each memory load/store instruction. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 30 ++- .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll | 222 ++ 2 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ede4841b8a5fd7d..728be7c61fa2217 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -52,6 +52,10 @@ static cl::opt ForceEmitZeroFlag( cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden); +static cl::opt EmitForAllMemOpFlag( +"amdgpu-waitcnt-for-all-mem-op", +cl::desc("Emit s_waitcnt 0 after each memory operation"), cl::init(false)); + namespace { // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether @@ -388,6 +392,8 @@ class SIInsertWaitcnts : public MachineFunctionPass { // message. 
DenseSet ReleaseVGPRInsts; + bool insertWaitcntAfterMemOp(MachineFunction &MF); + public: static char ID; @@ -1809,6 +1815,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside; } +bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) { + bool Modified = false; + + for (auto &MBB : MF) { +for (auto It = MBB.begin(); It != MBB.end();) { + bool IsMemOp = It->mayLoadOrStore(); + ++It; + if (IsMemOp) { +BuildMI(MBB, It, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); +Modified = true; + } +} + } + + return Modified; +} + bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { ST = &MF.getSubtarget(); TII = ST->getInstrInfo(); @@ -1819,6 +1842,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { MLI = &getAnalysis(); PDT = &getAnalysis(); + bool Modified = false; + + if (EmitForAllMemOpFlag) { +Modified = insertWaitcntAfterMemOp(MF); + } + ForceEmitZeroWaitcnts = ForceEmitZeroFlag; for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; @@ -1847,7 +1876,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { TrackedWaitcntSet.clear(); BlockInfos.clear(); - bool Modified = false; if (!MFI->isEntryFunction()) { // Wait for any outstanding memory operations that the input registers may diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll new file mode 100644 index 000..4580b9074ada3cc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll @@ -0,0 +1,222 @@ +; Testing the -amdgpu-waitcnt-for-all-mem-op option +; COM: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7 +; COM: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX8 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s 
| FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A: flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A: .LBB0_1: ; %atomicrmw.start +; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst + ret void +} + +; from atomicrmw-nand.ll +; covers global_atomic, global_load +define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { +; GFX9-LABEL: atomic_na
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/68932 >From 07b3f94e49df221406cf7b83a05c8704e1af1c75 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Thu, 12 Oct 2023 16:45:59 -0500 Subject: [PATCH 1/3] [AMDGPU] Emit a waitcnt instruction after each memory instruction This patch implements a new command-line option for the backend, namely, amdgpu-waitcnt-for-all-mem-op. When this option is specified, a "waitcnt 0" instruction is generated after each memory load/store instruction. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 30 ++- .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll | 222 ++ 2 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ede4841b8a5fd7d..728be7c61fa2217 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -52,6 +52,10 @@ static cl::opt ForceEmitZeroFlag( cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden); +static cl::opt EmitForAllMemOpFlag( +"amdgpu-waitcnt-for-all-mem-op", +cl::desc("Emit s_waitcnt 0 after each memory operation"), cl::init(false)); + namespace { // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether @@ -388,6 +392,8 @@ class SIInsertWaitcnts : public MachineFunctionPass { // message. 
DenseSet ReleaseVGPRInsts; + bool insertWaitcntAfterMemOp(MachineFunction &MF); + public: static char ID; @@ -1809,6 +1815,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside; } +bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) { + bool Modified = false; + + for (auto &MBB : MF) { +for (auto It = MBB.begin(); It != MBB.end();) { + bool IsMemOp = It->mayLoadOrStore(); + ++It; + if (IsMemOp) { +BuildMI(MBB, It, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); +Modified = true; + } +} + } + + return Modified; +} + bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { ST = &MF.getSubtarget(); TII = ST->getInstrInfo(); @@ -1819,6 +1842,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { MLI = &getAnalysis(); PDT = &getAnalysis(); + bool Modified = false; + + if (EmitForAllMemOpFlag) { +Modified = insertWaitcntAfterMemOp(MF); + } + ForceEmitZeroWaitcnts = ForceEmitZeroFlag; for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; @@ -1847,7 +1876,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { TrackedWaitcntSet.clear(); BlockInfos.clear(); - bool Modified = false; if (!MFI->isEntryFunction()) { // Wait for any outstanding memory operations that the input registers may diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll new file mode 100644 index 000..4580b9074ada3cc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll @@ -0,0 +1,222 @@ +; Testing the -amdgpu-waitcnt-for-all-mem-op option +; COM: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7 +; COM: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX8 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s 
| FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A: flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A: .LBB0_1: ; %atomicrmw.start +; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst + ret void +} + +; from atomicrmw-nand.ll +; covers global_atomic, global_load +define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { +; GFX9-LABEL: atomic_na
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/68932 >From 07b3f94e49df221406cf7b83a05c8704e1af1c75 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Thu, 12 Oct 2023 16:45:59 -0500 Subject: [PATCH 1/4] [AMDGPU] Emit a waitcnt instruction after each memory instruction This patch implements a new command-line option for the backend, namely, amdgpu-waitcnt-for-all-mem-op. When this option is specified, a "waitcnt 0" instruction is generated after each memory load/store instruction. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 30 ++- .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll | 222 ++ 2 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ede4841b8a5fd7d..728be7c61fa2217 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -52,6 +52,10 @@ static cl::opt ForceEmitZeroFlag( cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden); +static cl::opt EmitForAllMemOpFlag( +"amdgpu-waitcnt-for-all-mem-op", +cl::desc("Emit s_waitcnt 0 after each memory operation"), cl::init(false)); + namespace { // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether @@ -388,6 +392,8 @@ class SIInsertWaitcnts : public MachineFunctionPass { // message. 
DenseSet ReleaseVGPRInsts; + bool insertWaitcntAfterMemOp(MachineFunction &MF); + public: static char ID; @@ -1809,6 +1815,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside; } +bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) { + bool Modified = false; + + for (auto &MBB : MF) { +for (auto It = MBB.begin(); It != MBB.end();) { + bool IsMemOp = It->mayLoadOrStore(); + ++It; + if (IsMemOp) { +BuildMI(MBB, It, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); +Modified = true; + } +} + } + + return Modified; +} + bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { ST = &MF.getSubtarget(); TII = ST->getInstrInfo(); @@ -1819,6 +1842,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { MLI = &getAnalysis(); PDT = &getAnalysis(); + bool Modified = false; + + if (EmitForAllMemOpFlag) { +Modified = insertWaitcntAfterMemOp(MF); + } + ForceEmitZeroWaitcnts = ForceEmitZeroFlag; for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; @@ -1847,7 +1876,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { TrackedWaitcntSet.clear(); BlockInfos.clear(); - bool Modified = false; if (!MFI->isEntryFunction()) { // Wait for any outstanding memory operations that the input registers may diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll new file mode 100644 index 000..4580b9074ada3cc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll @@ -0,0 +1,222 @@ +; Testing the -amdgpu-waitcnt-for-all-mem-op option +; COM: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7 +; COM: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX8 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s 
| FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A: flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A: .LBB0_1: ; %atomicrmw.start +; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst + ret void +} + +; from atomicrmw-nand.ll +; covers global_atomic, global_load +define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { +; GFX9-LABEL: atomic_na
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/68932 >From 07b3f94e49df221406cf7b83a05c8704e1af1c75 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Thu, 12 Oct 2023 16:45:59 -0500 Subject: [PATCH 1/5] [AMDGPU] Emit a waitcnt instruction after each memory instruction This patch implements a new command-line option for the backend, namely, amdgpu-waitcnt-for-all-mem-op. When this option is specified, a "waitcnt 0" instruction is generated after each memory load/store instruction. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 30 ++- .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll | 222 ++ 2 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ede4841b8a5fd7d..728be7c61fa2217 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -52,6 +52,10 @@ static cl::opt ForceEmitZeroFlag( cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden); +static cl::opt EmitForAllMemOpFlag( +"amdgpu-waitcnt-for-all-mem-op", +cl::desc("Emit s_waitcnt 0 after each memory operation"), cl::init(false)); + namespace { // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether @@ -388,6 +392,8 @@ class SIInsertWaitcnts : public MachineFunctionPass { // message. 
DenseSet ReleaseVGPRInsts; + bool insertWaitcntAfterMemOp(MachineFunction &MF); + public: static char ID; @@ -1809,6 +1815,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside; } +bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) { + bool Modified = false; + + for (auto &MBB : MF) { +for (auto It = MBB.begin(); It != MBB.end();) { + bool IsMemOp = It->mayLoadOrStore(); + ++It; + if (IsMemOp) { +BuildMI(MBB, It, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); +Modified = true; + } +} + } + + return Modified; +} + bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { ST = &MF.getSubtarget(); TII = ST->getInstrInfo(); @@ -1819,6 +1842,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { MLI = &getAnalysis(); PDT = &getAnalysis(); + bool Modified = false; + + if (EmitForAllMemOpFlag) { +Modified = insertWaitcntAfterMemOp(MF); + } + ForceEmitZeroWaitcnts = ForceEmitZeroFlag; for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; @@ -1847,7 +1876,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { TrackedWaitcntSet.clear(); BlockInfos.clear(); - bool Modified = false; if (!MFI->isEntryFunction()) { // Wait for any outstanding memory operations that the input registers may diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll new file mode 100644 index 000..4580b9074ada3cc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll @@ -0,0 +1,222 @@ +; Testing the -amdgpu-waitcnt-for-all-mem-op option +; COM: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7 +; COM: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX8 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s 
| FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A: flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A: .LBB0_1: ; %atomicrmw.start +; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst + ret void +} + +; from atomicrmw-nand.ll +; covers global_atomic, global_load +define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { +; GFX9-LABEL: atomic_na
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/68932 >From a87ba1892375ef67edb5d6f3bd537869203273a6 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Thu, 12 Oct 2023 16:45:59 -0500 Subject: [PATCH 1/5] [AMDGPU] Emit a waitcnt instruction after each memory instruction This patch implements a new command-line option for the backend, namely, amdgpu-waitcnt-for-all-mem-op. When this option is specified, a "waitcnt 0" instruction is generated after each memory load/store instruction. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 30 ++- .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll | 222 ++ 2 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ede4841b8a5fd7d..728be7c61fa2217 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -52,6 +52,10 @@ static cl::opt ForceEmitZeroFlag( cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden); +static cl::opt EmitForAllMemOpFlag( +"amdgpu-waitcnt-for-all-mem-op", +cl::desc("Emit s_waitcnt 0 after each memory operation"), cl::init(false)); + namespace { // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether @@ -388,6 +392,8 @@ class SIInsertWaitcnts : public MachineFunctionPass { // message. 
DenseSet ReleaseVGPRInsts; + bool insertWaitcntAfterMemOp(MachineFunction &MF); + public: static char ID; @@ -1809,6 +1815,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside; } +bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) { + bool Modified = false; + + for (auto &MBB : MF) { +for (auto It = MBB.begin(); It != MBB.end();) { + bool IsMemOp = It->mayLoadOrStore(); + ++It; + if (IsMemOp) { +BuildMI(MBB, It, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); +Modified = true; + } +} + } + + return Modified; +} + bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { ST = &MF.getSubtarget(); TII = ST->getInstrInfo(); @@ -1819,6 +1842,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { MLI = &getAnalysis(); PDT = &getAnalysis(); + bool Modified = false; + + if (EmitForAllMemOpFlag) { +Modified = insertWaitcntAfterMemOp(MF); + } + ForceEmitZeroWaitcnts = ForceEmitZeroFlag; for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; @@ -1847,7 +1876,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { TrackedWaitcntSet.clear(); BlockInfos.clear(); - bool Modified = false; if (!MFI->isEntryFunction()) { // Wait for any outstanding memory operations that the input registers may diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll new file mode 100644 index 000..4580b9074ada3cc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll @@ -0,0 +1,222 @@ +; Testing the -amdgpu-waitcnt-for-all-mem-op option +; COM: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7 +; COM: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX8 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s 
| FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A: flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A: .LBB0_1: ; %atomicrmw.start +; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst + ret void +} + +; from atomicrmw-nand.ll +; covers global_atomic, global_load +define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { +; GFX9-LABEL: atomic_na
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/68932 >From a87ba1892375ef67edb5d6f3bd537869203273a6 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Thu, 12 Oct 2023 16:45:59 -0500 Subject: [PATCH 1/6] [AMDGPU] Emit a waitcnt instruction after each memory instruction This patch implements a new command-line option for the backend, namely, amdgpu-waitcnt-for-all-mem-op. When this option is specified, a "waitcnt 0" instruction is generated after each memory load/store instruction. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 30 ++- .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll | 222 ++ 2 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ede4841b8a5fd7d..728be7c61fa2217 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -52,6 +52,10 @@ static cl::opt ForceEmitZeroFlag( cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden); +static cl::opt EmitForAllMemOpFlag( +"amdgpu-waitcnt-for-all-mem-op", +cl::desc("Emit s_waitcnt 0 after each memory operation"), cl::init(false)); + namespace { // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether @@ -388,6 +392,8 @@ class SIInsertWaitcnts : public MachineFunctionPass { // message. 
DenseSet ReleaseVGPRInsts; + bool insertWaitcntAfterMemOp(MachineFunction &MF); + public: static char ID; @@ -1809,6 +1815,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside; } +bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) { + bool Modified = false; + + for (auto &MBB : MF) { +for (auto It = MBB.begin(); It != MBB.end();) { + bool IsMemOp = It->mayLoadOrStore(); + ++It; + if (IsMemOp) { +BuildMI(MBB, It, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); +Modified = true; + } +} + } + + return Modified; +} + bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { ST = &MF.getSubtarget(); TII = ST->getInstrInfo(); @@ -1819,6 +1842,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { MLI = &getAnalysis(); PDT = &getAnalysis(); + bool Modified = false; + + if (EmitForAllMemOpFlag) { +Modified = insertWaitcntAfterMemOp(MF); + } + ForceEmitZeroWaitcnts = ForceEmitZeroFlag; for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; @@ -1847,7 +1876,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { TrackedWaitcntSet.clear(); BlockInfos.clear(); - bool Modified = false; if (!MFI->isEntryFunction()) { // Wait for any outstanding memory operations that the input registers may diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll new file mode 100644 index 000..4580b9074ada3cc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll @@ -0,0 +1,222 @@ +; Testing the -amdgpu-waitcnt-for-all-mem-op option +; COM: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7 +; COM: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX8 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s 
| FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A: flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A: .LBB0_1: ; %atomicrmw.start +; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst + ret void +} + +; from atomicrmw-nand.ll +; covers global_atomic, global_load +define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { +; GFX9-LABEL: atomic_na
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1809,6 +1816,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside; } +bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) { + bool Modified = false; + + for (auto &MBB : MF) { jwanggit86 wrote: Although they both insert s_waitcnt instructions, the new feature is quite different from the existing SIInsertWaitcnt pass. The new feature, controlled by a command-line option, inserts an "s_waitcnt 0" after each memory instruction. The logic therefore is very simple. The existing pass, however, has more complicated logic implemented with essentially a static analysis aided by its own data structures, which are not necessary for the new feature. From the performance point of view, it should be noted that by default this feature is not activated. Therefore, extra overhead should be minimized for the normal use-case scenario. A separate pass achieves this because there is only one extra IF for each compiled function. On the other hand, integrating with the existing pass would mean many more checks for the feature activation, which are wasted in the normal case when the feature is not activated. With the above 2 points, I think a separate pass is advantageous over an integrated pass. Please let me know your thoughts. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -52,6 +52,11 @@ static cl::opt ForceEmitZeroFlag( cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden); +static cl::opt +PreciseMemOpFlag("amdgpu-precise-memory-op", + cl::desc("Emit s_waitcnt 0 after each memory operation"), + cl::init(false)); + jwanggit86 wrote: Sure. I will look into this. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -52,6 +52,11 @@ static cl::opt ForceEmitZeroFlag( cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden); +static cl::opt +PreciseMemOpFlag("amdgpu-precise-memory-op", + cl::desc("Emit s_waitcnt 0 after each memory operation"), + cl::init(false)); + jwanggit86 wrote: Adding a command-line option for llc is from the 1st version of this patch. Now that we have decided to add a clang command-line option, this llc option is not really necessary anymore. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1809,6 +1816,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside; } +bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) { + bool Modified = false; + + for (auto &MBB : MF) { jwanggit86 wrote: As commented above, the newly created llc option is not really necessary and will be removed. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/68932 >From a87ba1892375ef67edb5d6f3bd537869203273a6 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Thu, 12 Oct 2023 16:45:59 -0500 Subject: [PATCH 1/8] [AMDGPU] Emit a waitcnt instruction after each memory instruction This patch implements a new command-line option for the backend, namely, amdgpu-waitcnt-for-all-mem-op. When this option is specified, a "waitcnt 0" instruction is generated after each memory load/store instruction. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 30 ++- .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll | 222 ++ 2 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ede4841b8a5fd7d..728be7c61fa2217 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -52,6 +52,10 @@ static cl::opt ForceEmitZeroFlag( cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden); +static cl::opt EmitForAllMemOpFlag( +"amdgpu-waitcnt-for-all-mem-op", +cl::desc("Emit s_waitcnt 0 after each memory operation"), cl::init(false)); + namespace { // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether @@ -388,6 +392,8 @@ class SIInsertWaitcnts : public MachineFunctionPass { // message. 
DenseSet ReleaseVGPRInsts; + bool insertWaitcntAfterMemOp(MachineFunction &MF); + public: static char ID; @@ -1809,6 +1815,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside; } +bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) { + bool Modified = false; + + for (auto &MBB : MF) { +for (auto It = MBB.begin(); It != MBB.end();) { + bool IsMemOp = It->mayLoadOrStore(); + ++It; + if (IsMemOp) { +BuildMI(MBB, It, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); +Modified = true; + } +} + } + + return Modified; +} + bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { ST = &MF.getSubtarget(); TII = ST->getInstrInfo(); @@ -1819,6 +1842,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { MLI = &getAnalysis(); PDT = &getAnalysis(); + bool Modified = false; + + if (EmitForAllMemOpFlag) { +Modified = insertWaitcntAfterMemOp(MF); + } + ForceEmitZeroWaitcnts = ForceEmitZeroFlag; for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; @@ -1847,7 +1876,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { TrackedWaitcntSet.clear(); BlockInfos.clear(); - bool Modified = false; if (!MFI->isEntryFunction()) { // Wait for any outstanding memory operations that the input registers may diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll new file mode 100644 index 000..4580b9074ada3cc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll @@ -0,0 +1,222 @@ +; Testing the -amdgpu-waitcnt-for-all-mem-op option +; COM: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7 +; COM: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX8 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s 
| FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A: flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A: .LBB0_1: ; %atomicrmw.start +; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst + ret void +} + +; from atomicrmw-nand.ll +; covers global_atomic, global_load +define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { +; GFX9-LABEL: atomic_na
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1809,6 +1816,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside; } +bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) { + bool Modified = false; + + for (auto &MBB : MF) { jwanggit86 wrote: @arsenm Pull req has been updated. Now the code for the new feature has been integrated with the existing pass. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
jwanggit86 wrote: > > So, while it's possible to create a combined option, using a separate > > option also makes sense. Do we generally try to avoid creating new > > command-line options? > > Looking again, I see they are different and unrelated. I don't really > understand why we have amdgpu-waitcnt-forcezero, I'm not sure I've ever used > it. I always expected it to behave like this flag. So do you still think the new code should be integrated with the existing pass, or should it be separate? https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
jwanggit86 wrote: @arsenm Please let me know if you have any further comments. I'd like to have the code committed and the PR closed. Thanks! https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/68932 >From e393477607cb94b45a3b9a5db2aea98fb8af2a86 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Thu, 12 Oct 2023 16:45:59 -0500 Subject: [PATCH 1/9] [AMDGPU] Emit a waitcnt instruction after each memory instruction This patch implements a new command-line option for the backend, namely, amdgpu-waitcnt-for-all-mem-op. When this option is specified, a "waitcnt 0" instruction is generated after each memory load/store instruction. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 30 ++- .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll | 222 ++ 2 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ede4841b8a5fd7d..728be7c61fa2217 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -52,6 +52,10 @@ static cl::opt ForceEmitZeroFlag( cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden); +static cl::opt EmitForAllMemOpFlag( +"amdgpu-waitcnt-for-all-mem-op", +cl::desc("Emit s_waitcnt 0 after each memory operation"), cl::init(false)); + namespace { // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether @@ -388,6 +392,8 @@ class SIInsertWaitcnts : public MachineFunctionPass { // message. 
DenseSet ReleaseVGPRInsts; + bool insertWaitcntAfterMemOp(MachineFunction &MF); + public: static char ID; @@ -1809,6 +1815,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside; } +bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) { + bool Modified = false; + + for (auto &MBB : MF) { +for (auto It = MBB.begin(); It != MBB.end();) { + bool IsMemOp = It->mayLoadOrStore(); + ++It; + if (IsMemOp) { +BuildMI(MBB, It, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); +Modified = true; + } +} + } + + return Modified; +} + bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { ST = &MF.getSubtarget(); TII = ST->getInstrInfo(); @@ -1819,6 +1842,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { MLI = &getAnalysis(); PDT = &getAnalysis(); + bool Modified = false; + + if (EmitForAllMemOpFlag) { +Modified = insertWaitcntAfterMemOp(MF); + } + ForceEmitZeroWaitcnts = ForceEmitZeroFlag; for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; @@ -1847,7 +1876,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { TrackedWaitcntSet.clear(); BlockInfos.clear(); - bool Modified = false; if (!MFI->isEntryFunction()) { // Wait for any outstanding memory operations that the input registers may diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll new file mode 100644 index 000..4580b9074ada3cc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll @@ -0,0 +1,222 @@ +; Testing the -amdgpu-waitcnt-for-all-mem-op option +; COM: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7 +; COM: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX8 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s 
| FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A: flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A: .LBB0_1: ; %atomicrmw.start +; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst + ret void +} + +; from atomicrmw-nand.ll +; covers global_atomic, global_load +define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { +; GFX9-LABEL: atomic_na
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -0,0 +1,222 @@ +; Testing the -amdgpu-precise-memory-op option +; COM: llc -mtriple=amdgcn -mcpu=hawaii -mattr=+amdgpu-precise-memory-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7 jwanggit86 wrote: Comment. Some testcases in this file won't run if mcpu=hawaii. In the latest commit, the test file has been split into 2. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1708,6 +1710,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } ++Iter; +if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { + auto builder = + BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); jwanggit86 wrote: Done. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1708,6 +1710,13 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } ++Iter; +if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { + auto builder = jwanggit86 wrote: Done. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -388,6 +388,8 @@ class SIInsertWaitcnts : public MachineFunctionPass { // message. DenseSet ReleaseVGPRInsts; + // bool insertWaitcntAfterMemOp(MachineFunction &MF); jwanggit86 wrote: Done. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/68932 >From e393477607cb94b45a3b9a5db2aea98fb8af2a86 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Thu, 12 Oct 2023 16:45:59 -0500 Subject: [PATCH 01/10] [AMDGPU] Emit a waitcnt instruction after each memory instruction This patch implements a new command-line option for the backend, namely, amdgpu-waitcnt-for-all-mem-op. When this option is specified, a "waitcnt 0" instruction is generated after each memory load/store instruction. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 30 ++- .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll | 222 ++ 2 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ede4841b8a5fd7d..728be7c61fa2217 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -52,6 +52,10 @@ static cl::opt ForceEmitZeroFlag( cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden); +static cl::opt EmitForAllMemOpFlag( +"amdgpu-waitcnt-for-all-mem-op", +cl::desc("Emit s_waitcnt 0 after each memory operation"), cl::init(false)); + namespace { // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether @@ -388,6 +392,8 @@ class SIInsertWaitcnts : public MachineFunctionPass { // message. 
DenseSet ReleaseVGPRInsts; + bool insertWaitcntAfterMemOp(MachineFunction &MF); + public: static char ID; @@ -1809,6 +1815,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside; } +bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) { + bool Modified = false; + + for (auto &MBB : MF) { +for (auto It = MBB.begin(); It != MBB.end();) { + bool IsMemOp = It->mayLoadOrStore(); + ++It; + if (IsMemOp) { +BuildMI(MBB, It, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); +Modified = true; + } +} + } + + return Modified; +} + bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { ST = &MF.getSubtarget(); TII = ST->getInstrInfo(); @@ -1819,6 +1842,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { MLI = &getAnalysis(); PDT = &getAnalysis(); + bool Modified = false; + + if (EmitForAllMemOpFlag) { +Modified = insertWaitcntAfterMemOp(MF); + } + ForceEmitZeroWaitcnts = ForceEmitZeroFlag; for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; @@ -1847,7 +1876,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { TrackedWaitcntSet.clear(); BlockInfos.clear(); - bool Modified = false; if (!MFI->isEntryFunction()) { // Wait for any outstanding memory operations that the input registers may diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll new file mode 100644 index 000..4580b9074ada3cc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll @@ -0,0 +1,222 @@ +; Testing the -amdgpu-waitcnt-for-all-mem-op option +; COM: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7 +; COM: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX8 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s 
| FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A: flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A: .LBB0_1: ; %atomicrmw.start +; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst + ret void +} + +; from atomicrmw-nand.ll +; covers global_atomic, global_load +define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { +; GFX9-LABEL: atomic_
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1708,6 +1710,19 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } ++Iter; +if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { + auto Builder = + BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + if (IsGFX10Plus) { jwanggit86 wrote: S_waitcnt(0) is inserted after each mem op, both stores and loads. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/68932 >From e393477607cb94b45a3b9a5db2aea98fb8af2a86 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Thu, 12 Oct 2023 16:45:59 -0500 Subject: [PATCH 01/11] [AMDGPU] Emit a waitcnt instruction after each memory instruction This patch implements a new command-line option for the backend, namely, amdgpu-waitcnt-for-all-mem-op. When this option is specified, a "waitcnt 0" instruction is generated after each memory load/store instruction. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 30 ++- .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll | 222 ++ 2 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index ede4841b8a5fd7d..728be7c61fa2217 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -52,6 +52,10 @@ static cl::opt ForceEmitZeroFlag( cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), cl::init(false), cl::Hidden); +static cl::opt EmitForAllMemOpFlag( +"amdgpu-waitcnt-for-all-mem-op", +cl::desc("Emit s_waitcnt 0 after each memory operation"), cl::init(false)); + namespace { // Class of object that encapsulates latest instruction counter score // associated with the operand. Used for determining whether @@ -388,6 +392,8 @@ class SIInsertWaitcnts : public MachineFunctionPass { // message. 
DenseSet ReleaseVGPRInsts; + bool insertWaitcntAfterMemOp(MachineFunction &MF); + public: static char ID; @@ -1809,6 +1815,23 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML, return HasVMemLoad && UsesVgprLoadedOutside; } +bool SIInsertWaitcnts::insertWaitcntAfterMemOp(MachineFunction &MF) { + bool Modified = false; + + for (auto &MBB : MF) { +for (auto It = MBB.begin(); It != MBB.end();) { + bool IsMemOp = It->mayLoadOrStore(); + ++It; + if (IsMemOp) { +BuildMI(MBB, It, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); +Modified = true; + } +} + } + + return Modified; +} + bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { ST = &MF.getSubtarget(); TII = ST->getInstrInfo(); @@ -1819,6 +1842,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { MLI = &getAnalysis(); PDT = &getAnalysis(); + bool Modified = false; + + if (EmitForAllMemOpFlag) { +Modified = insertWaitcntAfterMemOp(MF); + } + ForceEmitZeroWaitcnts = ForceEmitZeroFlag; for (auto T : inst_counter_types()) ForceEmitWaitcnt[T] = false; @@ -1847,7 +1876,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { TrackedWaitcntSet.clear(); BlockInfos.clear(); - bool Modified = false; if (!MFI->isEntryFunction()) { // Wait for any outstanding memory operations that the input registers may diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll new file mode 100644 index 000..4580b9074ada3cc --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll @@ -0,0 +1,222 @@ +; Testing the -amdgpu-waitcnt-for-all-mem-op option +; COM: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX7 +; COM: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX8 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s 
| FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch -amdgpu-use-divergent-register-indexing -amdgpu-waitcnt-for-all-mem-op -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A: flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A: .LBB0_1: ; %atomicrmw.start +; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) + %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst + ret void +} + +; from atomicrmw-nand.ll +; covers global_atomic, global_load +define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { +; GFX9-LABEL: atomic_
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1847,6 +1862,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { TrackedWaitcntSet.clear(); BlockInfos.clear(); + jwanggit86 wrote: Done. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1708,6 +1710,19 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } ++Iter; +if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { + auto Builder = + BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + if (IsGFX10Plus) { +Builder = +BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) +.addReg(AMDGPU::SGPR_NULL, RegState::Undef) +.addImm(0); + } + OldWaitcntInstr = Builder.getInstr(); jwanggit86 wrote: Done. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
@@ -1708,6 +1710,19 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } ++Iter; +if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { + auto Builder = + BuildMI(Block, Iter, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + if (IsGFX10Plus) { jwanggit86 wrote: My understanding is that the feature request asks for a "s_waitcnt 0" to be *blindly* inserted after each and every memory instruction. Enabling the feature is at the user's discretion via a clang command-line option (disabled by default). The purpose of the feature is to help debug memory problems on GPUs that do not support precise memory. (Although someone, Tony I think, mentioned it could go beyond debugging). I'll send you the link for the feature request. Based on that, the implementation doesn't check on GPU models, doesn't have model-dependent code (except the newly-added code for GFX10+), or differentiate loads from stores. I'll work with the requester to get the requirements straightened out. https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #75647)
https://github.com/jwanggit86 created https://github.com/llvm/llvm-project/pull/75647 A new function attribute named amdgpu-num-work-groups is added. This attribute allows programmers to let the compiler know the number of workgroups to be launched and do optimizations based on that information. >From bb15eebae9645e5383f26066093c0734ea76442d Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Fri, 15 Dec 2023 13:53:54 -0600 Subject: [PATCH] [AMDGPU] Adding the amdgpu-num-work-groups function attribute A new function attribute named amdgpu-num-work-groups is added. This attribute allows programmers to let the compiler know the number of workgroups to be launched and do optimizations based on that information. --- clang/include/clang/Basic/Attr.td | 7 ++ clang/include/clang/Basic/AttrDocs.td | 23 ++ clang/lib/CodeGen/Targets/AMDGPU.cpp | 7 ++ clang/lib/Sema/SemaDeclAttr.cpp | 13 +++ ...a-attribute-supported-attributes-list.test | 1 + .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 4 + llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp| 6 ++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 + .../Target/AMDGPU/SIMachineFunctionInfo.cpp | 1 + .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 9 ++ .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 15 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 8 ++ .../AMDGPU/attr-amdgpu-num-work-groups.ll | 82 +++ 13 files changed, 179 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 5943583d92773a..605fcbbff027b9 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -2011,6 +2011,13 @@ def AMDGPUNumVGPR : InheritableAttr { let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">; } +def AMDGPUNumWorkGroups : InheritableAttr { + let Spellings = [Clang<"amdgpu_num_work_groups", 0>]; + let Args = [UnsignedArgument<"NumWorkGroups">]; + let Documentation = [AMDGPUNumWorkGroupsDocs]; + let 
Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">; +} + def AMDGPUKernelCall : DeclOrTypeAttr { let Spellings = [Clang<"amdgpu_kernel">]; let Documentation = [Undocumented]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 77950ab6d877ea..0bf3ccf367284c 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -2693,6 +2693,29 @@ An error will be given if: }]; } +def AMDGPUNumWorkGroupsDocs : Documentation { + let Category = DocCatAMDGPUAttributes; + let Content = [{ +The number of work groups specifies the number of work groups when the kernel +is dispatched. + +Clang supports the +``__attribute__((amdgpu_num_work_groups()))`` attribute for the +AMDGPU target. This attribute may be attached to a kernel function definition +and is an optimization hint. + + parameter specifies the number of work groups. + +If specified, the AMDGPU target backend might be able to produce better machine +code. + +An error will be given if: + - Specified values violate subtarget specifications; + - Specified values are not compatible with values provided through other +attributes. 
+ }]; +} + def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> { let Content = [{ Clang supports several different calling conventions, depending on the target diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index 03ac6b78598fc8..11a0835f37f4a9 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -356,6 +356,13 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes( if (NumVGPR != 0) F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR)); } + + if (const auto *Attr = FD->getAttr()) { +uint32_t NumWG = Attr->getNumWorkGroups(); + +if (NumWG != 0) + F->addFnAttr("amdgpu-num-work-groups", llvm::utostr(NumWG)); + } } /// Emits control constants used to change per-architecture behaviour in the diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 5b29b05dee54b3..3737dd256aff02 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -8051,6 +8051,16 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) { D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR)); } +static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D, + const ParsedAttr &AL) { + uint32_t NumWG = 0; + Expr *NumWGExpr = AL.getArgAsExpr(0); + if (!checkUInt32Argument(S, AL, NumWGExpr, NumWG)) +return; + + D->addAttr(::new (S.Context) AMDGPUNumWorkGroupsAttr(S.Context, AL, NumWG)); +} + static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D,
[llvm] [clang] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #75647)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/75647 >From bb15eebae9645e5383f26066093c0734ea76442d Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Fri, 15 Dec 2023 13:53:54 -0600 Subject: [PATCH 1/2] [AMDGPU] Adding the amdgpu-num-work-groups function attribute A new function attribute named amdgpu-num-work-groups is added. This attribute allows programmers to let the compiler know the number of workgroups to be launched and do optimizations based on that information. --- clang/include/clang/Basic/Attr.td | 7 ++ clang/include/clang/Basic/AttrDocs.td | 23 ++ clang/lib/CodeGen/Targets/AMDGPU.cpp | 7 ++ clang/lib/Sema/SemaDeclAttr.cpp | 13 +++ ...a-attribute-supported-attributes-list.test | 1 + .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 4 + llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp| 6 ++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 + .../Target/AMDGPU/SIMachineFunctionInfo.cpp | 1 + .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 9 ++ .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 15 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 8 ++ .../AMDGPU/attr-amdgpu-num-work-groups.ll | 82 +++ 13 files changed, 179 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 5943583d92773a..605fcbbff027b9 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -2011,6 +2011,13 @@ def AMDGPUNumVGPR : InheritableAttr { let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">; } +def AMDGPUNumWorkGroups : InheritableAttr { + let Spellings = [Clang<"amdgpu_num_work_groups", 0>]; + let Args = [UnsignedArgument<"NumWorkGroups">]; + let Documentation = [AMDGPUNumWorkGroupsDocs]; + let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">; +} + def AMDGPUKernelCall : DeclOrTypeAttr { let Spellings = [Clang<"amdgpu_kernel">]; let Documentation = [Undocumented]; diff --git 
a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 77950ab6d877ea..0bf3ccf367284c 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -2693,6 +2693,29 @@ An error will be given if: }]; } +def AMDGPUNumWorkGroupsDocs : Documentation { + let Category = DocCatAMDGPUAttributes; + let Content = [{ +The number of work groups specifies the number of work groups when the kernel +is dispatched. + +Clang supports the +``__attribute__((amdgpu_num_work_groups()))`` attribute for the +AMDGPU target. This attribute may be attached to a kernel function definition +and is an optimization hint. + + parameter specifies the number of work groups. + +If specified, the AMDGPU target backend might be able to produce better machine +code. + +An error will be given if: + - Specified values violate subtarget specifications; + - Specified values are not compatible with values provided through other +attributes. + }]; +} + def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> { let Content = [{ Clang supports several different calling conventions, depending on the target diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index 03ac6b78598fc8..11a0835f37f4a9 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -356,6 +356,13 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes( if (NumVGPR != 0) F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR)); } + + if (const auto *Attr = FD->getAttr()) { +uint32_t NumWG = Attr->getNumWorkGroups(); + +if (NumWG != 0) + F->addFnAttr("amdgpu-num-work-groups", llvm::utostr(NumWG)); + } } /// Emits control constants used to change per-architecture behaviour in the diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 5b29b05dee54b3..3737dd256aff02 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -8051,6 
+8051,16 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) { D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR)); } +static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D, + const ParsedAttr &AL) { + uint32_t NumWG = 0; + Expr *NumWGExpr = AL.getArgAsExpr(0); + if (!checkUInt32Argument(S, AL, NumWGExpr, NumWG)) +return; + + D->addAttr(::new (S.Context) AMDGPUNumWorkGroupsAttr(S.Context, AL, NumWG)); +} + static void handleX86ForceAlignArgPointerAttr(Sema &S, Decl *D, const ParsedAttr &AL) { // If we try to apply it to a function pointer, don't warn, but don't @@ -9058,6 +9068,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const Parsed
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #75647)
jwanggit86 wrote: Two possible optimizations mentioned by the requester are, "1. This'll let the backend know the maximum size of the workgroup ID, and so we can do things like infer nsw or the ability to use a 16-bit add or so on 2. This could be used to optimize global sync stuff in the future " https://github.com/llvm/llvm-project/pull/75647 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -0,0 +1,577 @@ +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch,+precise-memory -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX11 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX12 + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic +; +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A: flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A: .LBB0_1: ; %atomicrmw.start +; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; +; GFX10-LABEL: syncscope_workgroup_nortn: +; GFX10: ; %bb.0: +; GFX10: flat_load_dword v4, v[0:1] +; GFX10-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10: .LBB0_1: ; %atomicrmw.start +; GFX10: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; +; GFX11-LABEL: syncscope_workgroup_nortn: +; GFX11: ; %bb.0: +; GFX11: flat_load_b32 v4, v[0:1] +; GFX11-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11: .LBB0_1:; %atomicrmw.start +; GFX11: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; +; GFX12-LABEL: syncscope_workgroup_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT:s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT:s_wait_expcnt 0x0 +; GFX12-NEXT:s_wait_samplecnt 0x0 +; GFX12-NEXT:s_wait_bvhcnt 0x0 +; GFX12-NEXT:s_wait_kmcnt 0x0 +; GFX12-NEXT:flat_load_b32 v4, v[0:1] +; GFX12-NEXT:s_wait_loadcnt_dscnt 0x0 + +define void 
@syncscope_workgroup_nortn(ptr %addr, float %val) { + %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst + ret void +} + +; from atomicrmw-nand.ll +; covers global_atomic, global_load +; +; GFX9-LABEL: atomic_nand_i32_global: +; GFX9: ; %bb.0: +; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT:global_load_dword v2, v[0:1], off +; GFX9-NEXT:s_waitcnt vmcnt(0) +; GFX9-NEXT:s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX9-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX9-NOT: s_waitcnt vmcnt(0) +; GFX9-NEXT:v_mov_b32_e32 v3, v2 +; GFX9-NEXT:v_not_b32_e32 v2, v3 +; GFX9-NEXT:v_or_b32_e32 v2, -5, v2 +; GFX9-NEXT:global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX9-NEXT:s_waitcnt vmcnt(0) +; GFX9-NEXT:buffer_wbinvl1_vol +; GFX9-NEXT:v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-NEXT:s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT:s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT:s_cbranch_execnz .LBB1_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT:s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT:v_mov_b32_e32 v0, v2 +; GFX9-NEXT:s_setpc_b64 s[30:31] +; +; GFX10-LABEL: atomic_nand_i32_global: +; GFX10: ; %bb.0: +; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT:global_load_dword v2, v[0:1], off +; GFX10-NEXT:s_waitcnt vmcnt(0) +; GFX10-NEXT:s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX10-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX10-NOT: s_waitcnt vmcnt(0) +; GFX10-NEXT:v_mov_b32_e32 v3, v2 +; GFX10-NEXT:v_not_b32_e32 v2, v3 +; GFX10-NEXT:v_or_b32_e32 v2, -5, v2 +; GFX10-NEXT:s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT:global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT:s_waitcnt vmcnt(0) +; GFX10-NEXT:buffer_gl1_inv +; GFX10-NEXT:buffer_gl0_inv +; GFX10-NEXT:v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT:s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT:s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT:s_cbranch_execnz .LBB1_1 +; +; GFX11-LABEL: atomic_nand_i32_global: +; GFX11: ; %bb.0: +; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11: .LBB1_1:; %atomicrmw.start +; GFX11: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, -5, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; +; GFX12-LABEL: atomic_nand_i32_global: +; GFX12: ;
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -0,0 +1,577 @@ +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch,+precise-memory -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX11 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX12 + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic +; +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A: flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A: .LBB0_1: ; %atomicrmw.start +; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; +; GFX10-LABEL: syncscope_workgroup_nortn: +; GFX10: ; %bb.0: +; GFX10: flat_load_dword v4, v[0:1] +; GFX10-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10: .LBB0_1: ; %atomicrmw.start +; GFX10: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; +; GFX11-LABEL: syncscope_workgroup_nortn: +; GFX11: ; %bb.0: +; GFX11: flat_load_b32 v4, v[0:1] +; GFX11-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11: .LBB0_1:; %atomicrmw.start +; GFX11: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc +; GFX11-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; +; GFX12-LABEL: syncscope_workgroup_nortn: +; GFX12: ; %bb.0: +; GFX12-NEXT:s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT:s_wait_expcnt 0x0 +; GFX12-NEXT:s_wait_samplecnt 0x0 +; GFX12-NEXT:s_wait_bvhcnt 0x0 +; GFX12-NEXT:s_wait_kmcnt 0x0 +; GFX12-NEXT:flat_load_b32 v4, v[0:1] +; GFX12-NEXT:s_wait_loadcnt_dscnt 0x0 + +define void 
@syncscope_workgroup_nortn(ptr %addr, float %val) { + %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst + ret void +} + +; from atomicrmw-nand.ll +; covers global_atomic, global_load +; +; GFX9-LABEL: atomic_nand_i32_global: +; GFX9: ; %bb.0: +; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT:global_load_dword v2, v[0:1], off +; GFX9-NEXT:s_waitcnt vmcnt(0) +; GFX9-NEXT:s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX9-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX9-NOT: s_waitcnt vmcnt(0) +; GFX9-NEXT:v_mov_b32_e32 v3, v2 +; GFX9-NEXT:v_not_b32_e32 v2, v3 +; GFX9-NEXT:v_or_b32_e32 v2, -5, v2 +; GFX9-NEXT:global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX9-NEXT:s_waitcnt vmcnt(0) +; GFX9-NEXT:buffer_wbinvl1_vol +; GFX9-NEXT:v_cmp_eq_u32_e32 vcc, v2, v3 +; GFX9-NEXT:s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT:s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT:s_cbranch_execnz .LBB1_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT:s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT:v_mov_b32_e32 v0, v2 +; GFX9-NEXT:s_setpc_b64 s[30:31] +; +; GFX10-LABEL: atomic_nand_i32_global: +; GFX10: ; %bb.0: +; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT:global_load_dword v2, v[0:1], off +; GFX10-NEXT:s_waitcnt vmcnt(0) +; GFX10-NEXT:s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX10-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX10-NOT: s_waitcnt vmcnt(0) +; GFX10-NEXT:v_mov_b32_e32 v3, v2 +; GFX10-NEXT:v_not_b32_e32 v2, v3 +; GFX10-NEXT:v_or_b32_e32 v2, -5, v2 +; GFX10-NEXT:s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT:global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX10-NEXT:s_waitcnt vmcnt(0) +; GFX10-NEXT:buffer_gl1_inv +; GFX10-NEXT:buffer_gl0_inv +; GFX10-NEXT:v_cmp_eq_u32_e32 vcc_lo, v2, v3 +; GFX10-NEXT:s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT:s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT:s_cbranch_execnz .LBB1_1 +; +; GFX11-LABEL: atomic_nand_i32_global: +; GFX11: ; %bb.0: +; 
GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b32 v2, v[0:1], off +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11: .LBB1_1:; %atomicrmw.start +; GFX11: v_mov_b32_e32 v3, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_not_b32_e32 v2, v3 +; GFX11-NEXT: v_or_b32_e32 v2, -5, v2 +; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: buffer_gl1_inv +; GFX11-NEXT: buffer_gl0_inv +; +; GFX12-LABEL: atomic_nand_i32_global: +; GFX12: ;
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -0,0 +1,577 @@ +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch,+precise-memory -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s jwanggit86 wrote: Done. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -2326,6 +2326,20 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } #endif +if (ST->isPreciseMemoryEnabled()) { + AMDGPU::Waitcnt Wait; + if (WCG == &WCGPreGFX12) +Wait = AMDGPU::Waitcnt(0, 0, 0, 0); jwanggit86 wrote: The option `amdgpu-waitcnt-forcezero` appears to force an s_waitcnt(0) before EVERY instruction. With this example, ``` define void @syncscope_workgroup_nortn(ptr %addr, float %val) { %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst ret void } ``` The .s is as follows: ``` ; %bb.0: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) flat_load_dword v4, v[0:1] s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) s_mov_b64 s[4:5], 0 .LBB0_1:; %atomicrmw.start ; =>This Inner Loop Header: Depth=1 s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) v_add_f32_e32 v3, v4, v2 s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) flat_atomic_cmpswap v3, v[0:1], v[3:4] glc s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) v_cmp_eq_u32_e32 vcc, v3, v4 s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) s_or_b64 s[4:5], vcc, s[4:5] s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) v_mov_b32_e32 v4, v3 s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) s_andn2_b64 exec, exec, s[4:5] s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) s_cbranch_execnz .LBB0_1 ; %bb.2:; %atomicrmw.end s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) s_or_b64 exec, exec, s[4:5] s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) s_setpc_b64 s[30:31] ``` So, it doesn't seem to be helpful for either this issue or [issue#66](https://github.com/ROCm/ROCm-CompilerSupport/issues/66). https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -2326,6 +2326,20 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } #endif +if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { + AMDGPU::Waitcnt Wait; + if (ST->hasExtendedWaitCounts()) +Wait = AMDGPU::Waitcnt(0, 0, 0, 0, 0, 0, 0); + else +Wait = AMDGPU::Waitcnt(0, 0, 0, 0); + + if (!Inst.mayStore()) +Wait.StoreCnt = ~0u; jwanggit86 wrote: Code updated as suggested. Testfile includes case for both atomic-with-ret and atomic-no-ret. However, for the following case, even though `ds_add_u32` is atomic-no-ret, the Waitcnt for StoreCnt is set to ~0u after the call of `ScoreBrackets.simplifyWaitcnt(Wait)`. Therefore, no s_waitcnt for the StoreCnt is generated after the `ds_add_u32`. ``` define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) { %unused = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst ret void } ``` The code for GFX1100 is: ``` ; GFX11: ds_add_u32 v0, v1 ; GFX11-NEXT:s_waitcnt lgkmcnt(0) ; GFX11-NEXT:buffer_gl0_inv ``` Pls let me know if this looks correct. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
https://github.com/jwanggit86 edited https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -0,0 +1,618 @@ +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+enable-flat-scratch,+precise-memory < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX11 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX12 + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic (atomic with return) +; +; GFX90A-LABEL: syncscope_workgroup_nortn: jwanggit86 wrote: Done. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -2326,6 +2326,20 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF, } #endif +if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore()) { + AMDGPU::Waitcnt Wait; + if (ST->hasExtendedWaitCounts()) +Wait = AMDGPU::Waitcnt(0, 0, 0, 0, 0, 0, 0); + else +Wait = AMDGPU::Waitcnt(0, 0, 0, 0); + + if (!Inst.mayStore()) +Wait.StoreCnt = ~0u; jwanggit86 wrote: Thanks! Is there anything else that should be addressed? https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -0,0 +1,1413 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+enable-flat-scratch,+precise-memory < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX11 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX12 + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic (atomic with return) +; +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX9-LABEL: syncscope_workgroup_nortn: +; GFX9: ; %bb.0: +; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT:flat_load_dword v4, v[0:1] +; GFX9-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT:s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT:v_add_f32_e32 v3, v4, v2 +; GFX9-NEXT:flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX9-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT:v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT:s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT:v_mov_b32_e32 v4, v3 +; GFX9-NEXT:s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT:s_cbranch_execnz .LBB0_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT:s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT:s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT:flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT:s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start 
+; GFX90A-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT:v_add_f32_e32 v4, v5, v2 +; GFX90A-NEXT:flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT:v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT:s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT:v_mov_b32_e32 v5, v3 +; GFX90A-NEXT:s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT:s_cbranch_execnz .LBB0_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT:s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT:s_setpc_b64 s[30:31] +; +; GFX10-LABEL: syncscope_workgroup_nortn: +; GFX10: ; %bb.0: +; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT:flat_load_dword v4, v[0:1] +; GFX10-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT:s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX10-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT:v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT:s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT:flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT:buffer_gl0_inv +; GFX10-NEXT:v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT:v_mov_b32_e32 v4, v3 +; GFX10-NEXT:s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT:s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT:s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT:s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT:s_setpc_b64 s[30:31] +; +; GFX9-FLATSCR-LABEL: syncscope_workgroup_nortn: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT:flat_load_dword v4, v[0:1] +; GFX9-FLATSCR-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT:s_mov_b64 s[0:1], 0 +; GFX9-FLATSCR-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-FLATSCR-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX9-FLATSCR-NEXT:v_add_f32_e32 v3, v4, v2 +; GFX9-FLATSCR-NEXT:flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX9-FLATSCR-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT:v_cmp_eq_u32_e32 vcc, v3, v4 +; 
GFX9-FLATSCR-NEXT:s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-FLATSCR-NEXT:v_mov_b32_e32 v4, v3 +; GFX9-FLATSCR-NEXT:s_andn2_b64 exec, exec, s[0:1] +; GFX9-FLATSCR-NEXT:s_cbranch_execnz .LBB0_1 +; GFX9-FLATSCR-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-FLATSCR-NEXT:s_or_b64 exec, exec, s[0:1] +; GFX9-FLATSCR-NEXT:s_setpc_b64 s[30:31] +; +; GFX11-LABEL: syncscope_workgroup_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT:flat_load_b32 v4, v[0:1] +; GFX11-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT:s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX11-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT:s_delay_al
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -0,0 +1,1413 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+enable-flat-scratch,+precise-memory < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX11 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX12 + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic (atomic with return) +; +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX9-LABEL: syncscope_workgroup_nortn: +; GFX9: ; %bb.0: +; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT:flat_load_dword v4, v[0:1] +; GFX9-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT:s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT:v_add_f32_e32 v3, v4, v2 +; GFX9-NEXT:flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX9-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT:v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX9-NEXT:s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT:v_mov_b32_e32 v4, v3 +; GFX9-NEXT:s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT:s_cbranch_execnz .LBB0_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT:s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT:s_setpc_b64 s[30:31] +; +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT:flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT:s_mov_b64 s[4:5], 0 +; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start 
+; GFX90A-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX90A-NEXT:v_add_f32_e32 v4, v5, v2 +; GFX90A-NEXT:flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A-NEXT:v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX90A-NEXT:s_or_b64 s[4:5], vcc, s[4:5] +; GFX90A-NEXT:v_mov_b32_e32 v5, v3 +; GFX90A-NEXT:s_andn2_b64 exec, exec, s[4:5] +; GFX90A-NEXT:s_cbranch_execnz .LBB0_1 +; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX90A-NEXT:s_or_b64 exec, exec, s[4:5] +; GFX90A-NEXT:s_setpc_b64 s[30:31] +; +; GFX10-LABEL: syncscope_workgroup_nortn: +; GFX10: ; %bb.0: +; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT:flat_load_dword v4, v[0:1] +; GFX10-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT:s_mov_b32 s4, 0 +; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX10-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX10-NEXT:v_add_f32_e32 v3, v4, v2 +; GFX10-NEXT:s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT:flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX10-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT:buffer_gl0_inv +; GFX10-NEXT:v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX10-NEXT:v_mov_b32_e32 v4, v3 +; GFX10-NEXT:s_or_b32 s4, vcc_lo, s4 +; GFX10-NEXT:s_andn2_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT:s_cbranch_execnz .LBB0_1 +; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX10-NEXT:s_or_b32 exec_lo, exec_lo, s4 +; GFX10-NEXT:s_setpc_b64 s[30:31] +; +; GFX9-FLATSCR-LABEL: syncscope_workgroup_nortn: +; GFX9-FLATSCR: ; %bb.0: +; GFX9-FLATSCR-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT:flat_load_dword v4, v[0:1] +; GFX9-FLATSCR-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT:s_mov_b64 s[0:1], 0 +; GFX9-FLATSCR-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX9-FLATSCR-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX9-FLATSCR-NEXT:v_add_f32_e32 v3, v4, v2 +; GFX9-FLATSCR-NEXT:flat_atomic_cmpswap v3, v[0:1], v[3:4] glc +; GFX9-FLATSCR-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-FLATSCR-NEXT:v_cmp_eq_u32_e32 vcc, v3, v4 +; 
GFX9-FLATSCR-NEXT:s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-FLATSCR-NEXT:v_mov_b32_e32 v4, v3 +; GFX9-FLATSCR-NEXT:s_andn2_b64 exec, exec, s[0:1] +; GFX9-FLATSCR-NEXT:s_cbranch_execnz .LBB0_1 +; GFX9-FLATSCR-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-FLATSCR-NEXT:s_or_b64 exec, exec, s[0:1] +; GFX9-FLATSCR-NEXT:s_setpc_b64 s[30:31] +; +; GFX11-LABEL: syncscope_workgroup_nortn: +; GFX11: ; %bb.0: +; GFX11-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT:flat_load_b32 v4, v[0:1] +; GFX11-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX11-NEXT:s_mov_b32 s0, 0 +; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start +; GFX11-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX11-NEXT:s_delay_al
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
jwanggit86 wrote: @jayfoad @arsenm Any other comments? https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
jwanggit86 wrote: I thought about having one attribute with 6 numbers. Then you have to provide 6 numbers when using it. In the current design, either the min or the max attribute can be omitted. https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -167,6 +167,10 @@ def FeatureCuMode : SubtargetFeature<"cumode", "Enable CU wavefront execution mode" >; +def FeaturePreciseMemory jwanggit86 wrote: The name was the result of some discussions last year. I've forwarded you the email. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -605,12 +606,197 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { bool IsNonTemporal) const override; }; +class SIPreciseMemorySupport { jwanggit86 wrote: I don't have strong objection to merging with CacheControl, but would like to get Tony's opinion. @t-tye Tony, what do you think? https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -605,12 +606,197 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { bool IsNonTemporal) const override; }; +class SIPreciseMemorySupport { +protected: + const GCNSubtarget &ST; + const SIInstrInfo *TII = nullptr; + + IsaVersion IV; + + SIPreciseMemorySupport(const GCNSubtarget &ST) : ST(ST) { +TII = ST.getInstrInfo(); +IV = getIsaVersion(ST.getCPU()); + } + +public: + static std::unique_ptr create(const GCNSubtarget &ST); + + virtual bool handleNonAtomic(MachineBasicBlock::iterator &MI) = 0; + /// Handles atomic instruction \p MI with \p ret indicating whether \p MI + /// returns a result. + virtual bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) = 0; +}; + +class SIGfx9PreciseMemorySupport : public SIPreciseMemorySupport { +public: + SIGfx9PreciseMemorySupport(const GCNSubtarget &ST) + : SIPreciseMemorySupport(ST) {} + bool handleNonAtomic(MachineBasicBlock::iterator &MI) override; + bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) override; +}; + +class SIGfx10And11PreciseMemorySupport : public SIPreciseMemorySupport { +public: + SIGfx10And11PreciseMemorySupport(const GCNSubtarget &ST) + : SIPreciseMemorySupport(ST) {} + bool handleNonAtomic(MachineBasicBlock::iterator &MI) override; + bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) override; +}; + +std::unique_ptr +SIPreciseMemorySupport::create(const GCNSubtarget &ST) { + GCNSubtarget::Generation Generation = ST.getGeneration(); + if (Generation < AMDGPUSubtarget::GFX10) jwanggit86 wrote: Not sure this is required for GFX12. @t-tye Tony, is this required for GFX12? We didn't discuss this for GFX12. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -605,12 +606,197 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { bool IsNonTemporal) const override; }; +class SIPreciseMemorySupport { jwanggit86 wrote: Ok, will merge with CacheControl. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -605,12 +606,197 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { bool IsNonTemporal) const override; }; +class SIPreciseMemorySupport { +protected: + const GCNSubtarget &ST; + const SIInstrInfo *TII = nullptr; + + IsaVersion IV; + + SIPreciseMemorySupport(const GCNSubtarget &ST) : ST(ST) { +TII = ST.getInstrInfo(); +IV = getIsaVersion(ST.getCPU()); + } + +public: + static std::unique_ptr create(const GCNSubtarget &ST); + + virtual bool handleNonAtomic(MachineBasicBlock::iterator &MI) = 0; + /// Handles atomic instruction \p MI with \p ret indicating whether \p MI + /// returns a result. + virtual bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) = 0; +}; + +class SIGfx9PreciseMemorySupport : public SIPreciseMemorySupport { +public: + SIGfx9PreciseMemorySupport(const GCNSubtarget &ST) + : SIPreciseMemorySupport(ST) {} + bool handleNonAtomic(MachineBasicBlock::iterator &MI) override; + bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) override; +}; + +class SIGfx10And11PreciseMemorySupport : public SIPreciseMemorySupport { +public: + SIGfx10And11PreciseMemorySupport(const GCNSubtarget &ST) + : SIPreciseMemorySupport(ST) {} + bool handleNonAtomic(MachineBasicBlock::iterator &MI) override; + bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) override; +}; + +std::unique_ptr +SIPreciseMemorySupport::create(const GCNSubtarget &ST) { + GCNSubtarget::Generation Generation = ST.getGeneration(); + if (Generation < AMDGPUSubtarget::GFX10) +return std::make_unique(ST); + return std::make_unique(ST); +} + +bool SIGfx9PreciseMemorySupport ::handleNonAtomic( +MachineBasicBlock::iterator &MI) { + assert(MI->mayLoadOrStore()); + + MachineInstr &Inst = *MI; + AMDGPU::Waitcnt Wait; + + if (TII->isSMRD(Inst)) { // scalar +if (Inst.mayStore()) + return false; +Wait.DsCnt = 0; // LgkmCnt + } else {// vector +if (Inst.mayLoad()) { // vector load + if (TII->isVMEM(Inst)) {// VMEM load +Wait.LoadCnt = 0; // VmCnt + } else if 
(TII->isFLAT(Inst)) { // Flat load +Wait.LoadCnt = 0; // VmCnt +Wait.DsCnt = 0; // LgkmCnt + } else {// LDS load +Wait.DsCnt = 0; // LgkmCnt + } +} else { // vector store + if (TII->isVMEM(Inst)) {// VMEM store +Wait.LoadCnt = 0; // VmCnt + } else if (TII->isFLAT(Inst)) { // Flat store +Wait.LoadCnt = 0; // VmCnt +Wait.DsCnt = 0; // LgkmCnt + } else { +Wait.DsCnt = 0; // LDS store; LgkmCnt + } +} + } + + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + MachineBasicBlock &MBB = *MI->getParent(); + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + --MI; + return true; +} + +bool SIGfx9PreciseMemorySupport ::handleAtomic(MachineBasicBlock::iterator &MI, + bool ret) { + assert(MI->mayLoadOrStore()); + + AMDGPU::Waitcnt Wait; + + Wait.LoadCnt = 0; // VmCnt + Wait.DsCnt = 0; // LgkmCnt + + unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait); + MachineBasicBlock &MBB = *MI->getParent(); + BuildMI(MBB, ++MI, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Enc); + --MI; + return true; +} + +bool SIGfx10And11PreciseMemorySupport ::handleNonAtomic( +MachineBasicBlock::iterator &MI) { + assert(MI->mayLoadOrStore()); + + MachineInstr &Inst = *MI; + AMDGPU::Waitcnt Wait; + + bool BuildWaitCnt = true; + bool BuildVsCnt = false; + + if (TII->isSMRD(Inst)) { // scalar +if (Inst.mayStore()) + return false; +Wait.DsCnt = 0; // LgkmCnt + } else {// vector +if (Inst.mayLoad()) { // vector load + if (TII->isVMEM(Inst)) {// VMEM load +Wait.LoadCnt = 0; // VmCnt + } else if (TII->isFLAT(Inst)) { // Flat load +Wait.LoadCnt = 0; // VmCnt +Wait.DsCnt = 0; // LgkmCnt + } else {// LDS load +Wait.DsCnt = 0; // LgkmCnt + } +} + +// For some instructions, mayLoad() and mayStore() can be both true. 
+if (Inst.mayStore()) { // vector store; an instruction can be both + // load/store + if (TII->isVMEM(Inst)) { // VMEM store +if (!Inst.mayLoad()) + BuildWaitCnt = false; +BuildVsCnt = true; + } else if (TII->isFLAT(Inst)) { // Flat store +Wait.DsCnt = 0; // LgkmCnt +BuildVsCnt = true; + } else { +Wait.DsCnt = 0; // LDS store; LgkmCnt + } +} + } + + MachineBasicBlock &MBB = *MI->getParent(); + if (BuildWaitCnt) { +unsign
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/79035 >From 5c088a59bd36df40bae9a3a712f3994feded359d Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Mon, 22 Jan 2024 12:43:27 -0600 Subject: [PATCH 1/3] [AMDGPU] Adding the amdgpu-num-work-groups function attribute A new function attribute named amdgpu-num-work-groups is added. This attribute, which consists of three integers, allows programmers to let the compiler know the number of workgroups to be launched in each of the three dimensions and do optimizations based on that information. --- clang/include/clang/Basic/Attr.td | 7 ++ clang/include/clang/Basic/AttrDocs.td | 24 +++ clang/lib/CodeGen/Targets/AMDGPU.cpp | 13 clang/lib/Sema/SemaDeclAttr.cpp | 22 +++ ...a-attribute-supported-attributes-list.test | 1 + .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 8 +++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp| 5 ++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 + .../Target/AMDGPU/SIMachineFunctionInfo.cpp | 2 + .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 10 +++ .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 53 +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 19 ++ .../AMDGPU/attr-amdgpu-num-work-groups.ll | 65 +++ 13 files changed, 232 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 58838b01b4fd7c..1b4718258d91e6 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -2031,6 +2031,13 @@ def AMDGPUNumVGPR : InheritableAttr { let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">; } +def AMDGPUNumWorkGroups : InheritableAttr { + let Spellings = [Clang<"amdgpu_num_work_groups", 0>]; + let Args = [UnsignedArgument<"NumWorkGroupsX">, UnsignedArgument<"NumWorkGroupsY">, UnsignedArgument<"NumWorkGroupsZ">]; + let Documentation = [AMDGPUNumWorkGroupsDocs]; + let Subjects = SubjectList<[Function], ErrorDiag, "kernel 
functions">; +} + def AMDGPUKernelCall : DeclOrTypeAttr { let Spellings = [Clang<"amdgpu_kernel">]; let Documentation = [Undocumented]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index e02a1201e2ad79..e8fd10587a8022 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -2705,6 +2705,30 @@ An error will be given if: }]; } +def AMDGPUNumWorkGroupsDocs : Documentation { + let Category = DocCatAMDGPUAttributes; + let Content = [{ +The number of work groups specifies the number of work groups when the kernel +is dispatched. + +Clang supports the +``__attribute__((amdgpu_num_work_groups(, , )))`` attribute for the +AMDGPU target. This attribute may be attached to a kernel function definition +and is an optimization hint. + + parameter specifies the maximum number of work groups in the x dimension. +Similarly and are for the y and z dimensions respectively. + +If specified, the AMDGPU target backend might be able to produce better machine +code. + +An error will be given if: + - Specified values violate subtarget specifications; + - Specified values are not compatible with values provided through other +attributes. 
+ }]; +} + def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> { let Content = [{ Clang supports several different calling conventions, depending on the target diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index 03ac6b78598fc8..93321efd26462c 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -356,6 +356,19 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes( if (NumVGPR != 0) F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR)); } + + if (const auto *Attr = FD->getAttr()) { +uint32_t X = Attr->getNumWorkGroupsX(); +uint32_t Y = Attr->getNumWorkGroupsY(); +uint32_t Z = Attr->getNumWorkGroupsZ(); + +if (X != 0 && Y != 0 && Z != 0) { + std::string AttrVal = llvm::utostr(X) + std::string(", ") + +llvm::utostr(Y) + std::string(", ") + +llvm::utostr(Z); + F->addFnAttr("amdgpu-num-work-groups", AttrVal); +} + } } /// Emits control constants used to change per-architecture behaviour in the diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 069571fcf78641..98d1726bb3e0b8 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -8069,6 +8069,25 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) { D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR)); } +static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D, +
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/79035 >From 5c088a59bd36df40bae9a3a712f3994feded359d Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Mon, 22 Jan 2024 12:43:27 -0600 Subject: [PATCH 1/4] [AMDGPU] Adding the amdgpu-num-work-groups function attribute A new function attribute named amdgpu-num-work-groups is added. This attribute, which consists of three integers, allows programmers to let the compiler know the number of workgroups to be launched in each of the three dimensions and do optimizations based on that information. --- clang/include/clang/Basic/Attr.td | 7 ++ clang/include/clang/Basic/AttrDocs.td | 24 +++ clang/lib/CodeGen/Targets/AMDGPU.cpp | 13 clang/lib/Sema/SemaDeclAttr.cpp | 22 +++ ...a-attribute-supported-attributes-list.test | 1 + .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 8 +++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp| 5 ++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 + .../Target/AMDGPU/SIMachineFunctionInfo.cpp | 2 + .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 10 +++ .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 53 +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 19 ++ .../AMDGPU/attr-amdgpu-num-work-groups.ll | 65 +++ 13 files changed, 232 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 58838b01b4fd7c..1b4718258d91e6 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -2031,6 +2031,13 @@ def AMDGPUNumVGPR : InheritableAttr { let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">; } +def AMDGPUNumWorkGroups : InheritableAttr { + let Spellings = [Clang<"amdgpu_num_work_groups", 0>]; + let Args = [UnsignedArgument<"NumWorkGroupsX">, UnsignedArgument<"NumWorkGroupsY">, UnsignedArgument<"NumWorkGroupsZ">]; + let Documentation = [AMDGPUNumWorkGroupsDocs]; + let Subjects = SubjectList<[Function], ErrorDiag, "kernel 
functions">; +} + def AMDGPUKernelCall : DeclOrTypeAttr { let Spellings = [Clang<"amdgpu_kernel">]; let Documentation = [Undocumented]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index e02a1201e2ad79..e8fd10587a8022 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -2705,6 +2705,30 @@ An error will be given if: }]; } +def AMDGPUNumWorkGroupsDocs : Documentation { + let Category = DocCatAMDGPUAttributes; + let Content = [{ +The number of work groups specifies the number of work groups when the kernel +is dispatched. + +Clang supports the +``__attribute__((amdgpu_num_work_groups(, , )))`` attribute for the +AMDGPU target. This attribute may be attached to a kernel function definition +and is an optimization hint. + + parameter specifies the maximum number of work groups in the x dimension. +Similarly and are for the y and z dimensions respectively. + +If specified, the AMDGPU target backend might be able to produce better machine +code. + +An error will be given if: + - Specified values violate subtarget specifications; + - Specified values are not compatible with values provided through other +attributes. 
+ }]; +} + def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> { let Content = [{ Clang supports several different calling conventions, depending on the target diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index 03ac6b78598fc8..93321efd26462c 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -356,6 +356,19 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes( if (NumVGPR != 0) F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR)); } + + if (const auto *Attr = FD->getAttr()) { +uint32_t X = Attr->getNumWorkGroupsX(); +uint32_t Y = Attr->getNumWorkGroupsY(); +uint32_t Z = Attr->getNumWorkGroupsZ(); + +if (X != 0 && Y != 0 && Z != 0) { + std::string AttrVal = llvm::utostr(X) + std::string(", ") + +llvm::utostr(Y) + std::string(", ") + +llvm::utostr(Z); + F->addFnAttr("amdgpu-num-work-groups", AttrVal); +} + } } /// Emits control constants used to change per-architecture behaviour in the diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 069571fcf78641..98d1726bb3e0b8 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -8069,6 +8069,25 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) { D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR)); } +static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D, +
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
@@ -356,6 +356,19 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes( if (NumVGPR != 0) F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR)); } + + if (const auto *Attr = FD->getAttr()) { +uint32_t X = Attr->getNumWorkGroupsX(); +uint32_t Y = Attr->getNumWorkGroupsY(); +uint32_t Z = Attr->getNumWorkGroupsZ(); + +if (X != 0 && Y != 0 && Z != 0) { jwanggit86 wrote: My understanding is that 0 is not allowed. If any of the 3 numbers is 0 the attribute is rejected. https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/79236 >From 9c40b1151b0673430ff53eb121784724a5b090e5 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Tue, 23 Jan 2024 19:19:00 -0600 Subject: [PATCH 1/3] [AMDGPU] Emit a waitcnt instruction after each memory instruction This patch introduces a new command-line option for clang, namely, amdgpu-precise-mem-op. When this option is specified, a waitcnt instruction is generated after each memory load/store instruction. The counter values are always 0, but which counters are involved depends on the memory instruction. --- clang/include/clang/Driver/Options.td | 4 + clang/test/Driver/amdgpu-features.c | 6 + llvm/lib/Target/AMDGPU/AMDGPU.td | 4 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 + llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 79 +++ .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll | 199 ++ 6 files changed, 295 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 7f4fa33748faca..d570786534b361 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4796,6 +4796,10 @@ defm tgsplit : SimpleMFlag<"tgsplit", "Enable", "Disable", defm wavefrontsize64 : SimpleMFlag<"wavefrontsize64", "Specify wavefront size 64", "Specify wavefront size 32", " mode (AMDGPU only)">; +defm amdgpu_precise_memory_op +: SimpleMFlag<"amdgpu-precise-memory-op", "Enable", "Disable", + " precise memory mode (AMDGPU only)", + m_amdgpu_Features_Group>; defm unsafe_fp_atomics : BoolOption<"m", "unsafe-fp-atomics", TargetOpts<"AllowAMDGPUUnsafeFPAtomics">, DefaultFalse, diff --git a/clang/test/Driver/amdgpu-features.c b/clang/test/Driver/amdgpu-features.c index a516bc6b7ff200..57d31ccedd8783 100644 --- a/clang/test/Driver/amdgpu-features.c +++ b/clang/test/Driver/amdgpu-features.c @@ -32,3 +32,9 @@ // RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mno-cumode 
%s 2>&1 | FileCheck --check-prefix=NO-CUMODE %s // NO-CUMODE: "-target-feature" "-cumode" + +// RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mamdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=PREC-MEM %s +// PREC-MEM: "-target-feature" "+amdgpu-precise-memory-op" + +// RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mno-amdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=NO-PREC-MEM %s +// NO-PREC-MEM: "-target-feature" "-amdgpu-precise-memory-op" diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index cb29d5d9475981..c39cc947702359 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -167,6 +167,10 @@ def FeatureCuMode : SubtargetFeature<"cumode", "Enable CU wavefront execution mode" >; +def FeaturePreciseMemory +: SubtargetFeature<"amdgpu-precise-memory-op", "EnablePreciseMemory", + "true", "Enable precise memory mode">; + def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", "SGPRInitBug", "true", diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 8019b98b1c68d6..b69df21f785985 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -87,6 +87,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool EnableTgSplit = false; bool EnableCuMode = false; bool TrapHandler = false; + bool EnablePreciseMemory = false; // Used as options. 
bool EnableLoadStoreOpt = false; @@ -592,6 +593,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return EnableCuMode; } + bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; } + bool hasFlatAddressSpace() const { return FlatAddressSpace; } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 84b9330ef9633e..93cdceb37bd501 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -17,6 +17,7 @@ #include "AMDGPUMachineModuleInfo.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/BitmaskEnum.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -24,6 +25,8 @@ #include "llvm/Support/AtomicOrdering.h" #include "llvm/TargetParser/TargetParser.h" +#include + using namespace llvm; using namespace llvm::AMDGPU; @@ -641,6 +644,9 @@ class SIMemoryLegalizer final : public MachineFunctionPass { bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI); + bool GFX9InsertWaitcntForPreciseMem(MachineFunction &MF); + bool GFX10And11InsertWaitcntForPreciseMem(MachineFunction &MF); + public:
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -605,12 +606,197 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { bool IsNonTemporal) const override; }; +class SIPreciseMemorySupport { jwanggit86 wrote: Merged with SICacheControl. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -167,6 +167,10 @@ def FeatureCuMode : SubtargetFeature<"cumode", "Enable CU wavefront execution mode" >; +def FeaturePreciseMemory jwanggit86 wrote: As it is, we have a clang command-line option "-mamdgpu-precise-memory-op". When specified, "+amdgpu-precise-memory-op" would be passed to the backend as a target feature. So if the backend uses a different name for this feature, e.g., dropping the 'amdgpu' prefix as you suggested, the target feature passed by clang will be considered invalid and silently ignored. In short, my understanding is that if the prefix is dropped in the backend (in `AMDGPU.td`), then the clang command-line option also has to drop it. If you know how to keep it in clang but drop it in the backend, pls let me know. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -0,0 +1,362 @@ +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX90A +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+amdgpu-precise-memory-op < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch,+amdgpu-precise-memory-op -amdgpu-use-divergent-register-indexing < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s jwanggit86 wrote: Tests for gfx11 added. Gfx12 is on-going. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -605,12 +606,197 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { bool IsNonTemporal) const override; }; +class SIPreciseMemorySupport { +protected: + const GCNSubtarget &ST; + const SIInstrInfo *TII = nullptr; + + IsaVersion IV; + + SIPreciseMemorySupport(const GCNSubtarget &ST) : ST(ST) { +TII = ST.getInstrInfo(); +IV = getIsaVersion(ST.getCPU()); + } + +public: + static std::unique_ptr create(const GCNSubtarget &ST); + + virtual bool handleNonAtomic(MachineBasicBlock::iterator &MI) = 0; + /// Handles atomic instruction \p MI with \p ret indicating whether \p MI + /// returns a result. + virtual bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) = 0; +}; + +class SIGfx9PreciseMemorySupport : public SIPreciseMemorySupport { +public: + SIGfx9PreciseMemorySupport(const GCNSubtarget &ST) + : SIPreciseMemorySupport(ST) {} + bool handleNonAtomic(MachineBasicBlock::iterator &MI) override; + bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) override; +}; + +class SIGfx10And11PreciseMemorySupport : public SIPreciseMemorySupport { +public: + SIGfx10And11PreciseMemorySupport(const GCNSubtarget &ST) + : SIPreciseMemorySupport(ST) {} + bool handleNonAtomic(MachineBasicBlock::iterator &MI) override; + bool handleAtomic(MachineBasicBlock::iterator &MI, bool ret) override; +}; + +std::unique_ptr +SIPreciseMemorySupport::create(const GCNSubtarget &ST) { + GCNSubtarget::Generation Generation = ST.getGeneration(); + if (Generation < AMDGPUSubtarget::GFX10) jwanggit86 wrote: Pls take a look at the updated code. I'll be working on gfx12 in the meantime. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
@@ -2705,6 +2705,30 @@ An error will be given if: }]; } +def AMDGPUNumWorkGroupsDocs : Documentation { + let Category = DocCatAMDGPUAttributes; + let Content = [{ +The number of work groups specifies the number of work groups when the kernel +is dispatched. + +Clang supports the +``__attribute__((amdgpu_num_work_groups(, , )))`` attribute for the +AMDGPU target. This attribute may be attached to a kernel function definition jwanggit86 wrote: Would it be better to say "This attribute may be attached to HIP or OpenCL kernel function..." ? https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
@@ -194,3 +204,105 @@ __global__ void non_cexpr_waves_per_eu_2() {} // expected-error@+1{{'amdgpu_waves_per_eu' attribute requires parameter 1 to be an integer constant}} __attribute__((amdgpu_waves_per_eu(2, ipow2(2 __global__ void non_cexpr_waves_per_eu_2_4() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires exactly 3 arguments}} +__attribute__((amdgpu_max_num_work_groups(32))) +__global__ void max_num_work_groups_32() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires exactly 3 arguments}} +__attribute__((amdgpu_max_num_work_groups(32, 1))) +__global__ void max_num_work_groups_32_1() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires exactly 3 arguments}} +__attribute__((amdgpu_max_num_work_groups(32, 1, 1, 1))) +__global__ void max_num_work_groups_32_1_1_1() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 0 to be an integer constant}} +__attribute__((amdgpu_max_num_work_groups(ipow2(5), 1, 1))) +__global__ void max_num_work_groups_32_1_1_non_int_arg0() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 1 to be an integer constant}} +__attribute__((amdgpu_max_num_work_groups(32, "1", 1))) +__global__ void max_num_work_groups_32_1_1_non_int_arg1() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires a non-negative integral compile time constant expression}} +__attribute__((amdgpu_max_num_work_groups(-32, 1, 1))) +__global__ void max_num_work_groups_32_1_1_neg_int_arg0() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires a non-negative integral compile time constant expression}} +__attribute__((amdgpu_max_num_work_groups(32, -1, 1))) +__global__ void max_num_work_groups_32_1_1_neg_int_arg1() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires a non-negative integral compile time constant expression}} +__attribute__((amdgpu_max_num_work_groups(32, 
1, -1))) +__global__ void max_num_work_groups_32_1_1_neg_int_arg2() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute must be greater than 0}} +__attribute__((amdgpu_max_num_work_groups(0, 1, 1))) +__global__ void max_num_work_groups_0_1_1() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute must be greater than 0}} +__attribute__((amdgpu_max_num_work_groups(32, 0, 1))) +__global__ void max_num_work_groups_32_0_1() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute must be greater than 0}} +__attribute__((amdgpu_max_num_work_groups(32, 1, 0))) +__global__ void max_num_work_groups_32_1_0() {} + + +int num_wg_x = 32; +int num_wg_y = 1; +int num_wg_z = 1; +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 0 to be an integer constant}} +__attribute__((amdgpu_max_num_work_groups(num_wg_x, 1, 1))) +__global__ void max_num_work_groups_32_1_1_non_const_arg0() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 1 to be an integer constant}} +__attribute__((amdgpu_max_num_work_groups(32, num_wg_y, 1))) +__global__ void max_num_work_groups_32_1_1_non_const_arg1() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 2 to be an integer constant}} +__attribute__((amdgpu_max_num_work_groups(32, 1, num_wg_z))) +__global__ void max_num_work_groups_32_1_1_non_const_arg2() {} + +const int c_num_wg_x = 32; +__attribute__((amdgpu_max_num_work_groups(c_num_wg_x, 1, 1))) +__global__ void max_num_work_groups_32_1_1_const_arg0() {} + +template +__attribute__((amdgpu_max_num_work_groups(a, 1, 1))) +__global__ void template_a_1_1_max_num_work_groups() {} +template __global__ void template_a_1_1_max_num_work_groups<32>(); + +template +__attribute__((amdgpu_max_num_work_groups(32, a, 1))) +__global__ void template_32_a_1_max_num_work_groups() {} +template __global__ void template_32_a_1_max_num_work_groups<1>(); + +template 
+__attribute__((amdgpu_max_num_work_groups(32, 1, a))) +__global__ void template_32_1_a_max_num_work_groups() {} +template __global__ void template_32_1_a_max_num_work_groups<1>(); + +// expected-error@+3{{'amdgpu_max_num_work_groups' attribute must be greater than 0}} +// expected-note@+4{{in instantiation of}} +template +__attribute__((amdgpu_max_num_work_groups(b, 1, 1))) +__global__ void template_b_1_1_max_num_work_groups() {} +template __global__ void template_b_1_1_max_num_work_groups<0>(); + +// expected-error@+3{{'amdgpu_max_num_work_groups' attribute must be greater than 0}} +// expected-note@+4{{in instantiation of}} +template +__attribute__((amdgpu_max_num_work_groups(32, b, 1))) +__global__ void template_32_b_1_max_num_work_groups() {} +template __global__ void template_32_b_1_max_num_work_groups<0>(); + +// expected-error@+3{{'amdgpu_max_num_work_groups' attribute must be greater than 0}} +// expected-note@+4{{in instantiation of}} +template +__attribute__(
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
@@ -139,6 +139,36 @@ kernel void reqd_work_group_size_32_2_1_flat_work_group_size_16_128() { // CHECK: define{{.*}} amdgpu_kernel void @reqd_work_group_size_32_2_1_flat_work_group_size_16_128() [[FLAT_WORK_GROUP_SIZE_16_128:#[0-9]+]] } +__attribute__((amdgpu_max_num_work_groups(1, 1, 1))) // expected-no-diagnostics +kernel void max_num_work_groups_1_1_1() { +// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_1_1_1() [[MAX_NUM_WORK_GROUPS_1_1_1:#[0-9]+]] +} + +__attribute__((amdgpu_max_num_work_groups(32, 1, 1))) // expected-no-diagnostics +kernel void max_num_work_groups_32_1_1() { +// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_32_1_1() [[MAX_NUM_WORK_GROUPS_32_1_1:#[0-9]+]] +} + +__attribute__((amdgpu_max_num_work_groups(32, 8, 1))) // expected-no-diagnostics +kernel void max_num_work_groups_32_8_1() { +// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_32_8_1() [[MAX_NUM_WORK_GROUPS_32_8_1:#[0-9]+]] +} + +__attribute__((amdgpu_max_num_work_groups(1, 1, 32))) // expected-no-diagnostics +kernel void max_num_work_groups_1_1_32() { +// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_1_1_32() [[MAX_NUM_WORK_GROUPS_1_1_32:#[0-9]+]] +} + +__attribute__((amdgpu_max_num_work_groups(1, 8, 32))) // expected-no-diagnostics +kernel void max_num_work_groups_1_8_32() { +// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_1_8_32() [[MAX_NUM_WORK_GROUPS_1_8_32:#[0-9]+]] +} + +__attribute__((amdgpu_max_num_work_groups(4, 8, 32))) // expected-no-diagnostics +kernel void max_num_work_groups_4_8_32() { +// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_4_8_32() [[MAX_NUM_WORK_GROUPS_4_8_32:#[0-9]+]] +} + jwanggit86 wrote: Added `max_num_work_groups_max_unsigned_int()` and `max_num_work_groups_max_unsigned_int_plus1()` to `clang/test/SemaCUDA/amdgpu-attrs.cu`. 
https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
@@ -0,0 +1,77 @@ +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s 2>&1 | FileCheck --check-prefix=ERROR %s + +; ERROR: error: can't parse integer attribute -1 in amdgpu-max-num-work-groups +define amdgpu_kernel void @empty_max_num_work_groups_neg_num1() #21 { +entry: + ret void +} +attributes #21 = {"amdgpu-max-num-work-groups"="-1,2,3"} + +; ERROR: error: can't parse integer attribute -2 in amdgpu-max-num-work-groups +define amdgpu_kernel void @empty_max_num_work_groups_neg_num2() #22 { +entry: + ret void +} +attributes #22 = {"amdgpu-max-num-work-groups"="1,-2,3"} + +; ERROR: error: can't parse integer attribute -3 in amdgpu-max-num-work-groups +define amdgpu_kernel void @empty_max_num_work_groups_neg_num3() #23 { +entry: + ret void +} +attributes #23 = {"amdgpu-max-num-work-groups"="1,2,-3"} + + +; ERROR: error: can't parse integer attribute 1.0 in amdgpu-max-num-work-groups +define amdgpu_kernel void @empty_max_num_work_groups_non_int1() #31 { +entry: + ret void +} +attributes #31 = {"amdgpu-max-num-work-groups"="1.0,2,3"} + +; ERROR: error: can't parse integer attribute 2.0 in amdgpu-max-num-work-groups +define amdgpu_kernel void @empty_max_num_work_groups_non_int2() #32 { +entry: + ret void +} +attributes #32 = {"amdgpu-max-num-work-groups"="1,2.0,3"} + +; ERROR: error: can't parse integer attribute 3.0 in amdgpu-max-num-work-groups +define amdgpu_kernel void @empty_max_num_work_groups_non_int3() #33 { +entry: + ret void +} +attributes #33 = {"amdgpu-max-num-work-groups"="1,2,3.0"} + +; ERROR: error: can't parse integer attribute 100 in amdgpu-max-num-work-groups +define amdgpu_kernel void @empty_max_num_work_groups_too_large() #41 { +entry: + ret void +} +attributes #41 = {"amdgpu-max-num-work-groups"="100,2,3"} + + +; ERROR: error: attribute amdgpu-max-num-work-groups has incorrect number of integers; expected 3 +define amdgpu_kernel void @empty_max_num_work_groups_1_arg() #51 { +entry: + ret void +} +attributes #51 = 
{"amdgpu-max-num-work-groups"="1"} + +; ERROR: error: attribute amdgpu-max-num-work-groups has incorrect number of integers; expected 3 +define amdgpu_kernel void @empty_max_num_work_groups_2_args() #52 { +entry: + ret void +} +attributes #52 = {"amdgpu-max-num-work-groups"="1,2"} + +; ERROR: error: attribute amdgpu-max-num-work-groups has incorrect number of integers; expected 3 +define amdgpu_kernel void @empty_max_num_work_groups_4_args() #53 { +entry: + ret void +} +attributes #53 = {"amdgpu-max-num-work-groups"="1,2,3,4"} + + + + jwanggit86 wrote: Removed. https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
@@ -137,6 +137,12 @@ Removed Compiler Flags Attribute Changes in Clang -- +- Introduced a new function attribute ``__attribute__((amdgpu_max_num_work_groups(x, y, z)))`` or jwanggit86 wrote: There are existing attributes that have workgroup spelled as two separate words: [flat-work-group-size](https://clang.llvm.org/docs/AttributeReference.html#amdgpu-flat-work-group-size), [reqd_work_group_size](https://docs.xilinx.com/r/2021.1-English/ug1393-vitis-application-acceleration/reqd_work_group_size). Pls let me know if you still want workgroup as one word. https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
@@ -137,6 +137,12 @@ Removed Compiler Flags Attribute Changes in Clang -- +- Introduced a new function attribute ``__attribute__((amdgpu_max_num_work_groups(x, y, z)))`` or jwanggit86 wrote: In the case of flat workgroup size, the LLVM attribute is called `amdgpu-flat-work-group-size`, but the metadata is `.max_flat_workgroup_size`. I suppose we can copy that and change the metadata from `.max_num_work_groups_x` to `.max_num_workgroups_x` and so on. Do you want the LLVM attribute to be changed from `amdgpu-max-num-work-groups` to `amdgpu-max-num-workgroups` as well? Note that in the file `AMDGPUUsage.rst` the word `work-group` with a hyphen is used a lot. https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
https://github.com/jwanggit86 edited https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
https://github.com/jwanggit86 closed https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
@@ -194,3 +204,87 @@ __global__ void non_cexpr_waves_per_eu_2() {} // expected-error@+1{{'amdgpu_waves_per_eu' attribute requires parameter 1 to be an integer constant}} __attribute__((amdgpu_waves_per_eu(2, ipow2(2 __global__ void non_cexpr_waves_per_eu_2_4() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires exactly 3 arguments}} +__attribute__((amdgpu_max_num_work_groups(32))) +__global__ void max_num_work_groups_32() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires exactly 3 arguments}} +__attribute__((amdgpu_max_num_work_groups(32, 1))) +__global__ void max_num_work_groups_32_1() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires exactly 3 arguments}} +__attribute__((amdgpu_max_num_work_groups(32, 1, 1, 1))) +__global__ void max_num_work_groups_32_1_1_1() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 0 to be an integer constant}} +__attribute__((amdgpu_max_num_work_groups(ipow2(5), 1, 1))) +__global__ void max_num_work_groups_32_1_1_non_int_arg0() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 1 to be an integer constant}} +__attribute__((amdgpu_max_num_work_groups(32, "1", 1))) +__global__ void max_num_work_groups_32_1_1_non_int_arg1() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires a non-negative integral compile time constant expression}} +__attribute__((amdgpu_max_num_work_groups(-32, 1, 1))) +__global__ void max_num_work_groups_32_1_1_neg_int_arg0() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires a non-negative integral compile time constant expression}} +__attribute__((amdgpu_max_num_work_groups(32, -1, 1))) +__global__ void max_num_work_groups_32_1_1_neg_int_arg1() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires a non-negative integral compile time constant expression}} +__attribute__((amdgpu_max_num_work_groups(32, 
1, -1))) +__global__ void max_num_work_groups_32_1_1_neg_int_arg2() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute must be greater than 0}} +__attribute__((amdgpu_max_num_work_groups(0, 1, 1))) +__global__ void max_num_work_groups_0_1_1() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute must be greater than 0}} +__attribute__((amdgpu_max_num_work_groups(32, 0, 1))) +__global__ void max_num_work_groups_32_0_1() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute must be greater than 0}} +__attribute__((amdgpu_max_num_work_groups(32, 1, 0))) +__global__ void max_num_work_groups_32_1_0() {} + + +int num_wg_x = 32; +int num_wg_y = 1; +int num_wg_z = 1; +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 0 to be an integer constant}} +__attribute__((amdgpu_max_num_work_groups(num_wg_x, 1, 1))) +__global__ void max_num_work_groups_32_1_1_non_const_arg0() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 1 to be an integer constant}} +__attribute__((amdgpu_max_num_work_groups(32, num_wg_y, 1))) +__global__ void max_num_work_groups_32_1_1_non_const_arg1() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 2 to be an integer constant}} +__attribute__((amdgpu_max_num_work_groups(32, 1, num_wg_z))) +__global__ void max_num_work_groups_32_1_1_non_const_arg2() {} + +const int c_num_wg_x = 32; +__attribute__((amdgpu_max_num_work_groups(c_num_wg_x, 1, 1))) +__global__ void max_num_work_groups_32_1_1_const_arg0() {} + +// expected-error@+2{{'amdgpu_max_num_work_groups' attribute requires parameter 0 to be an integer constant}} jwanggit86 wrote: These tests have been verified. In the case of line 272, because `checkUInt32Argument` (which is used by `handleAMDGPUMaxNumWorkGroupsAttr`) rejects template args (i.e., it checks `isTypeDependent`), the argument `a` would be rejected. 
https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
@@ -8069,6 +8069,67 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) { D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR)); } +// Returns true if error jwanggit86 wrote: (1) Updated `ReleaseNotes.rst` (2) Added codegen testcases in `CodeGenCUDA/amdgpu-kernel-attrs.cu` (3) Removed the comment line as suggested. https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
@@ -8069,6 +8069,67 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) { D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR)); } +// Returns true if error +static bool +checkAMDGPUMaxNumWorkGroupsArguments(Sema &S, Expr *XExpr, Expr *YExpr, + Expr *ZExpr, + const AMDGPUMaxNumWorkGroupsAttr &Attr) { + if (S.DiagnoseUnexpandedParameterPack(XExpr) || + S.DiagnoseUnexpandedParameterPack(YExpr) || + S.DiagnoseUnexpandedParameterPack(ZExpr)) +return true; + + // Accept template arguments for now as they depend on something else. + // We'll get to check them when they eventually get instantiated. + if (XExpr->isValueDependent() || YExpr->isValueDependent() || + ZExpr->isValueDependent()) +return false; + + uint32_t NumWG[3]; + Expr *Exprs[3] = {XExpr, YExpr, ZExpr}; + for (int i = 0; i < 3; i++) { +if (!checkUInt32Argument(S, Attr, Exprs[i], NumWG[i], i, + /*StrictlyUnsigned=*/true)) + return true; +if (NumWG[i] == 0) { + S.Diag(Attr.getLoc(), diag::err_attribute_argument_is_zero) + << &Attr << Exprs[i]->getSourceRange(); + return true; +} + } + + return false; +} + +AMDGPUMaxNumWorkGroupsAttr * +Sema::CreateAMDGPUMaxNumWorkGroupsAttr(const AttributeCommonInfo &CI, + Expr *XExpr, Expr *YExpr, Expr *ZExpr) { + AMDGPUMaxNumWorkGroupsAttr TmpAttr(Context, CI, XExpr, YExpr, ZExpr); + + if (checkAMDGPUMaxNumWorkGroupsArguments(*this, XExpr, YExpr, ZExpr, TmpAttr)) +return nullptr; + + return ::new (Context) + AMDGPUMaxNumWorkGroupsAttr(Context, CI, XExpr, YExpr, ZExpr); +} + +void Sema::addAMDGPUMaxNumWorkGroupsAttr(Decl *D, const AttributeCommonInfo &CI, + Expr *XExpr, Expr *YExpr, + Expr *ZExpr) { + if (auto *Attr = CreateAMDGPUMaxNumWorkGroupsAttr(CI, XExpr, YExpr, ZExpr)) +D->addAttr(Attr); +} + +static void handleAMDGPUMaxNumWorkGroupsAttr(Sema &S, Decl *D, + const ParsedAttr &AL) { + if (AL.getNumArgs() != 3) { jwanggit86 wrote: Removed. 
https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
@@ -607,6 +607,29 @@ static void instantiateDependentAMDGPUWavesPerEUAttr( S.addAMDGPUWavesPerEUAttr(New, Attr, MinExpr, MaxExpr); } +static void instantiateDependentAMDGPUMaxNumWorkGroupsAttr( +Sema &S, const MultiLevelTemplateArgumentList &TemplateArgs, +const AMDGPUMaxNumWorkGroupsAttr &Attr, Decl *New) { + EnterExpressionEvaluationContext Unevaluated( + S, Sema::ExpressionEvaluationContext::ConstantEvaluated); + + ExprResult ResultX = S.SubstExpr(Attr.getMaxNumWorkGroupsX(), TemplateArgs); + if (ResultX.isInvalid()) jwanggit86 wrote: Updated as suggested. https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
jwanggit86 wrote: @arsenm Any comments on the LLVM side? https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
jwanggit86 wrote: @Pierre-vh Any further comments? https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
jwanggit86 wrote: @Pierre-vh Could you pls help review the backend part of this patch? Thanks! https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
jwanggit86 wrote: @jayfoad After trying the patch you provided above, it appears that this feature can indeed be done in SIInsertWaitcnt instead of SIMemoryLegalizer. Code has been updated accordingly. Pls take a look. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
@@ -137,6 +137,11 @@ Removed Compiler Flags Attribute Changes in Clang -- +- Introduced a new function attribute ``__attribute__((amdgpu_max_num_work_groups(x, y, z)))`` or + ``[[clang::amdgpu_max_num_work_groups(x, y, z)]]`` for the AMDGPU target. This attribute can be + attached to HIP or OpenCL kernel function definitions to provide an optimization hint. The parameters + ``x``, ``y``, and ``z`` specify the maximum number of workgroups for the respective dimensions, + and each must be a positive integer. jwanggit86 wrote: Done. https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
@@ -356,6 +356,24 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes( if (NumVGPR != 0) F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR)); } + + if (const auto *Attr = FD->getAttr()) { +uint32_t X = Attr->getMaxNumWorkGroupsX() + ->EvaluateKnownConstInt(M.getContext()) + .getExtValue(); +uint32_t Y = Attr->getMaxNumWorkGroupsY() + ->EvaluateKnownConstInt(M.getContext()) + .getExtValue(); +uint32_t Z = Attr->getMaxNumWorkGroupsZ() + ->EvaluateKnownConstInt(M.getContext()) + .getExtValue(); + +llvm::SmallString<32> AttrVal; +llvm::raw_svector_ostream OS(AttrVal); +OS << X << "," << Y << "," << Z; jwanggit86 wrote: Done. https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
@@ -139,6 +139,36 @@ kernel void reqd_work_group_size_32_2_1_flat_work_group_size_16_128() { // CHECK: define{{.*}} amdgpu_kernel void @reqd_work_group_size_32_2_1_flat_work_group_size_16_128() [[FLAT_WORK_GROUP_SIZE_16_128:#[0-9]+]] } +__attribute__((amdgpu_max_num_work_groups(1, 1, 1))) // expected-no-diagnostics +kernel void max_num_work_groups_1_1_1() { +// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_1_1_1() [[MAX_NUM_WORK_GROUPS_1_1_1:#[0-9]+]] +} + +__attribute__((amdgpu_max_num_work_groups(32, 1, 1))) // expected-no-diagnostics +kernel void max_num_work_groups_32_1_1() { +// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_32_1_1() [[MAX_NUM_WORK_GROUPS_32_1_1:#[0-9]+]] +} + +__attribute__((amdgpu_max_num_work_groups(32, 8, 1))) // expected-no-diagnostics +kernel void max_num_work_groups_32_8_1() { +// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_32_8_1() [[MAX_NUM_WORK_GROUPS_32_8_1:#[0-9]+]] +} + +__attribute__((amdgpu_max_num_work_groups(1, 1, 32))) // expected-no-diagnostics +kernel void max_num_work_groups_1_1_32() { +// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_1_1_32() [[MAX_NUM_WORK_GROUPS_1_1_32:#[0-9]+]] +} + +__attribute__((amdgpu_max_num_work_groups(1, 8, 32))) // expected-no-diagnostics +kernel void max_num_work_groups_1_8_32() { +// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_1_8_32() [[MAX_NUM_WORK_GROUPS_1_8_32:#[0-9]+]] +} + +__attribute__((amdgpu_max_num_work_groups(4, 8, 32))) // expected-no-diagnostics +kernel void max_num_work_groups_4_8_32() { +// CHECK: define{{.*}} amdgpu_kernel void @max_num_work_groups_4_8_32() [[MAX_NUM_WORK_GROUPS_4_8_32:#[0-9]+]] +} + jwanggit86 wrote: According to Brian Sumner, there's no max. So in the implementation the max is just the max value of an unsigned int. https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
@@ -194,3 +204,105 @@ __global__ void non_cexpr_waves_per_eu_2() {} // expected-error@+1{{'amdgpu_waves_per_eu' attribute requires parameter 1 to be an integer constant}} __attribute__((amdgpu_waves_per_eu(2, ipow2(2 __global__ void non_cexpr_waves_per_eu_2_4() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires exactly 3 arguments}} +__attribute__((amdgpu_max_num_work_groups(32))) +__global__ void max_num_work_groups_32() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires exactly 3 arguments}} +__attribute__((amdgpu_max_num_work_groups(32, 1))) +__global__ void max_num_work_groups_32_1() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires exactly 3 arguments}} +__attribute__((amdgpu_max_num_work_groups(32, 1, 1, 1))) +__global__ void max_num_work_groups_32_1_1_1() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 0 to be an integer constant}} +__attribute__((amdgpu_max_num_work_groups(ipow2(5), 1, 1))) +__global__ void max_num_work_groups_32_1_1_non_int_arg0() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 1 to be an integer constant}} +__attribute__((amdgpu_max_num_work_groups(32, "1", 1))) +__global__ void max_num_work_groups_32_1_1_non_int_arg1() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires a non-negative integral compile time constant expression}} +__attribute__((amdgpu_max_num_work_groups(-32, 1, 1))) +__global__ void max_num_work_groups_32_1_1_neg_int_arg0() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires a non-negative integral compile time constant expression}} +__attribute__((amdgpu_max_num_work_groups(32, -1, 1))) +__global__ void max_num_work_groups_32_1_1_neg_int_arg1() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires a non-negative integral compile time constant expression}} +__attribute__((amdgpu_max_num_work_groups(32, 
1, -1))) +__global__ void max_num_work_groups_32_1_1_neg_int_arg2() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute must be greater than 0}} +__attribute__((amdgpu_max_num_work_groups(0, 1, 1))) +__global__ void max_num_work_groups_0_1_1() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute must be greater than 0}} +__attribute__((amdgpu_max_num_work_groups(32, 0, 1))) +__global__ void max_num_work_groups_32_0_1() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute must be greater than 0}} +__attribute__((amdgpu_max_num_work_groups(32, 1, 0))) +__global__ void max_num_work_groups_32_1_0() {} + + +int num_wg_x = 32; +int num_wg_y = 1; +int num_wg_z = 1; +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 0 to be an integer constant}} +__attribute__((amdgpu_max_num_work_groups(num_wg_x, 1, 1))) +__global__ void max_num_work_groups_32_1_1_non_const_arg0() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 1 to be an integer constant}} +__attribute__((amdgpu_max_num_work_groups(32, num_wg_y, 1))) +__global__ void max_num_work_groups_32_1_1_non_const_arg1() {} + +// expected-error@+1{{'amdgpu_max_num_work_groups' attribute requires parameter 2 to be an integer constant}} +__attribute__((amdgpu_max_num_work_groups(32, 1, num_wg_z))) +__global__ void max_num_work_groups_32_1_1_non_const_arg2() {} + +const int c_num_wg_x = 32; +__attribute__((amdgpu_max_num_work_groups(c_num_wg_x, 1, 1))) +__global__ void max_num_work_groups_32_1_1_const_arg0() {} + +template +__attribute__((amdgpu_max_num_work_groups(a, 1, 1))) +__global__ void template_a_1_1_max_num_work_groups() {} +template __global__ void template_a_1_1_max_num_work_groups<32>(); + +template +__attribute__((amdgpu_max_num_work_groups(32, a, 1))) +__global__ void template_32_a_1_max_num_work_groups() {} +template __global__ void template_32_a_1_max_num_work_groups<1>(); + +template 
+__attribute__((amdgpu_max_num_work_groups(32, 1, a))) +__global__ void template_32_1_a_max_num_work_groups() {} +template __global__ void template_32_1_a_max_num_work_groups<1>(); + +// expected-error@+3{{'amdgpu_max_num_work_groups' attribute must be greater than 0}} +// expected-note@+4{{in instantiation of}} +template +__attribute__((amdgpu_max_num_work_groups(b, 1, 1))) +__global__ void template_b_1_1_max_num_work_groups() {} +template __global__ void template_b_1_1_max_num_work_groups<0>(); + +// expected-error@+3{{'amdgpu_max_num_work_groups' attribute must be greater than 0}} +// expected-note@+4{{in instantiation of}} +template +__attribute__((amdgpu_max_num_work_groups(32, b, 1))) +__global__ void template_32_b_1_max_num_work_groups() {} +template __global__ void template_32_b_1_max_num_work_groups<0>(); + +// expected-error@+3{{'amdgpu_max_num_work_groups' attribute must be greater than 0}} +// expected-note@+4{{in instantiation of}} +template +__attribute__(
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
@@ -814,6 +814,15 @@ bool shouldEmitConstantsToTextSection(const Triple &TT); /// to integer. int getIntegerAttribute(const Function &F, StringRef Name, int Default); +/// \returns Unsigned Integer value requested using \p F's \p Name attribute. +/// +/// \returns \p Default if attribute is not present. +/// +/// \returns \p Default and emits error if requested value cannot be converted +/// to integer. +unsigned getUnsignedIntegerAttribute(const Function &F, StringRef Name, jwanggit86 wrote: Removed `getUnsignedIntegerAttribute()`. https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
@@ -0,0 +1,84 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s + jwanggit86 wrote: Created a new test file to test various errors. https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
@@ -0,0 +1,84 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s + +; Attribute not specified. +; CHECK-LABEL: {{^}}empty_no_attribute: +define amdgpu_kernel void @empty_no_attribute() { +entry: + ret void +} + +; Ignore if number of work groups for x dimension is 0. +; CHECK-LABEL: {{^}}empty_max_num_work_groups_x0: +define amdgpu_kernel void @empty_max_num_work_groups_x0() #0 { +entry: + ret void +} +attributes #0 = {"amdgpu-max-num-work-groups"="0,2,3"} + +; Ignore if number of work groups for y dimension is 0. +; CHECK-LABEL: {{^}}empty_max_num_work_groups_y0: +define amdgpu_kernel void @empty_max_num_work_groups_y0() #1 { +entry: + ret void +} +attributes #1 = {"amdgpu-max-num-work-groups"="1,0,3"} + +; Ignore if number of work groups for z dimension is 0. +; CHECK-LABEL: {{^}}empty_max_num_work_groups_z0: +define amdgpu_kernel void @empty_max_num_work_groups_z0() #2 { +entry: + ret void +} +attributes #2 = {"amdgpu-max-num-work-groups"="1,2,0"} + +; CHECK-LABEL: {{^}}empty_max_num_work_groups_1_2_3: +define amdgpu_kernel void @empty_max_num_work_groups_1_2_3() #3 { +entry: + ret void +} +attributes #3 = {"amdgpu-max-num-work-groups"="1,2,3"} + +; CHECK-LABEL: {{^}}empty_max_num_work_groups_1024_1024_1024: +define amdgpu_kernel void @empty_max_num_work_groups_1024_1024_1024() #4 { +entry: + ret void +} +attributes #4 = {"amdgpu-max-num-work-groups"="1024,1024,1024"} + jwanggit86 wrote: See the file `attr-amdgpu-num-work-groups_error_check.ll`. https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -641,6 +644,9 @@ class SIMemoryLegalizer final : public MachineFunctionPass { bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI); + bool GFX9InsertWaitcntForPreciseMem(MachineFunction &MF); jwanggit86 wrote: Sure. Code has been updated as suggested. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -167,6 +167,10 @@ def FeatureCuMode : SubtargetFeature<"cumode", "Enable CU wavefront execution mode" >; +def FeaturePreciseMemory jwanggit86 wrote: End user needs this, so a clang command-line option has been added. There was a lot of discussion on this. I'll forward you the email. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -17,13 +17,16 @@ #include "AMDGPUMachineModuleInfo.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/BitmaskEnum.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/TargetParser/TargetParser.h" +#include jwanggit86 wrote: Removed. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -0,0 +1,199 @@ +; Testing the -amdgpu-precise-memory-op option +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+amdgpu-precise-memory-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+amdgpu-precise-memory-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX90A +; COM: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+amdgpu-precise-memory-op -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global,+enable-flat-scratch,+amdgpu-precise-memory-op -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s + +; from atomicrmw-expand.ll +; covers flat_load, flat_atomic +define void @syncscope_workgroup_nortn(ptr %addr, float %val) { +; GFX90A-LABEL: syncscope_workgroup_nortn: +; GFX90A: ; %bb.0: +; GFX90A: flat_load_dword v5, v[0:1] +; GFX90A-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX90A: .LBB0_1: ; %atomicrmw.start +; GFX90A: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc +; GFX90A-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) + %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst + ret void +} + +; from atomicrmw-nand.ll +; covers global_atomic, global_load +define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind { +; GFX9-LABEL: atomic_nand_i32_global: +; GFX9: ; %bb.0: +; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT:global_load_dword v2, v[0:1], off +; GFX9-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT:s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX9-NEXT:; =>This Inner Loop Header: Depth=1 +; GFX9-NOT: s_waitcnt vmcnt(0) +; GFX9-NEXT:v_mov_b32_e32 v3, v2 +; GFX9-NEXT:v_not_b32_e32 v2, v3 +; GFX9-NEXT:v_or_b32_e32 v2, -5, v2 +; GFX9-NEXT:global_atomic_cmpswap v2, v[0:1], v[2:3], off glc +; GFX9-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT:buffer_wbinvl1_vol +; GFX9-NEXT:v_cmp_eq_u32_e32 vcc, v2, v3 +; 
GFX9-NEXT:s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT:s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT:s_cbranch_execnz .LBB1_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT:s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT:v_mov_b32_e32 v0, v2 +; GFX9-NEXT:s_setpc_b64 s[30:31] + %result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst + ret i32 %result +} + +; from bf16.ll +; covers buffer_load, buffer_store, flat_load, flat_store, global_load, global_store +define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; +; GFX9-LABEL: test_load_store: +; GFX9: ; %bb.0: +; GFX9-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT:global_load_ushort v0, v[0:1], off +; GFX9-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT:global_store_short v[2:3], v0, off +; GFX9-NEXT:s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT:s_setpc_b64 s[30:31] +; +; GFX10-LABEL: test_load_store: +; GFX10: ; %bb.0: +; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT:global_load_ushort v0, v[0:1], off +; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT:s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT:global_store_short v[2:3], v0, off +; GFX10-NEXT:s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT:s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT:s_setpc_b64 s[30:31] + %val = load bfloat, ptr addrspace(1) %in + store bfloat %val, ptr addrspace(1) %out + ret void +} + +; from scratch-simple.ll +; covers scratch_load, scratch_store +; +; GFX9-FLATSCR-LABEL: {{^}}vs_main: +; GFX9-FLATSCR:scratch_store_dwordx4 off, v[{{[0-9:]+}}], +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-FLATSCR:scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +define amdgpu_vs float @vs_main(i32 %idx) { + %v1 = extractelement <81 x float> , i32 %idx jwanggit86 wrote: The function is copied from another test case. The purpose was to generate one or more particular memory instructions. 
In this particular case, "scratch_load" and "scratch_store". https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -2561,6 +2567,70 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, return Changed; } +bool SIMemoryLegalizer::GFX9InsertWaitcntForPreciseMem(MachineFunction &MF) { jwanggit86 wrote: Yes. Code has been rewritten. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -2561,6 +2567,70 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, return Changed; } +bool SIMemoryLegalizer::GFX9InsertWaitcntForPreciseMem(MachineFunction &MF) { + const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + IsaVersion IV = getIsaVersion(ST.getCPU()); + + bool Changed = false; + + for (auto &MBB : MF) { +for (auto MI = MBB.begin(); MI != MBB.end();) { + MachineInstr &Inst = *MI; + ++MI; + if (Inst.mayLoadOrStore() == false) jwanggit86 wrote: Ok. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/79236 >From 9c40b1151b0673430ff53eb121784724a5b090e5 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Tue, 23 Jan 2024 19:19:00 -0600 Subject: [PATCH 1/2] [AMDGPU] Emit a waitcnt instruction after each memory instruction This patch introduces a new command-line option for clang, namely, amdgpu-precise-mem-op. When this option is specified, a waitcnt instruction is generated after each memory load/store instruction. The counter values are always 0, but which counters are involved depends on the memory instruction. --- clang/include/clang/Driver/Options.td | 4 + clang/test/Driver/amdgpu-features.c | 6 + llvm/lib/Target/AMDGPU/AMDGPU.td | 4 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 + llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 79 +++ .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll | 199 ++ 6 files changed, 295 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 7f4fa33748fac..d570786534b36 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4796,6 +4796,10 @@ defm tgsplit : SimpleMFlag<"tgsplit", "Enable", "Disable", defm wavefrontsize64 : SimpleMFlag<"wavefrontsize64", "Specify wavefront size 64", "Specify wavefront size 32", " mode (AMDGPU only)">; +defm amdgpu_precise_memory_op +: SimpleMFlag<"amdgpu-precise-memory-op", "Enable", "Disable", + " precise memory mode (AMDGPU only)", + m_amdgpu_Features_Group>; defm unsafe_fp_atomics : BoolOption<"m", "unsafe-fp-atomics", TargetOpts<"AllowAMDGPUUnsafeFPAtomics">, DefaultFalse, diff --git a/clang/test/Driver/amdgpu-features.c b/clang/test/Driver/amdgpu-features.c index a516bc6b7ff20..57d31ccedd878 100644 --- a/clang/test/Driver/amdgpu-features.c +++ b/clang/test/Driver/amdgpu-features.c @@ -32,3 +32,9 @@ // RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mno-cumode %s 
2>&1 | FileCheck --check-prefix=NO-CUMODE %s // NO-CUMODE: "-target-feature" "-cumode" + +// RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mamdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=PREC-MEM %s +// PREC-MEM: "-target-feature" "+amdgpu-precise-memory-op" + +// RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mno-amdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=NO-PREC-MEM %s +// NO-PREC-MEM: "-target-feature" "-amdgpu-precise-memory-op" diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index cb29d5d947598..c39cc94770235 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -167,6 +167,10 @@ def FeatureCuMode : SubtargetFeature<"cumode", "Enable CU wavefront execution mode" >; +def FeaturePreciseMemory +: SubtargetFeature<"amdgpu-precise-memory-op", "EnablePreciseMemory", + "true", "Enable precise memory mode">; + def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", "SGPRInitBug", "true", diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 8019b98b1c68d..b69df21f78598 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -87,6 +87,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool EnableTgSplit = false; bool EnableCuMode = false; bool TrapHandler = false; + bool EnablePreciseMemory = false; // Used as options. 
bool EnableLoadStoreOpt = false; @@ -592,6 +593,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return EnableCuMode; } + bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; } + bool hasFlatAddressSpace() const { return FlatAddressSpace; } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 84b9330ef9633..93cdceb37bd50 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -17,6 +17,7 @@ #include "AMDGPUMachineModuleInfo.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/BitmaskEnum.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -24,6 +25,8 @@ #include "llvm/Support/AtomicOrdering.h" #include "llvm/TargetParser/TargetParser.h" +#include + using namespace llvm; using namespace llvm::AMDGPU; @@ -641,6 +644,9 @@ class SIMemoryLegalizer final : public MachineFunctionPass { bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI); + bool GFX9InsertWaitcntForPreciseMem(MachineFunction &MF); + bool GFX10And11InsertWaitcntForPreciseMem(MachineFunction &MF); + public: static ch
[llvm] [clang] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/79035 >From 5c088a59bd36df40bae9a3a712f3994feded359d Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Mon, 22 Jan 2024 12:43:27 -0600 Subject: [PATCH 1/2] [AMDGPU] Adding the amdgpu-num-work-groups function attribute A new function attribute named amdgpu-num-work-groups is added. This attribute, which consists of three integers, allows programmers to let the compiler know the number of workgroups to be launched in each of the three dimensions and do optimizations based on that information. --- clang/include/clang/Basic/Attr.td | 7 ++ clang/include/clang/Basic/AttrDocs.td | 24 +++ clang/lib/CodeGen/Targets/AMDGPU.cpp | 13 clang/lib/Sema/SemaDeclAttr.cpp | 22 +++ ...a-attribute-supported-attributes-list.test | 1 + .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 8 +++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp| 5 ++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 + .../Target/AMDGPU/SIMachineFunctionInfo.cpp | 2 + .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 10 +++ .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 53 +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 19 ++ .../AMDGPU/attr-amdgpu-num-work-groups.ll | 65 +++ 13 files changed, 232 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 58838b01b4fd7c..1b4718258d91e6 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -2031,6 +2031,13 @@ def AMDGPUNumVGPR : InheritableAttr { let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">; } +def AMDGPUNumWorkGroups : InheritableAttr { + let Spellings = [Clang<"amdgpu_num_work_groups", 0>]; + let Args = [UnsignedArgument<"NumWorkGroupsX">, UnsignedArgument<"NumWorkGroupsY">, UnsignedArgument<"NumWorkGroupsZ">]; + let Documentation = [AMDGPUNumWorkGroupsDocs]; + let Subjects = SubjectList<[Function], ErrorDiag, "kernel 
functions">; +} + def AMDGPUKernelCall : DeclOrTypeAttr { let Spellings = [Clang<"amdgpu_kernel">]; let Documentation = [Undocumented]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index e02a1201e2ad79..e8fd10587a8022 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -2705,6 +2705,30 @@ An error will be given if: }]; } +def AMDGPUNumWorkGroupsDocs : Documentation { + let Category = DocCatAMDGPUAttributes; + let Content = [{ +The number of work groups specifies the number of work groups when the kernel +is dispatched. + +Clang supports the +``__attribute__((amdgpu_num_work_groups(, , )))`` attribute for the +AMDGPU target. This attribute may be attached to a kernel function definition +and is an optimization hint. + + parameter specifies the maximum number of work groups in the x dimentsion. +Similarly and are for the y and z dimensions respectively. + +If specified, the AMDGPU target backend might be able to produce better machine +code. + +An error will be given if: + - Specified values violate subtarget specifications; + - Specified values are not compatible with values provided through other +attributes. 
+ }]; +} + def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> { let Content = [{ Clang supports several different calling conventions, depending on the target diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index 03ac6b78598fc8..93321efd26462c 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -356,6 +356,19 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes( if (NumVGPR != 0) F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR)); } + + if (const auto *Attr = FD->getAttr()) { +uint32_t X = Attr->getNumWorkGroupsX(); +uint32_t Y = Attr->getNumWorkGroupsY(); +uint32_t Z = Attr->getNumWorkGroupsZ(); + +if (X != 0 && Y != 0 && Z != 0) { + std::string AttrVal = llvm::utostr(X) + std::string(", ") + +llvm::utostr(Y) + std::string(", ") + +llvm::utostr(Z); + F->addFnAttr("amdgpu-num-work-groups", AttrVal); +} + } } /// Emits control constants used to change per-architecture behaviour in the diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 069571fcf78641..98d1726bb3e0b8 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -8069,6 +8069,25 @@ static void handleAMDGPUNumVGPRAttr(Sema &S, Decl *D, const ParsedAttr &AL) { D->addAttr(::new (S.Context) AMDGPUNumVGPRAttr(S.Context, AL, NumVGPR)); } +static void handleAMDGPUNumWorkGroupsAttr(Sema &S, Decl *D, +
[llvm] [clang] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
jwanggit86 wrote: @krzysz00 Code has been updated. Pls take a look when convenient. Pls note the following: (1) Two attributes are now supported, one for min and one for max num of workgroups. (2) It is allowed to only specify one of the two attributes. (3) An attribute is ignored if any one of the 3 numbers (for x,y,z) is 0. (4) When both attributes are valid, we make sure max >= min element-wise. If not, both are ignored. https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
jwanggit86 wrote: @t-tye Code has been updated based on your feedback. Pls take a look. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
https://github.com/jwanggit86 created https://github.com/llvm/llvm-project/pull/79236 This patch introduces a new command-line option for clang, namely, amdgpu-precise-mem-op. When this option is specified, a waitcnt instruction is generated after each memory load/store instruction. The counter values are always 0, but which counters are involved depends on the memory instruction. >From 9c40b1151b0673430ff53eb121784724a5b090e5 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Tue, 23 Jan 2024 19:19:00 -0600 Subject: [PATCH] [AMDGPU] Emit a waitcnt instruction after each memory instruction This patch introduces a new command-line option for clang, namely, amdgpu-precise-mem-op. When this option is specified, a waitcnt instruction is generated after each memory load/store instruction. The counter values are always 0, but which counters are involved depends on the memory instruction. --- clang/include/clang/Driver/Options.td | 4 + clang/test/Driver/amdgpu-features.c | 6 + llvm/lib/Target/AMDGPU/AMDGPU.td | 4 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 + llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 79 +++ .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll | 199 ++ 6 files changed, 295 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 7f4fa33748facaf..d570786534b3611 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4796,6 +4796,10 @@ defm tgsplit : SimpleMFlag<"tgsplit", "Enable", "Disable", defm wavefrontsize64 : SimpleMFlag<"wavefrontsize64", "Specify wavefront size 64", "Specify wavefront size 32", " mode (AMDGPU only)">; +defm amdgpu_precise_memory_op +: SimpleMFlag<"amdgpu-precise-memory-op", "Enable", "Disable", + " precise memory mode (AMDGPU only)", + m_amdgpu_Features_Group>; defm unsafe_fp_atomics : BoolOption<"m", "unsafe-fp-atomics", TargetOpts<"AllowAMDGPUUnsafeFPAtomics">, DefaultFalse, 
diff --git a/clang/test/Driver/amdgpu-features.c b/clang/test/Driver/amdgpu-features.c index a516bc6b7ff2004..57d31ccedd8783e 100644 --- a/clang/test/Driver/amdgpu-features.c +++ b/clang/test/Driver/amdgpu-features.c @@ -32,3 +32,9 @@ // RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mno-cumode %s 2>&1 | FileCheck --check-prefix=NO-CUMODE %s // NO-CUMODE: "-target-feature" "-cumode" + +// RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mamdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=PREC-MEM %s +// PREC-MEM: "-target-feature" "+amdgpu-precise-memory-op" + +// RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mno-amdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=NO-PREC-MEM %s +// NO-PREC-MEM: "-target-feature" "-amdgpu-precise-memory-op" diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index cb29d5d94759812..c39cc9477023591 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -167,6 +167,10 @@ def FeatureCuMode : SubtargetFeature<"cumode", "Enable CU wavefront execution mode" >; +def FeaturePreciseMemory +: SubtargetFeature<"amdgpu-precise-memory-op", "EnablePreciseMemory", + "true", "Enable precise memory mode">; + def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", "SGPRInitBug", "true", diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 8019b98b1c68d66..b69df21f7859851 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -87,6 +87,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool EnableTgSplit = false; bool EnableCuMode = false; bool TrapHandler = false; + bool EnablePreciseMemory = false; // Used as options. 
bool EnableLoadStoreOpt = false; @@ -592,6 +593,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return EnableCuMode; } + bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; } + bool hasFlatAddressSpace() const { return FlatAddressSpace; } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 84b9330ef9633eb..93cdceb37bd5017 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -17,6 +17,7 @@ #include "AMDGPUMachineModuleInfo.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/BitmaskEnum.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -24,6 +25,8 @@ #include "llvm/Support/AtomicOrdering.h" #include "llvm/TargetParser/TargetParser.h" +#include + using namespace llvm; using namespace llvm::AMDGPU; @@ -641,6 +644,9 @@ class SIMemoryLegali
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
https://github.com/jwanggit86 closed https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #68932)
jwanggit86 wrote: Implementation is moved to SIMemoryLegalizer pass. See pull req [79236](https://github.com/llvm/llvm-project/pull/79236). https://github.com/llvm/llvm-project/pull/68932 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -2561,6 +2567,70 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, return Changed; } +bool SIMemoryLegalizer::GFX9InsertWaitcntForPreciseMem(MachineFunction &MF) { + const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + IsaVersion IV = getIsaVersion(ST.getCPU()); + + bool Changed = false; + + for (auto &MBB : MF) { +for (auto MI = MBB.begin(); MI != MBB.end();) { + MachineInstr &Inst = *MI; + ++MI; + if (Inst.mayLoadOrStore() == false) +continue; + + // Todo: if next insn is an s_waitcnt + AMDGPU::Waitcnt Wait; + + if (!(Inst.getDesc().TSFlags & SIInstrFlags::maybeAtomic)) { +if (TII->isSMRD(Inst)) { // scalar jwanggit86 wrote: That's a valid point. However, even though it does similar work, I'd say the similarity to SIInsertWaitcnt is only to a certain extent. The differences include: (1) This is for a different purpose, i.e. to support the so-called "precise memory mode", in particular precise memory exceptions, for certain GPUs. (2) This feature is optional while SIInsertWaintcnt is not. (3) The counter values in SIInsertWaitcnt are precise, while in this features the counters are simply set to 0. If performance is a concern, pls note that this feature is controlled by a command-line option which by default is off. The user has to explicitly give the option for it to work. We assume the user knows there's extra work for the compiler when the option is turned on. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
@@ -2561,6 +2567,70 @@ bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, return Changed; } +bool SIMemoryLegalizer::GFX9InsertWaitcntForPreciseMem(MachineFunction &MF) { + const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + IsaVersion IV = getIsaVersion(ST.getCPU()); + + bool Changed = false; + + for (auto &MBB : MF) { +for (auto MI = MBB.begin(); MI != MBB.end();) { + MachineInstr &Inst = *MI; + ++MI; + if (Inst.mayLoadOrStore() == false) +continue; + + // Todo: if next insn is an s_waitcnt + AMDGPU::Waitcnt Wait; + + if (!(Inst.getDesc().TSFlags & SIInstrFlags::maybeAtomic)) { +if (TII->isSMRD(Inst)) { // scalar jwanggit86 wrote: @t-tye Hi Tony, do you have any comments? Note that the function for gfx10 and 11 is empty for now. I want to get some feedback before going further. https://github.com/llvm/llvm-project/pull/79236 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
jwanggit86 wrote: @arsenm @krzysz00 Any comments? https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
jwanggit86 wrote: @krzysz00 Are you asking for something like the following: ``` "amdgpu-min-num-work-groups"="1,2,3", "amdgpu-max-num-work-groups"="4,5,6" ``` When both are given, min must be <= max. https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
jwanggit86 wrote: @krzysz00 Let me make sure I understand the requirements correctly. Based on my understanding, the following are the requirements. Pls let me know if there are any mistakes. 1. Create a new function attribute for the number of workgroups (maybe 2 attributes, one for max, and one for min). 2. The function attribute consists of 3 unsigned integers, one for each of the x, y, and z dimensions. 3. If the numbers are all valid, they are listed in the metadata section of the .s file. 4. What the compiler does with the numbers is left for future work. https://github.com/llvm/llvm-project/pull/79035 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #75647)
jwanggit86 wrote: @krzysz00 So how do you want to proceed? https://github.com/llvm/llvm-project/pull/75647 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[llvm] [clang] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #75647)
jwanggit86 wrote: @krzysz00 So, instead of 1 number as in the current implementation, you want 3 numbers, i.e., 3 lines like the following in the metadata section? ``` .num_workgroups_x: 1 .num_workgroups_y: 2 .num_workgroups_z: 3 ``` https://github.com/llvm/llvm-project/pull/75647 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #79035)
https://github.com/jwanggit86 created https://github.com/llvm/llvm-project/pull/79035 A new function attribute named amdgpu-num-work-groups is added. This attribute, which consists of three integers, allows programmers to let the compiler know the number of workgroups to be launched in each of the three dimensions and do optimizations based on that information. >From 8ed74e15d0798bf7d171c912e6268eba1d760b64 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Mon, 22 Jan 2024 12:43:27 -0600 Subject: [PATCH] [AMDGPU] Adding the amdgpu-num-work-groups function attribute A new function attribute named amdgpu-num-work-groups is added. This attribute, which consists of three integers, allows programmers to let the compiler know the number of workgroups to be launched in each of the three dimensions and do optimizations based on that information. --- clang/include/clang/Basic/Attr.td | 7 ++ clang/include/clang/Basic/AttrDocs.td | 24 +++ clang/lib/CodeGen/Targets/AMDGPU.cpp | 13 clang/lib/Sema/SemaDeclAttr.cpp | 22 +++ ...a-attribute-supported-attributes-list.test | 1 + .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 8 +++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp| 5 ++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 3 + .../Target/AMDGPU/SIMachineFunctionInfo.cpp | 2 + .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 10 +++ .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp| 53 +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 19 ++ .../AMDGPU/attr-amdgpu-num-work-groups.ll | 65 +++ 13 files changed, 232 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-work-groups.ll diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 78a9229aeaf081..5251858ac3bfd7 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -2031,6 +2031,13 @@ def AMDGPUNumVGPR : InheritableAttr { let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">; } +def AMDGPUNumWorkGroups : InheritableAttr { + let Spellings = 
[Clang<"amdgpu_num_work_groups", 0>]; + let Args = [UnsignedArgument<"NumWorkGroupsX">, UnsignedArgument<"NumWorkGroupsY">, UnsignedArgument<"NumWorkGroupsZ">]; + let Documentation = [AMDGPUNumWorkGroupsDocs]; + let Subjects = SubjectList<[Function], ErrorDiag, "kernel functions">; +} + def AMDGPUKernelCall : DeclOrTypeAttr { let Spellings = [Clang<"amdgpu_kernel">]; let Documentation = [Undocumented]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 9e8190614fbe8a..268d15eddab16f 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -2702,6 +2702,30 @@ An error will be given if: }]; } + +def AMDGPUNumWorkGroupsDocs : Documentation { + let Category = DocCatAMDGPUAttributes; + let Content = [{ +The number of work groups specifies the number of work groups when the kernel +is dispatched. + +Clang supports the +``__attribute__((amdgpu_num_work_groups(<x>, <y>, <z>)))`` attribute for the +AMDGPU target. This attribute may be attached to a kernel function definition +and is an optimization hint. + +<x> parameter specifies the maximum number of work groups in the x dimension. +Similarly <y> and <z> are for the y and z dimensions respectively. + +If specified, the AMDGPU target backend might be able to produce better machine +code. + +An error will be given if: + - Specified values violate subtarget specifications; + - Specified values are not compatible with values provided through other +attributes. 
+ }]; +} + def DocCatCallingConvs : DocumentationCategory<"Calling Conventions"> { let Content = [{ Clang supports several different calling conventions, depending on the target diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index 03ac6b78598fc8..93321efd26462c 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -356,6 +356,19 @@ void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes( if (NumVGPR != 0) F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR)); } + + if (const auto *Attr = FD->getAttr()) { +uint32_t X = Attr->getNumWorkGroupsX(); +uint32_t Y = Attr->getNumWorkGroupsY(); +uint32_t Z = Attr->getNumWorkGroupsZ(); + +if (X != 0 && Y != 0 && Z != 0) { + std::string AttrVal = llvm::utostr(X) + std::string(", ") + +llvm::utostr(Y) + std::string(", ") + +llvm::utostr(Z); + F->addFnAttr("amdgpu-num-work-groups", AttrVal); +} + } } /// Emits control constants used to change per-architecture behaviour in the diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index a482919356e1bc..dc1c951031d58c 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp
[llvm] [clang] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #75647)
jwanggit86 wrote: Reimplemented in [79035](https://github.com/llvm/llvm-project/pull/79035). https://github.com/llvm/llvm-project/pull/75647 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Adding the amdgpu-num-work-groups function attribute (PR #75647)
https://github.com/jwanggit86 closed https://github.com/llvm/llvm-project/pull/75647 ___ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[clang] [llvm] [AMDGPU] Emit a waitcnt instruction after each memory instruction (PR #79236)
https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/79236 >From 9c40b1151b0673430ff53eb121784724a5b090e5 Mon Sep 17 00:00:00 2001 From: Jun Wang Date: Tue, 23 Jan 2024 19:19:00 -0600 Subject: [PATCH 1/4] [AMDGPU] Emit a waitcnt instruction after each memory instruction This patch introduces a new command-line option for clang, namely, amdgpu-precise-mem-op. When this option is specified, a waitcnt instruction is generated after each memory load/store instruction. The counter values are always 0, but which counters are involved depends on the memory instruction. --- clang/include/clang/Driver/Options.td | 4 + clang/test/Driver/amdgpu-features.c | 6 + llvm/lib/Target/AMDGPU/AMDGPU.td | 4 + llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 + llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 79 +++ .../CodeGen/AMDGPU/insert_waitcnt_for_all.ll | 199 ++ 6 files changed, 295 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_all.ll diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 7f4fa33748faca..d570786534b361 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4796,6 +4796,10 @@ defm tgsplit : SimpleMFlag<"tgsplit", "Enable", "Disable", defm wavefrontsize64 : SimpleMFlag<"wavefrontsize64", "Specify wavefront size 64", "Specify wavefront size 32", " mode (AMDGPU only)">; +defm amdgpu_precise_memory_op +: SimpleMFlag<"amdgpu-precise-memory-op", "Enable", "Disable", + " precise memory mode (AMDGPU only)", + m_amdgpu_Features_Group>; defm unsafe_fp_atomics : BoolOption<"m", "unsafe-fp-atomics", TargetOpts<"AllowAMDGPUUnsafeFPAtomics">, DefaultFalse, diff --git a/clang/test/Driver/amdgpu-features.c b/clang/test/Driver/amdgpu-features.c index a516bc6b7ff200..57d31ccedd8783 100644 --- a/clang/test/Driver/amdgpu-features.c +++ b/clang/test/Driver/amdgpu-features.c @@ -32,3 +32,9 @@ // RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mno-cumode 
%s 2>&1 | FileCheck --check-prefix=NO-CUMODE %s // NO-CUMODE: "-target-feature" "-cumode" + +// RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mamdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=PREC-MEM %s +// PREC-MEM: "-target-feature" "+amdgpu-precise-memory-op" + +// RUN: %clang -### -target amdgcn -mcpu=gfx1010 -mno-amdgpu-precise-memory-op %s 2>&1 | FileCheck --check-prefix=NO-PREC-MEM %s +// NO-PREC-MEM: "-target-feature" "-amdgpu-precise-memory-op" diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index cb29d5d9475981..c39cc947702359 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -167,6 +167,10 @@ def FeatureCuMode : SubtargetFeature<"cumode", "Enable CU wavefront execution mode" >; +def FeaturePreciseMemory +: SubtargetFeature<"amdgpu-precise-memory-op", "EnablePreciseMemory", + "true", "Enable precise memory mode">; + def FeatureSGPRInitBug : SubtargetFeature<"sgpr-init-bug", "SGPRInitBug", "true", diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 8019b98b1c68d6..b69df21f785985 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -87,6 +87,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool EnableTgSplit = false; bool EnableCuMode = false; bool TrapHandler = false; + bool EnablePreciseMemory = false; // Used as options. 
bool EnableLoadStoreOpt = false; @@ -592,6 +593,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, return EnableCuMode; } + bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; } + bool hasFlatAddressSpace() const { return FlatAddressSpace; } diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index 84b9330ef9633e..93cdceb37bd501 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -17,6 +17,7 @@ #include "AMDGPUMachineModuleInfo.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/ADT/BitmaskEnum.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -24,6 +25,8 @@ #include "llvm/Support/AtomicOrdering.h" #include "llvm/TargetParser/TargetParser.h" +#include + using namespace llvm; using namespace llvm::AMDGPU; @@ -641,6 +644,9 @@ class SIMemoryLegalizer final : public MachineFunctionPass { bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI); + bool GFX9InsertWaitcntForPreciseMem(MachineFunction &MF); + bool GFX10And11InsertWaitcntForPreciseMem(MachineFunction &MF); + public: