[llvm-branch-commits] [llvm] cb5b52a - AMDGPU: Annotate amdgpu.noclobber for global loads only
Author: Changpeng Fang
Date: 2021-01-05T14:47:19-08:00
New Revision: cb5b52a06eeb7cc868944bb08f71fffe13f33412

URL: https://github.com/llvm/llvm-project/commit/cb5b52a06eeb7cc868944bb08f71fffe13f33412
DIFF: https://github.com/llvm/llvm-project/commit/cb5b52a06eeb7cc868944bb08f71fffe13f33412.diff

LOG: AMDGPU: Annotate amdgpu.noclobber for global loads only

    Summary:
      This is to avoid unnecessary analysis since amdgpu.noclobber is only used for globals.

    Reviewers: arsenm

    Fixes: SWDEV-239161

    Differential Revision: https://reviews.llvm.org/D94107

Added: 
    llvm/test/CodeGen/AMDGPU/annotate-noclobber.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
    llvm/test/CodeGen/AMDGPU/store-clobbers-load.ll

Removed: 
    

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index f23c4c17ab794..0123450b18bf4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -142,10 +142,11 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
   }
 
   bool NotClobbered = false;
+  bool GlobalLoad = isGlobalLoad(I);
   if (PtrI)
-    NotClobbered = !isClobberedInFunction(&I);
+    NotClobbered = GlobalLoad && !isClobberedInFunction(&I);
   else if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) {
-    if (isGlobalLoad(I) && !isClobberedInFunction(&I)) {
+    if (GlobalLoad && !isClobberedInFunction(&I)) {
       NotClobbered = true;
       // Lookup for the existing GEP
       if (noClobberClones.count(Ptr)) {

diff --git a/llvm/test/CodeGen/AMDGPU/annotate-noclobber.ll b/llvm/test/CodeGen/AMDGPU/annotate-noclobber.ll
new file mode 100644
index 0000000000000..580ea202addd6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/annotate-noclobber.ll
@@ -0,0 +1,47 @@
+; RUN: opt -S --amdgpu-annotate-uniform < %s | FileCheck -check-prefix=OPT %s
+target datalayout = "A5"
+
+
+; OPT-LABEL: @amdgpu_noclobber_global(
+; OPT: %addr = getelementptr i32, i32 addrspace(1)* %in, i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; OPT-NEXT: %load = load i32, i32 addrspace(1)* %addr, align 4
+define amdgpu_kernel void @amdgpu_noclobber_global( i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+entry:
+  %addr = getelementptr i32, i32 addrspace(1)* %in, i64 0
+  %load = load i32, i32 addrspace(1)* %addr, align 4
+  store i32 %load, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; OPT-LABEL: @amdgpu_noclobber_local(
+; OPT: %addr = getelementptr i32, i32 addrspace(3)* %in, i64 0, !amdgpu.uniform !0
+; OPT-NEXT: %load = load i32, i32 addrspace(3)* %addr, align 4
+define amdgpu_kernel void @amdgpu_noclobber_local( i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
+entry:
+  %addr = getelementptr i32, i32 addrspace(3)* %in, i64 0
+  %load = load i32, i32 addrspace(3)* %addr, align 4
+  store i32 %load, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; OPT-LABEL: @amdgpu_noclobber_private(
+; OPT: %addr = getelementptr i32, i32 addrspace(5)* %in, i64 0, !amdgpu.uniform !0
+; OPT-NEXT: %load = load i32, i32 addrspace(5)* %addr, align 4
+define amdgpu_kernel void @amdgpu_noclobber_private( i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
+entry:
+  %addr = getelementptr i32, i32 addrspace(5)* %in, i64 0
+  %load = load i32, i32 addrspace(5)* %addr, align 4
+  store i32 %load, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; OPT-LABEL: @amdgpu_noclobber_flat(
+; OPT: %addr = getelementptr i32, i32 addrspace(4)* %in, i64 0, !amdgpu.uniform !0
+; OPT-NEXT: %load = load i32, i32 addrspace(4)* %addr, align 4
+define amdgpu_kernel void @amdgpu_noclobber_flat( i32 addrspace(4)* %in, i32 addrspace(1)* %out) {
+entry:
+  %addr = getelementptr i32, i32 addrspace(4)* %in, i64 0
+  %load = load i32, i32 addrspace(4)* %addr, align 4
+  store i32 %load, i32 addrspace(1)* %out, align 4
+  ret void
+}

diff --git a/llvm/test/CodeGen/AMDGPU/store-clobbers-load.ll b/llvm/test/CodeGen/AMDGPU/store-clobbers-load.ll
index 88cfffed47c59..fbf6990a5629d 100644
--- a/llvm/test/CodeGen/AMDGPU/store-clobbers-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-clobbers-load.ll
@@ -4,24 +4,24 @@ target datalayout = "A5"
 ; "load vaddr" depends on the store, so we should not mark vaddr as amdgpu.noclobber.
 
 ; OPT-LABEL: @store_clobbers_load(
-; OPT: %vaddr = bitcast [4 x i32] addrspace(5)* %alloca to <4 x i32> addrspace(5)*, !amdgpu.uniform !0
-; OPT-NEXT: %zero = load <4 x i32>, <4 x i32> addrspace(5)* %vaddr, align 16
-define amdgpu_kernel void @store_clobbers_load(i32 addrspace(1)* %out, i32 %index) {
+; OPT: %vaddr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %input, i64 0, !amdgpu.uniform !0
+; OPT-NEXT: %zero = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr, align 16
+define amdgpu_kernel void @store_clobbers_load( < 4 x i32> addrspace(1)* %input, i32 addrspace(1)*
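For readers skimming the (truncated) diff above, a minimal C++ sketch of the
shape of this change. This is an illustration, not the pass verbatim:
isGlobalLoadSketch and notClobbered are hypothetical names, the real pass
keeps isClobberedInFunction as a member function rather than a parameter,
and AMDGPUAS::GLOBAL_ADDRESS is hard-coded as 1 to keep the sketch
self-contained.

#include "llvm/IR/Instructions.h"

using namespace llvm;

// Cheap gate: only loads from the AMDGPU global address space can usefully
// carry amdgpu.noclobber, so no other load needs the clobber analysis.
static bool isGlobalLoadSketch(const LoadInst &I) {
  return I.getPointerAddressSpace() == 1; // AMDGPUAS::GLOBAL_ADDRESS
}

// Shape of the patched logic: evaluate the cheap predicate once, then let
// && short-circuit so isClobberedInFunction -- a memory-dependence walk
// over the function -- never runs for loads whose result would be ignored.
static bool notClobbered(LoadInst &I,
                         bool (*isClobberedInFunction)(LoadInst *)) {
  bool GlobalLoad = isGlobalLoadSketch(I);
  return GlobalLoad && !isClobberedInFunction(&I);
}

Before this patch, the PtrI path called isClobberedInFunction
unconditionally, paying for the walk even when its result was discarded.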
[llvm-branch-commits] [llvm] ce0c001 - AMDGPU: If a store defines (alias) a load, it clobbers the load.
Author: Changpeng Fang
Date: 2020-12-14T16:34:32-08:00
New Revision: ce0c0013d8b11e9ce9820c0add2a013f5992e6a6

URL: https://github.com/llvm/llvm-project/commit/ce0c0013d8b11e9ce9820c0add2a013f5992e6a6
DIFF: https://github.com/llvm/llvm-project/commit/ce0c0013d8b11e9ce9820c0add2a013f5992e6a6.diff

LOG: AMDGPU: If a store defines (alias) a load, it clobbers the load.

    Summary:
      If a store defines (must alias) a load, it clobbers the load.

    Fixes: SWDEV-258915

    Reviewers: arsenm

    Differential Revision: https://reviews.llvm.org/D92951

Added: 
    llvm/test/CodeGen/AMDGPU/store-clobbers-load.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
    llvm/test/CodeGen/AMDGPU/wave32.ll

Removed: 
    

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index 6fc58816cf99..f23c4c17ab79 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -110,7 +110,9 @@ bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
       BasicBlock::iterator(Load) : BB->end();
     auto Q = MDR->getPointerDependencyFrom(
         MemoryLocation::getBeforeOrAfter(Ptr), true, StartIt, BB, Load);
-    if (Q.isClobber() || Q.isUnknown())
+    if (Q.isClobber() || Q.isUnknown() ||
+        // Store defines the load and thus clobbers it.
+        (Q.isDef() && Q.getInst()->mayWriteToMemory()))
       return true;
   }
   return false;

diff --git a/llvm/test/CodeGen/AMDGPU/store-clobbers-load.ll b/llvm/test/CodeGen/AMDGPU/store-clobbers-load.ll
new file mode 100644
index 000000000000..88cfffed47c5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/store-clobbers-load.ll
@@ -0,0 +1,43 @@
+; RUN: opt -S --amdgpu-annotate-uniform < %s | FileCheck -check-prefix=OPT %s
+target datalayout = "A5"
+
+; "load vaddr" depends on the store, so we should not mark vaddr as amdgpu.noclobber.
+
+; OPT-LABEL: @store_clobbers_load(
+; OPT: %vaddr = bitcast [4 x i32] addrspace(5)* %alloca to <4 x i32> addrspace(5)*, !amdgpu.uniform !0
+; OPT-NEXT: %zero = load <4 x i32>, <4 x i32> addrspace(5)* %vaddr, align 16
+define amdgpu_kernel void @store_clobbers_load(i32 addrspace(1)* %out, i32 %index) {
+entry:
+  %alloca = alloca [4 x i32], addrspace(5)
+  %addr0 = bitcast [4 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
+  store i32 0, i32 addrspace(5)* %addr0
+  %vaddr = bitcast [4 x i32] addrspace(5)* %alloca to <4 x i32> addrspace(5)*
+  %zero = load <4 x i32>, <4 x i32> addrspace(5)* %vaddr, align 16
+  %one = insertelement <4 x i32> %zero, i32 1, i32 1
+  %two = insertelement <4 x i32> %one, i32 2, i32 2
+  %three = insertelement <4 x i32> %two, i32 3, i32 3
+  store <4 x i32> %three, <4 x i32> addrspace(5)* %vaddr, align 16
+  %rslt = extractelement <4 x i32> %three, i32 %index
+  store i32 %rslt, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+@lds0 = addrspace(3) global [512 x i32] undef, align 4
+
+; To check that %arrayidx0 is not marked as amdgpu.noclobber.
+
+; OPT-LABEL: @atomicrmw_clobbers_load(
+; OPT: %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0, !amdgpu.uniform !0
+; OPT-NEXT: %val = atomicrmw xchg i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+
+define amdgpu_kernel void @atomicrmw_clobbers_load(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) {
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  %idx.0 = add nsw i32 %tid.x, 2
+  %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0
+  %val = atomicrmw xchg i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
+  %load = load i32, i32 addrspace(3)* %arrayidx0, align 4
+  store i32 %val, i32 addrspace(1)* %out0, align 4
+  store i32 %load, i32 addrspace(1)* %out1, align 4
+  ret void
+}

diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index b5b42d893205..da37154985b5 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -231,9 +231,9 @@ bb13:
 ; GCN: ; %bb.{{[0-9]+}}: ; %.preheader
 ; GCN: BB{{.*}}:
 
+; GCN: global_store_dword
 ; GFX1032: s_or_b32 [[MASK0:s[0-9]+]], [[MASK0]], vcc_lo
 ; GFX1064: s_or_b64 [[MASK0:s\[[0-9:]+\]]], [[MASK0]], vcc
-; GCN: global_store_dword
 ; GFX1032: s_andn2_b32 [[MASK1:s[0-9]+]], [[MASK1]], exec_lo
 ; GFX1064: s_andn2_b64 [[MASK1:s\[[0-9:]+\]]], [[MASK1]], exec
 ; GFX1032: s_and_b32 [[MASK0]], [[MASK0]], exec_lo
@@ -249,10 +249,12 @@ bb13:
 ; GFX1064: s_andn2_b64 exec, exec, [[ACC]]
 ; GCN: s_cbranch_execz
 ; GCN: BB{{.*}}:
-; GCN: s_load_dword [[LOAD:s[0-9]+]]
+
 ; GFX1032: s_or_b32 [[MASK1]], [[MASK1]], exec_lo
 ; GFX1064: s_or_b64 [[MASK1]], [[MASK1]], exec
-; GCN: s_cmp_lt_i32 [[LOAD]], 11
+; GCN: global_load_dword [[LOAD:v[0-9]+]]
+
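The subtlety behind this fix is how MemoryDependenceAnalysis classifies
results: a Def means the query found an instruction that must-aliases the
loaded location, and for a load that definer can be either an earlier load
of the same address (which does not change the value) or a store or atomic
that overwrites it. Below is a hedged C++ sketch of the classification this
patch introduces; clobbersLoad is a hypothetical helper name, while in the
pass the check is written inline in isClobberedInFunction, as the diff shows.

#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Returns true if the dependency result may overwrite the loaded memory.
// isClobber/isUnknown were already treated as clobbers; the fix also treats
// a must-alias definer as a clobber when it writes memory, which covers
// both the store and the atomicrmw in the new test above.
static bool clobbersLoad(const MemDepResult &Q) {
  return Q.isClobber() || Q.isUnknown() ||
         (Q.isDef() && Q.getInst()->mayWriteToMemory());
}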
[llvm-branch-commits] [llvm] AMDGPU: Improve v4f16/v4bf16 copysign handling (PR #142174)
https://github.com/changpeng approved this pull request.

https://github.com/llvm/llvm-project/pull/142174
[llvm-branch-commits] [clang] [llvm] [AMDGPU] Add support for `v_prng_b32` on gfx1250 (PR #149450)
https://github.com/changpeng edited https://github.com/llvm/llvm-project/pull/149450
[llvm-branch-commits] [clang] [llvm] [AMDGPU] Add support for `v_prng_b32` on gfx1250 (PR #149450)
https://github.com/changpeng approved this pull request.

LGTM. But has the builtin definition already been added in clang?

https://github.com/llvm/llvm-project/pull/149450
[llvm-branch-commits] [llvm] [AMDGPU] Add the code generation support for `llvm.[sin/cos].bf16` (PR #149631)
https://github.com/changpeng approved this pull request.

https://github.com/llvm/llvm-project/pull/149631