https://github.com/rovka updated https://github.com/llvm/llvm-project/pull/130037
>From c29d8202c06488a9466aea49dda4cf2b4663236e Mon Sep 17 00:00:00 2001 From: Diana Picus <diana-magda.pi...@amd.com> Date: Mon, 23 Oct 2023 11:46:19 +0200 Subject: [PATCH 1/2] [AMDGPU] Deallocate VGPRs before exiting in dynamic VGPR mode In dynamic VGPR mode, Waves must deallocate all VGPRs before exiting. If the shader program does not do this, hardware inserts `S_ALLOC_VGPR 0` before S_ENDPGM, but this may incur some performance cost. Therefore it's better if the compiler proactively generates that instruction. This patch extends `si-insert-waitcnts` to deallocate the VGPRs via a `S_ALLOC_VGPR 0` before any `S_ENDPGM` when in dynamic VGPR mode. --- llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 60 +-- .../CodeGen/AMDGPU/release-vgprs-gfx12.mir | 356 ++++++++++++++++++ 2 files changed, 393 insertions(+), 23 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp index 7e6bce2bf5f12..42ef23e836a58 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1647,17 +1647,21 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) { Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false)); } - // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM - // stores. In this case it can be useful to send a message to explicitly - // release all VGPRs before the stores have completed, but it is only safe to - // do this if: - // * there are no outstanding scratch stores - // * we are not in Dynamic VGPR mode + // In dynamic VGPR mode, we want to release the VGPRs before the wave exits. + // Technically the hardware will do this on its own if we don't, but that + // might cost extra cycles compared to doing it explicitly. + // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may + // have to wait for outstanding VMEM stores. In this case it can be useful to + // send a message to explicitly release all VGPRs before the stores have + // completed, but it is only safe to do this if there are no outstanding + // scratch stores. else if (MI.getOpcode() == AMDGPU::S_ENDPGM || MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) { - if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() && - ScoreBrackets.getScoreRange(STORE_CNT) != 0 && - !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS)) + if (!WCG->isOptNone() && + (ST->isDynamicVGPREnabled() || + (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && + ScoreBrackets.getScoreRange(STORE_CNT) != 0 && + !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS)))) ReleaseVGPRInsts.insert(&MI); } // Resolve vm waits before gs-done. @@ -2610,26 +2614,36 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) { } } - // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM - // instructions. + // Deallocate the VGPRs before previously identified S_ENDPGM instructions. + // This is done in different ways depending on how the VGPRs were allocated + // (i.e. whether we're in dynamic VGPR mode or not). // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short // waveslot limited kernel runs slower with the deallocation. - if (!ReleaseVGPRInsts.empty() && - (MF.getFrameInfo().hasCalls() || - ST->getOccupancyWithNumVGPRs( - TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) < - AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) { + if (ST->isDynamicVGPREnabled()) { for (MachineInstr *MI : ReleaseVGPRInsts) { - if (ST->requiresNopBeforeDeallocVGPRs()) { - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - TII->get(AMDGPU::S_NOP)) - .addImm(0); - } BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), - TII->get(AMDGPU::S_SENDMSG)) - .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus); + TII->get(AMDGPU::S_ALLOC_VGPR)) + .addImm(0); Modified = true; } + } else { + if (!ReleaseVGPRInsts.empty() && + (MF.getFrameInfo().hasCalls() || + ST->getOccupancyWithNumVGPRs( + TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) < + AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) { + for (MachineInstr *MI : ReleaseVGPRInsts) { + if (ST->requiresNopBeforeDeallocVGPRs()) { + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_NOP)) + .addImm(0); + } + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_SENDMSG)) + .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus); + Modified = true; + } + } } ReleaseVGPRInsts.clear(); PreheadersToFlush.clear(); diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir new file mode 100644 index 0000000000000..884b5f8b6f018 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir @@ -0,0 +1,356 @@ +# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DEFAULT +# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -mattr=+dynamic-vgpr -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DVGPR + +--- | + define amdgpu_ps void @tbuffer_store1() { ret void } + define amdgpu_ps void @tbuffer_store2() { ret void } + define amdgpu_ps void @flat_store() { ret void } + define amdgpu_ps void @global_store() { ret void } + define amdgpu_ps void @buffer_store_format() { ret void } + define amdgpu_ps void @ds_write_b32() { ret void } + define amdgpu_ps void @global_store_dword() { ret void } + define amdgpu_ps void @multiple_basic_blocks1() { ret void } + define amdgpu_ps void @multiple_basic_blocks2() { ret void } + define amdgpu_ps void @multiple_basic_blocks3() { ret void } + define amdgpu_ps void @recursive_loop() { ret void } + define amdgpu_ps void @recursive_loop_vmem() { ret void } + define amdgpu_ps void @image_store() { ret void } + define amdgpu_ps void @scratch_store() { ret void } + define amdgpu_ps void @buffer_atomic() { ret void } + define amdgpu_ps void @flat_atomic() { ret void } + define amdgpu_ps void @global_atomic() { ret void } + define amdgpu_ps void @image_atomic() { ret void } + define amdgpu_ps void @global_store_optnone() noinline optnone { ret void } +... + +--- +name: tbuffer_store1 +body: | + bb.0: + ; CHECK-LABEL: name: tbuffer_store1 + ; CHECK-NOT: S_SENDMSG 3 + ; DEFAULT-NOT: S_ALLOC_VGPR + ; DVGPR: S_ALLOC_VGPR 0 + ; CHECK: S_ENDPGM 0 + TBUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 42, 117, 0, 0, implicit $exec + S_ENDPGM 0 +... + +--- +name: tbuffer_store2 +body: | + bb.0: + ; CHECK-LABEL: name: tbuffer_store2 + ; CHECK-NOT: S_SENDMSG 3 + ; DEFAULT-NOT: S_ALLOC_VGPR + ; DVGPR: S_ALLOC_VGPR 0 + ; CHECK: S_ENDPGM 0 + TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 7) + S_ENDPGM 0 +... + +--- +name: flat_store +body: | + bb.0: + ; CHECK-LABEL: name: flat_store + ; CHECK-NOT: S_SENDMSG 3 + ; DEFAULT-NOT: S_ALLOC_VGPR + ; DVGPR: S_ALLOC_VGPR 0 + ; CHECK: S_ENDPGM 0 + FLAT_STORE_DWORDX4 $vgpr49_vgpr50, $vgpr26_vgpr27_vgpr28_vgpr29, 0, 0, implicit $exec, implicit $flat_scr + S_ENDPGM 0 +... + +--- +name: global_store +body: | + bb.0: + ; CHECK-LABEL: name: global_store + ; CHECK-NOT: S_SENDMSG 3 + ; DEFAULT-NOT: S_ALLOC_VGPR + ; DVGPR: S_ALLOC_VGPR 0 + ; CHECK: S_ENDPGM 0 + GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec + S_WAIT_STORECNT 0 + S_ENDPGM 0 +... + +--- +name: buffer_store_format +body: | + bb.0: + ; CHECK-LABEL: name: buffer_store_format + ; CHECK-NOT: S_SENDMSG 3 + ; DEFAULT-NOT: S_ALLOC_VGPR + ; DVGPR: S_ALLOC_VGPR 0 + ; CHECK: S_ENDPGM 0 + BUFFER_STORE_FORMAT_D16_X_OFFEN_exact killed renamable $vgpr0, killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 0, 0, implicit $exec + S_ENDPGM 0 +... + +--- +name: ds_write_b32 +body: | + bb.0: + ; CHECK-LABEL: name: ds_write_b32 + ; CHECK-NOT: S_SENDMSG 3 + ; DEFAULT-NOT: S_ALLOC_VGPR + ; DVGPR: S_ALLOC_VGPR 0 + ; CHECK: S_ENDPGM 0 + renamable $vgpr0 = IMPLICIT_DEF + renamable $vgpr1 = IMPLICIT_DEF + DS_WRITE_B32 killed renamable $vgpr0, killed renamable $vgpr1, 12, 0, implicit $exec, implicit $m0 + S_ENDPGM 0 + +... +--- +name: global_store_dword +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + ; CHECK-LABEL: name: global_store_dword + ; CHECK-NOT: S_SENDMSG 3 + ; DEFAULT-NOT: S_ALLOC_VGPR + ; DVGPR: S_ALLOC_VGPR 0 + ; CHECK: S_ENDPGM 0 + renamable $vgpr0 = V_MAD_I32_I24_e64 killed $vgpr1, killed $vgpr0, killed $sgpr2, 0, implicit $exec + GLOBAL_STORE_DWORD_SADDR killed renamable $vgpr2, killed renamable $vgpr0, killed renamable $sgpr0_sgpr1, 0, 0, implicit $exec + S_ENDPGM 0 +... + +--- +name: multiple_basic_blocks1 +body: | + ; CHECK-LABEL: name: multiple_basic_blocks1 + ; CHECK-NOT: S_SENDMSG 3 + ; DEFAULT-NOT: S_ALLOC_VGPR + ; DVGPR: S_ALLOC_VGPR 0 + ; CHECK: S_ENDPGM 0 + bb.0: + successors: %bb.1 + + renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.1, %bb.2 + + $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec + S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit killed $scc + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0 + +... + +--- +name: multiple_basic_blocks2 +body: | + ; CHECK-LABEL: name: multiple_basic_blocks2 + ; CHECK: bb.2: + ; CHECK-NOT: S_SENDMSG 3 + ; DEFAULT-NOT: S_ALLOC_VGPR + ; DVGPR: S_ALLOC_VGPR 0 + ; CHECK: S_ENDPGM 0 + bb.0: + successors: %bb.2 + + TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec + $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec + S_BRANCH %bb.2 + + bb.1: + successors: %bb.2 + + $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec + TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0 +... + +--- +name: multiple_basic_blocks3 +body: | + ; CHECK-LABEL: name: multiple_basic_blocks3 + ; CHECK: bb.4: + ; CHECK-NOT: S_SENDMSG 3 + ; DEFAULT-NOT: S_ALLOC_VGPR + ; DVGPR: S_ALLOC_VGPR 0 + ; CHECK: S_ENDPGM 0 + bb.0: + successors: %bb.2 + + $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec + TBUFFER_STORE_FORMAT_X_OFFSET_exact killed renamable $vgpr0, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 125, 0, 0, implicit $exec + S_BRANCH %bb.2 + + bb.1: + successors: %bb.2 + + $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec + S_BRANCH %bb.2 + + bb.2: + successors: %bb.4 + + S_BRANCH %bb.4 + + bb.3: + successors: %bb.4 + + $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec + S_BRANCH %bb.4 + + bb.4: + S_ENDPGM 0 +... + +--- +name: recursive_loop +body: | + ; CHECK-LABEL: name: recursive_loop + ; CHECK-NOT: S_SENDMSG 3 + ; DEFAULT-NOT: S_ALLOC_VGPR + ; DVGPR: S_ALLOC_VGPR 0 + ; CHECK: S_ENDPGM 0 + bb.0: + successors: %bb.1 + + renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.1, %bb.2 + + S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit killed $scc + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0 +... + +--- +name: recursive_loop_vmem +body: | + ; CHECK-LABEL: name: recursive_loop_vmem + ; CHECK-NOT: S_SENDMSG 3 + ; DEFAULT-NOT: S_ALLOC_VGPR + ; DVGPR: S_ALLOC_VGPR 0 + ; CHECK: S_ENDPGM 0 + bb.0: + successors: %bb.1 + + renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec + S_BRANCH %bb.1 + + bb.1: + successors: %bb.1, %bb.2 + + TBUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 115, 0, 0, implicit $exec + S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc + S_CBRANCH_SCC1 %bb.1, implicit killed $scc + S_BRANCH %bb.2 + + bb.2: + S_ENDPGM 0 +... + +--- +name: image_store +body: | + bb.0: + ; CHECK-LABEL: name: image_store + ; CHECK-NOT: S_SENDMSG 3 + ; DEFAULT-NOT: S_ALLOC_VGPR + ; DVGPR: S_ALLOC_VGPR 0 + ; CHECK: S_ENDPGM 0 + IMAGE_STORE_V2_V1_gfx11 killed renamable $vgpr0_vgpr1, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 12, 0, 1, 0, 0, -1, 0, 0, 0, implicit $exec :: (dereferenceable store (<2 x s32>), addrspace 7) + S_ENDPGM 0 +... + +--- +name: scratch_store +body: | + bb.0: + ; CHECK-LABEL: name: scratch_store + ; CHECK-NOT: S_SENDMSG 3 + ; DEFAULT-NOT: S_ALLOC_VGPR + ; DVGPR: S_ALLOC_VGPR 0 + ; CHECK: S_ENDPGM 0 + renamable $sgpr0 = S_AND_B32 killed renamable $sgpr0, -16, implicit-def dead $scc + SCRATCH_STORE_DWORD_SADDR killed renamable $vgpr0, killed renamable $sgpr0, 0, 0, implicit $exec, implicit $flat_scr + S_ENDPGM 0 +... + +--- +name: buffer_atomic +body: | + bb.0: + ; CHECK-LABEL: name: buffer_atomic + ; CHECK-NOT: S_SENDMSG 3 + ; DEFAULT-NOT: S_ALLOC_VGPR + ; DVGPR: S_ALLOC_VGPR 0 + ; CHECK: S_ENDPGM 0 + BUFFER_ATOMIC_ADD_F32_OFFEN killed renamable $vgpr0, killed renamable $vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), align 1, addrspace 7) + S_ENDPGM 0 +... + +--- +name: flat_atomic +body: | + bb.0: + ; CHECK-LABEL: name: flat_atomic + ; CHECK-NOT: S_SENDMSG 3 + ; DEFAULT-NOT: S_ALLOC_VGPR + ; DVGPR: S_ALLOC_VGPR 0 + ; CHECK: S_ENDPGM 0 + renamable $vgpr0_vgpr1 = FLAT_ATOMIC_DEC_X2_RTN killed renamable $vgpr0_vgpr1, killed renamable $vgpr2_vgpr3, 40, 1, implicit $exec, implicit $flat_scr + S_ENDPGM 0 +... + + +--- +name: global_atomic +body: | + bb.0: + ; CHECK-LABEL: name: global_atomic + ; CHECK-NOT: S_SENDMSG 3 + ; DEFAULT-NOT: S_ALLOC_VGPR + ; DVGPR: S_ALLOC_VGPR 0 + ; CHECK: S_ENDPGM 0 + renamable $vgpr0_vgpr1 = GLOBAL_ATOMIC_INC_X2_SADDR_RTN killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1, 40, 1, implicit $exec + S_ENDPGM 0 +... + +--- +name: image_atomic +body: | + bb.0: + ; CHECK-LABEL: name: image_atomic + ; CHECK-NOT: S_SENDMSG 3 + ; DEFAULT-NOT: S_ALLOC_VGPR + ; DVGPR: S_ALLOC_VGPR 0 + ; CHECK: S_ENDPGM 0 + renamable $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, killed renamable $vgpr4, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, 15, 0, 1, 1, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 7) + S_ENDPGM 0 +... + +--- +name: global_store_optnone +body: | + bb.0: + ; CHECK-LABEL: name: global_store_optnone + ; CHECK-NOT: S_SENDMSG 3 + ; CHECK-NOT: S_ALLOC_VGPR + ; CHECK: S_ENDPGM 0 + GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec + S_WAIT_STORECNT 0 + S_ENDPGM 0 +... >From 3d09c944c2560fd514fd38a6500aa0acbb061707 Mon Sep 17 00:00:00 2001 From: Diana Picus <diana-magda.pi...@amd.com> Date: Thu, 6 Mar 2025 13:49:46 +0100 Subject: [PATCH 2/2] Remove useless -O2 from test --- llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir b/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir index 884b5f8b6f018..d465bf95fbfbe 100644 --- a/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir +++ b/llvm/test/CodeGen/AMDGPU/release-vgprs-gfx12.mir @@ -1,5 +1,5 @@ -# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DEFAULT -# RUN: llc -O2 -march=amdgcn -mcpu=gfx1200 -mattr=+dynamic-vgpr -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DVGPR +# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DEFAULT +# RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=+dynamic-vgpr -run-pass=si-insert-waitcnts -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=CHECK,DVGPR --- | define amdgpu_ps void @tbuffer_store1() { ret void } _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits