https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/117287
>From b4858a252d18dd63aa3b88c2685b41fa9a604b0c Mon Sep 17 00:00:00 2001 From: Matt Arsenault <matthew.arsena...@amd.com> Date: Mon, 18 Mar 2024 14:41:11 +0530 Subject: [PATCH] AMDGPU: Handle gfx950 valu write vdst + permlane read hazard --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 30 ++++- llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir | 113 ++++++++++++++++++ .../AMDGPU/llvm.amdgcn.permlane16.swap.ll | 6 + .../AMDGPU/llvm.amdgcn.permlane32.swap.ll | 6 + 4 files changed, 153 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 97995560842090..4c37ef8855a5ba 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -2551,8 +2551,34 @@ int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) { return isVCmpXWritesExec(*TII, *TRI, MI); }; - const int NumWaitStates = 4; - return NumWaitStates - getWaitStatesSince(IsVCmpXWritesExecFn, NumWaitStates); + auto IsVALUFn = [](const MachineInstr &MI) { + return SIInstrInfo::isVALU(MI); + }; + + const int VCmpXWritesExecWaitStates = 4; + const int VALUWritesVDstWaitStates = 2; + int WaitStatesNeeded = 0; + + for (const MachineOperand &Op : MI->explicit_uses()) { + if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg())) + continue; + Register Reg = Op.getReg(); + + int WaitStatesSinceDef = + VALUWritesVDstWaitStates - + getWaitStatesSinceDef(Reg, IsVALUFn, + /*MaxWaitStates=*/VALUWritesVDstWaitStates); + WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef); + if (WaitStatesNeeded >= VALUWritesVDstWaitStates) + break; + } + + int VCmpXHazardWaits = + VCmpXWritesExecWaitStates - + getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates); + + WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits); + return WaitStatesNeeded; } static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) { diff --git a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir index 97bef7be711ff2..75834316750951 100644 --- a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir +++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir @@ -142,3 +142,116 @@ body: | $vgpr4 = V_MOV_B32_e32 0, implicit $exec renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec ... + +--- +# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_0 +# GCN: V_MOV_B32 +# GCN-NEXT: S_NOP 1 +# GCN-NEXT: V_PERMLANE +name: valu_write_vdst_read_permlane16_swap_0 +body: | + bb.0: + liveins: $vgpr1 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec +... + +--- +# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_1 +# GCN: V_MOV_B32 +# GCN-NEXT: S_NOP 1 +# GCN-NEXT: V_PERMLANE +name: valu_write_vdst_read_permlane16_swap_1 +body: | + bb.0: + liveins: $vgpr0 + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec +... + +--- +# GCN-LABEL: name: valu_write_vdst_read_permlane32_swap_0 +# GCN: V_MOV_B32 +# GCN-NEXT: S_NOP 1 +# GCN-NEXT: V_PERMLANE +name: valu_write_vdst_read_permlane32_swap_0 +body: | + bb.0: + liveins: $vgpr1 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec +... + +--- +# GCN-LABEL: name: valu_write_vdst_read_permlane32_swap_1 +# GCN: V_MOV_B32 +# GCN-NEXT: S_NOP 1 +# GCN-NEXT: V_PERMLANE +name: valu_write_vdst_read_permlane32_swap_1 +body: | + bb.0: + liveins: $vgpr0 + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec +... + +--- +# No hazard, write of other register +# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_0_otherreg +# GCN: V_MOV_B32 +# GCN-NEXT: V_PERMLANE +name: valu_write_vdst_read_permlane16_swap_0_otherreg +body: | + bb.0: + liveins: $vgpr1 + $vgpr2 = V_MOV_B32_e32 0, implicit $exec + renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec +... + +--- +# Both permlane hazards at once. +# GCN-LABEL: name: valu_writes_vdst__vcmpx_write_exec__permlane32_swap +# GCN: V_MOV_B32 +# GCN: V_CMPX_EQ_I32 +# GCN-NEXT: S_NOP 3 +# GCN-NEXT: V_PERMLANE +name: valu_writes_vdst__vcmpx_write_exec__permlane32_swap +body: | + bb.0: + liveins: $vgpr0, $vgpr2, $vgpr3 + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec + renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec +... + +--- +# GCN-LABEL: name: vcmpx_write_exec__valu_writes_vdst___permlane32_swap +# GCN: V_CMPX_EQ_I32 +# GCN: V_MOV_B32 +# GCN-NEXT: S_NOP 2 +# GCN-NEXT: V_PERMLANE +name: vcmpx_write_exec__valu_writes_vdst___permlane32_swap +body: | + bb.0: + liveins: $vgpr0, $vgpr2, $vgpr3 + $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec +... + +--- +# GCN-LABEL: name: vcmpx_write_exec__valu_writes_vdstx2___permlane32_swap +# GCN: V_CMPX_EQ_I32 +# GCN: V_MOV_B32 +# GCN: V_MOV_B32 +# GCN-NEXT: S_NOP 1 +# GCN-NEXT: V_PERMLANE +name: vcmpx_write_exec__valu_writes_vdstx2___permlane32_swap +body: | + bb.0: + liveins: $vgpr0, $vgpr2, $vgpr3 + $exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec + $vgpr1 = V_MOV_B32_e32 0, implicit $exec + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll index 0d5dfa46c2c260..e1cebe28f7fe8a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll @@ -26,6 +26,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vi(i32 %vdst_old) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 1, i1 false, i1 false) @@ -37,6 +38,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vl(i32 %vdst_old) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, 0xc1d1 +; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 49617, i1 false, i1 false) @@ -49,6 +51,7 @@ define { i32, i32 } @v_permlane16_swap_b32_iv(i32 %src0_old) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 1, i32 %src0_old, i1 false, i1 false) @@ -61,6 +64,7 @@ define { i32, i32 } @v_permlane16_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false) @@ -73,6 +77,7 @@ define { i32, i32 } @v_permlane16_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false) @@ -84,6 +89,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll index e3b0879af4307d..121c379053fcf7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll @@ -26,6 +26,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vi(i32 %vdst_old) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, 1 +; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 1, i1 false, i1 false) @@ -37,6 +38,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vl(i32 %vdst_old) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, 0xc1d1 +; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 49617, i1 false, i1 false) @@ -49,6 +51,7 @@ define { i32, i32 } @v_permlane32_swap_b32_iv(i32 %src0_old) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 1, i32 %src0_old, i1 false, i1 false) @@ -61,6 +64,7 @@ define { i32, i32 } @v_permlane32_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false) @@ -73,6 +77,7 @@ define { i32, i32 } @v_permlane32_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, v0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false) @@ -84,6 +89,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false) _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits