https://github.com/llvmbot created https://github.com/llvm/llvm-project/pull/126744
Backport c837f57 Requested by: @arsenm From 8434879a3566ae040f1915456e9a79fd3192185a Mon Sep 17 00:00:00 2001 From: Vigneshwar Jayakumar <vigneshwar.jayaku...@amd.com> Date: Tue, 11 Feb 2025 09:30:16 -0600 Subject: [PATCH] AMDGPU: Handle gfx950 XDL-write-VGPR-Overlap-Src-AB wait state (#126732) gfx950 needs more wait states than gfx940 for this hazard (cherry picked from commit c837f572865eb2980b82a8415da45dc1157627bf) --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 16 ++--- .../CodeGen/AMDGPU/mai-hazards-gfx940.mir | 60 ++++++++++++------- .../AMDGPU/mai-hazards-mfma-scale.gfx950.mir | 4 +- 3 files changed, 51 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 873d18e30a430..844441308275f 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -2297,12 +2297,14 @@ GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) { return NumPasses + 2; } -static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) { - // 2 pass -> 5 - // 4 pass -> 7 - // 8 pass -> 11 - // 16 pass -> 19 - return NumPasses + 3; +static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses, + bool IsGFX950) { + // xdl def cycles | gfx940 | gfx950 + // 2 pass | 5 5 + // 4 pass | 7 8 + // 8 pass | 11 12 + // 16 pass | 19 20 + return NumPasses + 3 + (NumPasses != 2 && IsGFX950); } int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { @@ -2471,7 +2473,7 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) { NeedWaitStates = isXDL(ST, *MI1) ? 
GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates( - NumPasses) + NumPasses, ST.hasGFX950Insts()) : GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates( NumPasses); break; diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir index 52891989b88fb..1eb7ec4c142f2 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir @@ -417,7 +417,8 @@ body: | # GCN-LABEL: name: xdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA name: xdl_sgemm16x16_mfma_write_agpr_mfma_srca_read_overlap body: | @@ -439,7 +440,8 @@ body: | # GCN-LABEL: name: smfmac32x32_write_agpr_mfma_srca_read_overlap # GCN: V_SMFMAC # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA name: smfmac32x32_write_agpr_mfma_srca_read_overlap body: | @@ -450,7 +452,8 @@ body: | # GCN-LABEL: name: smfmac32x32_write_agpr_smfmac_srcc_read_overlap # GCN: V_SMFMAC # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_SMFMAC name: smfmac32x32_write_agpr_smfmac_srcc_read_overlap body: | @@ -462,7 +465,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA name: xdl_sgemm32x32_mfma_write_agpr_mfma_srca_read_overlap body: | @@ -1715,7 +1719,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_agpr_mfma_srca_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 6 +# GFX940-NEXT: S_NOP 6 +# GFX950-NEXT: S_NOP 7 # GCN-NEXT: V_MFMA name: xdl_sgemm16X16X16_mfma_write_agpr_mfma_srca_read_overlap body: | @@ -1725,7 +1730,8 @@ body: | ... 
# GCN-LABEL: name: xdl_sgemm16X16X32_mfma_write_agpr_mfma_srcb_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 6 +# GFX940-NEXT: S_NOP 6 +# GFX950-NEXT: S_NOP 7 # GCN-NEXT: V_MFMA name: xdl_sgemm16X16X32_mfma_write_agpr_mfma_srcb_read_overlap body: | @@ -1735,7 +1741,8 @@ body: | ... # GCN-LABEL: name: xdl_sgemm16X16X16_mfma_write_vgpr_dmfma16x16_srca_read_overlap # GCN: V_MFMA -# GCN-NEXT: S_NOP 6 +# GFX940-NEXT: S_NOP 6 +# GFX950-NEXT: S_NOP 7 # GCN-NEXT: V_MFMA name: xdl_sgemm16X16X16_mfma_write_vgpr_dmfma16x16_srca_read_overlap body: | @@ -1826,7 +1833,8 @@ body: | ... # GCN-LABEL: name: smfmac16x16x32_mfma_write_vgpr_smfmac_read_idx # GCN: V_SMFMAC -# GCN-NEXT: S_NOP 6 +# GFX940-NEXT: S_NOP 6 +# GFX950-NEXT: S_NOP 7 # GCN-NEXT: V_SMFMAC name: smfmac16x16x32_mfma_write_vgpr_smfmac_read_idx body: | @@ -2188,7 +2196,8 @@ body: | # 4 pass source # GCN-LABEL: name: xdl_mfma_4pass_write_vgpr_xdl_mfma_read_overlap_srca # GCN: V_MFMA -# GCN-NEXT: S_NOP 6 +# GFX940-NEXT: S_NOP 6 +# GFX950-NEXT: S_NOP 7 # GCN-NEXT: V_MFMA name: xdl_mfma_4pass_write_vgpr_xdl_mfma_read_overlap_srca body: | @@ -2202,7 +2211,8 @@ body: | # 4 pass source # GCN-LABEL: name: xdl_mfma_4pass_write_vgpr_xdl_mfma_read_overlap_srcb # GCN: V_MFMA -# GCN-NEXT: S_NOP 6 +# GFX940-NEXT: S_NOP 6 +# GFX950-NEXT: S_NOP 7 # GCN-NEXT: V_MFMA name: xdl_mfma_4pass_write_vgpr_xdl_mfma_read_overlap_srcb body: | @@ -2276,7 +2286,8 @@ body: | # 4 pass source # GCN-LABEL: name: xdl_mfma_4pass_write_vgpr_sgemm_mfma_read_overlap_srca # GCN: V_MFMA -# GCN-NEXT: S_NOP 6 +# GFX940-NEXT: S_NOP 6 +# GFX950-NEXT: S_NOP 7 # GCN-NEXT: V_MFMA name: xdl_mfma_4pass_write_vgpr_sgemm_mfma_read_overlap_srca body: | @@ -2290,7 +2301,8 @@ body: | # 4 pass source # GCN-LABEL: name: xdl_mfma_4pass_write_vgpr_sgemm_mfma_read_overlap_srcb # GCN: V_MFMA -# GCN-NEXT: S_NOP 6 +# GFX940-NEXT: S_NOP 6 +# GFX950-NEXT: S_NOP 7 # GCN-NEXT: V_MFMA name: xdl_mfma_4pass_write_vgpr_sgemm_mfma_read_overlap_srcb body: | @@ -2321,7 +2333,8 @@ body: | # 
GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca body: | @@ -2336,7 +2349,8 @@ body: | # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA name: xdl_mfma_8pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb body: | @@ -2370,7 +2384,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA name: xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srca body: | @@ -2386,7 +2401,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA name: xdl_16pass_write_vgpr_nonxdl_sgemm_mfma_read_overlap_srcb body: | @@ -2456,7 +2472,8 @@ body: | # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srca # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srca body: | @@ -2470,7 +2487,8 @@ body: | # GCN-LABEL: name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcb # GCN: V_MFMA # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA name: xdl_mfma_8pass_write_vgpr_xdl_mfma_read_overlap_srcb body: | @@ -2502,7 +2520,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA name: xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srca body: | @@ -2519,7 +2538,8 @@ body: | # GCN: V_MFMA # GCN-NEXT: S_NOP 7 # GCN-NEXT: S_NOP 7 -# GCN-NEXT: S_NOP 2 +# 
GFX940-NEXT: S_NOP 2 +# GFX950-NEXT: S_NOP 3 # GCN-NEXT: V_MFMA name: xdl_16pass_write_vgpr_xdl_mfma_read_overlap_srcb body: | diff --git a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir index 433236180b137..4585eca8fe894 100644 --- a/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir +++ b/llvm/test/CodeGen/AMDGPU/mai-hazards-mfma-scale.gfx950.mir @@ -254,7 +254,7 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 7 - ; GCN-NEXT: S_NOP 2 + ; GCN-NEXT: S_NOP 3 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 0, 0, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec @@ -275,7 +275,7 @@ body: | ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $sgpr4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable 
$vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec - ; GCN-NEXT: S_NOP 6 + ; GCN-NEXT: S_NOP 7 ; GCN-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, killed $vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, killed $sgpr4, killed $vgpr21, 12, 4, implicit $mode, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_vgprcd_e64 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19, 2, 2, $sgpr4, $vgpr21, 12, 4, implicit $mode, implicit $exec _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits