llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: None (llvmbot) <details> <summary>Changes</summary> Backport a2263eb Requested by: @<!-- -->arsenm --- Patch is 159.35 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/126776.diff 7 Files Affected: - (modified) llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (+10-7) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll (+6-6) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll (+36-36) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll (+95-95) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll (+117-117) - (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll (+101-101) - (modified) llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir (+24-12) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 873d18e30a430..b40958073a092 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -2611,12 +2611,14 @@ static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) { return NumPasses + 3; } -static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) { - // 2 pass -> 5 - // 4 pass -> 7 - // 8 pass -> 11 - // 16 pass -> 19 - return NumPasses + 3; +static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses, + bool IsGFX950) { + // xdl def cycles | gfx940 | gfx950 + // 2 pass | 5 5 + // 4 pass | 7 8 + // 8 pass | 11 12 + // 16 pass | 19 20 + return NumPasses + 3 + (NumPasses != 2 && IsGFX950); } static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) { @@ -2767,7 +2769,8 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) { } else if (ST.hasGFX940Insts()) { NeedWaitStates = isXDL(ST, *MFMA) - ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses) + ? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates( + NumPasses, ST.hasGFX950Insts()) : GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates( NumPasses); } else { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll index 8d380516df8b5..452033f332659 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll @@ -49,7 +49,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x ; GCN-NEXT: v_mov_b32_e32 v9, s17 ; GCN-NEXT: v_mov_b32_e32 v10, s18 ; GCN-NEXT: v_mov_b32_e32 v11, s19 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 4 ; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 @@ -122,7 +122,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0 ; GCN-NEXT: v_mov_b32_e32 v9, s17 ; GCN-NEXT: v_mov_b32_e32 v10, s18 ; GCN-NEXT: v_mov_b32_e32 v11, s19 -; GCN-NEXT: s_nop 3 +; GCN-NEXT: s_nop 4 ; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 @@ -179,7 +179,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x b ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -224,7 +224,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0, ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -417,7 +417,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat> ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 1 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -459,7 +459,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf ; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 1 +; GCN-NEXT: s_nop 2 ; GCN-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; GCN-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index 44cb4e803ffad..4628a9c15391b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -19,7 +19,7 @@ define <4 x float> @test_mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -39,7 +39,7 @@ define <4 x float> @test_mfma_f32_16x16x32_f16__flags(<8 x half> %arg0, <8 x hal ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -67,7 +67,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -88,7 +88,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd(ptr addrsp ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0) @@ -114,7 +114,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -135,7 +135,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_f16_no_agpr__vgprcd__flags(ptr ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1) @@ -186,7 +186,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; SDAG-NEXT: v_mov_b32_e32 v9, s17 ; SDAG-NEXT: v_mov_b32_e32 v10, s18 ; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 4 ; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 @@ -253,7 +253,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16(<8 x half> %arg0, <8 x hal ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 4 ; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1 @@ -316,7 +316,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; SDAG-NEXT: v_mov_b32_e32 v9, s17 ; SDAG-NEXT: v_mov_b32_e32 v10, s18 ; SDAG-NEXT: v_mov_b32_e32 v11, s19 -; SDAG-NEXT: s_nop 3 +; SDAG-NEXT: s_nop 4 ; SDAG-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1 @@ -383,7 +383,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__flags(<8 x half> %arg0, < ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 4 ; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1 @@ -430,7 +430,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac(<8 x half> %arg0, <8 x half ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -475,7 +475,7 @@ define <16 x float> @test_mfma_f32_32x32x16_f16__mac__flags(<8 x half> %arg0, <8 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -776,7 +776,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -813,7 +813,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac(<8 x half> %ar ; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 @@ -855,7 +855,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; SDAG-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -892,7 +892,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_f16__vgprcd_mac_flags(<8 x hal ; GISEL-NEXT: v_mfma_f32_32x32x16_f16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 @@ -919,7 +919,7 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -939,7 +939,7 @@ define <4 x i32> @test_mfma_i32_16x16x64_i8__flags(<4 x i32> %arg0, <4 x i32> %a ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -971,7 +971,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -992,7 +992,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd(ptr addrspa ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 0, i32 0, i32 0) @@ -1022,7 +1022,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 ; SDAG-NEXT: s_nop 1 ; SDAG-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] ; SDAG-NEXT: s_endpgm ; @@ -1043,7 +1043,7 @@ define amdgpu_kernel void @test_mfma_i32_16x16x64_i8_no_agpr__vgprcd__flags(ptr ; GISEL-NEXT: s_nop 1 ; GISEL-NEXT: v_mfma_i32_16x16x64_i8 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 +; GISEL-NEXT: s_nop 6 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] ; GISEL-NEXT: s_endpgm %result = call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x64.i8(<4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> %arg2, i32 3, i32 2, i32 1) @@ -1097,7 +1097,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; SDAG-NEXT: v_mov_b32_e32 v1, s17 ; SDAG-NEXT: v_mov_b32_e32 v2, s18 ; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 @@ -1169,7 +1169,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8(<4 x i32> %arg0, <4 x i32> ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 4 ; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1 @@ -1233,7 +1233,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; SDAG-NEXT: v_mov_b32_e32 v1, s17 ; SDAG-NEXT: v_mov_b32_e32 v2, s18 ; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: s_nop 6 +; SDAG-NEXT: s_nop 7 ; SDAG-NEXT: global_store_dwordx4 v[8:9], a[28:31], off sc0 sc1 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: global_store_dwordx4 v[10:11], a[24:27], off sc0 sc1 @@ -1305,7 +1305,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__flags(<4 x i32> %arg0, <4 ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15] ; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[18:19] -; GISEL-NEXT: s_nop 3 +; GISEL-NEXT: s_nop 4 ; GISEL-NEXT: global_store_dwordx4 v[20:21], a[16:19], off sc0 sc1 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: global_store_dwordx4 v[22:23], a[20:23], off sc0 sc1 @@ -1352,7 +1352,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac(<4 x i32> %arg0, <4 x i32> %ar ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1397,7 +1397,7 @@ define <16 x i32> @test_mfma_i32_32x32x32_i8__mac__flags(<4 x i32> %arg0, <4 x i ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1 ; GCN-NEXT: s_nop 7 -; GCN-NEXT: s_nop 2 +; GCN-NEXT: s_nop 3 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1717,7 +1717,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -1754,7 +1754,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac(<4 x i32> %arg0 ; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 @@ -1801,7 +1801,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; SDAG-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 ; SDAG-NEXT: v_mov_b32_e32 v0, 0 ; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 +; SDAG-NEXT: s_nop 2 ; SDAG-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48 ; SDAG-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 ; SDAG-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 @@ -1838,7 +1838,7 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> ; GISEL-NEXT: v_mfma_i32_32x32x32_i8 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:3 abid:2 blgp:1 ; GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 +; GISEL-NEXT: s_nop 2 ; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1] ; GISEL-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16 ; GISEL-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32 @@ -1865,7 +1865,7 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1885,7 +1885,7 @@ define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x ; GCN-NEXT: v_accvgpr_write_b32 a3, v11 ; GCN-NEXT: s_nop 1 ; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; GCN-NEXT: s_nop 6 +; GCN-NEXT: s_nop 7 ; GCN-NEXT: v_accvgpr_read_b32 v0, a0 ; GCN-NEXT: v_accvgpr_read_b32 v1, a1 ; GCN-NEXT: v_accvgpr_read_b32 v2, a2 @@ -1913,7 +1913,7 @@ define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrs ; GCN-NEXT: v_accvgpr_write_b... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/126776 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits