https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/77927
>From 3f3bcdb89adf032e26c95807abf5e3b23ff50e4a Mon Sep 17 00:00:00 2001 From: Jay Foad <jay.f...@amd.com> Date: Fri, 12 Jan 2024 12:24:28 +0000 Subject: [PATCH 1/3] Precommit extra GFX12 test coverage --- .../GlobalISel/inst-select-mad_64_32.mir | 21 ++ llvm/test/CodeGen/AMDGPU/llvm.mulo.ll | 163 ++++++++++++++ llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 211 ++++++++++++++++++ 3 files changed, 395 insertions(+) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir index 698281caca245e..6e33ef37397d6b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -march=amdgcn -mcpu=gfx1030 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o - 2>%t | FileCheck -check-prefix=GFX10 %s # RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o - 2>%t | FileCheck -check-prefix=GFX11 %s +# RUN: llc -march=amdgcn -mcpu=gfx1200 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o - 2>%t | FileCheck -check-prefix=GFX12 %s --- name: mad_u64_u32_vvv @@ -18,6 +19,7 @@ body: | ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3 ; GFX10-NEXT: [[V_MAD_U64_U32_e64_:%[0-9]+]]:vreg_64, [[V_MAD_U64_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_e64_]], implicit [[V_MAD_U64_U32_e64_1]] + ; ; GFX11-LABEL: name: mad_u64_u32_vvv ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -26,6 +28,15 @@ body: | ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3 ; GFX11-NEXT: [[V_MAD_U64_U32_gfx11_e64_:%[0-9]+]]:vreg_64, [[V_MAD_U64_U32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_gfx11_e64_]], implicit [[V_MAD_U64_U32_gfx11_e64_1]] + ; + ; GFX12-LABEL: name: mad_u64_u32_vvv + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3 + ; GFX12-NEXT: [[V_MAD_U64_U32_gfx11_e64_:%[0-9]+]]:vreg_64, [[V_MAD_U64_U32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_gfx11_e64_]], implicit [[V_MAD_U64_U32_gfx11_e64_1]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -51,6 +62,7 @@ body: | ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3 ; GFX10-NEXT: [[V_MAD_I64_I32_e64_:%[0-9]+]]:vreg_64, [[V_MAD_I64_I32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_e64_]], implicit [[V_MAD_I64_I32_e64_1]] + ; ; GFX11-LABEL: name: mad_i64_i32_vvv ; GFX11: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GFX11-NEXT: {{ $}} @@ -59,6 +71,15 @@ body: | ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3 ; GFX11-NEXT: [[V_MAD_I64_I32_gfx11_e64_:%[0-9]+]]:vreg_64, [[V_MAD_I64_I32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_gfx11_e64_]], implicit [[V_MAD_I64_I32_gfx11_e64_1]] + ; + ; GFX12-LABEL: name: mad_i64_i32_vvv + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3 + ; GFX12-NEXT: [[V_MAD_I64_I32_gfx11_e64_:%[0-9]+]]:vreg_64, [[V_MAD_I64_I32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_gfx11_e64_]], implicit [[V_MAD_I64_I32_gfx11_e64_1]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll index 249acec639540b..b9b03e52ec865c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -3,6 +3,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefix=GFX12 %s define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) { ; SI-LABEL: umulo_i64_v_v: @@ -97,6 +98,32 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) { ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: umulo_i64_v_v: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v2, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[6:7], null, v4, v3, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v5, v2, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], null, v5, v3, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v4, v1 +; GFX12-NEXT: v_add3_u32 v1, v1, v6, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v4, v6 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v9, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v10 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] +; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] bb: %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y) ret { i64, i1 } %umulo @@ -248,6 +275,47 @@ define { i64, i1 } @smulo_i64_v_v(i64 %x, i64 %y) { ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: smulo_i64_v_v: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v2, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[6:7], null, v4, v3, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v5, v2, 0 +; GFX12-NEXT: v_mad_co_i64_i32 v[10:11], null, v5, v3, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v12, v1 +; GFX12-NEXT: v_add3_u32 v1, v1, v6, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_co_u32 v12, vcc_lo, v12, v6 +; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v7, vcc_lo +; GFX12-NEXT: v_add_co_u32 v12, vcc_lo, v12, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, v7, v9, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v11, vcc_lo +; GFX12-NEXT: v_add_co_u32 v7, vcc_lo, v7, v10 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo +; GFX12-NEXT: v_sub_co_u32 v2, vcc_lo, v7, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v9, vcc_lo +; GFX12-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5 +; GFX12-NEXT: v_cndmask_b32_e32 v6, v7, v2, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc_lo +; GFX12-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX12-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo +; GFX12-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3 +; GFX12-NEXT: v_mov_b32_e32 v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_cndmask_b32 v4, v6, v4 :: v_dual_cndmask_b32 v5, v5, v7 +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3] +; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] bb: %smulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y) ret { i64, i1 } %smulo @@ -375,6 +443,34 @@ define amdgpu_kernel void @umulo_i64_s(i64 %x, i64 %y) { ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: umulo_i64_s: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_mul_hi_u32 s7, s0, s3 +; GFX12-NEXT: s_mul_i32 s6, s0, s3 +; GFX12-NEXT: s_mul_hi_u32 s4, s0, s2 +; GFX12-NEXT: s_mul_i32 s10, s1, s2 +; GFX12-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[6:7] +; GFX12-NEXT: s_mul_hi_u32 s9, s1, s2 +; GFX12-NEXT: s_mul_hi_u32 s11, s1, s3 +; GFX12-NEXT: s_add_co_u32 s4, s6, s10 +; GFX12-NEXT: s_add_co_ci_u32 s4, s7, s9 +; GFX12-NEXT: s_mul_i32 s8, s1, s3 +; GFX12-NEXT: s_add_co_ci_u32 s9, s11, 0 +; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3] +; GFX12-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[8:9] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX12-NEXT: s_cselect_b32 s0, 0, s0 +; GFX12-NEXT: s_cselect_b32 s1, 0, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm bb: %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %x, i64 %y) %mul = extractvalue { i64, i1 } %umulo, 0 @@ -560,6 +656,48 @@ define amdgpu_kernel void @smulo_i64_s(i64 %x, i64 %y) { ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: smulo_i64_s: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GFX12-NEXT: s_mov_b32 s5, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_mul_hi_u32 s7, s0, s3 +; GFX12-NEXT: s_mul_i32 s6, s0, s3 +; GFX12-NEXT: s_mul_hi_u32 s4, s0, s2 +; GFX12-NEXT: s_mul_i32 s10, s1, s2 +; GFX12-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[6:7] +; GFX12-NEXT: s_mul_hi_u32 s9, s1, s2 +; GFX12-NEXT: s_mul_hi_i32 s11, s1, s3 +; GFX12-NEXT: s_add_co_u32 s4, s6, s10 +; GFX12-NEXT: s_add_co_ci_u32 s4, s7, s9 +; GFX12-NEXT: s_mul_i32 s8, s1, s3 +; GFX12-NEXT: s_add_co_ci_u32 s9, s11, 0 +; GFX12-NEXT: s_cmp_lt_i32 s1, 0 +; GFX12-NEXT: s_add_nc_u64 s[6:7], s[4:5], s[8:9] +; GFX12-NEXT: s_mov_b32 s4, s2 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_sub_nc_u64 s[8:9], s[6:7], s[4:5] +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_cselect_b32 s7, s9, s7 +; GFX12-NEXT: s_cselect_b32 s6, s8, s6 +; GFX12-NEXT: s_cmp_lt_i32 s3, 0 +; GFX12-NEXT: s_sub_nc_u64 s[4:5], s[6:7], s[4:5] +; GFX12-NEXT: s_mul_u64 s[0:1], s[0:1], s[2:3] +; GFX12-NEXT: s_cselect_b32 s3, s5, s7 +; GFX12-NEXT: s_cselect_b32 s2, s4, s6 +; GFX12-NEXT: s_ashr_i32 s4, s1, 31 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s5, s4 +; GFX12-NEXT: s_cmp_lg_u64 s[2:3], s[4:5] +; GFX12-NEXT: s_cselect_b32 s0, 0, s0 +; GFX12-NEXT: s_cselect_b32 s1, 0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm bb: %umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %x, i64 %y) %mul = extractvalue { i64, i1 } %umulo, 0 @@ -618,6 +756,19 @@ define { i64, i1 } @smulo_i64_v_4(i64 %i) { ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: smulo_i64_v_4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[0:1] +; GFX12-NEXT: v_alignbit_b32 v3, v1, v0, 30 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_ashrrev_i64 v[5:6], 2, v[4:5] +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] bb: %umulo = tail call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %i, i64 4) ret { i64, i1 } %umulo @@ -674,6 +825,18 @@ define { i64, i1 } @umulo_i64_v_4(i64 %i) { ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: umulo_i64_v_4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_and_b32 v7, 0x3fffffff, v1 +; GFX12-NEXT: v_lshlrev_b64_e32 v[4:5], 2, v[0:1] +; GFX12-NEXT: v_alignbit_b32 v3, v1, v0, 30 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1] +; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] bb: %umulo = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %i, i64 4) ret { i64, i1 } %umulo diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index bd2e5cc5952bff..6fbcb74306d6dd 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -3,6 +3,7 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s ; On GFX11, ensure vdst and src2 do not partially overlap. Full overlap is ok. @@ -35,6 +36,14 @@ define i64 @mad_i64_i32_sextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: mad_i64_i32_sextops: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v5, v4, v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i64 %sext1 = sext i32 %arg1 to i64 %mul = mul i64 %sext0, %sext1 @@ -71,6 +80,14 @@ define i64 @mad_i64_i32_sextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: mad_i64_i32_sextops_commute: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v5, v4, v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i64 %sext1 = sext i32 %arg1 to i64 %mul = mul i64 %sext0, %sext1 @@ -107,6 +124,14 @@ define i64 @mad_u64_u32_zextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: mad_u64_u32_zextops: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v4, v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = zext i32 %arg0 to i64 %sext1 = zext i32 %arg1 to i64 %mul = mul i64 %sext0, %sext1 @@ -143,6 +168,14 @@ define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: mad_u64_u32_zextops_commute: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v4, v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = zext i32 %arg0 to i64 %sext1 = zext i32 %arg1 to i64 %mul = mul i64 %sext0, %sext1 @@ -265,6 +298,36 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 { ; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: mad_i64_i32_sextops_i32_i128: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mad_co_u64_u32 v[6:7], null, v0, v1, 0 +; GFX12-NEXT: v_mov_b32_e32 v8, 0 +; GFX12-NEXT: v_ashrrev_i32_e32 v14, 31, v0 +; GFX12-NEXT: v_ashrrev_i32_e32 v15, 31, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v14, v1, v[7:8] +; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v8 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v0, v15, v[9:10] +; GFX12-NEXT: v_mov_b32_e32 v10, v8 +; GFX12-NEXT: v_mad_co_i64_i32 v[8:9], null, v1, v14, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_add_co_u32 v10, s0, v11, v10 +; GFX12-NEXT: v_add_co_ci_u32_e64 v11, null, 0, 0, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_i64_i32 v[12:13], null, v15, v0, v[8:9] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v14, v15, v[10:11] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_co_u32 v8, vcc_lo, v0, v12 +; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v13, vcc_lo +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo +; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i128 %sext1 = sext i32 %arg1 to i128 %mul = mul i128 %sext0, %sext1 @@ -301,6 +364,14 @@ define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: mad_i64_i32_sextops_i32_i63: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v5, v4, v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i63 %sext1 = sext i32 %arg1 to i63 %mul = mul i63 %sext0, %sext1 @@ -346,6 +417,15 @@ define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: mad_i64_i32_sextops_i31_i63: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_bfe_i32 v4, v1, 0, 31 +; GFX12-NEXT: v_bfe_i32 v5, v0, 0, 31 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v5, v4, v[2:3] +; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i31 %arg0 to i63 %sext1 = sext i31 %arg1 to i63 %mul = mul i63 %sext0, %sext1 @@ -397,6 +477,18 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v4, v[3:4] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: mad_i64_i32_extops_i32_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_ashrrev_i32_e32 v0, 31, v4 +; GFX12-NEXT: v_mul_lo_u32 v6, v0, v5 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v5, v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_nc_u32_e32 v1, v6, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ext0 = sext i32 %arg0 to i64 %ext1 = zext i32 %arg1 to i64 %mul = mul i64 %ext0, %ext1 @@ -433,6 +525,14 @@ define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[4:5] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: mad_u64_u32_bitops: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v2, v[4:5] +; GFX12-NEXT: s_setpc_b64 s[30:31] %trunc.lhs = and i64 %arg0, 4294967295 %trunc.rhs = and i64 %arg1, 4294967295 %mul = mul i64 %trunc.lhs, %trunc.rhs @@ -483,6 +583,17 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: mad_u64_u32_bitops_lhs_mask_small: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_and_b32 v6, 1, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v2, v[4:5] +; GFX12-NEXT: v_mul_lo_u32 v2, v6, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_nc_u32_e32 v1, v2, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] %trunc.lhs = and i64 %arg0, 8589934591 %trunc.rhs = and i64 %arg1, 4294967295 %mul = mul i64 %trunc.lhs, %trunc.rhs @@ -534,6 +645,17 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: mad_u64_u32_bitops_rhs_mask_small: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_and_b32 v3, 1, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v2, v[4:5] +; GFX12-NEXT: v_mul_lo_u32 v2, v6, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add_nc_u32_e32 v1, v2, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] %trunc.lhs = and i64 %arg0, 4294967295 %trunc.rhs = and i64 %arg1, 8589934591 %mul = mul i64 %trunc.lhs, %trunc.rhs @@ -570,6 +692,14 @@ define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v3, v2, v[4:5] ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: mad_i64_i32_bitops: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mov_b32_e32 v3, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v3, v2, v[4:5] +; GFX12-NEXT: s_setpc_b64 s[30:31] %shl.lhs = shl i64 %arg0, 32 %trunc.lhs = ashr i64 %shl.lhs, 32 %shl.rhs = shl i64 %arg1, 32 @@ -609,6 +739,14 @@ define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: mad_i64_i32_unpack_i64ops: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v1, v0, v[0:1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] %tmp4 = lshr i64 %arg0, 32 %tmp5 = and i64 %arg0, 4294967295 %mul = mul nuw i64 %tmp4, %tmp5 @@ -682,6 +820,26 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0, ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: mad_i64_i32_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX12-NEXT: s_mov_b32 s3, 0 +; GFX12-NEXT: s_waitcnt lgkmcnt(0) +; GFX12-NEXT: s_mov_b32 s2, s6 +; GFX12-NEXT: s_mov_b32 s6, s7 +; GFX12-NEXT: s_mov_b32 s7, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_mul_u64 s[2:3], s[2:3], s[6:7] +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[2:3], s[0:1] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[4:5] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %ext0 = zext i32 %arg0 to i64 %ext1 = zext i32 %arg1 to i64 %mul = mul i64 %ext0, %ext1 @@ -731,6 +889,16 @@ define i64 @mad_i64_i32_twice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3) #0 { ; GFX11-NEXT: v_xor_b32_e32 v0, v6, v2 ; GFX11-NEXT: v_xor_b32_e32 v1, v7, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: mad_i64_i32_twice: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mad_co_i64_i32 v[6:7], null, v0, v1, v[2:3] +; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, v0, v1, v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_xor_b32_e32 v0, v6, v2 +; GFX12-NEXT: v_xor_b32_e32 v1, v7, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i64 %sext1 = sext i32 %arg1 to i64 %mul = mul i64 %sext0, %sext1 @@ -804,6 +972,25 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 % ; GFX11-NEXT: v_xor_b32_e32 v0, v0, v4 ; GFX11-NEXT: v_xor_b32_e32 v1, v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: mad_i64_i32_thrice: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mad_co_i64_i32 v[8:9], null, v0, v1, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v8, v2 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v3, vcc_lo +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v8, v4 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v8, v6 +; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v9, v7, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX12-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_xor_b32_e32 v0, v0, v4 +; GFX12-NEXT: v_xor_b32_e32 v1, v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i64 %sext1 = sext i32 %arg1 to i64 %mul = mul i64 %sext0, %sext1 @@ -857,6 +1044,18 @@ define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX11-NEXT: v_xor_b32_e32 v0, v0, v4 ; GFX11-NEXT: v_xor_b32_e32 v1, v1, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: mad_i64_i32_secondary_use: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_mad_co_i64_i32 v[4:5], null, v0, v1, 0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v3, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_xor_b32_e32 v0, v0, v4 +; GFX12-NEXT: v_xor_b32_e32 v1, v1, v5 +; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i64 %sext1 = sext i32 %arg1 to i64 %mul = mul i64 %sext0, %sext1 @@ -913,6 +1112,18 @@ define i48 @mad_i48_i48(i48 %arg0, i48 %arg1, i48 %arg2) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add3_u32 v1, v2, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: mad_i48_i48: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX12-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v7, v2, v[4:5] +; GFX12-NEXT: v_mul_lo_u32 v3, v7, v3 +; GFX12-NEXT: v_mul_lo_u32 v2, v6, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_add3_u32 v1, v2, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] %m = mul i48 %arg0, %arg1 %a = add i48 %m, %arg2 ret i48 %a >From 8433a18587fa7b2a8dfcc05de7e0c528565552e6 Mon Sep 17 00:00:00 2001 From: Jay Foad <jay.f...@amd.com> Date: Fri, 12 Jan 2024 12:28:49 +0000 Subject: [PATCH 2/3] [AMDGPU] Disable V_MAD_U64_U32/V_MAD_I64_I32 workaround for GFX12 --- llvm/lib/Target/AMDGPU/AMDGPU.td | 1 - .../GlobalISel/inst-select-mad_64_32.mir | 8 +- .../AMDGPU/atomic_optimizations_buffer.ll | 15 +- .../atomic_optimizations_global_pointer.ll | 40 +++--- .../AMDGPU/atomic_optimizations_raw_buffer.ll | 15 +- .../atomic_optimizations_struct_buffer.ll | 15 +- llvm/test/CodeGen/AMDGPU/llvm.mulo.ll | 14 +- llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 128 ++++++++---------- llvm/test/CodeGen/AMDGPU/mul.ll | 43 +++--- 9 files changed, 126 insertions(+), 153 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index b27edb1e9e14bb..023a4260d76a37 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1501,7 +1501,6 @@ def FeatureISAVersion12 : FeatureSet< FeaturePseudoScalarTrans, FeatureHasRestrictedSOffset, FeatureVGPRSingleUseHintInsts, - FeatureMADIntraFwdBug, FeatureScalarDwordx3Loads]>; //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir index 6e33ef37397d6b..59f6114ca5cd3a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-mad_64_32.mir @@ -35,8 +35,8 @@ body: | ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3 - ; GFX12-NEXT: [[V_MAD_U64_U32_gfx11_e64_:%[0-9]+]]:vreg_64, [[V_MAD_U64_U32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX12-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_gfx11_e64_]], implicit [[V_MAD_U64_U32_gfx11_e64_1]] + ; GFX12-NEXT: [[V_MAD_U64_U32_e64_:%[0-9]+]]:vreg_64, [[V_MAD_U64_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_U64_U32_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0, implicit [[V_MAD_U64_U32_e64_]], implicit [[V_MAD_U64_U32_e64_1]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 @@ -78,8 +78,8 @@ body: | ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY $vgpr3 - ; GFX12-NEXT: [[V_MAD_I64_I32_gfx11_e64_:%[0-9]+]]:vreg_64, [[V_MAD_I64_I32_gfx11_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_gfx11_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec - ; GFX12-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_gfx11_e64_]], implicit [[V_MAD_I64_I32_gfx11_e64_1]] + ; GFX12-NEXT: [[V_MAD_I64_I32_e64_:%[0-9]+]]:vreg_64, [[V_MAD_I64_I32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_MAD_I64_I32_e64 [[COPY]], [[COPY1]], [[COPY2]], 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0, implicit [[V_MAD_I64_I32_e64_]], implicit [[V_MAD_I64_I32_e64_1]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s32) = COPY $vgpr2 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index e3d2ecefbda30d..b63a50dc4e6e7a 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -520,13 +520,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: .LBB1_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX12W64-NEXT: s_waitcnt vmcnt(0) ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3] ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -553,12 +552,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: .LBB1_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX12W32-NEXT: s_waitcnt vmcnt(0) -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 +; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 +; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5] +; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index b4c8da44337ae5..42bef4faf8a49b 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -572,13 +572,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: global_inv scope:SCOPE_DEV ; GFX1264-NEXT: .LBB1_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX1264-NEXT: s_waitcnt lgkmcnt(0) -; GFX1264-NEXT: v_mul_lo_u32 v0, s8, v0 ; GFX1264-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1264-NEXT: s_waitcnt lgkmcnt(0) ; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s6, -1 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s8, v0, s[0:1] ; GFX1264-NEXT: buffer_store_b32 v0, off, s[4:7], null ; GFX1264-NEXT: s_nop 0 ; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -610,13 +609,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: global_inv scope:SCOPE_DEV ; GFX1232-NEXT: .LBB1_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1232-NEXT: s_waitcnt lgkmcnt(0) -; GFX1232-NEXT: v_mul_lo_u32 v0, s0, v0 -; GFX1232-NEXT: v_readfirstlane_b32 s0, v1 ; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1232-NEXT: s_mov_b32 s6, -1 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v0, s[2:3] ; GFX1232-NEXT: buffer_store_b32 v0, off, s[4:7], null ; GFX1232-NEXT: s_nop 0 ; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1671,12 +1669,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1264-NEXT: s_waitcnt lgkmcnt(0) -; GFX1264-NEXT: v_mul_lo_u32 v3, s1, v2 ; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s6, -1 +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v2, s[2:3] -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-NEXT: v_add_nc_u32_e32 v1, v3, v1 +; GFX1264-NEXT: v_mad_co_u64_u32 v[1:2], null, s1, v2, v[1:2] ; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null ; GFX1264-NEXT: s_nop 0 ; GFX1264-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1712,12 +1709,11 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1232-NEXT: s_waitcnt lgkmcnt(0) -; GFX1232-NEXT: v_mul_lo_u32 v3, s1, v2 ; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1232-NEXT: s_mov_b32 s6, -1 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, s0, v2, s[2:3] -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_add_nc_u32_e32 v1, v3, v1 +; GFX1232-NEXT: v_mad_co_u64_u32 v[1:2], null, s1, v2, v[1:2] ; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null ; GFX1232-NEXT: s_nop 0 ; GFX1232-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3608,16 +3604,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: .LBB10_2: ; GFX1264-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1264-NEXT: s_waitcnt lgkmcnt(0) -; GFX1264-NEXT: v_mul_lo_u32 v5, s1, v2 ; GFX1264-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0 ; GFX1264-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1264-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s6, -1 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1264-NEXT: v_add_nc_u32_e32 v1, v4, v5 +; GFX1264-NEXT: s_wait_alu 0xfff +; GFX1264-NEXT: v_mad_co_u64_u32 v[4:5], null, s1, v2, v[4:5] +; GFX1264-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s0, v3 -; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1264-NEXT: v_mov_b32_e32 v1, v4 ; GFX1264-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc ; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null ; GFX1264-NEXT: s_nop 0 @@ -3652,16 +3648,16 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1232-NEXT: .LBB10_2: ; GFX1232-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1232-NEXT: s_waitcnt lgkmcnt(0) -; GFX1232-NEXT: v_mul_lo_u32 v5, s1, v2 ; GFX1232-NEXT: v_mad_co_u64_u32 v[3:4], null, s0, v2, 0 ; GFX1232-NEXT: v_readfirstlane_b32 s0, v0 -; GFX1232-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1232-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1232-NEXT: s_mov_b32 s6, -1 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX1232-NEXT: v_add_nc_u32_e32 v1, v4, v5 +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX1232-NEXT: v_mad_co_u64_u32 v[4:5], null, s1, v2, v[4:5] +; GFX1232-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1232-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v3 -; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1232-NEXT: v_mov_b32_e32 v1, v4 ; GFX1232-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo ; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[4:7], null ; GFX1232-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index 79f8b3a1d5d84c..280b3c13f410db 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -519,13 +519,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: .LBB1_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX12W64-NEXT: s_waitcnt vmcnt(0) ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3] ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -552,12 +551,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: .LBB1_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX12W32-NEXT: s_waitcnt vmcnt(0) -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 +; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 +; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5] +; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index edf6fbadf1a60a..f2eea9d2218417 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -535,13 +535,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W64-NEXT: .LBB1_2: ; GFX12W64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX12W64-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX12W64-NEXT: v_mul_lo_u32 v0, s6, v0 ; GFX12W64-NEXT: s_waitcnt vmcnt(0) ; GFX12W64-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W64-NEXT: s_waitcnt lgkmcnt(0) +; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12W64-NEXT: v_mad_co_u64_u32 v[0:1], null, s6, v0, s[2:3] ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_add_nc_u32_e32 v0, s2, v0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_nop 0 ; GFX12W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -568,12 +567,12 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX12W32-NEXT: .LBB1_2: ; GFX12W32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX12W32-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX12W32-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX12W32-NEXT: s_waitcnt vmcnt(0) -; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 +; GFX12W32-NEXT: v_readfirstlane_b32 s4, v1 +; GFX12W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12W32-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s2, v0 +; GFX12W32-NEXT: v_mad_co_u64_u32 v[0:1], null, s2, v0, s[4:5] +; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_nop 0 ; GFX12W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll index b9b03e52ec865c..9afe3e0f97551f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -107,18 +107,18 @@ define { i64, i1 } @umulo_i64_v_v(i64 %x, i64 %y) { ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v2, 0 ; GFX12-NEXT: v_mad_co_u64_u32 v[6:7], null, v4, v3, 0 ; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v5, v2, 0 -; GFX12-NEXT: v_mad_co_u64_u32 v[10:11], null, v5, v3, 0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v5, v3, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX12-NEXT: v_mov_b32_e32 v4, v1 ; GFX12-NEXT: v_add3_u32 v1, v1, v6, v8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v4, v6 -; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v7, vcc_lo -; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v8 +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v4, v6 +; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v4, v8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v3, v9, vcc_lo -; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo -; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v2, v10 +; GFX12-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, v5, v9, vcc_lo +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX12-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll index 6fbcb74306d6dd..79a65268556d5d 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -40,9 +40,7 @@ define i64 @mad_i64_i32_sextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX12-LABEL: mad_i64_i32_sextops: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v5, v4, v[2:3] +; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3] ; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i64 %sext1 = sext i32 %arg1 to i64 @@ -84,9 +82,7 @@ define i64 @mad_i64_i32_sextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX12-LABEL: mad_i64_i32_sextops_commute: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v5, v4, v[2:3] +; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3] ; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i64 %sext1 = sext i32 %arg1 to i64 @@ -128,9 +124,7 @@ define i64 @mad_u64_u32_zextops(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX12-LABEL: mad_u64_u32_zextops: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v4, v[2:3] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[2:3] ; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = zext i32 %arg0 to i64 %sext1 = zext i32 %arg1 to i64 @@ -172,9 +166,7 @@ define i64 @mad_u64_u32_zextops_commute(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX12-LABEL: mad_u64_u32_zextops_commute: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v4, v[2:3] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v1, v[2:3] ; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = zext i32 %arg0 to i64 %sext1 = zext i32 %arg1 to i64 @@ -304,24 +296,24 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 { ; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX12-NEXT: v_mad_co_u64_u32 v[6:7], null, v0, v1, 0 ; GFX12-NEXT: v_mov_b32_e32 v8, 0 -; GFX12-NEXT: v_ashrrev_i32_e32 v14, 31, v0 -; GFX12-NEXT: v_ashrrev_i32_e32 v15, 31, v1 +; GFX12-NEXT: v_ashrrev_i32_e32 v12, 31, v0 +; GFX12-NEXT: v_ashrrev_i32_e32 v13, 31, v1 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v14, v1, v[7:8] +; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v12, v1, v[7:8] ; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v8 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v0, v15, v[9:10] +; GFX12-NEXT: v_mad_co_u64_u32 v[7:8], null, v0, v13, v[9:10] ; GFX12-NEXT: v_mov_b32_e32 v10, v8 -; GFX12-NEXT: v_mad_co_i64_i32 v[8:9], null, v1, v14, 0 +; GFX12-NEXT: v_mad_co_i64_i32 v[8:9], null, v1, v12, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_add_co_u32 v10, s0, v11, v10 ; GFX12-NEXT: v_add_co_ci_u32_e64 v11, null, 0, 0, s0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_mad_co_i64_i32 v[12:13], null, v15, v0, v[8:9] -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v14, v15, v[10:11] +; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v13, v0, v[8:9] +; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v12, v13, v[10:11] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_co_u32 v8, vcc_lo, v0, v12 -; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v13, vcc_lo +; GFX12-NEXT: v_add_co_u32 v8, vcc_lo, v8, v0 +; GFX12-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v9, v1, vcc_lo ; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2 ; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) @@ -368,9 +360,7 @@ define i63 @mad_i64_i32_sextops_i32_i63(i32 %arg0, i32 %arg1, i63 %arg2) #0 { ; GFX12-LABEL: mad_i64_i32_sextops_i32_i63: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v5, v4, v[2:3] +; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3] ; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i63 %sext1 = sext i32 %arg1 to i63 @@ -421,10 +411,10 @@ define i63 @mad_i64_i32_sextops_i31_i63(i31 %arg0, i31 %arg1, i63 %arg2) #0 { ; GFX12-LABEL: mad_i64_i32_sextops_i31_i63: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-NEXT: v_bfe_i32 v4, v1, 0, 31 -; GFX12-NEXT: v_bfe_i32 v5, v0, 0, 31 +; GFX12-NEXT: v_bfe_i32 v1, v1, 0, 31 +; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 31 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v5, v4, v[2:3] +; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[2:3] ; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i31 %arg0 to i63 %sext1 = sext i31 %arg1 to i63 @@ -481,13 +471,11 @@ define i64 @mad_i64_i32_extops_i32_i64(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX12-LABEL: mad_i64_i32_extops_i32_i64: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_ashrrev_i32_e32 v0, 31, v4 -; GFX12-NEXT: v_mul_lo_u32 v6, v0, v5 -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v4, v5, v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_nc_u32_e32 v1, v6, v1 +; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v5, v4, v[2:3] +; GFX12-NEXT: v_ashrrev_i32_e32 v2, 31, v5 +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v2, v4, v[1:2] ; GFX12-NEXT: s_setpc_b64 s[30:31] %ext0 = sext i32 %arg0 to i64 %ext1 = zext i32 %arg1 to i64 @@ -529,9 +517,7 @@ define i64 @mad_u64_u32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 { ; GFX12-LABEL: mad_u64_u32_bitops: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v2, v[4:5] +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v2, v[4:5] ; GFX12-NEXT: s_setpc_b64 s[30:31] %trunc.lhs = and i64 %arg0, 4294967295 %trunc.rhs = and i64 %arg1, 4294967295 @@ -587,12 +573,11 @@ define i64 @mad_u64_u32_bitops_lhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX12-LABEL: mad_u64_u32_bitops_lhs_mask_small: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_and_b32 v6, 1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v3, v2, v[4:5] -; GFX12-NEXT: v_mul_lo_u32 v2, v6, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_nc_u32_e32 v1, v2, v1 +; GFX12-NEXT: v_mov_b32_e32 v3, v1 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v0, v2, v[4:5] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v3, v2, v[1:2] ; GFX12-NEXT: s_setpc_b64 s[30:31] %trunc.lhs = and i64 %arg0, 8589934591 %trunc.rhs = and i64 %arg1, 4294967295 @@ -649,12 +634,11 @@ define i64 @mad_u64_u32_bitops_rhs_mask_small(i64 %arg0, i64 %arg1, i64 %arg2) # ; GFX12-LABEL: mad_u64_u32_bitops_rhs_mask_small: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_and_b32 v3, 1, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_mov_b32_e32 v6, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v6, v2, v[4:5] -; GFX12-NEXT: v_mul_lo_u32 v2, v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_add_nc_u32_e32 v1, v2, v1 +; GFX12-NEXT: v_and_b32_e32 v2, 1, v3 +; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v6, v2, v[1:2] ; GFX12-NEXT: s_setpc_b64 s[30:31] %trunc.lhs = and i64 %arg0, 4294967295 %trunc.rhs = and i64 %arg1, 8589934591 @@ -696,9 +680,7 @@ define i64 @mad_i64_i32_bitops(i64 %arg0, i64 %arg1, i64 %arg2) #0 { ; GFX12-LABEL: mad_i64_i32_bitops: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-NEXT: v_mov_b32_e32 v3, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v3, v2, v[4:5] +; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v2, v[4:5] ; GFX12-NEXT: s_setpc_b64 s[30:31] %shl.lhs = shl i64 %arg0, 32 %trunc.lhs = ashr i64 %shl.lhs, 32 @@ -743,9 +725,7 @@ define i64 @mad_i64_i32_unpack_i64ops(i64 %arg0) #0 { ; GFX12-LABEL: mad_i64_i32_unpack_i64ops: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v1, v0, v[0:1] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v1, v0, v[0:1] ; GFX12-NEXT: s_setpc_b64 s[30:31] %tmp4 = lshr i64 %arg0, 32 %tmp5 = and i64 %arg0, 4294967295 @@ -893,11 +873,11 @@ define i64 @mad_i64_i32_twice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3) #0 { ; GFX12-LABEL: mad_i64_i32_twice: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-NEXT: v_mad_co_i64_i32 v[6:7], null, v0, v1, v[2:3] -; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, v0, v1, v[4:5] +; GFX12-NEXT: v_mad_co_i64_i32 v[2:3], null, v0, v1, v[2:3] +; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, v[4:5] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_xor_b32_e32 v0, v6, v2 -; GFX12-NEXT: v_xor_b32_e32 v1, v7, v3 +; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0 +; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i64 %sext1 = sext i32 %arg1 to i64 @@ -976,20 +956,20 @@ define i64 @mad_i64_i32_thrice(i32 %arg0, i32 %arg1, i64 %arg2, i64 %arg3, i64 % ; GFX12-LABEL: mad_i64_i32_thrice: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-NEXT: v_mad_co_i64_i32 v[8:9], null, v0, v1, 0 +; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v8, v2 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v9, v3, vcc_lo -; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v8, v4 -; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v9, v5, vcc_lo -; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v8, v6 -; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v9, v7, vcc_lo +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v0, v4 +; GFX12-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v5, vcc_lo +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, v6 +; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v7, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_xor_b32_e32 v0, v0, v2 -; GFX12-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX12-NEXT: v_xor_b32_e32 v2, v2, v4 +; GFX12-NEXT: v_xor_b32_e32 v3, v3, v5 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_xor_b32_e32 v0, v0, v4 -; GFX12-NEXT: v_xor_b32_e32 v1, v1, v5 +; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0 +; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i64 %sext1 = sext i32 %arg1 to i64 @@ -1048,13 +1028,13 @@ define i64 @mad_i64_i32_secondary_use(i32 %arg0, i32 %arg1, i64 %arg2) #0 { ; GFX12-LABEL: mad_i64_i32_secondary_use: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX12-NEXT: v_mad_co_i64_i32 v[4:5], null, v0, v1, 0 +; GFX12-NEXT: v_mad_co_i64_i32 v[0:1], null, v0, v1, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v4, v2 -; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v3, vcc_lo +; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, v0, v2 +; GFX12-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v1, v3, vcc_lo ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_xor_b32_e32 v0, v0, v4 -; GFX12-NEXT: v_xor_b32_e32 v1, v1, v5 +; GFX12-NEXT: v_xor_b32_e32 v0, v2, v0 +; GFX12-NEXT: v_xor_b32_e32 v1, v3, v1 ; GFX12-NEXT: s_setpc_b64 s[30:31] %sext0 = sext i32 %arg0 to i64 %sext1 = sext i32 %arg1 to i64 diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index e2617fc453b58f..25c95528988592 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -3208,37 +3208,38 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a ; GFX12-LABEL: v_mul_i128: ; GFX12: ; %bb.0: ; %entry ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c -; GFX12-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v15, 4, v0 +; GFX12-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_lshlrev_b32 v13, 4, v0 ; GFX12-NEXT: s_waitcnt lgkmcnt(0) ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_load_b128 v[0:3], v15, s[0:1] -; GFX12-NEXT: global_load_b128 v[4:7], v15, s[2:3] +; GFX12-NEXT: global_load_b128 v[0:3], v13, s[0:1] +; GFX12-NEXT: global_load_b128 v[4:7], v13, s[2:3] ; GFX12-NEXT: s_waitcnt vmcnt(0) ; GFX12-NEXT: v_mad_co_u64_u32 v[8:9], null, v0, v4, 0 -; GFX12-NEXT: v_mul_lo_u32 v14, v5, v2 -; GFX12-NEXT: v_mul_lo_u32 v3, v4, v3 +; GFX12-NEXT: v_mul_lo_u32 v15, v5, v2 +; GFX12-NEXT: v_mul_lo_u32 v7, v7, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v1, v4, v[9:10] -; GFX12-NEXT: v_dual_mov_b32 v13, v12 :: v_dual_mov_b32 v12, v10 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_mov_b32_e32 v14, v12 +; GFX12-NEXT: v_mov_b32_e32 v12, v10 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GFX12-NEXT: v_mad_co_u64_u32 v[9:10], null, v0, v5, v[11:12] -; GFX12-NEXT: v_mad_co_u64_u32 v[11:12], null, v4, v2, 0 -; GFX12-NEXT: v_mul_lo_u32 v4, v6, v1 -; GFX12-NEXT: v_mov_b32_e32 v2, v10 -; GFX12-NEXT: v_mul_lo_u32 v10, v7, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_add3_u32 v12, v12, v3, v14 -; GFX12-NEXT: v_add_co_u32 v2, s0, v13, v2 +; GFX12-NEXT: v_mul_lo_u32 v11, v4, v3 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v4, v2, 0 +; GFX12-NEXT: v_mul_lo_u32 v12, v6, v1 +; GFX12-NEXT: v_mov_b32_e32 v4, v10 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_add3_u32 v3, v3, v11, v15 +; GFX12-NEXT: v_add_co_u32 v10, s0, v14, v4 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0 -; GFX12-NEXT: v_mad_co_u64_u32 v[13:14], null, v6, v0, v[11:12] +; GFX12-NEXT: v_add_co_ci_u32_e64 v11, null, 0, 0, s0 +; GFX12-NEXT: v_mad_co_u64_u32 v[2:3], null, v6, v0, v[2:3] ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_mad_co_u64_u32 v[6:7], null, v1, v5, v[2:3] -; GFX12-NEXT: v_add3_u32 v0, v10, v14, v4 +; GFX12-NEXT: v_mad_co_u64_u32 v[0:1], null, v1, v5, v[10:11] +; GFX12-NEXT: v_add3_u32 v3, v7, v3, v12 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_co_u32 v10, vcc_lo, v6, v13 -; GFX12-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v7, v0, vcc_lo -; GFX12-NEXT: global_store_b128 v15, v[8:11], s[2:3] +; GFX12-NEXT: v_add_co_u32 v10, vcc_lo, v0, v2 +; GFX12-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, v1, v3, vcc_lo +; GFX12-NEXT: global_store_b128 v13, v[8:11], s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm >From f89207303beb1e48ca6dfac78c371ec90de84b22 Mon Sep 17 00:00:00 2001 From: Jay Foad <jay.f...@amd.com> Date: Wed, 17 Jan 2024 11:04:49 +0000 Subject: [PATCH 3/3] Regenerate checks after merge --- llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 32c84b75bb3cb9..3e8fc8eef8e01b 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -3608,7 +3608,7 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace( ; GFX1264-NEXT: v_readfirstlane_b32 s0, v0 ; GFX1264-NEXT: s_mov_b32 s7, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s6, -1 -; GFX1264-NEXT: s_wait_alu 0xfff +; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX1264-NEXT: v_mad_co_u64_u32 v[4:5], null, s1, v2, v[4:5] ; GFX1264-NEXT: v_readfirstlane_b32 s1, v1 ; GFX1264-NEXT: v_sub_co_u32 v0, vcc, s0, v3 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits