https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/146076
>From 2d8d232729769a3ca274789dee2fe542d0045ef2 Mon Sep 17 00:00:00 2001 From: Fabian Ritter <fabian.rit...@amd.com> Date: Fri, 27 Jun 2025 05:38:52 -0400 Subject: [PATCH] [AMDGPU][SDAG] Enable ISD::PTRADD for 64-bit AS by default Also removes the command line option to control this feature. There seem to be mainly two kinds of test changes: - Some operands of addition instructions are swapped; that is to be expected since PTRADD is not commutative. - Improvements in code generation, probably because the legacy lowering enabled some transformations that were sometimes harmful. For SWDEV-516125. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 +- .../identical-subrange-spill-infloop.ll | 354 +++++++++++------- .../AMDGPU/infer-addrspace-flat-atomic.ll | 14 +- llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll | 8 +- .../AMDGPU/lower-module-lds-via-hybrid.ll | 4 +- .../AMDGPU/lower-module-lds-via-table.ll | 16 +- .../match-perm-extract-vector-elt-bug.ll | 22 +- llvm/test/CodeGen/AMDGPU/memmove-var-size.ll | 16 +- .../AMDGPU/preload-implicit-kernargs.ll | 6 +- .../AMDGPU/promote-constOffset-to-imm.ll | 8 +- llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll | 7 +- .../AMDGPU/ptradd-sdag-optimizations.ll | 94 ++--- .../AMDGPU/ptradd-sdag-undef-poison.ll | 6 +- llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll | 27 +- llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll | 29 +- 15 files changed, 311 insertions(+), 310 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 822bab88c8a09..79981007c13af 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -63,14 +63,6 @@ static cl::opt<bool> UseDivergentRegisterIndexing( cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false)); -// TODO: This option should be removed once we switch to always using PTRADD in -// the SelectionDAG. -static cl::opt<bool> UseSelectionDAGPTRADD( - "amdgpu-use-sdag-ptradd", cl::Hidden, - cl::desc("Generate ISD::PTRADD nodes for 64-bit pointer arithmetic in the " - "SelectionDAG ISel"), - cl::init(false)); - static bool denormalModeIsFlushAllF32(const MachineFunction &MF) { const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign(); @@ -10599,7 +10591,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, bool SITargetLowering::shouldPreservePtrArith(const Function &F, EVT PtrVT) const { - return UseSelectionDAGPTRADD && PtrVT == MVT::i64; + return PtrVT == MVT::i64; } bool SITargetLowering::canTransformPtrArithOutOfBounds(const Function &F, diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 56ceba258f471..f9fcf489bd389 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -6,97 +6,151 @@ define void @main(i1 %arg) #0 { ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: v_writelane_b32 v5, s30, 0 -; CHECK-NEXT: v_writelane_b32 v5, s31, 1 -; CHECK-NEXT: v_writelane_b32 v5, s36, 2 -; CHECK-NEXT: v_writelane_b32 v5, s37, 3 -; CHECK-NEXT: v_writelane_b32 v5, s38, 4 -; CHECK-NEXT: v_writelane_b32 v5, s39, 5 -; CHECK-NEXT: v_writelane_b32 v5, s48, 6 -; CHECK-NEXT: v_writelane_b32 v5, s49, 7 -; CHECK-NEXT: v_writelane_b32 v5, s50, 8 -; CHECK-NEXT: v_writelane_b32 v5, s51, 9 -; CHECK-NEXT: v_writelane_b32 v5, s52, 10 -; CHECK-NEXT: v_writelane_b32 v5, s53, 11 -; CHECK-NEXT: v_writelane_b32 v5, s54, 12 -; CHECK-NEXT: v_writelane_b32 v5, s55, 13 -; CHECK-NEXT: s_getpc_b64 s[24:25] -; CHECK-NEXT: v_writelane_b32 v5, s64, 14 -; CHECK-NEXT: s_movk_i32 s4, 0xf0 -; CHECK-NEXT: s_mov_b32 s5, s24 -; CHECK-NEXT: v_writelane_b32 v5, s65, 15 -; CHECK-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: v_writelane_b32 v5, s66, 16 -; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v5, s67, 17 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s6, 0x130 -; CHECK-NEXT: s_mov_b32 s7, s24 -; CHECK-NEXT: v_writelane_b32 v5, s68, 18 -; CHECK-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0 -; CHECK-NEXT: v_writelane_b32 v5, s69, 19 -; CHECK-NEXT: v_writelane_b32 v5, s70, 20 +; CHECK-NEXT: v_writelane_b32 v6, s30, 0 +; CHECK-NEXT: v_writelane_b32 v6, s31, 1 +; CHECK-NEXT: v_writelane_b32 v6, s36, 2 +; CHECK-NEXT: v_writelane_b32 v6, s37, 3 +; CHECK-NEXT: v_writelane_b32 v6, s38, 4 +; CHECK-NEXT: v_writelane_b32 v6, s39, 5 +; CHECK-NEXT: v_writelane_b32 v6, s48, 6 +; CHECK-NEXT: v_writelane_b32 v6, s49, 7 +; CHECK-NEXT: v_writelane_b32 v6, s50, 8 +; CHECK-NEXT: v_writelane_b32 v6, s51, 9 +; CHECK-NEXT: v_writelane_b32 v6, s52, 10 +; CHECK-NEXT: v_writelane_b32 v6, s53, 11 +; CHECK-NEXT: v_writelane_b32 v6, s54, 12 +; CHECK-NEXT: v_writelane_b32 v6, s55, 13 +; CHECK-NEXT: v_writelane_b32 v6, s64, 14 +; CHECK-NEXT: v_writelane_b32 v6, s65, 15 +; CHECK-NEXT: v_writelane_b32 v6, s66, 16 +; CHECK-NEXT: v_writelane_b32 v6, s67, 17 +; CHECK-NEXT: v_writelane_b32 v6, s68, 18 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_mov_b64 s[8:9], 0 +; CHECK-NEXT: v_writelane_b32 v6, s69, 19 ; CHECK-NEXT: s_mov_b32 s68, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: v_writelane_b32 v5, s71, 21 +; CHECK-NEXT: s_mov_b32 s69, s4 +; CHECK-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0 +; CHECK-NEXT: s_load_dwordx8 s[24:31], s[68:69], 0x30 +; CHECK-NEXT: s_load_dwordx16 s[36:51], s[68:69], 0xf0 +; CHECK-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane +; CHECK-NEXT: ; kill: killed $sgpr8_sgpr9 +; CHECK-NEXT: v_writelane_b32 v6, s70, 20 +; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x130 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v2, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_writelane_b32 v7, s36, 0 +; CHECK-NEXT: v_writelane_b32 v7, s37, 1 +; CHECK-NEXT: v_writelane_b32 v7, s38, 2 +; CHECK-NEXT: v_writelane_b32 v7, s39, 3 +; CHECK-NEXT: v_writelane_b32 v7, s40, 4 +; CHECK-NEXT: v_writelane_b32 v7, s41, 5 +; CHECK-NEXT: v_writelane_b32 v7, s42, 6 +; CHECK-NEXT: v_writelane_b32 v7, s43, 7 +; CHECK-NEXT: v_writelane_b32 v7, s44, 8 +; CHECK-NEXT: v_writelane_b32 v7, s45, 9 +; CHECK-NEXT: v_writelane_b32 v7, s46, 10 +; CHECK-NEXT: v_writelane_b32 v7, s47, 11 +; CHECK-NEXT: v_writelane_b32 v7, s48, 12 +; CHECK-NEXT: v_writelane_b32 v7, s49, 13 +; CHECK-NEXT: v_writelane_b32 v7, s50, 14 +; CHECK-NEXT: v_writelane_b32 v7, s51, 15 +; CHECK-NEXT: v_writelane_b32 v7, s8, 16 +; CHECK-NEXT: v_writelane_b32 v7, s9, 17 +; CHECK-NEXT: v_writelane_b32 v7, s10, 18 +; CHECK-NEXT: v_writelane_b32 v7, s11, 19 +; CHECK-NEXT: v_writelane_b32 v7, s12, 20 +; CHECK-NEXT: v_writelane_b32 v7, s13, 21 +; CHECK-NEXT: v_writelane_b32 v7, s14, 22 +; CHECK-NEXT: v_writelane_b32 v7, s15, 23 +; CHECK-NEXT: v_writelane_b32 v7, s16, 24 +; CHECK-NEXT: v_writelane_b32 v7, s17, 25 +; CHECK-NEXT: v_writelane_b32 v7, s18, 26 +; CHECK-NEXT: v_writelane_b32 v7, s19, 27 +; CHECK-NEXT: v_writelane_b32 v7, s20, 28 +; CHECK-NEXT: v_writelane_b32 v7, s21, 29 +; CHECK-NEXT: v_writelane_b32 v7, s22, 30 +; CHECK-NEXT: v_writelane_b32 v7, s23, 31 +; CHECK-NEXT: v_readlane_b32 s4, v7, 0 +; CHECK-NEXT: v_writelane_b32 v6, s71, 21 +; CHECK-NEXT: s_load_dwordx16 s[52:67], s[68:69], 0x1f0 ; CHECK-NEXT: v_mov_b32_e32 v3, v1 +; CHECK-NEXT: s_load_dwordx16 s[36:51], s[68:69], 0x2f0 ; CHECK-NEXT: s_mov_b32 s69, s68 ; CHECK-NEXT: s_mov_b32 s70, s68 ; CHECK-NEXT: s_mov_b32 s71, s68 -; CHECK-NEXT: image_sample_lz v3, v[2:3], s[16:23], s[68:71] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s5, v7, 1 +; CHECK-NEXT: v_readlane_b32 s6, v7, 2 +; CHECK-NEXT: v_readlane_b32 s7, v7, 3 +; CHECK-NEXT: v_readlane_b32 s8, v7, 4 +; CHECK-NEXT: v_readlane_b32 s9, v7, 5 +; CHECK-NEXT: v_readlane_b32 s10, v7, 6 +; CHECK-NEXT: v_readlane_b32 s11, v7, 7 +; CHECK-NEXT: v_readlane_b32 s12, v7, 8 +; CHECK-NEXT: v_readlane_b32 s13, v7, 9 +; CHECK-NEXT: v_readlane_b32 s14, v7, 10 +; CHECK-NEXT: v_readlane_b32 s15, v7, 11 +; CHECK-NEXT: v_readlane_b32 s16, v7, 12 +; CHECK-NEXT: v_readlane_b32 s17, v7, 13 +; CHECK-NEXT: v_readlane_b32 s18, v7, 14 +; CHECK-NEXT: v_readlane_b32 s19, v7, 15 +; CHECK-NEXT: v_and_b32_e32 v5, 1, v0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[20:21], 1, v5 +; CHECK-NEXT: s_nop 2 +; CHECK-NEXT: image_sample_lz v3, v[2:3], s[12:19], s[68:71] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s4, v7, 16 ; CHECK-NEXT: v_mov_b32_e32 v2, v1 -; CHECK-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane -; CHECK-NEXT: s_mov_b32 s6, 48 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v6, s36, 0 -; CHECK-NEXT: v_writelane_b32 v6, s37, 1 -; CHECK-NEXT: v_writelane_b32 v6, s38, 2 -; CHECK-NEXT: v_writelane_b32 v6, s39, 3 -; CHECK-NEXT: v_writelane_b32 v6, s40, 4 -; CHECK-NEXT: v_writelane_b32 v6, s41, 5 -; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[68:71] dmask:0x1 -; CHECK-NEXT: v_writelane_b32 v6, s42, 6 -; CHECK-NEXT: v_writelane_b32 v6, s43, 7 -; CHECK-NEXT: v_writelane_b32 v6, s44, 8 -; CHECK-NEXT: v_writelane_b32 v6, s45, 9 -; CHECK-NEXT: v_writelane_b32 v6, s46, 10 -; CHECK-NEXT: v_writelane_b32 v6, s47, 11 -; CHECK-NEXT: v_writelane_b32 v6, s48, 12 -; CHECK-NEXT: v_writelane_b32 v6, s49, 13 -; CHECK-NEXT: v_writelane_b32 v6, s50, 14 -; CHECK-NEXT: s_movk_i32 s56, 0x1f0 -; CHECK-NEXT: s_movk_i32 s72, 0x2f0 -; CHECK-NEXT: s_mov_b32 s57, s24 -; CHECK-NEXT: s_mov_b32 s73, s24 -; CHECK-NEXT: v_writelane_b32 v6, s51, 15 -; CHECK-NEXT: s_load_dwordx8 s[24:31], s[6:7], 0x0 -; CHECK-NEXT: s_load_dwordx16 s[36:51], s[56:57], 0x0 -; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: s_load_dwordx16 s[52:67], s[72:73], 0x0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 +; CHECK-NEXT: v_readlane_b32 s5, v7, 17 +; CHECK-NEXT: v_readlane_b32 s6, v7, 18 +; CHECK-NEXT: v_readlane_b32 s7, v7, 19 +; CHECK-NEXT: v_readlane_b32 s8, v7, 20 +; CHECK-NEXT: v_readlane_b32 s9, v7, 21 +; CHECK-NEXT: v_readlane_b32 s10, v7, 22 +; CHECK-NEXT: v_readlane_b32 s11, v7, 23 +; CHECK-NEXT: v_readlane_b32 s12, v7, 24 +; CHECK-NEXT: v_readlane_b32 s13, v7, 25 +; CHECK-NEXT: v_readlane_b32 s14, v7, 26 +; CHECK-NEXT: v_readlane_b32 s15, v7, 27 +; CHECK-NEXT: v_readlane_b32 s16, v7, 28 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[4:11], s[68:71] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s17, v7, 29 +; CHECK-NEXT: v_readlane_b32 s18, v7, 30 +; CHECK-NEXT: v_readlane_b32 s19, v7, 31 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mul_f32_e32 v0, v4, v3 -; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] -; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[20:21] +; CHECK-NEXT: s_xor_b64 s[22:23], exec, s[6:7] ; CHECK-NEXT: s_cbranch_execz .LBB0_3 ; CHECK-NEXT: ; %bb.1: ; %bb48 -; CHECK-NEXT: image_sample_lz v3, v[1:2], s[16:23], s[68:71] dmask:0x1 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_readlane_b32 s4, v7, 0 +; CHECK-NEXT: v_readlane_b32 s12, v7, 8 +; CHECK-NEXT: v_readlane_b32 s13, v7, 9 +; CHECK-NEXT: v_readlane_b32 s14, v7, 10 +; CHECK-NEXT: v_readlane_b32 s15, v7, 11 +; CHECK-NEXT: v_readlane_b32 s16, v7, 12 +; CHECK-NEXT: v_readlane_b32 s17, v7, 13 +; CHECK-NEXT: v_readlane_b32 s18, v7, 14 +; CHECK-NEXT: v_readlane_b32 s19, v7, 15 ; CHECK-NEXT: s_and_b64 vcc, exec, -1 +; CHECK-NEXT: v_readlane_b32 s5, v7, 1 +; CHECK-NEXT: v_readlane_b32 s6, v7, 2 +; CHECK-NEXT: v_readlane_b32 s7, v7, 3 +; CHECK-NEXT: v_readlane_b32 s8, v7, 4 +; CHECK-NEXT: image_sample_lz v3, v[1:2], s[12:19], s[68:71] dmask:0x1 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_readlane_b32 s9, v7, 5 +; CHECK-NEXT: v_readlane_b32 s10, v7, 6 +; CHECK-NEXT: v_readlane_b32 s11, v7, 7 ; CHECK-NEXT: .LBB0_2: ; %bb50 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_mov_b32 s69, s68 -; CHECK-NEXT: s_mov_b32 s70, s68 -; CHECK-NEXT: s_mov_b32 s71, s68 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: image_sample_lz v4, v[1:2], s[44:51], s[28:31] dmask:0x1 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[60:67], s[28:31] dmask:0x1 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1 +; CHECK-NEXT: image_sample_lz v1, v[1:2], s[44:51], s[68:71] dmask:0x1 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_sub_f32_e32 v1, v1, v4 ; CHECK-NEXT: v_mul_f32_e32 v1, v1, v0 @@ -104,50 +158,66 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 ; CHECK-NEXT: .LBB0_3: ; %Flow14 -; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[22:23] ; CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.4: ; %bb32 -; CHECK-NEXT: s_and_saveexec_b64 s[16:17], s[4:5] +; CHECK-NEXT: s_and_saveexec_b64 s[16:17], s[20:21] ; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[16:17] ; CHECK-NEXT: s_cbranch_execz .LBB0_6 ; CHECK-NEXT: ; %bb.5: ; %bb43 ; CHECK-NEXT: s_mov_b32 s16, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_mov_b64 s[8:9], s[52:53] ; CHECK-NEXT: s_mov_b32 s17, s16 -; CHECK-NEXT: v_mov_b32_e32 v2, s16 -; CHECK-NEXT: v_mov_b32_e32 v3, s17 +; CHECK-NEXT: v_mov_b32_e32 v0, s16 +; CHECK-NEXT: s_mov_b64 s[10:11], s[54:55] +; CHECK-NEXT: s_mov_b64 s[12:13], s[56:57] +; CHECK-NEXT: s_mov_b64 s[14:15], s[58:59] +; CHECK-NEXT: v_readlane_b32 s44, v7, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, s17 ; CHECK-NEXT: s_mov_b32 s18, s16 ; CHECK-NEXT: s_mov_b32 s19, s16 -; CHECK-NEXT: image_sample_lz v1, v[2:3], s[8:15], s[16:19] dmask:0x1 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37] -; CHECK-NEXT: s_mov_b64 s[10:11], s[38:39] -; CHECK-NEXT: s_mov_b64 s[12:13], s[40:41] -; CHECK-NEXT: s_mov_b64 s[14:15], s[42:43] -; CHECK-NEXT: v_readlane_b32 s36, v6, 0 -; CHECK-NEXT: v_readlane_b32 s44, v6, 8 -; CHECK-NEXT: v_readlane_b32 s45, v6, 9 -; CHECK-NEXT: v_readlane_b32 s46, v6, 10 -; CHECK-NEXT: v_readlane_b32 s47, v6, 11 -; CHECK-NEXT: v_readlane_b32 s48, v6, 12 -; CHECK-NEXT: v_readlane_b32 s49, v6, 13 -; CHECK-NEXT: v_readlane_b32 s50, v6, 14 -; CHECK-NEXT: v_readlane_b32 s51, v6, 15 -; CHECK-NEXT: v_readlane_b32 s37, v6, 1 -; CHECK-NEXT: v_readlane_b32 s38, v6, 2 -; CHECK-NEXT: v_readlane_b32 s39, v6, 3 -; CHECK-NEXT: v_readlane_b32 s40, v6, 4 -; CHECK-NEXT: v_readlane_b32 s41, v6, 5 -; CHECK-NEXT: image_sample_lz v0, v[2:3], s[44:51], s[24:27] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s42, v6, 6 -; CHECK-NEXT: v_readlane_b32 s43, v6, 7 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: s_mov_b64 s[42:43], s[14:15] -; CHECK-NEXT: v_mov_b32_e32 v3, v2 -; CHECK-NEXT: s_mov_b64 s[40:41], s[12:13] -; CHECK-NEXT: s_mov_b64 s[38:39], s[10:11] -; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: v_readlane_b32 s45, v7, 1 +; CHECK-NEXT: v_readlane_b32 s46, v7, 2 +; CHECK-NEXT: v_readlane_b32 s47, v7, 3 +; CHECK-NEXT: v_readlane_b32 s48, v7, 4 +; CHECK-NEXT: v_readlane_b32 s49, v7, 5 +; CHECK-NEXT: v_readlane_b32 s50, v7, 6 +; CHECK-NEXT: v_readlane_b32 s51, v7, 7 +; CHECK-NEXT: v_readlane_b32 s52, v7, 8 +; CHECK-NEXT: v_readlane_b32 s53, v7, 9 +; CHECK-NEXT: v_readlane_b32 s54, v7, 10 +; CHECK-NEXT: v_readlane_b32 s55, v7, 11 +; CHECK-NEXT: v_readlane_b32 s56, v7, 12 +; CHECK-NEXT: v_readlane_b32 s57, v7, 13 +; CHECK-NEXT: v_readlane_b32 s58, v7, 14 +; CHECK-NEXT: v_readlane_b32 s59, v7, 15 +; CHECK-NEXT: image_sample_lz v2, v[0:1], s[44:51], s[16:19] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s44, v7, 16 +; CHECK-NEXT: v_readlane_b32 s52, v7, 24 +; CHECK-NEXT: v_readlane_b32 s53, v7, 25 +; CHECK-NEXT: v_readlane_b32 s54, v7, 26 +; CHECK-NEXT: v_readlane_b32 s55, v7, 27 +; CHECK-NEXT: v_readlane_b32 s56, v7, 28 +; CHECK-NEXT: v_readlane_b32 s57, v7, 29 +; CHECK-NEXT: v_readlane_b32 s58, v7, 30 +; CHECK-NEXT: v_readlane_b32 s59, v7, 31 +; CHECK-NEXT: v_mov_b32_e32 v3, 0 +; CHECK-NEXT: v_mov_b32_e32 v4, v3 +; CHECK-NEXT: v_readlane_b32 s45, v7, 17 +; CHECK-NEXT: v_readlane_b32 s46, v7, 18 +; CHECK-NEXT: v_readlane_b32 s47, v7, 19 +; CHECK-NEXT: image_sample_lz v0, v[0:1], s[52:59], s[24:27] dmask:0x1 +; CHECK-NEXT: s_mov_b64 s[58:59], s[14:15] +; CHECK-NEXT: v_readlane_b32 s48, v7, 20 +; CHECK-NEXT: v_readlane_b32 s49, v7, 21 +; CHECK-NEXT: v_readlane_b32 s50, v7, 22 +; CHECK-NEXT: v_readlane_b32 s51, v7, 23 +; CHECK-NEXT: s_mov_b64 s[56:57], s[12:13] +; CHECK-NEXT: s_mov_b64 s[54:55], s[10:11] +; CHECK-NEXT: s_mov_b64 s[52:53], s[8:9] ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dwordx3 v[1:3], off, s[16:19], 0 +; CHECK-NEXT: buffer_store_dwordx3 v[2:4], off, s[16:19], 0 ; CHECK-NEXT: s_waitcnt vmcnt(1) ; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 ; CHECK-NEXT: ; implicit-def: $vgpr0 @@ -155,17 +225,17 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB0_9 ; CHECK-NEXT: ; %bb.7: ; %bb33.preheader -; CHECK-NEXT: s_mov_b32 s8, 0 -; CHECK-NEXT: s_mov_b32 s12, s8 -; CHECK-NEXT: s_mov_b32 s13, s8 -; CHECK-NEXT: v_mov_b32_e32 v1, s12 -; CHECK-NEXT: s_mov_b32 s9, s8 -; CHECK-NEXT: s_mov_b32 s10, s8 -; CHECK-NEXT: s_mov_b32 s11, s8 -; CHECK-NEXT: v_mov_b32_e32 v2, s13 +; CHECK-NEXT: s_mov_b32 s16, 0 +; CHECK-NEXT: s_mov_b32 s20, s16 +; CHECK-NEXT: s_mov_b32 s21, s16 +; CHECK-NEXT: v_mov_b32_e32 v1, s20 +; CHECK-NEXT: s_mov_b32 s17, s16 +; CHECK-NEXT: s_mov_b32 s18, s16 +; CHECK-NEXT: s_mov_b32 s19, s16 +; CHECK-NEXT: v_mov_b32_e32 v2, s21 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: image_sample_lz v3, v[1:2], s[36:43], s[8:11] dmask:0x1 -; CHECK-NEXT: image_sample_lz v4, v[1:2], s[52:59], s[8:11] dmask:0x1 +; CHECK-NEXT: image_sample_lz v3, v[1:2], s[52:59], s[16:19] dmask:0x1 +; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[16:19] dmask:0x1 ; CHECK-NEXT: s_and_b64 vcc, exec, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_sub_f32_e32 v1, v4, v3 @@ -181,32 +251,32 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock ; CHECK-NEXT: s_or_b64 exec, exec, s[6:7] -; CHECK-NEXT: v_readlane_b32 s71, v5, 21 -; CHECK-NEXT: v_readlane_b32 s70, v5, 20 -; CHECK-NEXT: v_readlane_b32 s69, v5, 19 -; CHECK-NEXT: v_readlane_b32 s68, v5, 18 +; CHECK-NEXT: v_readlane_b32 s71, v6, 21 +; CHECK-NEXT: v_readlane_b32 s70, v6, 20 +; CHECK-NEXT: v_readlane_b32 s69, v6, 19 +; CHECK-NEXT: v_readlane_b32 s68, v6, 18 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_readlane_b32 s67, v5, 17 -; CHECK-NEXT: v_readlane_b32 s66, v5, 16 -; CHECK-NEXT: v_readlane_b32 s65, v5, 15 -; CHECK-NEXT: v_readlane_b32 s64, v5, 14 -; CHECK-NEXT: v_readlane_b32 s55, v5, 13 -; CHECK-NEXT: v_readlane_b32 s54, v5, 12 -; CHECK-NEXT: v_readlane_b32 s53, v5, 11 -; CHECK-NEXT: v_readlane_b32 s52, v5, 10 -; CHECK-NEXT: v_readlane_b32 s51, v5, 9 -; CHECK-NEXT: v_readlane_b32 s50, v5, 8 -; CHECK-NEXT: v_readlane_b32 s49, v5, 7 -; CHECK-NEXT: v_readlane_b32 s48, v5, 6 -; CHECK-NEXT: v_readlane_b32 s39, v5, 5 -; CHECK-NEXT: v_readlane_b32 s38, v5, 4 -; CHECK-NEXT: v_readlane_b32 s37, v5, 3 -; CHECK-NEXT: v_readlane_b32 s36, v5, 2 -; CHECK-NEXT: v_readlane_b32 s31, v5, 1 -; CHECK-NEXT: v_readlane_b32 s30, v5, 0 +; CHECK-NEXT: v_readlane_b32 s67, v6, 17 +; CHECK-NEXT: v_readlane_b32 s66, v6, 16 +; CHECK-NEXT: v_readlane_b32 s65, v6, 15 +; CHECK-NEXT: v_readlane_b32 s64, v6, 14 +; CHECK-NEXT: v_readlane_b32 s55, v6, 13 +; CHECK-NEXT: v_readlane_b32 s54, v6, 12 +; CHECK-NEXT: v_readlane_b32 s53, v6, 11 +; CHECK-NEXT: v_readlane_b32 s52, v6, 10 +; CHECK-NEXT: v_readlane_b32 s51, v6, 9 +; CHECK-NEXT: v_readlane_b32 s50, v6, 8 +; CHECK-NEXT: v_readlane_b32 s49, v6, 7 +; CHECK-NEXT: v_readlane_b32 s48, v6, 6 +; CHECK-NEXT: v_readlane_b32 s39, v6, 5 +; CHECK-NEXT: v_readlane_b32 s38, v6, 4 +; CHECK-NEXT: v_readlane_b32 s37, v6, 3 +; CHECK-NEXT: v_readlane_b32 s36, v6, 2 +; CHECK-NEXT: v_readlane_b32 s31, v6, 1 +; CHECK-NEXT: v_readlane_b32 s30, v6, 0 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll index 258aa9e299c3d..ed2755ed1e38b 100644 --- a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll @@ -11,8 +11,8 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) { ; CHECK-NEXT: v_mov_b32_e32 v0, s2 ; CHECK-NEXT: v_mov_b32_e32 v1, s3 ; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 -; CHECK-NEXT: s_add_u32 s0, s2, s0 -; CHECK-NEXT: s_addc_u32 s1, s3, s1 +; CHECK-NEXT: s_add_u32 s0, s0, s2 +; CHECK-NEXT: s_addc_u32 s1, s1, s3 ; CHECK-NEXT: v_mov_b32_e32 v3, s1 ; CHECK-NEXT: v_add_co_u32_e64 v2, vcc, -8, s0 ; CHECK-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc @@ -69,13 +69,13 @@ define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, dou ; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3 ; CHECK-NEXT: s_add_u32 s0, s0, s2 ; CHECK-NEXT: s_addc_u32 s1, s1, s3 +; CHECK-NEXT: s_add_u32 s0, s0, -8 +; CHECK-NEXT: s_addc_u32 s1, s1, -1 ; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -7, s0 -; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc -; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1] +; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] offset:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: buffer_wbinvl1_vol ; CHECK-NEXT: s_endpgm @@ -113,7 +113,7 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl ; CHECK-NEXT: s_addc_u32 s1, s1, s5 ; CHECK-NEXT: s_add_u32 s4, s0, -8 ; CHECK-NEXT: s_addc_u32 s5, s1, -1 -; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 9 +; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 1 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll index 04abb75c3f912..42ee46bd2c110 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll @@ -46,8 +46,8 @@ define void @use_extern_normal() #0 { ; CHECK-NEXT: s_ashr_i32 s5, s15, 31 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x4048f5c3 ; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; CHECK-NEXT: s_add_u32 s4, s4, s6 -; CHECK-NEXT: s_addc_u32 s5, s5, s7 +; CHECK-NEXT: s_add_u32 s4, s6, s4 +; CHECK-NEXT: s_addc_u32 s5, s7, s5 ; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v1, s4 @@ -70,8 +70,8 @@ define void @use_extern_overalign() #0 { ; CHECK-NEXT: s_ashr_i32 s5, s15, 31 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x42280000 ; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; CHECK-NEXT: s_add_u32 s4, s4, s6 -; CHECK-NEXT: s_addc_u32 s5, s5, s7 +; CHECK-NEXT: s_add_u32 s4, s6, s4 +; CHECK-NEXT: s_addc_u32 s5, s7, s5 ; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll index 2a7553ae5d92b..0b5ba81b3c24f 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll @@ -84,8 +84,8 @@ define void @f2() { ; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s7 +; GCN-NEXT: s_add_u32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s5, s7, s5 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s4 diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll index dca9b71a757af..882e05cf9efda 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll @@ -49,8 +49,8 @@ define void @f0() { ; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s7 +; GCN-NEXT: s_add_u32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s5, s7, s5 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 @@ -90,8 +90,8 @@ define void @f1() { ; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+8 ; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+16 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s7 +; GCN-NEXT: s_add_u32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s5, s7, s5 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 @@ -131,8 +131,8 @@ define void @f2() { ; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+12 ; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+20 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s7 +; GCN-NEXT: s_add_u32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s5, s7, s5 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s4 @@ -172,8 +172,8 @@ define void @f3() { ; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+16 ; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+24 ; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4 -; GCN-NEXT: s_add_u32 s4, s4, s6 -; GCN-NEXT: s_addc_u32 s5, s5, s7 +; GCN-NEXT: s_add_u32 s4, s6, s4 +; GCN-NEXT: s_addc_u32 s5, s7, s5 ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll index 4896e504cfdf4..229b3ece6e5ea 100644 --- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll @@ -13,9 +13,9 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX9-NEXT: s_and_b32 s4, s4, 0xffff ; GFX9-NEXT: s_mul_i32 s14, s14, s4 ; GFX9-NEXT: s_add_i32 s5, s5, s14 -; GFX9-NEXT: v_add_u32_e32 v0, s5, v0 -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX9-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_ashrrev_i64 v[4:5], 28, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc @@ -37,12 +37,12 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX10-NEXT: s_load_dword s4, s[8:9], 0x1c ; GFX10-NEXT: s_load_dword s5, s[8:9], 0x38 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s4, s4, 0xffff ; GFX10-NEXT: s_mul_i32 s14, s14, s4 -; GFX10-NEXT: v_add3_u32 v0, s5, s14, v0 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX10-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] +; GFX10-NEXT: v_add3_u32 v2, s5, s14, v0 +; GFX10-NEXT: v_ashrrev_i64 v[4:5], 28, v[1:2] ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, s1, v5, vcc_lo ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, s2, v4 @@ -62,21 +62,19 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) { ; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x1c ; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x38 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_and_b32 s4, s6, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mul_i32 s13, s13, s4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_add3_u32 v0, s7, s13, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX11-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1] +; GFX11-NEXT: v_add3_u32 v1, s7, s13, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_ashrrev_i64 v[4:5], 28, v[0:1] ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, v5, vcc_lo ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, s2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, s3, v5, vcc_lo ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll index 272daa9dd0b59..7187ece89ae04 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll @@ -388,8 +388,8 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 ; CHECK-NEXT: s_cbranch_execz .LBB2_13 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v0 -; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v1, s4 +; CHECK-NEXT: v_add_co_u32 v9, s4, v0, v3 +; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v1, v4, s4 ; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1 ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4 @@ -684,8 +684,8 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 ; CHECK-NEXT: s_cbranch_execz .LBB4_13 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v0 -; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v1, s4 +; CHECK-NEXT: v_add_co_u32 v9, s4, v0, v3 +; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v1, v4, s4 ; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1 ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4 @@ -1411,8 +1411,8 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 ; CHECK-NEXT: s_cbranch_execz .LBB10_13 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v2, s4 +; CHECK-NEXT: v_add_co_u32 v9, s4, v1, v3 +; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v2, v4, s4 ; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1 ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4 @@ -1889,8 +1889,8 @@ define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align ; CHECK-NEXT: s_and_saveexec_b32 s7, s4 ; CHECK-NEXT: s_cbranch_execz .LBB15_13 ; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader -; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v1 -; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v2, s4 +; CHECK-NEXT: v_add_co_u32 v9, s4, v1, v3 +; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v2, v4, s4 ; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1 ; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1 ; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4 diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll index 79b531e3ce785..615740a2d0730 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll @@ -277,8 +277,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) ; GFX942-NEXT: .p2align 8 ; GFX942-NEXT: ; %bb.2: ; GFX942-NEXT: .LBB8_0: -; GFX942-NEXT: s_mov_b32 s4, 8 -; GFX942-NEXT: s_load_dword s0, s[0:1], s4 offset:0x2 +; GFX942-NEXT: s_load_dword s0, s[0:1], 0xa ; GFX942-NEXT: v_mov_b32_e32 v0, 0 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) ; GFX942-NEXT: v_mov_b32_e32 v1, s0 @@ -293,8 +292,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) ; GFX90a-NEXT: .p2align 8 ; GFX90a-NEXT: ; %bb.2: ; GFX90a-NEXT: .LBB8_0: -; GFX90a-NEXT: s_mov_b32 s0, 8 -; GFX90a-NEXT: s_load_dword s0, s[4:5], s0 offset:0x2 +; GFX90a-NEXT: s_load_dword s0, s[4:5], 0xa ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 ; GFX90a-NEXT: s_waitcnt lgkmcnt(0) ; GFX90a-NEXT: v_mov_b32_e32 v1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index c4842c1f4f523..a78c3e854b011 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -612,8 +612,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_movk_i32 s1, 0x7f ; GFX10-NEXT: v_and_b32_e32 v6, 0xfe000000, v1 ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 3, v6 -; GFX10-NEXT: v_add_co_u32 v0, s0, v0, s34 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s35, s0 +; GFX10-NEXT: v_add_co_u32 v0, s0, s34, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s35, 0, s0 ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: .LBB1_1: ; %for.cond.preheader @@ -830,8 +830,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX11-NEXT: v_and_b32_e32 v6, 0xfe000000, v1 ; GFX11-NEXT: v_lshl_or_b32 v0, v0, 3, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v0, s0, v0, s34 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s35, s0 +; GFX11-NEXT: v_add_co_u32 v0, s0, s34, v0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s35, 0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll index ff90f1f175c3c..40f39a24d7a99 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX6,GFX6_PTRADD %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX6,GFX6_LEGACY %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck --check-prefixes=GFX6 %s ; Test PTRADD handling in AMDGPUDAGToDAGISel::SelectMUBUF. @@ -34,7 +33,3 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in store i32 %result, ptr addrspace(1) %out ret void } - -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX6_LEGACY: {{.*}} -; GFX6_PTRADD: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll index a03bd97309c6b..d89f0a94e0771 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 < %s | FileCheck --check-prefixes=GFX942 %s ; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG ; opcode. The RUN lines uses -disable-separate-const-offset-from-gep to disable @@ -24,21 +23,13 @@ define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) { } define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) { -; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc: -; GFX942_PTRADD: ; %bb.0: -; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] -; GFX942_PTRADD-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24 -; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) -; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc: -; GFX942_LEGACY: ; %bb.0: -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1] -; GFX942_LEGACY-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24 -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) -; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: global_load_gep_add_reassoc: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3] +; GFX942-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24 +; GFX942-NEXT: s_waitcnt vmcnt(0) +; GFX942-NEXT: s_setpc_b64 s[30:31] %add0 = add nuw nsw i64 %voffset, 24 %gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0 %l = load i64, ptr addrspace(1) %gep0, align 8 @@ -222,23 +213,14 @@ define ptr addrspace(1) @shl_neg_offset(ptr addrspace(1) %p, i64 %noffset, i64 % ; Check that offsets are folded into global addresses if possible. For example, ; this is relevant when using --amdgpu-lower-module-lds-strategy=table. define ptr addrspace(1) @complextype_global_gep(i64 %offset) { -; GFX942_PTRADD-LABEL: complextype_global_gep: -; GFX942_PTRADD: ; %bb.0: -; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_PTRADD-NEXT: s_getpc_b64 s[0:1] -; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14 -; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22 -; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] -; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31] -; -; GFX942_LEGACY-LABEL: complextype_global_gep: -; GFX942_LEGACY: ; %bb.0: -; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX942_LEGACY-NEXT: s_getpc_b64 s[0:1] -; GFX942_LEGACY-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14 -; GFX942_LEGACY-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22 -; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31] +; GFX942-LABEL: complextype_global_gep: +; GFX942: ; %bb.0: +; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT: s_getpc_b64 s[0:1] +; GFX942-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14 +; GFX942-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22 +; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1] +; GFX942-NEXT: s_setpc_b64 s[30:31] %gep0 = getelementptr inbounds %complextype, ptr addrspace(1) @v0, i64 0, i32 1, i64 %offset %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2 ret ptr addrspace(1) %gep1 @@ -431,36 +413,20 @@ define ptr @gep_disjoint_or(ptr %base) { ; Check that AssertAlign nodes between ptradd nodes don't block offset folding, ; taken from preload-implicit-kernargs.ll define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) { -; GFX942_PTRADD-LABEL: random_incorrect_offset: -; GFX942_PTRADD: ; %bb.1: -; GFX942_PTRADD-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 -; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_PTRADD-NEXT: s_branch .LBB21_0 -; GFX942_PTRADD-NEXT: .p2align 8 -; GFX942_PTRADD-NEXT: ; %bb.2: -; GFX942_PTRADD-NEXT: .LBB21_0: -; GFX942_PTRADD-NEXT: s_load_dword s0, s[4:5], 0xa -; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, 0 -; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s0 -; GFX942_PTRADD-NEXT: global_store_dword v0, v1, s[8:9] -; GFX942_PTRADD-NEXT: s_endpgm -; -; GFX942_LEGACY-LABEL: random_incorrect_offset: -; GFX942_LEGACY: ; %bb.1: -; GFX942_LEGACY-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 -; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_LEGACY-NEXT: s_branch .LBB21_0 -; GFX942_LEGACY-NEXT: .p2align 8 -; GFX942_LEGACY-NEXT: ; %bb.2: -; GFX942_LEGACY-NEXT: .LBB21_0: -; GFX942_LEGACY-NEXT: s_mov_b32 s0, 8 -; GFX942_LEGACY-NEXT: s_load_dword s0, s[4:5], s0 offset:0x2 -; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, 0 -; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0) -; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s0 -; GFX942_LEGACY-NEXT: global_store_dword v0, v1, s[8:9] -; GFX942_LEGACY-NEXT: s_endpgm +; GFX942-LABEL: random_incorrect_offset: +; GFX942: ; %bb.1: +; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: s_branch .LBB21_0 +; GFX942-NEXT: .p2align 8 +; GFX942-NEXT: ; %bb.2: +; GFX942-NEXT: .LBB21_0: +; GFX942-NEXT: s_load_dword s0, s[4:5], 0xa +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_waitcnt lgkmcnt(0) +; GFX942-NEXT: v_mov_b32_e32 v1, s0 +; GFX942-NEXT: global_store_dword v0, v1, s[8:9] +; GFX942-NEXT: s_endpgm %imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() %gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2 %load = load i32, ptr addrspace(4) %gep diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll index 1934ce395e63d..e7c715f0a38bf 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=amdgpu-isel -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=amdgpu-isel -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=amdgpu-isel < %s | FileCheck --check-prefixes=GFX942 %s ; Tests for undef and poison DAG folds for the ISD::PTRADD SelectionDAG opcode. ; If any additions are generated for these tests, the folds don't work. @@ -44,6 +43,3 @@ define ptr @undef_base(ptr %p, i64 %offset) { %gep1 = getelementptr i8, ptr undef, i64 %offset ret ptr %gep1 } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX942_LEGACY: {{.*}} -; GFX942_PTRADD: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll index 1c4a9547ed189..42158f18da525 100644 --- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll +++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll @@ -1,14 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX8,GFX8_PTRADD -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX8,GFX8_LEGACY -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX942,GFX942_PTRADD -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX942,GFX942_LEGACY -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX10,GFX10_PTRADD -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX10,GFX10_LEGACY -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX11,GFX11_PTRADD -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX11,GFX11_LEGACY -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX12,GFX12_PTRADD -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX12,GFX12_LEGACY +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck %s -check-prefixes=GFX8 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s -check-prefixes=GFX942 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefixes=GFX10 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefixes=GFX12 ; Tests for the ISD::PTRADD SelectionDAG opcode. This only tests 64-bit address ; spaces since PTRADD is currently only used for these. @@ -509,15 +504,3 @@ entry: store i32 %val, ptr addrspace(1) %gep.to, align 4 ret void } - -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX10_LEGACY: {{.*}} -; GFX10_PTRADD: {{.*}} -; GFX11_LEGACY: {{.*}} -; GFX11_PTRADD: {{.*}} -; GFX12_LEGACY: {{.*}} -; GFX12_PTRADD: {{.*}} -; GFX8_LEGACY: {{.*}} -; GFX8_PTRADD: {{.*}} -; GFX942_LEGACY: {{.*}} -; GFX942_PTRADD: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll index 92918f19a98a5..119340fbde8a3 100644 --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -52,11 +52,12 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; HAWAII-LABEL: local_store_i55: ; HAWAII: ; %bb.0: ; HAWAII-NEXT: s_add_i32 s12, s12, s17 -; HAWAII-NEXT: s_or_b32 s0, s8, 14 -; HAWAII-NEXT: s_mov_b32 flat_scratch_lo, s13 ; HAWAII-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; HAWAII-NEXT: s_add_u32 s0, s8, 14 +; HAWAII-NEXT: s_addc_u32 s1, s9, 0 ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s9 +; HAWAII-NEXT: s_mov_b32 flat_scratch_lo, s13 +; HAWAII-NEXT: v_mov_b32_e32 v1, s1 ; HAWAII-NEXT: flat_load_ubyte v0, v[0:1] ; HAWAII-NEXT: s_load_dword s2, s[8:9], 0x0 ; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2 @@ -74,25 +75,27 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 { ; ; FIJI-LABEL: local_store_i55: ; FIJI: ; %bb.0: +; FIJI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 ; FIJI-NEXT: s_add_i32 s12, s12, s17 -; FIJI-NEXT: s_or_b32 s0, s8, 14 -; FIJI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; FIJI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s9 -; FIJI-NEXT: flat_load_ubyte v0, v[0:1] -; FIJI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8 -; FIJI-NEXT: s_load_dword s2, s[8:9], 0x0 +; FIJI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) -; FIJI-NEXT: s_and_b32 s3, s1, 0xffff -; FIJI-NEXT: v_mov_b32_e32 v1, s2 +; FIJI-NEXT: s_and_b32 s4, s1, 0xffff +; FIJI-NEXT: s_add_u32 s2, s8, 14 +; FIJI-NEXT: s_addc_u32 s3, s9, 0 +; FIJI-NEXT: v_mov_b32_e32 v0, s2 +; FIJI-NEXT: v_mov_b32_e32 v1, s3 +; FIJI-NEXT: flat_load_ubyte v0, v[0:1] +; FIJI-NEXT: s_load_dword s2, s[8:9], 0x0 ; FIJI-NEXT: v_mov_b32_e32 v2, s1 ; FIJI-NEXT: v_mov_b32_e32 v3, s0 +; FIJI-NEXT: s_waitcnt lgkmcnt(0) +; FIJI-NEXT: v_mov_b32_e32 v1, s2 ; FIJI-NEXT: ds_write_b16 v1, v2 offset:4 ; FIJI-NEXT: s_waitcnt vmcnt(0) ; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; FIJI-NEXT: v_or_b32_e32 v0, s3, v0 +; FIJI-NEXT: v_or_b32_e32 v0, s4, v0 ; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7 ; FIJI-NEXT: ds_write_b8 v1, v0 offset:6 ; FIJI-NEXT: ds_write_b32 v1, v3 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits