llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Scott Linder (slinder1)

<details>
<summary>Changes</summary>

---

Patch is 1.26 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/169098.diff

65 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp (+20)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll (+2831-2831)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.320bit.ll (+22-22)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.512bit.ll (+998-998)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll (+16-16)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll (+36-36)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll (+88-88)
- (modified) llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.960bit.ll (+164-164)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll (+140-140)
- (modified) llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-undefined-behavior2.ll (+7-6)
- (modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+92-74)
- (modified) llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll (+78-78)
- (modified) llvm/test/CodeGen/AMDGPU/call-args-inreg.ll (+48-48)
- (modified) llvm/test/CodeGen/AMDGPU/call-argument-types.ll (+20-20)
- (modified) llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll (+58-58)
- (modified) llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll (+53-53)
- (modified) llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs-packed.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/debug-frame.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll (+32-32)
- (modified) llvm/test/CodeGen/AMDGPU/dynamic-vgpr-reserve-stack-for-cwsr.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll (+5-20)
- (modified) llvm/test/CodeGen/AMDGPU/function-args-inreg.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll (+72-72)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll (+616-618)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll (+36-36)
- (modified) llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll (+21-21)
- (modified) llvm/test/CodeGen/AMDGPU/global-alias.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll (+46-46)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-call.ll (+552-552)
- (modified) llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/materialize-frame-index-sgpr.ll (+818-816)
- (modified) llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/nested-calls.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll (+5-20)
- (modified) llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll (+3-3)
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir (+95-95)
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll (+6-21)
- (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v2i64.v8i64.ll (+160-139)
- (modified) llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills-vgpr-lanes-usage.mir (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/sibling-call.ll (+111-111)
- (modified) llvm/test/CodeGen/AMDGPU/stack-realign.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll (+5-5)
- (modified) llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll (+7-7)
- (modified) llvm/test/CodeGen/AMDGPU/swdev504645-global-fold.ll (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll (+20-18)
- (modified) llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll (+84-84)
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/wave32.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll (+10-10)
- (modified) llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll (+4-4)


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 40eeeb8a8630d..057a5e2df8bf7 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -268,11 +268,19 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
     std::vector<CalleeSavedInfo> CSI;
     const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();

+    Register RetAddrReg = TRI->getReturnAddressReg(MF);
+    bool SpillRetAddrReg = false;
     for (unsigned I = 0; CSRegs[I]; ++I) {
       MCRegister Reg = CSRegs[I];

       if (SavedRegs.test(Reg)) {
+        if (Reg == TRI->getSubReg(RetAddrReg, AMDGPU::sub0) ||
+            Reg == TRI->getSubReg(RetAddrReg, AMDGPU::sub1)) {
+          SpillRetAddrReg = true;
+          continue;
+        }
+
         const TargetRegisterClass *RC =
             TRI->getMinimalPhysRegClass(Reg, MVT::i32);
         int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
@@ -283,6 +291,18 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
       }
     }

+    // Return address uses a register pair. Add the super register to the
+    // CSI list so that it's easier to identify the entire spill and CFI
+    // can be emitted appropriately.
+    if (SpillRetAddrReg) {
+      const TargetRegisterClass *RC =
+          TRI->getMinimalPhysRegClass(RetAddrReg, MVT::i64);
+      int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
+                                         TRI->getSpillAlign(*RC), true);
+      CSI.push_back(CalleeSavedInfo(RetAddrReg, JunkFI));
+      CalleeSavedFIs.push_back(JunkFI);
+    }
+
     if (!CSI.empty()) {
       for (MachineBasicBlock *SaveBlock : SaveBlocks)
         insertCSRSaves(*SaveBlock, CSI, Indexes, LIS);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
index b84b31cd2702c..023398377de94 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll
@@ -23,10 +23,10 @@ define ptr addrspace(1) @call_assert_align() {
 ; CHECK-NEXT: v_writelane_b32 v40, s31, 1
 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
 ; CHECK-NEXT: s_waitcnt vmcnt(0)
 ; CHECK-NEXT: v_readlane_b32 s31, v40, 1
-; CHECK-NEXT: v_readlane_b32 s30, v40, 0
 ; CHECK-NEXT: s_mov_b32 s32, s33
 ; CHECK-NEXT: v_readlane_b32 s4, v40, 2
 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
index 7e6f500181ec6..2c1beb8468576 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll
@@ -238,8 +238,8 @@ define void @func_caller_stack() #2 {
 ; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16
 ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT: s_mov_b32 s32, s33
 ; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -277,8 +277,8 @@ define void @func_caller_stack() #2 {
 ; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32_v16i32_v4i32@rel32@hi+12
 ; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
 ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT: s_mov_b32 s32, s33
 ; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
@@ -363,8 +363,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) #2 {
 ; MUBUF-NEXT: s_waitcnt vmcnt(1)
 ; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60
 ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
 ; MUBUF-NEXT: s_mov_b32 s32, s33
 ; MUBUF-NEXT: v_readlane_b32 s4, v40, 2
 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
@@ -414,8 +414,8 @@ define void @func_caller_byval(ptr addrspace(5) %argptr) #2 {
 ; FLATSCR-NEXT: s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[0:1], s32 offset:56
 ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
-; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
 ; FLATSCR-NEXT: s_mov_b32 s32, s33
 ; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2
 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
index 72766f47030cc..35591cd602992 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll
@@ -244,8 +244,8 @@ define void @sink_null_insert_pt(ptr addrspace(4) %arg0) #0 {
 ; GFX9-NEXT: s_addk_i32 s32, 0x400
 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1
 ; GFX9-NEXT: s_swappc_b64 s[30:31], 0
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
 ; GFX9-NEXT: s_mov_b32 s32, s33
 ; GFX9-NEXT: v_readlane_b32 s4, v40, 2
 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index 3194581fa4213..0e24430e7be20 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -30,8 +30,8 @@ define void @parent_func_missing_inputs() #0 {
 ; FIXEDABI-NEXT: s_addc_u32 s17, s17, requires_all_inputs@rel32@hi+12
 ; FIXEDABI-NEXT: v_writelane_b32 v40, s31, 1
 ; FIXEDABI-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1
 ; FIXEDABI-NEXT: v_readlane_b32 s30, v40, 0
+; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1
 ; FIXEDABI-NEXT: s_mov_b32 s32, s33
 ; FIXEDABI-NEXT: v_readlane_b32 s4, v40, 2
 ; FIXEDABI-NEXT: s_or_saveexec_b64 s[6:7], -1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
index 149b0cb4e052d..b6e65c8842904 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn-call-whole-wave.ll
@@ -35,8 +35,8 @@ define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr)
 ; DAGISEL-NEXT: s_clause 0x1
 ; DAGISEL-NEXT: scratch_load_b32 v41, off, s33
 ; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
-; DAGISEL-NEXT: v_readlane_b32 s31, v42, 1
 ; DAGISEL-NEXT: v_readlane_b32 s30, v42, 0
+; DAGISEL-NEXT: v_readlane_b32 s31, v42, 1
 ; DAGISEL-NEXT: s_mov_b32 s32, s33
 ; DAGISEL-NEXT: v_readlane_b32 s0, v42, 2
 ; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
@@ -78,8 +78,8 @@ define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr)
 ; GISEL-NEXT: s_clause 0x1
 ; GISEL-NEXT: scratch_load_b32 v41, off, s33
 ; GISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4
-; GISEL-NEXT: v_readlane_b32 s31, v42, 1
 ; GISEL-NEXT: v_readlane_b32 s30, v42, 0
+; GISEL-NEXT: v_readlane_b32 s31, v42, 1
 ; GISEL-NEXT: s_mov_b32 s32, s33
 ; GISEL-NEXT: v_readlane_b32 s0, v42, 2
 ; GISEL-NEXT: s_or_saveexec_b32 s1, -1
@@ -787,8 +787,8 @@ define amdgpu_gfx void @ret_void(i32 %x) #0 {
 ; DAGISEL-NEXT: s_wait_alu 0xfffe
 ; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; DAGISEL-NEXT: v_readlane_b32 s31, v40, 1
 ; DAGISEL-NEXT: v_readlane_b32 s30, v40, 0
+; DAGISEL-NEXT: v_readlane_b32 s31, v40, 1
 ; DAGISEL-NEXT: s_mov_b32 s32, s33
 ; DAGISEL-NEXT: v_readlane_b32 s0, v40, 2
 ; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1
@@ -822,8 +822,8 @@ define amdgpu_gfx void @ret_void(i32 %x) #0 {
 ; GISEL-NEXT: s_wait_alu 0xfffe
 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1]
 ; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GISEL-NEXT: v_readlane_b32 s31, v40, 1
 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0
+; GISEL-NEXT: v_readlane_b32 s31, v40, 1
 ; GISEL-NEXT: s_mov_b32 s32, s33
 ; GISEL-NEXT: v_readlane_b32 s0, v40, 2
 ; GISEL-NEXT: s_or_saveexec_b32 s1, -1
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index c794168c40075..1e2a46fcefa36 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -7466,42 +7466,42 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; SI-NEXT: buffer_store_dword v22, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[4:5]
 ; SI-NEXT: s_waitcnt expcnt(2)
-; SI-NEXT: v_writelane_b32 v20, s30, 0
-; SI-NEXT: v_writelane_b32 v20, s31, 1
-; SI-NEXT: v_writelane_b32 v20, s34, 2
-; SI-NEXT: v_writelane_b32 v20, s35, 3
-; SI-NEXT: v_writelane_b32 v20, s36, 4
-; SI-NEXT: v_writelane_b32 v20, s37, 5
-; SI-NEXT: v_writelane_b32 v20, s38, 6
-; SI-NEXT: v_writelane_b32 v20, s39, 7
-; SI-NEXT: v_writelane_b32 v20, s48, 8
-; SI-NEXT: v_writelane_b32 v20, s49, 9
-; SI-NEXT: v_writelane_b32 v20, s50, 10
-; SI-NEXT: v_writelane_b32 v20, s51, 11
-; SI-NEXT: v_writelane_b32 v20, s52, 12
-; SI-NEXT: v_writelane_b32 v20, s53, 13
-; SI-NEXT: v_writelane_b32 v20, s54, 14
-; SI-NEXT: v_writelane_b32 v20, s55, 15
-; SI-NEXT: v_writelane_b32 v20, s64, 16
-; SI-NEXT: v_writelane_b32 v20, s65, 17
-; SI-NEXT: v_writelane_b32 v20, s66, 18
-; SI-NEXT: v_writelane_b32 v20, s67, 19
-; SI-NEXT: v_writelane_b32 v20, s68, 20
-; SI-NEXT: v_writelane_b32 v20, s69, 21
-; SI-NEXT: v_writelane_b32 v20, s70, 22
-; SI-NEXT: v_writelane_b32 v20, s71, 23
-; SI-NEXT: v_writelane_b32 v20, s80, 24
-; SI-NEXT: v_writelane_b32 v20, s81, 25
-; SI-NEXT: v_writelane_b32 v20, s82, 26
-; SI-NEXT: v_writelane_b32 v20, s83, 27
-; SI-NEXT: v_writelane_b32 v20, s84, 28
-; SI-NEXT: v_writelane_b32 v20, s85, 29
-; SI-NEXT: v_writelane_b32 v20, s86, 30
-; SI-NEXT: v_writelane_b32 v20, s87, 31
-; SI-NEXT: v_writelane_b32 v20, s96, 32
-; SI-NEXT: v_writelane_b32 v20, s97, 33
+; SI-NEXT: v_writelane_b32 v20, s34, 0
+; SI-NEXT: v_writelane_b32 v20, s35, 1
+; SI-NEXT: v_writelane_b32 v20, s36, 2
+; SI-NEXT: v_writelane_b32 v20, s37, 3
+; SI-NEXT: v_writelane_b32 v20, s38, 4
+; SI-NEXT: v_writelane_b32 v20, s39, 5
+; SI-NEXT: v_writelane_b32 v20, s48, 6
+; SI-NEXT: v_writelane_b32 v20, s49, 7
+; SI-NEXT: v_writelane_b32 v20, s50, 8
+; SI-NEXT: v_writelane_b32 v20, s51, 9
+; SI-NEXT: v_writelane_b32 v20, s52, 10
+; SI-NEXT: v_writelane_b32 v20, s53, 11
+; SI-NEXT: v_writelane_b32 v20, s54, 12
+; SI-NEXT: v_writelane_b32 v20, s55, 13
+; SI-NEXT: v_writelane_b32 v20, s64, 14
+; SI-NEXT: v_writelane_b32 v20, s65, 15
+; SI-NEXT: v_writelane_b32 v20, s66, 16
+; SI-NEXT: v_writelane_b32 v20, s67, 17
+; SI-NEXT: v_writelane_b32 v20, s68, 18
+; SI-NEXT: v_writelane_b32 v20, s69, 19
+; SI-NEXT: v_writelane_b32 v20, s70, 20
+; SI-NEXT: v_writelane_b32 v20, s71, 21
+; SI-NEXT: v_writelane_b32 v20, s80, 22
+; SI-NEXT: v_writelane_b32 v20, s81, 23
+; SI-NEXT: v_writelane_b32 v20, s82, 24
+; SI-NEXT: v_writelane_b32 v20, s83, 25
+; SI-NEXT: v_writelane_b32 v20, s84, 26
+; SI-NEXT: v_writelane_b32 v20, s85, 27
+; SI-NEXT: v_writelane_b32 v20, s86, 28
+; SI-NEXT: v_writelane_b32 v20, s87, 29
+; SI-NEXT: v_writelane_b32 v20, s96, 30
+; SI-NEXT: v_writelane_b32 v20, s97, 31
+; SI-NEXT: v_writelane_b32 v20, s98, 32
+; SI-NEXT: v_writelane_b32 v20, s99, 33
 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; SI-NEXT: v_writelane_b32 v20, s98, 34
+; SI-NEXT: v_writelane_b32 v20, s30, 34
 ; SI-NEXT: v_readfirstlane_b32 s44, v1
 ; SI-NEXT: v_readfirstlane_b32 s45, v2
 ; SI-NEXT: v_readfirstlane_b32 s42, v3
@@ -7521,7 +7521,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; SI-NEXT: v_readfirstlane_b32 s4, v17
 ; SI-NEXT: s_and_b64 s[46:47], vcc, exec
 ; SI-NEXT: v_readfirstlane_b32 s5, v18
-; SI-NEXT: v_writelane_b32 v20, s99, 35
+; SI-NEXT: v_writelane_b32 v20, s31, 35
 ; SI-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
 ; SI-NEXT: ; implicit-def: $vgpr21 : SGPR spill to VGPR lane
 ; SI-NEXT: s_cbranch_scc0 .LBB13_4
@@ -8391,6 +8391,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; SI-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
 ; SI-NEXT: v_add_i32_e32 v0, vcc, 0x7c, v0
 ; SI-NEXT: v_mov_b32_e32 v1, s4
+; SI-NEXT: v_readlane_b32 s30, v20, 34
 ; SI-NEXT: v_readlane_b32 s19, v22, 11
 ; SI-NEXT: v_readlane_b32 s17, v22, 17
 ; SI-NEXT: v_readlane_b32 s15, v22, 23
@@ -8398,42 +8399,41 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; SI-NEXT: v_readlane_b32 s11, v22, 35
 ; SI-NEXT: v_readlane_b32 s9, v22, 41
 ; SI-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; SI-NEXT: v_readlane_b32 s99, v20, 35
-; SI-NEXT: v_readlane_b32 s98, v20, 34
-; SI-NEXT: v_readlane_b32 s97, v20, 33
-; SI-NEXT: v_readlane_b32 s96, v20, 32
-; SI-NEXT: v_readlane_b32 s87, v20, 31
-; SI-NEXT: v_readlane_b32 s86, v20, 30
-; SI-NEXT: v_readlane_b32 s85, v20, 29
-; SI-NEXT: v_readlane_b32 s84, v20, 28
-; SI-NEXT: v_readlane_b32 s83, v20, 27
-; SI-NEXT: v_readlane_b32 s82, v20, 26
-; SI-NEXT: v_readlane_b32 s81, v20, 25
-; SI-NEXT: v_readlane_b32 s80, v20, 24
-; SI-NEXT: v_readlane_b32 s71, v20, 23
-; SI-NEXT: v_readlane_b32 s70, v20, 22
-; SI-NEXT: v_readlane_b32 s69, v20, 21
-; SI-NEXT: v_readlane_b32 s68, v20, 20
-; SI-NEXT: v_readlane_b32 s67, v20, 19
-; SI-NEXT: v_readlane_b32 s66, v20, 18
-; SI-NEXT: v_readlane_b32 s65, v20, 17
-; SI-NEXT: v_readlane_b32 s64, v20, 16
-; SI-NEXT: v_readlane_b32 s55, v20, 15
-; SI-NEXT: v_readlane_b32 s54, v20, 14
-; SI-NEXT: v_readlane_b32 s53, v20, 13
-; SI-NEXT: v_readlane_b32 s52, v20, 12
-; SI-NEXT: v_readlane_b32 s51, v20, 11
-; SI-NEXT: v_readlane_b32 s50, v20, 10
-; SI-NEXT: v_readlane_b32 s49, v20, 9
-; SI-NEXT: v_readlane_b32 s48, v20, 8
-; SI-NEXT: v_readlane_b32 s39, v20, 7
-; SI-NEXT: v_readlane_b32 s38, v20, 6
-; SI-NEXT: v_readlane_b32 s37, v20, 5
-; SI-NEXT: v_readlane_b32 s36, v20, 4
-; SI-NEXT: v_readlane_b32 s35, v20, 3
-; SI-NEXT: v_readlane_b32 s34, v20, 2
-; SI-NEXT: v_readlane_b32 s31, v20, 1
-; SI-NEXT: v_readlane_b32 s30, v20, 0
+; SI-NEXT: v_readlane_b32 s31, v20, 35
+; SI-NEXT: v_readlane_b32 s99, v20, 33
+; SI-NEXT: v_readlane_b32 s98, v20, 32
+; SI-NEXT: v_readlane_b32 s97, v20, 31
+; SI-NEXT: v_readlane_b32 s96, v20, 30
+; SI-NEXT: v_readlane_b32 s87, v20, 29
+; SI-NEXT: v_readlane_b32 s86, v20, 28
+; SI-NEXT: v_readlane_b32 s85, v20, 27
+; SI-NEXT: v_readlane_b32 s84, v20, 26
+; SI-NEXT: v_readlane_b32 s83, v20, 25
+; SI-NEXT: v_readlane_b32 s82, v20, 24
+; SI-NEXT: v_readlane_b32 s81, v20, 23
+; SI-NEXT: v_readlane_b32 s80, v20, 22
+; SI-NEXT: v_readlane_b32 s71, v20, 21
+; SI-NEXT: v_readlane_b32 s70, v20, 20
+; SI-NEXT: v_readlane_b32 s69, v20, 19
+; SI-NEXT: v_readlane_b32 s68, v20, 18
+; SI-NEXT: v_readlane_b32 s67, v20, 17
+; SI-NEXT: v_readlane_b32 s66, v20, 16
+; SI-NEXT: v_readlane_b32 s65, v20, 15
+; SI-NEXT: v_readlane_b32 s64, v20, 14
+; SI-NEXT: v_readlane_b32 s55, v20, 13
+; SI-NEXT: v_readlane_b32 s54, v20, 12
+; SI-NEXT: v_readlane_b32 s53, v20, 11
+; SI-NEXT: v_readlane_b32 s52, v20, 10
+; SI-NEXT: v_readlane_b32 s51, v20, 9
+; SI-NEXT: v_readlane_b32 s50, v20, 8
+; SI-NEXT: v_readlane_b32 s49, v20, 7
+; SI-NEXT: v_readlane_b32 s48, v20, 6
+; SI-NEXT: v_readlane_b32 s39, v20, 5
+; SI-NEXT: v_readlane_b32 s38, v20, 4
+; SI-NEXT: v_readlane_b32 s37, v20, 3
+; SI-NEXT: v_readlane_b32 s36, v20, 2
+; SI-NEXT: v_readlane_b32 s35, v20, 1
+; SI-NEXT: v_readlane_b32 s34, v20, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[4:5], -1
 ; SI-NEXT: buffer_load_dword v20, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
@@ -8630,38 +8630,38 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; VI-NEXT: buffer_store_dword v20, off, s[0:3], s32 ; 4-byte Folded Spill
 ; VI-NEXT: buffer_store_dword v21, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
 ; VI-NEXT: s_mov_b64 exec, s[4:5]
-; VI-NEXT: v_writelane_b32 v20, s30, 0
-; VI-NEXT: v_writelane_b32 v20, s31, 1
-; VI-NEXT: v_writelane_b32 v20, s34, 2
-; VI-NEXT: v_writelane_b32 v20, s35, 3
-; VI-NEXT: v_writelane_b32 v20, s36, 4
-; VI-NEXT: v_writelane_b32 v20, s37, 5
-; VI-NEXT: v_writelane_b32 v20, s38, 6
-; VI-NEXT: v_writelane_b32 v20, s39, 7
-; VI-NEXT: v_writelane_b32 v20, s48, 8
-; VI-NEXT: v_writelane_b32 v20, s49, 9
-; VI-NEXT: v_writelane_b32 v20, s50, 10
-; VI-NEXT: v_writelane_b32 v20, s51, 11
-; VI-NEXT: v_writelane_b32 v20, s52, 12
-; VI-NEXT: v_writelane_b32 v20, s53, 13
-; VI-NEXT: v_writelane_b32 v20, s54, 14
-; VI-NEXT: v_writelane_b32 v20, s55, 15
-; VI-NEXT: v_writelane_b32 v20, s64, 16
-; VI-NEXT: v_writelane_b32 v20, s65, 17
-; VI-NEXT: v_writelane_b32 v20, s66, 18
-; VI-NEXT: v_writelane_b32 v20, s67, 19
-; VI-NEXT: v_writelane_b32 v20, s68, 20
-; VI-NEXT: v_writelane_b32 v20, s69, 21
-; VI-NEXT: v_writelane_b32 v20, s70, 22
-; VI-NEXT: v_writelane_b32 v20, s71, 23
-; VI-NEXT: v_writelane_b32 v20, s80, 24
-; VI-NEXT: v_writelane_b32 v20, s81, 25
-; VI-NEXT: v_writelane_b32 v20, s82, 26
-; VI-NEXT: v_writelane_b32 v20, s83, 27
-; VI-NEXT: v_writelane_b32 v20, s84, 28
-; VI-NEXT: v_writelane_b32 v20, s85, 29
+; VI-NEXT: v_writelane_b32 v20, s34, 0
+; VI-NEXT: v_writelane_b32 v20, s35, 1
+; VI-NEXT: v_writelane_b32 v20, s36, 2
+; VI-NEXT: v_writelane_b32 v20, s37, 3
+; VI-NEXT: v_writelane_b32 v20, s38, 4
+; VI-NEXT: v_writelane_b32 v20, s39, 5
+; VI-NEXT: v_writelane_b32 v20, s48, 6
+; VI-NEXT: v_writelane_b32 v20, s49, 7
+; VI-NEXT: v_writelane_b32 v20, s50, 8
+; VI-NEXT: v_writelane_b32 v20, s51, 9
+; VI-NEXT: v_writelane_b32 v20, s52, 10
+; VI-NEXT: v_writelane_b32 v20, s53, 11
+; VI-NEXT: v_writelane_b32 v20, s54, 12
+; VI-NEXT: v_writelane_b32 v20, s55, 13
+; VI-NEXT: v_writelane_b32 v20, s64, 14
+; VI-NEXT: v_writelane_b32 v20, s65, 15
+; VI-NEXT: v_writelane_b32 v20, s66, 16
+; VI-NEXT: v_writelane_b32 v20, s67, 17
+; VI-NEXT: v_writelane_b32 v20, s68, 18
+; VI-NEXT: v_writelane_b32 v20, s69, 19
+; VI-NEXT: v_writelane_b32 v20, s70, 20
+; VI-NEXT: v_writelane_b32 v20, s71, 21
+; VI-NEXT: v_writelane_b32 v20, s80, 22
+; VI-NEXT: v_writelane_b32 v20, s81, 23
+; VI-NEXT: v_writelane_b32 v20, s82, 24
+; VI-NEXT: v_writelane_b32 v20, s83, 25
+; VI-NEXT: v_writelane_b32 v20, s84, 26
+; VI-NEXT: v_writelane_b32 v20, s85, 27
+; VI-NEXT: v_writelane_b32 v20, s86, 28
+; VI-NEXT: v_writelane_b32 v20, s87, 29
 ; VI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v19
-; VI-NEXT: v_writelane_b32 v20, s86, 30
+; VI-NEXT: v_writelane_b32 v20, s30, 30
 ; VI-NEXT: v_readfirstlane_b32 s44, v1
 ; VI-NEXT: v_readfirstlane_b32 s45, v2
 ; VI-NEXT: v_readfirstlane_b32 s42, v3
@@ -8681,7 +8681,7 @@ define inreg <128 x i8> @bitcast_v32i32_to_v128i8_scalar(<32 x i32> inreg %a, i3
 ; VI-NEXT: v_readfirstlane_b32 s4, v17
 ; VI-NEXT: s_and_b64 s[46:47], vcc, exec
 ; VI-NEXT: v_readfirstlane_b32 s5, v18
-; VI-NEXT: v_writelane_b32 v20, s87, 31
+; VI-NEXT: v_writelane_b32 v20, s31, 31
 ; VI-NEXT: ; implicit-def: $vgpr21 : SGPR spil...
[truncated]
``````````

</details>

https://github.com/llvm/llvm-project/pull/169098
