llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) <details> <summary>Changes</summary> In principle we need this analysis to avoid regressions when using v_pk_mov_b32 when shuffling to physical register inputs. However, as it stands this only introduces regressions by decomposing every useful case where we benefit from the instruction. --- Patch is 379.75 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125657.diff 13 Files Affected: - (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+34) - (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+3) - (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+1-1) - (modified) llvm/test/CodeGen/AMDGPU/reg-sequence-like-v-pk-mov-b32.mir (+14-14) - (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll (+121-89) - (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v3f32.ll (+143-129) - (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v4f32.ll (+466-343) - (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v2i32.ll (+121-89) - (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v3i32.ll (+143-129) - (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4i32.v4i32.ll (+466-343) - (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v2p3.ll (+121-89) - (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v3p3.ll (+143-129) - (modified) llvm/test/CodeGen/AMDGPU/shufflevector.v4p3.v4p3.ll (+466-343) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 35667801c809d5..3cae838321885d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -9725,6 +9725,40 @@ MachineInstr *SIInstrInfo::foldMemoryOperandImpl( return nullptr; } +bool SIInstrInfo::getRegSequenceLikeInputs( + const MachineInstr &MI, unsigned DefIdx, + SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const { + assert(MI.getOpcode() == AMDGPU::V_PK_MOV_B32 && + "v_pk_mov_b32 is the only reg-sequence like instruction"); + assert(DefIdx == 0); + + unsigned Src0Mods = MI.getOperand(1).getImm(); + const MachineOperand &Src0 = MI.getOperand(2); + unsigned Src1Mods = MI.getOperand(3).getImm(); + const MachineOperand &Src1 = MI.getOperand(4); + + unsigned SubReg0 = + Src0Mods & SISrcMods::OP_SEL_0 ? AMDGPU::sub1 : AMDGPU::sub0; + unsigned SubReg1 = + Src1Mods & SISrcMods::OP_SEL_0 ? AMDGPU::sub1 : AMDGPU::sub0; + + if (!Src0.isUndef()) { + // src0 will provide the result sub0 from src0. + SubReg0 = RI.composeSubRegIndices(Src0.getSubReg(), SubReg0); + InputRegs.push_back( + RegSubRegPairAndIdx(Src0.getReg(), SubReg0, AMDGPU::sub0)); + } + + if (!Src1.isUndef()) { + // src1 will provide the result's sub1 from src1. + SubReg1 = RI.composeSubRegIndices(Src1.getSubReg(), SubReg1); + InputRegs.push_back( + RegSubRegPairAndIdx(Src1.getReg(), SubReg1, AMDGPU::sub1)); + } + + return true; +} + unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost) const { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 933935a86f9f98..425ff77e8cdc3f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -1437,6 +1437,9 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { int FrameIndex, LiveIntervals *LIS = nullptr, VirtRegMap *VRM = nullptr) const override; + bool getRegSequenceLikeInputs( + const MachineInstr &MI, unsigned DefIdx, + SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const override; unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 1afd68767cd3ba..a6f8035e93b182 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1107,7 +1107,7 @@ let isCommutable = 1, isReMaterializable = 1 in { defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>; } // End SubtargetPredicate = HasPackedFP32Ops - let SubtargetPredicate = HasPkMovB32 in + let SubtargetPredicate = HasPkMovB32, isRegSequence = 1 in defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>; } // End isCommutable = 1, isReMaterializable = 1 diff --git a/llvm/test/CodeGen/AMDGPU/reg-sequence-like-v-pk-mov-b32.mir b/llvm/test/CodeGen/AMDGPU/reg-sequence-like-v-pk-mov-b32.mir index 90291221e8e178..602fc29bf1db36 100644 --- a/llvm/test/CodeGen/AMDGPU/reg-sequence-like-v-pk-mov-b32.mir +++ b/llvm/test/CodeGen/AMDGPU/reg-sequence-like-v-pk-mov-b32.mir @@ -15,8 +15,8 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, [[REG_SEQUENCE]], 0, 0, 0, 0, 0, implicit $exec - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 ; CHECK-NEXT: $vgpr4 = COPY [[COPY2]] ; CHECK-NEXT: $vgpr5 = COPY [[COPY3]] ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 @@ -49,8 +49,8 @@ body: | ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 ; CHECK-NEXT: $vgpr4 = COPY [[COPY4]] ; CHECK-NEXT: $vgpr5 = COPY [[COPY5]] ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 @@ -118,7 +118,7 @@ body: | ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 ; CHECK-NEXT: $vgpr4 = COPY [[COPY4]] ; CHECK-NEXT: $vgpr5 = COPY [[COPY5]] ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 @@ -154,7 +154,7 @@ body: | ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 ; CHECK-NEXT: $vgpr4 = COPY [[COPY4]] ; CHECK-NEXT: $vgpr5 = COPY [[COPY5]] @@ -191,7 +191,7 @@ body: | ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, undef [[REG_SEQUENCE]], 8, [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 ; CHECK-NEXT: $vgpr4 = COPY [[COPY4]] ; CHECK-NEXT: $vgpr5 = COPY [[COPY5]] @@ -229,7 +229,7 @@ body: | ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1 ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]], 8, undef [[REG_SEQUENCE1]], 0, 0, 0, 0, 0, implicit $exec ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 ; CHECK-NEXT: $vgpr4 = COPY [[COPY4]] ; CHECK-NEXT: $vgpr5 = COPY [[COPY5]] ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 @@ -304,8 +304,8 @@ body: | ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2 ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]].sub0_sub1, 8, [[REG_SEQUENCE1]].sub0_sub1, 0, 0, 0, 0, 0, implicit $exec - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 ; CHECK-NEXT: $vgpr4 = COPY [[COPY6]] ; CHECK-NEXT: $vgpr5 = COPY [[COPY7]] ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 @@ -345,8 +345,8 @@ body: | ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY4]], %subreg.sub1, [[COPY5]], %subreg.sub2 ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 8, [[REG_SEQUENCE]].sub0_sub1, 12, [[REG_SEQUENCE1]].sub0_sub1, 0, 0, 0, 0, 0, implicit $exec - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 ; CHECK-NEXT: $vgpr4 = COPY [[COPY6]] ; CHECK-NEXT: $vgpr5 = COPY [[COPY7]] ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 @@ -388,8 +388,8 @@ body: | ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128_align2 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 12, [[REG_SEQUENCE]].sub2_sub3, 12, [[REG_SEQUENCE1]].sub0_sub1, 0, 0, 0, 0, 0, implicit $exec - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub1 - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[V_PK_MOV_B32_]].sub0 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub3 ; CHECK-NEXT: $vgpr4 = COPY [[COPY8]] ; CHECK-NEXT: $vgpr5 = COPY [[COPY9]] ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr4, implicit $vgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll index 9e3c044a76295f..fafdac7ffac62f 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v4f32.v2f32.ll @@ -176,7 +176,8 @@ define void @v_shuffle_v4f32_v2f32__3_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -191,8 +192,8 @@ define void @v_shuffle_v4f32_v2f32__3_0_u_u(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -565,15 +566,16 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v1 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -581,15 +583,16 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[2:3] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, v1 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -672,26 +675,31 @@ define void @v_shuffle_v4f32_v2f32__3_3_3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v4f32_v2f32__3_3_3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v5 +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v4f32_v2f32__3_3_3_2: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[1,0] -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v5 +; GFX940-NEXT: v_mov_b32_e32 v2, v5 +; GFX940-NEXT: v_mov_b32_e32 v3, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -865,7 +873,8 @@ define void @v_shuffle_v4f32_v2f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v3 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -878,7 +887,8 @@ define void @v_shuffle_v4f32_v2f32__1_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v3 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -962,7 +972,8 @@ define void @v_shuffle_v4f32_v2f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v1 +; GFX90A-NEXT: v_mov_b32_e32 v1, v2 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 ; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) @@ -978,7 +989,8 @@ define void @v_shuffle_v4f32_v2f32__3_0_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[2:3] op_sel:[1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v1, v2 ; GFX940-NEXT: v_mov_b32_e32 v3, v2 ; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) @@ -1126,13 +1138,14 @@ define void @v_shuffle_v4f32_v2f32__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: v_mov_b32_e32 v0, v5 +; GFX90A-NEXT: v_mov_b32_e32 v1, v4 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1142,13 +1155,14 @@ define void @v_shuffle_v4f32_v2f32__3_2_0_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND +; GFX940-NEXT: v_mov_b32_e32 v6, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[4:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] ; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: global_store_dwordx4 v6, v[0:3], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -1289,15 +1303,16 @@ define void @v_shuffle_v4f32_v2f32__3_3_1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[4:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: global_store_dwordx4 v4, v[0:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, v5 +; GFX90A-NEXT: v_mov_b32_e32 v3, v4 +; GFX90A-NEXT: global_store_dwordx4 v6, v[0:3], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1305,15 +1320,16 @@ define void @v_shuffle_v4f... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/125657 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits