llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) <details> <summary>Changes</summary> If we have a divergent value passed to an outgoing inreg argument, the call needs to be executed in a waterfall loop and thus cannot be tail called. The waterfall handling of arbitrary calls is broken on the selectiondag path, so some of these cases still hit an error later. I also noticed the argument evaluation code in isEligibleForTailCallOptimization is not correctly accounting for implicit argument assignments. It also seems inreg codegen is generally broken; we are assigning arguments to the reserved private resource descriptor. --- Patch is 63.45 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/111002.diff 7 Files Affected: - (modified) llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp (+3) - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+49-10) - (modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.h (+3) - (modified) llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll (+140-56) - (modified) llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w64.ll (+140-56) - (added) llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll (+78) - (added) llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.ll (+97) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index 25e36dc4b3691f..2cde47c743f9e8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -1142,6 +1142,9 @@ bool AMDGPUCallLowering::isEligibleForTailCallOptimization( return false; } + // FIXME: We need to check if any arguments passed in SGPR are uniform. If + // they are not, this cannot be a tail call. If they are uniform, but may be + // VGPR, we need to insert readfirstlanes. if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs)) return false; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 67e5b3de741412..53cb0800f7fd27 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3593,6 +3593,8 @@ bool SITargetLowering::isEligibleForTailCallOptimization( SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx); + // FIXME: We are not allocating special input registers, so we will be + // deciding based on incorrect register assignments. CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg)); const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>(); @@ -3602,6 +3604,21 @@ bool SITargetLowering::isEligibleForTailCallOptimization( if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea()) return false; + for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) { + // FIXME: What about inreg arguments that end up passed in memory? + if (!CCVA.isRegLoc()) + continue; + + // If we are passing an argument in an SGPR, and the value is divergent, + // this call requires a waterfall loop. + if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) { + LLVM_DEBUG( + dbgs() << "Cannot tail call due to divergent outgoing argument in " + << printReg(CCVA.getLocReg(), TRI) << '\n'); + return false; + } + } + const MachineRegisterInfo &MRI = MF.getRegInfo(); return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals); } @@ -3734,6 +3751,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // arguments to begin at SP+0. Completely unused for non-tail calls. int32_t FPDiff = 0; MachineFrameInfo &MFI = MF.getFrameInfo(); + auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass @@ -3756,6 +3774,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } } + const unsigned NumSpecialInputs = RegsToPass.size(); + MVT PtrVT = MVT::i32; // Walk the register/memloc assignments, inserting copies/loads. @@ -3857,16 +3877,40 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); + SDValue ReadFirstLaneID = + DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32); + + SDValue TokenGlue; + if (CLI.ConvergenceControlToken) { + TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue, + CLI.ConvergenceControlToken); + } + // Build a sequence of copy-to-reg nodes chained together with token chain // and flag operands which copy the outgoing args into the appropriate regs. SDValue InGlue; - for (auto &RegToPass : RegsToPass) { - Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first, - RegToPass.second, InGlue); + + unsigned ArgIdx = 0; + for (auto [Reg, Val] : RegsToPass) { + if (ArgIdx++ >= NumSpecialInputs && !Val->isDivergent() && + TRI->isSGPRPhysReg(Reg)) { + // Speculatively insert a readfirstlane in case this is a uniform value in + // a VGPR. + // + // FIXME: We need to execute this in a waterfall loop if it is a divergent + // value, so let that continue to produce invalid code. + + SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val}); + if (TokenGlue) + ReadfirstlaneArgs.push_back(TokenGlue); + Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(), + ReadfirstlaneArgs); + } + + Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue); InGlue = Chain.getValue(1); } - // We don't usually want to end the call-sequence here because we would tidy // the frame up *after* the call, however in the ABI-changing tail-call case // we've carefully laid out the parameters so that when sp is reset they'll be @@ -3896,12 +3940,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32); SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee}); - if (CLI.ConvergenceControlToken) { - SDValue TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {}, - MVT::Glue, CLI.ConvergenceControlToken); + if (TokenGlue) ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token. - } - Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(), ReadfirstlaneArgs); } @@ -3928,7 +3968,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } // Add a register mask operand representing the call-preserved registers. - auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 409e5418abc8ec..99fa632c0300be 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -210,6 +210,9 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { } bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const; + bool isSGPRPhysReg(Register Reg) const { + return isSGPRClass(getPhysRegBaseClass(Reg)); + } /// \returns true if this class contains only VGPR registers static bool isVGPRClass(const TargetRegisterClass *RC) { diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll index 84dce0ffb31e8f..469d0453b9dfb1 100644 --- a/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/isel-amdgcn-cs-chain-intrinsic-w32.ll @@ -73,10 +73,16 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 + ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]] - ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]] - ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] @@ -97,12 +103,18 @@ define amdgpu_cs_chain void @chain_to_chain(<3 x i32> inreg %sgpr, { i32, ptr ad ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 - ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] - ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]] - ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]] - ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]] + ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] @@ -177,10 +189,16 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5 ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 + ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]] - ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]] - ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] @@ -201,12 +219,18 @@ define amdgpu_cs void @cs_to_chain(<3 x i32> inreg %sgpr, { i32, ptr addrspace(5 ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 - ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] - ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]] - ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]] - ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]] + ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] @@ -281,10 +305,16 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3 ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 + ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]] - ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]] - ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] @@ -305,12 +335,18 @@ define amdgpu_cs_chain void @chain_to_chain_preserve(<3 x i32> inreg %sgpr, { i3 ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 - ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY7]] - ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[COPY6]] - ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[COPY5]] - ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: $sgpr48_sgpr49_sgpr50_sgpr51 = COPY [[COPY10]] + ; DAGISEL-GFX10-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; DAGISEL-GFX10-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; DAGISEL-GFX10-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; DAGISEL-GFX10-NEXT: $vgpr8 = COPY [[COPY3]] ; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]] ; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]] @@ -385,10 +421,16 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve ; DAGISEL-GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 + ; DAGISEL-GFX11-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX11-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec ; DAGISEL-GFX11-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -1 - ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[COPY6]] - ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[COPY5]] - ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[COPY4]] + ; DAGISEL-GFX11-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] + ; DAGISEL-GFX11-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] + ; DAGISEL-GFX11-NEXT: $sgpr2 = COPY [[V_READFIRSTLANE_B32_2]] ; DAGISEL-GFX11-NEXT: $vgpr8 = COPY [[COPY3]] ; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]] ; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]] @@ -409,12 +451,18 @@ define amdgpu_cs void @cs_to_chain_preserve(<3 x i32> inreg %sgpr, { i32, ptr ad ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-hi) @callee_preserve ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 target-flags(amdgpu-abs32-lo) @callee_preserve ; DAGISEL-GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:ccr_sgpr_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1 - ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 + ; DAGISEL-GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; DAGISEL-GFX10-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY9]], implicit $exec + ; DAGISEL-GFX10-NEXT: [[COPY10:%[0-9]+]]:sgpr_128 = COPY $sgpr100_sgpr101_sgpr102_sgpr103 ; DAGISEL-GFX10-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 ... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/111002 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits