Author: Shoreshen Date: 2025-07-04T09:43:00+08:00 New Revision: d6c3ae82c37987098b409ec46971cb2ee370f8c5
URL: https://github.com/llvm/llvm-project/commit/d6c3ae82c37987098b409ec46971cb2ee370f8c5 DIFF: https://github.com/llvm/llvm-project/commit/d6c3ae82c37987098b409ec46971cb2ee370f8c5.diff LOG: Revert "[AMDGPU] Re-apply: Implement vop3p complex pattern optmization for gi…" This reverts commit db03c27763656948323a50b9706da912c581e6f2. Added: Modified: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll llvm/test/CodeGen/AMDGPU/packed-fp32.ll llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll Removed: ################################################################################ diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index fd679a9933cf0..b632b16f5c198 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4327,591 +4327,60 @@ AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const { }}; } -enum class SrcStatus { - IS_SAME, - IS_UPPER_HALF, - IS_LOWER_HALF, - IS_UPPER_HALF_NEG, - // This means current op = [op_upper, op_lower] and src = -op_lower. - IS_LOWER_HALF_NEG, - IS_HI_NEG, - // This means current op = [op_upper, op_lower] and src = [op_upper, - // -op_lower]. 
- IS_LO_NEG, - IS_BOTH_NEG, - INVALID, - NEG_START = IS_UPPER_HALF_NEG, - NEG_END = IS_BOTH_NEG, - HALF_START = IS_UPPER_HALF, - HALF_END = IS_LOWER_HALF_NEG -}; -/// Test if the MI is truncating to half, such as `%reg0:n = G_TRUNC %reg1:2n` -static bool isTruncHalf(const MachineInstr *MI, - const MachineRegisterInfo &MRI) { - if (MI->getOpcode() != AMDGPU::G_TRUNC) - return false; - - unsigned DstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits(); - unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits(); - return DstSize * 2 == SrcSize; -} - -/// Test if the MI is logic shift right with half bits, -/// such as `%reg0:2n =G_LSHR %reg1:2n, CONST(n)` -static bool isLshrHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) { - if (MI->getOpcode() != AMDGPU::G_LSHR) - return false; - - Register ShiftSrc; - std::optional<ValueAndVReg> ShiftAmt; - if (mi_match(MI->getOperand(0).getReg(), MRI, - m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) { - unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits(); - unsigned Shift = ShiftAmt->Value.getZExtValue(); - return Shift * 2 == SrcSize; - } - return false; -} - -/// Test if the MI is shift left with half bits, -/// such as `%reg0:2n =G_SHL %reg1:2n, CONST(n)` -static bool isShlHalf(const MachineInstr *MI, const MachineRegisterInfo &MRI) { - if (MI->getOpcode() != AMDGPU::G_SHL) - return false; - - Register ShiftSrc; - std::optional<ValueAndVReg> ShiftAmt; - if (mi_match(MI->getOperand(0).getReg(), MRI, - m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) { - unsigned SrcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits(); - unsigned Shift = ShiftAmt->Value.getZExtValue(); - return Shift * 2 == SrcSize; - } - return false; -} - -/// Test function, if the MI is `%reg0:n, %reg1:n = G_UNMERGE_VALUES %reg2:2n` -static bool isUnmergeHalf(const MachineInstr *MI, - const MachineRegisterInfo &MRI) { - if (MI->getOpcode() != AMDGPU::G_UNMERGE_VALUES) - return false; - return 
MI->getNumOperands() == 3 && MI->getOperand(0).isDef() && - MI->getOperand(1).isDef() && !MI->getOperand(2).isDef(); -} - -enum class TypeClass { VECTOR_OF_TWO, SCALAR, NONE_OF_LISTED }; - -static TypeClass isVectorOfTwoOrScalar(Register Reg, - const MachineRegisterInfo &MRI) { - LLT OpTy = MRI.getType(Reg); - if (OpTy.isScalar()) - return TypeClass::SCALAR; - if (OpTy.isVector() && OpTy.getNumElements() == 2) - return TypeClass::VECTOR_OF_TWO; - return TypeClass::NONE_OF_LISTED; -} - -static SrcStatus getNegStatus(Register Reg, SrcStatus S, - const MachineRegisterInfo &MRI) { - TypeClass NegType = isVectorOfTwoOrScalar(Reg, MRI); - if (NegType != TypeClass::VECTOR_OF_TWO && NegType != TypeClass::SCALAR) - return SrcStatus::INVALID; - - switch (S) { - case SrcStatus::IS_SAME: - if (NegType == TypeClass::VECTOR_OF_TWO) { - // Vector of 2: - // [SrcHi, SrcLo] = [CurrHi, CurrLo] - // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type) - // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type) - // [SrcHi, SrcLo] = [-OpHi, -OpLo] - return SrcStatus::IS_BOTH_NEG; - } - if (NegType == TypeClass::SCALAR) { - // Scalar: - // [SrcHi, SrcLo] = [CurrHi, CurrLo] - // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type) - // [CurrHi, CurrLo] = [-OpHi, OpLo](Type) - // [SrcHi, SrcLo] = [-OpHi, OpLo] - return SrcStatus::IS_HI_NEG; - } - break; - case SrcStatus::IS_HI_NEG: - if (NegType == TypeClass::VECTOR_OF_TWO) { - // Vector of 2: - // [SrcHi, SrcLo] = [-CurrHi, CurrLo] - // [CurrHi, CurrLo] = neg [OpHi, OpLo](2 x Type) - // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type) - // [SrcHi, SrcLo] = [-(-OpHi), -OpLo] = [OpHi, -OpLo] - return SrcStatus::IS_LO_NEG; - } - if (NegType == TypeClass::SCALAR) { - // Scalar: - // [SrcHi, SrcLo] = [-CurrHi, CurrLo] - // [CurrHi, CurrLo] = neg [OpHi, OpLo](Type) - // [CurrHi, CurrLo] = [-OpHi, OpLo](Type) - // [SrcHi, SrcLo] = [-(-OpHi), OpLo] = [OpHi, OpLo] - return SrcStatus::IS_SAME; - } - break; - case SrcStatus::IS_LO_NEG: - if (NegType == TypeClass::VECTOR_OF_TWO) { 
- // Vector of 2: - // [SrcHi, SrcLo] = [CurrHi, -CurrLo] - // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type) - // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type) - // [SrcHi, SrcLo] = [-OpHi, -(-OpLo)] = [-OpHi, OpLo] - return SrcStatus::IS_HI_NEG; - } - if (NegType == TypeClass::SCALAR) { - // Scalar: - // [SrcHi, SrcLo] = [CurrHi, -CurrLo] - // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type) - // [CurrHi, CurrLo] = [-OpHi, OpLo](Type) - // [SrcHi, SrcLo] = [-OpHi, -OpLo] - return SrcStatus::IS_BOTH_NEG; - } - break; - case SrcStatus::IS_BOTH_NEG: - if (NegType == TypeClass::VECTOR_OF_TWO) { - // Vector of 2: - // [SrcHi, SrcLo] = [-CurrHi, -CurrLo] - // [CurrHi, CurrLo] = fneg [OpHi, OpLo](2 x Type) - // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type) - // [SrcHi, SrcLo] = [OpHi, OpLo] - return SrcStatus::IS_SAME; - } - if (NegType == TypeClass::SCALAR) { - // Scalar: - // [SrcHi, SrcLo] = [-CurrHi, -CurrLo] - // [CurrHi, CurrLo] = fneg [OpHi, OpLo](Type) - // [CurrHi, CurrLo] = [-OpHi, OpLo](Type) - // [SrcHi, SrcLo] = [OpHi, -OpLo] - return SrcStatus::IS_LO_NEG; - } - break; - case SrcStatus::IS_UPPER_HALF: - // Vector of 2: - // Src = CurrUpper - // Curr = [CurrUpper, CurrLower] - // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type) - // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type) - // Src = -OpUpper - // - // Scalar: - // Src = CurrUpper - // Curr = [CurrUpper, CurrLower] - // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type) - // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type) - // Src = -OpUpper - return SrcStatus::IS_UPPER_HALF_NEG; - case SrcStatus::IS_LOWER_HALF: - if (NegType == TypeClass::VECTOR_OF_TWO) { - // Vector of 2: - // Src = CurrLower - // Curr = [CurrUpper, CurrLower] - // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type) - // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type) - // Src = -OpLower - return SrcStatus::IS_LOWER_HALF_NEG; - } - if (NegType == TypeClass::SCALAR) { - // Scalar: - // Src = CurrLower - 
// Curr = [CurrUpper, CurrLower] - // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type) - // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type) - // Src = OpLower - return SrcStatus::IS_LOWER_HALF; - } - break; - case SrcStatus::IS_UPPER_HALF_NEG: - // Vector of 2: - // Src = -CurrUpper - // Curr = [CurrUpper, CurrLower] - // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type) - // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type) - // Src = -(-OpUpper) = OpUpper - // - // Scalar: - // Src = -CurrUpper - // Curr = [CurrUpper, CurrLower] - // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type) - // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type) - // Src = -(-OpUpper) = OpUpper - return SrcStatus::IS_UPPER_HALF; - case SrcStatus::IS_LOWER_HALF_NEG: - if (NegType == TypeClass::VECTOR_OF_TWO) { - // Vector of 2: - // Src = -CurrLower - // Curr = [CurrUpper, CurrLower] - // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](2 x Type) - // [CurrUpper, CurrLower] = [-OpUpper, -OpLower](2 x Type) - // Src = -(-OpLower) = OpLower - return SrcStatus::IS_LOWER_HALF; - } - if (NegType == TypeClass::SCALAR) { - // Scalar: - // Src = -CurrLower - // Curr = [CurrUpper, CurrLower] - // [CurrUpper, CurrLower] = fneg [OpUpper, OpLower](Type) - // [CurrUpper, CurrLower] = [-OpUpper, OpLower](Type) - // Src = -OpLower - return SrcStatus::IS_LOWER_HALF_NEG; - } - break; - default: - llvm_unreachable("unexpected SrcStatus"); - } -} - -static std::optional<std::pair<Register, SrcStatus>> -calcNextStatus(std::pair<Register, SrcStatus> Curr, - const MachineRegisterInfo &MRI) { - const MachineInstr *MI = MRI.getVRegDef(Curr.first); - - unsigned Opc = MI->getOpcode(); - - // Handle general Opc cases. 
- switch (Opc) { - case AMDGPU::G_BITCAST: - return std::optional<std::pair<Register, SrcStatus>>( - {MI->getOperand(1).getReg(), Curr.second}); - case AMDGPU::COPY: - if (MI->getOperand(1).getReg().isPhysical()) - return std::nullopt; - return std::optional<std::pair<Register, SrcStatus>>( - {MI->getOperand(1).getReg(), Curr.second}); - case AMDGPU::G_FNEG: { - SrcStatus Stat = getNegStatus(Curr.first, Curr.second, MRI); - if (Stat == SrcStatus::INVALID) - return std::nullopt; - return std::optional<std::pair<Register, SrcStatus>>( - {MI->getOperand(1).getReg(), Stat}); - } - default: - break; - } - - // Calc next Stat from current Stat. - switch (Curr.second) { - case SrcStatus::IS_SAME: - if (isTruncHalf(MI, MRI)) - return std::optional<std::pair<Register, SrcStatus>>( - {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF}); - else if (isUnmergeHalf(MI, MRI)) { - if (Curr.first == MI->getOperand(0).getReg()) - return std::optional<std::pair<Register, SrcStatus>>( - {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF}); - return std::optional<std::pair<Register, SrcStatus>>( - {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF}); - } - break; - case SrcStatus::IS_HI_NEG: - if (isTruncHalf(MI, MRI)) { - // [SrcHi, SrcLo] = [-CurrHi, CurrLo] - // [CurrHi, CurrLo] = trunc [OpUpper, OpLower] = OpLower - // = [OpLowerHi, OpLowerLo] - // Src = [SrcHi, SrcLo] = [-CurrHi, CurrLo] - // = [-OpLowerHi, OpLowerLo] - // = -OpLower - return std::optional<std::pair<Register, SrcStatus>>( - {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG}); - } - if (isUnmergeHalf(MI, MRI)) { - if (Curr.first == MI->getOperand(0).getReg()) - return std::optional<std::pair<Register, SrcStatus>>( - {MI->getOperand(2).getReg(), SrcStatus::IS_LOWER_HALF_NEG}); - return std::optional<std::pair<Register, SrcStatus>>( - {MI->getOperand(2).getReg(), SrcStatus::IS_UPPER_HALF_NEG}); - } - break; - case SrcStatus::IS_UPPER_HALF: - if (isShlHalf(MI, MRI)) - return 
std::optional<std::pair<Register, SrcStatus>>( - {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF}); - break; - case SrcStatus::IS_LOWER_HALF: - if (isLshrHalf(MI, MRI)) - return std::optional<std::pair<Register, SrcStatus>>( - {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF}); - break; - case SrcStatus::IS_UPPER_HALF_NEG: - if (isShlHalf(MI, MRI)) - return std::optional<std::pair<Register, SrcStatus>>( - {MI->getOperand(1).getReg(), SrcStatus::IS_LOWER_HALF_NEG}); - break; - case SrcStatus::IS_LOWER_HALF_NEG: - if (isLshrHalf(MI, MRI)) - return std::optional<std::pair<Register, SrcStatus>>( - {MI->getOperand(1).getReg(), SrcStatus::IS_UPPER_HALF_NEG}); - break; - default: - break; - } - return std::nullopt; -} - -/// This is used to control valid status that current MI supports. For example, -/// non floating point intrinsic such as @llvm.amdgcn.sdot2 does not support NEG -/// bit on VOP3P. -/// The class can be further extended to recognize support on SEL, NEG, ABS bit -/// for different MI on different arch -class SearchOptions { -private: - bool HasNeg = false; - // Assume all complex pattern of VOP3P have opsel. - bool HasOpsel = true; - -public: - SearchOptions(Register Reg, const MachineRegisterInfo &MRI) { - const MachineInstr *MI = MRI.getVRegDef(Reg); - unsigned Opc = MI->getOpcode(); - - if (Opc < TargetOpcode::GENERIC_OP_END) { - // Keep same for generic op. - HasNeg = true; - } else if (Opc == TargetOpcode::G_INTRINSIC) { - Intrinsic::ID IntrinsicID = cast<GIntrinsic>(*MI).getIntrinsicID(); - // Only float point intrinsic has neg & neg_hi bits. 
- if (IntrinsicID == Intrinsic::amdgcn_fdot2) - HasNeg = true; - } - } - bool checkOptions(SrcStatus Stat) const { - if (!HasNeg && - (Stat >= SrcStatus::NEG_START && Stat <= SrcStatus::NEG_END)) { - return false; - } - if (!HasOpsel && - (Stat >= SrcStatus::HALF_START && Stat <= SrcStatus::HALF_END)) { - return false; - } - return true; - } -}; - -static SmallVector<std::pair<Register, SrcStatus>> -getSrcStats(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO, - int MaxDepth = 3) { - int Depth = 0; - auto Curr = calcNextStatus({Reg, SrcStatus::IS_SAME}, MRI); - SmallVector<std::pair<Register, SrcStatus>> Statlist; - - while (Depth <= MaxDepth && Curr.has_value()) { - Depth++; - if (SO.checkOptions(Curr.value().second)) - Statlist.push_back(Curr.value()); - Curr = calcNextStatus(Curr.value(), MRI); - } - - return Statlist; -} - -static std::pair<Register, SrcStatus> -getLastSameOrNeg(Register Reg, const MachineRegisterInfo &MRI, SearchOptions SO, - int MaxDepth = 3) { - int Depth = 0; - std::pair<Register, SrcStatus> LastSameOrNeg = {Reg, SrcStatus::IS_SAME}; - auto Curr = calcNextStatus(LastSameOrNeg, MRI); - - while (Depth <= MaxDepth && Curr.has_value()) { - Depth++; - SrcStatus Stat = Curr.value().second; - if (SO.checkOptions(Stat)) { - if (Stat == SrcStatus::IS_SAME || Stat == SrcStatus::IS_HI_NEG || - Stat == SrcStatus::IS_LO_NEG || Stat == SrcStatus::IS_BOTH_NEG) - LastSameOrNeg = Curr.value(); - } - Curr = calcNextStatus(Curr.value(), MRI); - } - - return LastSameOrNeg; -} - -static bool isSameBitWidth(Register Reg1, Register Reg2, - const MachineRegisterInfo &MRI) { - unsigned Width1 = MRI.getType(Reg1).getSizeInBits(); - unsigned Width2 = MRI.getType(Reg2).getSizeInBits(); - return Width1 == Width2; -} - -static unsigned updateMods(SrcStatus HiStat, SrcStatus LoStat, unsigned Mods) { - // SrcStatus::IS_LOWER_HALF remain 0. 
- if (HiStat == SrcStatus::IS_UPPER_HALF_NEG) { - Mods ^= SISrcMods::NEG_HI; - Mods |= SISrcMods::OP_SEL_1; - } else if (HiStat == SrcStatus::IS_UPPER_HALF) - Mods |= SISrcMods::OP_SEL_1; - else if (HiStat == SrcStatus::IS_LOWER_HALF_NEG) - Mods ^= SISrcMods::NEG_HI; - else if (HiStat == SrcStatus::IS_HI_NEG) - Mods ^= SISrcMods::NEG_HI; - - if (LoStat == SrcStatus::IS_UPPER_HALF_NEG) { - Mods ^= SISrcMods::NEG; - Mods |= SISrcMods::OP_SEL_0; - } else if (LoStat == SrcStatus::IS_UPPER_HALF) - Mods |= SISrcMods::OP_SEL_0; - else if (LoStat == SrcStatus::IS_LOWER_HALF_NEG) - Mods |= SISrcMods::NEG; - else if (LoStat == SrcStatus::IS_HI_NEG) - Mods ^= SISrcMods::NEG; - - return Mods; -} - -static bool isValidToPack(SrcStatus HiStat, SrcStatus LoStat, Register NewReg, - Register RootReg, const SIInstrInfo &TII, - const MachineRegisterInfo &MRI) { - auto IsHalfState = [](SrcStatus S) { - return S == SrcStatus::IS_UPPER_HALF || S == SrcStatus::IS_UPPER_HALF_NEG || - S == SrcStatus::IS_LOWER_HALF || S == SrcStatus::IS_LOWER_HALF_NEG; - }; - return isSameBitWidth(NewReg, RootReg, MRI) && IsHalfState(LoStat) && - IsHalfState(HiStat); -} - -std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3PModsImpl( - Register RootReg, const MachineRegisterInfo &MRI, bool IsDOT) const { +std::pair<Register, unsigned> +AMDGPUInstructionSelector::selectVOP3PModsImpl( + Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const { unsigned Mods = 0; - // No modification if Root type is not form of <2 x Type>. - if (isVectorOfTwoOrScalar(RootReg, MRI) != TypeClass::VECTOR_OF_TWO) { - Mods |= SISrcMods::OP_SEL_1; - return {RootReg, Mods}; - } - - SearchOptions SO(RootReg, MRI); - - std::pair<Register, SrcStatus> Stat = getLastSameOrNeg(RootReg, MRI, SO); + MachineInstr *MI = MRI.getVRegDef(Src); - if (Stat.second == SrcStatus::IS_BOTH_NEG) + if (MI->getOpcode() == AMDGPU::G_FNEG && + // It's possible to see an f32 fneg here, but unlikely. 
+ // TODO: Treat f32 fneg as only high bit. + MRI.getType(Src) == LLT::fixed_vector(2, 16)) { Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); - else if (Stat.second == SrcStatus::IS_HI_NEG) - Mods ^= SISrcMods::NEG_HI; - else if (Stat.second == SrcStatus::IS_LO_NEG) - Mods ^= SISrcMods::NEG; - - MachineInstr *MI = MRI.getVRegDef(Stat.first); - - if (MI->getOpcode() != AMDGPU::G_BUILD_VECTOR || MI->getNumOperands() != 3 || - (IsDOT && Subtarget->hasDOTOpSelHazard())) { - Mods |= SISrcMods::OP_SEL_1; - return {Stat.first, Mods}; + Src = MI->getOperand(1).getReg(); + MI = MRI.getVRegDef(Src); } - SmallVector<std::pair<Register, SrcStatus>> StatlistHi = - getSrcStats(MI->getOperand(2).getReg(), MRI, SO); + // TODO: Handle G_FSUB 0 as fneg - if (StatlistHi.empty()) { - Mods |= SISrcMods::OP_SEL_1; - return {Stat.first, Mods}; - } + // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. + (void)IsDOT; // DOTs do not use OPSEL on gfx942+, check ST.hasDOTOpSelHazard() - SmallVector<std::pair<Register, SrcStatus>> StatlistLo = - getSrcStats(MI->getOperand(1).getReg(), MRI, SO); - - if (StatlistLo.empty()) { - Mods |= SISrcMods::OP_SEL_1; - return {Stat.first, Mods}; - } - - for (int I = StatlistHi.size() - 1; I >= 0; I--) { - for (int J = StatlistLo.size() - 1; J >= 0; J--) { - if (StatlistHi[I].first == StatlistLo[J].first && - isValidToPack(StatlistHi[I].second, StatlistLo[J].second, - StatlistHi[I].first, RootReg, TII, MRI)) - return {StatlistHi[I].first, - updateMods(StatlistHi[I].second, StatlistLo[J].second, Mods)}; - } - } // Packed instructions do not have abs modifiers. 
Mods |= SISrcMods::OP_SEL_1; - return {Stat.first, Mods}; -} - -int64_t getAllKindImm(const MachineOperand *Op) { - switch (Op->getType()) { - case MachineOperand::MachineOperandType::MO_Immediate: - return Op->getImm(); - case MachineOperand::MachineOperandType::MO_CImmediate: - return Op->getCImm()->getSExtValue(); - case MachineOperand::MachineOperandType::MO_FPImmediate: - return Op->getFPImm()->getValueAPF().bitcastToAPInt().getSExtValue(); - default: - llvm_unreachable("not an imm type"); - } -} - -static bool checkRB(Register Reg, unsigned int RBNo, - const AMDGPURegisterBankInfo &RBI, - const MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI) { - const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI); - return RB->getID() == RBNo; -} - -// This function is used to get the correct register bank for returned reg. -// Assume: -// 1. VOP3P is always legal for VGPR. -// 2. RootOp's regbank is legal. -// Thus -// 1. If RootOp is SGPR, then NewOp can be SGPR or VGPR. -// 2. If RootOp is VGPR, then NewOp must be VGPR. -static Register getLegalRegBank(Register NewReg, Register RootReg, - const AMDGPURegisterBankInfo &RBI, - MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI, - const SIInstrInfo &TII) { - // RootOp can only be VGPR or SGPR (some hand written cases such as. - // inst-select-ashr.v2s16.mir::ashr_v2s16_vs). - if (checkRB(RootReg, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) || - checkRB(NewReg, AMDGPU::VGPRRegBankID, RBI, MRI, TRI)) - return NewReg; - - MachineInstr *MI = MRI.getVRegDef(RootReg); - if (MI->getOpcode() == AMDGPU::COPY && NewReg == MI->getOperand(1).getReg()) { - // RootOp is VGPR, NewOp is not VGPR, but RootOp = COPY NewOp. - return RootReg; - } - - MachineBasicBlock *BB = MI->getParent(); - Register DstReg = MRI.cloneVirtualRegister(RootReg); - - MachineInstrBuilder MIB = - BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg) - .addReg(NewReg); - - // Only accept VGPR. 
- return MIB->getOperand(0).getReg(); + return std::pair(Src, Mods); } InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectVOP3PRetHelper(MachineOperand &Root, - bool IsDOT) const { - MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); - Register Reg; +AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { + MachineRegisterInfo &MRI + = Root.getParent()->getParent()->getParent()->getRegInfo(); + + Register Src; unsigned Mods; - std::tie(Reg, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, IsDOT); + std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI); - Reg = getLegalRegBank(Reg, Root.getReg(), RBI, MRI, TRI, TII); return {{ - [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); }, - [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods }}; } -InstructionSelector::ComplexRendererFns -AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const { - - return selectVOP3PRetHelper(Root); -} - InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { + MachineRegisterInfo &MRI + = Root.getParent()->getParent()->getParent()->getRegInfo(); - return selectVOP3PRetHelper(Root, true); + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true); + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; } InstructionSelector::ComplexRendererFns diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 8e9e573147a86..6c3f3026e877a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -188,10 +188,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector { 
ComplexRendererFns selectVOP3NoMods(MachineOperand &Root) const; std::pair<Register, unsigned> - selectVOP3PModsImpl(Register RootReg, const MachineRegisterInfo &MRI, + selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI, bool IsDOT = false) const; - InstructionSelector::ComplexRendererFns - selectVOP3PRetHelper(MachineOperand &Root, bool IsDOT = false) const; InstructionSelector::ComplexRendererFns selectVOP3PMods(MachineOperand &Root) const; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll index e03aa18d3147f..543f8e413abd8 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmul.v2f16.ll @@ -106,104 +106,6 @@ define <2 x half> @v_fmul_v2f16_fneg_lhs_fneg_rhs(<2 x half> %a, <2 x half> %b) ret <2 x half> %mul } -define <2 x half> @v_fmul_v2f16_partial_neg(<2 x half> %a, <2 x half> %b) { -; GFX9-LABEL: v_fmul_v2f16_partial_neg: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_mul_f16 v0, v1, v0 neg_hi:[1,0] -; GFX9-NEXT: v_pk_mul_f16 v0, v1, v0 neg_lo:[1,0] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: v_fmul_v2f16_partial_neg: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80008000, v1 -; GFX8-NEXT: v_mul_f16_e32 v3, v1, v0 -; GFX8-NEXT: v_mul_f16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_mul_f16_e32 v1, v2, v3 -; GFX8-NEXT: v_mul_f16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: v_fmul_v2f16_partial_neg: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_mul_f16 v0, v1, v0 neg_hi:[1,0] -; GFX10-NEXT: v_pk_mul_f16 v0, v1, v0 neg_lo:[1,0] -; GFX10-NEXT: 
s_setpc_b64 s[30:31] - %b1 = bitcast <2 x half> %b to float - %b2 = fneg float %b1 - %b3 = bitcast float %b2 to <2 x half> - %b4 = fneg <2 x half> %b3 - %mul1 = fmul <2 x half> %b3, %a - %mul2 = fmul <2 x half> %b4, %mul1 - ret <2 x half> %mul2 -} - -define <2 x half> @fmul_v2_half_neg_hi(<2 x half> %a, <2 x half> %b) #0 { -; GFX9-LABEL: fmul_v2_half_neg_hi: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 neg_hi:[0,1] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: fmul_v2_half_neg_hi: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 -; GFX8-NEXT: v_mul_f16_e32 v1, v0, v1 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: fmul_v2_half_neg_hi: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 neg_hi:[0,1] -; GFX10-NEXT: s_setpc_b64 s[30:31] - %b1 = bitcast <2 x half> %b to float - %b2 = fneg float %b1 - %b3 = bitcast float %b2 to <2 x half> - %b4 = extractelement <2 x half> %b3, i64 1 - %tmp = insertelement <2 x half> poison, half %b4, i64 0 - %k = shufflevector <2 x half> %tmp, <2 x half> %b, <2 x i32> <i32 2, i32 0> - %mul = fmul <2 x half> %a, %k - ret <2 x half> %mul -} - - -define <2 x half> @fmul_v2_half_neg_lo1(<2 x half> %a, <2 x half> %b) #0 { -; GFX9-LABEL: fmul_v2_half_neg_lo1: -; GFX9: ; %bb.0: -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_pk_mul_f16 v0, v0, v1 op_sel_hi:[1,0] -; GFX9-NEXT: s_setpc_b64 s[30:31] -; -; GFX8-LABEL: fmul_v2_half_neg_lo1: -; GFX8: ; %bb.0: -; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 -; GFX8-NEXT: v_mul_f16_e32 v1, v0, v1 -; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 -; GFX8-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-LABEL: fmul_v2_half_neg_lo1: -; GFX10: ; %bb.0: -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_pk_mul_f16 v0, v0, v1 op_sel_hi:[1,0] -; GFX10-NEXT: s_setpc_b64 s[30:31] - %b1 = bitcast <2 x half> %b to float - %b2 = fneg float %b1 - %b3 = bitcast float %b2 to <2 x half> - %b4 = extractelement <2 x half> %b3, i64 0 - %tmp = insertelement <2 x half> poison, half %b4, i64 0 - %k = shufflevector <2 x half> %tmp, <2 x half> %b, <2 x i32> <i32 2, i32 0> - %mul = fmul <2 x half> %a, %k - ret <2 x half> %mul -} - define <3 x half> @v_fmul_v3f16(<3 x half> %a, <3 x half> %b) { ; GFX9-LABEL: v_fmul_v3f16: ; GFX9: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll index 8f0ae8c47098a..744a5b7feb48d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll @@ -304,7 +304,8 @@ define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) { ; GFX906-LABEL: v_sdot2_shuffle10_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] +; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_shuffle10_a: @@ -318,7 +319,8 @@ define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) { ; GFX10-LABEL: v_sdot2_shuffle10_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] +; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %shuf.a = shufflevector <2 x i16> %a, <2 x i16> poison, <2 x i32> <i32 1, i32 0> %r = call 
i32 @llvm.amdgcn.sdot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false) @@ -329,7 +331,8 @@ define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) { ; GFX906-LABEL: v_sdot2_shuffle10_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] +; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_sdot2_shuffle10_b: @@ -343,7 +346,8 @@ define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) { ; GFX10-LABEL: v_sdot2_shuffle10_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] +; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %shuf.b = shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> <i32 1, i32 0> %r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll index 287a009ca1405..9e623494a5a04 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll @@ -289,19 +289,22 @@ define i32 @v_udot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) { ; GFX906-LABEL: v_udot2_shuffle10_a: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] +; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_shuffle10_a: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] +; 
GFX908-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_shuffle10_a: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1] +; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16 +; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %shuf.a = shufflevector <2 x i16> %a, <2 x i16> poison, <2 x i32> <i32 1, i32 0> %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false) @@ -312,19 +315,22 @@ define i32 @v_udot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) { ; GFX906-LABEL: v_udot2_shuffle10_b: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] +; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX906-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 ; GFX906-NEXT: s_setpc_b64 s[30:31] ; ; GFX908-LABEL: v_udot2_shuffle10_b: ; GFX908: ; %bb.0: ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] +; GFX908-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX908-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 ; GFX908-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_udot2_shuffle10_b: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1] +; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16 +; GFX10-NEXT: v_dot2_u32_u16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] %shuf.b = shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> <i32 1, i32 0> %r = call i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll index 0e1e5e4c4987c..bef38c1a65ef8 100644 --- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll +++ 
b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll @@ -1982,7 +1982,9 @@ define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspa ; PACKED-GISEL-NEXT: ds_read_b32 v5, v5 offset:8 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: v_pk_mul_f32 v[4:5], 1.0, v[4:5] op_sel_hi:[0,1] -; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1] +; PACKED-GISEL-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; PACKED-GISEL-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 +; PACKED-GISEL-NEXT: v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[4:5] ; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm @@ -2044,7 +2046,9 @@ define amdgpu_kernel void @shuffle_add_f32(ptr addrspace(1) %out, ptr addrspace( ; PACKED-GISEL-NEXT: ds_read_b64 v[2:3], v2 offset:8 ; PACKED-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0] +; PACKED-GISEL-NEXT: v_mov_b32_e32 v4, v3 +; PACKED-GISEL-NEXT: v_mov_b32_e32 v5, v2 +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5] ; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm @@ -2106,8 +2110,12 @@ define amdgpu_kernel void @shuffle_neg_add_f32(ptr addrspace(1) %out, ptr addrsp ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: ds_read_b64 v[2:3], v2 offset:8 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1] -; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0] +; PACKED-GISEL-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; PACKED-GISEL-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[2:3], 1.0, v[2:3] op_sel_hi:[0,1] +; PACKED-GISEL-NEXT: v_mov_b32_e32 v4, v3 +; PACKED-GISEL-NEXT: 
v_mov_b32_e32 v5, v2 +; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[4:5] ; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm @@ -2343,7 +2351,9 @@ define amdgpu_kernel void @fneg_v2f32_vec(ptr addrspace(1) %a) { ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; PACKED-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] ; PACKED-GISEL-NEXT: s_waitcnt vmcnt(0) -; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1] +; PACKED-GISEL-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 +; PACKED-GISEL-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, v[0:1] op_sel_hi:[0,1] ; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -2384,7 +2394,9 @@ define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x ; PACKED-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; PACKED-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; PACKED-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, s[2:3] op_sel_hi:[0,1] neg_lo:[0,1] neg_hi:[0,1] +; PACKED-GISEL-NEXT: s_xor_b32 s2, s2, 0x80000000 +; PACKED-GISEL-NEXT: s_xor_b32 s3, s3, 0x80000000 +; PACKED-GISEL-NEXT: v_pk_mul_f32 v[0:1], 1.0, s[2:3] op_sel_hi:[0,1] ; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; PACKED-GISEL-NEXT: s_endpgm %fneg = fsub <2 x float> <float -0.0, float -0.0>, %x diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll index 141b86a24c1c4..2f1dfa11fd34d 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll @@ -742,8 +742,9 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half> ; ; GFX9-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict: ; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s3 -; 
GFX9-GISEL-NEXT: v_pk_add_f16 v0, s2, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX9-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: v_pk_add_f16 v0, s2, v0 ; GFX9-GISEL-NEXT: ; return to shader part epilog ; ; GFX8-SDAG-LABEL: s_constained_fsub_v2f16_fpexcept_strict: @@ -783,7 +784,8 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half> ; ; GFX10-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict: ; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: v_pk_add_f16 v0, s2, s3 neg_lo:[0,1] neg_hi:[0,1] +; GFX10-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000 +; GFX10-GISEL-NEXT: v_pk_add_f16 v0, s2, s0 ; GFX10-GISEL-NEXT: ; return to shader part epilog ; ; GFX11-SDAG-TRUE16-LABEL: s_constained_fsub_v2f16_fpexcept_strict: @@ -808,7 +810,8 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half> ; ; GFX11-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict: ; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: v_pk_add_f16 v0, s2, s3 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000 +; GFX11-GISEL-NEXT: v_pk_add_f16 v0, s2, s0 ; GFX11-GISEL-NEXT: ; return to shader part epilog ; GFX10PLUS-SDAG-LABEL: s_constained_fsub_v2f16_fpexcept_strict: ; GFX10PLUS-SDAG: ; %bb.0: @@ -821,7 +824,8 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half> ; GFX10PLUS-SDAG-NEXT: ; return to shader part epilog ; GFX10PLUS-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict: ; GFX10PLUS-GISEL: ; %bb.0: -; GFX10PLUS-GISEL-NEXT: v_pk_add_f16 v0, s2, s3 neg_lo:[0,1] neg_hi:[0,1] +; GFX10PLUS-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000 +; GFX10PLUS-GISEL-NEXT: v_pk_add_f16 v0, s2, s0 ; GFX10PLUS-GISEL-NEXT: ; return to shader part epilog %val = call <2 x half> @llvm.experimental.constrained.fsub.v2f16(<2 x half> %x, <2 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <2 x half> %val _______________________________________________ 
llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits