Author: Luo, Yuanke Date: 2021-01-21T18:11:43+08:00 New Revision: 64132f541edd82bffebbd5521e620219743a42eb
URL: https://github.com/llvm/llvm-project/commit/64132f541edd82bffebbd5521e620219743a42eb DIFF: https://github.com/llvm/llvm-project/commit/64132f541edd82bffebbd5521e620219743a42eb.diff LOG: Revert "[X86][AMX] Fix tile config register spill issue." This reverts commit 20013d02f3352a88d0838eed349abc9a2b0e9cc0. Added: Modified: llvm/include/llvm/CodeGen/LiveIntervals.h llvm/lib/CodeGen/LiveIntervals.cpp llvm/lib/Target/X86/X86ExpandPseudo.cpp llvm/lib/Target/X86/X86FrameLowering.cpp llvm/lib/Target/X86/X86ISelDAGToDAG.cpp llvm/lib/Target/X86/X86InstrAMX.td llvm/lib/Target/X86/X86InstrInfo.cpp llvm/lib/Target/X86/X86PreTileConfig.cpp llvm/lib/Target/X86/X86RegisterInfo.td llvm/lib/Target/X86/X86TileConfig.cpp llvm/test/CodeGen/X86/AMX/amx-across-func.ll llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll llvm/test/CodeGen/X86/opt-pipeline.ll Removed: llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll ################################################################################ diff --git a/llvm/include/llvm/CodeGen/LiveIntervals.h b/llvm/include/llvm/CodeGen/LiveIntervals.h index 8c6f94052295..fa08166791b0 100644 --- a/llvm/include/llvm/CodeGen/LiveIntervals.h +++ b/llvm/include/llvm/CodeGen/LiveIntervals.h @@ -377,13 +377,6 @@ class VirtRegMap; bool checkRegMaskInterference(LiveInterval &LI, BitVector &UsableRegs); - /// Get the interferenced slot index and its regmask for an live interval. - /// Return false if ther is no interference. - bool - getInterferenceRegMasks(LiveInterval &LI, - SmallVectorImpl<SlotIndex> &RegSlots, - SmallVectorImpl<const uint32_t *> &RegMaskBits); - // Register unit functions. // // Fixed interference occurs when MachineInstrs use physregs directly diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp index 17005b38ac94..a32b486240c8 100644 --- a/llvm/lib/CodeGen/LiveIntervals.cpp +++ b/llvm/lib/CodeGen/LiveIntervals.cpp @@ -952,56 +952,6 @@ bool LiveIntervals::checkRegMaskInterference(LiveInterval &LI, } } -bool LiveIntervals::getInterferenceRegMasks( - LiveInterval &LI, SmallVectorImpl<SlotIndex> &RegSlots, - SmallVectorImpl<const uint32_t *> &RegBits) { - if (LI.empty()) - return false; - LiveInterval::iterator LiveI = LI.begin(), LiveE = LI.end(); - - // Use a smaller arrays for local live ranges. - ArrayRef<SlotIndex> Slots; - ArrayRef<const uint32_t *> Bits; - if (MachineBasicBlock *MBB = intervalIsInOneMBB(LI)) { - Slots = getRegMaskSlotsInBlock(MBB->getNumber()); - Bits = getRegMaskBitsInBlock(MBB->getNumber()); - } else { - Slots = getRegMaskSlots(); - Bits = getRegMaskBits(); - } - - // We are going to enumerate all the register mask slots contained in LI. - // Start with a binary search of RegMaskSlots to find a starting point. - ArrayRef<SlotIndex>::iterator SlotI = llvm::lower_bound(Slots, LiveI->start); - ArrayRef<SlotIndex>::iterator SlotE = Slots.end(); - - // No slots in range, LI begins after the last call. - if (SlotI == SlotE) - return false; - - bool Found = false; - while (true) { - assert(*SlotI >= LiveI->start); - // Loop over all slots overlapping this segment. - while (*SlotI < LiveI->end) { - // *SlotI overlaps LI. Collect mask bits. - Found = true; - RegSlots.push_back(*SlotI); - RegBits.push_back(Bits[SlotI - Slots.begin()]); - if (++SlotI == SlotE) - return Found; - } - // *SlotI is beyond the current LI segment. - LiveI = LI.advanceTo(LiveI, *SlotI); - if (LiveI == LiveE) - return Found; - // Advance SlotI until it overlaps. - while (*SlotI < LiveI->start) - if (++SlotI == SlotE) - return Found; - } -} - //===----------------------------------------------------------------------===// // IntervalUpdate class. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index 395f437bb648..15af0fb2e888 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -461,13 +461,25 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case TargetOpcode::ICALL_BRANCH_FUNNEL: ExpandICallBranchFunnel(&MBB, MBBI); return true; + case X86::PLDTILECFG: { + MI.RemoveOperand(0); + MI.setDesc(TII->get(X86::LDTILECFG)); + return true; + } + case X86::PSTTILECFG: { + MI.RemoveOperand(MI.getNumOperands() - 1); // Remove $tmmcfg + MI.setDesc(TII->get(X86::STTILECFG)); + return true; + } case X86::PTILELOADDV: { + MI.RemoveOperand(8); // Remove $tmmcfg for (unsigned i = 2; i > 0; --i) MI.RemoveOperand(i); MI.setDesc(TII->get(X86::TILELOADD)); return true; } case X86::PTDPBSSDV: { + MI.RemoveOperand(7); // Remove $tmmcfg MI.untieRegOperand(4); for (unsigned i = 3; i > 0; --i) MI.RemoveOperand(i); @@ -476,13 +488,14 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, return true; } case X86::PTILESTOREDV: { + MI.RemoveOperand(8); // Remove $tmmcfg for (int i = 1; i >= 0; --i) MI.RemoveOperand(i); MI.setDesc(TII->get(X86::TILESTORED)); return true; } case X86::PTILEZEROV: { - for (int i = 2; i > 0; --i) // Remove row, col + for (int i = 3; i > 0; --i) // Remove row, col, $tmmcfg MI.RemoveOperand(i); MI.setDesc(TII->get(X86::TILEZERO)); return true; diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index fcddfb93b7a3..8339f512158d 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -2094,14 +2094,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // Emit tilerelease for AMX kernel. const MachineRegisterInfo &MRI = MF.getRegInfo(); - const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID); - unsigned TileRegNum = RC->getNumRegs(); - for (unsigned I = 0; I < TileRegNum; I++) { - if (!MRI.reg_nodbg_empty(X86::TMM0 + I)) { - BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE)); - break; - } - } + if (!MRI.reg_nodbg_empty(X86::TMMCFG)) + BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE)); } StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 302a15701d81..a96f73df855d 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -4606,6 +4606,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { SDValue Index = Node->getOperand(5); SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); SDValue Segment = CurDAG->getRegister(0, MVT::i16); + SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); SDValue Chain = Node->getOperand(0); MachineSDNode *CNode; SDValue Ops[] = {Node->getOperand(2), @@ -4615,6 +4616,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { Index, Disp, Segment, + CFG, Chain}; CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); ReplaceNode(Node, CNode); @@ -4625,12 +4627,14 @@ void X86DAGToDAGISel::Select(SDNode *Node) { break; SDValue Chain = Node->getOperand(0); unsigned Opc = X86::PTDPBSSDV; + SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Node->getOperand(4), Node->getOperand(5), Node->getOperand(6), Node->getOperand(7), + CFG, Chain}; MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); @@ -4642,7 +4646,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) { break; unsigned Opc = X86::PTILEZEROV; SDValue Chain = Node->getOperand(0); - SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Chain}; + SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); + SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), CFG, Chain}; MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); ReplaceNode(Node, CNode); @@ -4713,6 +4718,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { SDValue Index = Node->getOperand(5); SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); SDValue Segment = CurDAG->getRegister(0, MVT::i16); + SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); SDValue Chain = Node->getOperand(0); MachineSDNode *CNode; SDValue Ops[] = {Node->getOperand(2), @@ -4723,6 +4729,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { Disp, Segment, Node->getOperand(6), + CFG, Chain}; CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); ReplaceNode(Node, CNode); diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index 209ebd4b3de3..e4f3290cab9f 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -48,14 +48,23 @@ let Predicates = [HasAMXTILE, In64BitMode] in { VEX, T8XD; // Pseduo instruction for RA. + let hasSideEffects = 1, mayLoad = 1, + Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in + def PLDTILECFG : PseudoI <(outs TILECFG:$cfg), (ins opaquemem:$src), []>; + + let hasSideEffects = 1, mayStore = 1 in + def PSTTILECFG : PseudoI<(outs), (ins opaquemem:$dst, TILECFG:$cfg), []>; + def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, GR16:$src2, - opaquemem:$src3), []>; + opaquemem:$src3, + TILECFG:$cfg), []>; def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1, GR16:$src2, opaquemem:$src3, - TILE:$src4), []>; + TILE:$src4, TILECFG:$cfg), []>; def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, - GR16:$src2), []>; + GR16:$src2, + TILECFG:$cfg), []>; let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. @@ -95,7 +104,7 @@ let Predicates = [HasAMXINT8, In64BitMode] in { let Constraints = "$src4 = $dst" in def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6), []>; + TILE:$src5, TILE:$src6, TILECFG:$cfg), []>; let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index fe434bd80f35..d9bab14f0c08 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -3808,6 +3808,10 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineOperand &MO = NewMI->getOperand(2); MO.setReg(VirtReg); MO.setIsKill(true); + } else if (RC->getID() == X86::TILECFGRegClassID) { + unsigned Opc = X86::PSTTILECFG; + addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) + .addReg(SrcReg, getKillRegState(isKill)); } else { unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); bool isAligned = @@ -3836,6 +3840,10 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineOperand &MO = NewMI->getOperand(3); MO.setReg(VirtReg); MO.setIsKill(true); + } else if (RC->getID() == X86::TILECFGRegClassID) { + unsigned Opc = X86::PLDTILECFG; + addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), + FrameIdx); } else { const MachineFunction &MF = *MBB.getParent(); unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16); diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp index a61f9c5cc752..05ee6c6c8384 100644 --- a/llvm/lib/Target/X86/X86PreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -38,7 +38,6 @@ #include "X86InstrBuilder.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" -#include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -63,13 +62,8 @@ class X86PreTileConfig : public MachineFunctionPass { const TargetInstrInfo *TII; MachineDominatorTree *DomTree = nullptr; MachineRegisterInfo *MRI = nullptr; - LiveIntervals *LIS = nullptr; - SmallVector<Register, 16> VTileRegs; - MachineInstr *TileConfigMI = nullptr; - void buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx); MachineInstr *getTileConfigPoint(); - void reloadTileConfig(int FI); public: X86PreTileConfig() : MachineFunctionPass(ID) {} @@ -94,21 +88,20 @@ char X86PreTileConfig::ID = 0; INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig", "Tile Register Configure", false, false) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig", "Tile Register Configure", false, false) void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - AU.addRequired<LiveIntervals>(); - AU.addPreserved<LiveIntervals>(); AU.addRequired<MachineDominatorTree>(); MachineFunctionPass::getAnalysisUsage(AU); } -void X86PreTileConfig::buildConfigMI(MachineBasicBlock::iterator MI, - int FrameIdx) { +static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx, + const TargetInstrInfo *TII, + MachineRegisterInfo *MRI, + const X86Subtarget *ST) { auto *MBB = MI->getParent(); // FIXME: AMX should assume AVX512 enabled. @@ -118,15 +111,18 @@ void X86PreTileConfig::buildConfigMI(MachineBasicBlock::iterator MI, BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORDZrr), Zmm) .addReg(Zmm, RegState::Undef) .addReg(Zmm, RegState::Undef); - TileConfigMI = &*addFrameReference(BuildMI(*MBB, MI, DebugLoc(), - TII->get(X86::VMOVUPSZmr)), - FrameIdx) - .addReg(Zmm); + addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSZmr)), + FrameIdx) + .addReg(Zmm); } // build psuedo ldtilecfg - addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::LDTILECFG)), - FrameIdx); + Register VReg = MRI->createVirtualRegister(&X86::TILECFGRegClass); + + addFrameReference( + BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PLDTILECFG), VReg), FrameIdx); + + return VReg; } static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) { @@ -155,7 +151,6 @@ MachineInstr *X86PreTileConfig::getTileConfigPoint() { const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); if (RC.getID() != X86::TILERegClassID) continue; - VTileRegs.push_back(VirtReg); // Find the common dominator for all MI that define tile register. for (const MachineOperand &MO : MRI->def_operands(VirtReg)) { @@ -224,138 +219,23 @@ MachineInstr *X86PreTileConfig::getTileConfigPoint() { return &*MII; } -void X86PreTileConfig::reloadTileConfig(int FI) { - SmallSet<MachineInstr *, 8> MIVisited; - const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID); - auto TileRegNum = RC->getNumRegs(); - - for (Register VReg : VTileRegs) { - BitVector UsableRegs(TRI->getNumRegs()); - for (unsigned I = 0; I < TileRegNum; I++) - UsableRegs.set(X86::TMM0 + I); - SmallVector<SlotIndex, 8> RegSlots; - SmallVector<const uint32_t *, 8> RegMasks; - LiveInterval &LI = LIS->getInterval(VReg); - if (!LIS->getInterferenceRegMasks(LI, RegSlots, RegMasks)) - continue; - for (unsigned I = 0; I < RegSlots.size(); I++) { - SlotIndex &SI = RegSlots[I]; - MachineInstr *MI = LIS->getInstructionFromIndex(SI); - // We have reload the tile config register before. - if (MIVisited.count(MI)) - continue; - // For inline assembly, we don't reload tile config register. - // If there is any ldtilecfg instruction in inline assembly, - // it is user's reponsibility to restore everything. - if (!MI->isCall()) - continue; - UsableRegs.clearBitsInMask(RegMasks[I]); - MIVisited.insert(MI); - // There is no interference in callee. This is benifited from - // IPRA. - if (UsableRegs.none()) - continue; - - // build psuedo ldtilecfg - auto *MBB = MI->getParent(); - auto MII = MachineBasicBlock::iterator(MI); - MII++; - addFrameReference( - BuildMI(*MBB, *MII, DebugLoc(), TII->get(X86::LDTILECFG)), FI); - } - } - // We just check tile data register interference, we also need check tile - // config register interference. Since we don't model the config register - // we should check interference from the ldtilecfg to each tile data register - // def. - // ldtilecfg - // / \ - // BB1 BB2 - // / \ - // call BB3 - // / \ - // %1=tileload %2=tilezero - // We can start from the instruction of each tile def, and backward to - // ldtilecfg. If there is any call instruction, and tile data register is - // not preserved, we should insert ldtilecfg after the call instruction. - SmallSet<MachineBasicBlock *, 8> MBBVisited; - for (Register VReg : VTileRegs) { - for (MachineOperand &MO : MRI->def_operands(VReg)) { - if (MO.isUndef()) - continue; - MachineInstr *MI = MO.getParent(); - // May be PHI instructiion. - // There must be several def tile before PHI instruction. - if (MI->isTransient()) - continue; - - bool Terminate = false; - MachineBasicBlock *MBB = MI->getParent(); - // backward to see if there is any call instruction after ldtilecfg. - std::queue<MachineBasicBlock *> WorkList; - WorkList.push(MBB); - bool First = true; - while (!WorkList.empty()) { - MBB = WorkList.front(); - WorkList.pop(); - // If we have iterate the basic block before, don't iterate it and - // its predecessor again. This may be caused by loop, or it has a - // cross path from several successor, or it has been iterated when - // handle other tile register. In below example, BB1 hit the condition. - // ldtilecfg - // | - // ---BB1--- - // / \ - // BB2 BB3 - // / \ - // %1=tileload %2=tilezero - if (MBBVisited.count(MBB)) - continue; - // For the first MBB, we start from the amx instruction which def - // tile register. - auto I = (First) ? MI->getReverseIterator() : MBB->instr_rbegin(); - for (auto E = MBB->instr_rend(); I != E; ++I) { - // If it is inserted point for ldtilecfg, then we've finished - // backward. - if (&*I == TileConfigMI) { - Terminate = true; - break; - } - if (MIVisited.count(&*I)) - continue; - if (!I->isCall()) - continue; - BitVector UsableRegs(TRI->getNumRegs()); - for (unsigned I = 0; I < TileRegNum; I++) - UsableRegs.set(X86::TMM0 + I); - for (MachineOperand &CallMO : I->operands()) { - if (CallMO.isRegMask()) - UsableRegs.clearBitsInMask(CallMO.getRegMask()); - } - // Record the call to avoid double ldtilecfg insert. - MIVisited.insert(&*I); - if (UsableRegs.none()) - continue; - // Insert ldtilecfg after call instruction. - --I; - addFrameReference( - BuildMI(*MBB, *I, DebugLoc(), TII->get(X86::LDTILECFG)), FI); - } - // We encounter visited MachineInst, so we don't need to do backward - // again. - if (Terminate) - break; - // Next we will iterate its predecessor. - for (MachineBasicBlock::pred_iterator S = MBB->pred_begin(), - E = MBB->pred_end(); - S != E; S++) - WorkList.push(*S); +static void addTileCFGUse(MachineFunction &MF, Register CFG) { + for (MachineBasicBlock &MBB : MF) { - // The first the MBB may be visited for the second time when it is in - // a loop. - if (!First) - MBBVisited.insert(MBB); - First = false; + // Traverse the basic block. + for (MachineInstr &MI : MBB) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + default: + break; + case X86::PTILELOADDV: + case X86::PTILESTOREDV: + case X86::PTDPBSSDV: + case X86::PTILEZEROV: + unsigned NumOperands = MI.getNumOperands(); + MI.RemoveOperand(NumOperands - 1); + MI.addOperand(MF, MachineOperand::CreateReg(CFG, false)); + break; } } } @@ -368,17 +248,15 @@ bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) { TRI = ST->getRegisterInfo(); TII = mf.getSubtarget().getInstrInfo(); DomTree = &getAnalysis<MachineDominatorTree>(); - LIS = &getAnalysis<LiveIntervals>(); - auto *TileConfigPoint = getTileConfigPoint(); - if (!TileConfigPoint) + MachineInstr *MI = getTileConfigPoint(); + if (!MI) return false; unsigned Size = ST->getTileConfigSize(); Align Alignment = ST->getTileConfigAlignment(); int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false); - buildConfigMI(TileConfigPoint, SS); - reloadTileConfig(SS); - VTileRegs.clear(); + Register CFG = buildConfigMI(MI, SS, TII, MRI, ST); + addTileCFGUse(mf, CFG); return true; } diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td index c8723c8268f2..75cbd4e1cff1 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -639,3 +639,8 @@ def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>; let CopyCost = -1 in // Don't allow copying of tile registers def TILE : RegisterClass<"X86", [x86amx], 8192, (sequence "TMM%u", 0, 7)> {let Size = 8192;} +def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> { + let CopyCost = -1; // Don't allow copying of tile config registers. + let isAllocatable = 1; + let Size = 512; +} diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp index d6c1dcaf0588..ef010bcd38b7 100644 --- a/llvm/lib/Target/X86/X86TileConfig.cpp +++ b/llvm/lib/Target/X86/X86TileConfig.cpp @@ -22,7 +22,6 @@ #include "X86MachineFunctionInfo.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" -#include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -131,14 +130,13 @@ static MachineInstr *storeImmToStackSlot(MachineBasicBlock &MBB, } MachineInstr *X86TileConfig::getTileConfigPoint() { - MachineBasicBlock *Entry = &*MF->begin(); - ReversePostOrderTraversal<MachineBasicBlock *> RPOT(Entry); - for (MachineBasicBlock *MBB : RPOT) { - for (MachineInstr &MI : *MBB) + for (MachineBasicBlock &MBB : *MF) { + + // Traverse the basic block. + for (MachineInstr &MI : MBB) // Refer X86PreTileConfig.cpp. - // We only support one tile config for now. The other ldtilecfg - // is for spill purpose and is dominated by the first ldtilecfg. - if (MI.getOpcode() == X86::LDTILECFG) + // We only support one tile config for now. + if (MI.getOpcode() == X86::PLDTILECFG) return &MI; } @@ -150,7 +148,7 @@ void X86TileConfig::tileConfig() { if (!MI) return; MachineBasicBlock *MBB = MI->getParent(); - int SS = MI->getOperand(0).getIndex(); + int SS = MI->getOperand(1).getIndex(); BitVector PhysRegs(TRI->getNumRegs()); // Fill in the palette first. diff --git a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll index 87973fd9c315..a68a81b8d732 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll @@ -1,34 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs -enable-ipra | FileCheck -check-prefix=IPRA %s -@buf = dso_local global [3072 x i8] zeroinitializer, align 64 +%struct.__tile_str = type <{ i16, i16, [60 x i8], <256 x i32> }> -define internal void @foo() #0 { -; CHECK-LABEL: foo: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movq %rsp, %rbp -; CHECK-NEXT: .cfi_def_cfa_register %rbp -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -; CHECK-NEXT: retq -; -; IPRA-LABEL: foo: -; IPRA: # %bb.0: # %entry -; IPRA-NEXT: pushq %rbp -; IPRA-NEXT: .cfi_def_cfa_offset 16 -; IPRA-NEXT: .cfi_offset %rbp, -16 -; IPRA-NEXT: movq %rsp, %rbp -; IPRA-NEXT: .cfi_def_cfa_register %rbp -; IPRA-NEXT: popq %rbp -; IPRA-NEXT: .cfi_def_cfa %rsp, 8 -; IPRA-NEXT: retq -entry: - ret void -} +@buf = dso_local global [3072 x i8] zeroinitializer, align 64 define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind { ; CHECK-LABEL: test_api: @@ -50,6 +25,7 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind { ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill ; CHECK-NEXT: movl $buf, %eax ; CHECK-NEXT: movl $32, %r14d ; CHECK-NEXT: movw $8, %r15w @@ -60,10 +36,11 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind { ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2 ; CHECK-NEXT: movabsq $64, %rax ; CHECK-NEXT: tilestored %tmm2, 1024(%rsp,%rax) # 1024-byte Folded Spill +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $buf+2048, %eax +; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0 ; CHECK-NEXT: movabsq $64, %rcx ; CHECK-NEXT: tileloadd 2048(%rsp,%rcx), %tmm1 # 1024-byte Folded Reload @@ -78,48 +55,17 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind { ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: tilerelease ; CHECK-NEXT: retq -; -; IPRA-LABEL: test_api: -; IPRA: # %bb.0: -; IPRA-NEXT: pushq %rbp -; IPRA-NEXT: subq $64, %rsp -; IPRA-NEXT: vpxord %zmm0, %zmm0, %zmm0 -; IPRA-NEXT: vmovdqu64 %zmm0, (%rsp) -; IPRA-NEXT: movb $1, (%rsp) -; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp) -; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp) -; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp) -; IPRA-NEXT: movb %dil, {{[0-9]+}}(%rsp) -; IPRA-NEXT: movw %si, {{[0-9]+}}(%rsp) -; IPRA-NEXT: ldtilecfg (%rsp) -; IPRA-NEXT: movl $buf, %eax -; IPRA-NEXT: movl $32, %ecx -; IPRA-NEXT: movw $8, %dx -; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm0 -; IPRA-NEXT: movl $buf+1024, %eax -; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm1 -; IPRA-NEXT: callq foo -; IPRA-NEXT: movl $buf+2048, %eax -; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm2 -; IPRA-NEXT: tdpbssd %tmm1, %tmm0, %tmm2 -; IPRA-NEXT: tilestored %tmm2, (%rax,%rcx) -; IPRA-NEXT: addq $64, %rsp -; IPRA-NEXT: popq %rbp -; IPRA-NEXT: tilerelease -; IPRA-NEXT: vzeroupper -; IPRA-NEXT: retq %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32) %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32) - call void @foo() + tail call void (...) @foo() %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32) %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4) tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6) ret void } +declare dso_local void @foo(...) + declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) - -attributes #0 = { noinline nounwind optnone uwtable "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll index f38554b9f79d..a415d9c15242 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll @@ -5,6 +5,7 @@ define void @test_amx() { ; CHECK-LABEL: test_amx: ; CHECK: # %bb.0: ; CHECK-NEXT: tdpbf16ps %tmm7, %tmm4, %tmm3 +; CHECK-NEXT: retq call void @llvm.x86.tdpbf16ps(i8 3, i8 4, i8 7) ret void } diff --git a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll deleted file mode 100644 index b381429c9374..000000000000 --- a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll +++ /dev/null @@ -1,131 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s -@buf = dso_local global [3072 x i8] zeroinitializer, align 16 - -define dso_local void @test1(i16 signext %0, i16 signext %1) local_unnamed_addr { -; CHECK-LABEL: test1: -; CHECK: # %bb.0: -; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 -; CHECK-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %dil, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %si, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movl $buf, %eax -; CHECK-NEXT: movl $32, %ecx -; CHECK-NEXT: movw $8, %dx -; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm0 -; CHECK-NEXT: movl $buf+1024, %eax -; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm1 -; CHECK-NEXT: movl $buf+2048, %eax -; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm2 -; CHECK-NEXT: tdpbssd %tmm1, %tmm0, %tmm2 -; CHECK-NEXT: tilestored %tmm2, (%rax,%rcx) -; CHECK-NEXT: tilerelease -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: jmp foo # TAILCALL - %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32) - %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32) - %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32) - %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4) - tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6) - tail call void @foo() - ret void -} - -define dso_local void @test2(i16 signext %0, i16 signext %1) local_unnamed_addr { -; CHECK-LABEL: test2: -; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: subq $72, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 96 -; CHECK-NEXT: .cfi_offset %rbx, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movl %esi, %ebx -; CHECK-NEXT: movl %edi, %ebp -; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 -; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb %bpl, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB1_3 -; CHECK-NEXT: # %bb.1: # %if.true -; CHECK-NEXT: movw $8, %ax -; CHECK-NEXT: tilezero %tmm0 -; CHECK-NEXT: movl $32, %ecx -; CHECK-NEXT: movl $buf+1024, %edx -; CHECK-NEXT: tileloadd (%rdx,%rcx), %tmm1 -; CHECK-NEXT: movl $buf+2048, %edx -; CHECK-NEXT: tileloadd (%rdx,%rcx), %tmm2 -; CHECK-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 -; CHECK-NEXT: tilestored %tmm0, (%rdx,%rcx) -; CHECK-NEXT: jmp .LBB1_2 -; CHECK-NEXT: .LBB1_3: # %if.false -; CHECK-NEXT: movl $buf, %eax -; CHECK-NEXT: movl $32, %ecx -; CHECK-NEXT: movw $8, %dx -; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm3 -; CHECK-NEXT: movl $buf+1024, %eax -; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm4 -; CHECK-NEXT: movl $buf+2048, %eax -; CHECK-NEXT: tileloadd (%rax,%rcx), %tmm2 -; CHECK-NEXT: tdpbssd %tmm2, %tmm4, %tmm3 -; CHECK-NEXT: tilestored %tmm3, (%rax,%rcx) -; CHECK-NEXT: .LBB1_2: # %if.true -; CHECK-NEXT: addq $72, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: tilerelease -; CHECK-NEXT: retq - call void @foo() - br i1 undef, label %if.true, label %if.false - -if.true: - %t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 8) - %t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32) - %t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32) - %t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3) - tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t4) - br label %exit - -if.false: - %t5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32) - %t6 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32) - %t7 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32) - %t8 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t5, x86_amx %t6, x86_amx %t7) - tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t8) - br label %exit - -exit: - ret void -} - -declare dso_local void @foo() local_unnamed_addr -declare x86_amx @llvm.x86.tilezero.internal(i16, i16) -declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) -declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) -declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll index 57b67c456b36..0dc0c34c340c 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll @@ -36,10 +36,11 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind { ; CHECK-NEXT: tileloadd (%r15,%r14), %tmm5 ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_2 -; CHECK-NEXT: # %bb.1: # %if.true +; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill ; CHECK-NEXT: movl $buf, %eax ; CHECK-NEXT: movw $8, %cx +; CHECK-NEXT: jne .LBB0_2 +; CHECK-NEXT: # %bb.1: # %if.true ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0 ; CHECK-NEXT: movl $buf+1024, %eax ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm1 @@ -51,13 +52,11 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind { ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload ; CHECK-NEXT: movabsq $64, %rax ; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload ; CHECK-NEXT: jmp .LBB0_3 ; CHECK-NEXT: .LBB0_2: # %if.false -; CHECK-NEXT: movl $buf, %eax -; CHECK-NEXT: movw $8, %cx ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2 ; CHECK-NEXT: movl $buf+1024, %eax ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm3 @@ -69,7 +68,7 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind { ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload ; CHECK-NEXT: movabsq $64, %rax ; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload ; CHECK-NEXT: tilestored %tmm6, (%r15,%r14) @@ -140,6 +139,7 @@ define dso_local void @test3(i8 *%buf) nounwind { ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: movl $32, %r14d ; CHECK-NEXT: xorl %ebp, %ebp +; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB1_2: # %loop.header ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -149,7 +149,7 @@ define dso_local void @test3(i8 *%buf) nounwind { ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload ; CHECK-NEXT: tilezero %tmm0 ; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm1 ; CHECK-NEXT: tileloadd (%rbx,%r14), %tmm2 diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll index 1e1154b5f759..b851eea60b0a 100644 --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -120,8 +120,6 @@ ; CHECK-NEXT: X86 EFLAGS copy lowering ; CHECK-NEXT: X86 WinAlloca Expander ; CHECK-NEXT: MachineDominator Tree Construction -; CHECK-NEXT: Slot index numbering -; CHECK-NEXT: Live Interval Analysis ; CHECK-NEXT: Tile Register Pre-configure ; CHECK-NEXT: Detect Dead Lanes ; CHECK-NEXT: Process Implicit Definitions _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits