https://github.com/MacDue updated https://github.com/llvm/llvm-project/pull/149510
>From d07322bddea4f6286eef5cd29e8e06b0939f8b2e Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <[email protected]>
Date: Tue, 15 Jul 2025 17:00:04 +0000
Subject: [PATCH] [AArch64][SME] Propagate desired ZA states in the
 MachineSMEABIPass

This patch adds a propagation step to the MachineSMEABIPass that
propagates desired ZA states forwards/backwards (from predecessors to
successors, or vice versa).

The aim of this is to pick better ZA states for edge bundles: when many
(or all) blocks in a bundle have no preferred ZA state, the state
assigned to the bundle can be less than ideal.

An important case is nested loops, where only the inner loop has a
preferred ZA state. Here we'd like to propagate the ZA state up from the
inner loop to the outer loops (to avoid saves/restores in any loop).

Change-Id: I39f9c7d7608e2fa070be2fb88351b4d1d0079041
---
 llvm/lib/Target/AArch64/MachineSMEABIPass.cpp | 101 +++++++++++++---
 llvm/test/CodeGen/AArch64/sme-agnostic-za.ll  |   9 +-
 .../CodeGen/AArch64/sme-za-control-flow.ll    |  85 +++++---------
 .../test/CodeGen/AArch64/sme-za-exceptions.ll |  36 +++---
 .../AArch64/sme-za-lazy-save-buffer.ll        | 110 ++++++------------
 5 files changed, 163 insertions(+), 178 deletions(-)

diff --git a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
index a96edf523ab1b..9aa43eea3d977 100644
--- a/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/MachineSMEABIPass.cpp
@@ -213,6 +213,11 @@ struct MachineSMEABI : public MachineFunctionPass {
   /// E.g., ACTIVE -> LOCAL_SAVED will insert code required to save ZA.
   void insertStateChanges();
 
+  /// Propagates desired states forwards (from predecessors -> successors) if
+  /// \p Forwards, otherwise propagates backwards (from successors ->
+  /// predecessors).
+  void propagateDesiredStates(bool Forwards = true);
+
   // Emission routines for private and shared ZA functions (using lazy saves).
   void emitNewZAPrologue(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MBBI);
@@ -287,8 +292,10 @@ struct MachineSMEABI : public MachineFunctionPass {
   /// Contains the needed ZA state for each instruction in a block.
   /// Instructions that do not require a ZA state are not recorded.
   struct BlockInfo {
-    ZAState FixedEntryState{ZAState::ANY};
     SmallVector<InstInfo> Insts;
+    ZAState FixedEntryState{ZAState::ANY};
+    ZAState DesiredIncomingState{ZAState::ANY};
+    ZAState DesiredOutgoingState{ZAState::ANY};
     LiveRegs PhysLiveRegsAtEntry = LiveRegs::None;
     LiveRegs PhysLiveRegsAtExit = LiveRegs::None;
   };
@@ -381,28 +388,80 @@ void MachineSMEABI::collectNeededZAStates(SMEAttrs SMEFnAttrs) {
 
     // Reverse vector (as we had to iterate backwards for liveness).
     std::reverse(Block.Insts.begin(), Block.Insts.end());
+
+    // Record the desired states on entry/exit of this block. These are the
+    // states that would not incur a state transition.
+    if (!Block.Insts.empty()) {
+      Block.DesiredIncomingState = Block.Insts.front().NeededState;
+      Block.DesiredOutgoingState = Block.Insts.back().NeededState;
+    }
+  }
+}
+
+void MachineSMEABI::propagateDesiredStates(bool Forwards) {
+  // If `Forwards`, this propagates desired states from predecessors to
+  // successors; otherwise, it propagates states from successors to
+  // predecessors.
+  auto GetBlockState = [](BlockInfo &Block, bool Incoming) -> ZAState & {
+    return Incoming ? Block.DesiredIncomingState : Block.DesiredOutgoingState;
+  };
+
+  SmallVector<MachineBasicBlock *> Worklist;
+  for (auto [BlockID, BlockInfo] : enumerate(State.Blocks)) {
+    if (!isLegalEdgeBundleZAState(GetBlockState(BlockInfo, Forwards)))
+      Worklist.push_back(MF->getBlockNumbered(BlockID));
+  }
+
+  while (!Worklist.empty()) {
+    MachineBasicBlock *MBB = Worklist.pop_back_val();
+    auto &BlockInfo = State.Blocks[MBB->getNumber()];
+
+    // Pick a legal edge bundle state that matches the majority of
+    // predecessors/successors.
+    int StateCounts[ZAState::NUM_ZA_STATE] = {0};
+    for (MachineBasicBlock *PredOrSucc :
+         Forwards ? predecessors(MBB) : successors(MBB)) {
+      auto &PredOrSuccBlockInfo = State.Blocks[PredOrSucc->getNumber()];
+      auto ZAState = GetBlockState(PredOrSuccBlockInfo, !Forwards);
+      if (isLegalEdgeBundleZAState(ZAState))
+        StateCounts[ZAState]++;
+    }
+
+    ZAState PropagatedState = ZAState(max_element(StateCounts) - StateCounts);
+    auto &CurrentState = GetBlockState(BlockInfo, Forwards);
+    if (PropagatedState != CurrentState) {
+      CurrentState = PropagatedState;
+      auto &OtherState = GetBlockState(BlockInfo, !Forwards);
+      // Propagate to the incoming/outgoing state if that is also "ANY".
+      if (OtherState == ZAState::ANY)
+        OtherState = PropagatedState;
+      // Push any successors/predecessors that may need updating to the
+      // worklist.
+      for (MachineBasicBlock *SuccOrPred :
+           Forwards ? successors(MBB) : predecessors(MBB)) {
+        auto &SuccOrPredBlockInfo = State.Blocks[SuccOrPred->getNumber()];
+        if (!isLegalEdgeBundleZAState(
+                GetBlockState(SuccOrPredBlockInfo, Forwards)))
+          Worklist.push_back(SuccOrPred);
+      }
+    }
   }
 }
 
 void MachineSMEABI::assignBundleZAStates() {
   State.BundleStates.resize(Bundles->getNumBundles());
+
   for (unsigned I = 0, E = Bundles->getNumBundles(); I != E; ++I) {
     LLVM_DEBUG(dbgs() << "Assigning ZA state for edge bundle: " << I << '\n');
 
     // Attempt to assign a ZA state for this bundle that minimizes state
     // transitions. Edges within loops are given a higher weight as we assume
     // they will be executed more than once.
-    // TODO: We should propagate desired incoming/outgoing states through blocks
-    // that have the "ANY" state first to make better global decisions.
     int EdgeStateCounts[ZAState::NUM_ZA_STATE] = {0};
     for (unsigned BlockID : Bundles->getBlocks(I)) {
       LLVM_DEBUG(dbgs() << "- bb." << BlockID);
-      const BlockInfo &Block = State.Blocks[BlockID];
-      if (Block.Insts.empty()) {
-        LLVM_DEBUG(dbgs() << " (no state preference)\n");
-        continue;
-      }
+      BlockInfo &Block = State.Blocks[BlockID];
       bool IsLoop = MLI && MLI->getLoopFor(MF->getBlockNumbered(BlockID));
       bool InEdge = Bundles->getBundle(BlockID, /*Out=*/false) == I;
       bool OutEdge = Bundles->getBundle(BlockID, /*Out=*/true) == I;
@@ -411,26 +470,28 @@ void MachineSMEABI::assignBundleZAStates() {
         LLVM_DEBUG(dbgs() << " IsLoop");
       LLVM_DEBUG(dbgs() << " (EdgeWeight: " << EdgeWeight << ')');
 
-      ZAState DesiredIncomingState = Block.Insts.front().NeededState;
-      if (InEdge && isLegalEdgeBundleZAState(DesiredIncomingState)) {
-        EdgeStateCounts[DesiredIncomingState] += EdgeWeight;
+      bool LegalInEdge =
+          InEdge && isLegalEdgeBundleZAState(Block.DesiredIncomingState);
+      bool LegalOutEdge =
+          OutEdge && isLegalEdgeBundleZAState(Block.DesiredOutgoingState);
+      if (LegalInEdge) {
         LLVM_DEBUG(dbgs() << " DesiredIncomingState: "
-                          << getZAStateString(DesiredIncomingState));
+                          << getZAStateString(Block.DesiredIncomingState));
+        EdgeStateCounts[Block.DesiredIncomingState] += EdgeWeight;
       }
-      ZAState DesiredOutgoingState = Block.Insts.back().NeededState;
-      if (OutEdge && isLegalEdgeBundleZAState(DesiredOutgoingState)) {
-        EdgeStateCounts[DesiredOutgoingState] += EdgeWeight;
+      if (LegalOutEdge) {
         LLVM_DEBUG(dbgs() << " DesiredOutgoingState: "
-                          << getZAStateString(DesiredOutgoingState));
+                          << getZAStateString(Block.DesiredOutgoingState));
+        EdgeStateCounts[Block.DesiredOutgoingState] += EdgeWeight;
       }
+      if (!LegalInEdge && !LegalOutEdge)
+        LLVM_DEBUG(dbgs() << " (no state preference)");
       LLVM_DEBUG(dbgs() << '\n');
     }
 
     ZAState BundleState =
         ZAState(max_element(EdgeStateCounts) - EdgeStateCounts);
-    // Force ZA to be active in bundles that don't have a preferred state.
-    // TODO: Something better here (to avoid extra mode switches).
     if (BundleState == ZAState::ANY)
       BundleState = ZAState::ACTIVE;
 
@@ -839,6 +900,10 @@ bool MachineSMEABI::runOnMachineFunction(MachineFunction &MF) {
     MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
 
   collectNeededZAStates(SMEFnAttrs);
+  if (OptLevel != CodeGenOptLevel::None) {
+    for (bool Forwards : {true, false})
+      propagateDesiredStates(Forwards);
+  }
   assignBundleZAStates();
   insertStateChanges();
 
diff --git a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
index 4479cbe2075ff..e53a250258330 100644
--- a/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
+++ b/llvm/test/CodeGen/AArch64/sme-agnostic-za.ll
@@ -363,7 +363,6 @@ define i64 @test_many_callee_arguments(
   ret i64 %ret
 }
 
-; FIXME: The new lowering should avoid saves/restores in the probing loop.
 define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_state_agnostic" "probe-stack"="inline-asm" "stack-probe-size"="65536"{
 ; CHECK-LABEL: agnostic_za_buffer_alloc_with_stack_probes:
 ; CHECK: // %bb.0:
@@ -401,18 +400,14 @@ define void @agnostic_za_buffer_alloc_with_stack_probes() nounwind "aarch64_za_s
 ; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_state_size
 ; CHECK-NEWLOWERING-NEXT:    mov x8, sp
 ; CHECK-NEWLOWERING-NEXT:    sub x19, x8, x0
+; CHECK-NEWLOWERING-NEXT:    mov x0, x19
+; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_save
 ; CHECK-NEWLOWERING-NEXT:  .LBB7_1: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEWLOWERING-NEXT:    sub sp, sp, #16, lsl #12 // =65536
 ; CHECK-NEWLOWERING-NEXT:    cmp sp, x19
-; CHECK-NEWLOWERING-NEXT:    mov x0, x19
-; CHECK-NEWLOWERING-NEXT:    mrs x8, NZCV
-; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_save
-; CHECK-NEWLOWERING-NEXT:    msr NZCV, x8
 ; CHECK-NEWLOWERING-NEXT:    b.le .LBB7_3
 ; CHECK-NEWLOWERING-NEXT:  // %bb.2: // in Loop: Header=BB7_1 Depth=1
-; CHECK-NEWLOWERING-NEXT:    mov x0, x19
 ; CHECK-NEWLOWERING-NEXT:    str xzr, [sp]
-; CHECK-NEWLOWERING-NEXT:    bl __arm_sme_restore
 ; CHECK-NEWLOWERING-NEXT:    b .LBB7_1
 ; CHECK-NEWLOWERING-NEXT:  .LBB7_3:
 ; CHECK-NEWLOWERING-NEXT:    mov sp, x19
diff --git a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
index 1ede38bd731f9..1ffa01675aafc 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-control-flow.ll
@@ -223,65 +223,34 @@ exit:
   ret void
 }
 
-; FIXME: The codegen for this case could be improved (by tuning weights).
-; Here the ZA save has been hoisted out of the conditional, but would be better
-; to sink it.
 define void @cond_private_za_call(i1 %cond) "aarch64_inout_za" nounwind {
-; CHECK-LABEL: cond_private_za_call:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    mov x29, sp
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x9, x8, x8, x9
-; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    stp x9, x8, [x29, #-16]
-; CHECK-NEXT:    tbz w0, #0, .LBB3_4
-; CHECK-NEXT:  // %bb.1: // %private_za_call
-; CHECK-NEXT:    sub x8, x29, #16
-; CHECK-NEXT:    msr TPIDR2_EL0, x8
-; CHECK-NEXT:    bl private_za_call
-; CHECK-NEXT:    smstart za
-; CHECK-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-NEXT:    sub x0, x29, #16
-; CHECK-NEXT:    cbnz x8, .LBB3_3
-; CHECK-NEXT:  // %bb.2: // %private_za_call
-; CHECK-NEXT:    bl __arm_tpidr2_restore
-; CHECK-NEXT:  .LBB3_3: // %private_za_call
-; CHECK-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-NEXT:  .LBB3_4: // %exit
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT:    b shared_za_call
-;
-; CHECK-NEWLOWERING-LABEL: cond_private_za_call:
-; CHECK-NEWLOWERING:       // %bb.0:
-; CHECK-NEWLOWERING-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    mov x29, sp
-; CHECK-NEWLOWERING-NEXT:    sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT:    rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT:    mov x9, sp
-; CHECK-NEWLOWERING-NEXT:    msub x9, x8, x8, x9
-; CHECK-NEWLOWERING-NEXT:    mov sp, x9
-; CHECK-NEWLOWERING-NEXT:    sub x10, x29, #16
-; CHECK-NEWLOWERING-NEXT:    stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, x10
-; CHECK-NEWLOWERING-NEXT:    tbz w0, #0, .LBB3_2
-; CHECK-NEWLOWERING-NEXT:  // %bb.1: // %private_za_call
-; CHECK-NEWLOWERING-NEXT:    bl private_za_call
-; CHECK-NEWLOWERING-NEXT:  .LBB3_2: // %exit
-; CHECK-NEWLOWERING-NEXT:    smstart za
-; CHECK-NEWLOWERING-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT:    sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT:    cbnz x8, .LBB3_4
-; CHECK-NEWLOWERING-NEXT:  // %bb.3: // %exit
-; CHECK-NEWLOWERING-NEXT:    bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT:  .LBB3_4: // %exit
-; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT:    mov sp, x29
-; CHECK-NEWLOWERING-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT:    b shared_za_call
+; CHECK-COMMON-LABEL: cond_private_za_call:
+; CHECK-COMMON:       // %bb.0:
+; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    mov x29, sp
+; CHECK-COMMON-NEXT:    sub sp, sp, #16
+; CHECK-COMMON-NEXT:    rdsvl x8, #1
+; CHECK-COMMON-NEXT:    mov x9, sp
+; CHECK-COMMON-NEXT:    msub x9, x8, x8, x9
+; CHECK-COMMON-NEXT:    mov sp, x9
+; CHECK-COMMON-NEXT:    stp x9, x8, [x29, #-16]
+; CHECK-COMMON-NEXT:    tbz w0, #0, .LBB3_4
+; CHECK-COMMON-NEXT:  // %bb.1: // %private_za_call
+; CHECK-COMMON-NEXT:    sub x8, x29, #16
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x8
+; CHECK-COMMON-NEXT:    bl private_za_call
+; CHECK-COMMON-NEXT:    smstart za
+; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-COMMON-NEXT:    sub x0, x29, #16
+; CHECK-COMMON-NEXT:    cbnz x8, .LBB3_3
+; CHECK-COMMON-NEXT:  // %bb.2: // %private_za_call
+; CHECK-COMMON-NEXT:    bl __arm_tpidr2_restore
+; CHECK-COMMON-NEXT:  .LBB3_3: // %private_za_call
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-COMMON-NEXT:  .LBB3_4: // %exit
+; CHECK-COMMON-NEXT:    mov sp, x29
+; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    b shared_za_call
   br i1 %cond, label %private_za_call, label %exit
 
 private_za_call:
diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
index bb88142efa592..506974a14c3be 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
@@ -56,31 +56,23 @@ define void @za_with_raii(i1 %fail) "aarch64_inout_za" personality ptr @__gxx_pe
 ; CHECK-NEXT:    adrp x8, .L.str
 ; CHECK-NEXT:    add x8, x8, :lo12:.L.str
 ; CHECK-NEXT:    str x8, [x0]
-; CHECK-NEXT:  .Ltmp0:
+; CHECK-NEXT:  .Ltmp0: // EH_LABEL
 ; CHECK-NEXT:    adrp x1, :got:typeinfo_for_char_const_ptr
 ; CHECK-NEXT:    mov x2, xzr
 ; CHECK-NEXT:    ldr x1, [x1, :got_lo12:typeinfo_for_char_const_ptr]
 ; CHECK-NEXT:    bl __cxa_throw
-; CHECK-NEXT:  .Ltmp1:
-; CHECK-NEXT:    smstart za
-; CHECK-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-NEXT:    sub x0, x29, #16
-; CHECK-NEXT:    cbnz x8, .LBB0_4
-; CHECK-NEXT:  // %bb.3: // %throw_exception
-; CHECK-NEXT:    bl __arm_tpidr2_restore
-; CHECK-NEXT:  .LBB0_4: // %throw_exception
-; CHECK-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-NEXT:  // %bb.5: // %throw_fail
-; CHECK-NEXT:  .LBB0_6: // %unwind_dtors
-; CHECK-NEXT:  .Ltmp2:
+; CHECK-NEXT:  .Ltmp1: // EH_LABEL
+; CHECK-NEXT:  // %bb.3: // %throw_fail
+; CHECK-NEXT:  .LBB0_4: // %unwind_dtors
+; CHECK-NEXT:  .Ltmp2: // EH_LABEL
 ; CHECK-NEXT:    mov x19, x0
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEXT:    sub x0, x29, #16
-; CHECK-NEXT:    cbnz x8, .LBB0_8
-; CHECK-NEXT:  // %bb.7: // %unwind_dtors
+; CHECK-NEXT:    cbnz x8, .LBB0_6
+; CHECK-NEXT:  // %bb.5: // %unwind_dtors
 ; CHECK-NEXT:    bl __arm_tpidr2_restore
-; CHECK-NEXT:  .LBB0_8: // %unwind_dtors
+; CHECK-NEXT:  .LBB0_6: // %unwind_dtors
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEXT:    bl shared_za_call
 ; CHECK-NEXT:    sub x8, x29, #16
@@ -142,11 +134,11 @@ define dso_local void @try_catch() "aarch64_inout_za" personality ptr @__gxx_per
 ; CHECK-NEXT:    msub x9, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x9
 ; CHECK-NEXT:    stp x9, x8, [x29, #-16]
-; CHECK-NEXT:  .Ltmp3:
+; CHECK-NEXT:  .Ltmp3: // EH_LABEL
 ; CHECK-NEXT:    sub x8, x29, #16
 ; CHECK-NEXT:    msr TPIDR2_EL0, x8
 ; CHECK-NEXT:    bl may_throw
-; CHECK-NEXT:  .Ltmp4:
+; CHECK-NEXT:  .Ltmp4: // EH_LABEL
 ; CHECK-NEXT:  .LBB1_1: // %after_catch
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
@@ -160,7 +152,7 @@ define dso_local void @try_catch() "aarch64_inout_za" personality ptr @__gxx_per
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    b shared_za_call
 ; CHECK-NEXT:  .LBB1_4: // %catch
-; CHECK-NEXT:  .Ltmp5:
+; CHECK-NEXT:  .Ltmp5: // EH_LABEL
 ; CHECK-NEXT:    bl __cxa_begin_catch
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
@@ -235,16 +227,16 @@ define void @try_catch_shared_za_callee() "aarch64_new_za" personality ptr @__gx
 ; CHECK-NEXT:    zero {za}
 ; CHECK-NEXT:  .LBB2_2:
 ; CHECK-NEXT:    smstart za
-; CHECK-NEXT:  .Ltmp6:
+; CHECK-NEXT:  .Ltmp6: // EH_LABEL
 ; CHECK-NEXT:    bl shared_za_call
-; CHECK-NEXT:  .Ltmp7:
+; CHECK-NEXT:  .Ltmp7: // EH_LABEL
 ; CHECK-NEXT:  .LBB2_3: // %exit
 ; CHECK-NEXT:    smstop za
 ; CHECK-NEXT:    mov sp, x29
 ; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
 ; CHECK-NEXT:  .LBB2_4: // %catch
-; CHECK-NEXT:  .Ltmp8:
+; CHECK-NEXT:  .Ltmp8: // EH_LABEL
 ; CHECK-NEXT:    bl __cxa_begin_catch
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
index 066ee3b040469..afd56d198d0d3 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
@@ -12,77 +12,41 @@ entry:
 }
 
 define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch64_inout_za" {
-; CHECK-LABEL: multi_bb_stpidr2_save_required:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT:    mov x29, sp
-; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    .cfi_def_cfa w29, 16
-; CHECK-NEXT:    .cfi_offset w30, -8
-; CHECK-NEXT:    .cfi_offset w29, -16
-; CHECK-NEXT:    rdsvl x8, #1
-; CHECK-NEXT:    mov x9, sp
-; CHECK-NEXT:    msub x9, x8, x8, x9
-; CHECK-NEXT:    mov sp, x9
-; CHECK-NEXT:    stp x9, x8, [x29, #-16]
-; CHECK-NEXT:    cbz w0, .LBB1_2
-; CHECK-NEXT:  // %bb.1: // %use_b
-; CHECK-NEXT:    fmov s1, #4.00000000
-; CHECK-NEXT:    fadd s0, s0, s1
-; CHECK-NEXT:    b .LBB1_5
-; CHECK-NEXT:  .LBB1_2: // %use_c
-; CHECK-NEXT:    fmov s0, s1
-; CHECK-NEXT:    sub x8, x29, #16
-; CHECK-NEXT:    msr TPIDR2_EL0, x8
-; CHECK-NEXT:    bl cosf
-; CHECK-NEXT:    smstart za
-; CHECK-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-NEXT:    sub x0, x29, #16
-; CHECK-NEXT:    cbnz x8, .LBB1_4
-; CHECK-NEXT:  // %bb.3: // %use_c
-; CHECK-NEXT:    bl __arm_tpidr2_restore
-; CHECK-NEXT:  .LBB1_4: // %use_c
-; CHECK-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-NEXT:  .LBB1_5: // %exit
-; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEXT:    ret
-;
-; CHECK-NEWLOWERING-LABEL: multi_bb_stpidr2_save_required:
-; CHECK-NEWLOWERING:       // %bb.0:
-; CHECK-NEWLOWERING-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEWLOWERING-NEXT:    mov x29, sp
-; CHECK-NEWLOWERING-NEXT:    sub sp, sp, #16
-; CHECK-NEWLOWERING-NEXT:    .cfi_def_cfa w29, 16
-; CHECK-NEWLOWERING-NEXT:    .cfi_offset w30, -8
-; CHECK-NEWLOWERING-NEXT:    .cfi_offset w29, -16
-; CHECK-NEWLOWERING-NEXT:    rdsvl x8, #1
-; CHECK-NEWLOWERING-NEXT:    mov x9, sp
-; CHECK-NEWLOWERING-NEXT:    msub x9, x8, x8, x9
-; CHECK-NEWLOWERING-NEXT:    mov sp, x9
-; CHECK-NEWLOWERING-NEXT:    sub x10, x29, #16
-; CHECK-NEWLOWERING-NEXT:    stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, x10
-; CHECK-NEWLOWERING-NEXT:    cbz w0, .LBB1_2
-; CHECK-NEWLOWERING-NEXT:  // %bb.1: // %use_b
-; CHECK-NEWLOWERING-NEXT:    fmov s1, #4.00000000
-; CHECK-NEWLOWERING-NEXT:    fadd s0, s0, s1
-; CHECK-NEWLOWERING-NEXT:    b .LBB1_3
-; CHECK-NEWLOWERING-NEXT:  .LBB1_2: // %use_c
-; CHECK-NEWLOWERING-NEXT:    fmov s0, s1
-; CHECK-NEWLOWERING-NEXT:    bl cosf
-; CHECK-NEWLOWERING-NEXT:  .LBB1_3: // %exit
-; CHECK-NEWLOWERING-NEXT:    smstart za
-; CHECK-NEWLOWERING-NEXT:    mrs x8, TPIDR2_EL0
-; CHECK-NEWLOWERING-NEXT:    sub x0, x29, #16
-; CHECK-NEWLOWERING-NEXT:    cbnz x8, .LBB1_5
-; CHECK-NEWLOWERING-NEXT:  // %bb.4: // %exit
-; CHECK-NEWLOWERING-NEXT:    bl __arm_tpidr2_restore
-; CHECK-NEWLOWERING-NEXT:  .LBB1_5: // %exit
-; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-NEWLOWERING-NEXT:    mov sp, x29
-; CHECK-NEWLOWERING-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK-NEWLOWERING-NEXT:    ret
+; CHECK-COMMON-LABEL: multi_bb_stpidr2_save_required:
+; CHECK-COMMON:       // %bb.0:
+; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    mov x29, sp
+; CHECK-COMMON-NEXT:    sub sp, sp, #16
+; CHECK-COMMON-NEXT:    .cfi_def_cfa w29, 16
+; CHECK-COMMON-NEXT:    .cfi_offset w30, -8
+; CHECK-COMMON-NEXT:    .cfi_offset w29, -16
+; CHECK-COMMON-NEXT:    rdsvl x8, #1
+; CHECK-COMMON-NEXT:    mov x9, sp
+; CHECK-COMMON-NEXT:    msub x9, x8, x8, x9
+; CHECK-COMMON-NEXT:    mov sp, x9
+; CHECK-COMMON-NEXT:    stp x9, x8, [x29, #-16]
+; CHECK-COMMON-NEXT:    cbz w0, .LBB1_2
+; CHECK-COMMON-NEXT:  // %bb.1: // %use_b
+; CHECK-COMMON-NEXT:    fmov s1, #4.00000000
+; CHECK-COMMON-NEXT:    fadd s0, s0, s1
+; CHECK-COMMON-NEXT:    b .LBB1_5
+; CHECK-COMMON-NEXT:  .LBB1_2: // %use_c
+; CHECK-COMMON-NEXT:    fmov s0, s1
+; CHECK-COMMON-NEXT:    sub x8, x29, #16
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, x8
+; CHECK-COMMON-NEXT:    bl cosf
+; CHECK-COMMON-NEXT:    smstart za
+; CHECK-COMMON-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-COMMON-NEXT:    sub x0, x29, #16
+; CHECK-COMMON-NEXT:    cbnz x8, .LBB1_4
+; CHECK-COMMON-NEXT:  // %bb.3: // %use_c
+; CHECK-COMMON-NEXT:    bl __arm_tpidr2_restore
+; CHECK-COMMON-NEXT:  .LBB1_4: // %use_c
+; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-COMMON-NEXT:  .LBB1_5: // %exit
+; CHECK-COMMON-NEXT:    mov sp, x29
+; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ret
   %cmp = icmp ne i32 %a, 0
   br i1 %cmp, label %use_b, label %use_c
 
@@ -155,7 +119,9 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float
 ; CHECK-NEWLOWERING-NEXT:    .cfi_offset w29, -16
 ; CHECK-NEWLOWERING-NEXT:    rdsvl x8, #1
 ; CHECK-NEWLOWERING-NEXT:    mov x9, sp
+; CHECK-NEWLOWERING-NEXT:    sub x10, x29, #16
 ; CHECK-NEWLOWERING-NEXT:    msub x9, x8, x8, x9
+; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-NEWLOWERING-NEXT:  .LBB2_1: // =>This Inner Loop Header: Depth=1
 ; CHECK-NEWLOWERING-NEXT:    sub sp, sp, #16, lsl #12 // =65536
 ; CHECK-NEWLOWERING-NEXT:    cmp sp, x9
@@ -166,9 +132,7 @@ define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float
 ; CHECK-NEWLOWERING-NEXT:  .LBB2_3:
 ; CHECK-NEWLOWERING-NEXT:    mov sp, x9
 ; CHECK-NEWLOWERING-NEXT:    ldr xzr, [sp]
-; CHECK-NEWLOWERING-NEXT:    sub x10, x29, #16
 ; CHECK-NEWLOWERING-NEXT:    stp x9, x8, [x29, #-16]
-; CHECK-NEWLOWERING-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-NEWLOWERING-NEXT:    cbz w0, .LBB2_5
 ; CHECK-NEWLOWERING-NEXT:  // %bb.4: // %use_b
 ; CHECK-NEWLOWERING-NEXT:    fmov s1, #4.00000000

_______________________________________________
llvm-branch-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
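
As a reading aid (not part of the patch): the propagation step is easiest to see on a tiny CFG. Below is a minimal, self-contained C++ sketch of the same majority-vote worklist scheme. It is illustrative only; `Block`, `propagate`, and the plain adjacency lists are simplified stand-ins for the pass's `BlockInfo`, `propagateDesiredStates`, and the MachineFunction CFG, and the cut-down `State` enum treats everything other than ANY as a legal edge-bundle state.

// sme_state_propagation_sketch.cpp -- illustrative only; simplified
// stand-ins for the pass's BlockInfo / ZAState / MachineFunction CFG.
#include <array>
#include <cstdio>
#include <vector>

enum State { ANY, ACTIVE, LOCAL_SAVED, NUM_STATES }; // ANY = no preference.

struct Block {
  std::vector<int> Preds, Succs; // CFG edges, by block number.
  State DesiredIn = ANY, DesiredOut = ANY;
};

// One propagation sweep. Forwards fills a block's desired incoming state
// from its predecessors' outgoing states (majority vote); backwards is the
// mirror image, pulling states up from successors.
static void propagate(std::vector<Block> &Blocks, bool Forwards) {
  auto StateOf = [](Block &B, bool Incoming) -> State & {
    return Incoming ? B.DesiredIn : B.DesiredOut;
  };

  // Seed the worklist with blocks that still have no usable state.
  std::vector<int> Worklist;
  for (int I = 0, E = (int)Blocks.size(); I != E; ++I)
    if (StateOf(Blocks[I], Forwards) == ANY)
      Worklist.push_back(I);

  while (!Worklist.empty()) {
    Block &B = Blocks[Worklist.back()];
    Worklist.pop_back();

    // Vote over the neighbours on the side we are propagating from.
    std::array<int, NUM_STATES> Counts{};
    for (int N : Forwards ? B.Preds : B.Succs) {
      State S = StateOf(Blocks[N], !Forwards);
      if (S != ANY)
        ++Counts[S];
    }
    State Best = ANY; // Counts[ANY] stays 0, so ANY loses every vote.
    for (int S = ANY + 1; S != NUM_STATES; ++S)
      if (Counts[S] > Counts[Best])
        Best = State(S);

    State &Cur = StateOf(B, Forwards);
    if (Best == ANY || Best == Cur)
      continue;
    Cur = Best;
    // A block with no preference of its own passes the state through.
    State &Other = StateOf(B, !Forwards);
    if (Other == ANY)
      Other = Best;
    // Re-queue neighbours that may now be able to pick a state.
    for (int N : Forwards ? B.Succs : B.Preds)
      if (StateOf(Blocks[N], Forwards) == ANY)
        Worklist.push_back(N);
  }
}

int main() {
  // bb0 -> bb1 -> bb2, where only bb2 (say, an inner loop that uses ZA)
  // prefers ACTIVE; bb0 and bb1 have no preference of their own.
  std::vector<Block> Blocks(3);
  auto AddEdge = [&](int From, int To) {
    Blocks[From].Succs.push_back(To);
    Blocks[To].Preds.push_back(From);
  };
  AddEdge(0, 1);
  AddEdge(1, 2);
  Blocks[2].DesiredIn = Blocks[2].DesiredOut = ACTIVE;

  propagate(Blocks, /*Forwards=*/true);  // preds -> succs
  propagate(Blocks, /*Forwards=*/false); // succs -> preds

  for (int I = 0; I != 3; ++I)
    std::printf("bb%d: in=%d out=%d\n", I, Blocks[I].DesiredIn,
                Blocks[I].DesiredOut);
  return 0;
}

Running the sketch prints in=1 out=1 (ACTIVE) for all three blocks: the backwards sweep pulls the inner block's preference up through the blocks that had none, which is exactly the nested-loop case the commit message describes. In the real pass, the propagated states then feed assignBundleZAStates rather than being printed.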
