https://github.com/ytmukai updated https://github.com/llvm/llvm-project/pull/79589
>From bcdb1e47ce841df96b2916d61cda018503f62358 Mon Sep 17 00:00:00 2001 From: Yuta Mukai <mukai.y...@fujitsu.com> Date: Tue, 12 Dec 2023 16:59:09 +0000 Subject: [PATCH] [AArch64][MachinePipeliner] Add pipeliner support for AArch64 Add AArch64 implementations for the interfaces of the MachinePipeliner pass. The pass is disabled by default for AArch64. It is enabled by specifying --aarch64-enable-pipeliner. Five tests in the llvm-test-suite show a performance improvement of more than 5% on a Neoverse V1 processor. | test | improvement | | ---------------------------------------------------------------- | -----------:| | MultiSource/Benchmarks/TSVC/Recurrences-dbl/Recurrences-dbl.test | 16% | | MultiSource/Benchmarks/TSVC/Recurrences-dbl/Recurrences-flt.test | 16% | | SingleSource/Benchmarks/Adobe-C++/loop_unroll.test | 14% | | SingleSource/Benchmarks/Misc/flops-5.test | 13% | | SingleSource/Benchmarks/BenchmarkGame/spectral-norm.test | 6% | (base flags: -mcpu=neoverse-v1 -O3 -mrecip, flags for pipelining: -mllvm -aarch64-enable-pipeliner -mllvm -pipeliner-max-stages=100 -mllvm -pipeliner-max-mii=100 -mllvm -pipeliner-enable-copytophi=0) On the other hand, there are cases of significant performance degradation. Algorithm improvements and the addition of an option/pragma will be needed in the future. 
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 105 ++++++++++++++++++ llvm/lib/Target/AArch64/AArch64InstrInfo.h | 4 + llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 4 + llvm/lib/Target/AArch64/AArch64Subtarget.h | 3 + .../Target/AArch64/AArch64TargetMachine.cpp | 7 ++ .../CodeGen/AArch64/sms-acceptable-loop1.mir | 78 +++++++++++++ .../CodeGen/AArch64/sms-acceptable-loop2.mir | 78 +++++++++++++ .../CodeGen/AArch64/sms-acceptable-loop3.mir | 79 +++++++++++++ .../CodeGen/AArch64/sms-acceptable-loop4.mir | 79 +++++++++++++ .../AArch64/sms-unacceptable-loop1.mir | 77 +++++++++++++ .../AArch64/sms-unacceptable-loop2.mir | 80 +++++++++++++ .../CodeGen/AArch64/sms-unpipeline-insts1.mir | 87 +++++++++++++++ .../CodeGen/AArch64/sms-unpipeline-insts2.mir | 80 +++++++++++++ 13 files changed, 761 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/sms-acceptable-loop1.mir create mode 100644 llvm/test/CodeGen/AArch64/sms-acceptable-loop2.mir create mode 100644 llvm/test/CodeGen/AArch64/sms-acceptable-loop3.mir create mode 100644 llvm/test/CodeGen/AArch64/sms-acceptable-loop4.mir create mode 100644 llvm/test/CodeGen/AArch64/sms-unacceptable-loop1.mir create mode 100644 llvm/test/CodeGen/AArch64/sms-unacceptable-loop2.mir create mode 100644 llvm/test/CodeGen/AArch64/sms-unpipeline-insts1.mir create mode 100644 llvm/test/CodeGen/AArch64/sms-unpipeline-insts2.mir diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 8e50c16ba0887..809c3415ea234 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -9608,6 +9608,111 @@ AArch64InstrInfo::probedStackAlloc(MachineBasicBlock::iterator MBBI, return ExitMBB->begin(); } +namespace { +class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo { + MachineInstr *PredBranch; + SmallVector<MachineOperand, 4> Cond; + +public: + AArch64PipelinerLoopInfo(MachineInstr *PredBranch, + const 
SmallVectorImpl<MachineOperand> &Cond) + : PredBranch(PredBranch), Cond(Cond.begin(), Cond.end()) {} + + bool shouldIgnoreForPipelining(const MachineInstr *MI) const override { + // Make the instructions for loop control be placed in stage 0. + // The predecessors of PredBranch are considered by the caller. + return MI == PredBranch; + } + + std::optional<bool> createTripCountGreaterCondition( + int TC, MachineBasicBlock &MBB, + SmallVectorImpl<MachineOperand> &CondParam) override { + // A branch instruction will be inserted as "if (Cond) goto epilogue". + // Cond is normalized for such use. + // The predecessors of the branch are assumed to have already been inserted. + CondParam = Cond; + return {}; + } + + void setPreheader(MachineBasicBlock *NewPreheader) override {} + + void adjustTripCount(int TripCountAdjust) override {} + + void disposed() override {} +}; +} // namespace + +static bool isCompareAndBranch(unsigned Opcode) { + switch (Opcode) { + case AArch64::CBZW: + case AArch64::CBZX: + case AArch64::CBNZW: + case AArch64::CBNZX: + case AArch64::TBZW: + case AArch64::TBZX: + case AArch64::TBNZW: + case AArch64::TBNZX: + return true; + } + return false; +} + +std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> +AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const { + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + SmallVector<MachineOperand, 4> Cond; + if (analyzeBranch(*LoopBB, TBB, FBB, Cond)) + return nullptr; + + // Infinite loops are not supported + if (TBB == LoopBB && FBB == LoopBB) + return nullptr; + + // Must be conditional branch + if (FBB == nullptr) + return nullptr; + + assert((TBB == LoopBB || FBB == LoopBB) && + "The Loop must be a single-basic-block loop"); + + // Normalization for createTripCountGreaterCondition() + if (TBB == LoopBB) + reverseBranchCondition(Cond); + + MachineInstr *CondBranch = &*LoopBB->getFirstTerminator(); + const TargetRegisterInfo &TRI = getRegisterInfo(); + + // Find the immediate 
predecessor of the conditional branch + MachineInstr *PredBranch = nullptr; + if (CondBranch->getOpcode() == AArch64::Bcc) { + for (MachineInstr &MI : reverse(*LoopBB)) { + if (MI.modifiesRegister(AArch64::NZCV, &TRI)) { + PredBranch = &MI; + break; + } + } + if (!PredBranch) + return nullptr; + } else if (isCompareAndBranch(CondBranch->getOpcode())) { + const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo(); + Register Reg = CondBranch->getOperand(0).getReg(); + if (!Reg.isVirtual()) + return nullptr; + PredBranch = MRI.getVRegDef(Reg); + + // MachinePipeliner does not expect that the immediate predecessor is a Phi + if (PredBranch->isPHI()) + return nullptr; + + if (PredBranch->getParent() != LoopBB) + return nullptr; + } else { + return nullptr; + } + + return std::make_unique<AArch64PipelinerLoopInfo>(PredBranch, Cond); +} + #define GET_INSTRINFO_HELPERS #define GET_INSTRMAP_INFO #include "AArch64GenInstrInfo.inc" diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 6526f6740747a..f3a5db4367509 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -247,6 +247,10 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo { MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded = nullptr) const override; + + std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> + analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override; + bool reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond, diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index e3a0606331db1..6550c12722166 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -540,3 +540,7 @@ AArch64Subtarget::getAuthenticatedLRCheckMethod() const 
{ // performance regression or incompatibility with execute-only mappings. return AArch64PAuth::AuthCheckMethod::None; } + +bool AArch64Subtarget::enableMachinePipeliner() const { + return getSchedModel().hasInstrSchedModel(); +} diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 16864102df59b..0292c018f1dbc 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -201,6 +201,9 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { bool enableMachineScheduler() const override { return true; } bool enablePostRAScheduler() const override { return usePostRAScheduler(); } + bool enableMachinePipeliner() const override; + bool useDFAforSMS() const override { return false; } + /// Returns ARM processor family. /// Avoid this function! CPU specifics should be kept local to this class /// and preferably modeled with SubtargetFeatures or properties in diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 6fbc13d8904f2..81bb6e59422fa 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -204,6 +204,11 @@ static cl::opt<bool> cl::desc("Enable sinking and folding of instruction copies"), cl::init(true), cl::Hidden); +static cl::opt<bool> + EnableMachinePipeliner("aarch64-enable-pipeliner", + cl::desc("Enable Machine Pipeliner for AArch64"), + cl::init(false), cl::Hidden); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() { // Register the target. RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget()); @@ -779,6 +784,8 @@ void AArch64PassConfig::addPreRegAlloc() { // be register coalescer friendly. 
addPass(&PeepholeOptimizerID); } + if (TM->getOptLevel() != CodeGenOptLevel::None && EnableMachinePipeliner) + addPass(&MachinePipelinerID); } void AArch64PassConfig::addPostRegAlloc() { diff --git a/llvm/test/CodeGen/AArch64/sms-acceptable-loop1.mir b/llvm/test/CodeGen/AArch64/sms-acceptable-loop1.mir new file mode 100644 index 0000000000000..ed2bd73a7861a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sms-acceptable-loop1.mir @@ -0,0 +1,78 @@ +# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -debug-only=pipeliner 2>&1 | FileCheck %s + +# An acceptable loop by pipeliner: TBB == ExitBB, FBB == LoopBB, Branch with NZCV flags +# CHECK: Schedule Found? 1 + +--- | + define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 { + entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + %wide.trip.count = zext nneg i32 %n to i64 + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %lsr.iv11 = phi i64 [ %wide.trip.count, %for.body.preheader ], [ %lsr.iv.next, %for.body ] + %lsr.iv9 = phi ptr [ %b, %for.body.preheader ], [ %scevgep10, %for.body ] + %lsr.iv = phi ptr [ %a, %for.body.preheader ], [ %scevgep, %for.body ] + %0 = load float, ptr %lsr.iv9, align 4 + %add = fadd float %0, 1.000000e+00 + store float %add, ptr %lsr.iv, align 4 + %scevgep = getelementptr i8, ptr %lsr.iv, i64 4 + %scevgep10 = getelementptr i8, ptr %lsr.iv9, i64 4 + %lsr.iv.next = add nsw i64 %lsr.iv11, -1 + %exitcond.not = icmp eq i64 %lsr.iv.next, 0 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + } + +... 
+--- +name: func +tracksRegLiveness: true +liveins: + - { reg: '$x0', virtual-reg: '%7' } + - { reg: '$x1', virtual-reg: '%8' } + - { reg: '$w2', virtual-reg: '%9' } +body: | + bb.0.entry: + successors: %bb.1(0x50000000), %bb.2(0x30000000) + liveins: $x0, $x1, $w2 + + %9:gpr32common = COPY $w2 + %8:gpr64 = COPY $x1 + %7:gpr64 = COPY $x0 + dead $wzr = SUBSWri %9, 1, 0, implicit-def $nzcv + Bcc 11, %bb.2, implicit $nzcv + B %bb.1 + + bb.1.for.body.preheader: + %11:gpr32 = ORRWrs $wzr, %9, 0 + %0:gpr64all = SUBREG_TO_REG 0, killed %11, %subreg.sub_32 + %14:fpr32 = FMOVSi 112 + B %bb.3 + + bb.2.for.cond.cleanup: + RET_ReallyLR + + bb.3.for.body: + successors: %bb.2(0x04000000), %bb.3(0x7c000000) + + %1:gpr64sp = PHI %0, %bb.1, %6, %bb.3 + %2:gpr64sp = PHI %8, %bb.1, %5, %bb.3 + %3:gpr64sp = PHI %7, %bb.1, %4, %bb.3 + early-clobber %12:gpr64sp, %13:fpr32 = LDRSpost %2, 4 :: (load (s32) from %ir.lsr.iv9) + %15:fpr32 = nofpexcept FADDSrr killed %13, %14, implicit $fpcr + early-clobber %16:gpr64sp = STRSpost killed %15, %3, 4 :: (store (s32) into %ir.lsr.iv) + %4:gpr64all = COPY %16 + %5:gpr64all = COPY %12 + %17:gpr64 = nsw SUBSXri %1, 1, 0, implicit-def $nzcv + %6:gpr64all = COPY %17 + Bcc 0, %bb.2, implicit $nzcv + B %bb.3 + +... diff --git a/llvm/test/CodeGen/AArch64/sms-acceptable-loop2.mir b/llvm/test/CodeGen/AArch64/sms-acceptable-loop2.mir new file mode 100644 index 0000000000000..5cf6367354ecc --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sms-acceptable-loop2.mir @@ -0,0 +1,78 @@ +# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -debug-only=pipeliner 2>&1 | FileCheck %s + +# An acceptable loop by pipeliner: TBB == LoopBB, FBB == ExitBB, Branch with NZCV flags +# CHECK: Schedule Found? 
1 + +--- | + define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 { + entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + %wide.trip.count = zext nneg i32 %n to i64 + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %lsr.iv11 = phi i64 [ %wide.trip.count, %for.body.preheader ], [ %lsr.iv.next, %for.body ] + %lsr.iv9 = phi ptr [ %b, %for.body.preheader ], [ %scevgep10, %for.body ] + %lsr.iv = phi ptr [ %a, %for.body.preheader ], [ %scevgep, %for.body ] + %0 = load float, ptr %lsr.iv9, align 4 + %add = fadd float %0, 1.000000e+00 + store float %add, ptr %lsr.iv, align 4 + %scevgep = getelementptr i8, ptr %lsr.iv, i64 4 + %scevgep10 = getelementptr i8, ptr %lsr.iv9, i64 4 + %lsr.iv.next = add nsw i64 %lsr.iv11, -1 + %exitcond.not = icmp eq i64 %lsr.iv.next, 0 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + } + +... 
+--- +name: func +tracksRegLiveness: true +liveins: + - { reg: '$x0', virtual-reg: '%7' } + - { reg: '$x1', virtual-reg: '%8' } + - { reg: '$w2', virtual-reg: '%9' } +body: | + bb.0.entry: + successors: %bb.1(0x50000000), %bb.2(0x30000000) + liveins: $x0, $x1, $w2 + + %9:gpr32common = COPY $w2 + %8:gpr64 = COPY $x1 + %7:gpr64 = COPY $x0 + dead $wzr = SUBSWri %9, 1, 0, implicit-def $nzcv + Bcc 11, %bb.2, implicit $nzcv + B %bb.1 + + bb.1.for.body.preheader: + %11:gpr32 = ORRWrs $wzr, %9, 0 + %0:gpr64all = SUBREG_TO_REG 0, killed %11, %subreg.sub_32 + %14:fpr32 = FMOVSi 112 + B %bb.3 + + bb.2.for.cond.cleanup: + RET_ReallyLR + + bb.3.for.body: + successors: %bb.2(0x04000000), %bb.3(0x7c000000) + + %1:gpr64sp = PHI %0, %bb.1, %6, %bb.3 + %2:gpr64sp = PHI %8, %bb.1, %5, %bb.3 + %3:gpr64sp = PHI %7, %bb.1, %4, %bb.3 + early-clobber %12:gpr64sp, %13:fpr32 = LDRSpost %2, 4 :: (load (s32) from %ir.lsr.iv9) + %15:fpr32 = nofpexcept FADDSrr killed %13, %14, implicit $fpcr + early-clobber %16:gpr64sp = STRSpost killed %15, %3, 4 :: (store (s32) into %ir.lsr.iv) + %4:gpr64all = COPY %16 + %5:gpr64all = COPY %12 + %17:gpr64 = nsw SUBSXri %1, 1, 0, implicit-def $nzcv + %6:gpr64all = COPY %17 + Bcc 1, %bb.3, implicit $nzcv + B %bb.2 + +... diff --git a/llvm/test/CodeGen/AArch64/sms-acceptable-loop3.mir b/llvm/test/CodeGen/AArch64/sms-acceptable-loop3.mir new file mode 100644 index 0000000000000..652770e3fcfa8 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sms-acceptable-loop3.mir @@ -0,0 +1,79 @@ +# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-enable-copytophi=0 -debug-only=pipeliner 2>&1 | FileCheck %s + +# An acceptable loop by pipeliner: TBB == ExitBB, FBB == LoopBB, Compare and branch +# CHECK: Schedule Found? 
1 + +--- | + define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 { + entry: + %or.cond = icmp ult i32 %n, 2 + br i1 %or.cond, label %for.end, label %for.body.preheader + + for.body.preheader: ; preds = %entry + %i.07 = add i32 %n, -1 + %0 = sext i32 %i.07 to i64 + br label %for.body + + for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %1 = shl nsw i64 %indvars.iv, 2 + %scevgep = getelementptr i8, ptr %b, i64 %1 + %2 = load float, ptr %scevgep, align 4 + %add = fadd float %2, 1.000000e+00 + %3 = shl nsw i64 %indvars.iv, 2 + %scevgep11 = getelementptr i8, ptr %a, i64 %3 + store float %add, ptr %scevgep11, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %4 = add i64 %indvars.iv, -1 + %5 = and i64 %4, 4294967295 + %tobool.not = icmp eq i64 %5, 0 + br i1 %tobool.not, label %for.end, label %for.body + + for.end: ; preds = %for.body, %entry + ret void + } + +... 
+--- +name: func +tracksRegLiveness: true +liveins: + - { reg: '$x0', virtual-reg: '%3' } + - { reg: '$x1', virtual-reg: '%4' } + - { reg: '$w2', virtual-reg: '%5' } +body: | + bb.0.entry: + liveins: $x0, $x1, $w2 + + %5:gpr32common = COPY $w2 + %4:gpr64common = COPY $x1 + %3:gpr64common = COPY $x0 + dead $wzr = SUBSWri %5, 2, 0, implicit-def $nzcv + Bcc 3, %bb.3, implicit $nzcv + B %bb.1 + + bb.1.for.body.preheader: + %7:gpr32common = SUBWri %5, 1, 0 + %9:gpr64all = IMPLICIT_DEF + %8:gpr64 = SUBREG_TO_REG 0, killed %7, %subreg.sub_32 + %10:gpr64 = SBFMXri killed %8, 0, 31 + %0:gpr64all = COPY %10 + %12:fpr32 = FMOVSi 112 + + bb.2.for.body: + successors: %bb.3(0x04000000), %bb.2(0x7c000000) + + %1:gpr64common = PHI %0, %bb.1, %2, %bb.2 + %11:fpr32 = LDRSroX %4, %1, 0, 1 :: (load (s32) from %ir.scevgep) + %13:fpr32 = nofpexcept FADDSrr killed %11, %12, implicit $fpcr + STRSroX killed %13, %3, %1, 0, 1 :: (store (s32) into %ir.scevgep11) + %14:gpr64common = SUBXri %1, 1, 0 + %2:gpr64all = COPY %14 + %15:gpr32 = COPY %14.sub_32 + CBZW killed %15, %bb.3 + B %bb.2 + + bb.3.for.end: + RET_ReallyLR + +... diff --git a/llvm/test/CodeGen/AArch64/sms-acceptable-loop4.mir b/llvm/test/CodeGen/AArch64/sms-acceptable-loop4.mir new file mode 100644 index 0000000000000..95d64cae5b780 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sms-acceptable-loop4.mir @@ -0,0 +1,79 @@ +# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-enable-copytophi=0 -debug-only=pipeliner 2>&1 | FileCheck %s + +# An acceptable loop by pipeliner TBB == LoopBB, FBB == ExitBB, Compare and branch +# CHECK: Schedule Found? 
1 + +--- | + define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 { + entry: + %or.cond = icmp ult i32 %n, 2 + br i1 %or.cond, label %for.end, label %for.body.preheader + + for.body.preheader: ; preds = %entry + %i.07 = add i32 %n, -1 + %0 = sext i32 %i.07 to i64 + br label %for.body + + for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %1 = shl nsw i64 %indvars.iv, 2 + %scevgep = getelementptr i8, ptr %b, i64 %1 + %2 = load float, ptr %scevgep, align 4 + %add = fadd float %2, 1.000000e+00 + %3 = shl nsw i64 %indvars.iv, 2 + %scevgep11 = getelementptr i8, ptr %a, i64 %3 + store float %add, ptr %scevgep11, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %4 = add i64 %indvars.iv, -1 + %5 = and i64 %4, 4294967295 + %tobool.not = icmp eq i64 %5, 0 + br i1 %tobool.not, label %for.end, label %for.body + + for.end: ; preds = %for.body, %entry + ret void + } + +... 
+--- +name: func +tracksRegLiveness: true +liveins: + - { reg: '$x0', virtual-reg: '%3' } + - { reg: '$x1', virtual-reg: '%4' } + - { reg: '$w2', virtual-reg: '%5' } +body: | + bb.0.entry: + liveins: $x0, $x1, $w2 + + %5:gpr32common = COPY $w2 + %4:gpr64common = COPY $x1 + %3:gpr64common = COPY $x0 + dead $wzr = SUBSWri %5, 2, 0, implicit-def $nzcv + Bcc 3, %bb.3, implicit $nzcv + B %bb.1 + + bb.1.for.body.preheader: + %7:gpr32common = SUBWri %5, 1, 0 + %9:gpr64all = IMPLICIT_DEF + %8:gpr64 = SUBREG_TO_REG 0, killed %7, %subreg.sub_32 + %10:gpr64 = SBFMXri killed %8, 0, 31 + %0:gpr64all = COPY %10 + %12:fpr32 = FMOVSi 112 + + bb.2.for.body: + successors: %bb.3(0x04000000), %bb.2(0x7c000000) + + %1:gpr64common = PHI %0, %bb.1, %2, %bb.2 + %11:fpr32 = LDRSroX %4, %1, 0, 1 :: (load (s32) from %ir.scevgep) + %13:fpr32 = nofpexcept FADDSrr killed %11, %12, implicit $fpcr + STRSroX killed %13, %3, %1, 0, 1 :: (store (s32) into %ir.scevgep11) + %14:gpr64common = SUBXri %1, 1, 0 + %2:gpr64all = COPY %14 + %15:gpr32 = COPY %14.sub_32 + CBNZW killed %15, %bb.2 + B %bb.3 + + bb.3.for.end: + RET_ReallyLR + +... 
diff --git a/llvm/test/CodeGen/AArch64/sms-unacceptable-loop1.mir b/llvm/test/CodeGen/AArch64/sms-unacceptable-loop1.mir new file mode 100644 index 0000000000000..79dc1482c748f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sms-unacceptable-loop1.mir @@ -0,0 +1,77 @@ +# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -debug-only=pipeliner 2>&1 | FileCheck %s + +# An unacceptable loop by pipeliner: No exits +# CHECK: Unable to analyzeLoop, can NOT pipeline Loop + +--- | + define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 { + entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + %wide.trip.count = zext nneg i32 %n to i64 + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %lsr.iv11 = phi i64 [ %wide.trip.count, %for.body.preheader ], [ %lsr.iv.next, %for.body ] + %lsr.iv9 = phi ptr [ %b, %for.body.preheader ], [ %scevgep10, %for.body ] + %lsr.iv = phi ptr [ %a, %for.body.preheader ], [ %scevgep, %for.body ] + %0 = load float, ptr %lsr.iv9, align 4 + %add = fadd float %0, 1.000000e+00 + store float %add, ptr %lsr.iv, align 4 + %scevgep = getelementptr i8, ptr %lsr.iv, i64 4 + %scevgep10 = getelementptr i8, ptr %lsr.iv9, i64 4 + %lsr.iv.next = add nsw i64 %lsr.iv11, -1 + %exitcond.not = icmp eq i64 %lsr.iv.next, 0 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + } + +... 
+--- +name: func +tracksRegLiveness: true +liveins: + - { reg: '$x0', virtual-reg: '%7' } + - { reg: '$x1', virtual-reg: '%8' } + - { reg: '$w2', virtual-reg: '%9' } +body: | + bb.0.entry: + successors: %bb.1(0x50000000), %bb.2(0x30000000) + liveins: $x0, $x1, $w2 + + %9:gpr32common = COPY $w2 + %8:gpr64 = COPY $x1 + %7:gpr64 = COPY $x0 + dead $wzr = SUBSWri %9, 1, 0, implicit-def $nzcv + Bcc 11, %bb.2, implicit $nzcv + B %bb.1 + + bb.1.for.body.preheader: + %11:gpr32 = ORRWrs $wzr, %9, 0 + %0:gpr64all = SUBREG_TO_REG 0, killed %11, %subreg.sub_32 + %14:fpr32 = FMOVSi 112 + B %bb.3 + + bb.2.for.cond.cleanup: + RET_ReallyLR + + bb.3.for.body: + successors: %bb.3(0x7c000000) + + %1:gpr64sp = PHI %0, %bb.1, %6, %bb.3 + %2:gpr64sp = PHI %8, %bb.1, %5, %bb.3 + %3:gpr64sp = PHI %7, %bb.1, %4, %bb.3 + early-clobber %12:gpr64sp, %13:fpr32 = LDRSpost %2, 4 :: (load (s32) from %ir.lsr.iv9) + %15:fpr32 = nofpexcept FADDSrr killed %13, %14, implicit $fpcr + early-clobber %16:gpr64sp = STRSpost killed %15, %3, 4 :: (store (s32) into %ir.lsr.iv) + %4:gpr64all = COPY %16 + %5:gpr64all = COPY %12 + %17:gpr64 = nsw SUBSXri %1, 1, 0, implicit-def $nzcv + %6:gpr64all = COPY %17 + B %bb.3 + +... 
diff --git a/llvm/test/CodeGen/AArch64/sms-unacceptable-loop2.mir b/llvm/test/CodeGen/AArch64/sms-unacceptable-loop2.mir new file mode 100644 index 0000000000000..c3807ae272c6c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sms-unacceptable-loop2.mir @@ -0,0 +1,80 @@ +# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-enable-copytophi=0 -debug-only=pipeliner 2>&1 | FileCheck %s + +# An unacceptable loop by pipeliner: The operand of the compare and branch is not defined in the loop +# CHECK: Unable to analyzeLoop, can NOT pipeline Loop + +--- | + define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 { + entry: + %or.cond = icmp ult i32 %n, 2 + br i1 %or.cond, label %for.end, label %for.body.preheader + + for.body.preheader: ; preds = %entry + %i.07 = add i32 %n, -1 + %0 = sext i32 %i.07 to i64 + br label %for.body + + for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %1 = shl nsw i64 %indvars.iv, 2 + %scevgep = getelementptr i8, ptr %b, i64 %1 + %2 = load float, ptr %scevgep, align 4 + %add = fadd float %2, 1.000000e+00 + %3 = shl nsw i64 %indvars.iv, 2 + %scevgep11 = getelementptr i8, ptr %a, i64 %3 + store float %add, ptr %scevgep11, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %4 = add i64 %indvars.iv, -1 + %5 = and i64 %4, 4294967295 + %tobool.not = icmp eq i64 %5, 0 + br i1 %tobool.not, label %for.end, label %for.body + + for.end: ; preds = %for.body, %entry + ret void + } + +... 
+--- +name: func +tracksRegLiveness: true +liveins: + - { reg: '$x0', virtual-reg: '%3' } + - { reg: '$x1', virtual-reg: '%4' } + - { reg: '$w2', virtual-reg: '%5' } +body: | + bb.0.entry: + liveins: $x0, $x1, $w2 + + %5:gpr32common = COPY $w2 + %4:gpr64common = COPY $x1 + %3:gpr64common = COPY $x0 + dead $wzr = SUBSWri %5, 2, 0, implicit-def $nzcv + Bcc 3, %bb.3, implicit $nzcv + B %bb.1 + + bb.1.for.body.preheader: + %7:gpr32common = SUBWri %5, 1, 0 + %9:gpr64all = IMPLICIT_DEF + %8:gpr64 = SUBREG_TO_REG 0, killed %7, %subreg.sub_32 + %10:gpr64 = SBFMXri killed %8, 0, 31 + %0:gpr64all = COPY %10 + %12:fpr32 = FMOVSi 112 + %16:gpr32 = COPY %10.sub_32 + + bb.2.for.body: + successors: %bb.3(0x04000000), %bb.2(0x7c000000) + + %1:gpr64common = PHI %0, %bb.1, %2, %bb.2 + %11:fpr32 = LDRSroX %4, %1, 0, 1 :: (load (s32) from %ir.scevgep) + %13:fpr32 = nofpexcept FADDSrr killed %11, %12, implicit $fpcr + STRSroX killed %13, %3, %1, 0, 1 :: (store (s32) into %ir.scevgep11) + %14:gpr64common = SUBXri %1, 1, 0 + %2:gpr64all = COPY %14 + %15:gpr32 = COPY %14.sub_32 + CBZW %16, %bb.3 + B %bb.2 + + bb.3.for.end: + RET_ReallyLR + +... 
diff --git a/llvm/test/CodeGen/AArch64/sms-unpipeline-insts1.mir b/llvm/test/CodeGen/AArch64/sms-unpipeline-insts1.mir new file mode 100644 index 0000000000000..5973a44308253 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sms-unpipeline-insts1.mir @@ -0,0 +1,87 @@ +# RUN: llc --verify-machineinstrs -mtriple=aarch64 -mcpu=neoverse-n1 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -debug-only=pipeliner 2>&1 | FileCheck %s + +# Check that instructions referencing NZCV are not pipelined + +# CHECK: SU([[SU0:[0-9]+]]): nofpexcept FCMPSri {{.*}}, implicit-def $nzcv, implicit $fpcr +# CHECK: SU([[SU1:[0-9]+]]): {{.*}} = FCSELSrrr {{.*}}, {{.*}}, 1, implicit $nzcv +# CHECK: Do not pipeline SU([[SU0:[0-9]+]]) +# CHECK: Do not pipeline SU([[SU1:[0-9]+]]) + +--- | + define dso_local void @KERNEL(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 { + entry: + %cmp19 = icmp sgt i32 %n, 0 + br i1 %cmp19, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + %wide.trip.count = zext nneg i32 %n to i64 + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + ret void + + for.body: ; preds = %for.body.preheader, %for.body + %lsr.iv24 = phi i64 [ %wide.trip.count, %for.body.preheader ], [ %lsr.iv.next, %for.body ] + %lsr.iv22 = phi ptr [ %b, %for.body.preheader ], [ %scevgep23, %for.body ] + %lsr.iv = phi ptr [ %a, %for.body.preheader ], [ %scevgep, %for.body ] + %0 = load float, ptr %lsr.iv22, align 4 + %tobool = fcmp une float %0, 0.000000e+00 + %. = select i1 %tobool, float 1.000000e+00, float 2.000000e+00 + %add = fadd float %0, %. + store float %add, ptr %lsr.iv, align 4 + %scevgep = getelementptr i8, ptr %lsr.iv, i64 4 + %scevgep23 = getelementptr i8, ptr %lsr.iv22, i64 4 + %lsr.iv.next = add nsw i64 %lsr.iv24, -1 + %exitcond.not = icmp eq i64 %lsr.iv.next, 0 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body + } + +... 
+--- +name: KERNEL +tracksRegLiveness: true +liveins: + - { reg: '$x0', virtual-reg: '%7' } + - { reg: '$x1', virtual-reg: '%8' } + - { reg: '$w2', virtual-reg: '%9' } +body: | + bb.0.entry: + successors: %bb.1(0x50000000), %bb.2(0x30000000) + liveins: $x0, $x1, $w2 + + %9:gpr32common = COPY $w2 + %8:gpr64 = COPY $x1 + %7:gpr64 = COPY $x0 + dead $wzr = SUBSWri %9, 1, 0, implicit-def $nzcv + Bcc 11, %bb.2, implicit $nzcv + B %bb.1 + + bb.1.for.body.preheader: + %11:gpr32 = ORRWrs $wzr, %9, 0 + %0:gpr64all = SUBREG_TO_REG 0, killed %11, %subreg.sub_32 + %14:fpr32 = FMOVSi 0 + %15:fpr32 = FMOVSi 112 + B %bb.3 + + bb.2.for.cond.cleanup: + RET_ReallyLR + + bb.3.for.body: + successors: %bb.2(0x04000000), %bb.3(0x7c000000) + + %1:gpr64sp = PHI %0, %bb.1, %6, %bb.3 + %2:gpr64sp = PHI %8, %bb.1, %5, %bb.3 + %3:gpr64sp = PHI %7, %bb.1, %4, %bb.3 + early-clobber %12:gpr64sp, %13:fpr32 = LDRSpost %2, 4 :: (load (s32) from %ir.lsr.iv22) + nofpexcept FCMPSri %13, implicit-def $nzcv, implicit $fpcr + %16:fpr32 = FCSELSrrr %15, %14, 1, implicit $nzcv + %17:fpr32 = nofpexcept FADDSrr %13, killed %16, implicit $fpcr + early-clobber %18:gpr64sp = STRSpost killed %17, %3, 4 :: (store (s32) into %ir.lsr.iv) + %4:gpr64all = COPY %18 + %5:gpr64all = COPY %12 + %19:gpr64 = nsw SUBSXri %1, 1, 0, implicit-def $nzcv + %6:gpr64all = COPY %19 + Bcc 0, %bb.2, implicit $nzcv + B %bb.3 + +... 
diff --git a/llvm/test/CodeGen/AArch64/sms-unpipeline-insts2.mir b/llvm/test/CodeGen/AArch64/sms-unpipeline-insts2.mir new file mode 100644 index 0000000000000..fdecbffdd4490 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sms-unpipeline-insts2.mir @@ -0,0 +1,80 @@ +# RUN: llc --verify-machineinstrs -mtriple=aarch64 -o - %s -run-pass pipeliner -aarch64-enable-pipeliner -pipeliner-enable-copytophi=0 -debug-only=pipeliner 2>&1 | FileCheck %s + +# An acceptable loop by pipeliner TBB == LoopBB, FBB == ExitBB, Compare and branch +# CHECK: SU([[SU0:[0-9]+]]): [[V0:%[0-9]+]]:gpr64common = SUBXri [[V1:%[0-9]+]]:gpr64common, 1, 0 +# CHECK: Do not pipeline SU([[SU0:[0-9]+]]) + +--- | + define dso_local void @func(ptr noalias nocapture noundef writeonly %a, ptr nocapture noundef readonly %b, i32 noundef %n) local_unnamed_addr #0 { + entry: + %or.cond = icmp ult i32 %n, 2 + br i1 %or.cond, label %for.end, label %for.body.preheader + + for.body.preheader: ; preds = %entry + %i.07 = add i32 %n, -1 + %0 = sext i32 %i.07 to i64 + br label %for.body + + for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %1 = shl nsw i64 %indvars.iv, 2 + %scevgep = getelementptr i8, ptr %b, i64 %1 + %2 = load float, ptr %scevgep, align 4 + %add = fadd float %2, 1.000000e+00 + %3 = shl nsw i64 %indvars.iv, 2 + %scevgep11 = getelementptr i8, ptr %a, i64 %3 + store float %add, ptr %scevgep11, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %4 = add i64 %indvars.iv, -1 + %5 = and i64 %4, 4294967295 + %tobool.not = icmp eq i64 %5, 0 + br i1 %tobool.not, label %for.end, label %for.body + + for.end: ; preds = %for.body, %entry + ret void + } + +... 
+--- +name: func +tracksRegLiveness: true +liveins: + - { reg: '$x0', virtual-reg: '%3' } + - { reg: '$x1', virtual-reg: '%4' } + - { reg: '$w2', virtual-reg: '%5' } +body: | + bb.0.entry: + liveins: $x0, $x1, $w2 + + %5:gpr32common = COPY $w2 + %4:gpr64common = COPY $x1 + %3:gpr64common = COPY $x0 + dead $wzr = SUBSWri %5, 2, 0, implicit-def $nzcv + Bcc 3, %bb.3, implicit $nzcv + B %bb.1 + + bb.1.for.body.preheader: + %7:gpr32common = SUBWri %5, 1, 0 + %9:gpr64all = IMPLICIT_DEF + %8:gpr64 = SUBREG_TO_REG 0, killed %7, %subreg.sub_32 + %10:gpr64 = SBFMXri killed %8, 0, 31 + %0:gpr64all = COPY %10 + %12:fpr32 = FMOVSi 112 + + bb.2.for.body: + successors: %bb.3(0x04000000), %bb.2(0x7c000000) + + %1:gpr64common = PHI %0, %bb.1, %2, %bb.2 + %11:fpr32 = LDRSroX %4, %1, 0, 1 :: (load (s32) from %ir.scevgep) + %13:fpr32 = nofpexcept FADDSrr killed %11, %12, implicit $fpcr + STRSroX killed %13, %3, %1, 0, 1 :: (store (s32) into %ir.scevgep11) + %14:gpr64common = SUBXri %1, 1, 0 + %2:gpr64all = COPY %14 + %15:gpr32 = COPY %14.sub_32 + CBNZW killed %15, %bb.2 + B %bb.3 + + bb.3.for.end: + RET_ReallyLR + +... _______________________________________________ lldb-commits mailing list lldb-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/lldb-commits