Author: Florian Hahn
Date: 2020-09-13T18:47:56+01:00
New Revision: 638a188b6a0262fe26ad62353d71cdd384c40bd9
URL: https://github.com/llvm/llvm-project/commit/638a188b6a0262fe26ad62353d71cdd384c40bd9 DIFF: https://github.com/llvm/llvm-project/commit/638a188b6a0262fe26ad62353d71cdd384c40bd9.diff LOG: [LV] Generate RT checks up-front and remove them if required. Differential Revision: https://reviews.llvm.org/D75980 Added: llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll Modified: llvm/include/llvm/Transforms/Utils/LoopUtils.h llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h llvm/lib/Transforms/Utils/LoopUtils.cpp llvm/lib/Transforms/Utils/LoopVersioning.cpp llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp llvm/lib/Transforms/Vectorize/LoopVectorize.cpp llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll Removed: ################################################################################ diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 70c8c84c857b..bb72c19f8532 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -35,6 +35,7 @@ class MemorySSAUpdater; class OptimizationRemarkEmitter; class PredIteratorCache; class ScalarEvolution; +class ScalarEvolutionExpander; class SCEV; class SCEVExpander; class TargetLibraryInfo; @@ -446,7 +447,7 @@ Loop *cloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, std::pair<Instruction *, Instruction *> addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl<RuntimePointerCheck> &PointerChecks, - ScalarEvolution *SE); + SCEVExpander &Expander); } // end namespace llvm diff --git a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h index 77360cb2671d..829bdcbb2588 100644 --- a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h +++ b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h @@ -16,6 +16,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionNormalization.h" @@ -199,6 +200,8 @@ class SCEVExpander : public SCEVVisitor<SCEVExpander, Value *> { ChainedPhis.clear(); } + ScalarEvolution *getSE() { return &SE; } + /// Return a vector containing all instructions inserted during expansion. SmallVector<Instruction *, 32> getAllInsertedInstructions() const { SmallVector<Instruction *, 32> Result; @@ -509,10 +512,12 @@ class SCEVExpanderCleaner { SCEVExpanderCleaner(SCEVExpander &Expander, DominatorTree &DT) : Expander(Expander), DT(DT), ResultUsed(false) {} - ~SCEVExpanderCleaner(); + ~SCEVExpanderCleaner() { cleanup(); } /// Indicate that the result of the expansion is used. void markResultUsed() { ResultUsed = true; } + + void cleanup(); }; } // namespace llvm diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index d7cd9b19b8d5..dd808619f67a 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1574,7 +1574,8 @@ struct PointerBounds { /// in \p TheLoop. \return the values for the bounds. static PointerBounds expandBounds(const RuntimeCheckingPtrGroup *CG, Loop *TheLoop, Instruction *Loc, - SCEVExpander &Exp, ScalarEvolution *SE) { + SCEVExpander &Exp) { + ScalarEvolution *SE = Exp.getSE(); // TODO: Add helper to retrieve pointers to CG. 
Value *Ptr = CG->RtCheck.Pointers[CG->Members[0]].PointerValue; const SCEV *Sc = SE->getSCEV(Ptr); @@ -1613,16 +1614,15 @@ static PointerBounds expandBounds(const RuntimeCheckingPtrGroup *CG, /// lower bounds for both pointers in the check. static SmallVector<std::pair<PointerBounds, PointerBounds>, 4> expandBounds(const SmallVectorImpl<RuntimePointerCheck> &PointerChecks, Loop *L, - Instruction *Loc, ScalarEvolution *SE, SCEVExpander &Exp) { + Instruction *Loc, SCEVExpander &Exp) { SmallVector<std::pair<PointerBounds, PointerBounds>, 4> ChecksWithBounds; // Here we're relying on the SCEV Expander's cache to only emit code for the // same bounds once. transform(PointerChecks, std::back_inserter(ChecksWithBounds), [&](const RuntimePointerCheck &Check) { - PointerBounds First = expandBounds(Check.first, L, Loc, Exp, SE), - Second = - expandBounds(Check.second, L, Loc, Exp, SE); + PointerBounds First = expandBounds(Check.first, L, Loc, Exp), + Second = expandBounds(Check.second, L, Loc, Exp); return std::make_pair(First, Second); }); @@ -1632,12 +1632,10 @@ expandBounds(const SmallVectorImpl<RuntimePointerCheck> &PointerChecks, Loop *L, std::pair<Instruction *, Instruction *> llvm::addRuntimeChecks( Instruction *Loc, Loop *TheLoop, const SmallVectorImpl<RuntimePointerCheck> &PointerChecks, - ScalarEvolution *SE) { + SCEVExpander &Exp) { // TODO: Move noalias annotation code from LoopVersioning here and share with LV if possible. // TODO: Pass RtPtrChecking instead of PointerChecks and SE separately, if possible - const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout(); - SCEVExpander Exp(*SE, DL, "induction"); - auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, SE, Exp); + auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, Exp); LLVMContext &Ctx = Loc->getContext(); Instruction *FirstInst = nullptr; diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp index b4925064bc6b..d2c9def03785 100644 --- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp +++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp @@ -68,9 +68,12 @@ void LoopVersioning::versionLoop( // Add the memcheck in the original preheader (this is empty initially). BasicBlock *RuntimeCheckBB = VersionedLoop->getLoopPreheader(); const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); - std::tie(FirstCheckInst, MemRuntimeCheck) = - addRuntimeChecks(RuntimeCheckBB->getTerminator(), VersionedLoop, - AliasChecks, RtPtrChecking.getSE()); + + SCEVExpander Exp2(*RtPtrChecking.getSE(), + VersionedLoop->getHeader()->getModule()->getDataLayout(), + "induction"); + std::tie(FirstCheckInst, MemRuntimeCheck) = addRuntimeChecks( + RuntimeCheckBB->getTerminator(), VersionedLoop, AliasChecks, Exp2); const SCEVUnionPredicate &Pred = LAI.getPSE().getUnionPredicate(); SCEVExpander Exp(*SE, RuntimeCheckBB->getModule()->getDataLayout(), diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 165030c6d2f1..1d7e7882d7ea 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -2663,7 +2663,7 @@ bool isSafeToExpandAt(const SCEV *S, const Instruction *InsertionPoint, return false; } -SCEVExpanderCleaner::~SCEVExpanderCleaner() { +void SCEVExpanderCleaner::cleanup() { // Result is used, nothing to remove. 
if (ResultUsed) return; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 545540efc284..f9a0e6f35f50 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -411,9 +411,8 @@ static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) { return None; } - +struct GeneratedRTChecks; namespace llvm { - /// InnerLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). /// This class performs the widening of scalars into vectors, or multiple @@ -437,12 +436,12 @@ class InnerLoopVectorizer { OptimizationRemarkEmitter *ORE, ElementCount VecWidth, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI) + ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI), AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor), Builder(PSE.getSE()->getContext()), VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM), - BFI(BFI), PSI(PSI) { + BFI(BFI), PSI(PSI), Check(Check) { // Query this against the original loop and save it here because the profile // of the original loop header may change as the transformation happens. OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize( @@ -855,6 +854,10 @@ class InnerLoopVectorizer { // Whether this loop should be optimized for size based on profile guided size // optimizatios. bool OptForSizeBasedOnProfile; + + /// Structure to hold information about the generated runtime checks, + /// responsible for cleaning up the checks if they turn out to be + /// unprofitable. + GeneratedRTChecks &Check; }; class InnerLoopUnroller : public InnerLoopVectorizer { @@ -866,10 +869,9 @@ class InnerLoopUnroller : public InnerLoopVectorizer { OptimizationRemarkEmitter *ORE, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI) - : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, - ElementCount::getFixed(1), UnrollFactor, LVL, CM, - BFI, PSI) {} + ProfileSummaryInfo *PSI, GeneratedRTChecks &Check) + : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(1), + UnrollFactor, LVL, CM, BFI, PSI, Check) {} private: Value *getBroadcastInstrs(Value *V) override; @@ -1645,9 +1647,99 @@ class LoopVectorizationCostModel { /// Values to ignore in the cost model when VF > 1. SmallPtrSet<const Value *, 16> VecValuesToIgnore; }; - } // end namespace llvm +/// Helper struct to manage generating runtime checks for vectorization. +/// +/// The runtime checks are created up-front, in a temporary block un-linked +/// from the existing IR, to allow better cost estimation. After deciding to +/// vectorize, the checks are moved back. If deciding not to vectorize, the +/// temporary block is removed completely.
+struct GeneratedRTChecks { + BasicBlock *TmpBlock = nullptr; + BasicBlock *Preheader; + Value *SCEVCheck; + Instruction *FirstCheckInst = nullptr; + Instruction *MemRuntimeCheck = nullptr; + + ScalarEvolution &SE; + DominatorTree *DT; + + SCEVExpander Exp; + SCEVExpanderCleaner Cleaner; + + GeneratedRTChecks(BasicBlock *Preheader, ScalarEvolution &SE, + DominatorTree *DT) + : Preheader(Preheader), SE(SE), DT(DT), + Exp(SE, Preheader->getModule()->getDataLayout(), "scev.check"), + Cleaner(Exp, *DT) {} + + /// Generate runtime checks in a temporary block (TmpBlock), so we can + /// accurately estimate the cost of the runtime checks. The block is un-linked + /// from the IR and is added back during vector code generation. If there is + /// no vector code generation, the check block is removed completely. + void Create(Loop *L, const LoopAccessInfo &LAI, + const SCEVUnionPredicate &UnionPred, LoopInfo *LI) { + BasicBlock *LoopHeader = Preheader->getSingleSuccessor(); + TmpBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, + nullptr, "tmp.rtchecks"); + + SCEVCheck = + Exp.expandCodeForPredicate(&UnionPred, TmpBlock->getTerminator()); + + const auto &RtPtrChecking = *LAI.getRuntimePointerChecking(); + if (RtPtrChecking.Need) { + std::tie(FirstCheckInst, MemRuntimeCheck) = addRuntimeChecks( + TmpBlock->getTerminator(), L, RtPtrChecking.getChecks(), Exp); + assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " + "claimed checks are required"); + } + + // Unhook the temporary block containing the checks and update various + // places accordingly. + TmpBlock->replaceAllUsesWith(Preheader); + TmpBlock->getTerminator()->moveBefore(Preheader->getTerminator()); + Preheader->getTerminator()->eraseFromParent(); + DT->changeImmediateDominator(LoopHeader, Preheader); + DT->eraseNode(TmpBlock); + LI->removeBlock(TmpBlock); + } + + ~GeneratedRTChecks() { + if (!TmpBlock) { + Cleaner.markResultUsed(); + return; + } + + if (!SCEVCheck && TmpBlock->empty()) { + Cleaner.markResultUsed(); + TmpBlock->eraseFromParent(); + return; + } + + if (MemRuntimeCheck && !isa<Constant>(MemRuntimeCheck)) + MemRuntimeCheck->replaceAllUsesWith( + ConstantInt::getFalse(MemRuntimeCheck->getType()->getContext())); + if (SCEVCheck && !isa<Constant>(SCEVCheck)) + SCEVCheck->replaceAllUsesWith( + ConstantInt::getFalse(SCEVCheck->getType()->getContext())); + + SmallPtrSet<Value *, 8> Removed; + // Completely remove the block. + for (auto &I : make_early_inc_range(reverse(*TmpBlock))) { + if (Exp.isInsertedInstruction(&I)) + continue; + SE.forgetValue(&I); + SE.eraseValueFromMap(&I); + Removed.insert(&I); + I.eraseFromParent(); + } + + Cleaner.cleanup(); + TmpBlock->eraseFromParent(); + } +}; + // Return true if \p OuterLp is an outer loop annotated with hints for explicit // vectorization. The loop needs to be annotated with #pragma omp simd // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the @@ -2887,17 +2979,11 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L, void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { // Reuse existing vector loop preheader for SCEV checks. // Note that new preheader block is generated for vector loop. - BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader; + BasicBlock *const SCEVCheckBlock = Check.TmpBlock; - // Generate the code to check that the SCEV assumptions that we made. - // We want the new basic block to start at the first instruction in a - // sequence of instructions that form a check.
- SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(), - "scev.check"); - Value *SCEVCheck = Exp.expandCodeForPredicate( - &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator()); - - if (auto *C = dyn_cast<ConstantInt>(SCEVCheck)) + if (!Check.TmpBlock) + return; + if (auto *C = dyn_cast<ConstantInt>(Check.SCEVCheck)) if (C->isZero()) return; @@ -2907,10 +2993,30 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { "Cannot SCEV check stride or overflow when optimizing for size"); SCEVCheckBlock->setName("vector.scevcheck"); + + auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); + + BranchInst::Create(LoopVectorPreHeader, Check.TmpBlock); // Create new preheader for vector loop. - LoopVectorPreHeader = - SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI, - nullptr, "vector.ph"); + Check.TmpBlock = SplitBlock(SCEVCheckBlock, + cast<Instruction>(Check.SCEVCheck)->getNextNode(), + nullptr, nullptr, nullptr, ""); + + if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) + PL->addBasicBlockToLoop(SCEVCheckBlock, *LI); + + Check.TmpBlock->replaceAllUsesWith(SCEVCheckBlock); + Check.TmpBlock->getTerminator()->moveBefore(SCEVCheckBlock->getTerminator()); + SCEVCheckBlock->getTerminator()->eraseFromParent(); + SCEVCheckBlock->moveBefore(LoopVectorPreHeader); + + auto *PHTerm = Pred->getTerminator(); + for (unsigned i = 0; i < PHTerm->getNumSuccessors(); i++) + if (PHTerm->getSuccessor(i) == LoopVectorPreHeader) + PHTerm->setSuccessor(i, SCEVCheckBlock); + + DT->addNewBlock(SCEVCheckBlock, Pred); + DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock); // Update dominator only if this is first RT check. if (LoopBypassBlocks.empty()) { @@ -2920,9 +3026,10 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) { ReplaceInstWithInst( SCEVCheckBlock->getTerminator(), - BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck)); + BranchInst::Create(Bypass, LoopVectorPreHeader, Check.SCEVCheck)); LoopBypassBlocks.push_back(SCEVCheckBlock); AddedSafetyChecks = true; + Check.SCEVCheck = nullptr; } void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { @@ -2932,14 +3039,12 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { // Reuse existing vector loop preheader for runtime memory checks. // Note that new preheader block is generated for vector loop. - BasicBlock *const MemCheckBlock = L->getLoopPreheader(); + BasicBlock *const MemCheckBlock = Check.TmpBlock; - // Generate the code that checks in runtime if arrays overlap. We put the - // checks into a separate block to make the more common case of few elements - // faster. - auto *LAI = Legal->getLAI(); - const auto &RtPtrChecking = *LAI->getRuntimePointerChecking(); - if (!RtPtrChecking.Need) + // Check if we generated code that checks in runtime if arrays overlap. We put + // the checks into a separate block to make the more common case of few + // elements faster. 
+ if (!Check.MemRuntimeCheck) return; if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) { @@ -2956,11 +3061,22 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { }); } + auto *Pred = LoopVectorPreHeader->getSinglePredecessor(); + auto *PHTerm = Pred->getTerminator(); + for (unsigned i = 0; i < PHTerm->getNumSuccessors(); i++) + if (PHTerm->getSuccessor(i) == LoopVectorPreHeader) + PHTerm->setSuccessor(i, Check.TmpBlock); + auto *BI = BranchInst::Create(LoopVectorPreHeader, Check.TmpBlock); + BI->setDebugLoc(PHTerm->getDebugLoc()); + + DT->addNewBlock(Check.TmpBlock, Pred); + DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock); + Check.TmpBlock->moveBefore(LoopVectorPreHeader); + + if (auto *PL = LI->getLoopFor(LoopVectorPreHeader)) + PL->addBasicBlockToLoop(Check.TmpBlock, *LI); + MemCheckBlock->setName("vector.memcheck"); - // Create new preheader for vector loop. - LoopVectorPreHeader = - SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr, - "vector.ph"); auto *CondBranch = cast<BranchInst>( Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader)); @@ -2973,15 +3089,13 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { DT->changeImmediateDominator(Bypass, MemCheckBlock); DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock); } + Check.TmpBlock = nullptr; - Instruction *FirstCheckInst; - Instruction *MemRuntimeCheck; - std::tie(FirstCheckInst, MemRuntimeCheck) = - addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop, - RtPtrChecking.getChecks(), RtPtrChecking.getSE()); - assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking " - "claimed checks are required"); - CondBranch->setCondition(MemRuntimeCheck); + ReplaceInstWithInst( + MemCheckBlock->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, Check.MemRuntimeCheck)); + LoopBypassBlocks.push_back(MemCheckBlock); + AddedSafetyChecks = true; // We currently don't use LoopVersioning for the actual loop cloning but we // still use it to add the noalias metadata. @@ -8220,8 +8334,9 @@ static bool processLoopInVPlanNativePath( LVP.setBestPlan(VF.Width, 1); + GeneratedRTChecks Checks(L->getLoopPreheader(), *PSE.getSE(), DT); InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL, - &CM, BFI, PSI); + &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); LVP.executePlan(LB, DT); @@ -8229,7 +8344,6 @@ static bool processLoopInVPlanNativePath( // Mark the loop as already vectorized to avoid vectorizing again. Hints.setAlreadyVectorized(); - assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); return true; } @@ -8387,6 +8501,11 @@ bool LoopVectorizePass::processLoop(Loop *L) { IC = CM.selectInterleaveCount(VF.Width, VF.Cost); } + // Optimistically generate runtime checks. Drop them if they turn out to not + // be profitable. + GeneratedRTChecks Checks(L->getLoopPreheader(), *PSE.getSE(), DT); + Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate(), LI); + // Identify the diagnostic messages that should be produced. std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg; bool VectorizeLoop = true, InterleaveLoop = true; @@ -8487,7 +8606,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // If we decided that it is not legal to vectorize the loop, then // interleave it. 
InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, - BFI, PSI); + BFI, PSI, Checks); LVP.executePlan(Unroller, DT); ORE->emit([&]() { @@ -8499,7 +8618,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { } else { // If we decided that it is *legal* to vectorize the loop, then do it. InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, - &LVL, &CM, BFI, PSI); + &LVL, &CM, BFI, PSI, Checks); LVP.executePlan(LB, DT); ++LoopsVectorized; @@ -8532,7 +8651,6 @@ bool LoopVectorizePass::processLoop(Loop *L) { Hints.setAlreadyVectorized(); } - assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs())); return true; } @@ -8598,6 +8716,8 @@ LoopVectorizeResult LoopVectorizePass::runImpl( Changed |= CFGChanged |= processLoop(L); } + assert(!Changed || !verifyFunction(F, &dbgs())); + // Process each loop nest in the function. return LoopVectorizeResult(Changed, CFGChanged); } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll b/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll index 6b7e809046ec..25911c61fa43 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/pr36032.ll @@ -47,11 +47,10 @@ define void @_Z1dv() local_unnamed_addr #0 { ; CHECK-NEXT: br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, i8* [[CALL]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[G_0]], [[CONV]] -; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64 -; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr [6 x i8], [6 x i8]* @c, i64 0, i64 [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = sub i64 [[TMP15]], [[TMP0]] -; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @c, i64 0, i64 4), i64 [[TMP16]] +; CHECK-NEXT: [[TMP14:%.*]] = zext i32 [[TMP3]] to i64 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr [6 x i8], [6 x i8]* @c, i64 0, i64 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[TMP14]], [[TMP0]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @c, i64 0, i64 4), i64 [[TMP15]] ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP]], [[SCEVGEP3]] ; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP2]], [[SCEVGEP1]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] @@ -65,23 +64,23 @@ define void @_Z1dv() local_unnamed_addr #0 { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]] -; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[OFFSET_IDX4:%.*]] = add i64 [[TMP0]], [[INDEX]] -; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[OFFSET_IDX4]] to i32 -; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 0 -; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[CONV]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP20]] to i64 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [6 x i8], [6 x i8]* @c, i64 0, i64 [[TMP21]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TMP22]], i32 0 -; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP23]] to <4 x i8>* -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP24]], align 1, !alias.scope !0 -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, i8* [[CALL]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, i8* 
[[TMP25]], i32 0 -; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to <4 x i8>* -; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP27]], align 1, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[OFFSET_IDX4]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[TMP17]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[CONV]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [6 x i8], [6 x i8]* @c, i64 0, i64 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[TMP21]], i32 0 +; CHECK-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP22]] to <4 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP23]], align 1, !alias.scope !0 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, i8* [[CALL]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, i8* [[TMP24]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = bitcast i8* [[TMP25]] to <4 x i8>* +; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP26]], align 1, !alias.scope !3, !noalias !0 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -96,13 +95,13 @@ define void @_Z1dv() local_unnamed_addr #0 { ; CHECK-NEXT: br label [[FOR_COND]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP29:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[CONV]], [[TMP29]] +; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[INDVARS_IV]] to i32 +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[CONV]], [[TMP28]] ; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[ADD]] to i64 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [6 x i8], [6 x i8]* @c, i64 0, i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP30:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, i8* [[CALL]], i64 [[INDVARS_IV]] -; CHECK-NEXT: store i8 [[TMP30]], i8* [[ARRAYIDX3]], align 1 +; CHECK-NEXT: store i8 [[TMP29]], i8* [[ARRAYIDX3]], align 1 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop !7 diff --git a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll index 3bb73e97a536..cad0df5daac2 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll @@ -26,9 +26,9 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 { ; CHECK-NEXT: br i1 [[CMP27]], label [[FOR_BODY3_LR_PH_US_PREHEADER:%.*]], label [[FOR_END15:%.*]] ; CHECK: for.body3.lr.ph.us.preheader: ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[M]], -1 -; 
CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[K:%.*]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[K:%.*]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP0]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 ; CHECK-NEXT: br label [[FOR_BODY3_LR_PH_US:%.*]] ; CHECK: for.end.us: ; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV33:%.*]] @@ -54,12 +54,12 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 { ; CHECK-NEXT: br i1 [[EXITCOND32]], label [[FOR_END_US:%.*]], label [[FOR_BODY3_US]], !llvm.loop !3 ; CHECK: for.body3.lr.ph.us: ; CHECK-NEXT: [[INDVARS_IV33]] = phi i64 [ [[INDVARS_IV_NEXT34]], [[FOR_END_US]] ], [ 0, [[FOR_BODY3_LR_PH_US_PREHEADER]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP3]], [[INDVARS_IV33]] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP1]], [[INDVARS_IV33]] ; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 ; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[INDVARS_IV33]] to i32 ; CHECK-NEXT: [[ADD_US]] = add i32 [[TMP9]], [[K]] ; CHECK-NEXT: [[ARRAYIDX7_US]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[VECTOR_SCEVCHECK:%.*]] ; CHECK: vector.scevcheck: ; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]]) @@ -74,8 +74,8 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 { ; CHECK-NEXT: [[TMP16:%.*]] = or i1 false, [[TMP15]] ; CHECK-NEXT: br i1 [[TMP16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -100,7 +100,7 @@ define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 { ; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5 ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY3_LR_PH_US]] ], [ 0, [[VECTOR_SCEVCHECK]] ] diff --git a/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll new file mode 100644 index 000000000000..1ae3a9b87715 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/runtime-drop-crash.ll @@ -0,0 +1,32 @@ +; RUN: opt -loop-vectorize -force-vector-width=4 %s | FileCheck %s + +%struct.foo = type { [400 x double] } + +; Make sure we do not crash when dropping runtime checks. 
+ + ; CHECK-NOT: vector.body + + define void @barney(%struct.foo* %ptr) { +entry: + br label %loop + +loop: + %tmp3 = phi i64 [ 0, %entry ], [ %tmp18, %loop ] + %tmp4 = getelementptr inbounds %struct.foo, %struct.foo* %ptr, i64 undef + %tmp5 = bitcast %struct.foo* %tmp4 to i64* + store i64 0, i64* %tmp5, align 8 + %tmp8 = add i64 1, %tmp3 + %tmp10 = getelementptr inbounds %struct.foo, %struct.foo* %ptr, i64 %tmp8 + %tmp11 = bitcast %struct.foo* %tmp10 to i64* + store i64 1, i64* %tmp11, align 8 + %tmp14 = add i64 undef, %tmp3 + %tmp16 = getelementptr inbounds %struct.foo, %struct.foo* %ptr, i64 %tmp14 + %tmp17 = bitcast %struct.foo* %tmp16 to i64* + store i64 2, i64* %tmp17, align 8 + %tmp18 = add nuw nsw i64 %tmp3, 4 + %c = icmp ult i64 %tmp18, 400 + br i1 %c, label %exit, label %loop + +exit: + ret void +}
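In short, the mechanism this commit introduces is: LoopVectorizePass::processLoop now builds the SCEV and memory runtime checks up-front into an un-linked tmp.rtchecks block (GeneratedRTChecks::Create), emitSCEVChecks/emitMemRuntimeChecks splice that block back into the CFG once the decision to vectorize is made, and the GeneratedRTChecks destructor erases the block otherwise. A minimal self-contained C++ sketch of this create-speculatively/clean-up-via-destructor pattern follows; the class and names below are illustrative stand-ins, not LLVM APIs.

#include <iostream>
#include <string>

// Toy stand-in for GeneratedRTChecks: checks are built speculatively and
// erased on destruction unless something wired them into the final IR.
class SpeculativeChecks {
  std::string Block = "tmp.rtchecks";
  bool Used = false;

public:
  // Mirrors GeneratedRTChecks::Create(): emit the checks before the cost
  // model runs, so their cost can in principle be taken into account.
  void create() { std::cout << "emitting checks into " << Block << "\n"; }

  // Mirrors emitSCEVChecks()/emitMemRuntimeChecks(): splice the block back
  // into the control-flow graph once vectorization is a go.
  void linkIntoIR() {
    Used = true;
    std::cout << "linking " << Block << " before the vector preheader\n";
  }

  // Mirrors ~GeneratedRTChecks(): drop everything if the checks were never
  // used, so no dead check code is left behind in the function.
  ~SpeculativeChecks() {
    if (!Used)
      std::cout << "erasing unused " << Block << "\n";
  }
};

int main() {
  {
    SpeculativeChecks Checks;
    Checks.create();
    bool Profitable = false; // stand-in for the cost-model verdict
    if (Profitable)
      Checks.linkIntoIR();
  } // destructor runs here: the unprofitable checks are removed
  return 0;
}

The design point the sketch captures is the same one the patch encodes: because cleanup lives in a destructor (helped by SCEVExpanderCleaner for expander-inserted instructions), every early-exit path through processLoop drops the speculative checks without extra bookkeeping.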