llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-vectorizers

Author: Sam Tebbs (SamTebbs33)

<details>
<summary>Changes</summary>

Partial reductions can easily be represented by the VPReductionRecipe class by setting their scale factor to something greater than 1. This PR merges VPPartialReductionRecipe into VPReductionRecipe and gives VPReductionRecipe a VFScaleFactor so that it can choose to generate the partial reduction intrinsic at execute time.

Depends on https://github.com/llvm/llvm-project/pull/144281

---

Patch is 364.55 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/146073.diff

18 Files Affected:

- (modified) llvm/include/llvm/Analysis/TargetTransformInfo.h (+2)
- (modified) llvm/lib/Analysis/TargetTransformInfo.cpp (+15-4)
- (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+20-14)
- (modified) llvm/lib/Transforms/Vectorize/VPlan.h (+50-87)
- (modified) llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp (+8-7)
- (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+50-84)
- (modified) llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp (+45-13)
- (modified) llvm/lib/Transforms/Vectorize/VPlanValue.h (-1)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll (+86-66)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll (+85-165)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-mixed.ll (+58-58)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll (+285-525)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll (+384-361)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-interleave.ll (+13-13)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll (+12-12)
- (modified) llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll (+9-12)
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll (+26-26)
- (modified) llvm/unittests/Transforms/Vectorize/VPlanTest.cpp (+4-4)


``````````diff
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 90d92e0fcf55c..73f4a6b2af853 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -223,6 +223,8 @@ class TargetTransformInfo {
   /// Get the kind of extension that an instruction represents.
   LLVM_ABI static PartialReductionExtendKind
   getPartialReductionExtendKind(Instruction *I);
+  LLVM_ABI static PartialReductionExtendKind
+  getPartialReductionExtendKind(Instruction::CastOps CastOpc);
 
   /// Construct a TTI object using a type implementing the \c Concept
   /// API below.
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 3ebd9d487ba04..b81403cc35272 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1000,11 +1000,22 @@ InstructionCost TargetTransformInfo::getShuffleCost(
 }
 
 TargetTransformInfo::PartialReductionExtendKind
-TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) {
-  if (isa<SExtInst>(I))
-    return PR_SignExtend;
-  if (isa<ZExtInst>(I))
+TargetTransformInfo::getPartialReductionExtendKind(
+    Instruction::CastOps CastOpc) {
+  switch (CastOpc) {
+  case Instruction::CastOps::ZExt:
     return PR_ZeroExtend;
+  case Instruction::CastOps::SExt:
+    return PR_SignExtend;
+  default:
+    return PR_None;
+  }
+}
+
+TargetTransformInfo::PartialReductionExtendKind
+TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) {
+  if (auto *Cast = dyn_cast<CastInst>(I))
+    return getPartialReductionExtendKind(Cast->getOpcode());
   return PR_None;
 }
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index bb29e4fc6d232..60d06fb9ce4fd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7050,7 +7050,8 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
       }
       // The VPlan-based cost model is more accurate for partial reduction and
      // comparing against the legacy cost isn't desirable.
-      if (isa<VPPartialReductionRecipe>(&R))
+      if (auto *VPR = dyn_cast<VPReductionRecipe>(&R); VPR &&
+          VPR->isPartialReduction())
        return true;
 
      /// If a VPlan transform folded a recipe to one producing a single-scalar,
@@ -8280,11 +8281,14 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
           Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
 
       // If the PHI is used by a partial reduction, set the scale factor.
-      unsigned ScaleFactor =
-          getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
-      PhiRecipe = new VPReductionPHIRecipe(
-          Phi, RdxDesc, *StartV, CM.isInLoopReduction(Phi),
-          CM.useOrderedReductions(RdxDesc), ScaleFactor);
+      bool UseInLoopReduction = CM.isInLoopReduction(Phi);
+      bool UseOrderedReductions = CM.useOrderedReductions(RdxDesc);
+      auto ScaleFactor =
+          (UseOrderedReductions || UseInLoopReduction)
+              ? 0
+              : getScalingForReduction(RdxDesc.getLoopExitInstr()).value_or(1);
+      PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
+                                           UseOrderedReductions, ScaleFactor);
     } else {
       // TODO: Currently fixed-order recurrences are modeled as chains of
       // first-order recurrences. If there are no users of the intermediate
@@ -8348,7 +8352,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
   VPValue *Accumulator = Operands[1];
   VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
   if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
-      isa<VPPartialReductionRecipe>(BinOpRecipe))
+      (isa<VPReductionRecipe>(BinOpRecipe) &&
+       cast<VPReductionRecipe>(BinOpRecipe)->isPartialReduction()))
     std::swap(BinOp, Accumulator);
 
   unsigned ReductionOpcode = Reduction->getOpcode();
@@ -8369,12 +8374,10 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
            "Expected an ADD or SUB operation for predicated partial "
            "reductions (because the neutral element in the mask is zero)!");
     Cond = getBlockInMask(Builder.getInsertBlock());
-    VPValue *Zero =
-        Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0));
-    BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
   }
-  return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
-                                      ScaleFactor, Reduction);
+
+  return new VPReductionRecipe(RecurKind::Add, FastMathFlags(), Reduction,
+                               Accumulator, BinOp, Cond, false, ScaleFactor);
 }
 
 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
@@ -9154,9 +9157,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
     FastMathFlags FMFs = isa<FPMathOperator>(CurrentLinkI)
                              ? RdxDesc.getFastMathFlags()
                              : FastMathFlags();
+    bool UseOrderedReductions = CM.useOrderedReductions(RdxDesc);
+    unsigned VFScaleFactor = !UseOrderedReductions;
     auto *RedRecipe = new VPReductionRecipe(
         Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp,
-        CM.useOrderedReductions(RdxDesc), CurrentLinkI->getDebugLoc());
+        UseOrderedReductions, VFScaleFactor, CurrentLinkI->getDebugLoc());
     // Append the recipe to the end of the VPBasicBlock because we need to
     // ensure that it comes after all of it's inputs, including CondOp.
     // Delete CurrentLink as it will be invalid if its operand is replaced
@@ -9190,8 +9195,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
     // Don't output selects for partial reductions because they have an output
     // with fewer lanes than the VF. So the operands of the select would have
     // different numbers of lanes. Partial reductions mask the input instead.
+    auto *RR = dyn_cast<VPReductionRecipe>(OrigExitingVPV->getDefiningRecipe());
     if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
-        !isa<VPPartialReductionRecipe>(OrigExitingVPV->getDefiningRecipe())) {
+        (!RR || !RR->isPartialReduction())) {
       VPValue *Cond = RecipeBuilder.getBlockInMask(PhiR->getParent());
       std::optional<FastMathFlags> FMFs =
           PhiTy->isFloatingPointTy()
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 39e9c55abfa2c..ebeba3f198113 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -552,7 +552,6 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPWidenIntOrFpInductionSC:
     case VPRecipeBase::VPWidenPointerInductionSC:
     case VPRecipeBase::VPReductionPHISC:
-    case VPRecipeBase::VPPartialReductionSC:
       return true;
     case VPRecipeBase::VPBranchOnMaskSC:
     case VPRecipeBase::VPInterleaveSC:
@@ -2182,34 +2181,37 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
   /// Descriptor for the reduction.
   const RecurrenceDescriptor &RdxDesc;
 
-  /// The phi is part of an in-loop reduction.
-  bool IsInLoop;
-
   /// The phi is part of an ordered reduction. Requires IsInLoop to be true.
   bool IsOrdered;
 
-  /// When expanding the reduction PHI, the plan's VF element count is divided
-  /// by this factor to form the reduction phi's VF.
-  unsigned VFScaleFactor = 1;
+  /// The scaling factor, relative to the VF, that this recipe's output is
+  /// divided by.
+  /// For outer-loop reductions this is equal to 1.
+  /// For in-loop reductions this is equal to 0, to specify that this is equal
+  /// to the VF (which may not be known yet). For partial-reductions this is
+  /// equal to another scalar value.
+  unsigned VFScaleFactor;
 
 public:
   /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p
   /// RdxDesc.
   VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
-                       VPValue &Start, bool IsInLoop = false,
-                       bool IsOrdered = false, unsigned VFScaleFactor = 1)
+                       VPValue &Start, bool IsOrdered = false,
+                       unsigned VFScaleFactor = 1)
       : VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start),
-        RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered),
-        VFScaleFactor(VFScaleFactor) {
-    assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
+        RdxDesc(RdxDesc), IsOrdered(IsOrdered), VFScaleFactor(VFScaleFactor) {
+    assert((!IsOrdered || isInLoop()) &&
+           "IsOrdered requires the reduction to be in-loop");
+    assert(((!isInLoop() && !IsOrdered) || isInLoop()) &&
+           "Invalid VFScaleFactor");
   }
 
   ~VPReductionPHIRecipe() override = default;
 
   VPReductionPHIRecipe *clone() override {
-    auto *R = new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()),
-                                       RdxDesc, *getOperand(0), IsInLoop,
-                                       IsOrdered, VFScaleFactor);
+    auto *R =
+        new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()), RdxDesc,
+                                 *getOperand(0), IsOrdered, VFScaleFactor);
     R->addOperand(getBackedgeValue());
     return R;
   }
@@ -2235,8 +2237,10 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
   /// Returns true, if the phi is part of an ordered reduction.
   bool isOrdered() const { return IsOrdered; }
 
-  /// Returns true, if the phi is part of an in-loop reduction.
-  bool isInLoop() const { return IsInLoop; }
+  /// Returns true if the phi is part of an in-loop reduction.
+  bool isInLoop() const { return VFScaleFactor == 0; }
+
+  bool isPartialReduction() const { return VFScaleFactor > 1; }
 
   /// Returns true if the recipe only uses the first lane of operand \p Op.
   bool onlyFirstLaneUsed(const VPValue *Op) const override {
@@ -2409,23 +2413,32 @@ class VPInterleaveRecipe : public VPRecipeBase {
   Instruction *getInsertPos() const { return IG->getInsertPos(); }
 };
 
-/// A recipe to represent inloop reduction operations, performing a reduction on
-/// a vector operand into a scalar value, and adding the result to a chain.
-/// The Operands are {ChainOp, VecOp, [Condition]}.
+/// A recipe to represent inloop, ordered or partial reduction operations. It
+/// performs a reduction on a vector operand into a scalar (vector in the case
+/// of a partial reduction) value, and adds the result to a chain. The Operands
+/// are {ChainOp, VecOp, [Condition]}.
 class VPReductionRecipe : public VPRecipeWithIRFlags {
   /// The recurrence kind for the reduction in question.
   RecurKind RdxKind;
   bool IsOrdered;
   /// Whether the reduction is conditional.
   bool IsConditional = false;
+  /// The scaling factor, relative to the VF, that this recipe's output is
+  /// divided by.
+  /// For outer-loop reductions this is equal to 1.
+  /// For in-loop reductions this is equal to 0, to specify that this is equal
+  /// to the VF (which may not be known yet).
+  /// For partial-reductions this is equal to another scalar value.
+  unsigned VFScaleFactor;
 
 protected:
   VPReductionRecipe(const unsigned char SC, RecurKind RdxKind,
                     FastMathFlags FMFs, Instruction *I,
                     ArrayRef<VPValue *> Operands, VPValue *CondOp,
-                    bool IsOrdered, DebugLoc DL)
+                    bool IsOrdered, unsigned VFScaleFactor, DebugLoc DL)
       : VPRecipeWithIRFlags(SC, Operands, FMFs, DL), RdxKind(RdxKind),
-        IsOrdered(IsOrdered) {
+        IsOrdered(IsOrdered), VFScaleFactor(VFScaleFactor) {
+    assert((!IsOrdered || VFScaleFactor == 0) && "Invalid scale factor");
     if (CondOp) {
       IsConditional = true;
       addOperand(CondOp);
@@ -2436,24 +2449,24 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
 public:
   VPReductionRecipe(RecurKind RdxKind, FastMathFlags FMFs, Instruction *I,
                     VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
-                    bool IsOrdered, DebugLoc DL = {})
+                    bool IsOrdered, unsigned VFScaleFactor, DebugLoc DL = {})
       : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, I,
                           ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
-                          IsOrdered, DL) {}
+                          IsOrdered, VFScaleFactor, DL) {}
 
   VPReductionRecipe(const RecurKind RdxKind, FastMathFlags FMFs,
                     VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
-                    bool IsOrdered, DebugLoc DL = {})
+                    bool IsOrdered, unsigned VFScaleFactor, DebugLoc DL = {})
       : VPReductionRecipe(VPDef::VPReductionSC, RdxKind, FMFs, nullptr,
                           ArrayRef<VPValue *>({ChainOp, VecOp}), CondOp,
-                          IsOrdered, DL) {}
+                          IsOrdered, VFScaleFactor, DL) {}
 
   ~VPReductionRecipe() override = default;
 
   VPReductionRecipe *clone() override {
-    return new VPReductionRecipe(RdxKind, getFastMathFlags(),
-                                 getUnderlyingInstr(), getChainOp(), getVecOp(),
-                                 getCondOp(), IsOrdered, getDebugLoc());
+    return new VPReductionRecipe(
+        RdxKind, getFastMathFlags(), getUnderlyingInstr(), getChainOp(),
+        getVecOp(), getCondOp(), IsOrdered, VFScaleFactor, getDebugLoc());
   }
 
   static inline bool classof(const VPRecipeBase *R) {
@@ -2485,6 +2498,8 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
   bool isOrdered() const { return IsOrdered; };
   /// Return true if the in-loop reduction is conditional.
   bool isConditional() const { return IsConditional; };
+  /// Return true if the reduction is a partial reduction.
+  bool isPartialReduction() const { return VFScaleFactor > 1; }
   /// The VPValue of the scalar Chain being accumulated.
   VPValue *getChainOp() const { return getOperand(0); }
   /// The VPValue of the vector value to be reduced.
@@ -2493,65 +2508,8 @@ class VPReductionRecipe : public VPRecipeWithIRFlags {
   VPValue *getCondOp() const {
     return isConditional() ? getOperand(getNumOperands() - 1) : nullptr;
   }
-};
-
-/// A recipe for forming partial reductions. In the loop, an accumulator and
-/// vector operand are added together and passed to the next iteration as the
-/// next accumulator. After the loop body, the accumulator is reduced to a
-/// scalar value.
-class VPPartialReductionRecipe : public VPReductionRecipe {
-  unsigned Opcode;
-
-  /// The divisor by which the VF of this recipe's output should be divided
-  /// during execution.
-  unsigned VFScaleFactor;
-
-public:
-  VPPartialReductionRecipe(Instruction *ReductionInst, VPValue *Op0,
-                           VPValue *Op1, VPValue *Cond, unsigned VFScaleFactor)
-      : VPPartialReductionRecipe(ReductionInst->getOpcode(), Op0, Op1, Cond,
-                                 VFScaleFactor, ReductionInst) {}
-  VPPartialReductionRecipe(unsigned Opcode, VPValue *Op0, VPValue *Op1,
-                           VPValue *Cond, unsigned ScaleFactor,
-                           Instruction *ReductionInst = nullptr)
-      : VPReductionRecipe(VPDef::VPPartialReductionSC, RecurKind::Add,
-                          FastMathFlags(), ReductionInst,
-                          ArrayRef<VPValue *>({Op0, Op1}), Cond, false, {}),
-        Opcode(Opcode), VFScaleFactor(ScaleFactor) {
-    [[maybe_unused]] auto *AccumulatorRecipe =
-        getChainOp()->getDefiningRecipe();
-    assert((isa<VPReductionPHIRecipe>(AccumulatorRecipe) ||
-            isa<VPPartialReductionRecipe>(AccumulatorRecipe)) &&
-           "Unexpected operand order for partial reduction recipe");
-  }
-  ~VPPartialReductionRecipe() override = default;
-
-  VPPartialReductionRecipe *clone() override {
-    return new VPPartialReductionRecipe(Opcode, getOperand(0), getOperand(1),
-                                        getCondOp(), VFScaleFactor,
-                                        getUnderlyingInstr());
-  }
-
-  VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
-
-  /// Generate the reduction in the loop.
-  void execute(VPTransformState &State) override;
-
-  /// Return the cost of this VPPartialReductionRecipe.
-  InstructionCost computeCost(ElementCount VF,
-                              VPCostContext &Ctx) const override;
-
-  /// Get the binary op's opcode.
-  unsigned getOpcode() const { return Opcode; }
-
   /// Get the factor that the VF of this recipe's output should be scaled by.
   unsigned getVFScaleFactor() const { return VFScaleFactor; }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  /// Print the recipe.
-  void print(raw_ostream &O, const Twine &Indent,
-             VPSlotTracker &SlotTracker) const override;
-#endif
 };
 
 /// A recipe to represent inloop reduction operations with vector-predication
@@ -2567,7 +2525,7 @@ class VPReductionEVLRecipe : public VPReductionRecipe {
             R.getFastMathFlags(),
             cast_or_null<Instruction>(R.getUnderlyingValue()),
             ArrayRef<VPValue *>({R.getChainOp(), R.getVecOp(), &EVL}), CondOp,
-            R.isOrdered(), DL) {}
+            R.isOrdered(), 0, DL) {}
 
   ~VPReductionEVLRecipe() override = default;
 
@@ -2768,6 +2726,11 @@ class VPSingleDefBundleRecipe : public VPSingleDefRecipe {
                           VPWidenRecipe *Mul, VPReductionRecipe *Red)
       : VPSingleDefBundleRecipe(BundleTypes::ExtMulAccumulateReduction,
                                 {Ext0, Ext1, Mul, Red}) {}
+  VPSingleDefBundleRecipe(VPWidenCastRecipe *Ext0, VPWidenCastRecipe *Ext1,
+                          VPWidenRecipe *Mul, VPWidenRecipe *Sub,
+                          VPReductionRecipe *Red)
+      : VPSingleDefBundleRecipe(BundleTypes::ExtMulAccumulateReduction,
+                                {Ext0, Ext1, Mul, Sub, Red}) {}
 
   ~VPSingleDefBundleRecipe() override {
     SmallPtrSet<VPRecipeBase *, 4> Seen;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index effcb1e201cc7..37060c87bea8f 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -283,10 +283,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
               [](const auto *R) { return R->getScalarType(); })
           .Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
                 VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
-                VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe,
-                VPPartialReductionRecipe>([this](const VPRecipeBase *R) {
-                  return inferScalarType(R->getOperand(0));
-                })
+                VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe>(
+              [this](const VPRecipeBase *R) {
+                return inferScalarType(R->getOperand(0));
+              })
           // VPInstructionWithType must be handled before VPInstruction.
           .Case<VPInstructionWithType, VPWidenIntrinsicRecipe,
                 VPWidenCastRecipe>(
@@ -397,7 +397,7 @@ bool VPDominatorTree::properlyDominates(const VPRecipeBase *A,
 static unsigned getVFScaleFactor(VPRecipeBase *R) {
   if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
     return RR->getVFScaleFactor();
-  if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
+  if (auto *RR = dyn_cast<VPReductionRecipe>(R))
     return RR->getVFScaleFactor();
   assert(
       (!isa<VPInstruction>(R) || cast<VPInstruction>(R)->getOpcode() !=
@@ -567,8 +567,9 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
       } else {
         // The output from scaled phis and scaled reductions actually has
         // fewer lanes than the VF.
-        unsigned ScaleFactor = getVFScaleFactor(R);
-        ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
+        ElementCount VF = VFs[J];
+        if (unsigned ScaleFactor = getVFScaleFactor(R))
+          VF = VF.divideCoefficientBy(ScaleFactor);
         LLVM_DEBUG(if (VF != VFs[J]) {
           dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << "...
[truncated]
``````````

</details>

https://github.com/llvm/llvm-project/pull/146073

_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
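For readers skimming the patch, the VFScaleFactor convention in the new comments can be summarised as: 0 means an in-loop or ordered reduction (a single scalar result), 1 means an ordinary wide reduction, and a value greater than 1 means a partial reduction whose output has VF / VFScaleFactor lanes. The sketch below is plain standalone C++, not LLVM API code; the helper names and the modulo lane-folding order are illustrative assumptions only (the partial-reduction intrinsic does not fix how wide lanes are grouped onto accumulator lanes).

```cpp
#include <cassert>
#include <cstddef>
#include <cstdio>
#include <vector>

// Lanes produced by a reduction recipe for a given VF under the convention
// described in the patch comments (names here are illustrative, not LLVM's):
//   VFScaleFactor == 0  -> in-loop/ordered reduction, one scalar result
//   VFScaleFactor == 1  -> ordinary reduction, one lane per VF element
//   VFScaleFactor  > 1  -> partial reduction, VF divided by the factor
unsigned resultLanes(unsigned VF, unsigned VFScaleFactor) {
  if (VFScaleFactor == 0)
    return 1;
  assert(VF % VFScaleFactor == 0 && "VF must be divisible by the factor");
  return VF / VFScaleFactor;
}

// Partial add-reduction: fold a wide input into a narrower accumulator, the
// way a partial reduction folds its VF-wide operand into VF/Factor lanes.
// The modulo grouping below is one valid choice, not a mandated order.
std::vector<int> partialReduceAdd(std::vector<int> Acc,
                                  const std::vector<int> &Wide) {
  assert(!Acc.empty() && Wide.size() % Acc.size() == 0);
  for (std::size_t I = 0; I < Wide.size(); ++I)
    Acc[I % Acc.size()] += Wide[I];
  return Acc;
}

int main() {
  std::printf("lanes for VF=16, factor=4: %u\n", resultLanes(16, 4)); // 4
  std::printf("lanes for VF=16, factor=0: %u\n", resultLanes(16, 0)); // 1

  std::vector<int> Acc(4, 0);
  std::vector<int> Wide{1, 2,  3,  4,  5,  6,  7,  8,
                        9, 10, 11, 12, 13, 14, 15, 16};
  for (int V : partialReduceAdd(Acc, Wide))
    std::printf("%d ", V); // 28 32 36 40
  std::printf("\n");
  return 0;
}
```

Running the sketch prints four accumulator lanes whose sum (136) equals a full reduction of the 16-element input, which is the property the partial-reduction lowering relies on.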