https://github.com/paschalis-mpeis updated https://github.com/llvm/llvm-project/pull/82488
>From 641aaf7c13d520bef52b092726f8346bfecb1c8d Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <paschalis.mp...@arm.com>
Date: Wed, 21 Feb 2024 11:53:00 +0000
Subject: [PATCH 1/3] SLP cannot vectorize frem instructions on AArch64.

It needs updated costs when vector library functions are available for
the given VF and type.
---
 .../SLPVectorizer/AArch64/slp-frem.ll         | 71 +++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/AArch64/slp-frem.ll

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-frem.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-frem.ll
new file mode 100644
index 00000000000000..45f667f5657889
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-frem.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -S -mtriple=aarch64 -vector-library=ArmPL -passes=slp-vectorizer | FileCheck %s
+
+@a = common global ptr null, align 8
+
+define void @frem_v2double() {
+; CHECK-LABEL: define void @frem_v2double() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A0:%.*]] = load double, ptr @a, align 8
+; CHECK-NEXT:    [[A1:%.*]] = load double, ptr getelementptr inbounds (double, ptr @a, i64 1), align 8
+; CHECK-NEXT:    [[B0:%.*]] = load double, ptr @a, align 8
+; CHECK-NEXT:    [[B1:%.*]] = load double, ptr getelementptr inbounds (double, ptr @a, i64 1), align 8
+; CHECK-NEXT:    [[R0:%.*]] = frem double [[A0]], [[B0]]
+; CHECK-NEXT:    [[R1:%.*]] = frem double [[A1]], [[B1]]
+; CHECK-NEXT:    store double [[R0]], ptr @a, align 8
+; CHECK-NEXT:    store double [[R1]], ptr getelementptr inbounds (double, ptr @a, i64 1), align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a0 = load double, ptr getelementptr inbounds (double, ptr @a, i64 0), align 8
+  %a1 = load double, ptr getelementptr inbounds (double, ptr @a, i64 1), align 8
+  %b0 = load double, ptr getelementptr inbounds (double, ptr @a, i64 0), align 8
+  %b1 = load double, ptr getelementptr inbounds (double, ptr @a, i64 1), align 8
+  %r0 = frem double %a0, %b0
+  %r1 = frem double %a1, %b1
+  store double %r0, ptr getelementptr inbounds (double, ptr @a, i64 0), align 8
+  store double %r1, ptr getelementptr inbounds (double, ptr @a, i64 1), align 8
+  ret void
+}
+
+define void @frem_v4float() {
+; CHECK-LABEL: define void @frem_v4float() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A0:%.*]] = load float, ptr @a, align 8
+; CHECK-NEXT:    [[A1:%.*]] = load float, ptr getelementptr inbounds (float, ptr @a, i64 1), align 8
+; CHECK-NEXT:    [[A2:%.*]] = load float, ptr getelementptr inbounds (float, ptr @a, i64 2), align 8
+; CHECK-NEXT:    [[A3:%.*]] = load float, ptr getelementptr inbounds (float, ptr @a, i64 3), align 8
+; CHECK-NEXT:    [[B0:%.*]] = load float, ptr @a, align 8
+; CHECK-NEXT:    [[B1:%.*]] = load float, ptr getelementptr inbounds (float, ptr @a, i64 1), align 8
+; CHECK-NEXT:    [[B2:%.*]] = load float, ptr getelementptr inbounds (float, ptr @a, i64 2), align 8
+; CHECK-NEXT:    [[B3:%.*]] = load float, ptr getelementptr inbounds (float, ptr @a, i64 3), align 8
+; CHECK-NEXT:    [[R0:%.*]] = frem float [[A0]], [[B0]]
+; CHECK-NEXT:    [[R1:%.*]] = frem float [[A1]], [[B1]]
+; CHECK-NEXT:    [[R2:%.*]] = frem float [[A2]], [[B2]]
+; CHECK-NEXT:    [[R3:%.*]] = frem float [[A3]], [[B3]]
+; CHECK-NEXT:    store float [[R0]], ptr @a, align 8
+; CHECK-NEXT:    store float [[R1]], ptr getelementptr inbounds (float, ptr @a, i64 1), align 8
+; CHECK-NEXT:    store float [[R2]], ptr getelementptr inbounds (float, ptr @a, i64 2), align 8
+; CHECK-NEXT:    store float [[R3]], ptr getelementptr inbounds (float, ptr @a, i64 3), align 8
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a0 = load float, ptr getelementptr inbounds (float, ptr @a, i64 0), align 8
+  %a1 = load float, ptr getelementptr inbounds (float, ptr @a, i64 1), align 8
+  %a2 = load float, ptr getelementptr inbounds (float, ptr @a, i64 2), align 8
+  %a3 = load float, ptr getelementptr inbounds (float, ptr @a, i64 3), align 8
+  %b0 = load float, ptr getelementptr inbounds (float, ptr @a, i64 0), align 8
+  %b1 = load float, ptr getelementptr inbounds (float, ptr @a, i64 1), align 8
+  %b2 = load float, ptr getelementptr inbounds (float, ptr @a, i64 2), align 8
+  %b3 = load float, ptr getelementptr inbounds (float, ptr @a, i64 3), align 8
+  %r0 = frem float %a0, %b0
+  %r1 = frem float %a1, %b1
+  %r2 = frem float %a2, %b2
+  %r3 = frem float %a3, %b3
+  store float %r0, ptr getelementptr inbounds (float, ptr @a, i64 0), align 8
+  store float %r1, ptr getelementptr inbounds (float, ptr @a, i64 1), align 8
+  store float %r2, ptr getelementptr inbounds (float, ptr @a, i64 2), align 8
+  store float %r3, ptr getelementptr inbounds (float, ptr @a, i64 3), align 8
+  ret void
+}

>From 29ae086478e3d4bae6b6250670f87273359626d7 Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <paschalis.mp...@arm.com>
Date: Mon, 29 Jan 2024 14:10:30 +0000
Subject: [PATCH 2/3] [AArch64] SLP can vectorize frem

When vector library calls are available for frem, given its type and
vector length, the SLP vectorizer uses updated costs that amount to a
call, matching the LoopVectorizer's functionality. This allows
'superword-level' vectorization, which later passes can convert to a
vector library call.

Add tests that vectorize code containing 2x double and 4x float frem
instructions.
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 17 ++++++++--
 .../SLPVectorizer/AArch64/slp-frem.ll         | 32 +++++--------------
 2 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 4e334748c95934..effe52fe2c4e31 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -8362,9 +8362,20 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
       TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
       TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
-      return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
-                                         Op2Info) +
-             CommonCost;
+      auto VecCost = TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind,
+                                                 Op1Info, Op2Info);
+      // Some targets can replace frem with vector library calls.
+      if (ShuffleOrOp == Instruction::FRem) {
+        LibFunc Func;
+        if (TLI->getLibFunc(ShuffleOrOp, ScalarTy, Func) &&
+            TLI->isFunctionVectorizable(TLI->getName(Func),
+                                        VecTy->getElementCount())) {
+          auto VecCallCost = TTI->getCallInstrCost(
+              nullptr, VecTy, {ScalarTy, ScalarTy}, CostKind);
+          VecCost = std::min(VecCost, VecCallCost);
+        }
+      }
+      return VecCost + CommonCost;
     };
     return GetCostDiff(GetScalarCost, GetVectorCost);
   }

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-frem.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-frem.ll
index 45f667f5657889..a38f4bdc4640e9 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-frem.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-frem.ll
@@ -6,14 +6,10 @@
 define void @frem_v2double() {
 ; CHECK-LABEL: define void @frem_v2double() {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A0:%.*]] = load double, ptr @a, align 8
-; CHECK-NEXT:    [[A1:%.*]] = load double, ptr getelementptr inbounds (double, ptr @a, i64 1), align 8
-; CHECK-NEXT:    [[B0:%.*]] = load double, ptr @a, align 8
-; CHECK-NEXT:    [[B1:%.*]] = load double, ptr getelementptr inbounds (double, ptr @a, i64 1), align 8
-; CHECK-NEXT:    [[R0:%.*]] = frem double [[A0]], [[B0]]
-; CHECK-NEXT:    [[R1:%.*]] = frem double [[A1]], [[B1]]
-; CHECK-NEXT:    store double [[R0]], ptr @a, align 8
-; CHECK-NEXT:    store double [[R1]], ptr getelementptr inbounds (double, ptr @a, i64 1), align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x double>, ptr @a, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x double>, ptr @a, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = frem <2 x double> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr @a, align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -31,22 +27,10 @@ entry:
 define void @frem_v4float() {
 ; CHECK-LABEL: define void @frem_v4float() {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A0:%.*]] = load float, ptr @a, align 8
-; CHECK-NEXT:    [[A1:%.*]] = load float, ptr getelementptr inbounds (float, ptr @a, i64 1), align 8
-; CHECK-NEXT:    [[A2:%.*]] = load float, ptr getelementptr inbounds (float, ptr @a, i64 2), align 8
-; CHECK-NEXT:    [[A3:%.*]] = load float, ptr getelementptr inbounds (float, ptr @a, i64 3), align 8
-; CHECK-NEXT:    [[B0:%.*]] = load float, ptr @a, align 8
-; CHECK-NEXT:    [[B1:%.*]] = load float, ptr getelementptr inbounds (float, ptr @a, i64 1), align 8
-; CHECK-NEXT:    [[B2:%.*]] = load float, ptr getelementptr inbounds (float, ptr @a, i64 2), align 8
-; CHECK-NEXT:    [[B3:%.*]] = load float, ptr getelementptr inbounds (float, ptr @a, i64 3), align 8
-; CHECK-NEXT:    [[R0:%.*]] = frem float [[A0]], [[B0]]
-; CHECK-NEXT:    [[R1:%.*]] = frem float [[A1]], [[B1]]
-; CHECK-NEXT:    [[R2:%.*]] = frem float [[A2]], [[B2]]
-; CHECK-NEXT:    [[R3:%.*]] = frem float [[A3]], [[B3]]
-; CHECK-NEXT:    store float [[R0]], ptr @a, align 8
-; CHECK-NEXT:    store float [[R1]], ptr getelementptr inbounds (float, ptr @a, i64 1), align 8
-; CHECK-NEXT:    store float [[R2]], ptr getelementptr inbounds (float, ptr @a, i64 2), align 8
-; CHECK-NEXT:    store float [[R3]], ptr getelementptr inbounds (float, ptr @a, i64 3), align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr @a, align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr @a, align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = frem <4 x float> [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    store <4 x float> [[TMP2]], ptr @a, align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
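Note that the patch above only teaches SLP that the vector frem is worth
emitting; as the commit message says, converting it into an actual library
call is left to later passes. A minimal IR sketch of the intended
end-to-end effect for the v2double case, assuming the ArmPL fixed-width
mapping of fmod at VF 2 is named armpl_vfmodq_f64 (symbol name per LLVM's
VecFuncs.def conventions; treat it as an assumption, not part of this
series):

declare <2 x double> @armpl_vfmodq_f64(<2 x double>, <2 x double>)

define <2 x double> @frem_lowered(<2 x double> %x, <2 x double> %y) {
entry:
  ; 'frem <2 x double> %x, %y' becomes, after a later
  ; veclib-replacement pass (assumed symbol name):
  %r = call <2 x double> @armpl_vfmodq_f64(<2 x double> %x, <2 x double> %y)
  ret <2 x double> %r
}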
>From b4a7eed279a092c5d83b019788373aee93540db6 Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <paschalis.mp...@arm.com>
Date: Wed, 21 Feb 2024 18:02:52 +0000
Subject: [PATCH 3/3] Add 'getVecLibCallCost' to TTI

Unfortunately TLI (TargetLibraryInfo) is not available in TTI, and
changing the signature of 'getArithmeticInstrCost' would cause
significant changes in many places. As a compromise,
'getVecLibCallCost' returns the cost of a vector library call when one
exists for the given target and vector type, and an invalid cost
otherwise.
---
 .../llvm/Analysis/TargetTransformInfo.h        |  7 +++++++
 llvm/lib/Analysis/TargetTransformInfo.cpp      | 13 +++++++++++++
 .../lib/Transforms/Vectorize/SLPVectorizer.cpp | 18 +++++------------
 3 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 58577a6b6eb5c0..bd331693745267 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1255,6 +1255,13 @@ class TargetTransformInfo {
       ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
       const Instruction *CxtI = nullptr) const;
 
+  /// Returns the cost of a call when a target has a vector library function for
+  /// the given \p VecTy, otherwise an invalid cost.
+  InstructionCost getVecLibCallCost(const int OpCode,
+                                    const TargetLibraryInfo *TLI,
+                                    VectorType *VecTy,
+                                    TTI::TargetCostKind CostKind);
+
   /// Returns the cost estimation for alternating opcode pattern that can be
   /// lowered to a single instruction on the target. In X86 this is for the
   /// addsub instruction which corrsponds to a Shuffle + Fadd + FSub pattern in
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 1f11f0d7dd620e..58d39069aa740f 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -9,6 +9,7 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfoImpl.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Dominators.h"
@@ -869,6 +870,18 @@ TargetTransformInfo::getOperandInfo(const Value *V) {
   return {OpInfo, OpProps};
 }
 
+InstructionCost TargetTransformInfo::getVecLibCallCost(
+    const int OpCode, const TargetLibraryInfo *TLI, VectorType *VecTy,
+    TTI::TargetCostKind CostKind) {
+  Type *ScalarTy = VecTy->getScalarType();
+  LibFunc Func;
+  if (TLI->getLibFunc(OpCode, ScalarTy, Func) &&
+      TLI->isFunctionVectorizable(TLI->getName(Func), VecTy->getElementCount()))
+    return getCallInstrCost(nullptr, VecTy, {ScalarTy, ScalarTy}, CostKind);
+
+  return InstructionCost::getInvalid();
+}
+
 InstructionCost TargetTransformInfo::getArithmeticInstrCost(
     unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
     OperandValueInfo Op1Info, OperandValueInfo Op2Info,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index effe52fe2c4e31..40958258565c81 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -8362,20 +8362,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
       TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
       TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
-      auto VecCost = TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind,
-                                                 Op1Info, Op2Info);
+      InstructionCost VecInstrCost = TTI->getArithmeticInstrCost(
+          ShuffleOrOp, VecTy, CostKind, Op1Info, Op2Info);
       // Some targets can replace frem with vector library calls.
-      if (ShuffleOrOp == Instruction::FRem) {
-        LibFunc Func;
-        if (TLI->getLibFunc(ShuffleOrOp, ScalarTy, Func) &&
-            TLI->isFunctionVectorizable(TLI->getName(Func),
-                                        VecTy->getElementCount())) {
-          auto VecCallCost = TTI->getCallInstrCost(
-              nullptr, VecTy, {ScalarTy, ScalarTy}, CostKind);
-          VecCost = std::min(VecCost, VecCallCost);
-        }
-      }
-      return VecCost + CommonCost;
+      InstructionCost VecCallCost =
+          TTI->getVecLibCallCost(ShuffleOrOp, TLI, VecTy, CostKind);
+      return std::min(VecInstrCost, VecCallCost) + CommonCost;
     };
     return GetCostDiff(GetScalarCost, GetVectorCost);
   }
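One note on the design choice above: getVecLibCallCost returns
InstructionCost::getInvalid() when no mapping exists, and InstructionCost
treats an invalid cost as larger than any valid one, so the std::min in
the caller safely falls back to the plain vector-instruction cost. A
sketch of an input that would exercise that fallback, assuming ArmPL's
fixed-width mappings cover fmod only at VF 2 and fmodf only at VF 4 (the
exact coverage is an assumption based on LLVM's VecFuncs.def):

; Hypothetical input (not part of this series): a VF-2 float group.
; With no 2-lane fmodf variant assumed available, the veclib cost is
; invalid, so SLP vectorizes only if a plain <2 x float> frem is
; cheaper than two scalar frems.
define void @frem_v2float(ptr %a, ptr %b, ptr %d) {
entry:
  %a0 = load float, ptr %a, align 4
  %pa1 = getelementptr inbounds float, ptr %a, i64 1
  %a1 = load float, ptr %pa1, align 4
  %b0 = load float, ptr %b, align 4
  %pb1 = getelementptr inbounds float, ptr %b, i64 1
  %b1 = load float, ptr %pb1, align 4
  %r0 = frem float %a0, %b0
  %r1 = frem float %a1, %b1
  store float %r0, ptr %d, align 4
  %pd1 = getelementptr inbounds float, ptr %d, i64 1
  store float %r1, ptr %pd1, align 4
  ret void
}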