https://github.com/tangaac updated https://github.com/llvm/llvm-project/pull/109917
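The patch below returns one refinement step for f32 and two for f64 in getEstimateRefinementSteps, on the basis that the frecipe/frsqrte estimates are accurate to about 2^-14. As background, here is a minimal standalone C++ sketch of the Newton-Raphson iterations used to refine such estimates; it is illustrative only, not part of the patch, and the input value 3.0 is arbitrary.

// Illustration only: Newton-Raphson refinement of a reciprocal /
// reciprocal-square-root estimate that starts out accurate to roughly
// 2^-14, as the comment in getEstimateRefinementSteps states. Each
// iteration roughly squares the relative error (doubles the correct bits).
#include <cmath>
#include <cstdio>

int main() {
  const double a = 3.0;                 // arbitrary positive input
  const double e0 = 1.0 / 16384.0;      // mimic a ~2^-14-accurate estimate

  // Reciprocal: x_{n+1} = x_n * (2 - a * x_n)
  double x = (1.0 / a) * (1.0 + e0);
  for (int step = 1; step <= 2; ++step) {
    x = x * (2.0 - a * x);
    std::printf("recip  step %d: rel. error %.3g\n", step,
                std::fabs(x * a - 1.0));
  }

  // Reciprocal square root: y_{n+1} = y_n * (1.5 - 0.5 * a * y_n * y_n)
  double y = (1.0 / std::sqrt(a)) * (1.0 + e0);
  for (int step = 1; step <= 2; ++step) {
    y = y * (1.5 - 0.5 * a * y * y);
    std::printf("rsqrte step %d: rel. error %.3g\n", step,
                std::fabs(y * std::sqrt(a) - 1.0));
  }
  return 0;
}

One step takes roughly 14 correct bits to about 28, which already covers float's 24-bit significand; a second step reaches about 56 bits, covering double's 53-bit significand, which is why the patch picks "f64 ? 2 : 1" refinement steps.
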
>From b720995dad35236f23681b3cc895f7aea9cc3147 Mon Sep 17 00:00:00 2001 From: tangaac <tangya...@loongson.cn> Date: Wed, 25 Sep 2024 11:32:37 +0800 Subject: [PATCH 1/2] [LoongArch] Add options for Clang to generate LoongArch-specific frecipe & frsqrte instructions --- clang/include/clang/Driver/Options.td | 4 + .../lib/Driver/ToolChains/Arch/LoongArch.cpp | 14 ++ .../LoongArch/LoongArchFloat32InstrInfo.td | 5 + .../LoongArch/LoongArchFloat64InstrInfo.td | 9 + .../LoongArch/LoongArchISelLowering.cpp | 98 ++++++++ .../Target/LoongArch/LoongArchISelLowering.h | 27 +++ .../LoongArch/LoongArchLASXInstrInfo.td | 18 ++ .../Target/LoongArch/LoongArchLSXInstrInfo.td | 15 ++ .../LoongArch/fdiv-reciprocal-estimate.ll | 43 ++++ .../LoongArch/fsqrt-reciprocal-estimate.ll | 209 ++++++++++++++++++ .../lasx/fdiv-reciprocal-estimate.ll | 114 ++++++++++ .../lasx/fsqrt-reciprocal-estimate.ll | 75 +++++++ .../LoongArch/lsx/fdiv-reciprocal-estimate.ll | 114 ++++++++++ .../lsx/fsqrt-reciprocal-estimate.ll | 75 +++++++ 14 files changed, 820 insertions(+) create mode 100644 llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll create mode 100644 llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll create mode 100644 llvm/test/CodeGen/LoongArch/lasx/fdiv-reciprocal-estimate.ll create mode 100644 llvm/test/CodeGen/LoongArch/lasx/fsqrt-reciprocal-estimate.ll create mode 100644 llvm/test/CodeGen/LoongArch/lsx/fdiv-reciprocal-estimate.ll create mode 100644 llvm/test/CodeGen/LoongArch/lsx/fsqrt-reciprocal-estimate.ll diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 23bd686a85f526..811fb5490d6707 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5373,6 +5373,10 @@ def mno_lasx : Flag<["-"], "mno-lasx">, Group<m_loongarch_Features_Group>, def msimd_EQ : Joined<["-"], "msimd=">, Group<m_loongarch_Features_Group>, Flags<[TargetSpecific]>, HelpText<"Select the SIMD extension(s) to be enabled in LoongArch either 'none', 'lsx', 'lasx'.">; +def mfrecipe : Flag<["-"], "mfrecipe">, Group<m_loongarch_Features_Group>, + HelpText<"Enable frecipe.{s/d} and frsqrte.{s/d}">; +def mno_frecipe : Flag<["-"], "mno-frecipe">, Group<m_loongarch_Features_Group>, + HelpText<"Disable frecipe.{s/d} and frsqrte.{s/d}">; def mnop_mcount : Flag<["-"], "mnop-mcount">, HelpText<"Generate mcount/__fentry__ calls as nops. To activate they need to be patched in.">, Visibility<[ClangOption, CC1Option]>, Group<m_Group>, MarshallingInfoFlag<CodeGenOpts<"MNopMCount">>; diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp index 771adade93813f..62233a32d0d396 100644 --- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp +++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp @@ -251,6 +251,20 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D, } else /*-mno-lasx*/ Features.push_back("-lasx"); } + + // Select frecipe feature determined by -m[no-]frecipe. + if (const Arg *A = + Args.getLastArg(options::OPT_mfrecipe, options::OPT_mno_frecipe)) { + // FRECIPE depends on 64-bit FPU. + // -mno-frecipe conflicts with -mfrecipe. 
+ if (A->getOption().matches(options::OPT_mfrecipe)) { + if (llvm::find(Features, "-d") != Features.end()) + D.Diag(diag::err_drv_loongarch_wrong_fpu_width) << /*FRECIPE*/ 2; + else /*-mfrecipe*/ + Features.push_back("+frecipe"); + } else /*-mnofrecipe*/ + Features.push_back("-frecipe"); + } } std::string loongarch::postProcessTargetCPUString(const std::string &CPU, diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td index d6a83c0c8cd8fb..8f909d26cfd08a 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td @@ -19,12 +19,15 @@ def SDT_LoongArchMOVGR2FR_W_LA64 def SDT_LoongArchMOVFR2GR_S_LA64 : SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, f32>]>; def SDT_LoongArchFTINT : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>; +def SDT_LoongArchFRECIPE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>; def loongarch_movgr2fr_w_la64 : SDNode<"LoongArchISD::MOVGR2FR_W_LA64", SDT_LoongArchMOVGR2FR_W_LA64>; def loongarch_movfr2gr_s_la64 : SDNode<"LoongArchISD::MOVFR2GR_S_LA64", SDT_LoongArchMOVFR2GR_S_LA64>; def loongarch_ftint : SDNode<"LoongArchISD::FTINT", SDT_LoongArchFTINT>; +def loongarch_frecipe_s : SDNode<"LoongArchISD::FRECIPE_S", SDT_LoongArchFRECIPE>; +def loongarch_frsqrte_s : SDNode<"LoongArchISD::FRSQRTE_S", SDT_LoongArchFRECIPE>; //===----------------------------------------------------------------------===// // Instructions @@ -286,6 +289,8 @@ let Predicates = [HasFrecipe] in { // FP approximate reciprocal operation def : Pat<(int_loongarch_frecipe_s FPR32:$src), (FRECIPE_S FPR32:$src)>; def : Pat<(int_loongarch_frsqrte_s FPR32:$src), (FRSQRTE_S FPR32:$src)>; +def : Pat<(loongarch_frecipe_s FPR32:$src), (FRECIPE_S FPR32:$src)>; +def : Pat<(loongarch_frsqrte_s FPR32:$src), (FRSQRTE_S FPR32:$src)>; } // fmadd.s: fj * fk + fa diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td index 30cce8439640f1..aabb58c0d68eff 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td @@ -10,6 +10,13 @@ // //===----------------------------------------------------------------------===// +// ===----------------------------------------------------------------------===// +// LoongArch specific DAG Nodes. 
+// ===----------------------------------------------------------------------===// + +def loongarch_frecipe_d : SDNode<"LoongArchISD::FRECIPE_D", SDT_LoongArchFRECIPE>; +def loongarch_frsqrte_d : SDNode<"LoongArchISD::FRSQRTE_D", SDT_LoongArchFRECIPE>; + //===----------------------------------------------------------------------===// // Instructions //===----------------------------------------------------------------------===// @@ -253,6 +260,8 @@ let Predicates = [HasFrecipe] in { // FP approximate reciprocal operation def : Pat<(int_loongarch_frecipe_d FPR64:$src), (FRECIPE_D FPR64:$src)>; def : Pat<(int_loongarch_frsqrte_d FPR64:$src), (FRSQRTE_D FPR64:$src)>; +def : Pat<(loongarch_frecipe_d FPR64:$src), (FRECIPE_D FPR64:$src)>; +def : Pat<(loongarch_frsqrte_d FPR64:$src), (FRSQRTE_D FPR64:$src)>; } // fmadd.d: fj * fk + fa diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index bfafb331752108..0f948e8148df53 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -4697,6 +4697,18 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VANY_ZERO) NODE_NAME_CASE(VALL_NONZERO) NODE_NAME_CASE(VANY_NONZERO) + NODE_NAME_CASE(FRECIPE_S) + NODE_NAME_CASE(FRECIPE_D) + NODE_NAME_CASE(FRSQRTE_S) + NODE_NAME_CASE(FRSQRTE_D) + NODE_NAME_CASE(VFRECIPE_S) + NODE_NAME_CASE(VFRECIPE_D) + NODE_NAME_CASE(VFRSQRTE_S) + NODE_NAME_CASE(VFRSQRTE_D) + NODE_NAME_CASE(XVFRECIPE_S) + NODE_NAME_CASE(XVFRECIPE_D) + NODE_NAME_CASE(XVFRSQRTE_S) + NODE_NAME_CASE(XVFRSQRTE_D) } #undef NODE_NAME_CASE return nullptr; @@ -5902,6 +5914,92 @@ Register LoongArchTargetLowering::getExceptionSelectorRegister( return LoongArch::R5; } +//===----------------------------------------------------------------------===// +// Target Optimization Hooks +//===----------------------------------------------------------------------===// + +static int getEstimateRefinementSteps(EVT VT, + const LoongArchSubtarget &Subtarget) { + // Feature FRECIPE instrucions relative accuracy is 2^-14. + // IEEE float has 23 digits and double has 52 digits. + int RefinementSteps = VT.getScalarType() == MVT::f64 ? 
2 : 1; + return RefinementSteps; +} + +SDValue LoongArchTargetLowering::getSqrtEstimate(SDValue Operand, + SelectionDAG &DAG, int Enabled, + int &RefinementSteps, + bool &UseOneConstNR, + bool Reciprocal) const { + if (Subtarget.hasFrecipe()) { + SDLoc DL(Operand); + EVT VT = Operand.getValueType(); + unsigned Opcode; + + if (VT == MVT::f32) { + Opcode = LoongArchISD::FRSQRTE_S; + } else if (VT == MVT::f64 && Subtarget.hasBasicD()) { + Opcode = LoongArchISD::FRSQRTE_D; + } else if (VT == MVT::v4f32 && Subtarget.hasExtLSX()) { + Opcode = LoongArchISD::VFRSQRTE_S; + } else if (VT == MVT::v2f64 && Subtarget.hasExtLSX()) { + Opcode = LoongArchISD::VFRSQRTE_D; + } else if (VT == MVT::v8f32 && Subtarget.hasExtLASX()) { + Opcode = LoongArchISD::XVFRSQRTE_S; + } else if (VT == MVT::v4f64 && Subtarget.hasExtLASX()) { + Opcode = LoongArchISD::XVFRSQRTE_D; + } else { + return SDValue(); + } + + UseOneConstNR = false; + if (RefinementSteps == ReciprocalEstimate::Unspecified) + RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); + + SDValue Estimate = DAG.getNode(Opcode, DL, VT, Operand); + if (Reciprocal) { + Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate); + } + return Estimate; + } + + return SDValue(); +} + +SDValue LoongArchTargetLowering::getRecipEstimate(SDValue Operand, + SelectionDAG &DAG, + int Enabled, + int &RefinementSteps) const { + if (Subtarget.hasFrecipe()) { + SDLoc DL(Operand); + EVT VT = Operand.getValueType(); + unsigned Opcode; + + if (VT == MVT::f32) { + Opcode = LoongArchISD::FRECIPE_S; + } else if (VT == MVT::f64 && Subtarget.hasBasicD()) { + Opcode = LoongArchISD::FRECIPE_D; + } else if (VT == MVT::v4f32 && Subtarget.hasExtLSX()) { + Opcode = LoongArchISD::VFRECIPE_S; + } else if (VT == MVT::v2f64 && Subtarget.hasExtLSX()) { + Opcode = LoongArchISD::VFRECIPE_D; + } else if (VT == MVT::v8f32 && Subtarget.hasExtLASX()) { + Opcode = LoongArchISD::XVFRECIPE_S; + } else if (VT == MVT::v4f64 && Subtarget.hasExtLASX()) { + Opcode = LoongArchISD::XVFRECIPE_D; + } else { + return SDValue(); + } + + if (RefinementSteps == ReciprocalEstimate::Unspecified) + RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); + + return DAG.getNode(Opcode, DL, VT, Operand); + } + + return SDValue(); +} + //===----------------------------------------------------------------------===// // LoongArch Inline Assembly Support //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 6177884bd19501..a721cfc5f518e1 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -141,6 +141,22 @@ enum NodeType : unsigned { VALL_NONZERO, VANY_NONZERO, + // Floating point approximate reciprocal operation + FRECIPE_S, + FRECIPE_D, + FRSQRTE_S, + FRSQRTE_D, + + VFRECIPE_S, + VFRECIPE_D, + VFRSQRTE_S, + VFRSQRTE_D, + + XVFRECIPE_S, + XVFRECIPE_D, + XVFRSQRTE_S, + XVFRSQRTE_D, + // Intrinsic operations end ============================================= }; } // end namespace LoongArchISD @@ -216,6 +232,17 @@ class LoongArchTargetLowering : public TargetLowering { Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override; + bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override { + return true; + } + + SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, + int &RefinementSteps, bool &UseOneConstNR, + bool Reciprocal) const override; + 
+ SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, + int &RefinementSteps) const override; + ISD::NodeType getExtendForAtomicOps() const override { return ISD::SIGN_EXTEND; } diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index dd7e5713e45fe9..23ae6f038dceff 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -9,9 +9,18 @@ // This file describes the Advanced SIMD extension instructions. // //===----------------------------------------------------------------------===// +def SDT_LoongArchXVFRECIPE_S : SDTypeProfile<1, 1, [SDTCisVT<0, v8f32>, SDTCisVT<1, v8f32>]>; +def SDT_LoongArchXVFRECIPE_D : SDTypeProfile<1, 1, [SDTCisVT<0, v4f64>, SDTCisVT<1, v4f64>]>; +// Target nodes. def loongarch_xvpermi: SDNode<"LoongArchISD::XVPERMI", SDT_LoongArchV1RUimm>; +def loongarch_xvfrecipe_s: SDNode<"LoongArchISD::XVFRECIPE_S", SDT_LoongArchXVFRECIPE_S>; +def loongarch_xvfrecipe_d: SDNode<"LoongArchISD::XVFRECIPE_D", SDT_LoongArchXVFRECIPE_D>; +def loongarch_xvfrsqrte_s: SDNode<"LoongArchISD::XVFRSQRTE_S", SDT_LoongArchXVFRECIPE_S>; +def loongarch_xvfrsqrte_d: SDNode<"LoongArchISD::XVFRSQRTE_D", SDT_LoongArchXVFRECIPE_D>; + + def lasxsplati8 : PatFrag<(ops node:$e0), (v32i8 (build_vector node:$e0, node:$e0, node:$e0, node:$e0, @@ -2094,6 +2103,15 @@ foreach Inst = ["XVFRECIPE_S", "XVFRSQRTE_S"] in foreach Inst = ["XVFRECIPE_D", "XVFRSQRTE_D"] in def : Pat<(deriveLASXIntrinsic<Inst>.ret (v4f64 LASX256:$xj)), (!cast<LAInst>(Inst) LASX256:$xj)>; + +def : Pat<(loongarch_xvfrecipe_s v8f32:$src), + (XVFRECIPE_S v8f32:$src)>; +def : Pat<(loongarch_xvfrecipe_d v4f64:$src), + (XVFRECIPE_D v4f64:$src)>; +def : Pat<(loongarch_xvfrsqrte_s v8f32:$src), + (XVFRSQRTE_S v8f32:$src)>; +def : Pat<(loongarch_xvfrsqrte_d v4f64:$src), + (XVFRSQRTE_D v4f64:$src)>; } def : Pat<(int_loongarch_lasx_xvpickve_w_f v8f32:$xj, timm:$imm), diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index e7ac9f3bd04cbf..510b1241edd4e0 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -23,6 +23,8 @@ def SDT_LoongArchV2R : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>]>; def SDT_LoongArchV1RUimm: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisVT<2, i64>]>; +def SDT_LoongArchVFRECIPE_S : SDTypeProfile<1, 1, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4f32>]>; +def SDT_LoongArchVFRECIPE_D : SDTypeProfile<1, 1, [SDTCisVT<0, v2f64>, SDTCisVT<1, v2f64>]>; // Target nodes. 
def loongarch_vreplve : SDNode<"LoongArchISD::VREPLVE", SDT_LoongArchVreplve>; @@ -50,6 +52,10 @@ def loongarch_vilvh: SDNode<"LoongArchISD::VILVH", SDT_LoongArchV2R>; def loongarch_vshuf4i: SDNode<"LoongArchISD::VSHUF4I", SDT_LoongArchV1RUimm>; def loongarch_vreplvei: SDNode<"LoongArchISD::VREPLVEI", SDT_LoongArchV1RUimm>; +def loongarch_vfrecipe_s: SDNode<"LoongArchISD::VFRECIPE_S", SDT_LoongArchVFRECIPE_S>; +def loongarch_vfrecipe_d: SDNode<"LoongArchISD::VFRECIPE_D", SDT_LoongArchVFRECIPE_D>; +def loongarch_vfrsqrte_s: SDNode<"LoongArchISD::VFRSQRTE_S", SDT_LoongArchVFRECIPE_S>; +def loongarch_vfrsqrte_d: SDNode<"LoongArchISD::VFRSQRTE_D", SDT_LoongArchVFRECIPE_D>; def immZExt1 : ImmLeaf<i64, [{return isUInt<1>(Imm);}]>; def immZExt2 : ImmLeaf<i64, [{return isUInt<2>(Imm);}]>; @@ -2238,6 +2244,15 @@ foreach Inst = ["VFRECIPE_S", "VFRSQRTE_S"] in foreach Inst = ["VFRECIPE_D", "VFRSQRTE_D"] in def : Pat<(deriveLSXIntrinsic<Inst>.ret (v2f64 LSX128:$vj)), (!cast<LAInst>(Inst) LSX128:$vj)>; + +def : Pat<(loongarch_vfrecipe_s v4f32:$src), + (VFRECIPE_S v4f32:$src)>; +def : Pat<(loongarch_vfrecipe_d v2f64:$src), + (VFRECIPE_D v2f64:$src)>; +def : Pat<(loongarch_vfrsqrte_s v4f32:$src), + (VFRSQRTE_S v4f32:$src)>; +def : Pat<(loongarch_vfrsqrte_d v2f64:$src), + (VFRSQRTE_D v2f64:$src)>; } // load diff --git a/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll new file mode 100644 index 00000000000000..b4b280a43055f1 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll @@ -0,0 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 --mattr=+d,-frecipe < %s | FileCheck %s --check-prefix=FAULT +; RUN: llc --mtriple=loongarch64 --mattr=+d,+frecipe < %s | FileCheck %s + +;; Exercise the 'fdiv' LLVM IR: https://llvm.org/docs/LangRef.html#fdiv-instruction + +define float @fdiv_s(float %x, float %y) { +; FAULT-LABEL: fdiv_s: +; FAULT: # %bb.0: +; FAULT-NEXT: fdiv.s $fa0, $fa0, $fa1 +; FAULT-NEXT: ret +; +; CHECK-LABEL: fdiv_s: +; CHECK: # %bb.0: +; CHECK-NEXT: frecipe.s $fa2, $fa1 +; CHECK-NEXT: fmul.s $fa3, $fa0, $fa2 +; CHECK-NEXT: fnmsub.s $fa0, $fa1, $fa3, $fa0 +; CHECK-NEXT: fmadd.s $fa0, $fa2, $fa0, $fa3 +; CHECK-NEXT: ret + %div = fdiv fast float %x, %y + ret float %div +} + +define double @fdiv_d(double %x, double %y) { +; FAULT-LABEL: fdiv_d: +; FAULT: # %bb.0: +; FAULT-NEXT: fdiv.d $fa0, $fa0, $fa1 +; FAULT-NEXT: ret +; +; CHECK-LABEL: fdiv_d: +; CHECK: # %bb.0: +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) +; CHECK-NEXT: fld.d $fa2, $a0, %pc_lo12(.LCPI1_0) +; CHECK-NEXT: frecipe.d $fa3, $fa1 +; CHECK-NEXT: fmadd.d $fa2, $fa1, $fa3, $fa2 +; CHECK-NEXT: fnmsub.d $fa2, $fa2, $fa3, $fa3 +; CHECK-NEXT: fmul.d $fa3, $fa0, $fa2 +; CHECK-NEXT: fnmsub.d $fa0, $fa1, $fa3, $fa0 +; CHECK-NEXT: fmadd.d $fa0, $fa2, $fa0, $fa3 +; CHECK-NEXT: ret + %div = fdiv fast double %x, %y + ret double %div +} diff --git a/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll new file mode 100644 index 00000000000000..d683487fdd4073 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll @@ -0,0 +1,209 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 --mattr=+d,-frecipe < %s | FileCheck %s --check-prefix=FAULT +; RUN: llc --mtriple=loongarch64 --mattr=+d,+frecipe < %s | FileCheck %s + +declare float 
@llvm.sqrt.f32(float) +declare double @llvm.sqrt.f64(double) + +define float @frsqrt_f32(float %a) nounwind { +; FAULT-LABEL: frsqrt_f32: +; FAULT: # %bb.0: +; FAULT-NEXT: frsqrt.s $fa0, $fa0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: frsqrt_f32: +; CHECK: # %bb.0: +; CHECK-NEXT: frsqrte.s $fa1, $fa0 +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) +; CHECK-NEXT: fld.s $fa2, $a0, %pc_lo12(.LCPI0_0) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1) +; CHECK-NEXT: fld.s $fa3, $a0, %pc_lo12(.LCPI0_1) +; CHECK-NEXT: fmul.s $fa1, $fa0, $fa1 +; CHECK-NEXT: fmul.s $fa0, $fa0, $fa1 +; CHECK-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 +; CHECK-NEXT: fmul.s $fa1, $fa1, $fa3 +; CHECK-NEXT: fmul.s $fa0, $fa1, $fa0 +; CHECK-NEXT: ret + + %1 = call fast float @llvm.sqrt.f32(float %a) + %2 = fdiv fast float 1.0, %1 + ret float %2 +} + +define double @frsqrt_f64(double %a) nounwind { +; FAULT-LABEL: frsqrt_f64: +; FAULT: # %bb.0: +; FAULT-NEXT: frsqrt.d $fa0, $fa0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: frsqrt_f64: +; CHECK: # %bb.0: +; CHECK-NEXT: frsqrte.d $fa1, $fa0 +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) +; CHECK-NEXT: fld.d $fa2, $a0, %pc_lo12(.LCPI1_0) +; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_1) +; CHECK-NEXT: fld.d $fa3, $a0, %pc_lo12(.LCPI1_1) +; CHECK-NEXT: fmul.d $fa1, $fa0, $fa1 +; CHECK-NEXT: fmul.d $fa4, $fa0, $fa1 +; CHECK-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 +; CHECK-NEXT: fmul.d $fa1, $fa1, $fa3 +; CHECK-NEXT: fmul.d $fa1, $fa1, $fa4 +; CHECK-NEXT: fmul.d $fa0, $fa0, $fa1 +; CHECK-NEXT: fmadd.d $fa0, $fa0, $fa1, $fa2 +; CHECK-NEXT: fmul.d $fa1, $fa1, $fa3 +; CHECK-NEXT: fmul.d $fa0, $fa1, $fa0 +; CHECK-NEXT: ret + %1 = call fast double @llvm.sqrt.f64(double %a) + %2 = fdiv fast double 1.0, %1 + ret double %2 +} + +define double @sqrt_simplify_before_recip_3_uses(double %x, ptr %p1, ptr %p2) nounwind { +; FAULT-LABEL: sqrt_simplify_before_recip_3_uses: +; FAULT: # %bb.0: +; FAULT-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_0) +; FAULT-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI2_0) +; FAULT-NEXT: fsqrt.d $fa1, $fa0 +; FAULT-NEXT: frsqrt.d $fa0, $fa0 +; FAULT-NEXT: fdiv.d $fa2, $fa2, $fa1 +; FAULT-NEXT: fst.d $fa0, $a0, 0 +; FAULT-NEXT: fst.d $fa2, $a1, 0 +; FAULT-NEXT: fmov.d $fa0, $fa1 +; FAULT-NEXT: ret +; +; CHECK-LABEL: sqrt_simplify_before_recip_3_uses: +; CHECK: # %bb.0: +; CHECK-NEXT: frsqrte.d $fa1, $fa0 +; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_0) +; CHECK-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI2_0) +; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_1) +; CHECK-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI2_1) +; CHECK-NEXT: fmul.d $fa1, $fa0, $fa1 +; CHECK-NEXT: fmul.d $fa4, $fa0, $fa1 +; CHECK-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 +; CHECK-NEXT: fmul.d $fa1, $fa1, $fa3 +; CHECK-NEXT: fmul.d $fa1, $fa1, $fa4 +; CHECK-NEXT: fmul.d $fa4, $fa0, $fa1 +; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_2) +; CHECK-NEXT: fld.d $fa5, $a2, %pc_lo12(.LCPI2_2) +; CHECK-NEXT: fmadd.d $fa2, $fa4, $fa1, $fa2 +; CHECK-NEXT: fmul.d $fa1, $fa1, $fa3 +; CHECK-NEXT: fmul.d $fa1, $fa1, $fa2 +; CHECK-NEXT: fmul.d $fa2, $fa1, $fa5 +; CHECK-NEXT: fmul.d $fa0, $fa0, $fa1 +; CHECK-NEXT: fst.d $fa1, $a0, 0 +; CHECK-NEXT: fst.d $fa2, $a1, 0 +; CHECK-NEXT: ret + %sqrt = tail call fast double @llvm.sqrt.f64(double %x) + %rsqrt = fdiv fast double 1.0, %sqrt + %r = fdiv fast double 42.0, %sqrt + %sqrt_fast = fdiv fast double %x, %sqrt + store double %rsqrt, ptr %p1, align 8 + store double %r, ptr %p2, align 8 + ret double %sqrt_fast +} + +define double @sqrt_simplify_before_recip_3_uses_order(double %x, ptr %p1, ptr %p2) nounwind { +; 
FAULT-LABEL: sqrt_simplify_before_recip_3_uses_order: +; FAULT: # %bb.0: +; FAULT-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0) +; FAULT-NEXT: fld.d $fa1, $a2, %pc_lo12(.LCPI3_0) +; FAULT-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_1) +; FAULT-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI3_1) +; FAULT-NEXT: fsqrt.d $fa0, $fa0 +; FAULT-NEXT: fdiv.d $fa1, $fa1, $fa0 +; FAULT-NEXT: fdiv.d $fa2, $fa2, $fa0 +; FAULT-NEXT: fst.d $fa1, $a0, 0 +; FAULT-NEXT: fst.d $fa2, $a1, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: sqrt_simplify_before_recip_3_uses_order: +; CHECK: # %bb.0: +; CHECK-NEXT: frsqrte.d $fa1, $fa0 +; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0) +; CHECK-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI3_0) +; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_1) +; CHECK-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI3_1) +; CHECK-NEXT: fmul.d $fa1, $fa0, $fa1 +; CHECK-NEXT: fmul.d $fa4, $fa0, $fa1 +; CHECK-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 +; CHECK-NEXT: fmul.d $fa1, $fa1, $fa3 +; CHECK-NEXT: fmul.d $fa1, $fa1, $fa4 +; CHECK-NEXT: fmul.d $fa4, $fa0, $fa1 +; CHECK-NEXT: fmadd.d $fa2, $fa4, $fa1, $fa2 +; CHECK-NEXT: fmul.d $fa1, $fa1, $fa3 +; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_2) +; CHECK-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI3_2) +; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_3) +; CHECK-NEXT: fld.d $fa4, $a2, %pc_lo12(.LCPI3_3) +; CHECK-NEXT: fmul.d $fa1, $fa1, $fa2 +; CHECK-NEXT: fmul.d $fa0, $fa0, $fa1 +; CHECK-NEXT: fmul.d $fa2, $fa1, $fa3 +; CHECK-NEXT: fmul.d $fa1, $fa1, $fa4 +; CHECK-NEXT: fst.d $fa2, $a0, 0 +; CHECK-NEXT: fst.d $fa1, $a1, 0 +; CHECK-NEXT: ret + %sqrt = tail call fast double @llvm.sqrt.f64(double %x) + %sqrt_fast = fdiv fast double %x, %sqrt + %r1 = fdiv fast double 42.0, %sqrt + %r2 = fdiv fast double 43.0, %sqrt + store double %r1, ptr %p1, align 8 + store double %r2, ptr %p2, align 8 + ret double %sqrt_fast +} + + +define double @sqrt_simplify_before_recip_4_uses(double %x, ptr %p1, ptr %p2, ptr %p3) nounwind { +; FAULT-LABEL: sqrt_simplify_before_recip_4_uses: +; FAULT: # %bb.0: +; FAULT-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_0) +; FAULT-NEXT: fld.d $fa2, $a3, %pc_lo12(.LCPI4_0) +; FAULT-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_1) +; FAULT-NEXT: fld.d $fa3, $a3, %pc_lo12(.LCPI4_1) +; FAULT-NEXT: fsqrt.d $fa1, $fa0 +; FAULT-NEXT: frsqrt.d $fa0, $fa0 +; FAULT-NEXT: fdiv.d $fa2, $fa2, $fa1 +; FAULT-NEXT: fdiv.d $fa3, $fa3, $fa1 +; FAULT-NEXT: fst.d $fa0, $a0, 0 +; FAULT-NEXT: fst.d $fa2, $a1, 0 +; FAULT-NEXT: fst.d $fa3, $a2, 0 +; FAULT-NEXT: fmov.d $fa0, $fa1 +; FAULT-NEXT: ret +; +; CHECK-LABEL: sqrt_simplify_before_recip_4_uses: +; CHECK: # %bb.0: +; CHECK-NEXT: frsqrte.d $fa1, $fa0 +; CHECK-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_0) +; CHECK-NEXT: fld.d $fa2, $a3, %pc_lo12(.LCPI4_0) +; CHECK-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_1) +; CHECK-NEXT: fld.d $fa3, $a3, %pc_lo12(.LCPI4_1) +; CHECK-NEXT: fmul.d $fa1, $fa0, $fa1 +; CHECK-NEXT: fmul.d $fa4, $fa0, $fa1 +; CHECK-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 +; CHECK-NEXT: fmul.d $fa1, $fa1, $fa3 +; CHECK-NEXT: fmul.d $fa1, $fa1, $fa4 +; CHECK-NEXT: fmul.d $fa4, $fa0, $fa1 +; CHECK-NEXT: fmadd.d $fa2, $fa4, $fa1, $fa2 +; CHECK-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_2) +; CHECK-NEXT: fld.d $fa4, $a3, %pc_lo12(.LCPI4_2) +; CHECK-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_3) +; CHECK-NEXT: fld.d $fa5, $a3, %pc_lo12(.LCPI4_3) +; CHECK-NEXT: fmul.d $fa1, $fa1, $fa3 +; CHECK-NEXT: fmul.d $fa1, $fa1, $fa2 +; CHECK-NEXT: fmul.d $fa2, $fa1, $fa4 +; CHECK-NEXT: fmul.d $fa3, $fa1, $fa5 +; CHECK-NEXT: fmul.d $fa0, $fa0, $fa1 +; CHECK-NEXT: fst.d $fa1, $a0, 0 +; CHECK-NEXT: fst.d 
$fa2, $a1, 0 +; CHECK-NEXT: fst.d $fa3, $a2, 0 +; CHECK-NEXT: ret + %sqrt = tail call fast double @llvm.sqrt.f64(double %x) + %rsqrt = fdiv fast double 1.0, %sqrt + %r1 = fdiv fast double 42.0, %sqrt + %r2 = fdiv fast double 43.0, %sqrt + %sqrt_fast = fdiv fast double %x, %sqrt + store double %rsqrt, ptr %p1, align 8 + store double %r1, ptr %p2, align 8 + store double %r2, ptr %p3, align 8 + ret double %sqrt_fast +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/fdiv-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/lasx/fdiv-reciprocal-estimate.ll new file mode 100644 index 00000000000000..769d9ef81faf39 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/fdiv-reciprocal-estimate.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx,-frecipe < %s | FileCheck %s --check-prefix=FAULT +; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s + +define void @fdiv_v8f32(ptr %res, ptr %a0, ptr %a1) nounwind { +; FAULT-LABEL: fdiv_v8f32: +; FAULT: # %bb.0: +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvld $xr1, $a2, 0 +; FAULT-NEXT: xvfdiv.s $xr0, $xr0, $xr1 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: fdiv_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: xvfrecipe.s $xr2, $xr0 +; CHECK-NEXT: xvfmul.s $xr3, $xr1, $xr2 +; CHECK-NEXT: xvfnmsub.s $xr0, $xr0, $xr3, $xr1 +; CHECK-NEXT: xvfmadd.s $xr0, $xr2, $xr0, $xr3 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0 + %v1 = load <8 x float>, ptr %a1 + %v2 = fdiv fast <8 x float> %v0, %v1 + store <8 x float> %v2, ptr %res + ret void +} + +define void @fdiv_v4f64(ptr %res, ptr %a0, ptr %a1) nounwind { +; FAULT-LABEL: fdiv_v4f64: +; FAULT: # %bb.0: +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvld $xr1, $a2, 0 +; FAULT-NEXT: xvfdiv.d $xr0, $xr0, $xr1 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: fdiv_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: lu52i.d $a1, $zero, -1025 +; CHECK-NEXT: xvreplgr2vr.d $xr2, $a1 +; CHECK-NEXT: xvfrecipe.d $xr3, $xr0 +; CHECK-NEXT: xvfmadd.d $xr2, $xr0, $xr3, $xr2 +; CHECK-NEXT: xvfnmsub.d $xr2, $xr2, $xr3, $xr3 +; CHECK-NEXT: xvfmul.d $xr3, $xr1, $xr2 +; CHECK-NEXT: xvfnmsub.d $xr0, $xr0, $xr3, $xr1 +; CHECK-NEXT: xvfmadd.d $xr0, $xr2, $xr0, $xr3 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0 + %v1 = load <4 x double>, ptr %a1 + %v2 = fdiv fast <4 x double> %v0, %v1 + store <4 x double> %v2, ptr %res + ret void +} + +;; 1.0 / vec +define void @one_fdiv_v8f32(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_fdiv_v8f32: +; FAULT: # %bb.0: +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvfrecip.s $xr0, $xr0 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_fdiv_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrecipe.s $xr1, $xr0 +; CHECK-NEXT: lu12i.w $a1, -264192 +; CHECK-NEXT: xvreplgr2vr.w $xr2, $a1 +; CHECK-NEXT: xvfmadd.s $xr0, $xr0, $xr1, $xr2 +; CHECK-NEXT: xvfnmsub.s $xr0, $xr0, $xr1, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0 + %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %v0 + store <8 x float> %div, ptr %res 
+ ret void +} + +define void @one_fdiv_v4f64(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_fdiv_v4f64: +; FAULT: # %bb.0: +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvfrecip.d $xr0, $xr0 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_fdiv_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrecipe.d $xr1, $xr0 +; CHECK-NEXT: lu52i.d $a1, $zero, 1023 +; CHECK-NEXT: xvreplgr2vr.d $xr2, $a1 +; CHECK-NEXT: xvfnmsub.d $xr3, $xr0, $xr1, $xr2 +; CHECK-NEXT: xvfmadd.d $xr1, $xr1, $xr3, $xr1 +; CHECK-NEXT: xvfnmsub.d $xr0, $xr0, $xr1, $xr2 +; CHECK-NEXT: xvfmadd.d $xr0, $xr1, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0 + %div = fdiv fast <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>, %v0 + store <4 x double> %div, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/fsqrt-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/lasx/fsqrt-reciprocal-estimate.ll new file mode 100644 index 00000000000000..48fd12697417ac --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/fsqrt-reciprocal-estimate.ll @@ -0,0 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 --mattr=+lasx,-frecipe < %s | FileCheck %s --check-prefix=FAULT +; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s + +;; 1.0 / (fsqrt vec) +define void @one_div_sqrt_v8f32(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_div_sqrt_v8f32: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvfrsqrt.s $xr0, $xr0 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_div_sqrt_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrsqrte.s $xr1, $xr0 +; CHECK-NEXT: xvfmul.s $xr1, $xr0, $xr1 +; CHECK-NEXT: xvfmul.s $xr0, $xr0, $xr1 +; CHECK-NEXT: lu12i.w $a1, -261120 +; CHECK-NEXT: xvreplgr2vr.w $xr2, $a1 +; CHECK-NEXT: xvfmadd.s $xr0, $xr0, $xr1, $xr2 +; CHECK-NEXT: lu12i.w $a1, -266240 +; CHECK-NEXT: xvreplgr2vr.w $xr2, $a1 +; CHECK-NEXT: xvfmul.s $xr1, $xr1, $xr2 +; CHECK-NEXT: xvfmul.s $xr0, $xr1, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0, align 16 + %sqrt = call fast <8 x float> @llvm.sqrt.v8f32 (<8 x float> %v0) + %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt + store <8 x float> %div, ptr %res, align 16 + ret void +} + +define void @one_div_sqrt_v4f64(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_div_sqrt_v4f64: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvfrsqrt.d $xr0, $xr0 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_div_sqrt_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrsqrte.d $xr1, $xr0 +; CHECK-NEXT: xvfmul.d $xr1, $xr0, $xr1 +; CHECK-NEXT: xvfmul.d $xr2, $xr0, $xr1 +; CHECK-NEXT: ori $a1, $zero, 0 +; CHECK-NEXT: lu32i.d $a1, -524288 +; CHECK-NEXT: lu52i.d $a1, $a1, -1024 +; CHECK-NEXT: xvreplgr2vr.d $xr3, $a1 +; CHECK-NEXT: xvfmadd.d $xr2, $xr2, $xr1, $xr3 +; CHECK-NEXT: lu52i.d $a1, $zero, -1026 +; CHECK-NEXT: xvreplgr2vr.d $xr4, $a1 +; CHECK-NEXT: xvfmul.d $xr1, $xr1, $xr4 +; CHECK-NEXT: xvfmul.d $xr1, $xr1, $xr2 +; CHECK-NEXT: xvfmul.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvfmadd.d $xr0, $xr0, $xr1, $xr3 +; CHECK-NEXT: xvfmul.d $xr1, $xr1, $xr4 +; CHECK-NEXT: xvfmul.d $xr0, 
$xr1, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0, align 16 + %sqrt = call fast <4 x double> @llvm.sqrt.v4f64 (<4 x double> %v0) + %div = fdiv fast <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>, %sqrt + store <4 x double> %div, ptr %res, align 16 + ret void +} + +declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/fdiv-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/lsx/fdiv-reciprocal-estimate.ll new file mode 100644 index 00000000000000..21dbbf310ad870 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/fdiv-reciprocal-estimate.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx,-frecipe < %s | FileCheck %s --check-prefix=FAULT +; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s + +define void @fdiv_v4f32(ptr %res, ptr %a0, ptr %a1) nounwind { +; FAULT-LABEL: fdiv_v4f32: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: vld $vr0, $a1, 0 +; FAULT-NEXT: vld $vr1, $a2, 0 +; FAULT-NEXT: vfdiv.s $vr0, $vr0, $vr1 +; FAULT-NEXT: vst $vr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: fdiv_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vfrecipe.s $vr2, $vr0 +; CHECK-NEXT: vfmul.s $vr3, $vr1, $vr2 +; CHECK-NEXT: vfnmsub.s $vr0, $vr0, $vr3, $vr1 +; CHECK-NEXT: vfmadd.s $vr0, $vr2, $vr0, $vr3 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %a0 + %v1 = load <4 x float>, ptr %a1 + %v2 = fdiv fast <4 x float> %v0, %v1 + store <4 x float> %v2, ptr %res + ret void +} + +define void @fdiv_v2f64(ptr %res, ptr %a0, ptr %a1) nounwind { +; FAULT-LABEL: fdiv_v2f64: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: vld $vr0, $a1, 0 +; FAULT-NEXT: vld $vr1, $a2, 0 +; FAULT-NEXT: vfdiv.d $vr0, $vr0, $vr1 +; FAULT-NEXT: vst $vr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: fdiv_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: lu52i.d $a1, $zero, -1025 +; CHECK-NEXT: vreplgr2vr.d $vr2, $a1 +; CHECK-NEXT: vfrecipe.d $vr3, $vr0 +; CHECK-NEXT: vfmadd.d $vr2, $vr0, $vr3, $vr2 +; CHECK-NEXT: vfnmsub.d $vr2, $vr2, $vr3, $vr3 +; CHECK-NEXT: vfmul.d $vr3, $vr1, $vr2 +; CHECK-NEXT: vfnmsub.d $vr0, $vr0, $vr3, $vr1 +; CHECK-NEXT: vfmadd.d $vr0, $vr2, $vr0, $vr3 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %a0 + %v1 = load <2 x double>, ptr %a1 + %v2 = fdiv fast <2 x double> %v0, %v1 + store <2 x double> %v2, ptr %res + ret void +} + +;; 1.0 / vec +define void @one_fdiv_v4f32(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_fdiv_v4f32: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: vld $vr0, $a1, 0 +; FAULT-NEXT: vfrecip.s $vr0, $vr0 +; FAULT-NEXT: vst $vr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_fdiv_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrecipe.s $vr1, $vr0 +; CHECK-NEXT: lu12i.w $a1, -264192 +; CHECK-NEXT: vreplgr2vr.w $vr2, $a1 +; CHECK-NEXT: vfmadd.s $vr0, $vr0, $vr1, $vr2 +; CHECK-NEXT: vfnmsub.s $vr0, $vr0, $vr1, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %a0 + %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %v0 + store <4 x float> %div, ptr %res + ret void +} + +define void 
@one_fdiv_v2f64(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_fdiv_v2f64: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: vld $vr0, $a1, 0 +; FAULT-NEXT: vfrecip.d $vr0, $vr0 +; FAULT-NEXT: vst $vr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_fdiv_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrecipe.d $vr1, $vr0 +; CHECK-NEXT: lu52i.d $a1, $zero, 1023 +; CHECK-NEXT: vreplgr2vr.d $vr2, $a1 +; CHECK-NEXT: vfnmsub.d $vr3, $vr0, $vr1, $vr2 +; CHECK-NEXT: vfmadd.d $vr1, $vr1, $vr3, $vr1 +; CHECK-NEXT: vfnmsub.d $vr0, $vr0, $vr1, $vr2 +; CHECK-NEXT: vfmadd.d $vr0, $vr1, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %a0 + %div = fdiv fast <2 x double> <double 1.0, double 1.0>, %v0 + store <2 x double> %div, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/fsqrt-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/lsx/fsqrt-reciprocal-estimate.ll new file mode 100644 index 00000000000000..912d06242f7d3e --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/fsqrt-reciprocal-estimate.ll @@ -0,0 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 --mattr=+lsx,-frecipe < %s | FileCheck %s --check-prefix=FAULT +; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s + +;; 1.0 / (fsqrt vec) +define void @one_div_sqrt_v4f32(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_div_sqrt_v4f32: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: vld $vr0, $a1, 0 +; FAULT-NEXT: vfrsqrt.s $vr0, $vr0 +; FAULT-NEXT: vst $vr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL one_div_sqrt_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrsqrte.s $vr1, $vr0 +; CHECK-NEXT: vfmul.s $vr1, $vr0, $vr1 +; CHECK-NEXT: vfmul.s $vr0, $vr0, $vr1 +; CHECK-NEXT: lu12i.w $a1, -261120 +; CHECK-NEXT: vreplgr2vr.w $vr2, $a1 +; CHECK-NEXT: vfmadd.s $vr0, $vr0, $vr1, $vr2 +; CHECK-NEXT: lu12i.w $a1, -266240 +; CHECK-NEXT: vreplgr2vr.w $vr2, $a1 +; CHECK-NEXT: vfmul.s $vr1, $vr1, $vr2 +; CHECK-NEXT: vfmul.s $vr0, $vr1, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %a0, align 16 + %sqrt = call fast <4 x float> @llvm.sqrt.v4f32 (<4 x float> %v0) + %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt + store <4 x float> %div, ptr %res, align 16 + ret void +} + +define void @one_div_sqrt_v2f64(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_div_sqrt_v2f64: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: vld $vr0, $a1, 0 +; FAULT-NEXT: vfrsqrt.d $vr0, $vr0 +; FAULT-NEXT: vst $vr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL one_div_sqrt_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrsqrte.d $vr1, $vr0 +; CHECK-NEXT: vfmul.d $vr1, $vr0, $vr1 +; CHECK-NEXT: vfmul.d $vr2, $vr0, $vr1 +; CHECK-NEXT: ori $a1, $zero, 0 +; CHECK-NEXT: lu32i.d $a1, -524288 +; CHECK-NEXT: lu52i.d $a1, $a1, -1024 +; CHECK-NEXT: vreplgr2vr.d $vr3, $a1 +; CHECK-NEXT: vfmadd.d $vr2, $vr2, $vr1, $vr3 +; CHECK-NEXT: lu52i.d $a1, $zero, -1026 +; CHECK-NEXT: vreplgr2vr.d $vr4, $a1 +; CHECK-NEXT: vfmul.d $vr1, $vr1, $vr4 +; CHECK-NEXT: vfmul.d $vr1, $vr1, $vr2 +; CHECK-NEXT: vfmul.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vfmadd.d $vr0, $vr0, $vr1, $vr3 +; CHECK-NEXT: vfmul.d $vr1, $vr1, $vr4 +; CHECK-NEXT: vfmul.d $vr0, $vr1, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %a0, align 16 + %sqrt = call fast 
<2 x double> @llvm.sqrt.v2f64 (<2 x double> %v0) + %div = fdiv fast <2 x double> <double 1.0, double 1.0>, %sqrt + store <2 x double> %div, ptr %res, align 16 + ret void +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) +declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) >From 0ce3290d76479b301aeb6b84b820cccd2716f37b Mon Sep 17 00:00:00 2001 From: tangaac <tangya...@loongson.cn> Date: Sun, 29 Sep 2024 14:37:47 +0800 Subject: [PATCH 2/2] fix that frecipe.s doesn't work with `+f` & add some tests --- .../lib/Driver/ToolChains/Arch/LoongArch.cpp | 6 +- .../LoongArch/fdiv-reciprocal-estimate.ll | 93 +++- .../LoongArch/fsqrt-reciprocal-estimate.ll | 503 ++++++++++++------ 3 files changed, 411 insertions(+), 191 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp index 62233a32d0d396..4919d17a3580a6 100644 --- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp +++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp @@ -255,13 +255,9 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D, // Select frecipe feature determined by -m[no-]frecipe. if (const Arg *A = Args.getLastArg(options::OPT_mfrecipe, options::OPT_mno_frecipe)) { - // FRECIPE depends on 64-bit FPU. // -mno-frecipe conflicts with -mfrecipe. if (A->getOption().matches(options::OPT_mfrecipe)) { - if (llvm::find(Features, "-d") != Features.end()) - D.Diag(diag::err_drv_loongarch_wrong_fpu_width) << /*FRECIPE*/ 2; - else /*-mfrecipe*/ - Features.push_back("+frecipe"); + Features.push_back("+frecipe"); } else /*-mnofrecipe*/ Features.push_back("-frecipe"); } diff --git a/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll index b4b280a43055f1..3f38bbed881a32 100644 --- a/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll +++ b/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll @@ -1,43 +1,80 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc --mtriple=loongarch64 --mattr=+d,-frecipe < %s | FileCheck %s --check-prefix=FAULT -; RUN: llc --mtriple=loongarch64 --mattr=+d,+frecipe < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+f,-d,-frecipe < %s | FileCheck %s --check-prefix=LA32F +; RUN: llc --mtriple=loongarch32 --mattr=+f,-d,+frecipe < %s | FileCheck %s --check-prefix=LA32F-FRECIPE +; RUN: llc --mtriple=loongarch64 --mattr=+d,-frecipe < %s | FileCheck %s --check-prefix=LA64D +; RUN: llc --mtriple=loongarch64 --mattr=+d,+frecipe < %s | FileCheck %s --check-prefix=LA64D-FRECIPE ;; Exercise the 'fdiv' LLVM IR: https://llvm.org/docs/LangRef.html#fdiv-instruction define float @fdiv_s(float %x, float %y) { -; FAULT-LABEL: fdiv_s: -; FAULT: # %bb.0: -; FAULT-NEXT: fdiv.s $fa0, $fa0, $fa1 -; FAULT-NEXT: ret +; LA32F-LABEL: fdiv_s: +; LA32F: # %bb.0: +; LA32F-NEXT: fdiv.s $fa0, $fa0, $fa1 +; LA32F-NEXT: ret ; -; CHECK-LABEL: fdiv_s: -; CHECK: # %bb.0: -; CHECK-NEXT: frecipe.s $fa2, $fa1 -; CHECK-NEXT: fmul.s $fa3, $fa0, $fa2 -; CHECK-NEXT: fnmsub.s $fa0, $fa1, $fa3, $fa0 -; CHECK-NEXT: fmadd.s $fa0, $fa2, $fa0, $fa3 -; CHECK-NEXT: ret +; LA32F-FRECIPE-LABEL: fdiv_s: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: frecipe.s $fa2, $fa1 +; LA32F-FRECIPE-NEXT: fmul.s $fa3, $fa0, $fa2 +; LA32F-FRECIPE-NEXT: fnmsub.s $fa0, $fa1, $fa3, $fa0 +; LA32F-FRECIPE-NEXT: fmadd.s $fa0, $fa2, $fa0, $fa3 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: fdiv_s: +; LA64D: # %bb.0: +; LA64D-NEXT: fdiv.s $fa0, $fa0, $fa1 +; LA64D-NEXT: ret +; +; 
LA64D-FRECIPE-LABEL: fdiv_s: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frecipe.s $fa2, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa3, $fa0, $fa2 +; LA64D-FRECIPE-NEXT: fnmsub.s $fa0, $fa1, $fa3, $fa0 +; LA64D-FRECIPE-NEXT: fmadd.s $fa0, $fa2, $fa0, $fa3 +; LA64D-FRECIPE-NEXT: ret %div = fdiv fast float %x, %y ret float %div } define double @fdiv_d(double %x, double %y) { -; FAULT-LABEL: fdiv_d: -; FAULT: # %bb.0: -; FAULT-NEXT: fdiv.d $fa0, $fa0, $fa1 -; FAULT-NEXT: ret +; LA32F-LABEL: fdiv_d: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -16 +; LA32F-NEXT: .cfi_def_cfa_offset 16 +; LA32F-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-NEXT: .cfi_offset 1, -4 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-NEXT: addi.w $sp, $sp, 16 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: fdiv_d: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, -16 +; LA32F-FRECIPE-NEXT: .cfi_def_cfa_offset 16 +; LA32F-FRECIPE-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: .cfi_offset 1, -4 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, 16 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: fdiv_d: +; LA64D: # %bb.0: +; LA64D-NEXT: fdiv.d $fa0, $fa0, $fa1 +; LA64D-NEXT: ret ; -; CHECK-LABEL: fdiv_d: -; CHECK: # %bb.0: -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) -; CHECK-NEXT: fld.d $fa2, $a0, %pc_lo12(.LCPI1_0) -; CHECK-NEXT: frecipe.d $fa3, $fa1 -; CHECK-NEXT: fmadd.d $fa2, $fa1, $fa3, $fa2 -; CHECK-NEXT: fnmsub.d $fa2, $fa2, $fa3, $fa3 -; CHECK-NEXT: fmul.d $fa3, $fa0, $fa2 -; CHECK-NEXT: fnmsub.d $fa0, $fa1, $fa3, $fa0 -; CHECK-NEXT: fmadd.d $fa0, $fa2, $fa0, $fa3 -; CHECK-NEXT: ret +; LA64D-FRECIPE-LABEL: fdiv_d: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) +; LA64D-FRECIPE-NEXT: fld.d $fa2, $a0, %pc_lo12(.LCPI1_0) +; LA64D-FRECIPE-NEXT: frecipe.d $fa3, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa1, $fa3, $fa2 +; LA64D-FRECIPE-NEXT: fnmsub.d $fa2, $fa2, $fa3, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa3, $fa0, $fa2 +; LA64D-FRECIPE-NEXT: fnmsub.d $fa0, $fa1, $fa3, $fa0 +; LA64D-FRECIPE-NEXT: fmadd.d $fa0, $fa2, $fa0, $fa3 +; LA64D-FRECIPE-NEXT: ret %div = fdiv fast double %x, %y ret double %div } diff --git a/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll index d683487fdd4073..333308474c344f 100644 --- a/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll +++ b/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll @@ -1,29 +1,51 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc --mtriple=loongarch64 --mattr=+d,-frecipe < %s | FileCheck %s --check-prefix=FAULT -; RUN: llc --mtriple=loongarch64 --mattr=+d,+frecipe < %s | FileCheck %s +; RUN: llc --mtriple=loongarch32 --mattr=+f,-d,-frecipe < %s | FileCheck %s --check-prefix=LA32F +; RUN: llc --mtriple=loongarch32 --mattr=+f,-d,+frecipe < %s | FileCheck %s --check-prefix=LA32F-FRECIPE +; RUN: llc --mtriple=loongarch64 --mattr=+d,-frecipe < %s | FileCheck %s --check-prefix=LA64D +; RUN: llc --mtriple=loongarch64 --mattr=+d,+frecipe < %s | FileCheck %s --check-prefix=LA64D-FRECIPE + declare float @llvm.sqrt.f32(float) declare double @llvm.sqrt.f64(double) define float @frsqrt_f32(float %a) nounwind { -; FAULT-LABEL: frsqrt_f32: -; FAULT: # %bb.0: -; FAULT-NEXT: frsqrt.s $fa0, $fa0 -; FAULT-NEXT: ret +; LA32F-LABEL: 
frsqrt_f32: +; LA32F: # %bb.0: +; LA32F-NEXT: frsqrt.s $fa0, $fa0 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: frsqrt_f32: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA32F-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) +; LA32F-FRECIPE-NEXT: fld.s $fa2, $a0, %pc_lo12(.LCPI0_0) +; LA32F-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1) +; LA32F-FRECIPE-NEXT: fld.s $fa3, $a0, %pc_lo12(.LCPI0_1) +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa3 +; LA32F-FRECIPE-NEXT: fmul.s $fa0, $fa1, $fa0 +; LA32F-FRECIPE-NEXT: ret ; -; CHECK-LABEL: frsqrt_f32: -; CHECK: # %bb.0: -; CHECK-NEXT: frsqrte.s $fa1, $fa0 -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) -; CHECK-NEXT: fld.s $fa2, $a0, %pc_lo12(.LCPI0_0) -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1) -; CHECK-NEXT: fld.s $fa3, $a0, %pc_lo12(.LCPI0_1) -; CHECK-NEXT: fmul.s $fa1, $fa0, $fa1 -; CHECK-NEXT: fmul.s $fa0, $fa0, $fa1 -; CHECK-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 -; CHECK-NEXT: fmul.s $fa1, $fa1, $fa3 -; CHECK-NEXT: fmul.s $fa0, $fa1, $fa0 -; CHECK-NEXT: ret +; LA64D-LABEL: frsqrt_f32: +; LA64D: # %bb.0: +; LA64D-NEXT: frsqrt.s $fa0, $fa0 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: frsqrt_f32: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) +; LA64D-FRECIPE-NEXT: fld.s $fa2, $a0, %pc_lo12(.LCPI0_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1) +; LA64D-FRECIPE-NEXT: fld.s $fa3, $a0, %pc_lo12(.LCPI0_1) +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa1, $fa0 +; LA64D-FRECIPE-NEXT: ret %1 = call fast float @llvm.sqrt.f32(float %a) %2 = fdiv fast float 1.0, %1 @@ -31,69 +53,97 @@ define float @frsqrt_f32(float %a) nounwind { } define double @frsqrt_f64(double %a) nounwind { -; FAULT-LABEL: frsqrt_f64: -; FAULT: # %bb.0: -; FAULT-NEXT: frsqrt.d $fa0, $fa0 -; FAULT-NEXT: ret +; LA32F-LABEL: frsqrt_f64: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -16 +; LA32F-NEXT: st.w $ra, $sp, 12 +; LA32F-NEXT: bl %plt(sqrt) +; LA32F-NEXT: move $a2, $a0 +; LA32F-NEXT: move $a3, $a1 +; LA32F-NEXT: lu12i.w $a1, 261888 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: ld.w $ra, $sp, 12 +; LA32F-NEXT: addi.w $sp, $sp, 16 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: frsqrt_f64: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, -16 +; LA32F-FRECIPE-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: bl %plt(sqrt) +; LA32F-FRECIPE-NEXT: move $a2, $a0 +; LA32F-FRECIPE-NEXT: move $a3, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 261888 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, 16 +; LA32F-FRECIPE-NEXT: ret ; -; CHECK-LABEL: frsqrt_f64: -; CHECK: # %bb.0: -; CHECK-NEXT: frsqrte.d $fa1, $fa0 -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) -; CHECK-NEXT: fld.d $fa2, $a0, %pc_lo12(.LCPI1_0) -; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_1) -; CHECK-NEXT: fld.d $fa3, $a0, %pc_lo12(.LCPI1_1) -; CHECK-NEXT: fmul.d $fa1, $fa0, $fa1 -; CHECK-NEXT: fmul.d $fa4, $fa0, $fa1 -; CHECK-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 -; 
CHECK-NEXT: fmul.d $fa1, $fa1, $fa3 -; CHECK-NEXT: fmul.d $fa1, $fa1, $fa4 -; CHECK-NEXT: fmul.d $fa0, $fa0, $fa1 -; CHECK-NEXT: fmadd.d $fa0, $fa0, $fa1, $fa2 -; CHECK-NEXT: fmul.d $fa1, $fa1, $fa3 -; CHECK-NEXT: fmul.d $fa0, $fa1, $fa0 -; CHECK-NEXT: ret +; LA64D-LABEL: frsqrt_f64: +; LA64D: # %bb.0: +; LA64D-NEXT: frsqrt.d $fa0, $fa0 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: frsqrt_f64: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0 +; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) +; LA64D-FRECIPE-NEXT: fld.d $fa2, $a0, %pc_lo12(.LCPI1_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_1) +; LA64D-FRECIPE-NEXT: fld.d $fa3, $a0, %pc_lo12(.LCPI1_1) +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa0, $fa0, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa1, $fa0 +; LA64D-FRECIPE-NEXT: ret %1 = call fast double @llvm.sqrt.f64(double %a) %2 = fdiv fast double 1.0, %1 ret double %2 } -define double @sqrt_simplify_before_recip_3_uses(double %x, ptr %p1, ptr %p2) nounwind { -; FAULT-LABEL: sqrt_simplify_before_recip_3_uses: -; FAULT: # %bb.0: -; FAULT-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_0) -; FAULT-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI2_0) -; FAULT-NEXT: fsqrt.d $fa1, $fa0 -; FAULT-NEXT: frsqrt.d $fa0, $fa0 -; FAULT-NEXT: fdiv.d $fa2, $fa2, $fa1 -; FAULT-NEXT: fst.d $fa0, $a0, 0 -; FAULT-NEXT: fst.d $fa2, $a1, 0 -; FAULT-NEXT: fmov.d $fa0, $fa1 -; FAULT-NEXT: ret +define double @sqrt_simplify_before_recip_3_uses_f64(double %x, ptr %p1, ptr %p2) nounwind { +; LA64D-LABEL: sqrt_simplify_before_recip_3_uses_f64: +; LA64D: # %bb.0: +; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_0) +; LA64D-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI2_0) +; LA64D-NEXT: fsqrt.d $fa1, $fa0 +; LA64D-NEXT: frsqrt.d $fa0, $fa0 +; LA64D-NEXT: fdiv.d $fa2, $fa2, $fa1 +; LA64D-NEXT: fst.d $fa0, $a0, 0 +; LA64D-NEXT: fst.d $fa2, $a1, 0 +; LA64D-NEXT: fmov.d $fa0, $fa1 +; LA64D-NEXT: ret ; -; CHECK-LABEL: sqrt_simplify_before_recip_3_uses: -; CHECK: # %bb.0: -; CHECK-NEXT: frsqrte.d $fa1, $fa0 -; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_0) -; CHECK-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI2_0) -; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_1) -; CHECK-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI2_1) -; CHECK-NEXT: fmul.d $fa1, $fa0, $fa1 -; CHECK-NEXT: fmul.d $fa4, $fa0, $fa1 -; CHECK-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 -; CHECK-NEXT: fmul.d $fa1, $fa1, $fa3 -; CHECK-NEXT: fmul.d $fa1, $fa1, $fa4 -; CHECK-NEXT: fmul.d $fa4, $fa0, $fa1 -; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_2) -; CHECK-NEXT: fld.d $fa5, $a2, %pc_lo12(.LCPI2_2) -; CHECK-NEXT: fmadd.d $fa2, $fa4, $fa1, $fa2 -; CHECK-NEXT: fmul.d $fa1, $fa1, $fa3 -; CHECK-NEXT: fmul.d $fa1, $fa1, $fa2 -; CHECK-NEXT: fmul.d $fa2, $fa1, $fa5 -; CHECK-NEXT: fmul.d $fa0, $fa0, $fa1 -; CHECK-NEXT: fst.d $fa1, $a0, 0 -; CHECK-NEXT: fst.d $fa2, $a1, 0 -; CHECK-NEXT: ret +; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_f64: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_0) +; LA64D-FRECIPE-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI2_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_1) +; LA64D-FRECIPE-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI2_1) +; 
LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1
+; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1
+; LA64D-FRECIPE-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2
+; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3
+; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4
+; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1
+; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_2)
+; LA64D-FRECIPE-NEXT: fld.d $fa5, $a2, %pc_lo12(.LCPI2_2)
+; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa4, $fa1, $fa2
+; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3
+; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa2
+; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa1, $fa5
+; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1
+; LA64D-FRECIPE-NEXT: fst.d $fa1, $a0, 0
+; LA64D-FRECIPE-NEXT: fst.d $fa2, $a1, 0
+; LA64D-FRECIPE-NEXT: ret
 %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
 %rsqrt = fdiv fast double 1.0, %sqrt
 %r = fdiv fast double 42.0, %sqrt
@@ -103,46 +153,47 @@ define double @sqrt_simplify_before_recip_3_uses(double %x, ptr %p1, ptr %p2) no
 ret double %sqrt_fast
 }
-define double @sqrt_simplify_before_recip_3_uses_order(double %x, ptr %p1, ptr %p2) nounwind {
-; FAULT-LABEL: sqrt_simplify_before_recip_3_uses_order:
-; FAULT: # %bb.0:
-; FAULT-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0)
-; FAULT-NEXT: fld.d $fa1, $a2, %pc_lo12(.LCPI3_0)
-; FAULT-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_1)
-; FAULT-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI3_1)
-; FAULT-NEXT: fsqrt.d $fa0, $fa0
-; FAULT-NEXT: fdiv.d $fa1, $fa1, $fa0
-; FAULT-NEXT: fdiv.d $fa2, $fa2, $fa0
-; FAULT-NEXT: fst.d $fa1, $a0, 0
-; FAULT-NEXT: fst.d $fa2, $a1, 0
-; FAULT-NEXT: ret
+
+define double @sqrt_simplify_before_recip_3_uses_order_f64(double %x, ptr %p1, ptr %p2) nounwind {
+; LA64D-LABEL: sqrt_simplify_before_recip_3_uses_order_f64:
+; LA64D: # %bb.0:
+; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0)
+; LA64D-NEXT: fld.d $fa1, $a2, %pc_lo12(.LCPI3_0)
+; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_1)
+; LA64D-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI3_1)
+; LA64D-NEXT: fsqrt.d $fa0, $fa0
+; LA64D-NEXT: fdiv.d $fa1, $fa1, $fa0
+; LA64D-NEXT: fdiv.d $fa2, $fa2, $fa0
+; LA64D-NEXT: fst.d $fa1, $a0, 0
+; LA64D-NEXT: fst.d $fa2, $a1, 0
+; LA64D-NEXT: ret
 ;
-; CHECK-LABEL: sqrt_simplify_before_recip_3_uses_order:
-; CHECK: # %bb.0:
-; CHECK-NEXT: frsqrte.d $fa1, $fa0
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0)
-; CHECK-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI3_0)
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_1)
-; CHECK-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI3_1)
-; CHECK-NEXT: fmul.d $fa1, $fa0, $fa1
-; CHECK-NEXT: fmul.d $fa4, $fa0, $fa1
-; CHECK-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2
-; CHECK-NEXT: fmul.d $fa1, $fa1, $fa3
-; CHECK-NEXT: fmul.d $fa1, $fa1, $fa4
-; CHECK-NEXT: fmul.d $fa4, $fa0, $fa1
-; CHECK-NEXT: fmadd.d $fa2, $fa4, $fa1, $fa2
-; CHECK-NEXT: fmul.d $fa1, $fa1, $fa3
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_2)
-; CHECK-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI3_2)
-; CHECK-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_3)
-; CHECK-NEXT: fld.d $fa4, $a2, %pc_lo12(.LCPI3_3)
-; CHECK-NEXT: fmul.d $fa1, $fa1, $fa2
-; CHECK-NEXT: fmul.d $fa0, $fa0, $fa1
-; CHECK-NEXT: fmul.d $fa2, $fa1, $fa3
-; CHECK-NEXT: fmul.d $fa1, $fa1, $fa4
-; CHECK-NEXT: fst.d $fa2, $a0, 0
-; CHECK-NEXT: fst.d $fa1, $a1, 0
-; CHECK-NEXT: ret
+; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_order_f64:
+; LA64D-FRECIPE: # %bb.0:
+; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0
+; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0)
+; LA64D-FRECIPE-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI3_0)
+; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_1)
+; LA64D-FRECIPE-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI3_1)
+; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1
+; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1
+; LA64D-FRECIPE-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2
+; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3
+; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4
+; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1
+; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa4, $fa1, $fa2
+; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3
+; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_2)
+; LA64D-FRECIPE-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI3_2)
+; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_3)
+; LA64D-FRECIPE-NEXT: fld.d $fa4, $a2, %pc_lo12(.LCPI3_3)
+; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa2
+; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1
+; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa1, $fa3
+; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4
+; LA64D-FRECIPE-NEXT: fst.d $fa2, $a0, 0
+; LA64D-FRECIPE-NEXT: fst.d $fa1, $a1, 0
+; LA64D-FRECIPE-NEXT: ret
 %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
 %sqrt_fast = fdiv fast double %x, %sqrt
 %r1 = fdiv fast double 42.0, %sqrt
@@ -152,51 +203,50 @@ define double @sqrt_simplify_before_recip_3_uses_order(double %x, ptr %p1, ptr %
 ret double %sqrt_fast
 }
-
-define double @sqrt_simplify_before_recip_4_uses(double %x, ptr %p1, ptr %p2, ptr %p3) nounwind {
-; FAULT-LABEL: sqrt_simplify_before_recip_4_uses:
-; FAULT: # %bb.0:
-; FAULT-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_0)
-; FAULT-NEXT: fld.d $fa2, $a3, %pc_lo12(.LCPI4_0)
-; FAULT-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_1)
-; FAULT-NEXT: fld.d $fa3, $a3, %pc_lo12(.LCPI4_1)
-; FAULT-NEXT: fsqrt.d $fa1, $fa0
-; FAULT-NEXT: frsqrt.d $fa0, $fa0
-; FAULT-NEXT: fdiv.d $fa2, $fa2, $fa1
-; FAULT-NEXT: fdiv.d $fa3, $fa3, $fa1
-; FAULT-NEXT: fst.d $fa0, $a0, 0
-; FAULT-NEXT: fst.d $fa2, $a1, 0
-; FAULT-NEXT: fst.d $fa3, $a2, 0
-; FAULT-NEXT: fmov.d $fa0, $fa1
-; FAULT-NEXT: ret
+define double @sqrt_simplify_before_recip_4_uses_f64(double %x, ptr %p1, ptr %p2, ptr %p3) nounwind {
+; LA64D-LABEL: sqrt_simplify_before_recip_4_uses_f64:
+; LA64D: # %bb.0:
+; LA64D-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_0)
+; LA64D-NEXT: fld.d $fa2, $a3, %pc_lo12(.LCPI4_0)
+; LA64D-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_1)
+; LA64D-NEXT: fld.d $fa3, $a3, %pc_lo12(.LCPI4_1)
+; LA64D-NEXT: fsqrt.d $fa1, $fa0
+; LA64D-NEXT: frsqrt.d $fa0, $fa0
+; LA64D-NEXT: fdiv.d $fa2, $fa2, $fa1
+; LA64D-NEXT: fdiv.d $fa3, $fa3, $fa1
+; LA64D-NEXT: fst.d $fa0, $a0, 0
+; LA64D-NEXT: fst.d $fa2, $a1, 0
+; LA64D-NEXT: fst.d $fa3, $a2, 0
+; LA64D-NEXT: fmov.d $fa0, $fa1
+; LA64D-NEXT: ret
 ;
-; CHECK-LABEL: sqrt_simplify_before_recip_4_uses:
-; CHECK: # %bb.0:
-; CHECK-NEXT: frsqrte.d $fa1, $fa0
-; CHECK-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_0)
-; CHECK-NEXT: fld.d $fa2, $a3, %pc_lo12(.LCPI4_0)
-; CHECK-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_1)
-; CHECK-NEXT: fld.d $fa3, $a3, %pc_lo12(.LCPI4_1)
-; CHECK-NEXT: fmul.d $fa1, $fa0, $fa1
-; CHECK-NEXT: fmul.d $fa4, $fa0, $fa1
-; CHECK-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2
-; CHECK-NEXT: fmul.d $fa1, $fa1, $fa3
-; CHECK-NEXT: fmul.d $fa1, $fa1, $fa4
-; CHECK-NEXT: fmul.d $fa4, $fa0, $fa1
-; CHECK-NEXT: fmadd.d $fa2, $fa4, $fa1, $fa2
-; CHECK-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_2)
-; CHECK-NEXT: fld.d $fa4, $a3, %pc_lo12(.LCPI4_2)
-; CHECK-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_3)
-; CHECK-NEXT: fld.d $fa5, $a3, %pc_lo12(.LCPI4_3)
-; CHECK-NEXT: fmul.d $fa1, $fa1, $fa3
-; CHECK-NEXT: fmul.d $fa1, $fa1, $fa2
-; CHECK-NEXT: fmul.d $fa2, $fa1, $fa4
-; CHECK-NEXT: fmul.d $fa3, $fa1, $fa5
-; CHECK-NEXT: fmul.d $fa0, $fa0, $fa1
-; CHECK-NEXT: fst.d $fa1, $a0, 0
-; CHECK-NEXT: fst.d $fa2, $a1, 0
-; CHECK-NEXT: fst.d $fa3, $a2, 0
-; CHECK-NEXT: ret
+; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_4_uses_f64:
+; LA64D-FRECIPE: # %bb.0:
+; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0
+; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_0)
+; LA64D-FRECIPE-NEXT: fld.d $fa2, $a3, %pc_lo12(.LCPI4_0)
+; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_1)
+; LA64D-FRECIPE-NEXT: fld.d $fa3, $a3, %pc_lo12(.LCPI4_1)
+; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1
+; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1
+; LA64D-FRECIPE-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2
+; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3
+; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4
+; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1
+; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa4, $fa1, $fa2
+; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_2)
+; LA64D-FRECIPE-NEXT: fld.d $fa4, $a3, %pc_lo12(.LCPI4_2)
+; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_3)
+; LA64D-FRECIPE-NEXT: fld.d $fa5, $a3, %pc_lo12(.LCPI4_3)
+; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3
+; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa2
+; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa1, $fa4
+; LA64D-FRECIPE-NEXT: fmul.d $fa3, $fa1, $fa5
+; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1
+; LA64D-FRECIPE-NEXT: fst.d $fa1, $a0, 0
+; LA64D-FRECIPE-NEXT: fst.d $fa2, $a1, 0
+; LA64D-FRECIPE-NEXT: fst.d $fa3, $a2, 0
+; LA64D-FRECIPE-NEXT: ret
 %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
 %rsqrt = fdiv fast double 1.0, %sqrt
 %r1 = fdiv fast double 42.0, %sqrt
@@ -207,3 +257,140 @@ define double @sqrt_simplify_before_recip_4_uses(double %x, ptr %p1, ptr %p2, pt
 store double %r2, ptr %p3, align 8
 ret double %sqrt_fast
 }
+
+define float @sqrt_simplify_before_recip_3_uses_f32(float %x, ptr %p1, ptr %p2) nounwind {
+; LA32F-LABEL: sqrt_simplify_before_recip_3_uses_f32:
+; LA32F: # %bb.0:
+; LA32F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_0)
+; LA32F-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI5_0)
+; LA32F-NEXT: fsqrt.s $fa1, $fa0
+; LA32F-NEXT: frsqrt.s $fa0, $fa0
+; LA32F-NEXT: fdiv.s $fa2, $fa2, $fa1
+; LA32F-NEXT: fst.s $fa0, $a0, 0
+; LA32F-NEXT: fst.s $fa2, $a1, 0
+; LA32F-NEXT: fmov.s $fa0, $fa1
+; LA32F-NEXT: ret
+;
+; LA32F-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_f32:
+; LA32F-FRECIPE: # %bb.0:
+; LA32F-FRECIPE-NEXT: frsqrte.s $fa1, $fa0
+; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1
+; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa0, $fa1
+; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_0)
+; LA32F-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI5_0)
+; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_1)
+; LA32F-FRECIPE-NEXT: fld.s $fa4, $a2, %pc_lo12(.LCPI5_1)
+; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_2)
+; LA32F-FRECIPE-NEXT: fld.s $fa5, $a2, %pc_lo12(.LCPI5_2)
+; LA32F-FRECIPE-NEXT: fmadd.s $fa2, $fa2, $fa1, $fa3
+; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa4
+; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa2
+; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa5
+; LA32F-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1
+; LA32F-FRECIPE-NEXT: fst.s $fa1, $a0, 0
+; LA32F-FRECIPE-NEXT: fst.s $fa2, $a1, 0
+; LA32F-FRECIPE-NEXT: ret
+ %sqrt = tail call fast float @llvm.sqrt.f32(float %x)
+ %rsqrt = fdiv fast float 1.0, %sqrt
+ %r = fdiv fast float 42.0, %sqrt
+ %sqrt_fast = fdiv fast float %x, %sqrt
+ store float %rsqrt, ptr %p1, align 8
+ store float %r, ptr %p2, align 8
+ ret float %sqrt_fast
+}
+
+define float @sqrt_simplify_before_recip_4_uses_f32(float %x, ptr %p1, ptr %p2, ptr %p3) nounwind {
+; LA32F-LABEL: sqrt_simplify_before_recip_4_uses_f32:
+; LA32F: # %bb.0:
+; LA32F-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_0)
+; LA32F-NEXT: fld.s $fa2, $a3, %pc_lo12(.LCPI6_0)
+; LA32F-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_1)
+; LA32F-NEXT: fld.s $fa3, $a3, %pc_lo12(.LCPI6_1)
+; LA32F-NEXT: fsqrt.s $fa1, $fa0
+; LA32F-NEXT: frsqrt.s $fa0, $fa0
+; LA32F-NEXT: fdiv.s $fa2, $fa2, $fa1
+; LA32F-NEXT: fdiv.s $fa3, $fa3, $fa1
+; LA32F-NEXT: fst.s $fa0, $a0, 0
+; LA32F-NEXT: fst.s $fa2, $a1, 0
+; LA32F-NEXT: fst.s $fa3, $a2, 0
+; LA32F-NEXT: fmov.s $fa0, $fa1
+; LA32F-NEXT: ret
+;
+; LA32F-FRECIPE-LABEL: sqrt_simplify_before_recip_4_uses_f32:
+; LA32F-FRECIPE: # %bb.0:
+; LA32F-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_0)
+; LA32F-FRECIPE-NEXT: fld.s $fa1, $a3, %pc_lo12(.LCPI6_0)
+; LA32F-FRECIPE-NEXT: frsqrte.s $fa2, $fa0
+; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa0, $fa2
+; LA32F-FRECIPE-NEXT: fmul.s $fa3, $fa0, $fa2
+; LA32F-FRECIPE-NEXT: fmadd.s $fa1, $fa3, $fa2, $fa1
+; LA32F-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_1)
+; LA32F-FRECIPE-NEXT: fld.s $fa3, $a3, %pc_lo12(.LCPI6_1)
+; LA32F-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_2)
+; LA32F-FRECIPE-NEXT: fld.s $fa4, $a3, %pc_lo12(.LCPI6_2)
+; LA32F-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_3)
+; LA32F-FRECIPE-NEXT: fld.s $fa5, $a3, %pc_lo12(.LCPI6_3)
+; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa2, $fa3
+; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa2, $fa1
+; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa4
+; LA32F-FRECIPE-NEXT: fmul.s $fa3, $fa1, $fa5
+; LA32F-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1
+; LA32F-FRECIPE-NEXT: fst.s $fa1, $a0, 0
+; LA32F-FRECIPE-NEXT: fst.s $fa2, $a1, 0
+; LA32F-FRECIPE-NEXT: fst.s $fa3, $a2, 0
+; LA32F-FRECIPE-NEXT: ret
+ %sqrt = tail call fast float @llvm.sqrt.f32(float %x)
+ %rsqrt = fdiv fast float 1.0, %sqrt
+ %r1 = fdiv fast float 42.0, %sqrt
+ %r2 = fdiv fast float 43.0, %sqrt
+ %sqrt_fast = fdiv fast float %x, %sqrt
+ store float %rsqrt, ptr %p1, align 8
+ store float %r1, ptr %p2, align 8
+ store float %r2, ptr %p3, align 8
+ ret float %sqrt_fast
+}
+
+define float @sqrt_simplify_before_recip_3_uses_order_f32(float %x, ptr %p1, ptr %p2) nounwind {
+; LA32F-LABEL: sqrt_simplify_before_recip_3_uses_order_f32:
+; LA32F: # %bb.0:
+; LA32F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_0)
+; LA32F-NEXT: fld.s $fa1, $a2, %pc_lo12(.LCPI7_0)
+; LA32F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_1)
+; LA32F-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI7_1)
+; LA32F-NEXT: fsqrt.s $fa0, $fa0
+; LA32F-NEXT: fdiv.s $fa1, $fa1, $fa0
+; LA32F-NEXT: fdiv.s $fa2, $fa2, $fa0
+; LA32F-NEXT: fst.s $fa1, $a0, 0
+; LA32F-NEXT: fst.s $fa2, $a1, 0
+; LA32F-NEXT: ret
+;
+; LA32F-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_order_f32:
+; LA32F-FRECIPE: # %bb.0:
+; LA32F-FRECIPE-NEXT: frsqrte.s $fa1, $fa0
+; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_0)
+; LA32F-FRECIPE-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI7_0)
+; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_1)
+; LA32F-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI7_1)
+; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1
+; LA32F-FRECIPE-NEXT: fmul.s $fa4, $fa0, $fa1
+; LA32F-FRECIPE-NEXT: fmadd.s $fa2, $fa4, $fa1, $fa2
+; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa3
+; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_2)
+; LA32F-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI7_2)
+; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_3)
+; LA32F-FRECIPE-NEXT: fld.s $fa4, $a2, %pc_lo12(.LCPI7_3)
+; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa2
+; LA32F-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1
+; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa3
+; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa4
+; LA32F-FRECIPE-NEXT: fst.s $fa2, $a0, 0
+; LA32F-FRECIPE-NEXT: fst.s $fa1, $a1, 0
+; LA32F-FRECIPE-NEXT: ret
+ %sqrt = tail call fast float @llvm.sqrt.f32(float %x)
+ %sqrt_fast = fdiv fast float %x, %sqrt
+ %r1 = fdiv fast float 42.0, %sqrt
+ %r2 = fdiv fast float 43.0, %sqrt
+ store float %r1, ptr %p1, align 8
+ store float %r2, ptr %p2, align 8
+ ret float %sqrt_fast
+}
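
A note on the sequences checked above: with the frecipe feature enabled, the fsqrt/frsqrt + fdiv code is replaced by the hardware estimate (frsqrte.{s/d}) followed by Newton-Raphson refinement, which is what the fmadd/fmul chains and the constant-pool loads (presumably -3.0 and -0.5) implement. The f32 checks show one refinement step and the f64 checks show two. A minimal C++ sketch of that refinement step, with a hypothetical frsqrte_estimate() standing in for the hardware instruction:

#include <cmath>
#include <cstdio>

// Hypothetical stand-in for the frsqrte.{s/d} estimate; any rough initial
// guess of 1/sqrt(x) is enough to illustrate the refinement step.
static double frsqrte_estimate(double x) { return 1.01 / std::sqrt(x); }

// One Newton-Raphson step for 1/sqrt(x) in the two-constant form
//   E' = (E * -0.5) * (x * E * E + -3.0)
// i.e. the fmadd (x*E*E plus a constant) followed by fmul, as in the checks.
static double refine_rsqrt(double x, double e) {
  double xee = x * e * e + -3.0;
  return (e * -0.5) * xee;
}

int main() {
  double x = 2.0;
  double e = frsqrte_estimate(x);
  e = refine_rsqrt(x, e); // one step, as in the f32 checks
  e = refine_rsqrt(x, e); // a second step, as in the f64 checks
  std::printf("refined: %.17g  reference: %.17g\n", e, 1.0 / std::sqrt(x));
  return 0;
}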