Author: tangaac Date: 2024-10-18T09:06:29+08:00 New Revision: e9eec14bb3566f6578950797559de98678f16985
URL: https://github.com/llvm/llvm-project/commit/e9eec14bb3566f6578950797559de98678f16985
DIFF: https://github.com/llvm/llvm-project/commit/e9eec14bb3566f6578950797559de98678f16985.diff

LOG: [LoongArch] [CodeGen] Add options for Clang to generate LoongArch-specific frecipe & frsqrte instructions (#109917)

Two options: `-mfrecipe` & `-mno-frecipe`.
Enable or disable the frecipe.{s/d} and frsqrte.{s/d} instructions.
The default is `-mno-frecipe`.

Added: 
    clang/test/Driver/loongarch-mfrecipe.c
    llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll
    llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll
    llvm/test/CodeGen/LoongArch/lasx/fdiv-reciprocal-estimate.ll
    llvm/test/CodeGen/LoongArch/lasx/fsqrt-reciprocal-estimate.ll
    llvm/test/CodeGen/LoongArch/lsx/fdiv-reciprocal-estimate.ll
    llvm/test/CodeGen/LoongArch/lsx/fsqrt-reciprocal-estimate.ll

Modified: 
    clang/include/clang/Driver/Options.td
    clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
    llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
    llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
    llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
    llvm/lib/Target/LoongArch/LoongArchISelLowering.h
    llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
    llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td

Removed: 
    

################################################################################
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 379e75b197cf96..4eb013d587eb1e 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -5387,6 +5387,10 @@ def mno_lasx : Flag<["-"], "mno-lasx">, Group<m_loongarch_Features_Group>,
 let Flags = [TargetSpecific] in {
 def msimd_EQ : Joined<["-"], "msimd=">, Group<m_loongarch_Features_Group>,
   HelpText<"Select the SIMD extension(s) to be enabled in LoongArch either 'none', 'lsx', 'lasx'.">;
+def mfrecipe : Flag<["-"], "mfrecipe">, Group<m_loongarch_Features_Group>,
+  HelpText<"Enable frecipe.{s/d} and frsqrte.{s/d}">;
+def mno_frecipe : Flag<["-"], "mno-frecipe">, Group<m_loongarch_Features_Group>,
+  HelpText<"Disable frecipe.{s/d} and frsqrte.{s/d}">;
 def mannotate_tablejump : Flag<["-"], "mannotate-tablejump">, Group<m_loongarch_Features_Group>,
   HelpText<"Enable annotate table jump instruction to correlate it with the jump table.">;
 def mno_annotate_tablejump : Flag<["-"], "mno-annotate-tablejump">, Group<m_loongarch_Features_Group>,

diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
index 771adade93813f..355253e4b3b07c 100644
--- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
@@ -251,6 +251,15 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     } else /*-mno-lasx*/
       Features.push_back("-lasx");
   }
+
+  // Select frecipe feature determined by -m[no-]frecipe.
+  if (const Arg *A =
+          Args.getLastArg(options::OPT_mfrecipe, options::OPT_mno_frecipe)) {
+    if (A->getOption().matches(options::OPT_mfrecipe))
+      Features.push_back("+frecipe");
+    else
+      Features.push_back("-frecipe");
+  }
 }
 
 std::string loongarch::postProcessTargetCPUString(const std::string &CPU,

diff --git a/clang/test/Driver/loongarch-mfrecipe.c b/clang/test/Driver/loongarch-mfrecipe.c
new file mode 100644
index 00000000000000..14afd54af0b9df
--- /dev/null
+++ b/clang/test/Driver/loongarch-mfrecipe.c
@@ -0,0 +1,30 @@
+/// Test -m[no]frecipe options.
+
+// RUN: %clang --target=loongarch64 -mfrecipe -fsyntax-only %s -### 2>&1 | \
+// RUN: FileCheck %s --check-prefix=CC1-FRECIPE
+// RUN: %clang --target=loongarch64 -mno-frecipe -fsyntax-only %s -### 2>&1 | \
+// RUN: FileCheck %s --check-prefix=CC1-NO-FRECIPE
+// RUN: %clang --target=loongarch64 -mno-frecipe -mfrecipe -fsyntax-only %s -### 2>&1 | \
+// RUN: FileCheck %s --check-prefix=CC1-FRECIPE
+// RUN: %clang --target=loongarch64 -mfrecipe -mno-frecipe -fsyntax-only %s -### 2>&1 | \
+// RUN: FileCheck %s --check-prefix=CC1-NO-FRECIPE
+
+// RUN: %clang --target=loongarch64 -mfrecipe -S -emit-llvm %s -o - | \
+// RUN: FileCheck %s --check-prefix=IR-FRECIPE
+// RUN: %clang --target=loongarch64 -mno-frecipe -S -emit-llvm %s -o - | \
+// RUN: FileCheck %s --check-prefix=IR-NO-FRECIPE
+// RUN: %clang --target=loongarch64 -mno-frecipe -mfrecipe -S -emit-llvm %s -o - | \
+// RUN: FileCheck %s --check-prefix=IR-FRECIPE
+// RUN: %clang --target=loongarch64 -mfrecipe -mno-frecipe -S -emit-llvm %s -o - | \
+// RUN: FileCheck %s --check-prefix=IR-NO-FRECIPE
+
+
+// CC1-FRECIPE: "-target-feature" "+frecipe"
+// CC1-NO-FRECIPE: "-target-feature" "-frecipe"
+
+// IR-FRECIPE: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+frecipe{{(,.*)?}}"
+// IR-NO-FRECIPE: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}-frecipe{{(,.*)?}}"
+
+int foo(void) {
+  return 42;
+}
\ No newline at end of file
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
index d6a83c0c8cd8fb..65802d660432d9 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
@@ -19,12 +19,16 @@ def SDT_LoongArchMOVGR2FR_W_LA64
 def SDT_LoongArchMOVFR2GR_S_LA64
     : SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, f32>]>;
 def SDT_LoongArchFTINT : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>;
+def SDT_LoongArchFRECIPE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>;
+def SDT_LoongArchFRSQRTE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>;
 
 def loongarch_movgr2fr_w_la64
     : SDNode<"LoongArchISD::MOVGR2FR_W_LA64", SDT_LoongArchMOVGR2FR_W_LA64>;
 def loongarch_movfr2gr_s_la64
     : SDNode<"LoongArchISD::MOVFR2GR_S_LA64", SDT_LoongArchMOVFR2GR_S_LA64>;
 def loongarch_ftint : SDNode<"LoongArchISD::FTINT", SDT_LoongArchFTINT>;
+def loongarch_frecipe : SDNode<"LoongArchISD::FRECIPE", SDT_LoongArchFRECIPE>;
+def loongarch_frsqrte : SDNode<"LoongArchISD::FRSQRTE", SDT_LoongArchFRSQRTE>;
 
 //===----------------------------------------------------------------------===//
 // Instructions
@@ -286,6 +290,8 @@ let Predicates = [HasFrecipe] in {
 // FP approximate reciprocal operation
 def : Pat<(int_loongarch_frecipe_s FPR32:$src), (FRECIPE_S FPR32:$src)>;
 def : Pat<(int_loongarch_frsqrte_s FPR32:$src), (FRSQRTE_S FPR32:$src)>;
+def : Pat<(loongarch_frecipe FPR32:$src), (FRECIPE_S FPR32:$src)>;
+def : Pat<(loongarch_frsqrte FPR32:$src), (FRSQRTE_S FPR32:$src)>;
 }
 
 // fmadd.s: fj * fk + fa
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
index 30cce8439640f1..b98025643903af 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
@@ -253,6 +253,8 @@ let Predicates = [HasFrecipe] in {
 // FP approximate reciprocal operation
 def : Pat<(int_loongarch_frecipe_d FPR64:$src), (FRECIPE_D FPR64:$src)>;
 def : Pat<(int_loongarch_frsqrte_d FPR64:$src), (FRSQRTE_D FPR64:$src)>;
+def : Pat<(loongarch_frecipe FPR64:$src), (FRECIPE_D FPR64:$src)>;
+def : Pat<(loongarch_frsqrte FPR64:$src), (FRSQRTE_D FPR64:$src)>;
 }
 
 // fmadd.d: fj * fk + fa
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index fc5f0fc1bf0db0..676d43ef22c47b 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -4697,6 +4697,8 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
     NODE_NAME_CASE(VANY_ZERO)
     NODE_NAME_CASE(VALL_NONZERO)
     NODE_NAME_CASE(VANY_NONZERO)
+    NODE_NAME_CASE(FRECIPE)
+    NODE_NAME_CASE(FRSQRTE)
   }
 #undef NODE_NAME_CASE
   return nullptr;
@@ -5900,6 +5902,71 @@ Register LoongArchTargetLowering::getExceptionSelectorRegister(
   return LoongArch::R5;
 }
 
+//===----------------------------------------------------------------------===//
+// Target Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+static int getEstimateRefinementSteps(EVT VT,
+                                      const LoongArchSubtarget &Subtarget) {
+  // Feature FRECIPE instructions' relative accuracy is 2^-14.
+  // IEEE float has 23 significand bits and double has 52.
+  int RefinementSteps = VT.getScalarType() == MVT::f64 ? 2 : 1;
+  return RefinementSteps;
+}
+
+SDValue LoongArchTargetLowering::getSqrtEstimate(SDValue Operand,
+                                                 SelectionDAG &DAG, int Enabled,
+                                                 int &RefinementSteps,
+                                                 bool &UseOneConstNR,
+                                                 bool Reciprocal) const {
+  if (Subtarget.hasFrecipe()) {
+    SDLoc DL(Operand);
+    EVT VT = Operand.getValueType();
+
+    if (VT == MVT::f32 || (VT == MVT::f64 && Subtarget.hasBasicD()) ||
+        (VT == MVT::v4f32 && Subtarget.hasExtLSX()) ||
+        (VT == MVT::v2f64 && Subtarget.hasExtLSX()) ||
+        (VT == MVT::v8f32 && Subtarget.hasExtLASX()) ||
+        (VT == MVT::v4f64 && Subtarget.hasExtLASX())) {
+
+      if (RefinementSteps == ReciprocalEstimate::Unspecified)
+        RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
+
+      SDValue Estimate = DAG.getNode(LoongArchISD::FRSQRTE, DL, VT, Operand);
+      if (Reciprocal)
+        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate);
+
+      return Estimate;
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue LoongArchTargetLowering::getRecipEstimate(SDValue Operand,
+                                                  SelectionDAG &DAG,
+                                                  int Enabled,
+                                                  int &RefinementSteps) const {
+  if (Subtarget.hasFrecipe()) {
+    SDLoc DL(Operand);
+    EVT VT = Operand.getValueType();
+
+    if (VT == MVT::f32 || (VT == MVT::f64 && Subtarget.hasBasicD()) ||
+        (VT == MVT::v4f32 && Subtarget.hasExtLSX()) ||
+        (VT == MVT::v2f64 && Subtarget.hasExtLSX()) ||
+        (VT == MVT::v8f32 && Subtarget.hasExtLASX()) ||
+        (VT == MVT::v4f64 && Subtarget.hasExtLASX())) {
+
+      if (RefinementSteps == ReciprocalEstimate::Unspecified)
+        RefinementSteps = getEstimateRefinementSteps(VT, Subtarget);
+
+      return DAG.getNode(LoongArchISD::FRECIPE, DL, VT, Operand);
+    }
+  }
+
+  return SDValue();
+}
+
 //===----------------------------------------------------------------------===//
 // LoongArch Inline Assembly Support
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
index 6177884bd19501..df6a55a2b83190 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h
@@ -141,6 +141,10 @@ enum NodeType : unsigned {
   VALL_NONZERO,
   VANY_NONZERO,
 
+  // Floating point approximate reciprocal operation
+  FRECIPE,
+  FRSQRTE
+
   // Intrinsic operations end =============================================
 };
 
 } // end namespace LoongArchISD
@@ -216,6 +220,17 @@ class LoongArchTargetLowering : public TargetLowering {
   Register
   getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
 
+  bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override {
+    return true;
+  }
+
+  SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+                          int &RefinementSteps, bool &UseOneConstNR,
+                          bool Reciprocal) const override;
+
+  SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+                           int &RefinementSteps) const override;
+
   ISD::NodeType getExtendForAtomicOps() const override {
     return ISD::SIGN_EXTEND;
   }
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index dd7e5713e45fe9..d13cc9af135b57 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -10,6 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+// Target nodes.
 def loongarch_xvpermi: SDNode<"LoongArchISD::XVPERMI", SDT_LoongArchV1RUimm>;
 
 def lasxsplati8
@@ -2094,6 +2095,15 @@ foreach Inst = ["XVFRECIPE_S", "XVFRSQRTE_S"] in
 foreach Inst = ["XVFRECIPE_D", "XVFRSQRTE_D"] in
   def : Pat<(deriveLASXIntrinsic<Inst>.ret (v4f64 LASX256:$xj)),
             (!cast<LAInst>(Inst) LASX256:$xj)>;
+
+def : Pat<(loongarch_vfrecipe v8f32:$src),
+          (XVFRECIPE_S v8f32:$src)>;
+def : Pat<(loongarch_vfrecipe v4f64:$src),
+          (XVFRECIPE_D v4f64:$src)>;
+def : Pat<(loongarch_vfrsqrte v8f32:$src),
+          (XVFRSQRTE_S v8f32:$src)>;
+def : Pat<(loongarch_vfrsqrte v4f64:$src),
+          (XVFRSQRTE_D v4f64:$src)>;
 }
 
 def : Pat<(int_loongarch_lasx_xvpickve_w_f v8f32:$xj, timm:$imm),
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index e7ac9f3bd04cbf..86aa6dcfd8261f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -23,6 +23,8 @@ def SDT_LoongArchV2R : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
                                             SDTCisSameAs<1, 2>]>;
 def SDT_LoongArchV1RUimm: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                                SDTCisVT<2, i64>]>;
+def SDT_LoongArchVFRECIPE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
+def SDT_LoongArchVFRSQRTE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
 
 // Target nodes.
def loongarch_vreplve : SDNode<"LoongArchISD::VREPLVE", SDT_LoongArchVreplve>; @@ -50,6 +52,8 @@ def loongarch_vilvh: SDNode<"LoongArchISD::VILVH", SDT_LoongArchV2R>; def loongarch_vshuf4i: SDNode<"LoongArchISD::VSHUF4I", SDT_LoongArchV1RUimm>; def loongarch_vreplvei: SDNode<"LoongArchISD::VREPLVEI", SDT_LoongArchV1RUimm>; +def loongarch_vfrecipe: SDNode<"LoongArchISD::FRECIPE", SDT_LoongArchVFRECIPE>; +def loongarch_vfrsqrte: SDNode<"LoongArchISD::FRSQRTE", SDT_LoongArchVFRSQRTE>; def immZExt1 : ImmLeaf<i64, [{return isUInt<1>(Imm);}]>; def immZExt2 : ImmLeaf<i64, [{return isUInt<2>(Imm);}]>; @@ -2238,6 +2242,15 @@ foreach Inst = ["VFRECIPE_S", "VFRSQRTE_S"] in foreach Inst = ["VFRECIPE_D", "VFRSQRTE_D"] in def : Pat<(deriveLSXIntrinsic<Inst>.ret (v2f64 LSX128:$vj)), (!cast<LAInst>(Inst) LSX128:$vj)>; + +def : Pat<(loongarch_vfrecipe v4f32:$src), + (VFRECIPE_S v4f32:$src)>; +def : Pat<(loongarch_vfrecipe v2f64:$src), + (VFRECIPE_D v2f64:$src)>; +def : Pat<(loongarch_vfrsqrte v4f32:$src), + (VFRSQRTE_S v4f32:$src)>; +def : Pat<(loongarch_vfrsqrte v2f64:$src), + (VFRSQRTE_D v2f64:$src)>; } // load diff --git a/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll new file mode 100644 index 00000000000000..3f38bbed881a32 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll @@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+f,-d,-frecipe < %s | FileCheck %s --check-prefix=LA32F +; RUN: llc --mtriple=loongarch32 --mattr=+f,-d,+frecipe < %s | FileCheck %s --check-prefix=LA32F-FRECIPE +; RUN: llc --mtriple=loongarch64 --mattr=+d,-frecipe < %s | FileCheck %s --check-prefix=LA64D +; RUN: llc --mtriple=loongarch64 --mattr=+d,+frecipe < %s | FileCheck %s --check-prefix=LA64D-FRECIPE + +;; Exercise the 'fdiv' LLVM IR: https://llvm.org/docs/LangRef.html#fdiv-instruction + +define float @fdiv_s(float %x, float %y) { +; LA32F-LABEL: fdiv_s: +; LA32F: # %bb.0: +; LA32F-NEXT: fdiv.s $fa0, $fa0, $fa1 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: fdiv_s: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: frecipe.s $fa2, $fa1 +; LA32F-FRECIPE-NEXT: fmul.s $fa3, $fa0, $fa2 +; LA32F-FRECIPE-NEXT: fnmsub.s $fa0, $fa1, $fa3, $fa0 +; LA32F-FRECIPE-NEXT: fmadd.s $fa0, $fa2, $fa0, $fa3 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: fdiv_s: +; LA64D: # %bb.0: +; LA64D-NEXT: fdiv.s $fa0, $fa0, $fa1 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: fdiv_s: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frecipe.s $fa2, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa3, $fa0, $fa2 +; LA64D-FRECIPE-NEXT: fnmsub.s $fa0, $fa1, $fa3, $fa0 +; LA64D-FRECIPE-NEXT: fmadd.s $fa0, $fa2, $fa0, $fa3 +; LA64D-FRECIPE-NEXT: ret + %div = fdiv fast float %x, %y + ret float %div +} + +define double @fdiv_d(double %x, double %y) { +; LA32F-LABEL: fdiv_d: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -16 +; LA32F-NEXT: .cfi_def_cfa_offset 16 +; LA32F-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-NEXT: .cfi_offset 1, -4 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-NEXT: addi.w $sp, $sp, 16 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: fdiv_d: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, -16 +; LA32F-FRECIPE-NEXT: .cfi_def_cfa_offset 16 +; LA32F-FRECIPE-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: .cfi_offset 1, -4 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; 
LA32F-FRECIPE-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, 16 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: fdiv_d: +; LA64D: # %bb.0: +; LA64D-NEXT: fdiv.d $fa0, $fa0, $fa1 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: fdiv_d: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) +; LA64D-FRECIPE-NEXT: fld.d $fa2, $a0, %pc_lo12(.LCPI1_0) +; LA64D-FRECIPE-NEXT: frecipe.d $fa3, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa1, $fa3, $fa2 +; LA64D-FRECIPE-NEXT: fnmsub.d $fa2, $fa2, $fa3, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa3, $fa0, $fa2 +; LA64D-FRECIPE-NEXT: fnmsub.d $fa0, $fa1, $fa3, $fa0 +; LA64D-FRECIPE-NEXT: fmadd.d $fa0, $fa2, $fa0, $fa3 +; LA64D-FRECIPE-NEXT: ret + %div = fdiv fast double %x, %y + ret double %div +} diff --git a/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll new file mode 100644 index 00000000000000..388ae6321f664a --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll @@ -0,0 +1,797 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+f,-d,-frecipe < %s | FileCheck %s --check-prefix=LA32F +; RUN: llc --mtriple=loongarch32 --mattr=+f,-d,+frecipe < %s | FileCheck %s --check-prefix=LA32F-FRECIPE +; RUN: llc --mtriple=loongarch64 --mattr=+d,-frecipe < %s | FileCheck %s --check-prefix=LA64D +; RUN: llc --mtriple=loongarch64 --mattr=+d,+frecipe < %s | FileCheck %s --check-prefix=LA64D-FRECIPE + + +declare float @llvm.sqrt.f32(float) +declare double @llvm.sqrt.f64(double) + +define float @frsqrt_f32(float %a) nounwind { +; LA32F-LABEL: frsqrt_f32: +; LA32F: # %bb.0: +; LA32F-NEXT: frsqrt.s $fa0, $fa0 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: frsqrt_f32: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA32F-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) +; LA32F-FRECIPE-NEXT: fld.s $fa2, $a0, %pc_lo12(.LCPI0_0) +; LA32F-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1) +; LA32F-FRECIPE-NEXT: fld.s $fa3, $a0, %pc_lo12(.LCPI0_1) +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa3 +; LA32F-FRECIPE-NEXT: fmul.s $fa0, $fa1, $fa0 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: frsqrt_f32: +; LA64D: # %bb.0: +; LA64D-NEXT: frsqrt.s $fa0, $fa0 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: frsqrt_f32: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) +; LA64D-FRECIPE-NEXT: fld.s $fa2, $a0, %pc_lo12(.LCPI0_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1) +; LA64D-FRECIPE-NEXT: fld.s $fa3, $a0, %pc_lo12(.LCPI0_1) +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa1, $fa0 +; LA64D-FRECIPE-NEXT: ret + + %1 = call fast float @llvm.sqrt.f32(float %a) + %2 = fdiv fast float 1.0, %1 + ret float %2 +} + +define double @frsqrt_f64(double %a) nounwind { +; LA32F-LABEL: frsqrt_f64: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -16 +; LA32F-NEXT: st.w $ra, $sp, 12 +; LA32F-NEXT: bl %plt(sqrt) +; LA32F-NEXT: move $a2, $a0 +; LA32F-NEXT: move $a3, $a1 +; LA32F-NEXT: lu12i.w $a1, 261888 +; LA32F-NEXT: move $a0, $zero +; 
LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: ld.w $ra, $sp, 12 +; LA32F-NEXT: addi.w $sp, $sp, 16 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: frsqrt_f64: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, -16 +; LA32F-FRECIPE-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: bl %plt(sqrt) +; LA32F-FRECIPE-NEXT: move $a2, $a0 +; LA32F-FRECIPE-NEXT: move $a3, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 261888 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, 16 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: frsqrt_f64: +; LA64D: # %bb.0: +; LA64D-NEXT: frsqrt.d $fa0, $fa0 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: frsqrt_f64: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0 +; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) +; LA64D-FRECIPE-NEXT: fld.d $fa2, $a0, %pc_lo12(.LCPI1_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_1) +; LA64D-FRECIPE-NEXT: fld.d $fa3, $a0, %pc_lo12(.LCPI1_1) +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa0, $fa0, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa1, $fa0 +; LA64D-FRECIPE-NEXT: ret + %1 = call fast double @llvm.sqrt.f64(double %a) + %2 = fdiv fast double 1.0, %1 + ret double %2 +} + +define double @sqrt_simplify_before_recip_3_uses_f64(double %x, ptr %p1, ptr %p2) nounwind { +; LA32F-LABEL: sqrt_simplify_before_recip_3_uses_f64: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -32 +; LA32F-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; LA32F-NEXT: st.w $fp, $sp, 24 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s0, $sp, 20 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s1, $sp, 16 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s2, $sp, 12 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s3, $sp, 8 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s4, $sp, 4 # 4-byte Folded Spill +; LA32F-NEXT: move $fp, $a3 +; LA32F-NEXT: move $s0, $a2 +; LA32F-NEXT: bl %plt(sqrt) +; LA32F-NEXT: move $s1, $a0 +; LA32F-NEXT: move $s2, $a1 +; LA32F-NEXT: lu12i.w $a1, 261888 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: move $a2, $s1 +; LA32F-NEXT: move $a3, $s2 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: move $s3, $a0 +; LA32F-NEXT: move $s4, $a1 +; LA32F-NEXT: lu12i.w $a1, 263248 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: move $a2, $s1 +; LA32F-NEXT: move $a3, $s2 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: st.w $s3, $s0, 0 +; LA32F-NEXT: st.w $s4, $s0, 4 +; LA32F-NEXT: st.w $a0, $fp, 0 +; LA32F-NEXT: st.w $a1, $fp, 4 +; LA32F-NEXT: move $a0, $s1 +; LA32F-NEXT: move $a1, $s2 +; LA32F-NEXT: ld.w $s4, $sp, 4 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s3, $sp, 8 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s2, $sp, 12 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s1, $sp, 16 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s0, $sp, 20 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $fp, $sp, 24 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; LA32F-NEXT: addi.w $sp, $sp, 32 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_f64: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, -32 +; 
LA32F-FRECIPE-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $fp, $sp, 24 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s0, $sp, 20 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s1, $sp, 16 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s2, $sp, 12 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s3, $sp, 8 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s4, $sp, 4 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: move $fp, $a3 +; LA32F-FRECIPE-NEXT: move $s0, $a2 +; LA32F-FRECIPE-NEXT: bl %plt(sqrt) +; LA32F-FRECIPE-NEXT: move $s1, $a0 +; LA32F-FRECIPE-NEXT: move $s2, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 261888 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: move $a2, $s1 +; LA32F-FRECIPE-NEXT: move $a3, $s2 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: move $s3, $a0 +; LA32F-FRECIPE-NEXT: move $s4, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 263248 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: move $a2, $s1 +; LA32F-FRECIPE-NEXT: move $a3, $s2 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: st.w $s3, $s0, 0 +; LA32F-FRECIPE-NEXT: st.w $s4, $s0, 4 +; LA32F-FRECIPE-NEXT: st.w $a0, $fp, 0 +; LA32F-FRECIPE-NEXT: st.w $a1, $fp, 4 +; LA32F-FRECIPE-NEXT: move $a0, $s1 +; LA32F-FRECIPE-NEXT: move $a1, $s2 +; LA32F-FRECIPE-NEXT: ld.w $s4, $sp, 4 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s3, $sp, 8 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s2, $sp, 12 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s1, $sp, 16 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s0, $sp, 20 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $fp, $sp, 24 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, 32 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: sqrt_simplify_before_recip_3_uses_f64: +; LA64D: # %bb.0: +; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_0) +; LA64D-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI2_0) +; LA64D-NEXT: fsqrt.d $fa1, $fa0 +; LA64D-NEXT: frsqrt.d $fa0, $fa0 +; LA64D-NEXT: fdiv.d $fa2, $fa2, $fa1 +; LA64D-NEXT: fst.d $fa0, $a0, 0 +; LA64D-NEXT: fst.d $fa2, $a1, 0 +; LA64D-NEXT: fmov.d $fa0, $fa1 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_f64: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_0) +; LA64D-FRECIPE-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI2_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_1) +; LA64D-FRECIPE-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI2_1) +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_2) +; LA64D-FRECIPE-NEXT: fld.d $fa5, $a2, %pc_lo12(.LCPI2_2) +; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa1, $fa5 +; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fst.d $fa1, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.d $fa2, $a1, 0 +; LA64D-FRECIPE-NEXT: ret + %sqrt = tail call fast double @llvm.sqrt.f64(double %x) + %rsqrt = fdiv fast double 1.0, %sqrt + %r = fdiv fast double 42.0, %sqrt + %sqrt_fast = fdiv fast double %x, %sqrt + store 
double %rsqrt, ptr %p1, align 8 + store double %r, ptr %p2, align 8 + ret double %sqrt_fast +} + + +define double @sqrt_simplify_before_recip_3_uses_order_f64(double %x, ptr %p1, ptr %p2) nounwind { +; LA32F-LABEL: sqrt_simplify_before_recip_3_uses_order_f64: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -32 +; LA32F-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; LA32F-NEXT: st.w $fp, $sp, 24 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s0, $sp, 20 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s1, $sp, 16 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s2, $sp, 12 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s3, $sp, 8 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s4, $sp, 4 # 4-byte Folded Spill +; LA32F-NEXT: move $fp, $a3 +; LA32F-NEXT: move $s0, $a2 +; LA32F-NEXT: bl %plt(sqrt) +; LA32F-NEXT: move $s1, $a0 +; LA32F-NEXT: move $s2, $a1 +; LA32F-NEXT: lu12i.w $a1, 263248 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: move $a2, $s1 +; LA32F-NEXT: move $a3, $s2 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: move $s3, $a0 +; LA32F-NEXT: move $s4, $a1 +; LA32F-NEXT: lu12i.w $a1, 263256 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: move $a2, $s1 +; LA32F-NEXT: move $a3, $s2 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: st.w $s3, $s0, 0 +; LA32F-NEXT: st.w $s4, $s0, 4 +; LA32F-NEXT: st.w $a0, $fp, 0 +; LA32F-NEXT: st.w $a1, $fp, 4 +; LA32F-NEXT: move $a0, $s1 +; LA32F-NEXT: move $a1, $s2 +; LA32F-NEXT: ld.w $s4, $sp, 4 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s3, $sp, 8 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s2, $sp, 12 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s1, $sp, 16 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s0, $sp, 20 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $fp, $sp, 24 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; LA32F-NEXT: addi.w $sp, $sp, 32 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_order_f64: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, -32 +; LA32F-FRECIPE-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $fp, $sp, 24 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s0, $sp, 20 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s1, $sp, 16 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s2, $sp, 12 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s3, $sp, 8 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s4, $sp, 4 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: move $fp, $a3 +; LA32F-FRECIPE-NEXT: move $s0, $a2 +; LA32F-FRECIPE-NEXT: bl %plt(sqrt) +; LA32F-FRECIPE-NEXT: move $s1, $a0 +; LA32F-FRECIPE-NEXT: move $s2, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 263248 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: move $a2, $s1 +; LA32F-FRECIPE-NEXT: move $a3, $s2 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: move $s3, $a0 +; LA32F-FRECIPE-NEXT: move $s4, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 263256 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: move $a2, $s1 +; LA32F-FRECIPE-NEXT: move $a3, $s2 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: st.w $s3, $s0, 0 +; LA32F-FRECIPE-NEXT: st.w $s4, $s0, 4 +; LA32F-FRECIPE-NEXT: st.w $a0, $fp, 0 +; LA32F-FRECIPE-NEXT: st.w $a1, $fp, 4 +; LA32F-FRECIPE-NEXT: move $a0, $s1 +; LA32F-FRECIPE-NEXT: move $a1, $s2 +; LA32F-FRECIPE-NEXT: ld.w $s4, $sp, 4 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s3, $sp, 8 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s2, $sp, 12 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s1, $sp, 16 # 4-byte 
Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s0, $sp, 20 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $fp, $sp, 24 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, 32 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: sqrt_simplify_before_recip_3_uses_order_f64: +; LA64D: # %bb.0: +; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0) +; LA64D-NEXT: fld.d $fa1, $a2, %pc_lo12(.LCPI3_0) +; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_1) +; LA64D-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI3_1) +; LA64D-NEXT: fsqrt.d $fa0, $fa0 +; LA64D-NEXT: fdiv.d $fa1, $fa1, $fa0 +; LA64D-NEXT: fdiv.d $fa2, $fa2, $fa0 +; LA64D-NEXT: fst.d $fa1, $a0, 0 +; LA64D-NEXT: fst.d $fa2, $a1, 0 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_order_f64: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0) +; LA64D-FRECIPE-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI3_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_1) +; LA64D-FRECIPE-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI3_1) +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_2) +; LA64D-FRECIPE-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI3_2) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_3) +; LA64D-FRECIPE-NEXT: fld.d $fa4, $a2, %pc_lo12(.LCPI3_3) +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fst.d $fa2, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.d $fa1, $a1, 0 +; LA64D-FRECIPE-NEXT: ret + %sqrt = tail call fast double @llvm.sqrt.f64(double %x) + %sqrt_fast = fdiv fast double %x, %sqrt + %r1 = fdiv fast double 42.0, %sqrt + %r2 = fdiv fast double 43.0, %sqrt + store double %r1, ptr %p1, align 8 + store double %r2, ptr %p2, align 8 + ret double %sqrt_fast +} + +define double @sqrt_simplify_before_recip_4_uses_f64(double %x, ptr %p1, ptr %p2, ptr %p3) nounwind { +; LA32F-LABEL: sqrt_simplify_before_recip_4_uses_f64: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -48 +; LA32F-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32F-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s5, $sp, 16 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s6, $sp, 12 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s7, $sp, 8 # 4-byte Folded Spill +; LA32F-NEXT: move $fp, $a4 +; LA32F-NEXT: move $s0, $a3 +; LA32F-NEXT: move $s1, $a2 +; LA32F-NEXT: bl %plt(sqrt) +; LA32F-NEXT: move $s2, $a0 +; LA32F-NEXT: move $s3, $a1 +; LA32F-NEXT: lu12i.w $a1, 261888 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: move $a2, $s2 +; LA32F-NEXT: move $a3, $s3 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: move $s4, $a0 +; LA32F-NEXT: move $s5, $a1 +; LA32F-NEXT: lu12i.w $a1, 263248 +; LA32F-NEXT: move $a0, $zero +; 
LA32F-NEXT: move $a2, $s2 +; LA32F-NEXT: move $a3, $s3 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: move $s6, $a0 +; LA32F-NEXT: move $s7, $a1 +; LA32F-NEXT: lu12i.w $a1, 263256 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: move $a2, $s2 +; LA32F-NEXT: move $a3, $s3 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: st.w $s4, $s1, 0 +; LA32F-NEXT: st.w $s5, $s1, 4 +; LA32F-NEXT: st.w $s6, $s0, 0 +; LA32F-NEXT: st.w $s7, $s0, 4 +; LA32F-NEXT: st.w $a0, $fp, 0 +; LA32F-NEXT: st.w $a1, $fp, 4 +; LA32F-NEXT: move $a0, $s2 +; LA32F-NEXT: move $a1, $s3 +; LA32F-NEXT: ld.w $s7, $sp, 8 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s6, $sp, 12 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s5, $sp, 16 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32F-NEXT: addi.w $sp, $sp, 48 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: sqrt_simplify_before_recip_4_uses_f64: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, -48 +; LA32F-FRECIPE-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s5, $sp, 16 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s6, $sp, 12 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s7, $sp, 8 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: move $fp, $a4 +; LA32F-FRECIPE-NEXT: move $s0, $a3 +; LA32F-FRECIPE-NEXT: move $s1, $a2 +; LA32F-FRECIPE-NEXT: bl %plt(sqrt) +; LA32F-FRECIPE-NEXT: move $s2, $a0 +; LA32F-FRECIPE-NEXT: move $s3, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 261888 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: move $a2, $s2 +; LA32F-FRECIPE-NEXT: move $a3, $s3 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: move $s4, $a0 +; LA32F-FRECIPE-NEXT: move $s5, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 263248 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: move $a2, $s2 +; LA32F-FRECIPE-NEXT: move $a3, $s3 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: move $s6, $a0 +; LA32F-FRECIPE-NEXT: move $s7, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 263256 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: move $a2, $s2 +; LA32F-FRECIPE-NEXT: move $a3, $s3 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: st.w $s4, $s1, 0 +; LA32F-FRECIPE-NEXT: st.w $s5, $s1, 4 +; LA32F-FRECIPE-NEXT: st.w $s6, $s0, 0 +; LA32F-FRECIPE-NEXT: st.w $s7, $s0, 4 +; LA32F-FRECIPE-NEXT: st.w $a0, $fp, 0 +; LA32F-FRECIPE-NEXT: st.w $a1, $fp, 4 +; LA32F-FRECIPE-NEXT: move $a0, $s2 +; LA32F-FRECIPE-NEXT: move $a1, $s3 +; LA32F-FRECIPE-NEXT: ld.w $s7, $sp, 8 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s6, $sp, 12 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s5, $sp, 16 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w 
$s2, $sp, 28 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, 48 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: sqrt_simplify_before_recip_4_uses_f64: +; LA64D: # %bb.0: +; LA64D-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_0) +; LA64D-NEXT: fld.d $fa2, $a3, %pc_lo12(.LCPI4_0) +; LA64D-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_1) +; LA64D-NEXT: fld.d $fa3, $a3, %pc_lo12(.LCPI4_1) +; LA64D-NEXT: fsqrt.d $fa1, $fa0 +; LA64D-NEXT: frsqrt.d $fa0, $fa0 +; LA64D-NEXT: fdiv.d $fa2, $fa2, $fa1 +; LA64D-NEXT: fdiv.d $fa3, $fa3, $fa1 +; LA64D-NEXT: fst.d $fa0, $a0, 0 +; LA64D-NEXT: fst.d $fa2, $a1, 0 +; LA64D-NEXT: fst.d $fa3, $a2, 0 +; LA64D-NEXT: fmov.d $fa0, $fa1 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_4_uses_f64: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0 +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_0) +; LA64D-FRECIPE-NEXT: fld.d $fa2, $a3, %pc_lo12(.LCPI4_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_1) +; LA64D-FRECIPE-NEXT: fld.d $fa3, $a3, %pc_lo12(.LCPI4_1) +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_2) +; LA64D-FRECIPE-NEXT: fld.d $fa4, $a3, %pc_lo12(.LCPI4_2) +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_3) +; LA64D-FRECIPE-NEXT: fld.d $fa5, $a3, %pc_lo12(.LCPI4_3) +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.d $fa3, $fa1, $fa5 +; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fst.d $fa1, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.d $fa2, $a1, 0 +; LA64D-FRECIPE-NEXT: fst.d $fa3, $a2, 0 +; LA64D-FRECIPE-NEXT: ret + %sqrt = tail call fast double @llvm.sqrt.f64(double %x) + %rsqrt = fdiv fast double 1.0, %sqrt + %r1 = fdiv fast double 42.0, %sqrt + %r2 = fdiv fast double 43.0, %sqrt + %sqrt_fast = fdiv fast double %x, %sqrt + store double %rsqrt, ptr %p1, align 8 + store double %r1, ptr %p2, align 8 + store double %r2, ptr %p3, align 8 + ret double %sqrt_fast +} + +define float @sqrt_simplify_before_recip_3_uses_f32(float %x, ptr %p1, ptr %p2) nounwind { +; LA32F-LABEL: sqrt_simplify_before_recip_3_uses_f32: +; LA32F: # %bb.0: +; LA32F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_0) +; LA32F-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI5_0) +; LA32F-NEXT: fsqrt.s $fa1, $fa0 +; LA32F-NEXT: frsqrt.s $fa0, $fa0 +; LA32F-NEXT: fdiv.s $fa2, $fa2, $fa1 +; LA32F-NEXT: fst.s $fa0, $a0, 0 +; LA32F-NEXT: fst.s $fa2, $a1, 0 +; LA32F-NEXT: fmov.s $fa0, $fa1 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_f32: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_0) +; LA32F-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI5_0) +; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_1) +; LA32F-FRECIPE-NEXT: fld.s 
$fa4, $a2, %pc_lo12(.LCPI5_1) +; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_2) +; LA32F-FRECIPE-NEXT: fld.s $fa5, $a2, %pc_lo12(.LCPI5_2) +; LA32F-FRECIPE-NEXT: fmadd.s $fa2, $fa2, $fa1, $fa3 +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa4 +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa2 +; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa5 +; LA32F-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fst.s $fa1, $a0, 0 +; LA32F-FRECIPE-NEXT: fst.s $fa2, $a1, 0 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: sqrt_simplify_before_recip_3_uses_f32: +; LA64D: # %bb.0: +; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_0) +; LA64D-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI5_0) +; LA64D-NEXT: fsqrt.s $fa1, $fa0 +; LA64D-NEXT: frsqrt.s $fa0, $fa0 +; LA64D-NEXT: fdiv.s $fa2, $fa2, $fa1 +; LA64D-NEXT: fst.s $fa0, $a0, 0 +; LA64D-NEXT: fst.s $fa2, $a1, 0 +; LA64D-NEXT: fmov.s $fa0, $fa1 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_f32: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_0) +; LA64D-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI5_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_1) +; LA64D-FRECIPE-NEXT: fld.s $fa4, $a2, %pc_lo12(.LCPI5_1) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_2) +; LA64D-FRECIPE-NEXT: fld.s $fa5, $a2, %pc_lo12(.LCPI5_2) +; LA64D-FRECIPE-NEXT: fmadd.s $fa2, $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa5 +; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fst.s $fa1, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.s $fa2, $a1, 0 +; LA64D-FRECIPE-NEXT: ret +; + %sqrt = tail call fast float @llvm.sqrt.f32(float %x) + %rsqrt = fdiv fast float 1.0, %sqrt + %r = fdiv fast float 42.0, %sqrt + %sqrt_fast = fdiv fast float %x, %sqrt + store float %rsqrt, ptr %p1, align 8 + store float %r, ptr %p2, align 8 + ret float %sqrt_fast +} + +define float @sqrt_simplify_before_recip_4_uses_f32(float %x, ptr %p1, ptr %p2, ptr %p3) nounwind { +; LA32F-LABEL: sqrt_simplify_before_recip_4_uses_f32: +; LA32F: # %bb.0: +; LA32F-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_0) +; LA32F-NEXT: fld.s $fa2, $a3, %pc_lo12(.LCPI6_0) +; LA32F-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_1) +; LA32F-NEXT: fld.s $fa3, $a3, %pc_lo12(.LCPI6_1) +; LA32F-NEXT: fsqrt.s $fa1, $fa0 +; LA32F-NEXT: frsqrt.s $fa0, $fa0 +; LA32F-NEXT: fdiv.s $fa2, $fa2, $fa1 +; LA32F-NEXT: fdiv.s $fa3, $fa3, $fa1 +; LA32F-NEXT: fst.s $fa0, $a0, 0 +; LA32F-NEXT: fst.s $fa2, $a1, 0 +; LA32F-NEXT: fst.s $fa3, $a2, 0 +; LA32F-NEXT: fmov.s $fa0, $fa1 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: sqrt_simplify_before_recip_4_uses_f32: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_0) +; LA32F-FRECIPE-NEXT: fld.s $fa1, $a3, %pc_lo12(.LCPI6_0) +; LA32F-FRECIPE-NEXT: frsqrte.s $fa2, $fa0 +; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa0, $fa2 +; LA32F-FRECIPE-NEXT: fmul.s $fa3, $fa0, $fa2 +; LA32F-FRECIPE-NEXT: fmadd.s $fa1, $fa3, $fa2, $fa1 +; LA32F-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_1) +; LA32F-FRECIPE-NEXT: fld.s $fa3, $a3, %pc_lo12(.LCPI6_1) +; LA32F-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_2) +; LA32F-FRECIPE-NEXT: fld.s $fa4, $a3, %pc_lo12(.LCPI6_2) +; LA32F-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_3) +; LA32F-FRECIPE-NEXT: fld.s $fa5, $a3, %pc_lo12(.LCPI6_3) +; 
LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa2, $fa3 +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa2, $fa1 +; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa4 +; LA32F-FRECIPE-NEXT: fmul.s $fa3, $fa1, $fa5 +; LA32F-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fst.s $fa1, $a0, 0 +; LA32F-FRECIPE-NEXT: fst.s $fa2, $a1, 0 +; LA32F-FRECIPE-NEXT: fst.s $fa3, $a2, 0 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: sqrt_simplify_before_recip_4_uses_f32: +; LA64D: # %bb.0: +; LA64D-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_0) +; LA64D-NEXT: fld.s $fa2, $a3, %pc_lo12(.LCPI6_0) +; LA64D-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_1) +; LA64D-NEXT: fld.s $fa3, $a3, %pc_lo12(.LCPI6_1) +; LA64D-NEXT: fsqrt.s $fa1, $fa0 +; LA64D-NEXT: frsqrt.s $fa0, $fa0 +; LA64D-NEXT: fdiv.s $fa2, $fa2, $fa1 +; LA64D-NEXT: fdiv.s $fa3, $fa3, $fa1 +; LA64D-NEXT: fst.s $fa0, $a0, 0 +; LA64D-NEXT: fst.s $fa2, $a1, 0 +; LA64D-NEXT: fst.s $fa3, $a2, 0 +; LA64D-NEXT: fmov.s $fa0, $fa1 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_4_uses_f32: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_0) +; LA64D-FRECIPE-NEXT: fld.s $fa1, $a3, %pc_lo12(.LCPI6_0) +; LA64D-FRECIPE-NEXT: frsqrte.s $fa2, $fa0 +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa0, $fa2 +; LA64D-FRECIPE-NEXT: fmul.s $fa3, $fa0, $fa2 +; LA64D-FRECIPE-NEXT: fmadd.s $fa1, $fa3, $fa2, $fa1 +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_1) +; LA64D-FRECIPE-NEXT: fld.s $fa3, $a3, %pc_lo12(.LCPI6_1) +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_2) +; LA64D-FRECIPE-NEXT: fld.s $fa4, $a3, %pc_lo12(.LCPI6_2) +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_3) +; LA64D-FRECIPE-NEXT: fld.s $fa5, $a3, %pc_lo12(.LCPI6_3) +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa2, $fa3 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa2, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.s $fa3, $fa1, $fa5 +; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fst.s $fa1, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.s $fa2, $a1, 0 +; LA64D-FRECIPE-NEXT: fst.s $fa3, $a2, 0 +; LA64D-FRECIPE-NEXT: ret +; + %sqrt = tail call fast float @llvm.sqrt.f32(float %x) + %rsqrt = fdiv fast float 1.0, %sqrt + %r1 = fdiv fast float 42.0, %sqrt + %r2 = fdiv fast float 43.0, %sqrt + %sqrt_fast = fdiv fast float %x, %sqrt + store float %rsqrt, ptr %p1, align 8 + store float %r1, ptr %p2, align 8 + store float %r2, ptr %p3, align 8 + ret float %sqrt_fast +} + +define float @sqrt_simplify_before_recip_3_uses_order_f32(float %x, ptr %p1, ptr %p2) nounwind { +; LA32F-LABEL: sqrt_simplify_before_recip_3_uses_order_f32: +; LA32F: # %bb.0: +; LA32F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_0) +; LA32F-NEXT: fld.s $fa1, $a2, %pc_lo12(.LCPI7_0) +; LA32F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_1) +; LA32F-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI7_1) +; LA32F-NEXT: fsqrt.s $fa0, $fa0 +; LA32F-NEXT: fdiv.s $fa1, $fa1, $fa0 +; LA32F-NEXT: fdiv.s $fa2, $fa2, $fa0 +; LA32F-NEXT: fst.s $fa1, $a0, 0 +; LA32F-NEXT: fst.s $fa2, $a1, 0 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_order_f32: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_0) +; LA32F-FRECIPE-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI7_0) +; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_1) +; LA32F-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI7_1) +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fmul.s $fa4, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fmadd.s $fa2, $fa4, $fa1, $fa2 +; 
LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa3 +; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_2) +; LA32F-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI7_2) +; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_3) +; LA32F-FRECIPE-NEXT: fld.s $fa4, $a2, %pc_lo12(.LCPI7_3) +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa2 +; LA32F-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa3 +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa4 +; LA32F-FRECIPE-NEXT: fst.s $fa2, $a0, 0 +; LA32F-FRECIPE-NEXT: fst.s $fa1, $a1, 0 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: sqrt_simplify_before_recip_3_uses_order_f32: +; LA64D: # %bb.0: +; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_0) +; LA64D-NEXT: fld.s $fa1, $a2, %pc_lo12(.LCPI7_0) +; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_1) +; LA64D-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI7_1) +; LA64D-NEXT: fsqrt.s $fa0, $fa0 +; LA64D-NEXT: fdiv.s $fa1, $fa1, $fa0 +; LA64D-NEXT: fdiv.s $fa2, $fa2, $fa0 +; LA64D-NEXT: fst.s $fa1, $a0, 0 +; LA64D-NEXT: fst.s $fa2, $a1, 0 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_order_f32: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_0) +; LA64D-FRECIPE-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI7_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_1) +; LA64D-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI7_1) +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.s $fa2, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_2) +; LA64D-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI7_2) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_3) +; LA64D-FRECIPE-NEXT: fld.s $fa4, $a2, %pc_lo12(.LCPI7_3) +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fst.s $fa2, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.s $fa1, $a1, 0 +; LA64D-FRECIPE-NEXT: ret +; + %sqrt = tail call fast float @llvm.sqrt.f32(float %x) + %sqrt_fast = fdiv fast float %x, %sqrt + %r1 = fdiv fast float 42.0, %sqrt + %r2 = fdiv fast float 43.0, %sqrt + store float %r1, ptr %p1, align 8 + store float %r2, ptr %p2, align 8 + ret float %sqrt_fast +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/fdiv-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/lasx/fdiv-reciprocal-estimate.ll new file mode 100644 index 00000000000000..769d9ef81faf39 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/fdiv-reciprocal-estimate.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx,-frecipe < %s | FileCheck %s --check-prefix=FAULT +; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s + +define void @fdiv_v8f32(ptr %res, ptr %a0, ptr %a1) nounwind { +; FAULT-LABEL: fdiv_v8f32: +; FAULT: # %bb.0: +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvld $xr1, $a2, 0 +; FAULT-NEXT: xvfdiv.s $xr0, $xr0, $xr1 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: fdiv_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: xvfrecipe.s $xr2, $xr0 +; CHECK-NEXT: xvfmul.s $xr3, $xr1, $xr2 +; CHECK-NEXT: xvfnmsub.s $xr0, $xr0, $xr3, $xr1 +; CHECK-NEXT: xvfmadd.s $xr0, 
$xr2, $xr0, $xr3 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0 + %v1 = load <8 x float>, ptr %a1 + %v2 = fdiv fast <8 x float> %v0, %v1 + store <8 x float> %v2, ptr %res + ret void +} + +define void @fdiv_v4f64(ptr %res, ptr %a0, ptr %a1) nounwind { +; FAULT-LABEL: fdiv_v4f64: +; FAULT: # %bb.0: +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvld $xr1, $a2, 0 +; FAULT-NEXT: xvfdiv.d $xr0, $xr0, $xr1 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: fdiv_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: lu52i.d $a1, $zero, -1025 +; CHECK-NEXT: xvreplgr2vr.d $xr2, $a1 +; CHECK-NEXT: xvfrecipe.d $xr3, $xr0 +; CHECK-NEXT: xvfmadd.d $xr2, $xr0, $xr3, $xr2 +; CHECK-NEXT: xvfnmsub.d $xr2, $xr2, $xr3, $xr3 +; CHECK-NEXT: xvfmul.d $xr3, $xr1, $xr2 +; CHECK-NEXT: xvfnmsub.d $xr0, $xr0, $xr3, $xr1 +; CHECK-NEXT: xvfmadd.d $xr0, $xr2, $xr0, $xr3 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0 + %v1 = load <4 x double>, ptr %a1 + %v2 = fdiv fast <4 x double> %v0, %v1 + store <4 x double> %v2, ptr %res + ret void +} + +;; 1.0 / vec +define void @one_fdiv_v8f32(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_fdiv_v8f32: +; FAULT: # %bb.0: +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvfrecip.s $xr0, $xr0 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_fdiv_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrecipe.s $xr1, $xr0 +; CHECK-NEXT: lu12i.w $a1, -264192 +; CHECK-NEXT: xvreplgr2vr.w $xr2, $a1 +; CHECK-NEXT: xvfmadd.s $xr0, $xr0, $xr1, $xr2 +; CHECK-NEXT: xvfnmsub.s $xr0, $xr0, $xr1, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0 + %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %v0 + store <8 x float> %div, ptr %res + ret void +} + +define void @one_fdiv_v4f64(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_fdiv_v4f64: +; FAULT: # %bb.0: +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvfrecip.d $xr0, $xr0 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_fdiv_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrecipe.d $xr1, $xr0 +; CHECK-NEXT: lu52i.d $a1, $zero, 1023 +; CHECK-NEXT: xvreplgr2vr.d $xr2, $a1 +; CHECK-NEXT: xvfnmsub.d $xr3, $xr0, $xr1, $xr2 +; CHECK-NEXT: xvfmadd.d $xr1, $xr1, $xr3, $xr1 +; CHECK-NEXT: xvfnmsub.d $xr0, $xr0, $xr1, $xr2 +; CHECK-NEXT: xvfmadd.d $xr0, $xr1, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0 + %div = fdiv fast <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>, %v0 + store <4 x double> %div, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/fsqrt-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/lasx/fsqrt-reciprocal-estimate.ll new file mode 100644 index 00000000000000..48fd12697417ac --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/fsqrt-reciprocal-estimate.ll @@ -0,0 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 --mattr=+lasx,-frecipe < %s | FileCheck %s --check-prefix=FAULT +; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s + +;; 1.0 / (fsqrt vec) +define void @one_div_sqrt_v8f32(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: 
one_div_sqrt_v8f32: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvfrsqrt.s $xr0, $xr0 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_div_sqrt_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrsqrte.s $xr1, $xr0 +; CHECK-NEXT: xvfmul.s $xr1, $xr0, $xr1 +; CHECK-NEXT: xvfmul.s $xr0, $xr0, $xr1 +; CHECK-NEXT: lu12i.w $a1, -261120 +; CHECK-NEXT: xvreplgr2vr.w $xr2, $a1 +; CHECK-NEXT: xvfmadd.s $xr0, $xr0, $xr1, $xr2 +; CHECK-NEXT: lu12i.w $a1, -266240 +; CHECK-NEXT: xvreplgr2vr.w $xr2, $a1 +; CHECK-NEXT: xvfmul.s $xr1, $xr1, $xr2 +; CHECK-NEXT: xvfmul.s $xr0, $xr1, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0, align 16 + %sqrt = call fast <8 x float> @llvm.sqrt.v8f32 (<8 x float> %v0) + %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt + store <8 x float> %div, ptr %res, align 16 + ret void +} + +define void @one_div_sqrt_v4f64(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_div_sqrt_v4f64: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvfrsqrt.d $xr0, $xr0 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_div_sqrt_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrsqrte.d $xr1, $xr0 +; CHECK-NEXT: xvfmul.d $xr1, $xr0, $xr1 +; CHECK-NEXT: xvfmul.d $xr2, $xr0, $xr1 +; CHECK-NEXT: ori $a1, $zero, 0 +; CHECK-NEXT: lu32i.d $a1, -524288 +; CHECK-NEXT: lu52i.d $a1, $a1, -1024 +; CHECK-NEXT: xvreplgr2vr.d $xr3, $a1 +; CHECK-NEXT: xvfmadd.d $xr2, $xr2, $xr1, $xr3 +; CHECK-NEXT: lu52i.d $a1, $zero, -1026 +; CHECK-NEXT: xvreplgr2vr.d $xr4, $a1 +; CHECK-NEXT: xvfmul.d $xr1, $xr1, $xr4 +; CHECK-NEXT: xvfmul.d $xr1, $xr1, $xr2 +; CHECK-NEXT: xvfmul.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvfmadd.d $xr0, $xr0, $xr1, $xr3 +; CHECK-NEXT: xvfmul.d $xr1, $xr1, $xr4 +; CHECK-NEXT: xvfmul.d $xr0, $xr1, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0, align 16 + %sqrt = call fast <4 x double> @llvm.sqrt.v4f64 (<4 x double> %v0) + %div = fdiv fast <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>, %sqrt + store <4 x double> %div, ptr %res, align 16 + ret void +} + +declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/fdiv-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/lsx/fdiv-reciprocal-estimate.ll new file mode 100644 index 00000000000000..21dbbf310ad870 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/fdiv-reciprocal-estimate.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx,-frecipe < %s | FileCheck %s --check-prefix=FAULT +; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s + +define void @fdiv_v4f32(ptr %res, ptr %a0, ptr %a1) nounwind { +; FAULT-LABEL: fdiv_v4f32: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: vld $vr0, $a1, 0 +; FAULT-NEXT: vld $vr1, $a2, 0 +; FAULT-NEXT: vfdiv.s $vr0, $vr0, $vr1 +; FAULT-NEXT: vst $vr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: fdiv_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vfrecipe.s $vr2, $vr0 +; CHECK-NEXT: vfmul.s $vr3, $vr1, $vr2 +; CHECK-NEXT: vfnmsub.s $vr0, $vr0, $vr3, $vr1 +; 
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <4 x float>, ptr %a0
+  %v1 = load <4 x float>, ptr %a1
+  %v2 = fdiv fast <4 x float> %v0, %v1
+  store <4 x float> %v2, ptr %res
+  ret void
+}
+
+define void @fdiv_v2f64(ptr %res, ptr %a0, ptr %a1) nounwind {
+; FAULT-LABEL: fdiv_v2f64:
+; FAULT:       # %bb.0: # %entry
+; FAULT-NEXT:    vld $vr0, $a1, 0
+; FAULT-NEXT:    vld $vr1, $a2, 0
+; FAULT-NEXT:    vfdiv.d $vr0, $vr0, $vr1
+; FAULT-NEXT:    vst $vr0, $a0, 0
+; FAULT-NEXT:    ret
+;
+; CHECK-LABEL: fdiv_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a2, 0
+; CHECK-NEXT:    vld $vr1, $a1, 0
+; CHECK-NEXT:    lu52i.d $a1, $zero, -1025
+; CHECK-NEXT:    vreplgr2vr.d $vr2, $a1
+; CHECK-NEXT:    vfrecipe.d $vr3, $vr0
+; CHECK-NEXT:    vfmadd.d $vr2, $vr0, $vr3, $vr2
+; CHECK-NEXT:    vfnmsub.d $vr2, $vr2, $vr3, $vr3
+; CHECK-NEXT:    vfmul.d $vr3, $vr1, $vr2
+; CHECK-NEXT:    vfnmsub.d $vr0, $vr0, $vr3, $vr1
+; CHECK-NEXT:    vfmadd.d $vr0, $vr2, $vr0, $vr3
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <2 x double>, ptr %a0
+  %v1 = load <2 x double>, ptr %a1
+  %v2 = fdiv fast <2 x double> %v0, %v1
+  store <2 x double> %v2, ptr %res
+  ret void
+}
+
+;; 1.0 / vec
+define void @one_fdiv_v4f32(ptr %res, ptr %a0) nounwind {
+; FAULT-LABEL: one_fdiv_v4f32:
+; FAULT:       # %bb.0: # %entry
+; FAULT-NEXT:    vld $vr0, $a1, 0
+; FAULT-NEXT:    vfrecip.s $vr0, $vr0
+; FAULT-NEXT:    vst $vr0, $a0, 0
+; FAULT-NEXT:    ret
+;
+; CHECK-LABEL: one_fdiv_v4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vfrecipe.s $vr1, $vr0
+; CHECK-NEXT:    lu12i.w $a1, -264192
+; CHECK-NEXT:    vreplgr2vr.w $vr2, $a1
+; CHECK-NEXT:    vfmadd.s $vr0, $vr0, $vr1, $vr2
+; CHECK-NEXT:    vfnmsub.s $vr0, $vr0, $vr1, $vr1
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <4 x float>, ptr %a0
+  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %v0
+  store <4 x float> %div, ptr %res
+  ret void
+}
+
+define void @one_fdiv_v2f64(ptr %res, ptr %a0) nounwind {
+; FAULT-LABEL: one_fdiv_v2f64:
+; FAULT:       # %bb.0: # %entry
+; FAULT-NEXT:    vld $vr0, $a1, 0
+; FAULT-NEXT:    vfrecip.d $vr0, $vr0
+; FAULT-NEXT:    vst $vr0, $a0, 0
+; FAULT-NEXT:    ret
+;
+; CHECK-LABEL: one_fdiv_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vfrecipe.d $vr1, $vr0
+; CHECK-NEXT:    lu52i.d $a1, $zero, 1023
+; CHECK-NEXT:    vreplgr2vr.d $vr2, $a1
+; CHECK-NEXT:    vfnmsub.d $vr3, $vr0, $vr1, $vr2
+; CHECK-NEXT:    vfmadd.d $vr1, $vr1, $vr3, $vr1
+; CHECK-NEXT:    vfnmsub.d $vr0, $vr0, $vr1, $vr2
+; CHECK-NEXT:    vfmadd.d $vr0, $vr1, $vr0, $vr1
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <2 x double>, ptr %a0
+  %div = fdiv fast <2 x double> <double 1.0, double 1.0>, %v0
+  store <2 x double> %div, ptr %res
+  ret void
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/fsqrt-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/lsx/fsqrt-reciprocal-estimate.ll
new file mode 100644
index 00000000000000..912d06242f7d3e
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/fsqrt-reciprocal-estimate.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx,-frecipe < %s | FileCheck %s --check-prefix=FAULT
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s
+
+;; 1.0 / (fsqrt vec)
+define void @one_div_sqrt_v4f32(ptr %res, ptr %a0) nounwind {
+; FAULT-LABEL: one_div_sqrt_v4f32:
+; FAULT:       # %bb.0: # %entry
+; FAULT-NEXT:    vld $vr0, $a1, 0
+; FAULT-NEXT:    vfrsqrt.s $vr0, $vr0
+; FAULT-NEXT:    vst $vr0, $a0, 0
+; FAULT-NEXT:    ret
+;
+; CHECK-LABEL: one_div_sqrt_v4f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vfrsqrte.s $vr1, $vr0
+; CHECK-NEXT:    vfmul.s $vr1, $vr0, $vr1
+; CHECK-NEXT:    vfmul.s $vr0, $vr0, $vr1
+; CHECK-NEXT:    lu12i.w $a1, -261120
+; CHECK-NEXT:    vreplgr2vr.w $vr2, $a1
+; CHECK-NEXT:    vfmadd.s $vr0, $vr0, $vr1, $vr2
+; CHECK-NEXT:    lu12i.w $a1, -266240
+; CHECK-NEXT:    vreplgr2vr.w $vr2, $a1
+; CHECK-NEXT:    vfmul.s $vr1, $vr1, $vr2
+; CHECK-NEXT:    vfmul.s $vr0, $vr1, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <4 x float>, ptr %a0, align 16
+  %sqrt = call fast <4 x float> @llvm.sqrt.v4f32 (<4 x float> %v0)
+  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
+  store <4 x float> %div, ptr %res, align 16
+  ret void
+}
+
+define void @one_div_sqrt_v2f64(ptr %res, ptr %a0) nounwind {
+; FAULT-LABEL: one_div_sqrt_v2f64:
+; FAULT:       # %bb.0: # %entry
+; FAULT-NEXT:    vld $vr0, $a1, 0
+; FAULT-NEXT:    vfrsqrt.d $vr0, $vr0
+; FAULT-NEXT:    vst $vr0, $a0, 0
+; FAULT-NEXT:    ret
+;
+; CHECK-LABEL: one_div_sqrt_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vld $vr0, $a1, 0
+; CHECK-NEXT:    vfrsqrte.d $vr1, $vr0
+; CHECK-NEXT:    vfmul.d $vr1, $vr0, $vr1
+; CHECK-NEXT:    vfmul.d $vr2, $vr0, $vr1
+; CHECK-NEXT:    ori $a1, $zero, 0
+; CHECK-NEXT:    lu32i.d $a1, -524288
+; CHECK-NEXT:    lu52i.d $a1, $a1, -1024
+; CHECK-NEXT:    vreplgr2vr.d $vr3, $a1
+; CHECK-NEXT:    vfmadd.d $vr2, $vr2, $vr1, $vr3
+; CHECK-NEXT:    lu52i.d $a1, $zero, -1026
+; CHECK-NEXT:    vreplgr2vr.d $vr4, $a1
+; CHECK-NEXT:    vfmul.d $vr1, $vr1, $vr4
+; CHECK-NEXT:    vfmul.d $vr1, $vr1, $vr2
+; CHECK-NEXT:    vfmul.d $vr0, $vr0, $vr1
+; CHECK-NEXT:    vfmadd.d $vr0, $vr0, $vr1, $vr3
+; CHECK-NEXT:    vfmul.d $vr1, $vr1, $vr4
+; CHECK-NEXT:    vfmul.d $vr0, $vr1, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+entry:
+  %v0 = load <2 x double>, ptr %a0, align 16
+  %sqrt = call fast <2 x double> @llvm.sqrt.v2f64 (<2 x double> %v0)
+  %div = fdiv fast <2 x double> <double 1.0, double 1.0>, %sqrt
+  store <2 x double> %div, ptr %res, align 16
+  ret void
+}
+
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
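
For reference, the expanded CHECK sequences above are Newton-Raphson refinements of the estimate instructions. In the fdiv tests, the fmadd/fnmsub pair computes r1 = r0 * (2 - b * r0) from a reciprocal estimate r0 ~= 1/b, followed for a/b by a quotient correction q1 = q0 + r0 * (a - b * q0); the rsqrt tests apply the standard x1 = 0.5 * x0 * (3 - a * x0 * x0) iteration with the -3.0 and -0.5 constants folded into the fused operations. The lu12i.w/lu52i.d immediates (-264192, -261120, -266240, -1025, 1023, -1026, ...) are simply the f32/f64 bit patterns of those -1.0, 1.0, -3.0 and -0.5 constants, splatted with [x]vreplgr2vr.

A minimal scalar sketch of the same combine (illustrative only; the file and function names below are made up and not part of this patch):

  ; llc --mtriple=loongarch64 --mattr=+d,+frecipe recip.ll
  define float @recip_f32(float %x) nounwind {
  entry:
    ; With +frecipe and fast-math, this 1.0/x is expected to select
    ; frecipe.s plus one refinement step instead of fdiv.s.
    %r = fdiv fast float 1.0, %x
    ret float %r
  }

As in the tests above, the FAULT prefixes pin down the plain fdiv/frecip/frsqrt lowering when the frecipe feature is disabled.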