llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-llvm-globalisel Author: Matt Arsenault (arsenm) <details> <summary>Changes</summary> Device libs has a fast sqrt macro implemented this way. --- Patch is 156.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/183697.diff 4 Files Affected: - (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+22-17) - (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+23-17) - (modified) llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll (+180-660) - (modified) llvm/test/CodeGen/AMDGPU/rsq.f64.ll (+143-625) ``````````diff diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index c117c6bf4ddd9..1b9028eb41487 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -5856,18 +5856,21 @@ bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI, Register X = MI.getOperand(1).getReg(); unsigned Flags = MI.getFlags(); - auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767); + Register SqrtX = X; + Register Scaling, ZeroInt; + if (!MI.getFlag(MachineInstr::FmAfn)) { + auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767); - auto ZeroInt = B.buildConstant(S32, 0); - auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant); + ZeroInt = B.buildConstant(S32, 0).getReg(0); + Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant).getReg(0); - // Scale up input if it is too small. - auto ScaleUpFactor = B.buildConstant(S32, 256); - auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt); - auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags); + // Scale up input if it is too small. 
+ auto ScaleUpFactor = B.buildConstant(S32, 256); + auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt); + SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags).getReg(0); + } - auto SqrtY = - B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0)); + auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX); auto Half = B.buildFConstant(F64, 0.5); auto SqrtH0 = B.buildFMul(F64, SqrtY, Half); @@ -5884,15 +5887,17 @@ bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI, auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1); - auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2); - auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX); - - auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2); + Register SqrtRet = SqrtS2.getReg(0); + if (!MI.getFlag(MachineInstr::FmAfn)) { + auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2); + auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX); + auto SqrtD2 = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2); - // Scale down the result. - auto ScaleDownFactor = B.buildConstant(S32, -128); - auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt); - SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags); + // Scale down the result. 
+ auto ScaleDownFactor = B.buildConstant(S32, -128); + auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt); + SqrtRet = B.buildFLdexp(F64, SqrtD2, ScaleDown, Flags).getReg(0); + } Register IsZeroOrInf; if (MI.getFlag(MachineInstr::FmNoInfs)) { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 1e0ba25158ff4..cd8575751220b 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -13182,17 +13182,20 @@ SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); SDValue X = Op.getOperand(0); - SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64); - - SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT); - SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32); - // Scale up input if it is too small. - SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32); - SDValue ScaleUp = - DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt); - SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags); + SDValue SqrtX = X; + SDValue Scaling; + if (!Flags.hasApproximateFuncs()) { + SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64); + Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT); + + // Scale up input if it is too small. 
+ SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32); + SDValue ScaleUp = + DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt); + SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags); + } SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX); @@ -13214,16 +13217,19 @@ SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const { SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1); - SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2); - SDValue SqrtD1 = - DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX); + SDValue SqrtRet = SqrtS2; + if (!Flags.hasApproximateFuncs()) { + SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2); + SDValue SqrtD1 = + DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX); - SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2); + SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2); - SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32); - SDValue ScaleDown = - DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt); - SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags); + SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32); + SDValue ScaleDown = DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, + ScaleDownFactor, ZeroInt); + SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags); + } // TODO: Check for DAZ and expand to subnormals diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll index 7cdf08800cb25..cf1cabd6d62ed 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll @@ -1066,116 +1066,80 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) { define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) { ; GFX6-SDAG-LABEL: s_sqrt_f64_afn: ; GFX6-SDAG: ; %bb.0: -; GFX6-SDAG-NEXT: 
v_mov_b32_e32 v0, 0 -; GFX6-SDAG-NEXT: v_bfrev_b32_e32 v1, 8 -; GFX6-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; GFX6-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec -; GFX6-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0 -; GFX6-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 -; GFX6-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 -; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; GFX6-SDAG-NEXT: v_mov_b32_e32 v4, 0x260 -; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4 -; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0 -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[0:1], s[0:1] +; GFX6-SDAG-NEXT: v_mov_b32_e32 v6, 0x260 +; GFX6-SDAG-NEXT: v_cmp_class_f64_e32 vcc, s[0:1], v6 +; GFX6-SDAG-NEXT: v_mov_b32_e32 v7, s1 +; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], s[0:1], v[0:1] +; GFX6-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 +; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 0.5 +; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[2:3] +; GFX6-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[0:1] +; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], -v[2:3], v[2:3], s[0:1] +; GFX6-SDAG-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3] +; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-SDAG-NEXT: v_readfirstlane_b32 s1, v1 ; 
GFX6-SDAG-NEXT: ; return to shader part epilog ; ; GFX8-SDAG-LABEL: s_sqrt_f64_afn: ; GFX8-SDAG: ; %bb.0: -; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-SDAG-NEXT: v_bfrev_b32_e32 v1, 8 -; GFX8-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; GFX8-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec -; GFX8-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0 -; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 -; GFX8-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 -; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; GFX8-SDAG-NEXT: v_mov_b32_e32 v4, 0x260 -; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v4 -; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0 -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[0:1], s[0:1] +; GFX8-SDAG-NEXT: v_mov_b32_e32 v6, 0x260 +; GFX8-SDAG-NEXT: v_cmp_class_f64_e32 vcc, s[0:1], v6 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v7, s1 +; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], s[0:1], v[0:1] +; GFX8-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 +; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 0.5 +; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[2:3] +; GFX8-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[0:1] +; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], -v[2:3], v[2:3], s[0:1] +; GFX8-SDAG-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3] +; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc +; GFX8-SDAG-NEXT: 
v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-SDAG-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-SDAG-NEXT: ; return to shader part epilog ; ; GFX6-GISEL-LABEL: s_sqrt_f64_afn: ; GFX6-GISEL: ; %bb.0: -; GFX6-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 -; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 -; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 -; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] -; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 -; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] -; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] -; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] -; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] -; GFX6-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 -; GFX6-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 -; GFX6-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[0:1], s[0:1] +; GFX6-GISEL-NEXT: v_mov_b32_e32 v6, 0x260 +; GFX6-GISEL-NEXT: v_cmp_class_f64_e32 vcc, s[0:1], v6 +; GFX6-GISEL-NEXT: v_mov_b32_e32 v7, s0 +; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], 0.5 +; GFX6-GISEL-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1] +; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 0.5 +; GFX6-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[0:1] +; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[2:3] +; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[0:1], s[0:1] 
+; GFX6-GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; GFX6-GISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GFX6-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX6-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-GISEL-NEXT: v_readfirstlane_b32 s1, v1 ; GFX6-GISEL-NEXT: ; return to shader part epilog ; ; GFX8-GISEL-LABEL: s_sqrt_f64_afn: ; GFX8-GISEL: ; %bb.0: -; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 -; GFX8-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; GFX8-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX8-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX8-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 -; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; GFX8-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 -; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] -; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 -; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] -; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] -; GFX8-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] -; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] -; GFX8-GISEL-NEXT: v_mov_b32_e32 v4, 0xffffff80 -; GFX8-GISEL-NEXT: v_mov_b32_e32 v5, 0x260 -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc -; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v5 -; GFX8-GISEL-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-GISEL-NEXT: v_rsq_f64_e32 v[0:1], s[0:1] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v6, 0x260 +; GFX8-GISEL-NEXT: v_cmp_class_f64_e32 vcc, s[0:1], v6 +; GFX8-GISEL-NEXT: v_mov_b32_e32 v7, s0 +; GFX8-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], 0.5 +; GFX8-GISEL-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1] +; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 0.5 +; 
GFX8-GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[0:1] +; GFX8-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[2:3] +; GFX8-GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[0:1], s[0:1] +; GFX8-GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; GFX8-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX8-GISEL-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-GISEL-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-GISEL-NEXT: ; return to shader part epilog @@ -1193,112 +1157,76 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) { define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) { ; GFX6-SDAG-LABEL: s_sqrt_f64_afn_nnan_ninf: ; GFX6-SDAG: ; %bb.0: -; GFX6-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX6-SDAG-NEXT: v_bfrev_b32_e32 v1, 8 -; GFX6-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; GFX6-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec -; GFX6-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0 -; GFX6-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX6-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 -; GFX6-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 -; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; GFX6-SDAG-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[0:1] -; GFX6-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; GFX6-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; GFX6-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0 -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX6-SDAG-NEXT: v_rsq_f64_e32 v[0:1], s[0:1] +; GFX6-SDAG-NEXT: v_cmp_eq_f64_e64 vcc, 
s[0:1], 0 +; GFX6-SDAG-NEXT: v_mov_b32_e32 v6, s1 +; GFX6-SDAG-NEXT: v_mul_f64 v[2:3], s[0:1], v[0:1] +; GFX6-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 +; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 0.5 +; GFX6-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[2:3] +; GFX6-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[0:1] +; GFX6-SDAG-NEXT: v_fma_f64 v[4:5], -v[2:3], v[2:3], s[0:1] +; GFX6-SDAG-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3] +; GFX6-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX6-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX6-SDAG-NEXT: v_readfirstlane_b32 s1, v1 ; GFX6-SDAG-NEXT: ; return to shader part epilog ; ; GFX8-SDAG-LABEL: s_sqrt_f64_afn_nnan_ninf: ; GFX8-SDAG: ; %bb.0: -; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-SDAG-NEXT: v_bfrev_b32_e32 v1, 8 -; GFX8-SDAG-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; GFX8-SDAG-NEXT: s_and_b64 s[2:3], vcc, exec -; GFX8-SDAG-NEXT: s_cselect_b32 s2, 0x100, 0 -; GFX8-SDAG-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-SDAG-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 -; GFX8-SDAG-NEXT: s_cselect_b32 s0, 0xffffff80, 0 -; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; GFX8-SDAG-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[0:1] -; GFX8-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] -; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 -; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 0.5 -; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], v[6:7], v[2:3], v[4:5] -; GFX8-SDAG-NEXT: v_fma_f64 v[6:7], -v[4:5], v[4:5], v[0:1] -; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[4:5] -; GFX8-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], s0 -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-SDAG-NEXT: v_rsq_f64_e32 v[0:1], s[0:1] 
+; GFX8-SDAG-NEXT: v_cmp_eq_f64_e64 vcc, s[0:1], 0 +; GFX8-SDAG-NEXT: v_mov_b32_e32 v6, s1 +; GFX8-SDAG-NEXT: v_mul_f64 v[2:3], s[0:1], v[0:1] +; GFX8-SDAG-NEXT: v_mul_f64 v[0:1], v[0:1], 0.5 +; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 0.5 +; GFX8-SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[2:3] +; GFX8-SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[0:1] +; GFX8-SDAG-NEXT: v_fma_f64 v[4:5], -v[2:3], v[2:3], s[0:1] +; GFX8-SDAG-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[2:3] +; GFX8-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc +; GFX8-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-SDAG-NEXT: v_readfirstlane_b32 s0, v0 ; GFX8-SDAG-NEXT: v_readfirstlane_b32 s1, v1 ; GFX8-SDAG-NEXT: ; return to shader part epilog ; ; GFX6-GISEL-LABEL: s_sqrt_f64_afn_nnan_ninf: ; GFX6-GISEL: ; %bb.0: -; GFX6-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX6-GISEL-NEXT: v_bfrev_b32_e32 v1, 8 -; GFX6-GISEL-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; GFX6-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX6-GISEL-NEXT: v_lshlrev_b32_e32 v0, 8, v0 -; GFX6-GISEL-NEXT: v_ldexp_f64 v[0:1], s[0:1], v0 -; GFX6-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] -; GFX6-GISEL-NEXT: v_mul_f64 v[4:5], v[2:3], 0.5 -; GFX6-GISEL-NEXT: v_mul_f64 v[2:3], v[0:1], v[2:3] -; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[4:5], v[2:3], 0.5 -; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[2:3] -; GFX6-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] -; GFX6-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[4:5], v[2:3] -; GFX6-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[2:3], v[0:1] -; GFX6-GISEL-... [truncated] `````````` </details> https://github.com/llvm/llvm-project/pull/183697 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
