https://github.com/AlexMaclean updated https://github.com/llvm/llvm-project/pull/135644
>From fd11c2b4c964a3fe336e3fcb106fca5bf9c7d2b2 Mon Sep 17 00:00:00 2001 From: Alex Maclean <amacl...@nvidia.com> Date: Fri, 11 Apr 2025 17:59:50 +0000 Subject: [PATCH 1/5] [NVPTX] Cleanup and document nvvm.fabs intrinsics, adding f16 support --- llvm/docs/NVPTXUsage.rst | 52 ++++++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 14 +- llvm/lib/IR/AutoUpgrade.cpp | 16 +- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 19 +- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 162 +++++++++--------- .../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 11 +- llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll | 142 +++++++++++++++ .../math-intrins-sm80-ptx70-autoupgrade.ll | 3 + 8 files changed, 316 insertions(+), 103 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 621879fc5648b..fbb7122b9b42d 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -309,6 +309,58 @@ space casted to this space), 1 is returned, otherwise 0 is returned. Arithmetic Intrinsics --------------------- +'``llvm.nvvm.fabs.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare float @llvm.nvvm.fabs.f32(float %a) + declare double @llvm.nvvm.fabs.f64(double %a) + declare half @llvm.nvvm.fabs.f16(half %a) + declare <2 x bfloat> @llvm.nvvm.fabs.v2bf16(<2 x bfloat> %a) + +Overview: +""""""""" + +The '``llvm.nvvm.fabs.*``' intrinsics return the absolute value of the operand. + +Semantics: +"""""""""" + +Unlike '``llvm.fabs.*``', these intrinsics do not perfectly preserve NaN +values. Instead, a NaN input yields an unspecified NaN output. The exception to +this rule is the double precision variant, for which NaN is preserved. + + +'``llvm.nvvm.fabs.ftz.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare float @llvm.nvvm.fabs.ftz.f32(float %a) + declare half @llvm.nvvm.fabs.ftz.f16(half %a) + declare <2 x half> @llvm.nvvm.fabs.ftz.v2f16(<2 x half> %a) + +Overview: +""""""""" + +The '``llvm.nvvm.fabs.ftz.*``' intrinsics return the absolute value of the +operand, flushing subnormals to sign preserving zero. + +Semantics: +"""""""""" + +Before the absolute value is taken, the input is flushed to sign preserving +zero if it is a subnormal. In addition, unlike '``llvm.fabs.*``', a NaN input +yields an unspecified NaN output. 
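(Illustrative sketch, not part of the patch: a minimal IR example of how the intrinsics documented above might be called. The function name @fabs_demo and the chaining of the two calls are invented for illustration.)

  declare float @llvm.nvvm.fabs.f32(float)
  declare float @llvm.nvvm.fabs.ftz.f32(float)

  define float @fabs_demo(float %x) {
    ; plain variant: absolute value, a NaN input may yield a different NaN
    %a = call float @llvm.nvvm.fabs.f32(float %x)
    ; ftz variant: subnormal inputs are flushed to sign-preserving zero first
    %b = call float @llvm.nvvm.fabs.ftz.f32(float %a)
    ret float %b
  }
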
+ + '``llvm.nvvm.idp2a.[us].[us]``' Intrinsics ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 4aeb1d8a2779e..2cea00c640a02 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1039,18 +1039,18 @@ let TargetPrefix = "nvvm" in { // Abs // - def int_nvvm_fabs_ftz_f : ClangBuiltin<"__nvvm_fabs_ftz_f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_fabs_f : ClangBuiltin<"__nvvm_fabs_f">, - DefaultAttrsIntrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable]>; - def int_nvvm_fabs_d : ClangBuiltin<"__nvvm_fabs_d">, - DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem, IntrSpeculatable]>; + def int_nvvm_fabs_ftz : + DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable]>; + def int_nvvm_fabs : + DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable]>; // // Abs, Neg bf16, bf16x2 // - foreach unary = ["abs", "neg"] in { + foreach unary = ["neg"] in { def int_nvvm_ # unary # _bf16 : ClangBuiltin<!strconcat("__nvvm_", unary, "_bf16")>, DefaultAttrsIntrinsic<[llvm_bfloat_ty], [llvm_bfloat_ty], [IntrNoMem]>; diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 0b329d91c3c7c..c9b946615629a 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -939,12 +939,6 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, } static Intrinsic::ID shouldUpgradeNVPTXBF16Intrinsic(StringRef Name) { - if (Name.consume_front("abs.")) - return StringSwitch<Intrinsic::ID>(Name) - .Case("bf16", Intrinsic::nvvm_abs_bf16) - .Case("bf16x2", Intrinsic::nvvm_abs_bf16x2) - .Default(Intrinsic::not_intrinsic); - if (Name.consume_front("fma.rn.")) return StringSwitch<Intrinsic::ID>(Name) .Case("bf16", Intrinsic::nvvm_fma_rn_bf16) @@ -1291,7 +1285,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, bool Expand = false; if (Name.consume_front("abs.")) // nvvm.abs.{i,ii} - Expand = Name == "i" || Name == "ll"; + Expand = + Name == "i" || Name == "ll" || Name == "bf16" || Name == "bf16x2"; else if (Name == "clz.ll" || Name == "popc.ll" || Name == "h2f" || Name == "swap.lo.hi.b64") Expand = true; @@ -2311,6 +2306,13 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, Value *Cmp = Builder.CreateICmpSGE( Arg, llvm::Constant::getNullValue(Arg->getType()), "abs.cond"); Rep = Builder.CreateSelect(Cmp, Arg, Neg, "abs"); + } else if (Name == "abs.bf16" || Name == "abs.bf16x2") { + Type *Ty = (Name == "abs.bf16") + ? 
Builder.getBFloatTy() + : FixedVectorType::get(Builder.getBFloatTy(), 2); + Value *Arg = Builder.CreateBitCast(CI->getArgOperand(0), Ty); + Value *Abs = Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_fabs, Arg); + Rep = Builder.CreateBitCast(Abs, CI->getType()); } else if (Name.starts_with("atomic.load.add.f32.p") || Name.starts_with("atomic.load.add.f64.p")) { Value *Ptr = CI->getArgOperand(0); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index aa0eedb1b7446..62fdb6d2523f4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -226,14 +226,17 @@ class RegTyInfo<ValueType ty, NVPTXRegClass rc, Operand imm, SDNode imm_node, int Size = ty.Size; } -def I16RT : RegTyInfo<i16, Int16Regs, i16imm, imm>; -def I32RT : RegTyInfo<i32, Int32Regs, i32imm, imm>; -def I64RT : RegTyInfo<i64, Int64Regs, i64imm, imm>; - -def F32RT : RegTyInfo<f32, Float32Regs, f32imm, fpimm>; -def F64RT : RegTyInfo<f64, Float64Regs, f64imm, fpimm>; -def F16RT : RegTyInfo<f16, Int16Regs, f16imm, fpimm, supports_imm = 0>; -def BF16RT : RegTyInfo<bf16, Int16Regs, bf16imm, fpimm, supports_imm = 0>; +def I16RT : RegTyInfo<i16, Int16Regs, i16imm, imm>; +def I32RT : RegTyInfo<i32, Int32Regs, i32imm, imm>; +def I64RT : RegTyInfo<i64, Int64Regs, i64imm, imm>; + +def F32RT : RegTyInfo<f32, Float32Regs, f32imm, fpimm>; +def F64RT : RegTyInfo<f64, Float64Regs, f64imm, fpimm>; +def F16RT : RegTyInfo<f16, Int16Regs, f16imm, fpimm, supports_imm = 0>; +def BF16RT : RegTyInfo<bf16, Int16Regs, bf16imm, fpimm, supports_imm = 0>; + +def F16X2RT : RegTyInfo<v2f16, Int32Regs, ?, ?, supports_imm = 0>; +def BF16X2RT : RegTyInfo<v2bf16, Int32Regs, ?, ?, supports_imm = 0>; // Template for instructions which take three int64, int32, or int16 args. diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 8528ff702f236..6f6601555f6e4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -983,12 +983,13 @@ def : Pat<(int_nvvm_fmin_d // We need a full string for OpcStr here because we need to deal with case like // INT_PTX_RECIP. -class F_MATH_1<string OpcStr, NVPTXRegClass target_regclass, - NVPTXRegClass src_regclass, Intrinsic IntOP, list<Predicate> Preds = []> - : NVPTXInst<(outs target_regclass:$dst), (ins src_regclass:$src0), - OpcStr, - [(set target_regclass:$dst, (IntOP src_regclass:$src0))]>, - Requires<Preds>; +class F_MATH_1<string OpcStr, RegTyInfo dst, RegTyInfo src, Intrinsic IntOP, + list<Predicate> Preds = []> + : NVPTXInst<(outs dst.RC:$dst), + (ins src.RC:$src0), + OpcStr, + [(set dst.Ty:$dst, (IntOP src.Ty:$src0))]>, + Requires<Preds>; // We need a full string for OpcStr here because we need to deal with the case // like INT_PTX_NATIVE_POWR_F. @@ -1307,13 +1308,20 @@ def : Pat<(int_nvvm_ceil_d f64:$a), // Abs // -def INT_NVVM_FABS_FTZ_F : F_MATH_1<"abs.ftz.f32 \t$dst, $src0;", Float32Regs, - Float32Regs, int_nvvm_fabs_ftz_f>; -def INT_NVVM_FABS_F : F_MATH_1<"abs.f32 \t$dst, $src0;", Float32Regs, - Float32Regs, int_nvvm_fabs_f>; +multiclass F_ABS<string suffix, RegTyInfo RT, bit support_ftz, list<Predicate> preds = []> { + def "" : F_MATH_1<"abs." # suffix # " \t$dst, $src0;", RT, RT, int_nvvm_fabs, preds>; + if support_ftz then + def _FTZ : F_MATH_1<"abs.ftz." 
# suffix # " \t$dst, $src0;", RT, RT, int_nvvm_fabs_ftz, preds>; +} + +defm ABS_F16 : F_ABS<"f16", F16RT, support_ftz = true, preds = [hasPTX<65>, hasSM<53>]>; +defm ABS_F16X2 : F_ABS<"f16x2", F16X2RT, support_ftz = true, preds = [hasPTX<65>, hasSM<53>]>; + +defm ABS_BF16 : F_ABS<"bf16", BF16RT, support_ftz = false, preds = [hasPTX<70>, hasSM<80>]>; +defm ABS_BF16X2 : F_ABS<"bf16x2", BF16X2RT, support_ftz = false, preds = [hasPTX<70>, hasSM<80>]>; -def INT_NVVM_FABS_D : F_MATH_1<"abs.f64 \t$dst, $src0;", Float64Regs, - Float64Regs, int_nvvm_fabs_d>; +defm ABS_F32 : F_ABS<"f32", F32RT, support_ftz = true>; +defm ABS_F64 : F_ABS<"f64", F64RT, support_ftz = false>; // // copysign @@ -1332,17 +1340,13 @@ def COPYSIGN_D : [(set f64:$dst, (fcopysign_nvptx f64:$src1, f64:$src0))]>; // -// Abs, Neg bf16, bf16x2 +// Neg bf16, bf16x2 // -def INT_NVVM_ABS_BF16 : F_MATH_1<"abs.bf16 \t$dst, $src0;", Int16Regs, - Int16Regs, int_nvvm_abs_bf16, [hasPTX<70>, hasSM<80>]>; -def INT_NVVM_ABS_BF16X2 : F_MATH_1<"abs.bf16x2 \t$dst, $src0;", Int32Regs, - Int32Regs, int_nvvm_abs_bf16x2, [hasPTX<70>, hasSM<80>]>; -def INT_NVVM_NEG_BF16 : F_MATH_1<"neg.bf16 \t$dst, $src0;", Int16Regs, - Int16Regs, int_nvvm_neg_bf16, [hasPTX<70>, hasSM<80>]>; -def INT_NVVM_NEG_BF16X2 : F_MATH_1<"neg.bf16x2 \t$dst, $src0;", Int32Regs, - Int32Regs, int_nvvm_neg_bf16x2, [hasPTX<70>, hasSM<80>]>; +def INT_NVVM_NEG_BF16 : F_MATH_1<"neg.bf16 \t$dst, $src0;", BF16RT, + BF16RT, int_nvvm_neg_bf16, [hasPTX<70>, hasSM<80>]>; +def INT_NVVM_NEG_BF16X2 : F_MATH_1<"neg.bf16x2 \t$dst, $src0;", BF16X2RT, + BF16X2RT, int_nvvm_neg_bf16x2, [hasPTX<70>, hasSM<80>]>; // // Round @@ -1382,16 +1386,16 @@ def : Pat<(int_nvvm_saturate_d f64:$a), // def INT_NVVM_EX2_APPROX_FTZ_F : F_MATH_1<"ex2.approx.ftz.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_ex2_approx_ftz_f>; + F32RT, F32RT, int_nvvm_ex2_approx_ftz_f>; def INT_NVVM_EX2_APPROX_F : F_MATH_1<"ex2.approx.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_ex2_approx_f>; + F32RT, F32RT, int_nvvm_ex2_approx_f>; def INT_NVVM_EX2_APPROX_D : F_MATH_1<"ex2.approx.f64 \t$dst, $src0;", - Float64Regs, Float64Regs, int_nvvm_ex2_approx_d>; + F64RT, F64RT, int_nvvm_ex2_approx_d>; def INT_NVVM_EX2_APPROX_F16 : F_MATH_1<"ex2.approx.f16 \t$dst, $src0;", - Int16Regs, Int16Regs, int_nvvm_ex2_approx_f16, [hasPTX<70>, hasSM<75>]>; + F16RT, F16RT, int_nvvm_ex2_approx_f16, [hasPTX<70>, hasSM<75>]>; def INT_NVVM_EX2_APPROX_F16X2 : F_MATH_1<"ex2.approx.f16x2 \t$dst, $src0;", - Int32Regs, Int32Regs, int_nvvm_ex2_approx_f16x2, [hasPTX<70>, hasSM<75>]>; + F16X2RT, F16X2RT, int_nvvm_ex2_approx_f16x2, [hasPTX<70>, hasSM<75>]>; def : Pat<(fexp2 f32:$a), (INT_NVVM_EX2_APPROX_FTZ_F $a)>, Requires<[doF32FTZ]>; @@ -1403,11 +1407,11 @@ def : Pat<(fexp2 v2f16:$a), (INT_NVVM_EX2_APPROX_F16X2 $a)>, Requires<[useFP16Math]>; def INT_NVVM_LG2_APPROX_FTZ_F : F_MATH_1<"lg2.approx.ftz.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_lg2_approx_ftz_f>; + F32RT, F32RT, int_nvvm_lg2_approx_ftz_f>; def INT_NVVM_LG2_APPROX_F : F_MATH_1<"lg2.approx.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_lg2_approx_f>; + F32RT, F32RT, int_nvvm_lg2_approx_f>; def INT_NVVM_LG2_APPROX_D : F_MATH_1<"lg2.approx.f64 \t$dst, $src0;", - Float64Regs, Float64Regs, int_nvvm_lg2_approx_d>; + F64RT, F64RT, int_nvvm_lg2_approx_d>; def : Pat<(flog2 f32:$a), (INT_NVVM_LG2_APPROX_FTZ_F $a)>, Requires<[doF32FTZ]>; @@ -1419,14 +1423,14 @@ def : Pat<(flog2 f32:$a), (INT_NVVM_LG2_APPROX_F $a)>, // def INT_NVVM_SIN_APPROX_FTZ_F : 
F_MATH_1<"sin.approx.ftz.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_sin_approx_ftz_f>; + F32RT, F32RT, int_nvvm_sin_approx_ftz_f>; def INT_NVVM_SIN_APPROX_F : F_MATH_1<"sin.approx.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_sin_approx_f>; + F32RT, F32RT, int_nvvm_sin_approx_f>; def INT_NVVM_COS_APPROX_FTZ_F : F_MATH_1<"cos.approx.ftz.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_cos_approx_ftz_f>; + F32RT, F32RT, int_nvvm_cos_approx_ftz_f>; def INT_NVVM_COS_APPROX_F : F_MATH_1<"cos.approx.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_cos_approx_f>; + F32RT, F32RT, int_nvvm_cos_approx_f>; // // Fma @@ -1511,69 +1515,69 @@ defm INT_NVVM_FMA : FMA_INST; // def INT_NVVM_RCP_RN_FTZ_F : F_MATH_1<"rcp.rn.ftz.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_rcp_rn_ftz_f>; + F32RT, F32RT, int_nvvm_rcp_rn_ftz_f>; def INT_NVVM_RCP_RN_F : F_MATH_1<"rcp.rn.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_rcp_rn_f>; + F32RT, F32RT, int_nvvm_rcp_rn_f>; def INT_NVVM_RCP_RZ_FTZ_F : F_MATH_1<"rcp.rz.ftz.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_rcp_rz_ftz_f>; + F32RT, F32RT, int_nvvm_rcp_rz_ftz_f>; def INT_NVVM_RCP_RZ_F : F_MATH_1<"rcp.rz.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_rcp_rz_f>; + F32RT, F32RT, int_nvvm_rcp_rz_f>; def INT_NVVM_RCP_RM_FTZ_F : F_MATH_1<"rcp.rm.ftz.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_rcp_rm_ftz_f>; + F32RT, F32RT, int_nvvm_rcp_rm_ftz_f>; def INT_NVVM_RCP_RM_F : F_MATH_1<"rcp.rm.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_rcp_rm_f>; + F32RT, F32RT, int_nvvm_rcp_rm_f>; def INT_NVVM_RCP_RP_FTZ_F : F_MATH_1<"rcp.rp.ftz.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_rcp_rp_ftz_f>; + F32RT, F32RT, int_nvvm_rcp_rp_ftz_f>; def INT_NVVM_RCP_RP_F : F_MATH_1<"rcp.rp.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_rcp_rp_f>; + F32RT, F32RT, int_nvvm_rcp_rp_f>; -def INT_NVVM_RCP_RN_D : F_MATH_1<"rcp.rn.f64 \t$dst, $src0;", Float64Regs, - Float64Regs, int_nvvm_rcp_rn_d>; -def INT_NVVM_RCP_RZ_D : F_MATH_1<"rcp.rz.f64 \t$dst, $src0;", Float64Regs, - Float64Regs, int_nvvm_rcp_rz_d>; -def INT_NVVM_RCP_RM_D : F_MATH_1<"rcp.rm.f64 \t$dst, $src0;", Float64Regs, - Float64Regs, int_nvvm_rcp_rm_d>; -def INT_NVVM_RCP_RP_D : F_MATH_1<"rcp.rp.f64 \t$dst, $src0;", Float64Regs, - Float64Regs, int_nvvm_rcp_rp_d>; +def INT_NVVM_RCP_RN_D : F_MATH_1<"rcp.rn.f64 \t$dst, $src0;", F64RT, + F64RT, int_nvvm_rcp_rn_d>; +def INT_NVVM_RCP_RZ_D : F_MATH_1<"rcp.rz.f64 \t$dst, $src0;", F64RT, + F64RT, int_nvvm_rcp_rz_d>; +def INT_NVVM_RCP_RM_D : F_MATH_1<"rcp.rm.f64 \t$dst, $src0;", F64RT, + F64RT, int_nvvm_rcp_rm_d>; +def INT_NVVM_RCP_RP_D : F_MATH_1<"rcp.rp.f64 \t$dst, $src0;", F64RT, + F64RT, int_nvvm_rcp_rp_d>; def INT_NVVM_RCP_APPROX_FTZ_F : F_MATH_1<"rcp.approx.ftz.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_rcp_approx_ftz_f>; + F32RT, F32RT, int_nvvm_rcp_approx_ftz_f>; def INT_NVVM_RCP_APPROX_FTZ_D : F_MATH_1<"rcp.approx.ftz.f64 \t$dst, $src0;", - Float64Regs, Float64Regs, int_nvvm_rcp_approx_ftz_d>; + F64RT, F64RT, int_nvvm_rcp_approx_ftz_d>; // // Sqrt // def INT_NVVM_SQRT_RN_FTZ_F : F_MATH_1<"sqrt.rn.ftz.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_sqrt_rn_ftz_f>; -def INT_NVVM_SQRT_RN_F : F_MATH_1<"sqrt.rn.f32 \t$dst, $src0;", Float32Regs, - Float32Regs, int_nvvm_sqrt_rn_f>; + F32RT, F32RT, int_nvvm_sqrt_rn_ftz_f>; +def INT_NVVM_SQRT_RN_F : F_MATH_1<"sqrt.rn.f32 \t$dst, $src0;", F32RT, + F32RT, int_nvvm_sqrt_rn_f>; def 
INT_NVVM_SQRT_RZ_FTZ_F : F_MATH_1<"sqrt.rz.ftz.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_sqrt_rz_ftz_f>; -def INT_NVVM_SQRT_RZ_F : F_MATH_1<"sqrt.rz.f32 \t$dst, $src0;", Float32Regs, - Float32Regs, int_nvvm_sqrt_rz_f>; + F32RT, F32RT, int_nvvm_sqrt_rz_ftz_f>; +def INT_NVVM_SQRT_RZ_F : F_MATH_1<"sqrt.rz.f32 \t$dst, $src0;", F32RT, + F32RT, int_nvvm_sqrt_rz_f>; def INT_NVVM_SQRT_RM_FTZ_F : F_MATH_1<"sqrt.rm.ftz.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_sqrt_rm_ftz_f>; -def INT_NVVM_SQRT_RM_F : F_MATH_1<"sqrt.rm.f32 \t$dst, $src0;", Float32Regs, - Float32Regs, int_nvvm_sqrt_rm_f>; + F32RT, F32RT, int_nvvm_sqrt_rm_ftz_f>; +def INT_NVVM_SQRT_RM_F : F_MATH_1<"sqrt.rm.f32 \t$dst, $src0;", F32RT, + F32RT, int_nvvm_sqrt_rm_f>; def INT_NVVM_SQRT_RP_FTZ_F : F_MATH_1<"sqrt.rp.ftz.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_sqrt_rp_ftz_f>; -def INT_NVVM_SQRT_RP_F : F_MATH_1<"sqrt.rp.f32 \t$dst, $src0;", Float32Regs, - Float32Regs, int_nvvm_sqrt_rp_f>; + F32RT, F32RT, int_nvvm_sqrt_rp_ftz_f>; +def INT_NVVM_SQRT_RP_F : F_MATH_1<"sqrt.rp.f32 \t$dst, $src0;", F32RT, + F32RT, int_nvvm_sqrt_rp_f>; def INT_NVVM_SQRT_APPROX_FTZ_F : F_MATH_1<"sqrt.approx.ftz.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_sqrt_approx_ftz_f>; + F32RT, F32RT, int_nvvm_sqrt_approx_ftz_f>; def INT_NVVM_SQRT_APPROX_F : F_MATH_1<"sqrt.approx.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_sqrt_approx_f>; + F32RT, F32RT, int_nvvm_sqrt_approx_f>; -def INT_NVVM_SQRT_RN_D : F_MATH_1<"sqrt.rn.f64 \t$dst, $src0;", Float64Regs, - Float64Regs, int_nvvm_sqrt_rn_d>; -def INT_NVVM_SQRT_RZ_D : F_MATH_1<"sqrt.rz.f64 \t$dst, $src0;", Float64Regs, - Float64Regs, int_nvvm_sqrt_rz_d>; -def INT_NVVM_SQRT_RM_D : F_MATH_1<"sqrt.rm.f64 \t$dst, $src0;", Float64Regs, - Float64Regs, int_nvvm_sqrt_rm_d>; -def INT_NVVM_SQRT_RP_D : F_MATH_1<"sqrt.rp.f64 \t$dst, $src0;", Float64Regs, - Float64Regs, int_nvvm_sqrt_rp_d>; +def INT_NVVM_SQRT_RN_D : F_MATH_1<"sqrt.rn.f64 \t$dst, $src0;", F64RT, + F64RT, int_nvvm_sqrt_rn_d>; +def INT_NVVM_SQRT_RZ_D : F_MATH_1<"sqrt.rz.f64 \t$dst, $src0;", F64RT, + F64RT, int_nvvm_sqrt_rz_d>; +def INT_NVVM_SQRT_RM_D : F_MATH_1<"sqrt.rm.f64 \t$dst, $src0;", F64RT, + F64RT, int_nvvm_sqrt_rm_d>; +def INT_NVVM_SQRT_RP_D : F_MATH_1<"sqrt.rp.f64 \t$dst, $src0;", F64RT, + F64RT, int_nvvm_sqrt_rp_d>; // nvvm_sqrt intrinsic def : Pat<(int_nvvm_sqrt_f f32:$a), @@ -1590,16 +1594,16 @@ def : Pat<(int_nvvm_sqrt_f f32:$a), // def INT_NVVM_RSQRT_APPROX_FTZ_F - : F_MATH_1<"rsqrt.approx.ftz.f32 \t$dst, $src0;", Float32Regs, Float32Regs, + : F_MATH_1<"rsqrt.approx.ftz.f32 \t$dst, $src0;", F32RT, F32RT, int_nvvm_rsqrt_approx_ftz_f>; def INT_NVVM_RSQRT_APPROX_FTZ_D - : F_MATH_1<"rsqrt.approx.ftz.f64 \t$dst, $src0;", Float64Regs, Float64Regs, + : F_MATH_1<"rsqrt.approx.ftz.f64 \t$dst, $src0;", F64RT, F64RT, int_nvvm_rsqrt_approx_ftz_d>; def INT_NVVM_RSQRT_APPROX_F : F_MATH_1<"rsqrt.approx.f32 \t$dst, $src0;", - Float32Regs, Float32Regs, int_nvvm_rsqrt_approx_f>; + F32RT, F32RT, int_nvvm_rsqrt_approx_f>; def INT_NVVM_RSQRT_APPROX_D : F_MATH_1<"rsqrt.approx.f64 \t$dst, $src0;", - Float64Regs, Float64Regs, int_nvvm_rsqrt_approx_d>; + F64RT, F64RT, int_nvvm_rsqrt_approx_d>; // 1.0f / sqrt_approx -> rsqrt_approx def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_approx_f f32:$a)), @@ -1815,13 +1819,13 @@ def INT_NVVM_D2I_LO : F_MATH_1< ".reg .b32 %temp; \n\t", "mov.b64 \t{$dst, %temp}, $src0;\n\t", "}}"), - Int32Regs, Float64Regs, int_nvvm_d2i_lo>; + I32RT, F64RT, int_nvvm_d2i_lo>; def INT_NVVM_D2I_HI : 
F_MATH_1< !strconcat("{{\n\t", ".reg .b32 %temp; \n\t", "mov.b64 \t{%temp, $dst}, $src0;\n\t", "}}"), - Int32Regs, Float64Regs, int_nvvm_d2i_hi>; + I32RT, F64RT, int_nvvm_d2i_hi>; def : Pat<(int_nvvm_f2ll_rn_ftz f32:$a), (CVT_s64_f32 $a, CvtRNI_FTZ)>; diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 81ad01bea8867..5834214f179f3 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -141,6 +141,7 @@ static Instruction *convertNvvmIntrinsicToLlvm(InstCombiner &IC, enum SpecialCase { SPC_Reciprocal, SCP_FunnelShiftClamp, + SPC_Fabs, }; // SimplifyAction is a poor-man's variant (plus an additional flag) that @@ -185,8 +186,8 @@ static Instruction *convertNvvmIntrinsicToLlvm(InstCombiner &IC, return {Intrinsic::ceil, FTZ_MustBeOff}; case Intrinsic::nvvm_ceil_ftz_f: return {Intrinsic::ceil, FTZ_MustBeOn}; - case Intrinsic::nvvm_fabs_d: - return {Intrinsic::fabs, FTZ_Any}; + case Intrinsic::nvvm_fabs: + return {SPC_Fabs, FTZ_Any}; case Intrinsic::nvvm_floor_d: return {Intrinsic::floor, FTZ_Any}; case Intrinsic::nvvm_floor_f: @@ -411,6 +412,12 @@ static Instruction *convertNvvmIntrinsicToLlvm(InstCombiner &IC, } return nullptr; } + case SPC_Fabs: { + if (II->getType() == IC.Builder.getDoubleTy()) + return IC.Builder.CreateUnaryIntrinsic(Intrinsic::fabs, + II->getArgOperand(0)); + return nullptr; + } } llvm_unreachable("All SpecialCase enumerators should be handled in switch."); } diff --git a/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll b/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll new file mode 100644 index 0000000000000..dd9ef220a9b47 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll @@ -0,0 +1,142 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mcpu=sm_80 -mattr=+ptx70 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-11.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} + +target triple = "nvptx-nvidia-cuda" + +declare float @llvm.nvvm.fabs.f32(float) +declare float @llvm.nvvm.fabs.ftz.f32(float) +declare double @llvm.nvvm.fabs.f64(double) +declare half @llvm.nvvm.fabs.f16(half) +declare half @llvm.nvvm.fabs.ftz.f16(half) +declare <2 x half> @llvm.nvvm.fabs.v2f16(<2 x half>) +declare <2 x half> @llvm.nvvm.fabs.ftz.v2f16(<2 x half>) +declare bfloat @llvm.nvvm.fabs.bf16(bfloat) +declare <2 x bfloat> @llvm.nvvm.fabs.v2bf16(<2 x bfloat>) + + +define float @fabs_float(float %a) { +; CHECK-LABEL: fabs_float( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [fabs_float_param_0]; +; CHECK-NEXT: abs.f32 %f2, %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; + %ret = call float @llvm.nvvm.fabs.f32(float %a) + ret float %ret +} + +define float @fabs_float_ftz(float %a) { +; CHECK-LABEL: fabs_float_ftz( +; CHECK: { +; CHECK-NEXT: .reg .f32 %f<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f32 %f1, [fabs_float_ftz_param_0]; +; CHECK-NEXT: abs.ftz.f32 %f2, %f1; +; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: ret; + %ret = call float @llvm.nvvm.fabs.ftz.f32(float %a) + ret float %ret +} + +define double @fabs_double(double %a) { +; CHECK-LABEL: fabs_double( +; CHECK: { +; CHECK-NEXT: .reg .f64 %fd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.f64 %fd1, [fabs_double_param_0]; +; CHECK-NEXT: 
abs.f64 %fd2, %fd1; +; CHECK-NEXT: st.param.f64 [func_retval0], %fd2; +; CHECK-NEXT: ret; + %ret = call double @llvm.nvvm.fabs.f64(double %a) + ret double %ret +} + +define half @fabs_half(half %a) { +; CHECK-LABEL: fabs_half( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [fabs_half_param_0]; +; CHECK-NEXT: abs.f16 %rs2, %rs1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-NEXT: ret; + %ret = call half @llvm.nvvm.fabs.f16(half %a) + ret half %ret +} + +define half @fabs_half_ftz(half %a) { +; CHECK-LABEL: fabs_half_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [fabs_half_ftz_param_0]; +; CHECK-NEXT: abs.ftz.f16 %rs2, %rs1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-NEXT: ret; + %ret = call half @llvm.nvvm.fabs.ftz.f16(half %a) + ret half %ret +} + +define <2 x half> @fabs_v2half(<2 x half> %a) { +; CHECK-LABEL: fabs_v2half( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [fabs_v2half_param_0]; +; CHECK-NEXT: abs.f16x2 %r2, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %ret = call <2 x half> @llvm.nvvm.fabs.v2f16(<2 x half> %a) + ret <2 x half> %ret +} + +define <2 x half> @fabs_v2half_ftz(<2 x half> %a) { +; CHECK-LABEL: fabs_v2half_ftz( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [fabs_v2half_ftz_param_0]; +; CHECK-NEXT: abs.ftz.f16x2 %r2, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %ret = call <2 x half> @llvm.nvvm.fabs.ftz.v2f16(<2 x half> %a) + ret <2 x half> %ret +} + +define bfloat @fabs_bf16(bfloat %a) { +; CHECK-LABEL: fabs_bf16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b16 %rs1, [fabs_bf16_param_0]; +; CHECK-NEXT: abs.bf16 %rs2, %rs1; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs2; +; CHECK-NEXT: ret; + %ret = call bfloat @llvm.nvvm.fabs.bf16(bfloat %a) + ret bfloat %ret +} + +define <2 x bfloat> @fabs_v2bf16(<2 x bfloat> %a) { +; CHECK-LABEL: fabs_v2bf16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.b32 %r1, [fabs_v2bf16_param_0]; +; CHECK-NEXT: abs.bf16x2 %r2, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; + %ret = call <2 x bfloat> @llvm.nvvm.fabs.v2bf16(<2 x bfloat> %a) + ret <2 x bfloat> %ret +} diff --git a/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-autoupgrade.ll b/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-autoupgrade.ll index 7faac51ff27ca..c04fd07ec5da1 100644 --- a/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-autoupgrade.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins-sm80-ptx70-autoupgrade.ll @@ -1,6 +1,9 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s ; RUN: %if ptxas-11.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | %ptxas-verify -arch=sm_80 %} +declare bfloat @llvm.nvvm.abs.bf16(bfloat) +declare <2 x bfloat> @llvm.nvvm.abs.bf16x2(<2 x bfloat>) + ; CHECK-LABEL: abs_bf16 define bfloat @abs_bf16(bfloat %0) { ; CHECK-NOT: call >From 7d87fb56ec2523e7322253a1fbf2e95282d91969 Mon Sep 17 00:00:00 2001 From: Alex Maclean <amacl...@nvidia.com> Date: Tue, 15 Apr 2025 00:09:33 +0000 Subject: [PATCH 2/5] fixup --- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp | 9 +++++---- 1 file 
changed, 5 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 5834214f179f3..46a184134f3b1 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -413,10 +413,11 @@ static Instruction *convertNvvmIntrinsicToLlvm(InstCombiner &IC, return nullptr; } case SPC_Fabs: { - if (II->getType() == IC.Builder.getDoubleTy()) - return IC.Builder.CreateUnaryIntrinsic(Intrinsic::fabs, - II->getArgOperand(0)); - return nullptr; + if (!II->getType()->isDoubleTy()) + return nullptr; + auto *Fabs = Intrinsic::getOrInsertDeclaration( + II->getModule(), Intrinsic::fabs, II->getType()); + return CallInst::Create(Fabs, II->getArgOperand(0)); } } llvm_unreachable("All SpecialCase enumerators should be handled in switch."); >From e0d28ecb2ac3bc3c4d4c892da43f6b2e22d9276b Mon Sep 17 00:00:00 2001 From: Alex Maclean <amacl...@nvidia.com> Date: Tue, 15 Apr 2025 00:09:45 +0000 Subject: [PATCH 3/5] address comments --- llvm/docs/NVPTXUsage.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index fbb7122b9b42d..458aa5cb8ad1f 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -320,6 +320,8 @@ Syntax: declare float @llvm.nvvm.fabs.f32(float %a) declare double @llvm.nvvm.fabs.f64(double %a) declare half @llvm.nvvm.fabs.f16(half %a) + declare <2 x half> @llvm.nvvm.fabs.v2f16(<2 x half> %a) + declare bfloat @llvm.nvvm.fabs.bf16(bfloat %a) declare <2 x bfloat> @llvm.nvvm.fabs.v2bf16(<2 x bfloat> %a) Overview: >From 1bfd50ddfecdc8309afb403d573221262339e588 Mon Sep 17 00:00:00 2001 From: Alex Maclean <amacl...@nvidia.com> Date: Tue, 15 Apr 2025 15:54:19 +0000 Subject: [PATCH 4/5] fixup clang --- clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp | 4 ++++ clang/test/CodeGen/builtins-nvptx.c | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp index 0f7ab9fd3b099..6d30fc82d42a5 100644 --- a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp @@ -1034,6 +1034,10 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, case NVPTX::BI__nvvm_fmin_xorsign_abs_f16x2: return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16x2, BuiltinID, E, *this); + case NVPTX::BI__nvvm_abs_bf16: + case NVPTX::BI__nvvm_abs_bf16x2: + return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_fabs, + EmitScalarExpr(E->getArg(0))); case NVPTX::BI__nvvm_ldg_h: case NVPTX::BI__nvvm_ldg_h2: return MakeHalfType(Intrinsic::not_intrinsic, BuiltinID, E, *this); diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index 71b29849618b6..b2877d46e46e2 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -1038,9 +1038,9 @@ __device__ void nvvm_cvt_sm89() { __device__ void nvvm_abs_neg_bf16_bf16x2_sm80() { #if __CUDA_ARCH__ >= 800 - // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.abs.bf16(bfloat 0xR3DCD) + // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.fabs.bf16(bfloat 0xR3DCD) __nvvm_abs_bf16(BF16); - // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.abs.bf16x2(<2 x bfloat> splat (bfloat 0xR3DCD)) + // CHECK_PTX70_SM80: call <2 x bfloat> @llvm.nvvm.fabs.v2bf16(<2 x bfloat> splat (bfloat 0xR3DCD)) __nvvm_abs_bf16x2(BF16X2); // CHECK_PTX70_SM80: call bfloat @llvm.nvvm.neg.bf16(bfloat 0xR3DCD) >From 
51f32c9e6c72e0d242c0ba7c2bcad9564234038c Mon Sep 17 00:00:00 2001 From: Alex Maclean <amacl...@nvidia.com> Date: Tue, 15 Apr 2025 21:22:54 +0000 Subject: [PATCH 5/5] fixup --- clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp | 7 +++++++ clang/test/CodeGen/builtins-nvptx.c | 8 ++++++++ llvm/docs/NVPTXUsage.rst | 3 +-- llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp | 10 ---------- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp index 6d30fc82d42a5..85aff28ebae6d 100644 --- a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp @@ -1034,10 +1034,17 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID, case NVPTX::BI__nvvm_fmin_xorsign_abs_f16x2: return MakeHalfType(Intrinsic::nvvm_fmin_xorsign_abs_f16x2, BuiltinID, E, *this); + case NVPTX::BI__nvvm_fabs_f: case NVPTX::BI__nvvm_abs_bf16: case NVPTX::BI__nvvm_abs_bf16x2: return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_fabs, EmitScalarExpr(E->getArg(0))); + case NVPTX::BI__nvvm_fabs_ftz_f: + return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_fabs_ftz, + EmitScalarExpr(E->getArg(0))); + case NVPTX::BI__nvvm_fabs_d: + return Builder.CreateUnaryIntrinsic(Intrinsic::fabs, + EmitScalarExpr(E->getArg(0))); case NVPTX::BI__nvvm_ldg_h: case NVPTX::BI__nvvm_ldg_h2: return MakeHalfType(Intrinsic::not_intrinsic, BuiltinID, E, *this); diff --git a/clang/test/CodeGen/builtins-nvptx.c b/clang/test/CodeGen/builtins-nvptx.c index b2877d46e46e2..5d14b327e53fe 100644 --- a/clang/test/CodeGen/builtins-nvptx.c +++ b/clang/test/CodeGen/builtins-nvptx.c @@ -229,6 +229,14 @@ __device__ void nvvm_math(float f1, float f2, double d1, double d2) { // CHECK: call double @llvm.nvvm.rcp.rn.d double td4 = __nvvm_rcp_rn_d(d2); +// CHECK: call float @llvm.nvvm.fabs.f32 + float t6 = __nvvm_fabs_f(f1); +// CHECK: call float @llvm.nvvm.fabs.ftz.f32 + float t7 = __nvvm_fabs_ftz_f(f2); + +// CHECK: call double @llvm.fabs.f64 + double td5 = __nvvm_fabs_d(d1); + // CHECK: call void @llvm.nvvm.membar.cta() __nvvm_membar_cta(); // CHECK: call void @llvm.nvvm.membar.gl() diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 458aa5cb8ad1f..ef6e888286def 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -333,8 +333,7 @@ Semantics: """""""""" Unlike '``llvm.fabs.*``', these intrinsics do not perfectly preserve NaN -values. Instead, a NaN input yields an unspecified NaN output. The exception to -this rule is the double precision variant, for which NaN is preserved. +values. Instead, a NaN input yields an unspecified NaN output. 
'``llvm.nvvm.fabs.ftz.*``' Intrinsic diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 46a184134f3b1..72245fe83491d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -141,7 +141,6 @@ static Instruction *convertNvvmIntrinsicToLlvm(InstCombiner &IC, enum SpecialCase { SPC_Reciprocal, SCP_FunnelShiftClamp, - SPC_Fabs, }; // SimplifyAction is a poor-man's variant (plus an additional flag) that @@ -186,8 +185,6 @@ static Instruction *convertNvvmIntrinsicToLlvm(InstCombiner &IC, return {Intrinsic::ceil, FTZ_MustBeOff}; case Intrinsic::nvvm_ceil_ftz_f: return {Intrinsic::ceil, FTZ_MustBeOn}; - case Intrinsic::nvvm_fabs: - return {SPC_Fabs, FTZ_Any}; case Intrinsic::nvvm_floor_d: return {Intrinsic::floor, FTZ_Any}; case Intrinsic::nvvm_floor_f: @@ -412,13 +409,6 @@ static Instruction *convertNvvmIntrinsicToLlvm(InstCombiner &IC, } return nullptr; } - case SPC_Fabs: { - if (!II->getType()->isDoubleTy()) - return nullptr; - auto *Fabs = Intrinsic::getOrInsertDeclaration( - II->getModule(), Intrinsic::fabs, II->getType()); - return CallInst::Create(Fabs, II->getArgOperand(0)); - } } llvm_unreachable("All SpecialCase enumerators should be handled in switch."); } _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
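(Illustrative addendum, not part of the patch: a rough before/after IR sketch of the auto-upgrade path added in AutoUpgrade.cpp above, assuming the bfloat-typed legacy declaration used in math-intrins-sm80-ptx70-autoupgrade.ll; for that signature the bitcasts emitted by upgradeNVVMIntrinsicCall are no-ops. Function names are invented for the example.)

  ; before: bitcode built against the old intrinsic
  declare bfloat @llvm.nvvm.abs.bf16(bfloat)
  define bfloat @abs_bf16_old(bfloat %x) {
    %r = call bfloat @llvm.nvvm.abs.bf16(bfloat %x)
    ret bfloat %r
  }

  ; after auto-upgrade: rewritten to the overloaded replacement intrinsic
  declare bfloat @llvm.nvvm.fabs.bf16(bfloat)
  define bfloat @abs_bf16_new(bfloat %x) {
    %r = call bfloat @llvm.nvvm.fabs.bf16(bfloat %x)
    ret bfloat %r
  }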