https://github.com/frasercrmck updated https://github.com/llvm/llvm-project/pull/134063
>From 498e82e2ec64cf6ba04466ac58fca5769fecdefd Mon Sep 17 00:00:00 2001 From: Fraser Cormack <fra...@codeplay.com> Date: Wed, 2 Apr 2025 11:06:30 +0100 Subject: [PATCH 1/2] [libclc] Move sinh, cosh & tanh to the CLC library This commit also vectorizes the builtins. --- libclc/clc/include/clc/math/clc_cosh.h | 20 +++ libclc/clc/include/clc/math/clc_sinh.h | 20 +++ libclc/clc/include/clc/math/clc_tanh.h | 20 +++ libclc/clc/include/clc/math/tables.h | 9 +- libclc/clc/lib/generic/SOURCES | 3 + libclc/clc/lib/generic/math/clc_cosh.cl | 24 +++ libclc/clc/lib/generic/math/clc_cosh.inc | 201 ++++++++++++++++++++++ libclc/clc/lib/generic/math/clc_sinh.cl | 23 +++ libclc/clc/lib/generic/math/clc_sinh.inc | 201 ++++++++++++++++++++++ libclc/clc/lib/generic/math/clc_tables.cl | 100 +++++++++++ libclc/clc/lib/generic/math/clc_tanh.cl | 21 +++ libclc/clc/lib/generic/math/clc_tanh.inc | 137 +++++++++++++++ libclc/generic/lib/math/cosh.cl | 179 +------------------ libclc/generic/lib/math/sinh.cl | 178 +------------------ libclc/generic/lib/math/tables.cl | 130 -------------- libclc/generic/lib/math/tanh.cl | 133 +------------- 16 files changed, 788 insertions(+), 611 deletions(-) create mode 100644 libclc/clc/include/clc/math/clc_cosh.h create mode 100644 libclc/clc/include/clc/math/clc_sinh.h create mode 100644 libclc/clc/include/clc/math/clc_tanh.h create mode 100644 libclc/clc/lib/generic/math/clc_cosh.cl create mode 100644 libclc/clc/lib/generic/math/clc_cosh.inc create mode 100644 libclc/clc/lib/generic/math/clc_sinh.cl create mode 100644 libclc/clc/lib/generic/math/clc_sinh.inc create mode 100644 libclc/clc/lib/generic/math/clc_tanh.cl create mode 100644 libclc/clc/lib/generic/math/clc_tanh.inc diff --git a/libclc/clc/include/clc/math/clc_cosh.h b/libclc/clc/include/clc/math/clc_cosh.h new file mode 100644 index 0000000000000..71e414ce28ac2 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_cosh.h @@ -0,0 +1,20 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_COSH_H__ +#define __CLC_MATH_CLC_COSH_H__ + +#define __CLC_BODY <clc/math/unary_decl.inc> +#define __CLC_FUNCTION __clc_cosh + +#include <clc/math/gentype.inc> + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_COSH_H__ diff --git a/libclc/clc/include/clc/math/clc_sinh.h b/libclc/clc/include/clc/math/clc_sinh.h new file mode 100644 index 0000000000000..da525b5cd0fe7 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_sinh.h @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_SINH_H__ +#define __CLC_MATH_CLC_SINH_H__ + +#define __CLC_BODY <clc/math/unary_decl.inc> +#define __CLC_FUNCTION __clc_sinh + +#include <clc/math/gentype.inc> + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_SINH_H__ diff --git a/libclc/clc/include/clc/math/clc_tanh.h b/libclc/clc/include/clc/math/clc_tanh.h new file mode 100644 index 0000000000000..972a31e248c67 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_tanh.h @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_TANH_H__ +#define __CLC_MATH_CLC_TANH_H__ + +#define __CLC_BODY <clc/math/unary_decl.inc> +#define __CLC_FUNCTION __clc_tanh + +#include <clc/math/gentype.inc> + +#undef __CLC_BODY +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_TANH_H__ diff --git a/libclc/clc/include/clc/math/tables.h b/libclc/clc/include/clc/math/tables.h index e06ee82d98355..fb172b0b8f221 100644 --- a/libclc/clc/include/clc/math/tables.h +++ b/libclc/clc/include/clc/math/tables.h @@ -62,7 +62,6 @@ TABLE_FUNCTION_DECL(float2, log2_tbl); TABLE_FUNCTION_DECL(float2, log10_tbl); TABLE_FUNCTION_DECL(uint4, pibits_tbl); -TABLE_FUNCTION_DECL(float2, sinhcosh_tbl); CLC_TABLE_FUNCTION_DECL(float, log_inv_tbl_ep_head); CLC_TABLE_FUNCTION_DECL(float, log_inv_tbl_ep_tail); @@ -74,6 +73,8 @@ CLC_TABLE_FUNCTION_DECL(float, exp_tbl_ep_head); CLC_TABLE_FUNCTION_DECL(float, exp_tbl_ep_tail); CLC_TABLE_FUNCTION_DECL(float, cbrt_tbl_head); CLC_TABLE_FUNCTION_DECL(float, cbrt_tbl_tail); +CLC_TABLE_FUNCTION_DECL(float, sinhcosh_tbl_head); +CLC_TABLE_FUNCTION_DECL(float, sinhcosh_tbl_tail); #ifdef cl_khr_fp64 @@ -85,8 +86,10 @@ CLC_TABLE_FUNCTION_DECL(double, atan_jby256_tbl_head); CLC_TABLE_FUNCTION_DECL(double, atan_jby256_tbl_tail); CLC_TABLE_FUNCTION_DECL(double, two_to_jby64_ep_tbl_head); CLC_TABLE_FUNCTION_DECL(double, two_to_jby64_ep_tbl_tail); -TABLE_FUNCTION_DECL(double2, sinh_tbl); -TABLE_FUNCTION_DECL(double2, cosh_tbl); +CLC_TABLE_FUNCTION_DECL(double, sinh_tbl_head); +CLC_TABLE_FUNCTION_DECL(double, sinh_tbl_tail); +CLC_TABLE_FUNCTION_DECL(double, cosh_tbl_head); +CLC_TABLE_FUNCTION_DECL(double, cosh_tbl_tail); CLC_TABLE_FUNCTION_DECL(double, cbrt_inv_tbl); CLC_TABLE_FUNCTION_DECL(double, cbrt_dbl_tbl_head); CLC_TABLE_FUNCTION_DECL(double, cbrt_dbl_tbl_tail); diff --git a/libclc/clc/lib/generic/SOURCES 
b/libclc/clc/lib/generic/SOURCES index d3ea3faa63c23..6714b2b2d9810 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -31,6 +31,7 @@ math/clc_atanpi.cl math/clc_cbrt.cl math/clc_ceil.cl math/clc_copysign.cl +math/clc_cosh.cl math/clc_cospi.cl math/clc_ep_log.cl math/clc_exp.cl @@ -74,10 +75,12 @@ math/clc_rootn.cl math/clc_round.cl math/clc_rsqrt.cl math/clc_sincos_helpers.cl +math/clc_sinh.cl math/clc_sinpi.cl math/clc_sqrt.cl math/clc_sw_fma.cl math/clc_tables.cl +math/clc_tanh.cl math/clc_tanpi.cl math/clc_trunc.cl relational/clc_all.cl diff --git a/libclc/clc/lib/generic/math/clc_cosh.cl b/libclc/clc/lib/generic/math/clc_cosh.cl new file mode 100644 index 0000000000000..4da78de2714e3 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_cosh.cl @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <clc/clc_convert.h> +#include <clc/float/definitions.h> +#include <clc/internal/clc.h> +#include <clc/math/clc_copysign.h> +#include <clc/math/clc_exp.h> +#include <clc/math/clc_fabs.h> +#include <clc/math/clc_fma.h> +#include <clc/math/clc_mad.h> +#include <clc/math/math.h> +#include <clc/math/tables.h> +#include <clc/relational/clc_isinf.h> +#include <clc/relational/clc_isnan.h> +#include <clc/shared/clc_min.h> + +#define __CLC_BODY <clc_cosh.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/clc/lib/generic/math/clc_cosh.inc b/libclc/clc/lib/generic/math/clc_cosh.inc new file mode 100644 index 0000000000000..e36ce19243b76 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_cosh.inc @@ -0,0 +1,201 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if __CLC_FPSIZE == 32 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cosh(__CLC_GENTYPE x) { + // After dealing with special cases the computation is split into regions as + // follows: + // abs(x) >= max_cosh_arg: cosh(x) = +Inf + // abs(x) >= small_threshold: cosh(x) = exp(abs(x))/2 computed using the + // splitexp and scaleDouble functions as for exp_amd(). + // abs(x) < small_threshold: compute p = exp(y) - 1 and then + // z = 0.5*(p+(p/(p+1.0))); cosh(x) is then z. 
+ + const __CLC_GENTYPE max_cosh_arg = 0x1.65a9fap+6f; + const __CLC_GENTYPE small_threshold = 0x1.0a2b24p+3f; + + __CLC_UINTN ux = __CLC_AS_UINTN(x); + __CLC_UINTN aux = ux & EXSIGNBIT_SP32; + __CLC_GENTYPE y = __CLC_AS_GENTYPE(aux); + + // Find the integer part y0 of y and the increment dy = y - y0. Using + // sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy) and + // cosh(y) = cosh(y0)cosh(dy) + sinh(y0)sinh(dy), we compute z = cosh(y), + // where sinh(y0) and cosh(y0) are read from the sinhcosh tables. + + __CLC_INTN ind = __CLC_CONVERT_INTN(y); + ind = __CLC_CONVERT_UINTN(ind) > 36U ? 0 : ind; + + __CLC_GENTYPE dy = y - __CLC_CONVERT_GENTYPE(ind); + __CLC_GENTYPE dy2 = dy * dy; + + __CLC_GENTYPE sdy = __clc_mad( + dy2, + __clc_mad( + dy2, + __clc_mad( + dy2, + __clc_mad( + dy2, + __clc_mad(dy2, + __clc_mad(dy2, 0.7746188980094184251527126e-12f, + 0.160576793121939886190847e-9f), + 0.250521176994133472333666e-7f), + 0.275573191913636406057211e-5f), + 0.198412698413242405162014e-3f), + 0.833333333333329931873097e-2f), + 0.166666666666666667013899e0f); + sdy = __clc_mad(sdy, dy * dy2, dy); + + __CLC_GENTYPE cdy = __clc_mad( + dy2, + __clc_mad( + dy2, + __clc_mad( + dy2, + __clc_mad( + dy2, + __clc_mad(dy2, + __clc_mad(dy2, 0.1163921388172173692062032e-10f, + 0.208744349831471353536305e-8f), + 0.275573350756016588011357e-6f), + 0.248015872460622433115785e-4f), + 0.138888888889814854814536e-2f), + 0.416666666666660876512776e-1f), + 0.500000000000000005911074e0f); + cdy = __clc_mad(cdy, dy2, 1.0f); + + __CLC_GENTYPE sinhcoshh = USE_TABLE(sinhcosh_tbl_head, ind); + __CLC_GENTYPE sinhcosht = USE_TABLE(sinhcosh_tbl_tail, ind); + __CLC_GENTYPE z = __clc_mad(sinhcoshh, sdy, sinhcosht * cdy); + + // When exp(-x) is insignificant compared to exp(x), return exp(x)/2 + __CLC_GENTYPE t = __clc_exp(y - 0x1.62e500p-1f); + __CLC_GENTYPE zsmall = __clc_mad(0x1.a0210ep-18f, t, t); + z = y >= small_threshold ? zsmall : z; + + // Corner cases + z = y >= max_cosh_arg ? 
__CLC_AS_GENTYPE((__CLC_UINTN)PINFBITPATT_SP32) : z; + z = aux > PINFBITPATT_SP32 ? __CLC_GENTYPE_NAN : z; + z = aux < 0x38800000 ? 1.0f : z; + + return z; +} + +#elif __CLC_FPSIZE == 64 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cosh(__CLC_GENTYPE x) { + + // After dealing with special cases the computation is split into + // regions as follows: + // + // abs(x) >= max_cosh_arg: + // cosh(x) = +Inf + // + // abs(x) >= small_threshold: + // cosh(x) = exp(abs(x))/2 computed using the + // splitexp and scaleDouble functions as for exp_amd(). + // + // abs(x) < small_threshold: + // compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0))) + // cosh(x) is then z. + + // This is ln(2^1025) + const __CLC_GENTYPE max_cosh_arg = + 7.10475860073943977113e+02; // 0x408633ce8fb9f87e + + // This is where exp(-x) becomes insignificant compared to exp(x): ln(2^27) + const __CLC_GENTYPE small_threshold = 0x1.2b708872320e2p+4; + + __CLC_GENTYPE y = __clc_fabs(x); + + // In this range we find the integer part y0 of y + // and the increment dy = y - y0. We then compute + // z = cosh(y) = cosh(y0)cosh(dy) + sinh(y0)sinh(dy) + // where sinh(y0) and cosh(y0) are tabulated above. 
+ + __CLC_INTN ind = __clc_min(__CLC_CONVERT_INTN(y), 36); + __CLC_GENTYPE dy = y - __CLC_CONVERT_GENTYPE(ind); + __CLC_GENTYPE dy2 = dy * dy; + + __CLC_GENTYPE sdy = + dy * dy2 * + __clc_fma( + dy2, + __clc_fma( + dy2, + __clc_fma( + dy2, + __clc_fma( + dy2, + __clc_fma(dy2, + __clc_fma(dy2, 0.7746188980094184251527126e-12, + 0.160576793121939886190847e-9), + 0.250521176994133472333666e-7), + 0.275573191913636406057211e-5), + 0.198412698413242405162014e-3), + 0.833333333333329931873097e-2), + 0.166666666666666667013899e0); + + __CLC_GENTYPE cdy = + dy2 * + __clc_fma( + dy2, + __clc_fma( + dy2, + __clc_fma( + dy2, + __clc_fma( + dy2, + __clc_fma(dy2, + __clc_fma(dy2, 0.1163921388172173692062032e-10, + 0.208744349831471353536305e-8), + 0.275573350756016588011357e-6), + 0.248015872460622433115785e-4), + 0.138888888889814854814536e-2), + 0.416666666666660876512776e-1), + 0.500000000000000005911074e0); + + // At this point sinh(dy) is approximated by dy + sdy, + // and cosh(dy) is approximated by 1 + cdy. + __CLC_GENTYPE cl = USE_TABLE(cosh_tbl_head, ind); + __CLC_GENTYPE ct = USE_TABLE(cosh_tbl_tail, ind); + __CLC_GENTYPE sl = USE_TABLE(sinh_tbl_head, ind); + __CLC_GENTYPE st = USE_TABLE(sinh_tbl_tail, ind); + + __CLC_GENTYPE z = + __clc_fma( + sl, dy, + __clc_fma(sl, sdy, + __clc_fma(cl, cdy, + __clc_fma(st, dy, __clc_fma(st, sdy, ct * cdy)) + + ct))) + + cl; + + // Other cases + z = y < 0x1.0p-28 ? 1.0 : z; + + __CLC_GENTYPE t = __clc_exp(y - 0x1.62e42fefa3800p-1); + t = __clc_fma(t, -0x1.ef35793c76641p-45, t); + z = y >= small_threshold ? t : z; + + z = y >= max_cosh_arg ? __CLC_AS_GENTYPE((__CLC_ULONGN)PINFBITPATT_DP64) : z; + + z = __clc_isinf(x) || __clc_isnan(x) ? 
y : z; + + return z; +} + +#elif __CLC_FPSIZE == 16 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cosh(__CLC_GENTYPE x) { + return __CLC_CONVERT_GENTYPE(__clc_cosh(__CLC_CONVERT_FLOATN(x))); +} + +#endif diff --git a/libclc/clc/lib/generic/math/clc_sinh.cl b/libclc/clc/lib/generic/math/clc_sinh.cl new file mode 100644 index 0000000000000..bd0b488b16047 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_sinh.cl @@ -0,0 +1,23 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <clc/clc_convert.h> +#include <clc/internal/clc.h> +#include <clc/math/clc_copysign.h> +#include <clc/math/clc_exp.h> +#include <clc/math/clc_fabs.h> +#include <clc/math/clc_fma.h> +#include <clc/math/clc_mad.h> +#include <clc/math/math.h> +#include <clc/math/tables.h> +#include <clc/relational/clc_isinf.h> +#include <clc/relational/clc_isnan.h> +#include <clc/shared/clc_min.h> + +#define __CLC_BODY <clc_sinh.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/clc/lib/generic/math/clc_sinh.inc b/libclc/clc/lib/generic/math/clc_sinh.inc new file mode 100644 index 0000000000000..f089dd4a600a3 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_sinh.inc @@ -0,0 +1,201 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if __CLC_FPSIZE == 32 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinh(__CLC_GENTYPE x) { + // After dealing with special cases the computation is split into regions: + // abs(x) >= max_sinh_arg: sinh(x) = sign(x)*Inf + // abs(x) >= small_threshold: sinh(x) = sign(x)*exp(abs(x))/2 computed using + // the splitexp and scaleDouble functions as for exp_amd(). + // abs(x) < small_threshold: compute p = exp(y) - 1 and then + // z = 0.5*(p+(p/(p+1.0))); sinh(x) is then sign(x)*z. + + const __CLC_GENTYPE max_sinh_arg = 0x1.65a9fap+6f; + const __CLC_GENTYPE small_threshold = 0x1.0a2b24p+3f; + + __CLC_UINTN ux = __CLC_AS_UINTN(x); + __CLC_UINTN aux = ux & EXSIGNBIT_SP32; + __CLC_UINTN xs = ux ^ aux; + __CLC_GENTYPE y = __CLC_AS_GENTYPE(aux); + + // We find the integer part y0 of y and the increment dy = y - y0. We then + // compute z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy) where sinh(y0) + // and cosh(y0) are tabulated above. + __CLC_INTN ind = __CLC_CONVERT_INTN(y); + ind = __CLC_CONVERT_UINTN(ind) > 36U ? 
0 : ind; + + __CLC_GENTYPE dy = y - __CLC_CONVERT_GENTYPE(ind); + __CLC_GENTYPE dy2 = dy * dy; + + __CLC_GENTYPE sdy = __clc_mad( + dy2, + __clc_mad( + dy2, + __clc_mad( + dy2, + __clc_mad( + dy2, + __clc_mad(dy2, + __clc_mad(dy2, 0.7746188980094184251527126e-12f, + 0.160576793121939886190847e-9f), + 0.250521176994133472333666e-7f), + 0.275573191913636406057211e-5f), + 0.198412698413242405162014e-3f), + 0.833333333333329931873097e-2f), + 0.166666666666666667013899e0f); + sdy = __clc_mad(sdy, dy * dy2, dy); + + __CLC_GENTYPE cdy = __clc_mad( + dy2, + __clc_mad( + dy2, + __clc_mad( + dy2, + __clc_mad( + dy2, + __clc_mad(dy2, + __clc_mad(dy2, 0.1163921388172173692062032e-10f, + 0.208744349831471353536305e-8f), + 0.275573350756016588011357e-6f), + 0.248015872460622433115785e-4f), + 0.138888888889814854814536e-2f), + 0.416666666666660876512776e-1f), + 0.500000000000000005911074e0f); + cdy = __clc_mad(cdy, dy2, 1.0f); + + __CLC_GENTYPE sinhcoshh = USE_TABLE(sinhcosh_tbl_head, ind); + __CLC_GENTYPE sinhcosht = USE_TABLE(sinhcosh_tbl_tail, ind); + __CLC_GENTYPE z = __clc_mad(sinhcosht, sdy, sinhcoshh * cdy); + z = __CLC_AS_GENTYPE(xs | __CLC_AS_UINTN(z)); + + // When y is large enough so that the negative exponential is negligible, + // so sinh(y) is approximated by sign(x)*exp(y)/2. + __CLC_GENTYPE t = __clc_exp(y - 0x1.62e500p-1f); + __CLC_GENTYPE zsmall = __clc_mad(0x1.a0210ep-18f, t, t); + zsmall = __CLC_AS_GENTYPE(xs | __CLC_AS_UINTN(zsmall)); + z = y >= small_threshold ? zsmall : z; + + // Corner cases + __CLC_GENTYPE zinf = __CLC_AS_GENTYPE(PINFBITPATT_SP32 | xs); + z = y >= max_sinh_arg ? zinf : z; + z = aux > PINFBITPATT_SP32 || aux < 0x38800000U ? 
x : z; + + return z; +} + +#elif __CLC_FPSIZE == 64 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinh(__CLC_GENTYPE x) { + // After dealing with special cases the computation is split into + // regions as follows: + // + // abs(x) >= max_sinh_arg: + // sinh(x) = sign(x)*Inf + // + // abs(x) >= small_threshold: + // sinh(x) = sign(x)*exp(abs(x))/2 computed using the + // splitexp and scaleDouble functions as for exp_amd(). + // + // abs(x) < small_threshold: + // compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0))) + // sinh(x) is then sign(x)*z. + + const __CLC_GENTYPE max_sinh_arg = + 7.10475860073943977113e+02; // 0x408633ce8fb9f87e + + // This is where exp(-x) is insignificant compared to exp(x) = ln(2^27) + const __CLC_GENTYPE small_threshold = 0x1.2b708872320e2p+4; + + __CLC_GENTYPE y = __clc_fabs(x); + + // In this range we find the integer part y0 of y + // and the increment dy = y - y0. We then compute + // z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy) + // where sinh(y0) and cosh(y0) are obtained from tables + + __CLC_INTN ind = __clc_min(__CLC_CONVERT_INTN(y), 36); + __CLC_GENTYPE dy = y - __CLC_CONVERT_GENTYPE(ind); + __CLC_GENTYPE dy2 = dy * dy; + + __CLC_GENTYPE sdy = + dy * dy2 * + __clc_fma( + dy2, + __clc_fma( + dy2, + __clc_fma( + dy2, + __clc_fma( + dy2, + __clc_fma(dy2, + __clc_fma(dy2, 0.7746188980094184251527126e-12, + 0.160576793121939886190847e-9), + 0.250521176994133472333666e-7), + 0.275573191913636406057211e-5), + 0.198412698413242405162014e-3), + 0.833333333333329931873097e-2), + 0.166666666666666667013899e0); + + __CLC_GENTYPE cdy = + dy2 * + __clc_fma( + dy2, + __clc_fma( + dy2, + __clc_fma( + dy2, + __clc_fma( + dy2, + __clc_fma(dy2, + __clc_fma(dy2, 0.1163921388172173692062032e-10, + 0.208744349831471353536305e-8), + 0.275573350756016588011357e-6), + 0.248015872460622433115785e-4), + 0.138888888889814854814536e-2), + 0.416666666666660876512776e-1), + 0.500000000000000005911074e0); + + // At this point sinh(dy) is approximated 
by dy + sdy. + // Shift some significant bits from dy to sdy. + __CLC_GENTYPE sdy1 = + __CLC_AS_GENTYPE(__CLC_AS_ULONGN(dy) & 0xfffffffff8000000UL); + __CLC_GENTYPE sdy2 = sdy + (dy - sdy1); + + __CLC_GENTYPE cl = USE_TABLE(cosh_tbl_head, ind); + __CLC_GENTYPE ct = USE_TABLE(cosh_tbl_tail, ind); + __CLC_GENTYPE sl = USE_TABLE(sinh_tbl_head, ind); + __CLC_GENTYPE st = USE_TABLE(sinh_tbl_tail, ind); + + __CLC_GENTYPE z = + __clc_fma(cl, sdy1, + __clc_fma(sl, cdy, + __clc_fma(cl, sdy2, + __clc_fma(ct, sdy1, + __clc_fma(st, cdy, ct * sdy2)) + + st))) + + sl; + + // Other cases + z = (y < 0x1.0p-28) || __clc_isnan(x) || __clc_isinf(x) ? y : z; + + __CLC_GENTYPE t = __clc_exp(y - 0x1.62e42fefa3800p-1); + t = __clc_fma(t, -0x1.ef35793c76641p-45, t); + z = y >= small_threshold ? t : z; + z = y >= max_sinh_arg ? __CLC_AS_GENTYPE((__CLC_ULONGN)PINFBITPATT_DP64) : z; + + return __clc_copysign(z, x); +} + +#elif __CLC_FPSIZE == 16 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinh(__CLC_GENTYPE x) { + return __CLC_CONVERT_GENTYPE(__clc_sinh(__CLC_CONVERT_FLOATN(x))); +} + +#endif diff --git a/libclc/clc/lib/generic/math/clc_tables.cl b/libclc/clc/lib/generic/math/clc_tables.cl index c5eb21a6d5ed7..6280413ca48ce 100644 --- a/libclc/clc/lib/generic/math/clc_tables.cl +++ b/libclc/clc/lib/generic/math/clc_tables.cl @@ -339,6 +339,37 @@ DECLARE_TABLE(float, CBRT_TBL_TAIL, 129) = { CLC_TABLE_FUNCTION(float, CBRT_TBL_TAIL, cbrt_tbl_tail); +// Tabulated values of sinh(i) and cosh(i) for i = 0,...,36. 
+DECLARE_TABLE(float, SINHCOSH_TBL_HEAD, 37) = { + 0x0.000000p+0f, 0x1.2cd9fcp+0f, 0x1.d03cf6p+1f, 0x1.40926ep+3f, + 0x1.b4a380p+4f, 0x1.28d016p+6f, 0x1.936d22p+7f, 0x1.122876p+9f, + 0x1.749ea6p+10f, 0x1.fa7158p+11f, 0x1.5829dcp+13f, 0x1.d3c448p+14f, + 0x1.3de166p+16f, 0x1.b00b5ap+17f, 0x1.259ac4p+19f, 0x1.8f0ccap+20f, + 0x1.0f2ebep+22f, 0x1.709348p+23f, 0x1.f4f220p+24f, 0x1.546d90p+26f, + 0x1.ceb088p+27f, 0x1.3a6e20p+29f, 0x1.ab5adcp+30f, 0x1.226af4p+32f, + 0x1.8ab7fcp+33f, 0x1.0c3d3ap+35f, 0x1.6c9326p+36f, 0x1.ef8230p+37f, + 0x1.50bba4p+39f, 0x1.c9aae4p+40f, 0x1.370470p+42f, 0x1.a6b766p+43f, + 0x1.1f43fcp+45f, 0x1.866f34p+46f, 0x1.0953e2p+48f, 0x1.689e22p+49f, + 0x1.ea215ap+50f, +}; + +CLC_TABLE_FUNCTION(float, SINHCOSH_TBL_HEAD, sinhcosh_tbl_head); + +DECLARE_TABLE(float, SINHCOSH_TBL_TAIL, 37) = { + 0x1.000000p+0f, 0x1.8b0756p+0f, 0x1.e18fa0p+1f, 0x1.422a4ap+3f, + 0x1.b4ee86p+4f, 0x1.28d6fcp+6f, 0x1.936e68p+7f, 0x1.122894p+9f, + 0x1.749eaap+10f, 0x1.fa7158p+11f, 0x1.5829dep+13f, 0x1.d3c448p+14f, + 0x1.3de166p+16f, 0x1.b00b5ap+17f, 0x1.259ac4p+19f, 0x1.8f0ccap+20f, + 0x1.0f2ebep+22f, 0x1.709348p+23f, 0x1.f4f220p+24f, 0x1.546d90p+26f, + 0x1.ceb088p+27f, 0x1.3a6e20p+29f, 0x1.ab5adcp+30f, 0x1.226af4p+32f, + 0x1.8ab7fcp+33f, 0x1.0c3d3ap+35f, 0x1.6c9326p+36f, 0x1.ef8230p+37f, + 0x1.50bba4p+39f, 0x1.c9aae4p+40f, 0x1.370470p+42f, 0x1.a6b766p+43f, + 0x1.1f43fcp+45f, 0x1.866f34p+46f, 0x1.0953e2p+48f, 0x1.689e22p+49f, + 0x1.ea215ap+50f, +}; + +CLC_TABLE_FUNCTION(float, SINHCOSH_TBL_TAIL, sinhcosh_tbl_tail); + #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable @@ -1279,4 +1310,73 @@ DECLARE_TABLE(double, CBRT_REM_TBL_TAIL, 5) = { CLC_TABLE_FUNCTION(double, CBRT_REM_TBL_TAIL, cbrt_rem_tbl_tail); +DECLARE_TABLE(double, SINH_TBL_HEAD, 37) = { + 0x0.0000000000000p+0, 0x1.2cd9fc0000000p+0, 0x1.d03cf60000000p+1, + 0x1.40926e0000000p+3, 0x1.b4a3800000000p+4, 0x1.28d0160000000p+6, + 0x1.936d228000000p+7, 0x1.1228768000000p+9, 0x1.749ea50000000p+10, + 
0x1.fa71570000000p+11, 0x1.5829dc8000000p+13, 0x1.d3c4488000000p+14, + 0x1.3de1650000000p+16, 0x1.b00b590000000p+17, 0x1.259ac48000000p+19, + 0x1.8f0cca8000000p+20, 0x1.0f2ebd0000000p+22, 0x1.7093488000000p+23, + 0x1.f4f2208000000p+24, 0x1.546d8f8000000p+26, 0x1.ceb0888000000p+27, + 0x1.3a6e1f8000000p+29, 0x1.ab5adb8000000p+30, 0x1.226af30000000p+32, + 0x1.8ab7fb0000000p+33, 0x1.0c3d390000000p+35, 0x1.6c93268000000p+36, + 0x1.ef822f0000000p+37, 0x1.50bba30000000p+39, 0x1.c9aae40000000p+40, + 0x1.3704708000000p+42, 0x1.a6b7658000000p+43, 0x1.1f43fc8000000p+45, + 0x1.866f348000000p+46, 0x1.0953e28000000p+48, 0x1.689e220000000p+49, + 0x1.ea215a0000000p+50, +}; + +DECLARE_TABLE(double, SINH_TBL_TAIL, 37) = { + 0x0.0000000000000p+0, 0x1.13ae6096a0092p-26, 0x1.db70cfb79a640p-26, + 0x1.c2526b66dc067p-23, 0x1.b81b18647f380p-23, 0x1.bc1cdd1e1eb08p-20, + 0x1.d9f201534fb09p-19, 0x1.d1c064a4e9954p-18, 0x1.4eca65d06ea74p-18, + 0x1.0c259bcc0ecc5p-15, 0x1.b5a6647cf9016p-13, 0x1.9691adefb0870p-15, + 0x1.3410fc29cde38p-10, 0x1.6a31a50b6fb3cp-11, 0x1.7defc71805c40p-10, + 0x1.eb49fd80e0babp-6, 0x1.4fffc7bcd5920p-7, 0x1.03a93b6c63435p-3, + 0x1.1940bb255fd1cp-4, 0x1.ed26e14260b50p-2, 0x1.b47401fc9f2a2p+0, + 0x1.67bb3f55634f1p+3, 0x1.c435ff8194ddcp+2, 0x1.d8fee052ba63ap+5, + 0x1.51d7edccde3f6p+7, 0x1.04b1644557d1ap+8, 0x1.6a6b5ca0a9dc4p+8, + 0x1.fd9cc72249abap+11, 0x1.e58de693edab5p+13, 0x1.8c70158ac6363p+14, + 0x1.7614764f43e20p+15, 0x1.6337db36fc718p+17, 0x1.12d98b1f611e2p+19, + 0x1.392bc108b37ccp+19, 0x1.ce87bdc3473dcp+22, 0x1.bc8d5ae99ad14p+21, + 0x1.d20d76744835cp+22, +}; + +DECLARE_TABLE(double, COSH_TBL_HEAD, 37) = { + 0x1.0000000000000p+0, 0x1.8b07550000000p+0, 0x1.e18fa08000000p+1, + 0x1.422a490000000p+3, 0x1.b4ee858000000p+4, 0x1.28d6fc8000000p+6, + 0x1.936e678000000p+7, 0x1.1228948000000p+9, 0x1.749eaa8000000p+10, + 0x1.fa71580000000p+11, 0x1.5829dd0000000p+13, 0x1.d3c4488000000p+14, + 0x1.3de1650000000p+16, 0x1.b00b590000000p+17, 0x1.259ac48000000p+19, + 
0x1.8f0cca8000000p+20, 0x1.0f2ebd0000000p+22, 0x1.7093488000000p+23, + 0x1.f4f2208000000p+24, 0x1.546d8f8000000p+26, 0x1.ceb0888000000p+27, + 0x1.3a6e1f8000000p+29, 0x1.ab5adb8000000p+30, 0x1.226af30000000p+32, + 0x1.8ab7fb0000000p+33, 0x1.0c3d390000000p+35, 0x1.6c93268000000p+36, + 0x1.ef822f0000000p+37, 0x1.50bba30000000p+39, 0x1.c9aae40000000p+40, + 0x1.3704708000000p+42, 0x1.a6b7658000000p+43, 0x1.1f43fc8000000p+45, + 0x1.866f348000000p+46, 0x1.0953e28000000p+48, 0x1.689e220000000p+49, + 0x1.ea215a0000000p+50, +}; + +DECLARE_TABLE(double, COSH_TBL_TAIL, 37) = { + 0x0.0000000000000p+0, 0x1.d9f5504c2bd28p-28, 0x1.7cb66f0a4c9fdp-25, + 0x1.f58617928e588p-23, 0x1.bc7d000c38d48p-25, 0x1.f7f9d4e329998p-21, + 0x1.6e6e464885269p-19, 0x1.ba3a8b946c154p-19, 0x1.3f4e76110d5a4p-18, + 0x1.17622515a3e2bp-15, 0x1.4dc4b528af3d0p-17, 0x1.1156278615e10p-14, + 0x1.35ad50ed821f5p-10, 0x1.6b61055f2935cp-11, 0x1.7e2794a601240p-10, + 0x1.eb4b45f6aadd3p-6, 0x1.5000b967b3698p-7, 0x1.03a940fadc092p-3, + 0x1.1940bf3bf874cp-4, 0x1.ed26e1a2a2110p-2, 0x1.b4740205796d6p+0, + 0x1.67bb3f55cb85dp+3, 0x1.c435ff81e18acp+2, 0x1.d8fee052bdea4p+5, + 0x1.51d7edccde926p+7, 0x1.04b1644557e0ep+8, 0x1.6a6b5ca0a9e1cp+8, + 0x1.fd9cc72249abep+11, 0x1.e58de693edab5p+13, 0x1.8c70158ac6364p+14, + 0x1.7614764f43e20p+15, 0x1.6337db36fc718p+17, 0x1.12d98b1f611e2p+19, + 0x1.392bc108b37ccp+19, 0x1.ce87bdc3473dcp+22, 0x1.bc8d5ae99ad14p+21, + 0x1.d20d76744835cp+22, +}; + +CLC_TABLE_FUNCTION(double, SINH_TBL_HEAD, sinh_tbl_head); +CLC_TABLE_FUNCTION(double, SINH_TBL_TAIL, sinh_tbl_tail); +CLC_TABLE_FUNCTION(double, COSH_TBL_HEAD, cosh_tbl_head); +CLC_TABLE_FUNCTION(double, COSH_TBL_TAIL, cosh_tbl_tail); + #endif // cl_khr_fp64 diff --git a/libclc/clc/lib/generic/math/clc_tanh.cl b/libclc/clc/lib/generic/math/clc_tanh.cl new file mode 100644 index 0000000000000..aedcb0c38d5e3 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_tanh.cl @@ -0,0 +1,21 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <clc/clc_convert.h> +#include <clc/internal/clc.h> +#include <clc/math/clc_exp.h> +#include <clc/math/clc_fma.h> +#include <clc/math/clc_mad.h> +#include <clc/math/math.h> +#include <clc/math/tables.h> +#include <clc/relational/clc_isinf.h> +#include <clc/relational/clc_isnan.h> +#include <clc/shared/clc_min.h> + +#define __CLC_BODY <clc_tanh.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/clc/lib/generic/math/clc_tanh.inc b/libclc/clc/lib/generic/math/clc_tanh.inc new file mode 100644 index 0000000000000..a25fd58fcbeaf --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_tanh.inc @@ -0,0 +1,137 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if __CLC_FPSIZE == 32 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_tanh(__CLC_GENTYPE x) { + // The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent + // to the following three formulae: + // 1. (exp(x) - exp(-x))/(exp(x) + exp(-x)) + // 2. (1 - (2/(exp(2*x) + 1 ))) + // 3. (exp(2*x) - 1)/(exp(2*x) + 1) + // but computationally, some formulae are better on some ranges. 
+ + const __CLC_GENTYPE large_threshold = 0x1.0a2b24p+3f; + + __CLC_UINTN ux = __CLC_AS_UINTN(x); + __CLC_UINTN aux = ux & EXSIGNBIT_SP32; + __CLC_UINTN xs = ux ^ aux; + + __CLC_GENTYPE y = __CLC_AS_GENTYPE(aux); + __CLC_GENTYPE y2 = y * y; + + __CLC_GENTYPE a1 = __clc_mad( + y2, __clc_mad(y2, 0.4891631088530669873e-4F, -0.14628356048797849e-2F), + -0.28192806108402678e0F); + __CLC_GENTYPE b1 = + __clc_mad(y2, 0.3427017942262751343e0F, 0.845784192581041099e0F); + + __CLC_GENTYPE a2 = __clc_mad( + y2, __clc_mad(y2, 0.3827534993599483396e-4F, -0.12325644183611929e-2F), + -0.24069858695196524e0F); + __CLC_GENTYPE b2 = + __clc_mad(y2, 0.292529068698052819e0F, 0.72209738473684982e0F); + + __CLC_INTN c = y < 0.9f; + __CLC_GENTYPE a = c ? a1 : a2; + __CLC_GENTYPE b = c ? b1 : b2; + __CLC_GENTYPE zlo = __clc_mad(MATH_DIVIDE(a, b), y * y2, y); + + __CLC_GENTYPE p = __clc_exp(2.0f * y) + 1.0f; + __CLC_GENTYPE zhi = 1.0F - MATH_DIVIDE(2.0F, p); + + __CLC_GENTYPE z = y <= 1.0f ? zlo : zhi; + z = __CLC_AS_GENTYPE(xs | __CLC_AS_UINTN(z)); + + // Edge cases + __CLC_GENTYPE sone = __CLC_AS_GENTYPE(0x3f800000U | xs); + z = y > large_threshold ? sone : z; + z = aux < 0x39000000 || aux > 0x7f800000 ? x : z; + + return z; +} + +#elif __CLC_FPSIZE == 64 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_tanh(__CLC_GENTYPE x) { + // The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent + // to the following three formulae: + // 1. (exp(x) - exp(-x))/(exp(x) + exp(-x)) + // 2. (1 - (2/(exp(2*x) + 1 ))) + // 3. (exp(2*x) - 1)/(exp(2*x) + 1) + // but computationally, some formulae are better on some ranges. 
+ + // The point at which e^-x is insignificant compared to e^x = ln(2^27) + const __CLC_GENTYPE large_threshold = 0x1.2b708872320e2p+4; + + __CLC_ULONGN ux = __CLC_AS_ULONGN(x); + __CLC_ULONGN ax = ux & ~SIGNBIT_DP64; + __CLC_ULONGN sx = ux ^ ax; + __CLC_GENTYPE y = __CLC_AS_GENTYPE(ax); + __CLC_GENTYPE y2 = y * y; + + // y < 0.9 + __CLC_GENTYPE znl = + __clc_fma(y2, + __clc_fma(y2, + __clc_fma(y2, -0.142077926378834722618091e-7, + -0.200047621071909498730453e-3), + -0.176016349003044679402273e-1), + -0.274030424656179760118928e0); + + __CLC_GENTYPE zdl = + __clc_fma(y2, + __clc_fma(y2, + __clc_fma(y2, 0.2091140262529164482568557e-3, + 0.201562166026937652780575e-1), + 0.381641414288328849317962e0), + 0.822091273968539282568011e0); + + // 0.9 <= y <= 1 + __CLC_GENTYPE znm = + __clc_fma(y2, + __clc_fma(y2, + __clc_fma(y2, -0.115475878996143396378318e-7, + -0.165597043903549960486816e-3), + -0.146173047288731678404066e-1), + -0.227793870659088295252442e0); + + __CLC_GENTYPE zdm = + __clc_fma(y2, + __clc_fma(y2, + __clc_fma(y2, 0.173076050126225961768710e-3, + 0.167358775461896562588695e-1), + 0.317204558977294374244770e0), + 0.683381611977295894959554e0); + + __CLC_LONGN c = y < 0.9; + __CLC_GENTYPE zn = c ? znl : znm; + __CLC_GENTYPE zd = c ? zdl : zdm; + __CLC_GENTYPE z = y + y * y2 * MATH_DIVIDE(zn, zd); + + // y > 1 + __CLC_GENTYPE p = __clc_exp(2.0 * y) + 1.0; + __CLC_GENTYPE zg = 1.0 - 2.0 / p; + + z = y > 1.0 ? zg : z; + + // Other cases + z = y < 0x1.0p-28 || ax > PINFBITPATT_DP64 ? x : z; + + z = y > large_threshold ? 
1.0 : z; + + return __CLC_AS_GENTYPE(sx | __CLC_AS_ULONGN(z)); +} + +#elif __CLC_FPSIZE == 16 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_tanh(__CLC_GENTYPE x) { + return __CLC_CONVERT_GENTYPE(__clc_tanh(__CLC_CONVERT_FLOATN(x))); +} + +#endif diff --git a/libclc/generic/lib/math/cosh.cl b/libclc/generic/lib/math/cosh.cl index 6d391b4f3a71f..870c56029f338 100644 --- a/libclc/generic/lib/math/cosh.cl +++ b/libclc/generic/lib/math/cosh.cl @@ -7,179 +7,8 @@ //===----------------------------------------------------------------------===// #include <clc/clc.h> -#include <clc/clcmacro.h> -#include <clc/math/math.h> -#include <clc/math/tables.h> +#include <clc/math/clc_cosh.h> -_CLC_OVERLOAD _CLC_DEF float cosh(float x) { - - // After dealing with special cases the computation is split into regions as follows. - // abs(x) >= max_cosh_arg: - // cosh(x) = sign(x)*Inf - // abs(x) >= small_threshold: - // cosh(x) = sign(x)*exp(abs(x))/2 computed using the - // splitexp and scaleDouble functions as for exp_amd(). - // abs(x) < small_threshold: - // compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0))) - // cosh(x) is then z. - - const float max_cosh_arg = 0x1.65a9fap+6f; - const float small_threshold = 0x1.0a2b24p+3f; - - uint ux = as_uint(x); - uint aux = ux & EXSIGNBIT_SP32; - float y = as_float(aux); - - // Find the integer part y0 of y and the increment dy = y - y0. We then compute - // z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy) - // z = cosh(y) = cosh(y0)cosh(dy) + sinh(y0)sinh(dy) - // where sinh(y0) and cosh(y0) are tabulated above. - - int ind = (int)y; - ind = (uint)ind > 36U ? 
0 : ind; - - float dy = y - ind; - float dy2 = dy * dy; - - float sdy = mad(dy2, - mad(dy2, - mad(dy2, - mad(dy2, - mad(dy2, - mad(dy2, 0.7746188980094184251527126e-12f, 0.160576793121939886190847e-9f), - 0.250521176994133472333666e-7f), - 0.275573191913636406057211e-5f), - 0.198412698413242405162014e-3f), - 0.833333333333329931873097e-2f), - 0.166666666666666667013899e0f); - sdy = mad(sdy, dy*dy2, dy); - - float cdy = mad(dy2, - mad(dy2, - mad(dy2, - mad(dy2, - mad(dy2, - mad(dy2, 0.1163921388172173692062032e-10f, 0.208744349831471353536305e-8f), - 0.275573350756016588011357e-6f), - 0.248015872460622433115785e-4f), - 0.138888888889814854814536e-2f), - 0.416666666666660876512776e-1f), - 0.500000000000000005911074e0f); - cdy = mad(cdy, dy2, 1.0f); - - float2 tv = USE_TABLE(sinhcosh_tbl, ind); - float z = mad(tv.s0, sdy, tv.s1 * cdy); - - // When exp(-x) is insignificant compared to exp(x), return exp(x)/2 - float t = exp(y - 0x1.62e500p-1f); - float zsmall = mad(0x1.a0210ep-18f, t, t); - z = y >= small_threshold ? zsmall : z; - - // Corner cases - z = y >= max_cosh_arg ? as_float(PINFBITPATT_SP32) : z; - z = aux > PINFBITPATT_SP32 ? as_float(QNANBITPATT_SP32) : z; - z = aux < 0x38800000 ? 1.0f : z; - - return z; -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, cosh, float); - -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -_CLC_OVERLOAD _CLC_DEF double cosh(double x) { - - // After dealing with special cases the computation is split into - // regions as follows: - // - // abs(x) >= max_cosh_arg: - // cosh(x) = sign(x)*Inf - // - // abs(x) >= small_threshold: - // cosh(x) = sign(x)*exp(abs(x))/2 computed using the - // splitexp and scaleDouble functions as for exp_amd(). - // - // abs(x) < small_threshold: - // compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0))) - // cosh(x) is then sign(x)*z. 
- - // This is ln(2^1025) - const double max_cosh_arg = 7.10475860073943977113e+02; // 0x408633ce8fb9f87e - - // This is where exp(-x) is insignificant compared to exp(x) = ln(2^27) - const double small_threshold = 0x1.2b708872320e2p+4; - - double y = fabs(x); - - // In this range we find the integer part y0 of y - // and the increment dy = y - y0. We then compute - // z = cosh(y) = cosh(y0)cosh(dy) + sinh(y0)sinh(dy) - // where sinh(y0) and cosh(y0) are tabulated above. - - int ind = min((int)y, 36); - double dy = y - ind; - double dy2 = dy * dy; - - double sdy = dy * dy2 * - fma(dy2, - fma(dy2, - fma(dy2, - fma(dy2, - fma(dy2, - fma(dy2, 0.7746188980094184251527126e-12, 0.160576793121939886190847e-9), - 0.250521176994133472333666e-7), - 0.275573191913636406057211e-5), - 0.198412698413242405162014e-3), - 0.833333333333329931873097e-2), - 0.166666666666666667013899e0); - - double cdy = dy2 * fma(dy2, - fma(dy2, - fma(dy2, - fma(dy2, - fma(dy2, - fma(dy2, 0.1163921388172173692062032e-10, 0.208744349831471353536305e-8), - 0.275573350756016588011357e-6), - 0.248015872460622433115785e-4), - 0.138888888889814854814536e-2), - 0.416666666666660876512776e-1), - 0.500000000000000005911074e0); - - // At this point sinh(dy) is approximated by dy + sdy, - // and cosh(dy) is approximated by 1 + cdy. - double2 tv = USE_TABLE(cosh_tbl, ind); - double cl = tv.s0; - double ct = tv.s1; - tv = USE_TABLE(sinh_tbl, ind); - double sl = tv.s0; - double st = tv.s1; - - double z = fma(sl, dy, fma(sl, sdy, fma(cl, cdy, fma(st, dy, fma(st, sdy, ct*cdy)) + ct))) + cl; - - // Other cases - z = y < 0x1.0p-28 ? 1.0 : z; - - double t = exp(y - 0x1.62e42fefa3800p-1); - t = fma(t, -0x1.ef35793c76641p-45, t); - z = y >= small_threshold ? t : z; - - z = y >= max_cosh_arg ? as_double(PINFBITPATT_DP64) : z; - - z = isinf(x) | isnan(x) ? 
y : z; - - return z; - -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cosh, double) - -#endif - -#ifdef cl_khr_fp16 - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -_CLC_DEFINE_UNARY_BUILTIN_FP16(cosh) - -#endif +#define FUNCTION cosh +#define __CLC_BODY <clc/shared/unary_def.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/math/sinh.cl b/libclc/generic/lib/math/sinh.cl index a889116bd3e6e..305f9bf647ff7 100644 --- a/libclc/generic/lib/math/sinh.cl +++ b/libclc/generic/lib/math/sinh.cl @@ -7,178 +7,8 @@ //===----------------------------------------------------------------------===// #include <clc/clc.h> -#include <clc/clcmacro.h> -#include <clc/math/math.h> -#include <clc/math/tables.h> +#include <clc/math/clc_sinh.h> -_CLC_OVERLOAD _CLC_DEF float sinh(float x) -{ - // After dealing with special cases the computation is split into regions as follows. - // abs(x) >= max_sinh_arg: - // sinh(x) = sign(x)*Inf - // abs(x) >= small_threshold: - // sinh(x) = sign(x)*exp(abs(x))/2 computed using the splitexp and scaleDouble functions as for exp_amd(). - // abs(x) < small_threshold: - // compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0))) - // sinh(x) is then sign(x)*z. - - const float max_sinh_arg = 0x1.65a9fap+6f; - const float small_threshold = 0x1.0a2b24p+3f; - - uint ux = as_uint(x); - uint aux = ux & EXSIGNBIT_SP32; - uint xs = ux ^ aux; - float y = as_float(aux); - - // We find the integer part y0 of y and the increment dy = y - y0. We then compute - // z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy) - // where sinh(y0) and cosh(y0) are tabulated above. - int ind = (int) y; - ind = (uint)ind > 36U ? 
0 : ind; - - float dy = y - ind; - float dy2 = dy * dy; - - float sdy = mad(dy2, - mad(dy2, - mad(dy2, - mad(dy2, - mad(dy2, - mad(dy2, 0.7746188980094184251527126e-12f, 0.160576793121939886190847e-9f), - 0.250521176994133472333666e-7f), - 0.275573191913636406057211e-5f), - 0.198412698413242405162014e-3f), - 0.833333333333329931873097e-2f), - 0.166666666666666667013899e0f); - sdy = mad(sdy, dy*dy2, dy); - - float cdy = mad(dy2, - mad(dy2, - mad(dy2, - mad(dy2, - mad(dy2, - mad(dy2, 0.1163921388172173692062032e-10f, 0.208744349831471353536305e-8f), - 0.275573350756016588011357e-6f), - 0.248015872460622433115785e-4f), - 0.138888888889814854814536e-2f), - 0.416666666666660876512776e-1f), - 0.500000000000000005911074e0f); - cdy = mad(cdy, dy2, 1.0f); - - float2 tv = USE_TABLE(sinhcosh_tbl, ind); - float z = mad(tv.s1, sdy, tv.s0 * cdy); - z = as_float(xs | as_uint(z)); - - // When y is large enough so that the negative exponential is negligible, - // so sinh(y) is approximated by sign(x)*exp(y)/2. - float t = exp(y - 0x1.62e500p-1f); - float zsmall = mad(0x1.a0210ep-18f, t, t); - zsmall = as_float(xs | as_uint(zsmall)); - z = y >= small_threshold ? zsmall : z; - - // Corner cases - float zinf = as_float(PINFBITPATT_SP32 | xs); - z = y >= max_sinh_arg ? zinf : z; - z = aux > PINFBITPATT_SP32 | aux < 0x38800000U ? x : z; - - return z; -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, sinh, float); - -#ifdef cl_khr_fp64 -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -_CLC_OVERLOAD _CLC_DEF double sinh(double x) -{ - // After dealing with special cases the computation is split into - // regions as follows: - // - // abs(x) >= max_sinh_arg: - // sinh(x) = sign(x)*Inf - // - // abs(x) >= small_threshold: - // sinh(x) = sign(x)*exp(abs(x))/2 computed using the - // splitexp and scaleDouble functions as for exp_amd(). - // - // abs(x) < small_threshold: - // compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0))) - // sinh(x) is then sign(x)*z. 
- - const double max_sinh_arg = 7.10475860073943977113e+02; // 0x408633ce8fb9f87e - - // This is where exp(-x) is insignificant compared to exp(x) = ln(2^27) - const double small_threshold = 0x1.2b708872320e2p+4; - - double y = fabs(x); - - // In this range we find the integer part y0 of y - // and the increment dy = y - y0. We then compute - // z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy) - // where sinh(y0) and cosh(y0) are obtained from tables - - int ind = min((int)y, 36); - double dy = y - ind; - double dy2 = dy * dy; - - double sdy = dy * dy2 * - fma(dy2, - fma(dy2, - fma(dy2, - fma(dy2, - fma(dy2, - fma(dy2, 0.7746188980094184251527126e-12, 0.160576793121939886190847e-9), - 0.250521176994133472333666e-7), - 0.275573191913636406057211e-5), - 0.198412698413242405162014e-3), - 0.833333333333329931873097e-2), - 0.166666666666666667013899e0); - - double cdy = dy2 * fma(dy2, - fma(dy2, - fma(dy2, - fma(dy2, - fma(dy2, - fma(dy2, 0.1163921388172173692062032e-10, 0.208744349831471353536305e-8), - 0.275573350756016588011357e-6), - 0.248015872460622433115785e-4), - 0.138888888889814854814536e-2), - 0.416666666666660876512776e-1), - 0.500000000000000005911074e0); - - // At this point sinh(dy) is approximated by dy + sdy. - // Shift some significant bits from dy to sdy. - double sdy1 = as_double(as_ulong(dy) & 0xfffffffff8000000UL); - double sdy2 = sdy + (dy - sdy1); - - double2 tv = USE_TABLE(cosh_tbl, ind); - double cl = tv.s0; - double ct = tv.s1; - tv = USE_TABLE(sinh_tbl, ind); - double sl = tv.s0; - double st = tv.s1; - - double z = fma(cl, sdy1, fma(sl, cdy, fma(cl, sdy2, fma(ct, sdy1, fma(st, cdy, ct*sdy2)) + st))) + sl; - - // Other cases - z = (y < 0x1.0p-28) | isnan(x) | isinf(x) ? y : z; - - double t = exp(y - 0x1.62e42fefa3800p-1); - t = fma(t, -0x1.ef35793c76641p-45, t); - z = y >= small_threshold ? t : z; - z = y >= max_sinh_arg ? 
as_double(PINFBITPATT_DP64) : z; - - return copysign(z, x); -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sinh, double) - -#endif - -#ifdef cl_khr_fp16 - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -_CLC_DEFINE_UNARY_BUILTIN_FP16(sinh) - -#endif +#define FUNCTION sinh +#define __CLC_BODY <clc/shared/unary_def.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/math/tables.cl b/libclc/generic/lib/math/tables.cl index 16473caad7f23..3997b32a79043 100644 --- a/libclc/generic/lib/math/tables.cl +++ b/libclc/generic/lib/math/tables.cl @@ -289,139 +289,9 @@ DECLARE_TABLE(uchar, PIBITS_TBL, ) = { 230, 139, 2, 0, 0, 0, 0, 0, 0, 0 }; -// Tabulated values of sinh(i) and cosh(i) for i = 0,...,36. -DECLARE_TABLE(float2, SINHCOSH_TBL, 37) = { - (float2)(0x0.000000p+0f, 0x1.000000p+0f), - (float2)(0x1.2cd9fcp+0f, 0x1.8b0756p+0f), - (float2)(0x1.d03cf6p+1f, 0x1.e18fa0p+1f), - (float2)(0x1.40926ep+3f, 0x1.422a4ap+3f), - (float2)(0x1.b4a380p+4f, 0x1.b4ee86p+4f), - (float2)(0x1.28d016p+6f, 0x1.28d6fcp+6f), - (float2)(0x1.936d22p+7f, 0x1.936e68p+7f), - (float2)(0x1.122876p+9f, 0x1.122894p+9f), - (float2)(0x1.749ea6p+10f, 0x1.749eaap+10f), - (float2)(0x1.fa7158p+11f, 0x1.fa7158p+11f), - (float2)(0x1.5829dcp+13f, 0x1.5829dep+13f), - (float2)(0x1.d3c448p+14f, 0x1.d3c448p+14f), - (float2)(0x1.3de166p+16f, 0x1.3de166p+16f), - (float2)(0x1.b00b5ap+17f, 0x1.b00b5ap+17f), - (float2)(0x1.259ac4p+19f, 0x1.259ac4p+19f), - (float2)(0x1.8f0ccap+20f, 0x1.8f0ccap+20f), - (float2)(0x1.0f2ebep+22f, 0x1.0f2ebep+22f), - (float2)(0x1.709348p+23f, 0x1.709348p+23f), - (float2)(0x1.f4f220p+24f, 0x1.f4f220p+24f), - (float2)(0x1.546d90p+26f, 0x1.546d90p+26f), - (float2)(0x1.ceb088p+27f, 0x1.ceb088p+27f), - (float2)(0x1.3a6e20p+29f, 0x1.3a6e20p+29f), - (float2)(0x1.ab5adcp+30f, 0x1.ab5adcp+30f), - (float2)(0x1.226af4p+32f, 0x1.226af4p+32f), - (float2)(0x1.8ab7fcp+33f, 0x1.8ab7fcp+33f), - (float2)(0x1.0c3d3ap+35f, 0x1.0c3d3ap+35f), - (float2)(0x1.6c9326p+36f, 
0x1.6c9326p+36f), - (float2)(0x1.ef8230p+37f, 0x1.ef8230p+37f), - (float2)(0x1.50bba4p+39f, 0x1.50bba4p+39f), - (float2)(0x1.c9aae4p+40f, 0x1.c9aae4p+40f), - (float2)(0x1.370470p+42f, 0x1.370470p+42f), - (float2)(0x1.a6b766p+43f, 0x1.a6b766p+43f), - (float2)(0x1.1f43fcp+45f, 0x1.1f43fcp+45f), - (float2)(0x1.866f34p+46f, 0x1.866f34p+46f), - (float2)(0x1.0953e2p+48f, 0x1.0953e2p+48f), - (float2)(0x1.689e22p+49f, 0x1.689e22p+49f), - (float2)(0x1.ea215ap+50f, 0x1.ea215ap+50f) -}; - TABLE_FUNCTION(float2, LOG2_TBL, log2_tbl); TABLE_FUNCTION(float2, LOG10_TBL, log10_tbl); uint4 TABLE_MANGLE(pibits_tbl)(size_t idx) { return *(__constant uint4 *)(PIBITS_TBL + idx); } - -TABLE_FUNCTION(float2, SINHCOSH_TBL, sinhcosh_tbl); - -#ifdef cl_khr_fp64 - -DECLARE_TABLE(double2, SINH_TBL, 37) = { - (double2)(0x0.0000000000000p+0, 0x0.0000000000000p+0), - (double2)(0x1.2cd9fc0000000p+0, 0x1.13ae6096a0092p-26), - (double2)(0x1.d03cf60000000p+1, 0x1.db70cfb79a640p-26), - (double2)(0x1.40926e0000000p+3, 0x1.c2526b66dc067p-23), - (double2)(0x1.b4a3800000000p+4, 0x1.b81b18647f380p-23), - (double2)(0x1.28d0160000000p+6, 0x1.bc1cdd1e1eb08p-20), - (double2)(0x1.936d228000000p+7, 0x1.d9f201534fb09p-19), - (double2)(0x1.1228768000000p+9, 0x1.d1c064a4e9954p-18), - (double2)(0x1.749ea50000000p+10, 0x1.4eca65d06ea74p-18), - (double2)(0x1.fa71570000000p+11, 0x1.0c259bcc0ecc5p-15), - (double2)(0x1.5829dc8000000p+13, 0x1.b5a6647cf9016p-13), - (double2)(0x1.d3c4488000000p+14, 0x1.9691adefb0870p-15), - (double2)(0x1.3de1650000000p+16, 0x1.3410fc29cde38p-10), - (double2)(0x1.b00b590000000p+17, 0x1.6a31a50b6fb3cp-11), - (double2)(0x1.259ac48000000p+19, 0x1.7defc71805c40p-10), - (double2)(0x1.8f0cca8000000p+20, 0x1.eb49fd80e0babp-6), - (double2)(0x1.0f2ebd0000000p+22, 0x1.4fffc7bcd5920p-7), - (double2)(0x1.7093488000000p+23, 0x1.03a93b6c63435p-3), - (double2)(0x1.f4f2208000000p+24, 0x1.1940bb255fd1cp-4), - (double2)(0x1.546d8f8000000p+26, 0x1.ed26e14260b50p-2), - (double2)(0x1.ceb0888000000p+27, 
0x1.b47401fc9f2a2p+0), - (double2)(0x1.3a6e1f8000000p+29, 0x1.67bb3f55634f1p+3), - (double2)(0x1.ab5adb8000000p+30, 0x1.c435ff8194ddcp+2), - (double2)(0x1.226af30000000p+32, 0x1.d8fee052ba63ap+5), - (double2)(0x1.8ab7fb0000000p+33, 0x1.51d7edccde3f6p+7), - (double2)(0x1.0c3d390000000p+35, 0x1.04b1644557d1ap+8), - (double2)(0x1.6c93268000000p+36, 0x1.6a6b5ca0a9dc4p+8), - (double2)(0x1.ef822f0000000p+37, 0x1.fd9cc72249abap+11), - (double2)(0x1.50bba30000000p+39, 0x1.e58de693edab5p+13), - (double2)(0x1.c9aae40000000p+40, 0x1.8c70158ac6363p+14), - (double2)(0x1.3704708000000p+42, 0x1.7614764f43e20p+15), - (double2)(0x1.a6b7658000000p+43, 0x1.6337db36fc718p+17), - (double2)(0x1.1f43fc8000000p+45, 0x1.12d98b1f611e2p+19), - (double2)(0x1.866f348000000p+46, 0x1.392bc108b37ccp+19), - (double2)(0x1.0953e28000000p+48, 0x1.ce87bdc3473dcp+22), - (double2)(0x1.689e220000000p+49, 0x1.bc8d5ae99ad14p+21), - (double2)(0x1.ea215a0000000p+50, 0x1.d20d76744835cp+22), -}; - -DECLARE_TABLE(double2, COSH_TBL, 37) = { - (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0), - (double2)(0x1.8b07550000000p+0, 0x1.d9f5504c2bd28p-28), - (double2)(0x1.e18fa08000000p+1, 0x1.7cb66f0a4c9fdp-25), - (double2)(0x1.422a490000000p+3, 0x1.f58617928e588p-23), - (double2)(0x1.b4ee858000000p+4, 0x1.bc7d000c38d48p-25), - (double2)(0x1.28d6fc8000000p+6, 0x1.f7f9d4e329998p-21), - (double2)(0x1.936e678000000p+7, 0x1.6e6e464885269p-19), - (double2)(0x1.1228948000000p+9, 0x1.ba3a8b946c154p-19), - (double2)(0x1.749eaa8000000p+10, 0x1.3f4e76110d5a4p-18), - (double2)(0x1.fa71580000000p+11, 0x1.17622515a3e2bp-15), - (double2)(0x1.5829dd0000000p+13, 0x1.4dc4b528af3d0p-17), - (double2)(0x1.d3c4488000000p+14, 0x1.1156278615e10p-14), - (double2)(0x1.3de1650000000p+16, 0x1.35ad50ed821f5p-10), - (double2)(0x1.b00b590000000p+17, 0x1.6b61055f2935cp-11), - (double2)(0x1.259ac48000000p+19, 0x1.7e2794a601240p-10), - (double2)(0x1.8f0cca8000000p+20, 0x1.eb4b45f6aadd3p-6), - (double2)(0x1.0f2ebd0000000p+22, 
0x1.5000b967b3698p-7), - (double2)(0x1.7093488000000p+23, 0x1.03a940fadc092p-3), - (double2)(0x1.f4f2208000000p+24, 0x1.1940bf3bf874cp-4), - (double2)(0x1.546d8f8000000p+26, 0x1.ed26e1a2a2110p-2), - (double2)(0x1.ceb0888000000p+27, 0x1.b4740205796d6p+0), - (double2)(0x1.3a6e1f8000000p+29, 0x1.67bb3f55cb85dp+3), - (double2)(0x1.ab5adb8000000p+30, 0x1.c435ff81e18acp+2), - (double2)(0x1.226af30000000p+32, 0x1.d8fee052bdea4p+5), - (double2)(0x1.8ab7fb0000000p+33, 0x1.51d7edccde926p+7), - (double2)(0x1.0c3d390000000p+35, 0x1.04b1644557e0ep+8), - (double2)(0x1.6c93268000000p+36, 0x1.6a6b5ca0a9e1cp+8), - (double2)(0x1.ef822f0000000p+37, 0x1.fd9cc72249abep+11), - (double2)(0x1.50bba30000000p+39, 0x1.e58de693edab5p+13), - (double2)(0x1.c9aae40000000p+40, 0x1.8c70158ac6364p+14), - (double2)(0x1.3704708000000p+42, 0x1.7614764f43e20p+15), - (double2)(0x1.a6b7658000000p+43, 0x1.6337db36fc718p+17), - (double2)(0x1.1f43fc8000000p+45, 0x1.12d98b1f611e2p+19), - (double2)(0x1.866f348000000p+46, 0x1.392bc108b37ccp+19), - (double2)(0x1.0953e28000000p+48, 0x1.ce87bdc3473dcp+22), - (double2)(0x1.689e220000000p+49, 0x1.bc8d5ae99ad14p+21), - (double2)(0x1.ea215a0000000p+50, 0x1.d20d76744835cp+22) -}; - -TABLE_FUNCTION(double2, SINH_TBL, sinh_tbl); -TABLE_FUNCTION(double2, COSH_TBL, cosh_tbl); - -#endif // cl_khr_fp64 diff --git a/libclc/generic/lib/math/tanh.cl b/libclc/generic/lib/math/tanh.cl index 707754a13ea75..f576910f16deb 100644 --- a/libclc/generic/lib/math/tanh.cl +++ b/libclc/generic/lib/math/tanh.cl @@ -7,133 +7,8 @@ //===----------------------------------------------------------------------===// #include <clc/clc.h> -#include <clc/clcmacro.h> -#include <clc/math/math.h> +#include <clc/math/clc_tanh.h> -_CLC_OVERLOAD _CLC_DEF float tanh(float x) -{ - // The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent - // to the following three formulae: - // 1. (exp(x) - exp(-x))/(exp(x) + exp(-x)) - // 2. (1 - (2/(exp(2*x) + 1 ))) - // 3. 
(exp(2*x) - 1)/(exp(2*x) + 1) - // but computationally, some formulae are better on some ranges. - - const float large_threshold = 0x1.0a2b24p+3f; - - uint ux = as_uint(x); - uint aux = ux & EXSIGNBIT_SP32; - uint xs = ux ^ aux; - - float y = as_float(aux); - float y2 = y*y; - - float a1 = mad(y2, - mad(y2, 0.4891631088530669873e-4F, -0.14628356048797849e-2F), - -0.28192806108402678e0F); - float b1 = mad(y2, 0.3427017942262751343e0F, 0.845784192581041099e0F); - - float a2 = mad(y2, - mad(y2, 0.3827534993599483396e-4F, -0.12325644183611929e-2F), - -0.24069858695196524e0F); - float b2 = mad(y2, 0.292529068698052819e0F, 0.72209738473684982e0F); - - int c = y < 0.9f; - float a = c ? a1 : a2; - float b = c ? b1 : b2; - float zlo = mad(MATH_DIVIDE(a, b), y*y2, y); - - float p = exp(2.0f * y) + 1.0f; - float zhi = 1.0F - MATH_DIVIDE(2.0F, p); - - float z = y <= 1.0f ? zlo : zhi; - z = as_float(xs | as_uint(z)); - - // Edge cases - float sone = as_float(0x3f800000U | xs); - z = y > large_threshold ? sone : z; - z = aux < 0x39000000 | aux > 0x7f800000 ? x : z; - - return z; -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, tanh, float); - -#ifdef cl_khr_fp64 - -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -_CLC_OVERLOAD _CLC_DEF double tanh(double x) -{ - // The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent - // to the following three formulae: - // 1. (exp(x) - exp(-x))/(exp(x) + exp(-x)) - // 2. (1 - (2/(exp(2*x) + 1 ))) - // 3. (exp(2*x) - 1)/(exp(2*x) + 1) - // but computationally, some formulae are better on some ranges. 
- - // The point at which e^-x is insignificant compared to e^x = ln(2^27) - const double large_threshold = 0x1.2b708872320e2p+4; - - ulong ux = as_ulong(x); - ulong ax = ux & ~SIGNBIT_DP64; - ulong sx = ux ^ ax; - double y = as_double(ax); - double y2 = y * y; - - // y < 0.9 - double znl = fma(y2, - fma(y2, - fma(y2, -0.142077926378834722618091e-7, -0.200047621071909498730453e-3), - -0.176016349003044679402273e-1), - -0.274030424656179760118928e0); - - double zdl = fma(y2, - fma(y2, - fma(y2, 0.2091140262529164482568557e-3, 0.201562166026937652780575e-1), - 0.381641414288328849317962e0), - 0.822091273968539282568011e0); - - // 0.9 <= y <= 1 - double znm = fma(y2, - fma(y2, - fma(y2, -0.115475878996143396378318e-7, -0.165597043903549960486816e-3), - -0.146173047288731678404066e-1), - -0.227793870659088295252442e0); - - double zdm = fma(y2, - fma(y2, - fma(y2, 0.173076050126225961768710e-3, 0.167358775461896562588695e-1), - 0.317204558977294374244770e0), - 0.683381611977295894959554e0); - - int c = y < 0.9; - double zn = c ? znl : znm; - double zd = c ? zdl : zdm; - double z = y + y*y2 * MATH_DIVIDE(zn, zd); - - // y > 1 - double p = exp(2.0 * y) + 1.0; - double zg = 1.0 - 2.0 / p; - - z = y > 1.0 ? zg : z; - - // Other cases - z = y < 0x1.0p-28 | ax > PINFBITPATT_DP64 ? x : z; - - z = y > large_threshold ? 
1.0 : z; - - return as_double(sx | as_ulong(z)); -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, tanh, double); - -#endif // cl_khr_fp64 - -#ifdef cl_khr_fp16 - -#pragma OPENCL EXTENSION cl_khr_fp16 : enable - -_CLC_DEFINE_UNARY_BUILTIN_FP16(tanh) - -#endif +#define FUNCTION tanh +#define __CLC_BODY <clc/shared/unary_def.inc> +#include <clc/math/gentype.inc> >From b961f633314b54ac0dea584dfd89713a93590187 Mon Sep 17 00:00:00 2001 From: Fraser Cormack <fra...@codeplay.com> Date: Wed, 2 Apr 2025 11:37:12 +0100 Subject: [PATCH 2/2] use fabs, and minor formatting --- libclc/clc/lib/generic/math/clc_cosh.inc | 10 ++++------ libclc/clc/lib/generic/math/clc_sinh.inc | 8 ++++---- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/libclc/clc/lib/generic/math/clc_cosh.inc b/libclc/clc/lib/generic/math/clc_cosh.inc index e36ce19243b76..a9fa1bb14140c 100644 --- a/libclc/clc/lib/generic/math/clc_cosh.inc +++ b/libclc/clc/lib/generic/math/clc_cosh.inc @@ -21,8 +21,8 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cosh(__CLC_GENTYPE x) { const __CLC_GENTYPE small_threshold = 0x1.0a2b24p+3f; __CLC_UINTN ux = __CLC_AS_UINTN(x); - __CLC_UINTN aux = ux & EXSIGNBIT_SP32; - __CLC_GENTYPE y = __CLC_AS_GENTYPE(aux); + __CLC_GENTYPE y = __clc_fabs(x); + __CLC_UINTN aux = __CLC_AS_UINTN(y); // Find the integer part y0 of y and the increment dy = y - y0. We then // compute z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy) z = cosh(y) = @@ -91,7 +91,6 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cosh(__CLC_GENTYPE x) { #elif __CLC_FPSIZE == 64 _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cosh(__CLC_GENTYPE x) { - // After dealing with special cases the computation is split into // regions as follows: // @@ -106,9 +105,8 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cosh(__CLC_GENTYPE x) { // compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0))) // cosh(x) is then sign(x)*z. 
- // This is ln(2^1025) - const __CLC_GENTYPE max_cosh_arg = - 7.10475860073943977113e+02; // 0x408633ce8fb9f87e + // This is ln(2^1025) = 0x408633ce8fb9f87e + const __CLC_GENTYPE max_cosh_arg = 7.10475860073943977113e+02; // This is where exp(-x) is insignificant compared to exp(x) = ln(2^27) const __CLC_GENTYPE small_threshold = 0x1.2b708872320e2p+4; diff --git a/libclc/clc/lib/generic/math/clc_sinh.inc b/libclc/clc/lib/generic/math/clc_sinh.inc index f089dd4a600a3..799cc32105084 100644 --- a/libclc/clc/lib/generic/math/clc_sinh.inc +++ b/libclc/clc/lib/generic/math/clc_sinh.inc @@ -20,9 +20,9 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinh(__CLC_GENTYPE x) { const __CLC_GENTYPE small_threshold = 0x1.0a2b24p+3f; __CLC_UINTN ux = __CLC_AS_UINTN(x); - __CLC_UINTN aux = ux & EXSIGNBIT_SP32; + __CLC_GENTYPE y = __clc_fabs(x); + __CLC_UINTN aux = __CLC_AS_UINTN(y); __CLC_UINTN xs = ux ^ aux; - __CLC_GENTYPE y = __CLC_AS_GENTYPE(aux); // We find the integer part y0 of y and the increment dy = y - y0. We then // compute z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy) where sinh(y0) @@ -106,8 +106,8 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sinh(__CLC_GENTYPE x) { // compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0))) // sinh(x) is then sign(x)*z. - const __CLC_GENTYPE max_sinh_arg = - 7.10475860073943977113e+02; // 0x408633ce8fb9f87e + // 0x408633ce8fb9f87e + const __CLC_GENTYPE max_sinh_arg = 7.10475860073943977113e+02; // This is where exp(-x) is insignificant compared to exp(x) = ln(2^27) const __CLC_GENTYPE small_threshold = 0x1.2b708872320e2p+4; _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits