https://github.com/frasercrmck created https://github.com/llvm/llvm-project/pull/139527
This commit moves the remaining FP64 sin and cos helper functions to the CLC library. As a consequence, it formally moves all sin, cos and sincos builtins to the CLC library. Previously, the FP16 and FP32 were nominally there but still in the OpenCL layer while waiting for the FP64 ones. The FP64 builtins are now vectorized as the FP16 and FP32 ones were earlier. One helper table had to be changed. It was previously a table of bytes loaded by each work-item as uint4. Since this doesn't vectorize well, the table was split to load two ulongNs per work-item. While this might not be as efficient on some devices, one mitigating factor is that we were previously loading 48 bytes per work-item in total, but only using 40 of them. With this commit we only load the bytes we need. >From 875c5fa90fff79456da734ae4be1e4590df6d127 Mon Sep 17 00:00:00 2001 From: Fraser Cormack <fra...@codeplay.com> Date: Thu, 1 May 2025 17:24:58 +0100 Subject: [PATCH 1/2] [libclc] Move sin, cos & sincos to CLC library This commit also vectorizes these builtins along the way. --- libclc/clc/include/clc/math/clc_cos.h | 19 ++ libclc/clc/include/clc/math/clc_sin.h | 19 ++ libclc/clc/include/clc/math/clc_sincos.h | 19 ++ .../clc/include/clc/math/clc_sincos_helpers.h | 7 + .../clc/math/clc_sincos_helpers_fp64.inc | 17 ++ libclc/clc/include/clc/math/tables.h | 2 +- libclc/clc/lib/generic/SOURCES | 3 + libclc/clc/lib/generic/math/clc_cos.cl | 21 ++ libclc/clc/lib/generic/math/clc_cos.inc | 63 ++++ libclc/clc/lib/generic/math/clc_sin.cl | 25 ++ libclc/clc/lib/generic/math/clc_sin.inc | 68 +++++ libclc/clc/lib/generic/math/clc_sincos.cl | 14 + .../lib/generic/math/clc_sincos.inc} | 8 +- .../lib/generic/math/clc_sincos_helpers.cl | 24 ++ .../generic/math/clc_sincos_helpers_fp64.inc | 234 ++++++++++++++ libclc/clc/lib/generic/math/clc_tables.cl | 62 ++++ libclc/clspv/lib/SOURCES | 2 - libclc/generic/include/clc/math/sincos.h | 4 +- libclc/generic/include/clc/math/sincos.inc | 14 - libclc/generic/lib/SOURCES | 2 - libclc/generic/lib/math/clc_tan.cl | 1 - libclc/generic/lib/math/cos.cl | 42 +-- libclc/generic/lib/math/cos.inc | 34 --- libclc/generic/lib/math/sin.cl | 41 +-- libclc/generic/lib/math/sin.inc | 38 --- libclc/generic/lib/math/sincos.cl | 4 +- libclc/generic/lib/math/sincos_helpers.cl | 285 ------------------ libclc/generic/lib/math/sincos_helpers.h | 24 -- libclc/generic/lib/math/tables.cl | 30 -- libclc/spirv/lib/SOURCES | 2 - 30 files changed, 612 insertions(+), 516 deletions(-) create mode 100644 libclc/clc/include/clc/math/clc_cos.h create mode 100644 libclc/clc/include/clc/math/clc_sin.h create mode 100644 libclc/clc/include/clc/math/clc_sincos.h create mode 100644 libclc/clc/include/clc/math/clc_sincos_helpers_fp64.inc create mode 100644 libclc/clc/lib/generic/math/clc_cos.cl create mode 100644 libclc/clc/lib/generic/math/clc_cos.inc create mode 100644 libclc/clc/lib/generic/math/clc_sin.cl create mode 100644 libclc/clc/lib/generic/math/clc_sin.inc create mode 100644 libclc/clc/lib/generic/math/clc_sincos.cl rename libclc/{generic/lib/math/sincos.inc => clc/lib/generic/math/clc_sincos.inc} (62%) create mode 100644 libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc delete mode 100644 libclc/generic/include/clc/math/sincos.inc delete mode 100644 libclc/generic/lib/math/cos.inc delete mode 100644 libclc/generic/lib/math/sin.inc delete mode 100644 libclc/generic/lib/math/sincos_helpers.cl delete mode 100644 libclc/generic/lib/math/sincos_helpers.h delete mode 100644 libclc/generic/lib/math/tables.cl diff --git a/libclc/clc/include/clc/math/clc_cos.h b/libclc/clc/include/clc/math/clc_cos.h new file mode 100644 index 0000000000000..44681608efc37 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_cos.h @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_COS_H__ +#define __CLC_MATH_CLC_COS_H__ + +#define __CLC_BODY <clc/math/unary_decl.inc> +#define __CLC_FUNCTION __clc_cos + +#include <clc/math/gentype.inc> + +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_COS_H__ diff --git a/libclc/clc/include/clc/math/clc_sin.h b/libclc/clc/include/clc/math/clc_sin.h new file mode 100644 index 0000000000000..de4c722ca123f --- /dev/null +++ b/libclc/clc/include/clc/math/clc_sin.h @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_SIN_H__ +#define __CLC_MATH_CLC_SIN_H__ + +#define __CLC_BODY <clc/math/unary_decl.inc> +#define __CLC_FUNCTION __clc_sin + +#include <clc/math/gentype.inc> + +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_SIN_H__ diff --git a/libclc/clc/include/clc/math/clc_sincos.h b/libclc/clc/include/clc/math/clc_sincos.h new file mode 100644 index 0000000000000..e26dc7c024c9c --- /dev/null +++ b/libclc/clc/include/clc/math/clc_sincos.h @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __CLC_MATH_CLC_SINCOS_H__ +#define __CLC_MATH_CLC_SINCOS_H__ + +#define __CLC_BODY <clc/math/unary_decl_with_ptr.inc> +#define __CLC_FUNCTION __clc_sincos + +#include <clc/math/gentype.inc> + +#undef __CLC_FUNCTION + +#endif // __CLC_MATH_CLC_SINCOS_H__ diff --git a/libclc/clc/include/clc/math/clc_sincos_helpers.h b/libclc/clc/include/clc/math/clc_sincos_helpers.h index 8029d439f4c62..e3a2e1c0b605c 100644 --- a/libclc/clc/include/clc/math/clc_sincos_helpers.h +++ b/libclc/clc/include/clc/math/clc_sincos_helpers.h @@ -16,4 +16,11 @@ #undef __FLOAT_ONLY +#define __DOUBLE_ONLY +#define __CLC_BODY <clc/math/clc_sincos_helpers_fp64.inc> + +#include <clc/math/gentype.inc> + +#undef __DOUBLE_ONLY + #endif // __CLC_MATH_CLC_SINCOS_HELPERS_H__ diff --git a/libclc/clc/include/clc/math/clc_sincos_helpers_fp64.inc b/libclc/clc/include/clc/math/clc_sincos_helpers_fp64.inc new file mode 100644 index 0000000000000..09c6e1c965f64 --- /dev/null +++ b/libclc/clc/include/clc/math/clc_sincos_helpers_fp64.inc @@ -0,0 +1,17 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +_CLC_DECL _CLC_OVERLOAD void +__clc_remainder_piby2_medium(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r, + private __CLC_DOUBLEN *rr, + private __CLC_INTN *regn); + +_CLC_DECL _CLC_OVERLOAD void +__clc_remainder_piby2_large(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r, + private __CLC_DOUBLEN *rr, + private __CLC_INTN *regn); diff --git a/libclc/clc/include/clc/math/tables.h b/libclc/clc/include/clc/math/tables.h index fb172b0b8f221..0fec778b53679 100644 --- a/libclc/clc/include/clc/math/tables.h +++ b/libclc/clc/include/clc/math/tables.h @@ -61,7 +61,6 @@ TABLE_FUNCTION_DECL(float2, log2_tbl); TABLE_FUNCTION_DECL(float2, log10_tbl); -TABLE_FUNCTION_DECL(uint4, pibits_tbl); CLC_TABLE_FUNCTION_DECL(float, log_inv_tbl_ep_head); CLC_TABLE_FUNCTION_DECL(float, log_inv_tbl_ep_tail); @@ -75,6 +74,7 @@ CLC_TABLE_FUNCTION_DECL(float, cbrt_tbl_head); CLC_TABLE_FUNCTION_DECL(float, cbrt_tbl_tail); CLC_TABLE_FUNCTION_DECL(float, sinhcosh_tbl_head); CLC_TABLE_FUNCTION_DECL(float, sinhcosh_tbl_tail); +CLC_TABLE_FUNCTION_DECL(ulong, pibits_tbl); #ifdef cl_khr_fp64 diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES index 4240e7b08e7d1..4d66c749fc53e 100644 --- a/libclc/clc/lib/generic/SOURCES +++ b/libclc/clc/lib/generic/SOURCES @@ -32,6 +32,7 @@ math/clc_atanpi.cl math/clc_cbrt.cl math/clc_ceil.cl math/clc_copysign.cl +math/clc_cos.cl math/clc_cosh.cl math/clc_cospi.cl math/clc_ep_log.cl @@ -86,6 +87,8 @@ math/clc_rint.cl math/clc_rootn.cl math/clc_round.cl math/clc_rsqrt.cl +math/clc_sin.cl +math/clc_sincos.cl math/clc_sincos_helpers.cl math/clc_sinh.cl math/clc_sinpi.cl diff --git a/libclc/clc/lib/generic/math/clc_cos.cl b/libclc/clc/lib/generic/math/clc_cos.cl new file mode 100644 index 0000000000000..0c9dc287aa0b4 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_cos.cl @@ -0,0 +1,21 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <clc/clc_convert.h> +#include <clc/clcmacro.h> +#include <clc/float/definitions.h> +#include <clc/math/clc_fabs.h> +#include <clc/math/clc_sincos_helpers.h> +#include <clc/math/clc_sincos_piby4.h> +#include <clc/math/math.h> +#include <clc/relational/clc_isinf.h> +#include <clc/relational/clc_isnan.h> +#include <clc/relational/clc_select.h> + +#define __CLC_BODY <clc_cos.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/clc/lib/generic/math/clc_cos.inc b/libclc/clc/lib/generic/math/clc_cos.inc new file mode 100644 index 0000000000000..4b8108c086090 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_cos.inc @@ -0,0 +1,63 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if __CLC_FPSIZE == 32 + +_CLC_OVERLOAD _CLC_DEF __CLC_FLOATN __clc_cos(__CLC_FLOATN x) { + __CLC_FLOATN absx = __clc_fabs(x); + + __CLC_FLOATN r0, r1; + __CLC_INTN regn = __clc_argReductionS(&r0, &r1, absx); + + __CLC_FLOATN ss = -__clc_sinf_piby4(r0, r1); + __CLC_FLOATN cc = __clc_cosf_piby4(r0, r1); + + __CLC_FLOATN c = (regn & 1) != 0 ? ss : cc; + c = __CLC_AS_FLOATN(__CLC_AS_INTN(c) ^ ((regn > 1) << 31)); + + c = __clc_select(c, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isinf(x)); + + return c; +} + +#elif __CLC_FPSIZE == 16 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cos(__CLC_GENTYPE x) { + return __CLC_CONVERT_GENTYPE(__clc_cos(__CLC_CONVERT_FLOATN(x))); +} + +#elif __CLC_FPSIZE == 64 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cos(__CLC_GENTYPE x) { + __CLC_GENTYPE absx = __clc_fabs(x); + + __CLC_BIT_INTN is_medium = absx < 0x1.0p+47; + + __CLC_INTN regn_m, regn_l; + __CLC_GENTYPE r_m, r_l, rr_m, rr_l; + + __clc_remainder_piby2_medium(absx, &r_m, &rr_m, ®n_m); + __clc_remainder_piby2_large(absx, &r_l, &rr_l, ®n_l); + + __CLC_GENTYPE r = is_medium ? r_m : r_l; + __CLC_GENTYPE rr = is_medium ? rr_m : rr_l; + __CLC_INTN regn = __CLC_CONVERT_INTN(is_medium) ? regn_m : regn_l; + + __CLC_GENTYPE sinval, cosval; + __clc_sincos_piby4(r, rr, &sinval, &cosval); + sinval = -sinval; + + __CLC_LONGN c = + __CLC_AS_LONGN(__CLC_CONVERT_BIT_INTN((regn & 1) != 0) ? sinval : cosval); + c ^= __CLC_CONVERT_BIT_INTN(regn > 1) << 63; + + return __clc_isnan(absx) | __clc_isinf(absx) ? __CLC_GENTYPE_NAN + : __CLC_AS_GENTYPE(c); +} + +#endif diff --git a/libclc/clc/lib/generic/math/clc_sin.cl b/libclc/clc/lib/generic/math/clc_sin.cl new file mode 100644 index 0000000000000..0ff9739c6a846 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_sin.cl @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <clc/clc_convert.h> +#include <clc/clcmacro.h> +#include <clc/float/definitions.h> +#include <clc/internal/clc.h> +#include <clc/math/clc_fabs.h> +#include <clc/math/clc_sincos_helpers.h> +#include <clc/math/clc_sincos_piby4.h> +#include <clc/math/clc_trunc.h> +#include <clc/math/math.h> +#include <clc/math/tables.h> +#include <clc/relational/clc_isinf.h> +#include <clc/relational/clc_isnan.h> +#include <clc/relational/clc_select.h> +#include <clc/shared/clc_max.h> + +#define __CLC_BODY <clc_sin.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/clc/lib/generic/math/clc_sin.inc b/libclc/clc/lib/generic/math/clc_sin.inc new file mode 100644 index 0000000000000..b4f72eb625eb0 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_sin.inc @@ -0,0 +1,68 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#if __CLC_FPSIZE == 32 + +_CLC_OVERLOAD _CLC_DEF __CLC_FLOATN __clc_sin(__CLC_FLOATN x) { + __CLC_FLOATN absx = __clc_fabs(x); + + __CLC_FLOATN r0, r1; + __CLC_INTN regn = __clc_argReductionS(&r0, &r1, absx); + + __CLC_FLOATN ss = __clc_sinf_piby4(r0, r1); + __CLC_FLOATN cc = __clc_cosf_piby4(r0, r1); + + __CLC_FLOATN s = (regn & 1) != 0 ? cc : ss; + s = __CLC_AS_FLOATN(__CLC_AS_INTN(s) ^ ((regn > 1) << 31) ^ + (__CLC_AS_INTN(x) ^ __CLC_AS_INTN(absx))); + + s = __clc_select(s, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isinf(x)); + + // Subnormals + s = x == 0.0f ? x : s; + + return s; +} + +#elif __CLC_FPSIZE == 16 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sin(__CLC_GENTYPE x) { + return __CLC_CONVERT_GENTYPE(__clc_sin(__CLC_CONVERT_FLOATN(x))); +} + +#elif __CLC_FPSIZE == 64 + +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sin(__CLC_GENTYPE x) { + __CLC_GENTYPE absx = __clc_fabs(x); + + __CLC_BIT_INTN is_medium = absx < 0x1.0p+47; + + __CLC_INTN regn_m, regn_l; + __CLC_GENTYPE r_m, r_l, rr_m, rr_l; + + __clc_remainder_piby2_medium(absx, &r_m, &rr_m, ®n_m); + __clc_remainder_piby2_large(absx, &r_l, &rr_l, ®n_l); + + __CLC_GENTYPE r = is_medium ? r_m : r_l; + __CLC_GENTYPE rr = is_medium ? rr_m : rr_l; + __CLC_INTN regn = __CLC_CONVERT_INTN(is_medium) ? regn_m : regn_l; + + __CLC_GENTYPE sinval, cosval; + __clc_sincos_piby4(r, rr, &sinval, &cosval); + + __CLC_LONGN s = + __CLC_AS_LONGN(__CLC_CONVERT_BIT_INTN((regn & 1) != 0) ? cosval : sinval); + + s ^= (__CLC_CONVERT_BIT_INTN(regn > 1) << 63) ^ + (__CLC_CONVERT_BIT_INTN(x < 0.0) << 63); + + return __clc_isinf(x) | __clc_isnan(x) ? __CLC_GENTYPE_NAN + : __CLC_AS_GENTYPE(s); +} + +#endif diff --git a/libclc/clc/lib/generic/math/clc_sincos.cl b/libclc/clc/lib/generic/math/clc_sincos.cl new file mode 100644 index 0000000000000..2209a41593a2d --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_sincos.cl @@ -0,0 +1,14 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include <clc/internal/clc.h> +#include <clc/math/clc_cos.h> +#include <clc/math/clc_sin.h> + +#define __CLC_BODY <clc_sincos.inc> +#include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/math/sincos.inc b/libclc/clc/lib/generic/math/clc_sincos.inc similarity index 62% rename from libclc/generic/lib/math/sincos.inc rename to libclc/clc/lib/generic/math/clc_sincos.inc index b1d4f88c3d427..1e21b7549e448 100644 --- a/libclc/generic/lib/math/sincos.inc +++ b/libclc/clc/lib/generic/math/clc_sincos.inc @@ -6,10 +6,10 @@ // //===----------------------------------------------------------------------===// -#define __CLC_DECLARE_SINCOS(ADDRSPACE, TYPE) \ - _CLC_OVERLOAD _CLC_DEF TYPE sincos (TYPE x, ADDRSPACE TYPE * cosval) { \ - *cosval = cos(x); \ - return sin(x); \ +#define __CLC_DECLARE_SINCOS(ADDRSPACE, TYPE) \ + _CLC_OVERLOAD _CLC_DEF TYPE __clc_sincos(TYPE x, ADDRSPACE TYPE *cosval) { \ + *cosval = __clc_cos(x); \ + return __clc_sin(x); \ } __CLC_DECLARE_SINCOS(global, __CLC_GENTYPE) diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers.cl b/libclc/clc/lib/generic/math/clc_sincos_helpers.cl index 24676d3c7711c..c1768e39243fa 100644 --- a/libclc/clc/lib/generic/math/clc_sincos_helpers.cl +++ b/libclc/clc/lib/generic/math/clc_sincos_helpers.cl @@ -31,3 +31,27 @@ #define __CLC_BODY <clc_sincos_helpers.inc> #include <clc/math/gentype.inc> + +#undef __FLOAT_ONLY + +#ifdef cl_khr_fp64 + +#pragma OPENCL EXTENSION cl_khr_fp64 : enable + +#include <clc/math/clc_fract.h> +#include <clc/math/tables.h> +#include <clc/shared/clc_max.h> + +#define bytealign(src0, src1, src2) \ + (__CLC_CONVERT_UINTN( \ + ((__CLC_CONVERT_LONGN((src0)) << 32) | __CLC_CONVERT_LONGN((src1))) >> \ + (((src2) & 3) * 8))) + +#define __DOUBLE_ONLY +#define __CLC_BODY <clc_sincos_helpers_fp64.inc> + +#include <clc/math/gentype.inc> + +#undef __DOUBLE_ONLY + +#endif diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc new file mode 100644 index 0000000000000..3a8edb378cae2 --- /dev/null +++ b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc @@ -0,0 +1,234 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Reduction for medium sized arguments +_CLC_DEF _CLC_OVERLOAD void +__clc_remainder_piby2_medium(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r, + private __CLC_DOUBLEN *rr, + private __CLC_INTN *regn) { + // How many pi/2 is x a multiple of? + const __CLC_DOUBLEN two_by_pi = 0x1.45f306dc9c883p-1; + __CLC_DOUBLEN dnpi2 = __clc_trunc(__clc_fma(x, two_by_pi, 0.5)); + + const __CLC_DOUBLEN piby2_h = -7074237752028440.0 / 0x1.0p+52; + const __CLC_DOUBLEN piby2_m = -2483878800010755.0 / 0x1.0p+105; + const __CLC_DOUBLEN piby2_t = -3956492004828932.0 / 0x1.0p+158; + + // Compute product of npi2 with 159 bits of 2/pi + __CLC_DOUBLEN p_hh = piby2_h * dnpi2; + __CLC_DOUBLEN p_ht = __clc_fma(piby2_h, dnpi2, -p_hh); + __CLC_DOUBLEN p_mh = piby2_m * dnpi2; + __CLC_DOUBLEN p_mt = __clc_fma(piby2_m, dnpi2, -p_mh); + __CLC_DOUBLEN p_th = piby2_t * dnpi2; + __CLC_DOUBLEN p_tt = __clc_fma(piby2_t, dnpi2, -p_th); + + // Reduce to 159 bits + __CLC_DOUBLEN ph = p_hh; + __CLC_DOUBLEN pm = p_ht + p_mh; + __CLC_DOUBLEN t = p_mh - (pm - p_ht); + __CLC_DOUBLEN pt = p_th + t + p_mt + p_tt; + t = ph + pm; + pm = pm - (t - ph); + ph = t; + t = pm + pt; + pt = pt - (t - pm); + pm = t; + + // Subtract from x + t = x + ph; + __CLC_DOUBLEN qh = t + pm; + __CLC_DOUBLEN qt = pm - (qh - t) + pt; + + *r = qh; + *rr = qt; + *regn = __CLC_CONVERT_INTN(__CLC_CONVERT_LONGN(dnpi2)) & 0x3; +} + +// Given positive argument x, reduce it to the range [-pi/4,pi/4] using +// extra precision, and return the result in r, rr. +// Return value "regn" tells how many lots of pi/2 were subtracted +// from x to put it in the range [-pi/4,pi/4], mod 4. +_CLC_DEF _CLC_OVERLOAD void +__clc_remainder_piby2_large(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r, + private __CLC_DOUBLEN *rr, + private __CLC_INTN *regn) { + + __CLC_LONGN ux = __CLC_AS_LONGN(x); + __CLC_INTN e = __CLC_CONVERT_INTN(ux >> 52) - 1023; + __CLC_INTN i = __clc_max(23, (e >> 3) + 17); + __CLC_INTN j = 150 - i; + __CLC_INTN j16 = j & ~0xf; + __CLC_DOUBLEN fract_temp; + + // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary + // byte boundary + __CLC_ULONGN q0 = USE_TABLE(pibits_tbl, j16); + __CLC_ULONGN q3 = USE_TABLE(pibits_tbl, (j16 + 8)); + __CLC_ULONGN q1 = USE_TABLE(pibits_tbl, (j16 + 16)); + __CLC_ULONGN q4 = USE_TABLE(pibits_tbl, (j16 + 24)); + __CLC_ULONGN q2 = USE_TABLE(pibits_tbl, (j16 + 32)); + + __CLC_INTN k = (j >> 2) & 0x3; + __CLC_INTN c0 = k == 0; + __CLC_INTN c1 = k == 1; + __CLC_INTN c2 = k == 2; + __CLC_INTN c3 = k == 3; + + __CLC_UINTN u0, u1, u2, u3, u4, u5, u6; + + __CLC_UINTN q0s0 = __CLC_CONVERT_UINTN(q0); + __CLC_UINTN q0s1 = __CLC_CONVERT_UINTN(q0 >> 32); + + __CLC_UINTN q1s0 = __CLC_CONVERT_UINTN(q1); + __CLC_UINTN q1s1 = __CLC_CONVERT_UINTN(q1 >> 32); + + __CLC_UINTN q2s0 = __CLC_CONVERT_UINTN(q2); + __CLC_UINTN q2s1 = __CLC_CONVERT_UINTN(q2 >> 32); + + __CLC_UINTN q3s0 = __CLC_CONVERT_UINTN(q3); + __CLC_UINTN q3s1 = __CLC_CONVERT_UINTN(q3 >> 32); + + __CLC_UINTN q4s0 = __CLC_CONVERT_UINTN(q4); + __CLC_UINTN q4s1 = __CLC_CONVERT_UINTN(q4 >> 32); + + u0 = c1 ? q0s1 : q0s0; + u0 = c2 ? q3s0 : u0; + u0 = c3 ? q3s1 : u0; + + u1 = c1 ? q3s0 : q0s1; + u1 = c2 ? q3s1 : u1; + u1 = c3 ? q1s0 : u1; + + u2 = c1 ? q3s1 : q3s0; + u2 = c2 ? q1s0 : u2; + u2 = c3 ? q1s1 : u2; + + u3 = c1 ? q1s0 : q3s1; + u3 = c2 ? q1s1 : u3; + u3 = c3 ? q4s0 : u3; + + u4 = c1 ? q1s1 : q1s0; + u4 = c2 ? q4s0 : u4; + u4 = c3 ? q4s1 : u4; + + u5 = c1 ? q4s0 : q1s1; + u5 = c2 ? q4s1 : u5; + u5 = c3 ? q2s0 : u5; + + u6 = c1 ? q4s1 : q4s0; + u6 = c2 ? q2s0 : u6; + u6 = c3 ? q2s1 : u6; + + __CLC_UINTN v0 = bytealign(u1, u0, j); + __CLC_UINTN v1 = bytealign(u2, u1, j); + __CLC_UINTN v2 = bytealign(u3, u2, j); + __CLC_UINTN v3 = bytealign(u4, u3, j); + __CLC_UINTN v4 = bytealign(u5, u4, j); + __CLC_UINTN v5 = bytealign(u6, u5, j); + + // Place those 192 bits in 4 48-bit doubles along with correct exponent + // If i > 1018 we would get subnormals so we scale p up and x down to get the + // same product + i = 2 + 8 * i; + x *= __CLC_CONVERT_BIT_INTN(i > 1018) ? 0x1.0p-136 : 1.0; + i -= i > 1018 ? 136 : 0; + +#define doublen_lohi(x, y) \ + __CLC_AS_DOUBLEN(__CLC_CONVERT_ULONGN((x)) & 0xFFFFFFFF | \ + __CLC_CONVERT_ULONGN((y)) << 32) + + __CLC_UINTN ua = __CLC_CONVERT_UINTN(1023 + 52 - i) << 20; + __CLC_DOUBLEN a = doublen_lohi((__CLC_ULONGN)0, ua); + __CLC_DOUBLEN p0 = doublen_lohi(v0, ua | (v1 & 0xffffU)) - a; + ua += 0x03000000U; + a = doublen_lohi((__CLC_ULONGN)0, ua); + __CLC_DOUBLEN p1 = + doublen_lohi(((v2 << 16) | (v1 >> 16)), (ua | (v2 >> 16))) - a; + ua += 0x03000000U; + a = doublen_lohi((__CLC_ULONGN)0, ua); + __CLC_DOUBLEN p2 = doublen_lohi(v3, (ua | (v4 & 0xffffU))) - a; + ua += 0x03000000U; + a = doublen_lohi((__CLC_ULONGN)0, ua); + __CLC_DOUBLEN p3 = + doublen_lohi(((v5 << 16) | (v4 >> 16)), (ua | (v5 >> 16))) - a; + +#undef doublen_lohi + + // Exact multiply + __CLC_DOUBLEN f0h = p0 * x; + __CLC_DOUBLEN f0l = __clc_fma(p0, x, -f0h); + __CLC_DOUBLEN f1h = p1 * x; + __CLC_DOUBLEN f1l = __clc_fma(p1, x, -f1h); + __CLC_DOUBLEN f2h = p2 * x; + __CLC_DOUBLEN f2l = __clc_fma(p2, x, -f2h); + __CLC_DOUBLEN f3h = p3 * x; + __CLC_DOUBLEN f3l = __clc_fma(p3, x, -f3h); + + // Accumulate product into 4 doubles + __CLC_DOUBLEN s, t; + + __CLC_DOUBLEN f3 = f3h + f2h; + t = f2h - (f3 - f3h); + s = f3l + t; + t = t - (s - f3l); + + __CLC_DOUBLEN f2 = s + f1h; + t = f1h - (f2 - s) + t; + s = f2l + t; + t = t - (s - f2l); + + __CLC_DOUBLEN f1 = s + f0h; + t = f0h - (f1 - s) + t; + s = f1l + t; + + __CLC_DOUBLEN f0 = s + f0l; + + // Strip off unwanted large integer bits + f3 = 0x1.0p+10 * __clc_fract(f3 * 0x1.0p-10, &fract_temp); + f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0; + + // Compute least significant integer bits + t = f3 + f2; + __CLC_DOUBLEN di = t - __clc_fract(t, &fract_temp); + i = __CLC_CONVERT_INTN(__CLC_CONVERT_FLOATN(di)); + + // Shift out remaining integer part + f3 -= di; + s = f3 + f2; + t = f2 - (s - f3); + f3 = s; + f2 = t; + s = f2 + f1; + t = f1 - (s - f2); + f2 = s; + f1 = t; + f1 += f0; + + // Subtract 1 if fraction is >= 0.5, and update regn + __CLC_INTN g = __CLC_CONVERT_INTN(f3 >= 0.5 ? 1L : 0L); + i += g; + f3 -= __CLC_CONVERT_DOUBLEN(__CLC_CONVERT_FLOATN(g)); + + // Shift up bits + s = f3 + f2; + t = f2 - (s - f3); + f3 = s; + f2 = t + f1; + + // Multiply precise fraction by pi/2 to get radians + const __CLC_DOUBLEN p2h = 7074237752028440.0 / 0x1.0p+52; + const __CLC_DOUBLEN p2t = 4967757600021510.0 / 0x1.0p+106; + + __CLC_DOUBLEN rhi = f3 * p2h; + __CLC_DOUBLEN rlo = + __clc_fma(f2, p2h, __clc_fma(f3, p2t, __clc_fma(f3, p2h, -rhi))); + + *r = rhi + rlo; + *rr = rlo - (*r - rhi); + *regn = i & 0x3; +} diff --git a/libclc/clc/lib/generic/math/clc_tables.cl b/libclc/clc/lib/generic/math/clc_tables.cl index 7ee744589ae1d..05053aea7bc12 100644 --- a/libclc/clc/lib/generic/math/clc_tables.cl +++ b/libclc/clc/lib/generic/math/clc_tables.cl @@ -1648,4 +1648,66 @@ CLC_TABLE_FUNCTION(double, SINH_TBL_TAIL, sinh_tbl_tail); CLC_TABLE_FUNCTION(double, COSH_TBL_HEAD, cosh_tbl_head); CLC_TABLE_FUNCTION(double, COSH_TBL_TAIL, cosh_tbl_tail); +DECLARE_TABLE(uchar, PIBITS_TBL, ) = { + 224, 241, 27, 193, 12, 88, 33, 116, 53, 126, 196, 126, 237, 175, + 169, 75, 74, 41, 222, 231, 28, 244, 236, 197, 151, 175, 31, + 235, 158, 212, 181, 168, 127, 121, 154, 253, 24, 61, 221, 38, + 44, 159, 60, 251, 217, 180, 125, 180, 41, 104, 45, 70, 188, + 188, 63, 96, 22, 120, 255, 95, 226, 127, 236, 160, 228, 247, + 46, 126, 17, 114, 210, 231, 76, 13, 230, 88, 71, 230, 4, 249, + 125, 209, 154, 192, 113, 166, 19, 18, 237, 186, 212, 215, 8, + 162, 251, 156, 166, 196, 114, 172, 119, 248, 115, 72, 70, 39, + 168, 187, 36, 25, 128, 75, 55, 9, 233, 184, 145, 220, 134, 21, + 239, 122, 175, 142, 69, 249, 7, 65, 14, 241, 100, 86, 138, 109, + 3, 119, 211, 212, 71, 95, 157, 240, 167, 84, 16, 57, 185, 13, + 230, 139, 2, 0, 0, 0, 0, 0, 0, 0 +}; + +_CLC_DEF _CLC_OVERLOAD ulong TABLE_MANGLE(pibits_tbl)(int idx) { + return *(__constant ulong *)(PIBITS_TBL + idx); +} +_CLC_DEF _CLC_OVERLOAD ulong2 TABLE_MANGLE(pibits_tbl)(int2 idx) { + return (ulong2){*(__constant ulong *)(PIBITS_TBL + idx.s0), + *(__constant ulong *)(PIBITS_TBL + idx.s1)}; +} +_CLC_DEF _CLC_OVERLOAD ulong3 TABLE_MANGLE(pibits_tbl)(int3 idx) { + return (ulong3){*(__constant ulong *)(PIBITS_TBL + idx.s0), + *(__constant ulong *)(PIBITS_TBL + idx.s1), + *(__constant ulong *)(PIBITS_TBL + idx.s2)}; +} +_CLC_DEF _CLC_OVERLOAD ulong4 TABLE_MANGLE(pibits_tbl)(int4 idx) { + return (ulong4){*(__constant ulong *)(PIBITS_TBL + idx.s0), + *(__constant ulong *)(PIBITS_TBL + idx.s1), + *(__constant ulong *)(PIBITS_TBL + idx.s2), + *(__constant ulong *)(PIBITS_TBL + idx.s3)}; +} +_CLC_DEF _CLC_OVERLOAD ulong8 TABLE_MANGLE(pibits_tbl)(int8 idx) { + return (ulong8){*(__constant ulong *)(PIBITS_TBL + idx.s0), + *(__constant ulong *)(PIBITS_TBL + idx.s1), + *(__constant ulong *)(PIBITS_TBL + idx.s2), + *(__constant ulong *)(PIBITS_TBL + idx.s3), + *(__constant ulong *)(PIBITS_TBL + idx.s4), + *(__constant ulong *)(PIBITS_TBL + idx.s5), + *(__constant ulong *)(PIBITS_TBL + idx.s6), + *(__constant ulong *)(PIBITS_TBL + idx.s7)}; +} +_CLC_DEF _CLC_OVERLOAD ulong16 TABLE_MANGLE(pibits_tbl)(int16 idx) { + return (ulong16){*(__constant ulong *)(PIBITS_TBL + idx.s0), + *(__constant ulong *)(PIBITS_TBL + idx.s1), + *(__constant ulong *)(PIBITS_TBL + idx.s2), + *(__constant ulong *)(PIBITS_TBL + idx.s3), + *(__constant ulong *)(PIBITS_TBL + idx.s4), + *(__constant ulong *)(PIBITS_TBL + idx.s5), + *(__constant ulong *)(PIBITS_TBL + idx.s6), + *(__constant ulong *)(PIBITS_TBL + idx.s7), + *(__constant ulong *)(PIBITS_TBL + idx.s8), + *(__constant ulong *)(PIBITS_TBL + idx.s9), + *(__constant ulong *)(PIBITS_TBL + idx.sA), + *(__constant ulong *)(PIBITS_TBL + idx.sB), + *(__constant ulong *)(PIBITS_TBL + idx.sC), + *(__constant ulong *)(PIBITS_TBL + idx.sD), + *(__constant ulong *)(PIBITS_TBL + idx.sE), + *(__constant ulong *)(PIBITS_TBL + idx.sF)}; +} + #endif // cl_khr_fp64 diff --git a/libclc/clspv/lib/SOURCES b/libclc/clspv/lib/SOURCES index d2fea9d586287..437210d1922da 100644 --- a/libclc/clspv/lib/SOURCES +++ b/libclc/clspv/lib/SOURCES @@ -66,10 +66,8 @@ subnormal_config.cl ../../generic/lib/math/rootn.cl ../../generic/lib/math/sin.cl ../../generic/lib/math/sincos.cl -../../generic/lib/math/sincos_helpers.cl ../../generic/lib/math/sinh.cl ../../generic/lib/math/sinpi.cl -../../generic/lib/math/tables.cl ../../generic/lib/math/tan.cl ../../generic/lib/math/tanh.cl ../../generic/lib/math/tanpi.cl diff --git a/libclc/generic/include/clc/math/sincos.h b/libclc/generic/include/clc/math/sincos.h index fbd29c441e319..7426c23e88af9 100644 --- a/libclc/generic/include/clc/math/sincos.h +++ b/libclc/generic/include/clc/math/sincos.h @@ -6,5 +6,7 @@ // //===----------------------------------------------------------------------===// -#define __CLC_BODY <clc/math/sincos.inc> +#define __CLC_BODY <clc/math/unary_decl_with_ptr.inc> +#define __CLC_FUNCTION __clc_sincos #include <clc/math/gentype.inc> +#undef __CLC_FUNCTION diff --git a/libclc/generic/include/clc/math/sincos.inc b/libclc/generic/include/clc/math/sincos.inc deleted file mode 100644 index d6ec2fe94704e..0000000000000 --- a/libclc/generic/include/clc/math/sincos.inc +++ /dev/null @@ -1,14 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE sincos(__CLC_GENTYPE x, - global __CLC_GENTYPE *cosval); -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE sincos(__CLC_GENTYPE x, - local __CLC_GENTYPE *cosval); -_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE sincos(__CLC_GENTYPE x, - private __CLC_GENTYPE *cosval); diff --git a/libclc/generic/lib/SOURCES b/libclc/generic/lib/SOURCES index 781aaf16a5244..8c7565e3dd231 100644 --- a/libclc/generic/lib/SOURCES +++ b/libclc/generic/lib/SOURCES @@ -131,7 +131,6 @@ math/native_rsqrt.cl math/native_sin.cl math/native_sqrt.cl math/native_tan.cl -math/tables.cl math/nextafter.cl math/pow.cl math/pown.cl @@ -144,7 +143,6 @@ math/round.cl math/rsqrt.cl math/sin.cl math/sincos.cl -math/sincos_helpers.cl math/sinh.cl math/sinpi.cl math/sqrt.cl diff --git a/libclc/generic/lib/math/clc_tan.cl b/libclc/generic/lib/math/clc_tan.cl index 7e28e9ffed3b6..ce51c1031fa45 100644 --- a/libclc/generic/lib/math/clc_tan.cl +++ b/libclc/generic/lib/math/clc_tan.cl @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -#include "sincos_helpers.h" #include <clc/clc.h> #include <clc/clcmacro.h> #include <clc/math/clc_fabs.h> diff --git a/libclc/generic/lib/math/cos.cl b/libclc/generic/lib/math/cos.cl index 00ffa371ad8fc..5b97c6a6f379a 100644 --- a/libclc/generic/lib/math/cos.cl +++ b/libclc/generic/lib/math/cos.cl @@ -6,45 +6,9 @@ // //===----------------------------------------------------------------------===// -#include "sincos_helpers.h" #include <clc/clc.h> -#include <clc/clc_convert.h> -#include <clc/clcmacro.h> -#include <clc/math/clc_fabs.h> -#include <clc/math/clc_sincos_helpers.h> -#include <clc/math/math.h> -#include <clc/relational/clc_isinf.h> -#include <clc/relational/clc_isnan.h> -#include <clc/relational/clc_select.h> +#include <clc/math/clc_cos.h> -// FP32 and FP16 versions. -#define __CLC_BODY <cos.inc> +#define FUNCTION cos +#define __CLC_BODY <clc/shared/unary_def.inc> #include <clc/math/gentype.inc> - -#ifdef cl_khr_fp64 - -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -_CLC_OVERLOAD _CLC_DEF double cos(double x) { - x = fabs(x); - - double r, rr; - int regn; - - if (x < 0x1.0p+47) - __clc_remainder_piby2_medium(x, &r, &rr, ®n); - else - __clc_remainder_piby2_large(x, &r, &rr, ®n); - - double2 sc = __clc_sincos_piby4(r, rr); - sc.lo = -sc.lo; - - int2 c = as_int2(regn & 1 ? sc.lo : sc.hi); - c.hi ^= (regn > 1) << 31; - - return isnan(x) | isinf(x) ? as_double(QNANBITPATT_DP64) : as_double(c); -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, cos, double); - -#endif diff --git a/libclc/generic/lib/math/cos.inc b/libclc/generic/lib/math/cos.inc deleted file mode 100644 index 1db96710457dd..0000000000000 --- a/libclc/generic/lib/math/cos.inc +++ /dev/null @@ -1,34 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#if __CLC_FPSIZE == 32 - -_CLC_OVERLOAD _CLC_DEF __CLC_FLOATN cos(__CLC_FLOATN x) { - __CLC_FLOATN absx = __clc_fabs(x); - - __CLC_FLOATN r0, r1; - __CLC_INTN regn = __clc_argReductionS(&r0, &r1, absx); - - __CLC_FLOATN ss = -__clc_sinf_piby4(r0, r1); - __CLC_FLOATN cc = __clc_cosf_piby4(r0, r1); - - __CLC_FLOATN c = (regn & 1) != 0 ? ss : cc; - c = __CLC_AS_FLOATN(__CLC_AS_INTN(c) ^ ((regn > 1) << 31)); - - c = __clc_select(c, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isinf(x)); - - return c; -} - -#elif __CLC_FPSIZE == 16 - -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE cos(__CLC_GENTYPE x) { - return __CLC_CONVERT_GENTYPE(cos(__CLC_CONVERT_FLOATN(x))); -} - -#endif diff --git a/libclc/generic/lib/math/sin.cl b/libclc/generic/lib/math/sin.cl index f776805ad1cdb..76728cfb1c5e5 100644 --- a/libclc/generic/lib/math/sin.cl +++ b/libclc/generic/lib/math/sin.cl @@ -6,44 +6,9 @@ // //===----------------------------------------------------------------------===// -#include "sincos_helpers.h" #include <clc/clc.h> -#include <clc/clc_convert.h> -#include <clc/clcmacro.h> -#include <clc/math/clc_fabs.h> -#include <clc/math/clc_sincos_helpers.h> -#include <clc/math/math.h> -#include <clc/relational/clc_isinf.h> -#include <clc/relational/clc_isnan.h> -#include <clc/relational/clc_select.h> +#include <clc/math/clc_sin.h> -// FP32 and FP16 versions. -#define __CLC_BODY <sin.inc> +#define FUNCTION sin +#define __CLC_BODY <clc/shared/unary_def.inc> #include <clc/math/gentype.inc> - -#ifdef cl_khr_fp64 - -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -_CLC_OVERLOAD _CLC_DEF double sin(double x) { - double y = fabs(x); - - double r, rr; - int regn; - - if (y < 0x1.0p+47) - __clc_remainder_piby2_medium(y, &r, &rr, ®n); - else - __clc_remainder_piby2_large(y, &r, &rr, ®n); - - double2 sc = __clc_sincos_piby4(r, rr); - - int2 s = as_int2(regn & 1 ? sc.hi : sc.lo); - s.hi ^= ((regn > 1) << 31) ^ ((x < 0.0) << 31); - - return isinf(x) | isnan(x) ? as_double(QNANBITPATT_DP64) : as_double(s); -} - -_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, sin, double); - -#endif diff --git a/libclc/generic/lib/math/sin.inc b/libclc/generic/lib/math/sin.inc deleted file mode 100644 index dbc99116dfa7d..0000000000000 --- a/libclc/generic/lib/math/sin.inc +++ /dev/null @@ -1,38 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#if __CLC_FPSIZE == 32 - -_CLC_OVERLOAD _CLC_DEF __CLC_FLOATN sin(__CLC_FLOATN x) { - __CLC_FLOATN absx = __clc_fabs(x); - - __CLC_FLOATN r0, r1; - __CLC_INTN regn = __clc_argReductionS(&r0, &r1, absx); - - __CLC_FLOATN ss = __clc_sinf_piby4(r0, r1); - __CLC_FLOATN cc = __clc_cosf_piby4(r0, r1); - - __CLC_FLOATN s = (regn & 1) != 0 ? cc : ss; - s = __CLC_AS_FLOATN(__CLC_AS_INTN(s) ^ ((regn > 1) << 31) ^ - (__CLC_AS_INTN(x) ^ __CLC_AS_INTN(absx))); - - s = __clc_select(s, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isinf(x)); - - // Subnormals - s = x == 0.0f ? x : s; - - return s; -} - -#elif __CLC_FPSIZE == 16 - -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE sin(__CLC_GENTYPE x) { - return __CLC_CONVERT_GENTYPE(sin(__CLC_CONVERT_FLOATN(x))); -} - -#endif diff --git a/libclc/generic/lib/math/sincos.cl b/libclc/generic/lib/math/sincos.cl index fc0e4d350c122..25e620cf2a38e 100644 --- a/libclc/generic/lib/math/sincos.cl +++ b/libclc/generic/lib/math/sincos.cl @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// #include <clc/clc.h> +#include <clc/math/clc_sincos.h> -#define __CLC_BODY <sincos.inc> +#define FUNCTION sincos +#define __CLC_BODY <clc/math/unary_def_with_ptr.inc> #include <clc/math/gentype.inc> diff --git a/libclc/generic/lib/math/sincos_helpers.cl b/libclc/generic/lib/math/sincos_helpers.cl deleted file mode 100644 index 651cd11ccf016..0000000000000 --- a/libclc/generic/lib/math/sincos_helpers.cl +++ /dev/null @@ -1,285 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "sincos_helpers.h" -#include <clc/clc.h> -#include <clc/integer/clc_clz.h> -#include <clc/integer/clc_mul_hi.h> -#include <clc/math/clc_fma.h> -#include <clc/math/clc_mad.h> -#include <clc/math/clc_trunc.h> -#include <clc/math/math.h> -#include <clc/math/tables.h> -#include <clc/shared/clc_max.h> - -#ifdef cl_khr_fp64 - -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -#define bytealign(src0, src1, src2) \ - ((uint)(((((long)(src0)) << 32) | (long)(src1)) >> (((src2) & 3) * 8))) - -// Reduction for medium sized arguments -_CLC_DEF void __clc_remainder_piby2_medium(double x, private double *r, - private double *rr, - private int *regn) { - // How many pi/2 is x a multiple of? - const double two_by_pi = 0x1.45f306dc9c883p-1; - double dnpi2 = __clc_trunc(__clc_fma(x, two_by_pi, 0.5)); - - const double piby2_h = -7074237752028440.0 / 0x1.0p+52; - const double piby2_m = -2483878800010755.0 / 0x1.0p+105; - const double piby2_t = -3956492004828932.0 / 0x1.0p+158; - - // Compute product of npi2 with 159 bits of 2/pi - double p_hh = piby2_h * dnpi2; - double p_ht = __clc_fma(piby2_h, dnpi2, -p_hh); - double p_mh = piby2_m * dnpi2; - double p_mt = __clc_fma(piby2_m, dnpi2, -p_mh); - double p_th = piby2_t * dnpi2; - double p_tt = __clc_fma(piby2_t, dnpi2, -p_th); - - // Reduce to 159 bits - double ph = p_hh; - double pm = p_ht + p_mh; - double t = p_mh - (pm - p_ht); - double pt = p_th + t + p_mt + p_tt; - t = ph + pm; - pm = pm - (t - ph); - ph = t; - t = pm + pt; - pt = pt - (t - pm); - pm = t; - - // Subtract from x - t = x + ph; - double qh = t + pm; - double qt = pm - (qh - t) + pt; - - *r = qh; - *rr = qt; - *regn = (int)(long)dnpi2 & 0x3; -} - -// Given positive argument x, reduce it to the range [-pi/4,pi/4] using -// extra precision, and return the result in r, rr. -// Return value "regn" tells how many lots of pi/2 were subtracted -// from x to put it in the range [-pi/4,pi/4], mod 4. - -_CLC_DEF void __clc_remainder_piby2_large(double x, private double *r, - private double *rr, - private int *regn) { - - long ux = as_long(x); - int e = (int)(ux >> 52) - 1023; - int i = __clc_max(23, (e >> 3) + 17); - int j = 150 - i; - int j16 = j & ~0xf; - double fract_temp; - - // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary - // byte boundary - uint4 q0 = USE_TABLE(pibits_tbl, j16); - uint4 q1 = USE_TABLE(pibits_tbl, (j16 + 16)); - uint4 q2 = USE_TABLE(pibits_tbl, (j16 + 32)); - - int k = (j >> 2) & 0x3; - int4 c = (int4)k == (int4)(0, 1, 2, 3); - - uint u0, u1, u2, u3, u4, u5, u6; - - u0 = c.s1 ? q0.s1 : q0.s0; - u0 = c.s2 ? q0.s2 : u0; - u0 = c.s3 ? q0.s3 : u0; - - u1 = c.s1 ? q0.s2 : q0.s1; - u1 = c.s2 ? q0.s3 : u1; - u1 = c.s3 ? q1.s0 : u1; - - u2 = c.s1 ? q0.s3 : q0.s2; - u2 = c.s2 ? q1.s0 : u2; - u2 = c.s3 ? q1.s1 : u2; - - u3 = c.s1 ? q1.s0 : q0.s3; - u3 = c.s2 ? q1.s1 : u3; - u3 = c.s3 ? q1.s2 : u3; - - u4 = c.s1 ? q1.s1 : q1.s0; - u4 = c.s2 ? q1.s2 : u4; - u4 = c.s3 ? q1.s3 : u4; - - u5 = c.s1 ? q1.s2 : q1.s1; - u5 = c.s2 ? q1.s3 : u5; - u5 = c.s3 ? q2.s0 : u5; - - u6 = c.s1 ? q1.s3 : q1.s2; - u6 = c.s2 ? q2.s0 : u6; - u6 = c.s3 ? q2.s1 : u6; - - uint v0 = bytealign(u1, u0, j); - uint v1 = bytealign(u2, u1, j); - uint v2 = bytealign(u3, u2, j); - uint v3 = bytealign(u4, u3, j); - uint v4 = bytealign(u5, u4, j); - uint v5 = bytealign(u6, u5, j); - - // Place those 192 bits in 4 48-bit doubles along with correct exponent - // If i > 1018 we would get subnormals so we scale p up and x down to get the - // same product - i = 2 + 8 * i; - x *= i > 1018 ? 0x1.0p-136 : 1.0; - i -= i > 1018 ? 136 : 0; - - uint ua = (uint)(1023 + 52 - i) << 20; - double a = as_double((uint2)(0, ua)); - double p0 = as_double((uint2)(v0, ua | (v1 & 0xffffU))) - a; - ua += 0x03000000U; - a = as_double((uint2)(0, ua)); - double p1 = as_double((uint2)((v2 << 16) | (v1 >> 16), ua | (v2 >> 16))) - a; - ua += 0x03000000U; - a = as_double((uint2)(0, ua)); - double p2 = as_double((uint2)(v3, ua | (v4 & 0xffffU))) - a; - ua += 0x03000000U; - a = as_double((uint2)(0, ua)); - double p3 = as_double((uint2)((v5 << 16) | (v4 >> 16), ua | (v5 >> 16))) - a; - - // Exact multiply - double f0h = p0 * x; - double f0l = __clc_fma(p0, x, -f0h); - double f1h = p1 * x; - double f1l = __clc_fma(p1, x, -f1h); - double f2h = p2 * x; - double f2l = __clc_fma(p2, x, -f2h); - double f3h = p3 * x; - double f3l = __clc_fma(p3, x, -f3h); - - // Accumulate product into 4 doubles - double s, t; - - double f3 = f3h + f2h; - t = f2h - (f3 - f3h); - s = f3l + t; - t = t - (s - f3l); - - double f2 = s + f1h; - t = f1h - (f2 - s) + t; - s = f2l + t; - t = t - (s - f2l); - - double f1 = s + f0h; - t = f0h - (f1 - s) + t; - s = f1l + t; - - double f0 = s + f0l; - - // Strip off unwanted large integer bits - f3 = 0x1.0p+10 * fract(f3 * 0x1.0p-10, &fract_temp); - f3 += f3 + f2 < 0.0 ? 0x1.0p+10 : 0.0; - - // Compute least significant integer bits - t = f3 + f2; - double di = t - fract(t, &fract_temp); - i = (float)di; - - // Shift out remaining integer part - f3 -= di; - s = f3 + f2; - t = f2 - (s - f3); - f3 = s; - f2 = t; - s = f2 + f1; - t = f1 - (s - f2); - f2 = s; - f1 = t; - f1 += f0; - - // Subtract 1 if fraction is >= 0.5, and update regn - int g = f3 >= 0.5; - i += g; - f3 -= (float)g; - - // Shift up bits - s = f3 + f2; - t = f2 - (s - f3); - f3 = s; - f2 = t + f1; - - // Multiply precise fraction by pi/2 to get radians - const double p2h = 7074237752028440.0 / 0x1.0p+52; - const double p2t = 4967757600021510.0 / 0x1.0p+106; - - double rhi = f3 * p2h; - double rlo = __clc_fma(f2, p2h, __clc_fma(f3, p2t, __clc_fma(f3, p2h, -rhi))); - - *r = rhi + rlo; - *rr = rlo - (*r - rhi); - *regn = i & 0x3; -} - -_CLC_DEF double2 __clc_sincos_piby4(double x, double xx) { - // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ... - // = x * (1 - x^2/3! + x^4/5! - x^6/7! ... - // = x * f(w) - // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ... - // We use a minimax approximation of (f(w) - 1) / w - // because this produces an expansion in even powers of x. - // If xx (the tail of x) is non-zero, we add a correction - // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx) - // is an approximation to cos(x)*sin(xx) valid because - // xx is tiny relative to x. - - // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ... - // = f(w) - // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ... - // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w) - // because this produces an expansion in even powers of x. - // If xx (the tail of x) is non-zero, we subtract a correction - // term g(x,xx) = x*xx to the result, where g(x,xx) - // is an approximation to sin(x)*sin(xx) valid because - // xx is tiny relative to x. - - const double sc1 = -0.166666666666666646259241729; - const double sc2 = 0.833333333333095043065222816e-2; - const double sc3 = -0.19841269836761125688538679e-3; - const double sc4 = 0.275573161037288022676895908448e-5; - const double sc5 = -0.25051132068021699772257377197e-7; - const double sc6 = 0.159181443044859136852668200e-9; - - const double cc1 = 0.41666666666666665390037e-1; - const double cc2 = -0.13888888888887398280412e-2; - const double cc3 = 0.248015872987670414957399e-4; - const double cc4 = -0.275573172723441909470836e-6; - const double cc5 = 0.208761463822329611076335e-8; - const double cc6 = -0.113826398067944859590880e-10; - - double x2 = x * x; - double x3 = x2 * x; - double r = 0.5 * x2; - double t = 1.0 - r; - - double sp = __clc_fma( - __clc_fma(__clc_fma(__clc_fma(sc6, x2, sc5), x2, sc4), x2, sc3), x2, sc2); - - double cp = - t + - __clc_fma(__clc_fma(__clc_fma(__clc_fma(__clc_fma(__clc_fma(cc6, x2, cc5), - x2, cc4), - x2, cc3), - x2, cc2), - x2, cc1), - x2 * x2, __clc_fma(x, xx, (1.0 - t) - r)); - - double2 ret; - ret.lo = - x - __clc_fma(-x3, sc1, __clc_fma(__clc_fma(-x3, sp, 0.5 * xx), x2, -xx)); - ret.hi = cp; - - return ret; -} - -#endif diff --git a/libclc/generic/lib/math/sincos_helpers.h b/libclc/generic/lib/math/sincos_helpers.h deleted file mode 100644 index 11cb93f34850d..0000000000000 --- a/libclc/generic/lib/math/sincos_helpers.h +++ /dev/null @@ -1,24 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include <clc/clcfunc.h> -#include <clc/clctypes.h> - -#ifdef cl_khr_fp64 - -#pragma OPENCL EXTENSION cl_khr_fp64 : enable - -_CLC_DECL void __clc_remainder_piby2_medium(double x, private double *r, - private double *rr, - private int *regn); -_CLC_DECL void __clc_remainder_piby2_large(double x, private double *r, - private double *rr, - private int *regn); -_CLC_DECL double2 __clc_sincos_piby4(double x, double xx); - -#endif diff --git a/libclc/generic/lib/math/tables.cl b/libclc/generic/lib/math/tables.cl deleted file mode 100644 index 1bda480a9fa72..0000000000000 --- a/libclc/generic/lib/math/tables.cl +++ /dev/null @@ -1,30 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include <clc/clc.h> - -#include <clc/math/tables.h> - -DECLARE_TABLE(uchar, PIBITS_TBL, ) = { - 224, 241, 27, 193, 12, 88, 33, 116, 53, 126, 196, 126, 237, 175, - 169, 75, 74, 41, 222, 231, 28, 244, 236, 197, 151, 175, 31, - 235, 158, 212, 181, 168, 127, 121, 154, 253, 24, 61, 221, 38, - 44, 159, 60, 251, 217, 180, 125, 180, 41, 104, 45, 70, 188, - 188, 63, 96, 22, 120, 255, 95, 226, 127, 236, 160, 228, 247, - 46, 126, 17, 114, 210, 231, 76, 13, 230, 88, 71, 230, 4, 249, - 125, 209, 154, 192, 113, 166, 19, 18, 237, 186, 212, 215, 8, - 162, 251, 156, 166, 196, 114, 172, 119, 248, 115, 72, 70, 39, - 168, 187, 36, 25, 128, 75, 55, 9, 233, 184, 145, 220, 134, 21, - 239, 122, 175, 142, 69, 249, 7, 65, 14, 241, 100, 86, 138, 109, - 3, 119, 211, 212, 71, 95, 157, 240, 167, 84, 16, 57, 185, 13, - 230, 139, 2, 0, 0, 0, 0, 0, 0, 0 -}; - -uint4 TABLE_MANGLE(pibits_tbl)(size_t idx) { - return *(__constant uint4 *)(PIBITS_TBL + idx); -} diff --git a/libclc/spirv/lib/SOURCES b/libclc/spirv/lib/SOURCES index 5446fe13a6d93..20e052253d977 100644 --- a/libclc/spirv/lib/SOURCES +++ b/libclc/spirv/lib/SOURCES @@ -55,7 +55,6 @@ math/fma.cl ../../generic/lib/math/log2.cl ../../generic/lib/math/logb.cl ../../generic/lib/math/modf.cl -../../generic/lib/math/tables.cl ../../generic/lib/math/pow.cl ../../generic/lib/math/pown.cl ../../generic/lib/math/powr.cl @@ -64,7 +63,6 @@ math/fma.cl ../../generic/lib/math/rootn.cl ../../generic/lib/math/sin.cl ../../generic/lib/math/sincos.cl -../../generic/lib/math/sincos_helpers.cl ../../generic/lib/math/sinh.cl ../../generic/lib/math/sinpi.cl ../../generic/lib/math/clc_tan.cl >From 212ae34f61507cc71dcd20f21af7a02e8e62593a Mon Sep 17 00:00:00 2001 From: Fraser Cormack <fra...@codeplay.com> Date: Thu, 1 May 2025 17:48:20 +0100 Subject: [PATCH 2/2] update table vars --- .../generic/math/clc_sincos_helpers_fp64.inc | 69 ++++++++++--------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc index 3a8edb378cae2..9b5776d9b05e9 100644 --- a/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc +++ b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc @@ -59,7 +59,7 @@ __clc_remainder_piby2_large(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r, private __CLC_INTN *regn) { __CLC_LONGN ux = __CLC_AS_LONGN(x); - __CLC_INTN e = __CLC_CONVERT_INTN(ux >> 52) - 1023; + __CLC_INTN e = __CLC_CONVERT_INTN(ux >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64; __CLC_INTN i = __clc_max(23, (e >> 3) + 17); __CLC_INTN j = 150 - i; __CLC_INTN j16 = j & ~0xf; @@ -68,18 +68,10 @@ __clc_remainder_piby2_large(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r, // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary // byte boundary __CLC_ULONGN q0 = USE_TABLE(pibits_tbl, j16); - __CLC_ULONGN q3 = USE_TABLE(pibits_tbl, (j16 + 8)); - __CLC_ULONGN q1 = USE_TABLE(pibits_tbl, (j16 + 16)); - __CLC_ULONGN q4 = USE_TABLE(pibits_tbl, (j16 + 24)); - __CLC_ULONGN q2 = USE_TABLE(pibits_tbl, (j16 + 32)); - - __CLC_INTN k = (j >> 2) & 0x3; - __CLC_INTN c0 = k == 0; - __CLC_INTN c1 = k == 1; - __CLC_INTN c2 = k == 2; - __CLC_INTN c3 = k == 3; - - __CLC_UINTN u0, u1, u2, u3, u4, u5, u6; + __CLC_ULONGN q1 = USE_TABLE(pibits_tbl, (j16 + 8)); + __CLC_ULONGN q2 = USE_TABLE(pibits_tbl, (j16 + 16)); + __CLC_ULONGN q3 = USE_TABLE(pibits_tbl, (j16 + 24)); + __CLC_ULONGN q4 = USE_TABLE(pibits_tbl, (j16 + 32)); __CLC_UINTN q0s0 = __CLC_CONVERT_UINTN(q0); __CLC_UINTN q0s1 = __CLC_CONVERT_UINTN(q0 >> 32); @@ -96,33 +88,41 @@ __clc_remainder_piby2_large(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r, __CLC_UINTN q4s0 = __CLC_CONVERT_UINTN(q4); __CLC_UINTN q4s1 = __CLC_CONVERT_UINTN(q4 >> 32); + __CLC_INTN k = (j >> 2) & 0x3; + __CLC_INTN c0 = k == 0; + __CLC_INTN c1 = k == 1; + __CLC_INTN c2 = k == 2; + __CLC_INTN c3 = k == 3; + + __CLC_UINTN u0, u1, u2, u3, u4, u5, u6; + u0 = c1 ? q0s1 : q0s0; - u0 = c2 ? q3s0 : u0; - u0 = c3 ? q3s1 : u0; + u0 = c2 ? q1s0 : u0; + u0 = c3 ? q1s1 : u0; - u1 = c1 ? q3s0 : q0s1; - u1 = c2 ? q3s1 : u1; - u1 = c3 ? q1s0 : u1; + u1 = c1 ? q1s0 : q0s1; + u1 = c2 ? q1s1 : u1; + u1 = c3 ? q2s0 : u1; - u2 = c1 ? q3s1 : q3s0; - u2 = c2 ? q1s0 : u2; - u2 = c3 ? q1s1 : u2; + u2 = c1 ? q1s1 : q1s0; + u2 = c2 ? q2s0 : u2; + u2 = c3 ? q2s1 : u2; - u3 = c1 ? q1s0 : q3s1; - u3 = c2 ? q1s1 : u3; - u3 = c3 ? q4s0 : u3; + u3 = c1 ? q2s0 : q1s1; + u3 = c2 ? q2s1 : u3; + u3 = c3 ? q3s0 : u3; - u4 = c1 ? q1s1 : q1s0; - u4 = c2 ? q4s0 : u4; - u4 = c3 ? q4s1 : u4; + u4 = c1 ? q2s1 : q2s0; + u4 = c2 ? q3s0 : u4; + u4 = c3 ? q3s1 : u4; - u5 = c1 ? q4s0 : q1s1; - u5 = c2 ? q4s1 : u5; - u5 = c3 ? q2s0 : u5; + u5 = c1 ? q3s0 : q2s1; + u5 = c2 ? q3s1 : u5; + u5 = c3 ? q4s0 : u5; - u6 = c1 ? q4s1 : q4s0; - u6 = c2 ? q2s0 : u6; - u6 = c3 ? q2s1 : u6; + u6 = c1 ? q3s1 : q3s0; + u6 = c2 ? q4s0 : u6; + u6 = c3 ? q4s1 : u6; __CLC_UINTN v0 = bytealign(u1, u0, j); __CLC_UINTN v1 = bytealign(u2, u1, j); @@ -142,7 +142,8 @@ __clc_remainder_piby2_large(__CLC_DOUBLEN x, private __CLC_DOUBLEN *r, __CLC_AS_DOUBLEN(__CLC_CONVERT_ULONGN((x)) & 0xFFFFFFFF | \ __CLC_CONVERT_ULONGN((y)) << 32) - __CLC_UINTN ua = __CLC_CONVERT_UINTN(1023 + 52 - i) << 20; + __CLC_UINTN ua = __CLC_CONVERT_UINTN(EXPBIAS_DP64 + EXPSHIFTBITS_DP64 - i) + << 20; __CLC_DOUBLEN a = doublen_lohi((__CLC_ULONGN)0, ua); __CLC_DOUBLEN p0 = doublen_lohi(v0, ua | (v1 & 0xffffU)) - a; ua += 0x03000000U; _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits