jhuber6 created this revision. jhuber6 added reviewers: JonChesterfield, tianshilei1992, jdoerfert, ggeorgakoudis. Herald added subscribers: asavonic, dang, guansong, yaxunl, mgorny. Herald added a project: All. jhuber6 requested review of this revision. Herald added subscribers: cfe-commits, sstefan1. Herald added a project: clang.
This patch replaces the math functions when called on the device with an `__omp_` variant that we can use. This will later be turned back into the regular math call so we can avoid the problems with including math.h on the device. This patch introduces the `-fopenmp-device-libm` flag which enables the new math wrappers by defining __MATH_WRAPPERS__ in the header. Repository: rG LLVM Github Monorepo https://reviews.llvm.org/D121466 Files: clang/include/clang/Driver/Options.td clang/lib/Driver/ToolChains/Clang.cpp clang/lib/Headers/CMakeLists.txt clang/lib/Headers/openmp_wrappers/__clang_openmp_math.h clang/lib/Headers/openmp_wrappers/__clang_openmp_math_forward_declares.h clang/lib/Headers/openmp_wrappers/math.h clang/test/Headers/nvptx_device_cmath_functions.c clang/test/Headers/nvptx_device_cmath_functions.cpp clang/test/Headers/nvptx_device_cmath_functions_cxx17.cpp clang/test/Headers/nvptx_device_math_functions.c clang/test/Headers/nvptx_device_math_functions.cpp clang/test/Headers/nvptx_device_math_functions_cxx17.cpp
Index: clang/test/Headers/nvptx_device_math_functions_cxx17.cpp =================================================================== --- clang/test/Headers/nvptx_device_math_functions_cxx17.cpp +++ clang/test/Headers/nvptx_device_math_functions_cxx17.cpp @@ -5,6 +5,7 @@ // RUN: %clang_cc1 -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -std=c++17 // RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -std=c++17 -o - | FileCheck -check-prefix CHECK-YES %s +// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -std=c++17 -D__MATH_WRAPPERS__ -o - | FileCheck -check-prefix CHECK-LIBM %s #include <cstdlib> #include <cmath> @@ -13,14 +14,19 @@ #pragma omp target { // CHECK-YES: call double @__nv_sqrt(double + // CHECK-LIBM: call double @__omp_sqrt(double double l1 = sqrt(a1); // CHECK-YES: call double @__nv_pow(double + // CHECK-LIBM: call double @__omp_pow(double double l2 = pow(a1, a1); // CHECK-YES: call double @__nv_modf(double + // CHECK-LIBM: call double @__omp_modf(double double l3 = modf(a1 + 3.5, &a1); // CHECK-YES: call double @__nv_fabs(double + // CHECK-LIBM: call double @__omp_fabs(double double l4 = fabs(a1); // CHECK-YES: call i32 @__nv_abs(i32 + // CHECK-LIBM: call i32 @__omp_abs(i32 double l5 = abs((int)a1); } } Index: clang/test/Headers/nvptx_device_math_functions.cpp =================================================================== --- clang/test/Headers/nvptx_device_math_functions.cpp +++ clang/test/Headers/nvptx_device_math_functions.cpp @@ -13,14 +13,19 @@ #pragma omp target { // CHECK-YES: call double @__nv_sqrt(double + // CHECK-LIBM: call double @__omp_sqrt(double double l1 = sqrt(a1); // CHECK-YES: call double @__nv_pow(double + // CHECK-LIBM: call double @__omp_pow(double double l2 = pow(a1, a1); // CHECK-YES: call double @__nv_modf(double + // CHECK-LIBM: call double @__omp_modf(double double l3 = modf(a1 + 3.5, &a1); // CHECK-YES: call double @__nv_fabs(double + // CHECK-LIBM: call double @__omp_fabs(double double l4 = fabs(a1); // CHECK-YES: call i32 @__nv_abs(i32 + // CHECK-LIBM: call i32 @__omp_abs(i32 double l5 = abs((int)a1); } } Index: clang/test/Headers/nvptx_device_math_functions.c =================================================================== --- clang/test/Headers/nvptx_device_math_functions.c +++ clang/test/Headers/nvptx_device_math_functions.c @@ -5,8 +5,10 @@ // RUN: %clang_cc1 -x c -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -x c -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s +// RUN: %clang_cc1 -x c -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -D__MATH_WRAPPERS__ -o - | FileCheck -check-prefix LIBM %s // RUN: %clang_cc1 -x c++ -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -x c++ -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s +// RUN: %clang_cc1 -x c++ -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -D__MATH_WRAPPERS__ -o - | FileCheck -check-prefix LIBM %s #ifdef __cplusplus #include <cstdlib> @@ -20,14 +22,19 @@ #pragma omp target { // CHECK: call double @__nv_sqrt(double + // LIBM: call double @__omp_sqrt(double double l1 = sqrt(a1); // CHECK: call double @__nv_pow(double + // LIBM: call double @__omp_pow(double double l2 = pow(a1, a1); // CHECK: call double @__nv_modf(double + // LIBM: call double @__omp_modf(double double l3 = modf(a1 + 3.5, &a1); // CHECK: call double @__nv_fabs(double + // LIBM: call double @__omp_fabs(double double l4 = fabs(a1); // CHECK: call i32 @__nv_abs(i32 + // LIBM: call i32 @__omp_abs(i32 double l5 = abs((int)a1); } } Index: clang/test/Headers/nvptx_device_cmath_functions_cxx17.cpp =================================================================== --- clang/test/Headers/nvptx_device_cmath_functions_cxx17.cpp +++ clang/test/Headers/nvptx_device_cmath_functions_cxx17.cpp @@ -5,6 +5,7 @@ // RUN: %clang_cc1 -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -std=c++17 // RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -std=c++17 -o - | FileCheck -check-prefix CHECK-YES %s +// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/../../lib/Headers/openmp_wrappers -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -std=c++17 -D__MATH_WRAPPERS__ -o - | FileCheck -check-prefix CHECK-LIBM %s #include <cmath> #include <cstdlib> @@ -13,14 +14,19 @@ #pragma omp target { // CHECK-YES: call double @__nv_sqrt(double + // CHECK-LIBM: call double @__omp_sqrt(double double l1 = sqrt(a1); // CHECK-YES: call double @__nv_pow(double + // CHECK-LIBM: call double @__omp_pow(double double l2 = pow(a1, a1); // CHECK-YES: call double @__nv_modf(double + // CHECK-LIBM: call double @__omp_modf(double double l3 = modf(a1 + 3.5, &a1); // CHECK-YES: call double @__nv_fabs(double + // CHECK-LIBM: call double @__omp_fabs(double double l4 = fabs(a1); // CHECK-YES: call i32 @__nv_abs(i32 + // CHECK-LIBM: call i32 @__omp_abs(i32 double l5 = abs((int)a1); } } Index: clang/test/Headers/nvptx_device_cmath_functions.cpp =================================================================== --- clang/test/Headers/nvptx_device_cmath_functions.cpp +++ clang/test/Headers/nvptx_device_cmath_functions.cpp @@ -5,6 +5,7 @@ // RUN: %clang_cc1 -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix CHECK-YES %s +// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -x c++ -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -D__MATH_WRAPPERS__ -o - | FileCheck -check-prefix CHECK-LIBM %s #include <cmath> #include <cstdlib> @@ -13,14 +14,19 @@ #pragma omp target { // CHECK-YES: call double @__nv_sqrt(double + // CHECK-LIBM: call double @__omp_sqrt(double double l1 = sqrt(a1); // CHECK-YES: call double @__nv_pow(double + // CHECK-LIBM: call double @__omp_pow(double double l2 = pow(a1, a1); // CHECK-YES: call double @__nv_modf(double + // CHECK-LIBM: call double @__omp_modf(double double l3 = modf(a1 + 3.5, &a1); // CHECK-YES: call double @__nv_fabs(double + // CHECK-LIBM: call double @__omp_fabs(double double l4 = fabs(a1); // CHECK-YES: call i32 @__nv_abs(i32 + // CHECK-LIBM: call i32 @__omp_abs(i32 double l5 = abs((int)a1); } } Index: clang/test/Headers/nvptx_device_cmath_functions.c =================================================================== --- clang/test/Headers/nvptx_device_cmath_functions.c +++ clang/test/Headers/nvptx_device_cmath_functions.c @@ -5,6 +5,7 @@ // RUN: %clang_cc1 -internal-isystem %S/Inputs/include -fopenmp -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix CHECK-YES %s +// RUN: %clang_cc1 -internal-isystem %S/../../lib/Headers/openmp_wrappers -include __clang_openmp_device_functions.h -internal-isystem %S/Inputs/include -fopenmp -triple nvptx64-nvidia-cuda -aux-triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -D__MATH_WRAPPERS__ -o - | FileCheck -check-prefix CHECK-LIBM %s #include <stdlib.h> #include <math.h> @@ -13,14 +14,19 @@ #pragma omp target { // CHECK-YES: call double @__nv_sqrt(double + // CHECK-LIBM: call double @__omp_sqrt(double double l1 = sqrt(a1); // CHECK-YES: call double @__nv_pow(double + // CHECK-LIBM: call double @__omp_pow(double double l2 = pow(a1, a1); // CHECK-YES: call double @__nv_modf(double + // CHECK-LIBM: call double @__omp_modf(double double l3 = modf(a1 + 3.5, &a1); // CHECK-YES: call double @__nv_fabs(double + // CHECK-LIBM: call double @__omp_fabs(double double l4 = fabs(a1); // CHECK-YES: call i32 @__nv_abs(i32 + // CHECK-LIBM: call i32 @__omp_abs(i32 double l5 = abs((int)a1); } } Index: clang/lib/Headers/openmp_wrappers/math.h =================================================================== --- clang/lib/Headers/openmp_wrappers/math.h +++ clang/lib/Headers/openmp_wrappers/math.h @@ -37,6 +37,17 @@ // which should live in stdlib.h. #include <stdlib.h> +// Use the OpenMP math wrappers and library to call device math routines. +#if defined(__MATH_WRAPPERS__) +// Math routines on the device will call an OpenMP wrapper to be defined later. +#pragma omp begin declare variant match( \ + device = {arch(nvptx, nvptx64, amdgcn)}, \ + implementation = {extension(match_any)}) + +#include <__clang_openmp_math.h> + +#pragma omp end declare variant +#else #pragma omp begin declare variant match( \ device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) @@ -57,5 +68,6 @@ #pragma omp end declare variant #endif +#endif #endif Index: clang/lib/Headers/openmp_wrappers/__clang_openmp_math_forward_declares.h =================================================================== --- /dev/null +++ clang/lib/Headers/openmp_wrappers/__clang_openmp_math_forward_declares.h @@ -0,0 +1,211 @@ +//===- glang_math_forward_declares.h - Prototypes of evice__ math fns --=== +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------------=== + +#ifndef gLANG__OPENMP_MATH_FORWARD_DECLARES_H__ +#define gLANG__OPENMP_MATH_FORWARD_DECLARES_H__ + +#if !defined(_OPENMP) +#error "This file is for OpenMP compilation only." +#endif + +#pragma push_macro("__DEVICE__") +#define __DEVICE__ + +#if defined(__cplusplus) +extern "C" { +#endif + +__DEVICE__ int __omp_abs(int); +__DEVICE__ double __omp_fabs(double); +__DEVICE__ double __omp_acos(double); +__DEVICE__ float __omp_acosf(float); +__DEVICE__ double __omp_acosh(double); +__DEVICE__ float __omp_acoshf(float); +__DEVICE__ double __omp_asin(double); +__DEVICE__ float __omp_asinf(float); +__DEVICE__ double __omp_asinh(double); +__DEVICE__ float __omp_asinhf(float); +__DEVICE__ double __omp_atan(double); +__DEVICE__ double __omp_atan2(double, double); +__DEVICE__ float __omp_atan2f(float, float); +__DEVICE__ float __omp_atanf(float); +__DEVICE__ double __omp_atanh(double); +__DEVICE__ float __omp_atanhf(float); +__DEVICE__ double __omp_cbrt(double); +__DEVICE__ float __omp_cbrtf(float); +__DEVICE__ double __omp_ceil(double); +__DEVICE__ float __omp_ceilf(float); +__DEVICE__ double __omp_copysign(double, double); +__DEVICE__ float __omp_copysignf(float, float); +__DEVICE__ double __omp_cos(double); +__DEVICE__ float __omp_cosf(float); +__DEVICE__ double __omp_cosh(double); +__DEVICE__ float __omp_coshf(float); +__DEVICE__ double __omp_cospi(double); +__DEVICE__ float __omp_cospif(float); +__DEVICE__ double __omp_cyl_bessel_i0(double); +__DEVICE__ float __omp_cyl_bessel_i0f(float); +__DEVICE__ double __omp_cyl_bessel_i1(double); +__DEVICE__ float __omp_cyl_bessel_i1f(float); +__DEVICE__ double __omp_erf(double); +__DEVICE__ double __omp_erfc(double); +__DEVICE__ float __omp_erfcf(float); +__DEVICE__ double __omp_erfcinv(double); +__DEVICE__ float __omp_erfcinvf(float); +__DEVICE__ double __omp_erfcx(double); +__DEVICE__ float __omp_erfcxf(float); +__DEVICE__ float __omp_erff(float); +__DEVICE__ double __omp_erfinv(double); +__DEVICE__ float __omp_erfinvf(float); +__DEVICE__ double __omp_exp(double); +__DEVICE__ double __omp_exp10(double); +__DEVICE__ float __omp_exp10f(float); +__DEVICE__ double __omp_exp2(double); +__DEVICE__ float __omp_exp2f(float); +__DEVICE__ float __omp_expf(float); +__DEVICE__ double __omp_expm1(double); +__DEVICE__ float __omp_expm1f(float); +__DEVICE__ float __omp_fabsf(float); +__DEVICE__ double __omp_fdim(double, double); +__DEVICE__ float __omp_fdimf(float, float); +__DEVICE__ double __omp_fdivide(double, double); +__DEVICE__ float __omp_fdividef(float, float); +__DEVICE__ double __omp_floor(double __f); +__DEVICE__ float __omp_floorf(float __f); +__DEVICE__ double __omp_fma(double, double, double); +__DEVICE__ float __omp_fmaf(float, float, float); +__DEVICE__ double __omp_fmax(double, double); +__DEVICE__ float __omp_fmaxf(float, float); +__DEVICE__ double __omp_fmin(double, double); +__DEVICE__ float __omp_fminf(float, float); +__DEVICE__ double __omp_fmod(double, double); +__DEVICE__ float __omp_fmodf(float, float); +__DEVICE__ double __omp_frexp(double, int *); +__DEVICE__ float __omp_frexpf(float, int *); +__DEVICE__ double __omp_hypot(double, double); +__DEVICE__ float __omp_hypotf(float, float); +__DEVICE__ int __omp_ilogb(double); +__DEVICE__ int __omp_ilogbf(float); +__DEVICE__ double __omp_j0(double); +__DEVICE__ float __omp_j0f(float); +__DEVICE__ double __omp_j1(double); +__DEVICE__ float __omp_j1f(float); +__DEVICE__ double __omp_jn(int __n, double); +__DEVICE__ float __omp_jnf(int __n, float); +__DEVICE__ long __omp_labs(long); +__DEVICE__ double __omp_ldexp(double, int); +__DEVICE__ float __omp_ldexpf(float, int); +__DEVICE__ double __omp_lgamma(double); +__DEVICE__ float __omp_lgammaf(float); +__DEVICE__ long long __omp_llabs(long long); +__DEVICE__ long long __omp_llmax(long long, long long); +__DEVICE__ long long __omp_llmin(long long, long long); +__DEVICE__ long long __omp_llrint(double); +__DEVICE__ long long __omp_llrintf(float); +__DEVICE__ long long __omp_llround(double); +__DEVICE__ long long __omp_llroundf(float); +__DEVICE__ double __omp_round(double); +__DEVICE__ float __omp_roundf(float); +__DEVICE__ double __omp_log(double); +__DEVICE__ double __omp_log10(double); +__DEVICE__ float __omp_log10f(float); +__DEVICE__ double __omp_log1p(double); +__DEVICE__ float __omp_log1pf(float); +__DEVICE__ double __omp_log2(double); +__DEVICE__ float __omp_log2f(float); +__DEVICE__ double __omp_logb(double); +__DEVICE__ float __omp_logbf(float); +__DEVICE__ float __omp_logf(float); +__DEVICE__ long __omp_lrint(double); +__DEVICE__ long __omp_lrintf(float); +__DEVICE__ long __omp_lround(double); +__DEVICE__ long __omp_lroundf(float); +__DEVICE__ int __omp_max(int, int); +__DEVICE__ int __omp_min(int, int); +__DEVICE__ double __omp_modf(double, double *); +__DEVICE__ float __omp_modff(float, float *); +__DEVICE__ double __omp_nearbyint(double); +__DEVICE__ float __omp_nearbyintf(float); +__DEVICE__ double __omp_nextafter(double, double); +__DEVICE__ float __omp_nextafterf(float, float); +__DEVICE__ double __omp_norm(int im, const double *); +__DEVICE__ double __omp_norm3d(double, double, double); +__DEVICE__ float __omp_norm3df(float, float, float); +__DEVICE__ double __omp_norm4d(double, double, double, double); +__DEVICE__ float __omp_norm4df(float, float, float, float); +__DEVICE__ double __omp_normcdf(double); +__DEVICE__ float __omp_normcdff(float); +__DEVICE__ double __omp_normcdfinv(double); +__DEVICE__ float __omp_normcdfinvf(float); +__DEVICE__ float __omp_normf(int im, const float *); +__DEVICE__ double __omp_pow(double, double); +__DEVICE__ float __omp_powf(float, float); +__DEVICE__ double __omp_powi(double, int); +__DEVICE__ float __omp_powif(float, int); +__DEVICE__ double __omp_rcbrt(double); +__DEVICE__ float __omp_rcbrtf(float); +__DEVICE__ double __omp_remainder(double, double); +__DEVICE__ float __omp_remainderf(float, float); +__DEVICE__ double __omp_remquo(double, double, int *); +__DEVICE__ float __omp_remquof(float, float, int *); +__DEVICE__ double __omp_rhypot(double, double); +__DEVICE__ float __omp_rhypotf(float, float); +__DEVICE__ double __omp_rint(double); +__DEVICE__ float __omp_rintf(float); +__DEVICE__ double __omp_rnorm(int, const double *); +__DEVICE__ double __omp_rnorm3d(double, double, double); +__DEVICE__ float __omp_rnorm3df(float, float, float); +__DEVICE__ double __omp_rnorm4d(double, double, double, double); +__DEVICE__ float __omp_rnorm4df(float, float, float, float); +__DEVICE__ float __omp_rnormf(int im, const float *); +__DEVICE__ double __omp_rsqrt(double); +__DEVICE__ float __omp_rsqrtf(float); +__DEVICE__ double __omp_scalbn(double, int); +__DEVICE__ float __omp_scalbnf(float, int); +__DEVICE__ double __omp_scalbln(double, long); +__DEVICE__ float __omp_scalblnf(float, long); +__DEVICE__ double __omp_sin(double); +__DEVICE__ void __omp_sincos(double, double *, double *); +__DEVICE__ void __omp_sincosf(float, float *, float *); +__DEVICE__ void __omp_sincospi(double, double *, double *); +__DEVICE__ void __omp_sincospif(float, float *, float *); +__DEVICE__ float __omp_sinf(float); +__DEVICE__ double __omp_sinh(double); +__DEVICE__ float __omp_sinhf(float); +__DEVICE__ double __omp_sinpi(double); +__DEVICE__ float __omp_sinpif(float); +__DEVICE__ double __omp_sqrt(double); +__DEVICE__ float __omp_sqrtf(float); +__DEVICE__ double __omp_tan(double); +__DEVICE__ float __omp_tanf(float); +__DEVICE__ double __omp_tanh(double); +__DEVICE__ float __omp_tanhf(float); +__DEVICE__ double __omp_tgamma(double); +__DEVICE__ float __omp_tgammaf(float); +__DEVICE__ double __omp_trunc(double); +__DEVICE__ float __omp_truncf(float); +__DEVICE__ unsigned long long __omp_ullmax(unsigned long long, + unsigned long long); +__DEVICE__ unsigned long long __omp_ullmin(unsigned long long, + unsigned long long); +__DEVICE__ unsigned int __omp_umax(unsigned int, unsigned int); +__DEVICE__ unsigned int __omp_umin(unsigned int, unsigned int); +__DEVICE__ double __omp_y0(double); +__DEVICE__ float __omp_y0f(float); +__DEVICE__ double __omp_y1(double); +__DEVICE__ float __omp_y1f(float); +__DEVICE__ double __omp_yn(int, double); +__DEVICE__ float __omp_ynf(int, float); + +#if defined(__cplusplus) +} +#endif + +#pragma pop_macro("__DEVICE__") + +#endif Index: clang/lib/Headers/openmp_wrappers/__clang_openmp_math.h =================================================================== --- /dev/null +++ clang/lib/Headers/openmp_wrappers/__clang_openmp_math.h @@ -0,0 +1,318 @@ +//===- __clang_math_forward_declares.h - Prototypes of __device__ math fns --=== +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------------=== + +#ifndef __CLANG__OPENMP_MATH_H__ +#define __CLANG__OPENMP_MATH_H__ + +#if !defined(_OPENMP) +#error "This file is for OpenMP compilation only." +#endif + +// Forward declares of all the wrappers for the standard math functions. +#include <__clang_openmp_math_forward_declares.h> + +// __DEVICE__ is a helper macro with common set of attributes for the wrappers +// we implement in this file. We need static in order to avoid emitting unused +// functions and __forceinline__ helps inlining these wrappers at -O1. +#pragma push_macro("__DEVICE__") +#if defined(__cplusplus) +#define __DEVICE__ static constexpr __attribute__((always_inline, nothrow)) +#else +#define __DEVICE__ static __attribute__((always_inline, nothrow)) +#endif + +// Specialized version of __DEVICE__ for functions with void return type. Needed +// because the OpenMP overlay requires constexpr functions here but prior to +// c++14 void return functions could not be constexpr. +#pragma push_macro("__DEVICE_VOID__") +#ifdef defined(__cplusplus) && __cplusplus < 201402L +#define __DEVICE_VOID__ static __attribute__((always_inline, nothrow)) +#else +#define __DEVICE_VOID__ __DEVICE__ +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +__DEVICE__ int abs(int __a) { return __omp_abs(__a); } +__DEVICE__ double fabs(double __a) { return __omp_fabs(__a); } +__DEVICE__ double acos(double __a) { return __omp_acos(__a); } +__DEVICE__ float acosf(float __a) { return __omp_acosf(__a); } +__DEVICE__ double acosh(double __a) { return __omp_acosh(__a); } +__DEVICE__ float acoshf(float __a) { return __omp_acoshf(__a); } +__DEVICE__ double asin(double __a) { return __omp_asin(__a); } +__DEVICE__ float asinf(float __a) { return __omp_asinf(__a); } +__DEVICE__ double asinh(double __a) { return __omp_asinh(__a); } +__DEVICE__ float asinhf(float __a) { return __omp_asinhf(__a); } +__DEVICE__ double atan(double __a) { return __omp_atan(__a); } +__DEVICE__ double atan2(double __a, double __b) { + return __omp_atan2(__a, __b); +} +__DEVICE__ float atan2f(float __a, float __b) { return __omp_atan2f(__a, __b); } +__DEVICE__ float atanf(float __a) { return __omp_atanf(__a); } +__DEVICE__ double atanh(double __a) { return __omp_atanh(__a); } +__DEVICE__ float atanhf(float __a) { return __omp_atanhf(__a); } +__DEVICE__ double cbrt(double __a) { return __omp_cbrt(__a); } +__DEVICE__ float cbrtf(float __a) { return __omp_cbrtf(__a); } +__DEVICE__ double ceil(double __a) { return __omp_ceil(__a); } +__DEVICE__ float ceilf(float __a) { return __omp_ceilf(__a); } +__DEVICE__ double copysign(double __a, double __b) { + return __omp_copysign(__a, __b); +} +__DEVICE__ float copysignf(float __a, float __b) { + return __omp_copysignf(__a, __b); +} +__DEVICE__ double cos(double __a) { return __omp_cos(__a); } +__DEVICE__ float cosf(float __a) { return __omp_cosf(__a); } +__DEVICE__ double cosh(double __a) { return __omp_cosh(__a); } +__DEVICE__ float coshf(float __a) { return __omp_coshf(__a); } +__DEVICE__ double cospi(double __a) { return __omp_cospi(__a); } +__DEVICE__ float cospif(float __a) { return __omp_cospif(__a); } +__DEVICE__ double cyl_bessel_i0(double __a) { return __omp_cyl_bessel_i0(__a); } +__DEVICE__ float cyl_bessel_i0f(float __a) { return __omp_cyl_bessel_i0f(__a); } +__DEVICE__ double cyl_bessel_i1(double __a) { return __omp_cyl_bessel_i1(__a); } +__DEVICE__ float cyl_bessel_i1f(float __a) { return __omp_cyl_bessel_i1f(__a); } +__DEVICE__ double erf(double __a) { return __omp_erf(__a); } +__DEVICE__ double erfc(double __a) { return __omp_erfc(__a); } +__DEVICE__ float erfcf(float __a) { return __omp_erfcf(__a); } +__DEVICE__ double erfcinv(double __a) { return __omp_erfcinv(__a); } +__DEVICE__ float erfcinvf(float __a) { return __omp_erfcinvf(__a); } +__DEVICE__ double erfcx(double __a) { return __omp_erfcx(__a); } +__DEVICE__ float erfcxf(float __a) { return __omp_erfcxf(__a); } +__DEVICE__ float erff(float __a) { return __omp_erff(__a); } +__DEVICE__ double erfinv(double __a) { return __omp_erfinv(__a); } +__DEVICE__ float erfinvf(float __a) { return __omp_erfinvf(__a); } +__DEVICE__ double exp(double __a) { return __omp_exp(__a); } +__DEVICE__ double exp10(double __a) { return __omp_exp10(__a); } +__DEVICE__ float exp10f(float __a) { return __omp_exp10f(__a); } +__DEVICE__ double exp2(double __a) { return __omp_exp2(__a); } +__DEVICE__ float exp2f(float __a) { return __omp_exp2f(__a); } +__DEVICE__ float expf(float __a) { return __omp_expf(__a); } +__DEVICE__ double expm1(double __a) { return __omp_expm1(__a); } +__DEVICE__ float expm1f(float __a) { return __omp_expm1f(__a); } +__DEVICE__ float fabsf(float __a) { return __omp_fabsf(__a); } +__DEVICE__ double fdim(double __a, double __b) { return __omp_fdim(__a, __b); } +__DEVICE__ float fdimf(float __a, float __b) { return __omp_fdimf(__a, __b); } +__DEVICE__ double fdivide(double __a, double __b) { return __a / __b; } +__DEVICE__ float fdividef(float __a, float __b) { return __a / __b; } +__DEVICE__ double floor(double __f) { return __omp_floor(__f); } +__DEVICE__ float floorf(float __f) { return __omp_floorf(__f); } +__DEVICE__ double fma(double __a, double __b, double __c) { + return __omp_fma(__a, __b, __c); +} +__DEVICE__ float fmaf(float __a, float __b, float __c) { + return __omp_fmaf(__a, __b, __c); +} +__DEVICE__ double fmax(double __a, double __b) { return __omp_fmax(__a, __b); } +__DEVICE__ float fmaxf(float __a, float __b) { return __omp_fmaxf(__a, __b); } +__DEVICE__ double fmin(double __a, double __b) { return __omp_fmin(__a, __b); } +__DEVICE__ float fminf(float __a, float __b) { return __omp_fminf(__a, __b); } +__DEVICE__ double fmod(double __a, double __b) { return __omp_fmod(__a, __b); } +__DEVICE__ float fmodf(float __a, float __b) { return __omp_fmodf(__a, __b); } +__DEVICE__ double frexp(double __a, int *__b) { return __omp_frexp(__a, __b); } +__DEVICE__ float frexpf(float __a, int *__b) { return __omp_frexpf(__a, __b); } +__DEVICE__ double hypot(double __a, double __b) { + return __omp_hypot(__a, __b); +} +__DEVICE__ float hypotf(float __a, float __b) { return __omp_hypotf(__a, __b); } +__DEVICE__ int ilogb(double __a) { return __omp_ilogb(__a); } +__DEVICE__ int ilogbf(float __a) { return __omp_ilogbf(__a); } +__DEVICE__ double j0(double __a) { return __omp_j0(__a); } +__DEVICE__ float j0f(float __a) { return __omp_j0f(__a); } +__DEVICE__ double j1(double __a) { return __omp_j1(__a); } +__DEVICE__ float j1f(float __a) { return __omp_j1f(__a); } +__DEVICE__ double jn(int __n, double __a) { return __omp_jn(__n, __a); } +__DEVICE__ float jnf(int __n, float __a) { return __omp_jnf(__n, __a); } +#if defined(__LP64__) || defined(_WIN64) +__DEVICE__ long labs(long __a) { return __omp_llabs(__a); }; +#else +__DEVICE__ long labs(long __a) { return __omp_abs(__a); }; +#endif +__DEVICE__ double ldexp(double __a, int __b) { return __omp_ldexp(__a, __b); } +__DEVICE__ float ldexpf(float __a, int __b) { return __omp_ldexpf(__a, __b); } +__DEVICE__ double lgamma(double __a) { return __omp_lgamma(__a); } +__DEVICE__ float lgammaf(float __a) { return __omp_lgammaf(__a); } +__DEVICE__ long long llabs(long long __a) { return __omp_llabs(__a); } +__DEVICE__ long long llmax(long long __a, long long __b) { + return __omp_llmax(__a, __b); +} +__DEVICE__ long long llmin(long long __a, long long __b) { + return __omp_llmin(__a, __b); +} +__DEVICE__ long long llrint(double __a) { return __omp_llrint(__a); } +__DEVICE__ long long llrintf(float __a) { return __omp_llrintf(__a); } +__DEVICE__ long long llround(double __a) { return __omp_llround(__a); } +__DEVICE__ long long llroundf(float __a) { return __omp_llroundf(__a); } +__DEVICE__ double round(double __a) { return __omp_round(__a); } +__DEVICE__ float roundf(float __a) { return __omp_roundf(__a); } +__DEVICE__ double log(double __a) { return __omp_log(__a); } +__DEVICE__ double log10(double __a) { return __omp_log10(__a); } +__DEVICE__ float log10f(float __a) { return __omp_log10f(__a); } +__DEVICE__ double log1p(double __a) { return __omp_log1p(__a); } +__DEVICE__ float log1pf(float __a) { return __omp_log1pf(__a); } +__DEVICE__ double log2(double __a) { return __omp_log2(__a); } +__DEVICE__ float log2f(float __a) { return __omp_log2f(__a); } +__DEVICE__ double logb(double __a) { return __omp_logb(__a); } +__DEVICE__ float logbf(float __a) { return __omp_logbf(__a); } +__DEVICE__ float logf(float __a) { return __omp_logf(__a); } +__DEVICE__ long lrint(double __a) { return __omp_lrint(__a); } +__DEVICE__ long lrintf(float __a) { return __omp_lrintf(__a); } +__DEVICE__ long lround(double __a) { return __omp_lround(__a); } +__DEVICE__ long lroundf(float __a) { return __omp_lroundf(__a); } +__DEVICE__ int max(int __a, int __b) { return __omp_max(__a, __b); } +__DEVICE__ int min(int __a, int __b) { return __omp_min(__a, __b); } +__DEVICE__ double modf(double __a, double *__b) { return __omp_modf(__a, __b); } +__DEVICE__ float modff(float __a, float *__b) { return __omp_modff(__a, __b); } +__DEVICE__ double nearbyint(double __a) { return __builtin_nearbyint(__a); } +__DEVICE__ float nearbyintf(float __a) { return __builtin_nearbyintf(__a); } +__DEVICE__ double nextafter(double __a, double __b) { + return __omp_nextafter(__a, __b); +} +__DEVICE__ float nextafterf(float __a, float __b) { + return __omp_nextafterf(__a, __b); +} +__DEVICE__ double norm(int __dim, const double *__t) { + return __omp_norm(__dim, __t); +} +__DEVICE__ double norm3d(double __a, double __b, double __c) { + return __omp_norm3d(__a, __b, __c); +} +__DEVICE__ float norm3df(float __a, float __b, float __c) { + return __omp_norm3df(__a, __b, __c); +} +__DEVICE__ double norm4d(double __a, double __b, double __c, double __d) { + return __omp_norm4d(__a, __b, __c, __d); +} +__DEVICE__ float norm4df(float __a, float __b, float __c, float __d) { + return __omp_norm4df(__a, __b, __c, __d); +} +__DEVICE__ double normcdf(double __a) { return __omp_normcdf(__a); } +__DEVICE__ float normcdff(float __a) { return __omp_normcdff(__a); } +__DEVICE__ double normcdfinv(double __a) { return __omp_normcdfinv(__a); } +__DEVICE__ float normcdfinvf(float __a) { return __omp_normcdfinvf(__a); } +__DEVICE__ float normf(int __dim, const float *__t) { + return __omp_normf(__dim, __t); +} +__DEVICE__ double pow(double __a, double __b) { return __omp_pow(__a, __b); } +__DEVICE__ float powf(float __a, float __b) { return __omp_powf(__a, __b); } +__DEVICE__ double powi(double __a, int __b) { return __omp_powi(__a, __b); } +__DEVICE__ float powif(float __a, int __b) { return __omp_powif(__a, __b); } +__DEVICE__ double rcbrt(double __a) { return __omp_rcbrt(__a); } +__DEVICE__ float rcbrtf(float __a) { return __omp_rcbrtf(__a); } +__DEVICE__ double remainder(double __a, double __b) { + return __omp_remainder(__a, __b); +} +__DEVICE__ float remainderf(float __a, float __b) { + return __omp_remainderf(__a, __b); +} +__DEVICE__ double remquo(double __a, double __b, int *__c) { + return __omp_remquo(__a, __b, __c); +} +__DEVICE__ float remquof(float __a, float __b, int *__c) { + return __omp_remquof(__a, __b, __c); +} +__DEVICE__ double rhypot(double __a, double __b) { + return __omp_rhypot(__a, __b); +} +__DEVICE__ float rhypotf(float __a, float __b) { + return __omp_rhypotf(__a, __b); +} +// __omp_rint* in libdevice is buggy and produces incorrect results. +__DEVICE__ double rint(double __a) { return __builtin_rint(__a); } +__DEVICE__ float rintf(float __a) { return __builtin_rintf(__a); } +__DEVICE__ double rnorm(int __a, const double *__b) { + return __omp_rnorm(__a, __b); +} +__DEVICE__ double rnorm3d(double __a, double __b, double __c) { + return __omp_rnorm3d(__a, __b, __c); +} +__DEVICE__ float rnorm3df(float __a, float __b, float __c) { + return __omp_rnorm3df(__a, __b, __c); +} +__DEVICE__ double rnorm4d(double __a, double __b, double __c, double __d) { + return __omp_rnorm4d(__a, __b, __c, __d); +} +__DEVICE__ float rnorm4df(float __a, float __b, float __c, float __d) { + return __omp_rnorm4df(__a, __b, __c, __d); +} +__DEVICE__ float rnormf(int __dim, const float *__t) { + return __omp_rnormf(__dim, __t); +} +__DEVICE__ double rsqrt(double __a) { return __omp_rsqrt(__a); } +__DEVICE__ float rsqrtf(float __a) { return __omp_rsqrtf(__a); } +__DEVICE__ double scalbn(double __a, int __b) { return __omp_scalbn(__a, __b); } +__DEVICE__ float scalbnf(float __a, int __b) { return __omp_scalbnf(__a, __b); } +__DEVICE__ double scalbln(double __a, long __b) { + if (__b > INT_MAX) + return __a > 0 ? HUGE_VAL : -HUGE_VAL; + if (__b < INT_MIN) + return __a > 0 ? 0.0 : -0.0; + return scalbn(__a, (int)__b); +} +__DEVICE__ float scalblnf(float __a, long __b) { + if (__b > INT_MAX) + return __a > 0 ? HUGE_VALF : -HUGE_VALF; + if (__b < INT_MIN) + return __a > 0 ? 0.f : -0.f; + return scalbnf(__a, (int)__b); +} +__DEVICE__ double sin(double __a) { return __omp_sin(__a); } +__DEVICE_VOID__ void sincos(double __a, double *__s, double *__c) { + return __omp_sincos(__a, __s, __c); +} +__DEVICE_VOID__ void sincosf(float __a, float *__s, float *__c) { + return __omp_sincosf(__a, __s, __c); +} +__DEVICE_VOID__ void sincospi(double __a, double *__s, double *__c) { + return __omp_sincospi(__a, __s, __c); +} +__DEVICE_VOID__ void sincospif(float __a, float *__s, float *__c) { + return __omp_sincospif(__a, __s, __c); +} +__DEVICE__ float sinf(float __a) { return __omp_sinf(__a); } +__DEVICE__ double sinh(double __a) { return __omp_sinh(__a); } +__DEVICE__ float sinhf(float __a) { return __omp_sinhf(__a); } +__DEVICE__ double sinpi(double __a) { return __omp_sinpi(__a); } +__DEVICE__ float sinpif(float __a) { return __omp_sinpif(__a); } +__DEVICE__ double sqrt(double __a) { return __omp_sqrt(__a); } +__DEVICE__ float sqrtf(float __a) { return __omp_sqrtf(__a); } +__DEVICE__ double tan(double __a) { return __omp_tan(__a); } +__DEVICE__ float tanf(float __a) { return __omp_tanf(__a); } +__DEVICE__ double tanh(double __a) { return __omp_tanh(__a); } +__DEVICE__ float tanhf(float __a) { return __omp_tanhf(__a); } +__DEVICE__ double tgamma(double __a) { return __omp_tgamma(__a); } +__DEVICE__ float tgammaf(float __a) { return __omp_tgammaf(__a); } +__DEVICE__ double trunc(double __a) { return __omp_trunc(__a); } +__DEVICE__ float truncf(float __a) { return __omp_truncf(__a); } +__DEVICE__ unsigned long long ullmax(unsigned long long __a, + unsigned long long __b) { + return __omp_ullmax(__a, __b); +} +__DEVICE__ unsigned long long ullmin(unsigned long long __a, + unsigned long long __b) { + return __omp_ullmin(__a, __b); +} +__DEVICE__ unsigned int umax(unsigned int __a, unsigned int __b) { + return __omp_umax(__a, __b); +} +__DEVICE__ unsigned int umin(unsigned int __a, unsigned int __b) { + return __omp_umin(__a, __b); +} +__DEVICE__ double y0(double __a) { return __omp_y0(__a); } +__DEVICE__ float y0f(float __a) { return __omp_y0f(__a); } +__DEVICE__ double y1(double __a) { return __omp_y1(__a); } +__DEVICE__ float y1f(float __a) { return __omp_y1f(__a); } +__DEVICE__ double yn(int __a, double __b) { return __omp_yn(__a, __b); } +__DEVICE__ float ynf(int __a, float __b) { return __omp_ynf(__a, __b); } + +#if defined(__cplusplus) +} +#endif + +#endif Index: clang/lib/Headers/CMakeLists.txt =================================================================== --- clang/lib/Headers/CMakeLists.txt +++ clang/lib/Headers/CMakeLists.txt @@ -169,6 +169,8 @@ openmp_wrappers/complex.h openmp_wrappers/complex openmp_wrappers/__clang_openmp_device_functions.h + openmp_wrappers/__clang_openmp_math_forward_declares.h + openmp_wrappers/__clang_openmp_math.h openmp_wrappers/complex_cmath.h openmp_wrappers/new ) Index: clang/lib/Driver/ToolChains/Clang.cpp =================================================================== --- clang/lib/Driver/ToolChains/Clang.cpp +++ clang/lib/Driver/ToolChains/Clang.cpp @@ -1299,6 +1299,11 @@ llvm::sys::path::append(P, "openmp_wrappers"); CmdArgs.push_back("-internal-isystem"); CmdArgs.push_back(Args.MakeArgString(P)); + + // If using the device math library we use math wrapper functions. + if (JA.isDeviceOffloading(Action::OFK_OpenMP) && + Args.hasArg(options::OPT_fopenmp_device_libm)) + CmdArgs.push_back("-D__MATH_WRAPPERS__"); } CmdArgs.push_back("-include"); Index: clang/include/clang/Driver/Options.td =================================================================== --- clang/include/clang/Driver/Options.td +++ clang/include/clang/Driver/Options.td @@ -2512,6 +2512,8 @@ HelpText<"Use the static host OpenMP runtime while linking.">; def fopenmp_new_driver : Flag<["-"], "fopenmp-new-driver">, Flags<[CC1Option]>, Group<Action_Group>, HelpText<"Use the new driver for OpenMP offloading.">; +def fopenmp_device_libm : Flag<["-"], "fopenmp-device-libm">, Flags<[CC1Option]>, Group<Action_Group>, + HelpText<"Use the OpenMP math wrappers for offloading.">; def fno_optimize_sibling_calls : Flag<["-"], "fno-optimize-sibling-calls">, Group<f_Group>; def foptimize_sibling_calls : Flag<["-"], "foptimize-sibling-calls">, Group<f_Group>; defm escaping_block_tail_calls : BoolFOption<"escaping-block-tail-calls",
_______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits