[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls
gtbercea updated this revision to Diff 159335. gtbercea added a comment. Fix function call. Repository: rC Clang https://reviews.llvm.org/D47849 Files: include/clang/Driver/ToolChain.h lib/Driver/ToolChains/Clang.cpp lib/Driver/ToolChains/Cuda.cpp lib/Driver/ToolChains/Cuda.h lib/Headers/CMakeLists.txt lib/Headers/__clang_cuda_device_functions.h lib/Headers/__clang_cuda_libdevice_declares.h test/CodeGen/nvptx_device_math_functions.c test/Driver/openmp-offload-gpu.c Index: test/Driver/openmp-offload-gpu.c === --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offload-gpu.c @@ -76,9 +76,9 @@ // RUN: -no-canonical-prefixes -save-temps %t.o -fopenmp-use-target-bundling 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-CUBIN-UNBUNDLING-NVLINK %s -/// Use DAG to ensure that cubin file has been unbundled. +/// Use DAG to ensure that object file has not been unbundled. // CHK-CUBIN-UNBUNDLING-NVLINK-DAG: nvlink{{.*}}" {{.*}}"[[CUBIN:.*\.cubin]]" -// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: clang-offload-bundler{{.*}}" "-type=o" {{.*}}"-outputs={{.*}}[[CUBIN]] +// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: clang-offload-bundler{{.*}}" "-type=o" {{.*}}[[CUBIN]] // CHK-CUBIN-UNBUNDLING-NVLINK-DAG-SAME: "-unbundle" /// ### Index: test/CodeGen/nvptx_device_math_functions.c === --- /dev/null +++ test/CodeGen/nvptx_device_math_functions.c @@ -0,0 +1,20 @@ +// Test calling of device math functions. +///==/// + +// RUN: %clang -fmath-errno -S -emit-llvm -o - %s -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda | FileCheck -check-prefix CHECK-YES %s + +void test_sqrt(double a1) { + #pragma omp target + { +// CHECK-YES: call double @llvm.nvvm.sqrt.rn.d(double +double l1 = sqrt(a1); + } +} + +void test_pow(float a0, double a1, long double a2) { + #pragma omp target + { +// CHECK-YES: call double @__internal_accurate_pow(double +double l1 = pow(a1, a1); + } +} Index: lib/Headers/__clang_cuda_libdevice_declares.h === --- lib/Headers/__clang_cuda_libdevice_declares.h +++ lib/Headers/__clang_cuda_libdevice_declares.h @@ -24,443 +24,455 @@ #ifndef __CLANG_CUDA_LIBDEVICE_DECLARES_H__ #define __CLANG_CUDA_LIBDEVICE_DECLARES_H__ +#if defined(_OPENMP) +#define __DEVICE__ +#elif defined(__CUDA__) +#define __DEVICE__ __device__ +#endif + +#if defined(__cplusplus) extern "C" { +#endif -__device__ int __nv_abs(int __a); -__device__ double __nv_acos(double __a); -__device__ float __nv_acosf(float __a); -__device__ double __nv_acosh(double __a); -__device__ float __nv_acoshf(float __a); -__device__ double __nv_asin(double __a); -__device__ float __nv_asinf(float __a); -__device__ double __nv_asinh(double __a); -__device__ float __nv_asinhf(float __a); -__device__ double __nv_atan2(double __a, double __b); -__device__ float __nv_atan2f(float __a, float __b); -__device__ double __nv_atan(double __a); -__device__ float __nv_atanf(float __a); -__device__ double __nv_atanh(double __a); -__device__ float __nv_atanhf(float __a); -__device__ int __nv_brev(int __a); -__device__ long long __nv_brevll(long long __a); -__device__ int __nv_byte_perm(int __a, int __b, int __c); -__device__ double __nv_cbrt(double __a); -__device__ float __nv_cbrtf(float __a); -__device__ double __nv_ceil(double __a); -__device__ float __nv_ceilf(float __a); -__device__ int __nv_clz(int __a); -__device__ int __nv_clzll(long long __a); -__device__ double __nv_copysign(double __a, double __b); -__device__ float __nv_copysignf(float __a, float __b); -__device__ double __nv_cos(double __a); -__device__ float __nv_cosf(float __a); -__device__ double __nv_cosh(double __a); 
-__device__ float __nv_coshf(float __a); -__device__ double __nv_cospi(double __a); -__device__ float __nv_cospif(float __a); -__device__ double __nv_cyl_bessel_i0(double __a); -__device__ float __nv_cyl_bessel_i0f(float __a); -__device__ double __nv_cyl_bessel_i1(double __a); -__device__ float __nv_cyl_bessel_i1f(float __a); -__device__ double __nv_dadd_rd(double __a, double __b); -__device__ double __nv_dadd_rn(double __a, double __b); -__device__ double __nv_dadd_ru(double __a, double __b); -__device__ double __nv_dadd_rz(double __a, double __b); -__device__ double __nv_ddiv_rd(double __a, double __b); -__device__ double __nv_ddiv_rn(double __a, double __b); -__device__ double __nv_ddiv_ru(double __a, double __b); -__device__ double __nv_ddiv_rz(double __a, double __b); -__device__ double __nv_dmul_rd(double __a, double __b); -__device__ double __nv_dmul_rn(double __a, double __b); -__device__ double __nv_dmul_ru(double __a, double __b); -__device__ double __nv_dmul_rz(double __a, double __b); -__device__ float __nv_do
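Condensed, the declaration pattern this hunk introduces looks as follows; a minimal sketch with three representative libdevice functions, not the full header:

```
/* Sketch of the post-patch pattern in __clang_cuda_libdevice_declares.h;
   abbreviated to three representative declarations. */
#if defined(_OPENMP)
#define __DEVICE__            /* plain C declarations for OpenMP device code */
#elif defined(__CUDA__)
#define __DEVICE__ __device__ /* CUDA compilations keep the attribute */
#endif

#if defined(__cplusplus)
extern "C" {
#endif

__DEVICE__ double __nv_sqrt(double __a);
__DEVICE__ float __nv_sqrtf(float __a);
__DEVICE__ double __nv_pow(double __a, double __b);

#if defined(__cplusplus)
}
#endif
```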
[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls
gtbercea added a comment. In https://reviews.llvm.org/D47849#1190903, @Hahnfeld wrote: > Do we still need this? I think what we really need to solve is the problem of > (host) inline assembly in the header files... Don't we want to use device-specific math functions? It's not just about avoiding some of the host-specific assembly, it's also about getting an implementation tailored to the device. Repository: rC Clang https://reviews.llvm.org/D47849
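To make the discussion concrete, a minimal example (function and variable names invented) of the kind of code the patch targets; with the patch, the sqrt call in the target region resolves to the device-native libdevice/NVVM implementation rather than a host symbol:

```
#include <math.h>

void device_sqrt(double x, double *r) {
#pragma omp target map(from : r[0:1])
  { r[0] = sqrt(x); } /* resolves to __nv_sqrt / llvm.nvvm.sqrt.rn.d */
}
```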
[PATCH] D47394: [OpenMP][Clang][NVPTX] Replace bundling with partial linking for the OpenMP NVPTX device offloading toolchain
gtbercea added inline comments. Comment at: lib/Driver/ToolChains/Cuda.cpp:664 + // Anything that's not a file name is potentially a static library + // so treat it as such. + if (C.canSkipOffloadBundler()) sfantao wrote: > So, what if it is not a static library? Can it be anything else at this point? Repository: rC Clang https://reviews.llvm.org/D47394
[PATCH] D47394: [OpenMP][Clang][NVPTX] Replace bundling with partial linking for the OpenMP NVPTX device offloading toolchain
gtbercea updated this revision to Diff 159536. gtbercea marked 3 inline comments as done. gtbercea added a comment. - Address comments. Repository: rC Clang https://reviews.llvm.org/D47394 Files: include/clang/Driver/Action.h include/clang/Driver/Compilation.h include/clang/Driver/Options.td include/clang/Driver/ToolChain.h lib/Driver/Action.cpp lib/Driver/Compilation.cpp lib/Driver/Driver.cpp lib/Driver/ToolChain.cpp lib/Driver/ToolChains/Clang.cpp lib/Driver/ToolChains/Clang.h lib/Driver/ToolChains/Cuda.cpp test/Driver/openmp-offload-gpu-linux.c test/Driver/openmp-offload-gpu.c test/Driver/openmp-offload.c Index: test/Driver/openmp-offload.c === --- test/Driver/openmp-offload.c +++ test/Driver/openmp-offload.c @@ -480,13 +480,13 @@ // Create host object and bundle. // CHK-BUJOBS: clang{{.*}}" "-cc1" "-triple" "powerpc64le--linux" "-emit-obj" {{.*}}"-fopenmp" {{.*}}"-o" " // CHK-BUJOBS-SAME: [[HOSTOBJ:[^\\/]+\.o]]" "-x" "ir" "{{.*}}[[HOSTBC]]" -// CHK-BUJOBS: clang-offload-bundler{{.*}}" "-type=o" "-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs= +// CHK-BUJOBS: clang-offload-bundler{{.*}}" "-type=o"{{.*}}"-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs= // CHK-BUJOBS-SAME: [[RES:[^\\/]+\.o]]" "-inputs={{.*}}[[T1OBJ]],{{.*}}[[T2OBJ]],{{.*}}[[HOSTOBJ]]" // CHK-BUJOBS-ST: clang{{.*}}" "-cc1" "-triple" "powerpc64le--linux" "-S" {{.*}}"-fopenmp" {{.*}}"-o" " // CHK-BUJOBS-ST-SAME: [[HOSTASM:[^\\/]+\.s]]" "-x" "ir" "{{.*}}[[HOSTBC]]" // CHK-BUJOBS-ST: clang{{.*}}" "-cc1as" "-triple" "powerpc64le--linux" "-filetype" "obj" {{.*}}"-o" " // CHK-BUJOBS-ST-SAME: [[HOSTOBJ:[^\\/]+\.o]]" "{{.*}}[[HOSTASM]]" -// CHK-BUJOBS-ST: clang-offload-bundler{{.*}}" "-type=o" "-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs= +// CHK-BUJOBS-ST: clang-offload-bundler{{.*}}" "-type=o"{{.*}}"-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs= // CHK-BUJOBS-ST-SAME: [[RES:[^\\/]+\.o]]" "-inputs={{.*}}[[T1OBJ]],{{.*}}[[T2OBJ]],{{.*}}[[HOSTOBJ]]" /// ### Index: test/Driver/openmp-offload-gpu.c === --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offload-gpu.c @@ -61,7 +61,7 @@ /// Check cubin file generation and bundling // RUN: %clang -### -target powerpc64le-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ -// RUN: -no-canonical-prefixes -save-temps %s -c 2>&1 \ +// RUN: -no-canonical-prefixes -save-temps %s -c -fopenmp-use-target-bundling 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-PTXAS-CUBIN-BUNDLING %s // CHK-PTXAS-CUBIN-BUNDLING: clang{{.*}}" "-o" "[[PTX:.*\.s]]" @@ -73,7 +73,7 @@ /// Check cubin file unbundling and usage by nvlink // RUN: touch %t.o // RUN: %clang -### -target powerpc64le-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ -// RUN: -no-canonical-prefixes -save-temps %t.o 2>&1 \ +// RUN: -no-canonical-prefixes -save-temps %t.o -fopenmp-use-target-bundling 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-CUBIN-UNBUNDLING-NVLINK %s /// Use DAG to ensure that cubin file has been unbundled. 
@@ -87,11 +87,11 @@ // RUN: touch %t1.o // RUN: touch %t2.o // RUN: %clang -### -no-canonical-prefixes -target powerpc64le-unknown-linux-gnu -fopenmp=libomp \ -// RUN: -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o 2>&1 \ +// RUN: -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o -fopenmp-use-target-bundling 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-TWOCUBIN %s /// Check cubin file generation and usage by nvlink when toolchain has BindArchAction // RUN: %clang -### -no-canonical-prefixes -target x86_64-apple-darwin17.0.0 -fopenmp=libomp \ -// RUN: -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o 2>&1 \ +// RUN: -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o -fopenmp-use-target-bundling 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-TWOCUBIN %s // CHK-TWOCUBIN: nvlink{{.*}}openmp-offload-{{.*}}.cubin" "{{.*}}openmp-offload-{{.*}}.cubin" Index: test/Driver/openmp-offload-gpu-linux.c === --- /dev/null +++ test/Driver/openmp-offload-gpu-linux.c @@ -0,0 +1,52 @@ +/// +/// Perform driver tests for OpenMP offloading on Linux systems +/// + +// UNSUPPORTED: system-windows + +// REQUIRES: clang-driver +// REQUIRES: x86-registered-target +// REQUIRES: powerpc-registered-target +// REQUIRES: nvptx-registered-target + +/// Check cubin file generation and partial linking with ld +// RUN: %clang -### -target powerpc64le-unknown-linux-gnu -fopenmp=
[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls
gtbercea updated this revision to Diff 159574. gtbercea added a comment. Prevent math builtins from being used for nvptx toolchain. Repository: rC Clang https://reviews.llvm.org/D47849 Files: include/clang/Driver/ToolChain.h lib/Driver/ToolChains/Clang.cpp lib/Driver/ToolChains/Cuda.cpp lib/Driver/ToolChains/Cuda.h lib/Headers/CMakeLists.txt lib/Headers/__clang_cuda_device_functions.h lib/Headers/__clang_cuda_libdevice_declares.h test/CodeGen/nvptx_device_math_functions.c test/Driver/openmp-offload-gpu.c Index: test/Driver/openmp-offload-gpu.c === --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offload-gpu.c @@ -76,9 +76,9 @@ // RUN: -no-canonical-prefixes -save-temps %t.o -fopenmp-use-target-bundling 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-CUBIN-UNBUNDLING-NVLINK %s -/// Use DAG to ensure that cubin file has been unbundled. +/// Use DAG to ensure that object file has not been unbundled. // CHK-CUBIN-UNBUNDLING-NVLINK-DAG: nvlink{{.*}}" {{.*}}"[[CUBIN:.*\.cubin]]" -// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: clang-offload-bundler{{.*}}" "-type=o" {{.*}}"-outputs={{.*}}[[CUBIN]] +// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: clang-offload-bundler{{.*}}" "-type=o" {{.*}}[[CUBIN]] // CHK-CUBIN-UNBUNDLING-NVLINK-DAG-SAME: "-unbundle" /// ### Index: test/CodeGen/nvptx_device_math_functions.c === --- /dev/null +++ test/CodeGen/nvptx_device_math_functions.c @@ -0,0 +1,20 @@ +// Test calling of device math functions. +///==/// + +// RUN: %clang -fmath-errno -S -emit-llvm -o - %s -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda | FileCheck -check-prefix CHECK-YES %s + +void test_sqrt(double a1) { + #pragma omp target + { +// CHECK-YES: call double @llvm.nvvm.sqrt.rn.d(double +double l1 = sqrt(a1); + } +} + +void test_pow(float a0, double a1, long double a2) { + #pragma omp target + { +// CHECK-YES: call double @__internal_accurate_pow(double +double l1 = pow(a1, a1); + } +} Index: lib/Headers/__clang_cuda_libdevice_declares.h === --- lib/Headers/__clang_cuda_libdevice_declares.h +++ lib/Headers/__clang_cuda_libdevice_declares.h @@ -24,443 +24,455 @@ #ifndef __CLANG_CUDA_LIBDEVICE_DECLARES_H__ #define __CLANG_CUDA_LIBDEVICE_DECLARES_H__ +#if defined(_OPENMP) +#define __DEVICE__ +#elif defined(__CUDA__) +#define __DEVICE__ __device__ +#endif + +#if defined(__cplusplus) extern "C" { +#endif -__device__ int __nv_abs(int __a); -__device__ double __nv_acos(double __a); -__device__ float __nv_acosf(float __a); -__device__ double __nv_acosh(double __a); -__device__ float __nv_acoshf(float __a); -__device__ double __nv_asin(double __a); -__device__ float __nv_asinf(float __a); -__device__ double __nv_asinh(double __a); -__device__ float __nv_asinhf(float __a); -__device__ double __nv_atan2(double __a, double __b); -__device__ float __nv_atan2f(float __a, float __b); -__device__ double __nv_atan(double __a); -__device__ float __nv_atanf(float __a); -__device__ double __nv_atanh(double __a); -__device__ float __nv_atanhf(float __a); -__device__ int __nv_brev(int __a); -__device__ long long __nv_brevll(long long __a); -__device__ int __nv_byte_perm(int __a, int __b, int __c); -__device__ double __nv_cbrt(double __a); -__device__ float __nv_cbrtf(float __a); -__device__ double __nv_ceil(double __a); -__device__ float __nv_ceilf(float __a); -__device__ int __nv_clz(int __a); -__device__ int __nv_clzll(long long __a); -__device__ double __nv_copysign(double __a, double __b); -__device__ float __nv_copysignf(float __a, float __b); -__device__ double __nv_cos(double __a); -__device__ float __nv_cosf(float __a); 
-__device__ double __nv_cosh(double __a); -__device__ float __nv_coshf(float __a); -__device__ double __nv_cospi(double __a); -__device__ float __nv_cospif(float __a); -__device__ double __nv_cyl_bessel_i0(double __a); -__device__ float __nv_cyl_bessel_i0f(float __a); -__device__ double __nv_cyl_bessel_i1(double __a); -__device__ float __nv_cyl_bessel_i1f(float __a); -__device__ double __nv_dadd_rd(double __a, double __b); -__device__ double __nv_dadd_rn(double __a, double __b); -__device__ double __nv_dadd_ru(double __a, double __b); -__device__ double __nv_dadd_rz(double __a, double __b); -__device__ double __nv_ddiv_rd(double __a, double __b); -__device__ double __nv_ddiv_rn(double __a, double __b); -__device__ double __nv_ddiv_ru(double __a, double __b); -__device__ double __nv_ddiv_rz(double __a, double __b); -__device__ double __nv_dmul_rd(double __a, double __b); -__device__ double __nv_dmul_rn(double __a, double __b); -__device__ double __nv_dmul_ru(double __a, double __b); -__device__ double __nv_dmul_rz(double __a,
[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls
gtbercea added a comment. > Ok, so you are already talking about performance. I think we should fix > correctness first, in particular the compiler shouldn't complain whenever > `math.h` is included. This patch is concerned with calling device functions when you're on the device. The correctness issues you mention are orthogonal to this and should be handled by another patch. I don't think this patch should be held up any longer. Repository: rC Clang https://reviews.llvm.org/D47849
[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls
gtbercea added a comment. In https://reviews.llvm.org/D47849#1192245, @Hahnfeld wrote: > In https://reviews.llvm.org/D47849#1192134, @gtbercea wrote: > > > This patch is concerned with calling device functions when you're on the > > device. The correctness issues you mention are orthogonal to this and > > should be handled by another patch. I don't think this patch should be held > > up any longer. > > > I'm confused by now, could you please highlight the point that I'm missing? You're bringing up the correctness of the header files, which is a detail that is orthogonal to this patch. Even if the header files worked correctly, I would still want to use the libdevice functions. Fixing the header files themselves should therefore be done in a separate patch. Using the libdevice functions guarantees correctness (no weird assembly instructions that the device doesn't recognize, etc.) and may improve performance (if, for example, libdevice contains device-specific assembly). The purpose of this patch is to call NVIDIA's libdevice math functions, which should in principle be more efficient in terms of runtime and register usage. Not all of them may be more efficient today (as @tra suggested) but some of them will be. Maybe others will be improved in the future, maybe not; again, that's an orthogonal point. The benefit of using libdevice functions is that any improvements NVIDIA makes will be there for us to use in the OpenMP NVPTX toolchain. The premise of the OpenMP NVPTX toolchain is that it will leverage as much of the CUDA toolchain as possible. Another point is that users specifically ask for NVIDIA math functions to be called on the device when using OpenMP NVPTX device offloading. The libdevice library offers __nv_fast_* variants of some math functions. Users want to have access to those functions and the other functions that the libdevice library contains. Repository: rC Clang https://reviews.llvm.org/D47849
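For illustration, a sketch of using the `__nv_fast_*` variants directly; the `__nv_fast_sinf`/`__nv_fast_expf` names exist in libdevice, while the wrapper function and the extern declarations (written in the style of `__clang_cuda_libdevice_declares.h`) are invented for the example:

```
/* Sketch only: direct calls into libdevice fast-math variants (float). */
extern float __nv_fast_sinf(float __a);
extern float __nv_fast_expf(float __a);

float fast_exp_sin(float x) { return __nv_fast_expf(__nv_fast_sinf(x)); }
```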
[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls
gtbercea added a comment. > IIRC you started to work on this to fix the problem with inline assembly (see > https://reviews.llvm.org/D47849#1125019). AFAICS this patch fixes > declarations of math functions but you still cannot include `math.h` which > most "correct" codes do. I'm not sure what you mean by this. This patch enables me to include math.h. Repository: rC Clang https://reviews.llvm.org/D47849
[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls
gtbercea added a comment. In https://reviews.llvm.org/D47849#1192368, @Hahnfeld wrote: > In https://reviews.llvm.org/D47849#1192321, @gtbercea wrote: > > > > IIRC you started to work on this to fix the problem with inline assembly > > > (see https://reviews.llvm.org/D47849#1125019). AFAICS this patch fixes > > > declarations of math functions but you still cannot include `math.h` > > > which most "correct" codes do. > > > > I'm not sure what you mean by this. This patch enables me to include math.h. > > > `math.c`: > > #include <math.h> > > > executed commands: > $ clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -c math.c -O2 > In file included from math.c:1: > In file included from /usr/include/math.h:413: > /usr/include/bits/mathinline.h:131:43: error: invalid input constraint 'x' > in asm > __asm ("pmovmskb %1, %0" : "=r" (__m) : "x" (__x)); > ^ > /usr/include/bits/mathinline.h:143:43: error: invalid input constraint 'x' > in asm > __asm ("pmovmskb %1, %0" : "=r" (__m) : "x" (__x)); > ^ > 2 errors generated. I do not get that error. Repository: rC Clang https://reviews.llvm.org/D47849
[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls
gtbercea added a comment. In https://reviews.llvm.org/D47849#1192368, @Hahnfeld wrote: > In https://reviews.llvm.org/D47849#1192321, @gtbercea wrote: > > > > IIRC you started to work on this to fix the problem with inline assembly > > > (see https://reviews.llvm.org/D47849#1125019). AFAICS this patch fixes > > > declarations of math functions but you still cannot include `math.h` > > > which most "correct" codes do. > > > > I'm not sure what you mean by this. This patch enables me to include math.h. > > > `math.c`: > > #include <math.h> > > > executed commands: > $ clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -c math.c -O2 > In file included from math.c:1: > In file included from /usr/include/math.h:413: > /usr/include/bits/mathinline.h:131:43: error: invalid input constraint 'x' > in asm > __asm ("pmovmskb %1, %0" : "=r" (__m) : "x" (__x)); > ^ > /usr/include/bits/mathinline.h:143:43: error: invalid input constraint 'x' > in asm > __asm ("pmovmskb %1, %0" : "=r" (__m) : "x" (__x)); > ^ > 2 errors generated. We are probably including different math.h files. I don't seem to have a mathinline.h with those instructions. Perhaps this is an x86-specific error. I think I know what's happening. I think the host math.h is still included but not necessarily used. Math functions resolve to math functions in the CUDA header first (that's what this patch does). This patch doesn't prevent math.h from being included. Repository: rC Clang https://reviews.llvm.org/D47849
[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls
gtbercea added a comment. In https://reviews.llvm.org/D47849#1192383, @Hahnfeld wrote: > In https://reviews.llvm.org/D47849#1192375, @gtbercea wrote: > > > I do not get that error. > > > In the beginning you said that you were facing the same error. Did that go > away in the meantime? > Are you testing on x86 or Power? With optimizations enabled? Since I'm running on Power, I was facing a similar problem related to host assembly instructions on the device, but not exactly the same error. The error you are seeing is that the NVPTX target doesn't regard "x" as a valid input constraint; "x" is an x86-specific constraint, which I don't have on the Power side. The problems I was having were related to math functions on the device resolving to host math functions that contained host assembly instructions not recognized by NVPTX. This patch fixes that issue. Perhaps the inclusion of the host math.h should just be prevented for device code? Repository: rC Clang https://reviews.llvm.org/D47849
[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls
gtbercea added a comment. @Hahnfeld do you get the same error if you compile with clang++ instead of clang? Repository: rC Clang https://reviews.llvm.org/D47849
[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls
gtbercea added a comment. > The downside of this approach is that LLVM doesn't recognize these function > calls and doesn't perform optimizations to fold libcalls. For example `pow(a, > 2)` is transformed into a multiplication but `__nv_pow(a, 2)` is not. Doesn't CUDA have the same problem? Repository: rC Clang https://reviews.llvm.org/D47849
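The folding in question, shown on a small example; the function name is invented, but the transformation is standard LLVM libcall simplification:

```
#include <math.h>

/* At -O2, LLVM recognizes the standard pow and folds pow(a, 2.0) into
   a * a. An opaque call such as __nv_pow(a, 2.0) is not recognized as a
   libcall, so no such folding happens. */
double square(double a) { return pow(a, 2.0); }
```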
[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls
gtbercea added a comment. > I don't want to use a fast `pow(a, 2)`, I don't want to call a library > function for that at all. I do believe you won't end up calling a function. If you're compiling with optimizations on, this will be inlined. Repository: rC Clang https://reviews.llvm.org/D47849
[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls
gtbercea added a comment. Thanks @Hahnfeld for your suggestions. Unfortunately, to do the lowering in the backend, one would need to replace the math function calls with calls to libdevice functions. I have not been able to do that in an elegant way. Encoding the interface to libdevice is just not a clean process, not to mention that any changes to libdevice will have to be tracked manually with every new CUDA version. It does not make the code more maintainable; on the contrary, I think it makes it harder to track libdevice changes. On the same note, clang-cuda doesn't do the pow(a,2) -> a*a optimization, I checked. It is something that needs to be fixed for Clang-CUDA first before OpenMP can make use of it. The OpenMP NVPTX toolchain is designed to exist on top of the CUDA toolchain. It therefore inherits all the Clang-CUDA benefits and, in this particular case, limitations. As for the Sema check error you report (the one related to the "x" constraint), I think the fix you proposed is good and should be pushed in a separate patch. Repository: rC Clang https://reviews.llvm.org/D47849
[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls
gtbercea added a comment. Just to address any generality concerns: This patch fixes the problem of calling libdevice math functions for all platform combinations. It ensures that the OpenMP NVPTX target region will NOT call any host math functions (whichever host that may be) IF equivalent device functions are available. I think there was some confusion regarding header file inclusion. This patch does not address any issues that might arise from the user including header files (be it math.h or some other header). Any failure related to header file inclusion (such as the reported "x" constraint issue on x86) is unrelated to what this patch aims to do. Before the functionality in this patch can kick in, any user-included headers must successfully pass all checks in place for the NVPTX toolchain. A fix in the direction of the one proposed in one of the comments above is probably required. The fix would also need its own separate patch. Repository: rC Clang https://reviews.llvm.org/D47849
[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls
gtbercea updated this revision to Diff 160598. gtbercea added a comment. Herald added a subscriber: jholewinski. Add __NO_MATH_INLINES macro for the NVPTX toolchain to prevent any host assembly from seeping onto the device. Repository: rC Clang https://reviews.llvm.org/D47849 Files: include/clang/Driver/ToolChain.h lib/Basic/Targets/NVPTX.cpp lib/Driver/ToolChains/Clang.cpp lib/Driver/ToolChains/Cuda.cpp lib/Driver/ToolChains/Cuda.h lib/Headers/CMakeLists.txt lib/Headers/__clang_cuda_device_functions.h lib/Headers/__clang_cuda_libdevice_declares.h test/CodeGen/nvptx_device_math_functions.c test/Driver/openmp-offload-gpu.c Index: test/Driver/openmp-offload-gpu.c === --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offload-gpu.c @@ -76,9 +76,9 @@ // RUN: -no-canonical-prefixes -save-temps %t.o -fopenmp-use-target-bundling 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-CUBIN-UNBUNDLING-NVLINK %s -/// Use DAG to ensure that cubin file has been unbundled. +/// Use DAG to ensure that object file has not been unbundled. // CHK-CUBIN-UNBUNDLING-NVLINK-DAG: nvlink{{.*}}" {{.*}}"[[CUBIN:.*\.cubin]]" -// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: clang-offload-bundler{{.*}}" "-type=o" {{.*}}"-outputs={{.*}}[[CUBIN]] +// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: clang-offload-bundler{{.*}}" "-type=o" {{.*}}[[CUBIN]] // CHK-CUBIN-UNBUNDLING-NVLINK-DAG-SAME: "-unbundle" /// ### Index: test/CodeGen/nvptx_device_math_functions.c === --- /dev/null +++ test/CodeGen/nvptx_device_math_functions.c @@ -0,0 +1,20 @@ +// Test calling of device math functions. +///==/// + +// RUN: %clang -fmath-errno -S -emit-llvm -o - %s -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda | FileCheck -check-prefix CHECK-YES %s + +void test_sqrt(double a1) { + #pragma omp target + { +// CHECK-YES: call double @llvm.nvvm.sqrt.rn.d(double +double l1 = sqrt(a1); + } +} + +void test_pow(float a0, double a1, long double a2) { + #pragma omp target + { +// CHECK-YES: call double @__internal_accurate_pow(double +double l1 = pow(a1, a1); + } +} Index: lib/Headers/__clang_cuda_libdevice_declares.h === --- lib/Headers/__clang_cuda_libdevice_declares.h +++ lib/Headers/__clang_cuda_libdevice_declares.h @@ -24,443 +24,455 @@ #ifndef __CLANG_CUDA_LIBDEVICE_DECLARES_H__ #define __CLANG_CUDA_LIBDEVICE_DECLARES_H__ +#if defined(_OPENMP) +#define __DEVICE__ +#elif defined(__CUDA__) +#define __DEVICE__ __device__ +#endif + +#if defined(__cplusplus) extern "C" { +#endif -__device__ int __nv_abs(int __a); -__device__ double __nv_acos(double __a); -__device__ float __nv_acosf(float __a); -__device__ double __nv_acosh(double __a); -__device__ float __nv_acoshf(float __a); -__device__ double __nv_asin(double __a); -__device__ float __nv_asinf(float __a); -__device__ double __nv_asinh(double __a); -__device__ float __nv_asinhf(float __a); -__device__ double __nv_atan2(double __a, double __b); -__device__ float __nv_atan2f(float __a, float __b); -__device__ double __nv_atan(double __a); -__device__ float __nv_atanf(float __a); -__device__ double __nv_atanh(double __a); -__device__ float __nv_atanhf(float __a); -__device__ int __nv_brev(int __a); -__device__ long long __nv_brevll(long long __a); -__device__ int __nv_byte_perm(int __a, int __b, int __c); -__device__ double __nv_cbrt(double __a); -__device__ float __nv_cbrtf(float __a); -__device__ double __nv_ceil(double __a); -__device__ float __nv_ceilf(float __a); -__device__ int __nv_clz(int __a); -__device__ int __nv_clzll(long long __a); -__device__ double __nv_copysign(double __a, double __b); -__device__ float 
__nv_copysignf(float __a, float __b); -__device__ double __nv_cos(double __a); -__device__ float __nv_cosf(float __a); -__device__ double __nv_cosh(double __a); -__device__ float __nv_coshf(float __a); -__device__ double __nv_cospi(double __a); -__device__ float __nv_cospif(float __a); -__device__ double __nv_cyl_bessel_i0(double __a); -__device__ float __nv_cyl_bessel_i0f(float __a); -__device__ double __nv_cyl_bessel_i1(double __a); -__device__ float __nv_cyl_bessel_i1f(float __a); -__device__ double __nv_dadd_rd(double __a, double __b); -__device__ double __nv_dadd_rn(double __a, double __b); -__device__ double __nv_dadd_ru(double __a, double __b); -__device__ double __nv_dadd_rz(double __a, double __b); -__device__ double __nv_ddiv_rd(double __a, double __b); -__device__ double __nv_ddiv_rn(double __a, double __b); -__device__ double __nv_ddiv_ru(double __a, double __b); -__device__ double __nv_ddiv_rz(double __a, double __b); -__device__ double __nv_dmul_rd(double __a, double __b); -__device__ double __nv_dmul_rn(dou
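The NVPTX.cpp hunk is not shown above; as a hedged sketch, the new macro presumably lands in the target's predefined-macro hook along these lines (placement assumed, not confirmed by the visible diff):

```
// Sketch of lib/Basic/Targets/NVPTX.cpp, assuming the usual
// getTargetDefines hook. Defining __NO_MATH_INLINES makes glibc's
// bits/mathinline.h skip its host inline-assembly definitions when
// compiling for the NVPTX device.
Builder.defineMacro("__NO_MATH_INLINES");
```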
[PATCH] D50845: [CUDA/OpenMP] Define only some host macros during device compilation
gtbercea added a comment. In https://reviews.llvm.org/D50845#1202973, @ABataev wrote: > >> If I understand it correctly, the root cause of this exercise is that we > >> want to compile for GPU using plain C. CUDA avoids this issue by > >> separating device and host code via target attributes and clang has few > >> special cases to ignore inline assembly errors in the host code if we're > >> compiling for device. For OpenMP there's no such separation, not in the > >> system headers, at least. > > > > Yes, that's one of the nice properties of CUDA (for the compiler). There > > used to be the same restriction for OpenMP where all functions used in > > `target` regions needed to be put in `declare target`. However that was > > relaxed in favor of implicitly marking all **called** functions in that TU > > to be `declare target`. > > So ideally I think Clang should determine which functions are really > > `declare target` (either explicit or implicit) and only run semantical > > analysis on them. If a function is then found to be "broken" it's perfectly > > desirable to error back to the user. > > It is not possible for OpenMP because we support implicit declare target > functions. Clang cannot identify whether the function is going to be used on > the device or not during sema analysis. Sounds like that is a recipe for just disabling sema analysis for all implicit declare target functions. Repository: rC Clang https://reviews.llvm.org/D50845
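A short example of the implicit `declare target` behavior under discussion (names invented): any function called from a target region in the same TU is implicitly treated as a device function, so Sema cannot tell up front which functions are host-only:

```
/* 'helper' carries no '#pragma omp declare target', yet it must be
   compiled for the device because a target region calls it. */
static int helper(int x) { return x + 1; }

void run(int *r) {
#pragma omp target map(from : r[0:1])
  { r[0] = helper(41); }
}
```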
[PATCH] D50845: [CUDA/OpenMP] Define only some host macros during device compilation
gtbercea added a comment. In https://reviews.llvm.org/D50845#1202991, @hfinkel wrote: > In https://reviews.llvm.org/D50845#1202965, @Hahnfeld wrote: > > > In https://reviews.llvm.org/D50845#1202963, @hfinkel wrote: > > > > > As a result, we should really have a separate header that has those > > > actually-available functions. When targeting NVPTX, why don't we have the > > > included math.h be CUDA's math.h? In the end, those are the functions we > > > need to call when we generate code. Right? > > > > > > That's what https://reviews.llvm.org/D47849 deals with. > > > Yes, but it doesn't get CUDA's math.h. Maybe I misunderstand how this works > (and I very well might, because it's not clear that CUDA has a math.h by that > name), but that patch tries to avoid problems with the host's math.h and then > also injects __clang_cuda_device_functions.h into the device compilation. How > does this compare to when you include math.h in Clang's CUDA mode? It seems > to be that we want to somehow map standard includes, where applicable, to > include files in CUDA's include/crt directory (e.g., crt/math_functions.h and > crt/common_functions.h for stdio.h for printf), and nothing else ends up > being available (because it is, in fact, not available). There's no CUDA-specific math.h unless you want to regard __clang_cuda_device_functions.h as a math header. The patch uses the same approach as CUDA, redirecting the function calls to device-specific function calls. The parts of that patch which deal with host header compatibility would more naturally belong in a patch like this one, so ultimately they won't be part of that patch. I'm currently working on improving the patch by eliminating the __clang_cuda_device_functions.h injection and eliminating the need to disable the built-ins. Repository: rC Clang https://reviews.llvm.org/D50845
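A hedged sketch of the redirection pattern in `__clang_cuda_device_functions.h` (two functions only; the `__DEVICE__` stand-in and the extern declarations are written out here so the fragment is self-contained, and the real header's expansion differs):

```
#define __DEVICE__ static inline /* stand-in for the header's expansion */
extern double __nv_sin(double);
extern float __nv_sinf(float);

/* The real header wraps the full libm surface in this style. */
__DEVICE__ double sin(double __a) { return __nv_sin(__a); }
__DEVICE__ float sinf(float __a) { return __nv_sinf(__a); }
```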
[PATCH] D47394: [OpenMP][Clang][NVPTX] Replace bundling with partial linking for the OpenMP NVPTX device offloading toolchain
gtbercea created this revision. gtbercea added reviewers: Hahnfeld, hfinkel, caomhin, carlo.bertolli, tra. Herald added subscribers: cfe-commits, guansong. So far, the clang-offload-bundler has been the default tool for bundling together the various file types produced by the different OpenMP offloading toolchains supported by Clang. It does a great job for file types such as .bc, .ll, .ii, .ast. It is also used for bundling object files. Object files are special: in this case, they contain sections meant to be executed on devices other than the host (as is the case for the OpenMP NVPTX toolchain). The bundling of object files prevents: - STATIC LINKING: These bundled object files can be part of static libraries, which means that the object file requires an unbundling step. If an object file in a static library requires "unbundling", then we need to know the whereabouts of that library and of its files before the actual link step, which makes it impossible to do static linking using the "-L/path/to/lib/folder -labc" flags. - INTEROPERABILITY WITH OTHER COMPILERS: These bundled object files can end up being passed between Clang and other compilers, which may lead to incompatibilities: passing a bundled file from Clang to another compiler leaves that compiler unable to unbundle it, and passing an unbundled object file to Clang leaves Clang unaware that it doesn't need to unbundle it. **Goal:** Disable the use of the clang-offload-bundler for bundling/unbundling object files which contain OpenMP NVPTX device offloaded code. This applies to the case where the following set of flags is passed to Clang: -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda When the above condition is not met, the compiler works as it does today by invoking the clang-offload-bundler for bundling/unbundling object files (at the cost of static linking and interoperability). The clang-offload-bundler usage on files other than object files is not affected by this patch. **Extensibility** Although this patch disables bundling/unbundling of object files via the clang-offload-bundler for the OpenMP NVPTX device offloading toolchain ONLY, this functionality can be extended to other platforms/systems where: - the device toolchain can produce a host-compatible object AND - partial linking of host objects is supported. **The solution:** The solution enables the OpenMP NVPTX toolchain to produce an object file which is host-compatible (when compiling with -c). The host-compatible file is produced using several steps: Step 1 (already exists): invoke PTXAS on the .s file to obtain a .cubin. Step 2 (new step): invoke the fatbinary tool (this tool comes with every standard CUDA installation), which creates a CUDA fatbinary that contains both the PTX code (the .s file) and the .cubin file. The same tool can wrap the resulting .fatbin file in a C/C++ wrapper, thus creating a .fatbin.c file. Step 3 (new step): call clang++ on the .fatbin.c file to create a .o file which is host-compatible. Once this device-side host-compatible file is produced for the NVPTX toolchain, one further step is needed: Step 4 (new step): invoke a linker supporting partial linking (currently "ld -r") to link the host-compatible object file against the original host object file and end up with one single object file, which can now safely be passed to another compiler or included in a static library. **Passing the final object file to clang:** This file doesn't require unbundling, so a call to "clang-offload-bundler --unbundle" is NOT required.
The compiler needs to be notified that the object file contains an "offloaded device part" by using: "-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda". This will invoke the OpenMP NVPTX toolchain, which will call only NVLINK on this file. **Passing the final object file to clang inside a static lib "libabc.a" passed to clang via "-L/path/to/lib/folder -labc":** Call clang with "-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda" to trigger the NVPTX toolchain. The -L path, along with -labc, will be passed to NVLINK, which will perform the "static linking". Repository: rC Clang https://reviews.llvm.org/D47394 Files: include/clang/Driver/Action.h include/clang/Driver/Compilation.h include/clang/Driver/Driver.h include/clang/Driver/ToolChain.h lib/Driver/Action.cpp lib/Driver/Compilation.cpp lib/Driver/Driver.cpp lib/Driver/ToolChain.cpp lib/Driver/ToolChains/Clang.cpp lib/Driver/ToolChains/Clang.h lib/Driver/ToolChains/Cuda.cpp test/Driver/openmp-offload-gpu.c test/Driver/openmp-offload.c Index: test/Driver/openmp-offload.c === --- test/Driver/openmp-offload.c +++ test/Driver/openmp-offload.c @@ -480,13 +480,13 @@ // Create host object and bundle. // CHK-BUJOBS: clang{{.*}}" "-cc1" "-triple" "powerpc64le--linux" "-emit-obj" {{.*}}"-fopenmp" {{.*}}"-o" " // CHK-BUJOBS-SAME: [[HOSTOBJ:[^\\/]+\
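Summarized in the RUN-line style of the driver tests, the four steps amount to roughly the following tool sequence; all file names and the sm_35 architecture are invented for the illustration, and the exact flags may differ from what the driver emits:

```
// Hypothetical manual replay of the new device-side pipeline:
// RUN: ptxas -m64 --gpu-name sm_35 --output-file t.cubin t.s
// RUN: fatbinary --cuda --device-c --create=t.fatbin --embedded-fatbin=t.fatbin.c \
// RUN:   --image=profile=compute_35,file=t.s --image=profile=sm_35,file=t.cubin
// RUN: clang++ -c t.fatbin.c -o t-device.o
// RUN: ld -r t-device.o t-host.o -o t.o
```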
[PATCH] D47394: [OpenMP][Clang][NVPTX] Replace bundling with partial linking for the OpenMP NVPTX device offloading toolchain
gtbercea updated this revision to Diff 148677. Repository: rC Clang https://reviews.llvm.org/D47394 Files: include/clang/Driver/Action.h include/clang/Driver/Compilation.h include/clang/Driver/Driver.h include/clang/Driver/ToolChain.h lib/Driver/Action.cpp lib/Driver/Compilation.cpp lib/Driver/Driver.cpp lib/Driver/ToolChain.cpp lib/Driver/ToolChains/Clang.cpp lib/Driver/ToolChains/Clang.h lib/Driver/ToolChains/Cuda.cpp test/Driver/openmp-offload-gpu.c test/Driver/openmp-offload.c Index: test/Driver/openmp-offload.c === --- test/Driver/openmp-offload.c +++ test/Driver/openmp-offload.c @@ -480,13 +480,13 @@ // Create host object and bundle. // CHK-BUJOBS: clang{{.*}}" "-cc1" "-triple" "powerpc64le--linux" "-emit-obj" {{.*}}"-fopenmp" {{.*}}"-o" " // CHK-BUJOBS-SAME: [[HOSTOBJ:[^\\/]+\.o]]" "-x" "ir" "{{.*}}[[HOSTBC]]" -// CHK-BUJOBS: clang-offload-bundler{{.*}}" "-type=o" "-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs= +// CHK-BUJOBS: clang-offload-bundler{{.*}}" "-type=o"{{.*}}"-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs= // CHK-BUJOBS-SAME: [[RES:[^\\/]+\.o]]" "-inputs={{.*}}[[T1OBJ]],{{.*}}[[T2OBJ]],{{.*}}[[HOSTOBJ]]" // CHK-BUJOBS-ST: clang{{.*}}" "-cc1" "-triple" "powerpc64le--linux" "-S" {{.*}}"-fopenmp" {{.*}}"-o" " // CHK-BUJOBS-ST-SAME: [[HOSTASM:[^\\/]+\.s]]" "-x" "ir" "{{.*}}[[HOSTBC]]" // CHK-BUJOBS-ST: clang{{.*}}" "-cc1as" "-triple" "powerpc64le--linux" "-filetype" "obj" {{.*}}"-o" " // CHK-BUJOBS-ST-SAME: [[HOSTOBJ:[^\\/]+\.o]]" "{{.*}}[[HOSTASM]]" -// CHK-BUJOBS-ST: clang-offload-bundler{{.*}}" "-type=o" "-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs= +// CHK-BUJOBS-ST: clang-offload-bundler{{.*}}" "-type=o"{{.*}}"-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs= // CHK-BUJOBS-ST-SAME: [[RES:[^\\/]+\.o]]" "-inputs={{.*}}[[T1OBJ]],{{.*}}[[T2OBJ]],{{.*}}[[HOSTOBJ]]" /// ### Index: test/Driver/openmp-offload-gpu.c === --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offload-gpu.c @@ -66,24 +66,29 @@ // CHK-PTXAS-CUBIN-BUNDLING: clang{{.*}}" "-o" "[[PTX:.*\.s]]" // CHK-PTXAS-CUBIN-BUNDLING-NEXT: ptxas{{.*}}" "--output-file" "[[CUBIN:.*\.cubin]]" {{.*}}"[[PTX]]" -// CHK-PTXAS-CUBIN-BUNDLING: clang-offload-bundler{{.*}}" "-type=o" {{.*}}"-inputs={{.*}}[[CUBIN]] +// CHK-PTXAS-CUBIN-BUNDLING: fatbinary{{.*}}" "--create=[[FATBIN:.*\.fatbin]]" " +// CHK-PTXAS-CUBIN-BUNDLING-SAME: --embedded-fatbin=[[FATBINC:.*\.fatbin.c]]" " +// CHK-PTXAS-CUBIN-BUNDLING-SAME: --cmdline=--compile-only" "--image=profile={{.*}}[[PTX]]" " +// CHK-PTXAS-CUBIN-BUNDLING-SAME: --image=profile={{.*}}file=[[CUBIN]]" "--cuda" "--device-c" +// CHK-PTXAS-CUBIN-BUNDLING: clang++{{.*}}" "-c" "-o" "[[HOSTDEV:.*\.o]]"{{.*}}" "[[FATBINC]]" "-D__NV_MODULE_ID= +// CHK-PTXAS-CUBIN-BUNDLING-NOT: clang-offload-bundler{{.*}}" "-type=o" {{.*}}"-inputs={{.*}}[[CUBIN]] +// CHK-PTXAS-CUBIN-BUNDLING: ld" "-r" "[[HOSTDEV]]" "{{.*}}.o" "-o" "{{.*}}.o" /// ### -/// Check cubin file unbundling and usage by nvlink +/// Check object file unbundling is not happening when skipping bundler // RUN: touch %t.o // RUN: %clang -### -target powerpc64le-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ // RUN: -no-canonical-prefixes -save-temps %t.o 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-CUBIN-UNBUNDLING-NVLINK %s -/// Use DAG to ensure that cubin file has been unbundled. 
-// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: nvlink{{.*}}" {{.*}}"[[CUBIN:.*\.cubin]]" -// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: clang-offload-bundler{{.*}}" "-type=o" {{.*}}"-outputs={{.*}}[[CUBIN]] -// CHK-CUBIN-UNBUNDLING-NVLINK-DAG-SAME: "-unbundle" +/// Use DAG to ensure that object file has not been unbundled. +// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: nvlink{{.*}}" {{.*}}"[[OBJ:.*\.o]]" +// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: ld{{.*}}" {{.*}}"[[OBJ]]" /// ### -/// Check cubin file generation and usage by nvlink +/// Check object file generation is not happening when skipping bundler // RUN: touch %t1.o // RUN: touch %t2.o // RUN: %clang -### -no-canonical-prefixes -target powerpc64le-unknown-linux-gnu -fopenmp=libomp \ @@ -94,7 +99,7 @@ // RUN: -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-TWOCUBIN %s -// CHK-TWOCUBIN: nvlink{{.*}}openmp-offload-{{.*}}.cubin" "{{.*}}openmp-offload-{{.*}}.cubin" +// CHK-TWOCUBIN: nvlink{{.*}}openmp-offload-{{.*}}.o" "{{.*}}o
[PATCH] D47394: [OpenMP][Clang][NVPTX] Replace bundling with partial linking for the OpenMP NVPTX device offloading toolchain
gtbercea added inline comments. Comment at: test/Driver/openmp-offload.c:497 // RUN: %clang -### -fopenmp=libomp -o %t.out -lsomelib -target powerpc64le-linux -fopenmp-targets=powerpc64le-ibm-linux-gnu,x86_64-pc-linux-gnu %t.i -no-canonical-prefixes 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-UBJOBS %s // RUN: %clang -### -fopenmp=libomp -o %t.out -lsomelib -target powerpc64le-linux -fopenmp-targets=powerpc64le-ibm-linux-gnu,x86_64-pc-linux-gnu %t.i -save-temps -no-canonical-prefixes 2>&1 \ gtbercea wrote: > sfantao wrote: > > We need a test for the static linking. The host linker has to be nvcc in > > that case, right? > The host linker is "ld". The "bundling" step is replaced (in the case of > OpenMP NVPTX device offloading only) by a call to "ld -r" to partially link > the 2 object files: the object file produced by the HOST toolchain and the > object file produced by the OpenMP NVPTX device offloading toolchain (because > we want to produce a single output). nvcc is not called at all in this patch. Repository: rC Clang https://reviews.llvm.org/D47394
[PATCH] D47394: [OpenMP][Clang][NVPTX] Replace bundling with partial linking for the OpenMP NVPTX device offloading toolchain
gtbercea added inline comments. Comment at: test/Driver/openmp-offload.c:497 // RUN: %clang -### -fopenmp=libomp -o %t.out -lsomelib -target powerpc64le-linux -fopenmp-targets=powerpc64le-ibm-linux-gnu,x86_64-pc-linux-gnu %t.i -no-canonical-prefixes 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-UBJOBS %s // RUN: %clang -### -fopenmp=libomp -o %t.out -lsomelib -target powerpc64le-linux -fopenmp-targets=powerpc64le-ibm-linux-gnu,x86_64-pc-linux-gnu %t.i -save-temps -no-canonical-prefixes 2>&1 \ sfantao wrote: > We need a test for the static linking. The host linker has to be nvcc in that > case, right? The host linker is "ld". The "bundling" step is replaced (in the case of OpenMP NVPTX device offloading only) by a call to "ld -r" to partially link the 2 object files: the object file produced by the HOST toolchain and the object file produced by the OpenMP NVPTX device offloading toolchain (because we want to produce a single output). Repository: rC Clang https://reviews.llvm.org/D47394
[PATCH] D47394: [OpenMP][Clang][NVPTX] Replace bundling with partial linking for the OpenMP NVPTX device offloading toolchain
gtbercea added inline comments. Comment at: lib/Driver/ToolChains/Cuda.cpp:536 + } } sfantao wrote: > What prevents all this from being done in the bundler? If I understand it > correctly, if the bundler implements this wrapping all the checks for > libraries wouldn't be required and only two changes would be required in > the driver: > > - generate fatbin instead of cubin. This is straightforward to do by changing > the device assembling job. In terms of the loading of the kernels by the > device API, doing it through fatbin or cubin should be equivalent except that > fatbin enables storing the PTX format and JIT for newer GPUs. > - Use NVIDIA linker as host linker. > > This last requirement could be problematic if we get two targets attempting > to use different (incompatible) linkers. If we get this kind of > incompatibility we should get the appropriate diagnostic. What prevents it is the fact that the bundler is called AFTER the HOST and DEVICE object files have been produced. The creation of the fatbin (FATBINARY + CLANG++) needs to happen within the NVPTX toolchain. Repository: rC Clang https://reviews.llvm.org/D47394
[PATCH] D47394: [OpenMP][Clang][NVPTX] Replace bundling with partial linking for the OpenMP NVPTX device offloading toolchain
gtbercea added a comment. In https://reviews.llvm.org/D47394#1114848, @sfantao wrote: > Just to clarify one thing in my last comment: > > When I say that we didn't aim at having clang compatible with other > compilers, I mean the OpenMP offloading descriptors, where all the variables > and offloading entry points are. Of course we want to allow the resulting > binaries to be compatible with linkers taking inputs of other compilers, so > that you can have, e.g., OpenMP and CUDA supported in the same executable, > even though working independently. Today you will have trouble linking against a Clang object file using another compiler that doesn't know anything about the clang-offload-bundler. Repository: rC Clang https://reviews.llvm.org/D47394
[PATCH] D37913: [OpenMP] Enable the existing nocudalib flag for OpenMP offloading toolchain.
gtbercea updated this revision to Diff 116621. gtbercea added a comment. Split line. https://reviews.llvm.org/D37913 Files: lib/Driver/ToolChains/Cuda.cpp test/Driver/openmp-offload-gpu.c Index: test/Driver/openmp-offload-gpu.c === --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offload-gpu.c @@ -125,3 +125,13 @@ // RUN: | FileCheck -check-prefix=CHK-PTXAS-RELO %s // CHK-PTXAS-RELO: ptxas{{.*}}" "-c" + +/// ### + +/// Check that error is not thrown by toolchain when no cuda lib flag is used. +/// Check that the flag is passed when -fopenmp-relocatable-target is used. +// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 \ +// RUN: -nocudalib -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHK-FLAG-NOLIBDEVICE %s + +// CHK-FLAG-NOLIBDEVICE-NOT: error:{{.*}}sm_60 Index: lib/Driver/ToolChains/Cuda.cpp === --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -492,11 +492,11 @@ if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals, options::OPT_fno_cuda_approx_transcendentals, false)) CC1Args.push_back("-fcuda-approx-transcendentals"); - -if (DriverArgs.hasArg(options::OPT_nocudalib)) - return; } + if (DriverArgs.hasArg(options::OPT_nocudalib)) +return; + std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch); if (LibDeviceFile.empty()) {
[PATCH] D37914: [OpenMP] Don't throw cudalib not found error if only front-end is required.
gtbercea reopened this revision. gtbercea added a comment. This revision is now accepted and ready to land. Open. Repository: rL LLVM https://reviews.llvm.org/D37914
[PATCH] D38040: [OpenMP] Add an additional test for D34888
gtbercea updated this revision to Diff 116664. gtbercea added a comment. Fix test. https://reviews.llvm.org/D38040 Files: test/OpenMP/target_map_codegen.cpp Index: test/OpenMP/target_map_codegen.cpp === --- test/OpenMP/target_map_codegen.cpp +++ test/OpenMP/target_map_codegen.cpp @@ -4554,3 +4554,33 @@ } #endif #endif + +///==/// +// RUN: %clang -DCK30 -std=c++11 -fopenmp -S -emit-llvm -fopenmp-targets=nvptx64-nvidia-cuda %s -o - 2>&1 \ +// RUN: | FileCheck -check-prefix=CK30 %s + +#ifdef CK30 + +void target_maps_parallel_integer(int a){ + int ParamToKernel = a; +#pragma omp target map(tofrom: ParamToKernel) + { +ParamToKernel += 1; + } +} + +// CK30: {{.*}}void @__omp_offloading_{{.*}}(i32* dereferenceable(4) %ParamToKernel) + +// CK30: {{.*}}void {{.*}}target_maps_parallel_integer{{.*}} { + +// CK30: [[GEPOBP:%.+]] = getelementptr inbounds {{.*}}offload_baseptrs +// CK30: [[GEPOBPBIT:%.+]] = bitcast i8** [[GEPOBP]] +// CK30: store i32* %ParamToKernel, i32** [[GEPOBPBIT]] +// CK30: [[GEPOP:%.+]] = getelementptr inbounds {{.*}}offload_ptrs +// CK30: [[GEPOPBIT:%.+]] = bitcast i8** [[GEPOP]] +// CK30: store i32* %ParamToKernel, i32** [[GEPOPBIT]] +// CK30: [[GEPOBPARG:%.+]] = getelementptr inbounds {{.*}}offload_baseptrs +// CK30: [[GEPOPARG:%.+]] = getelementptr inbounds {{.*}}offload_ptrs +// CK30: call {{.*}}tgt_target({{.*}}i8** [[GEPOBPARG]], i8** [[GEPOPARG]] + +#endif
[PATCH] D38040: [OpenMP] Add an additional test for D34888
gtbercea updated this revision to Diff 116671. gtbercea added a comment.

Add nocudalib flag.

https://reviews.llvm.org/D38040

Files:
  test/OpenMP/target_map_codegen.cpp

Index: test/OpenMP/target_map_codegen.cpp
===
--- test/OpenMP/target_map_codegen.cpp
+++ test/OpenMP/target_map_codegen.cpp
@@ -4554,3 +4554,33 @@
 }
 #endif
 #endif
+
+///==///
+// RUN: %clang -DCK30 -std=c++11 -fopenmp -S -emit-llvm -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -nocudalib %s -o - 2>&1 | FileCheck -check-prefix=CK30 %s
+
+#ifdef CK30
+
+void target_maps_parallel_integer(int a){
+  int ParamToKernel = a;
+#pragma omp target map(tofrom: ParamToKernel)
+  {
+    ParamToKernel += 1;
+  }
+}
+
+// CK30: {{.*}}void @__omp_offloading_{{.*}}(i32* dereferenceable(4) %ParamToKernel)
+
+// CK30: {{.*}}void {{.*}}target_maps_parallel_integer{{.*}} {
+
+// CK30: [[GEPOBP:%.+]] = getelementptr inbounds {{.*}}offload_baseptrs
+// CK30: [[GEPOBPBIT:%.+]] = bitcast i8** [[GEPOBP]]
+// CK30: store i32* %ParamToKernel, i32** [[GEPOBPBIT]]
+// CK30: [[GEPOP:%.+]] = getelementptr inbounds {{.*}}offload_ptrs
+// CK30: [[GEPOPBIT:%.+]] = bitcast i8** [[GEPOP]]
+// CK30: store i32* %ParamToKernel, i32** [[GEPOPBIT]]
+// CK30: [[GEPOBPARG:%.+]] = getelementptr inbounds {{.*}}offload_baseptrs
+// CK30: [[GEPOPARG:%.+]] = getelementptr inbounds {{.*}}offload_ptrs
+// CK30: call {{.*}}tgt_target({{.*}}i8** [[GEPOBPARG]], i8** [[GEPOPARG]]
+
+#endif
[PATCH] D38040: [OpenMP] Add an additional test for D34888
gtbercea reopened this revision. gtbercea added a comment. This revision is now accepted and ready to land. Open https://reviews.llvm.org/D38040
[PATCH] D29660: [OpenMP] Add flag for overwriting default PTX version for OpenMP targets
gtbercea reopened this revision. gtbercea added a comment. This revision is now accepted and ready to land. Open Repository: rL LLVM https://reviews.llvm.org/D29660
[PATCH] D38040: [OpenMP] Add an additional test for D34888
gtbercea reopened this revision. gtbercea added a comment. This revision is now accepted and ready to land. Open https://reviews.llvm.org/D38040
[PATCH] D38040: [OpenMP] Add an additional test for D34888
gtbercea updated this revision to Diff 116747. gtbercea added a comment.

Fix test.

https://reviews.llvm.org/D38040

Files:
  test/OpenMP/openmp_offload_codegen.cpp

Index: test/OpenMP/openmp_offload_codegen.cpp
===
--- /dev/null
+++ test/OpenMP/openmp_offload_codegen.cpp
@@ -0,0 +1,36 @@
+// Test device mapping codegen.
+///==///
+
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -S -emit-llvm %s -o - 2>&1 \
+// RUN: | FileCheck -check-prefix=CK1 %s
+
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1-DEVICE
+
+// expected-no-diagnostics
+
+#ifdef CK1
+
+void target_maps_parallel_integer(int a){
+  int ParamToKernel = a;
+#pragma omp target map(tofrom: ParamToKernel)
+  {
+    ParamToKernel += 1;
+  }
+}
+
+// CK1-DEVICE: {{.*}}void @__omp_offloading_{{.*}}(i32* dereferenceable(4){{.*}}
+
+// CK1: {{.*}}void {{.*}}target_maps_parallel_integer{{.*}} {
+
+// CK1: [[GEPOBP:%.+]] = getelementptr inbounds {{.*}}
+// CK1: [[GEPOBPBIT:%.+]] = bitcast i8** [[GEPOBP]]
+// CK1: store i32* %ParamToKernel, i32** [[GEPOBPBIT]]
+// CK1: [[GEPOP:%.+]] = getelementptr inbounds {{.*}}
+// CK1: [[GEPOPBIT:%.+]] = bitcast i8** [[GEPOP]]
+// CK1: store i32* %ParamToKernel, i32** [[GEPOPBIT]]
+// CK1: [[GEPOBPARG:%.+]] = getelementptr inbounds {{.*}}
+// CK1: [[GEPOPARG:%.+]] = getelementptr inbounds {{.*}}
+// CK1: call {{.*}}tgt_target({{.*}}i8** [[GEPOBPARG]], i8** [[GEPOPARG]]
+
+#endif
\ No newline at end of file
[PATCH] D38257: [OpenMP] Fix memory leak when translating arguments
gtbercea accepted this revision. gtbercea added a comment. This revision is now accepted and ready to land. LGTM https://reviews.llvm.org/D38257
[PATCH] D38258: [OpenMP] Fix passing of -m arguments to device toolchain
gtbercea added inline comments. Comment at: test/Driver/openmp-offload.c:89 +/// ### + /// Check the phases graph when using a single target, different from the host. Shouldn't these tests be in the gpu test file? https://reviews.llvm.org/D38258
[PATCH] D38259: [OpenMP] Fix translation of target args
gtbercea accepted this revision. gtbercea added a comment. This revision is now accepted and ready to land. LGTM https://reviews.llvm.org/D38259
[PATCH] D38258: [OpenMP] Fix passing of -m arguments to device toolchain
gtbercea accepted this revision. gtbercea added a comment. This revision is now accepted and ready to land. LGTM Comment at: test/Driver/openmp-offload.c:89 +/// ### + /// Check the phases graph when using a single target, different from the host. Hahnfeld wrote: > gtbercea wrote: > > Shouldn't these tests be in the gpu test file? > There is nothing specific to GPUs here IMO, that is why I moved the test back to this file Actually no, you're right! :) https://reviews.llvm.org/D38258
[PATCH] D38883: [CMake][OpenMP] Customize default offloading arch
gtbercea added inline comments. Comment at: lib/Driver/ToolChains/Cuda.cpp:170-182 -// This code prevents IsValid from being set when -// no libdevice has been found. -bool allEmpty = true; -std::string LibDeviceFile; -for (auto key : LibDeviceMap.keys()) { - LibDeviceFile = LibDeviceMap.lookup(key); - if (!LibDeviceFile.empty()) Hahnfeld wrote: > tra wrote: > > Hahnfeld wrote: > > > tra wrote: > > > > I'd keep this code. It appears to serve a useful purpose as it requires a CUDA installation to have at least some libdevice library in it. It gives us a chance to find a valid installation, instead of failing some time later when we ask for a libdevice file and fail because there are none. > > > We had some internal discussions about this after I submitted the patch here. The main question is: Do we want to support CUDA installations without libdevice and are there use cases for that? I'd say that the user should be able to use a toolchain without libdevice together with `-nocudalib`. > > Sounds reasonable. How about keeping the code but putting it under `if(!hasArg(nocudalib))`? > Ok, I'll do that in a separate patch and keep the code here for now. The problem with nocudalib is that if, for example, you write a test which tries to verify some device-facing feature that requires a libdevice to be found (so you don't want to use nocudalib), it will probably work on your machine, which has the correct CUDA setup, but fail on another machine which does not (which is where you want to use nocudalib). You can see the contradiction there. https://reviews.llvm.org/D38883
[PATCH] D38883: [CMake][OpenMP] Customize default offloading arch
gtbercea added inline comments. Comment at: lib/Driver/ToolChains/Cuda.cpp:170-182 -// This code prevents IsValid from being set when -// no libdevice has been found. -bool allEmpty = true; -std::string LibDeviceFile; -for (auto key : LibDeviceMap.keys()) { - LibDeviceFile = LibDeviceMap.lookup(key); - if (!LibDeviceFile.empty()) gtbercea wrote: > Hahnfeld wrote: > > tra wrote: > > > Hahnfeld wrote: > > > > tra wrote: > > > > > I'd keep this code. It appears to serve useful purpose as it requires > > > > > CUDA installation to have at least some libdevice library in it. It > > > > > gives us a change to find a valid installation, instead of ailing > > > > > some time later when we ask for a libdevice file and fail because > > > > > there are none. > > > > We had some internal discussions about this after I submitted the patch > > > > here. > > > > > > > > The main question is: Do we want to support CUDA installations without > > > > libdevice and are there use cases for that? I'd say that the user > > > > should be able to use a toolchain without libdevice together with > > > > `-nocudalib`. > > > Sounds reasonable. How about keeping the code but putting it under > > > `if(!hasArg(nocudalib))`? > > > > > Ok, I'll do that in a separate patch and keep the code here for now. > The problem with nocudalib is that if for example you write a test, which > looks to verify some device facing feature that requires a libdevice to be > found (so you don't want to use nocudalib), it will probably work on your > machine which has the correct CUDA setup but fail on another machine which > does not (which is where you want to use nocudalib). You can see the > contradiction there. Just to be clear I am arguing for keeping this code :) https://reviews.llvm.org/D38883 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D38883: [CMake][OpenMP] Customize default offloading arch
gtbercea added inline comments. Comment at: lib/Driver/ToolChains/Cuda.h:90 - } }; I would also like to keep the spirit of this code: if not in this exact form, then at least something that performs the same functionality. https://reviews.llvm.org/D38883
[PATCH] D38883: [CMake][OpenMP] Customize default offloading arch
gtbercea added inline comments. Comment at: lib/Driver/ToolChains/Cuda.cpp:170-182 -// This code prevents IsValid from being set when -// no libdevice has been found. -bool allEmpty = true; -std::string LibDeviceFile; -for (auto key : LibDeviceMap.keys()) { - LibDeviceFile = LibDeviceMap.lookup(key); - if (!LibDeviceFile.empty()) tra wrote: > tra wrote: > > gtbercea wrote: > > > gtbercea wrote: > > > > Hahnfeld wrote: > > > > > tra wrote: > > > > > > Hahnfeld wrote: > > > > > > > tra wrote: > > > > > > > > I'd keep this code. It appears to serve useful purpose as it > > > > > > > > requires CUDA installation to have at least some libdevice > > > > > > > > library in it. It gives us a change to find a valid > > > > > > > > installation, instead of ailing some time later when we ask for > > > > > > > > a libdevice file and fail because there are none. > > > > > > > We had some internal discussions about this after I submitted the > > > > > > > patch here. > > > > > > > > > > > > > > The main question is: Do we want to support CUDA installations > > > > > > > without libdevice and are there use cases for that? I'd say that > > > > > > > the user should be able to use a toolchain without libdevice > > > > > > > together with `-nocudalib`. > > > > > > Sounds reasonable. How about keeping the code but putting it under > > > > > > `if(!hasArg(nocudalib))`? > > > > > > > > > > > Ok, I'll do that in a separate patch and keep the code here for now. > > > > The problem with nocudalib is that if for example you write a test, > > > > which looks to verify some device facing feature that requires a > > > > libdevice to be found (so you don't want to use nocudalib), it will > > > > probably work on your machine which has the correct CUDA setup but fail > > > > on another machine which does not (which is where you want to use > > > > nocudalib). You can see the contradiction there. > > > Just to be clear I am arguing for keeping this code :) > > @gtbercea: I'm not sure I follow your example. If you're talking about > > clang tests, we do have fake CUDA installation setup under > > test/Driver/Inputs which removes dependency on whatever CUDA you may or may > > not have installed on your machine. I also don't see a contradiction -- you > > you do need libdevice, it makes no point picking a broken CUDA installation > > which does not have any libdevice files. If you explicitly tell compiler > > that you don't need libdevice, that would make CUDA w/o libdevice > > acceptable. With --cuda-path you do have a way to tell clang which > > installation you want it to use. What do I miss? > > > > > Ah, you were arguing with Hahnfeld@'s -nocudalib example. Then I guess we're > in violent agreement. I fully agree with this: "you do need libdevice, it makes no point picking a broken CUDA installation which does not have any libdevice files. If you explicitly tell compiler that you don't need libdevice, that would make CUDA w/o libdevice acceptable." I was trying to show an example of a situation where you have your code compiled using nocudalib on one machine and then the same code will error on a machine which requires the nocudalib flag to be passed to make up for the absence of libdevice. https://reviews.llvm.org/D38883 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D38883: [CMake][OpenMP] Customize default offloading arch
gtbercea added inline comments. Comment at: lib/Driver/ToolChains/Cuda.cpp:170-182 -// This code prevents IsValid from being set when -// no libdevice has been found. -bool allEmpty = true; -std::string LibDeviceFile; -for (auto key : LibDeviceMap.keys()) { - LibDeviceFile = LibDeviceMap.lookup(key); - if (!LibDeviceFile.empty()) gtbercea wrote: > tra wrote: > > tra wrote: > > > gtbercea wrote: > > > > gtbercea wrote: > > > > > Hahnfeld wrote: > > > > > > tra wrote: > > > > > > > Hahnfeld wrote: > > > > > > > > tra wrote: > > > > > > > > > I'd keep this code. It appears to serve useful purpose as it > > > > > > > > > requires CUDA installation to have at least some libdevice > > > > > > > > > library in it. It gives us a change to find a valid > > > > > > > > > installation, instead of ailing some time later when we ask > > > > > > > > > for a libdevice file and fail because there are none. > > > > > > > > We had some internal discussions about this after I submitted > > > > > > > > the patch here. > > > > > > > > > > > > > > > > The main question is: Do we want to support CUDA installations > > > > > > > > without libdevice and are there use cases for that? I'd say > > > > > > > > that the user should be able to use a toolchain without > > > > > > > > libdevice together with `-nocudalib`. > > > > > > > Sounds reasonable. How about keeping the code but putting it > > > > > > > under `if(!hasArg(nocudalib))`? > > > > > > > > > > > > > Ok, I'll do that in a separate patch and keep the code here for now. > > > > > The problem with nocudalib is that if for example you write a test, > > > > > which looks to verify some device facing feature that requires a > > > > > libdevice to be found (so you don't want to use nocudalib), it will > > > > > probably work on your machine which has the correct CUDA setup but > > > > > fail on another machine which does not (which is where you want to > > > > > use nocudalib). You can see the contradiction there. > > > > Just to be clear I am arguing for keeping this code :) > > > @gtbercea: I'm not sure I follow your example. If you're talking about > > > clang tests, we do have fake CUDA installation setup under > > > test/Driver/Inputs which removes dependency on whatever CUDA you may or > > > may not have installed on your machine. I also don't see a contradiction > > > -- you you do need libdevice, it makes no point picking a broken CUDA > > > installation which does not have any libdevice files. If you explicitly > > > tell compiler that you don't need libdevice, that would make CUDA w/o > > > libdevice acceptable. With --cuda-path you do have a way to tell clang > > > which installation you want it to use. What do I miss? > > > > > > > > Ah, you were arguing with Hahnfeld@'s -nocudalib example. Then I guess > > we're in violent agreement. > I fully agree with this: "you do need libdevice, it makes no point picking a > broken CUDA installation which does not have any libdevice files. If you > explicitly tell compiler that you don't need libdevice, that would make CUDA > w/o libdevice acceptable." > > I was trying to show an example of a situation where you have your code > compiled using nocudalib on one machine and then the same code will error on > a machine which requires the nocudalib flag to be passed to make up for the > absence of libdevice. > > Yes it was a counter argument to that! :) https://reviews.llvm.org/D38883 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
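Pulling the thread together, a minimal sketch of the compromise being discussed, keeping the all-empty check but gating it on -nocudalib, could look as follows (placement and surrounding names are assumed from the quoted hunk, not from a posted patch):

  // Only insist on finding at least one libdevice library when the
  // user has not explicitly opted out with -nocudalib.
  if (!DriverArgs.hasArg(options::OPT_nocudalib)) {
    bool AllEmpty = true;
    for (auto Key : LibDeviceMap.keys())
      if (!LibDeviceMap.lookup(Key).empty())
        AllEmpty = false;
    if (AllEmpty)
      return; // leave the installation marked as invalid
  }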
[PATCH] D38883: [CMake][OpenMP] Customize default offloading arch
gtbercea added inline comments. Comment at: lib/Driver/ToolChains/Cuda.h:90 - } }; gtbercea wrote: > I would also like to keep the spirit of this code: if not in this exact form, then at least something that performs the same functionality. @tra, what's your opinion on this code? Should it stay as-is, stay in a modified and more robust form, or be taken out completely? https://reviews.llvm.org/D38883
[PATCH] D38883: [CMake][OpenMP] Customize default offloading arch
gtbercea added inline comments. Comment at: lib/Driver/ToolChains/Cuda.h:90 - } }; tra wrote: > gtbercea wrote: > > gtbercea wrote: > > > I would also like to keep the spirit of this code if not in this exact > > > form at least something that performs the same functionality. > > @tra what's your opinion on this code? Should this stay, stay but modified > > to be more robust or taken out completely? > There are currently no users for this. In general, I would rather not have > magically-changing default GPU based on how broken your CUDA installation is. > IMO it would be better to keep defaults static and fail if prerequisites are > not met. I would have thought that it is up to the compiler to select, as default, the lowest viable compute capability. This is what this code aims to do (whether it actually does that's a separate issue :) ). https://reviews.llvm.org/D38883 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
[PATCH] D38883: [CMake][OpenMP] Customize default offloading arch
gtbercea added inline comments. Comment at: lib/Driver/ToolChains/Cuda.h:90 - } }; gtbercea wrote: > tra wrote: > > gtbercea wrote: > > > gtbercea wrote: > > > > I would also like to keep the spirit of this code if not in this exact > > > > form at least something that performs the same functionality. > > > @tra what's your opinion on this code? Should this stay, stay but > > > modified to be more robust or taken out completely? > > There are currently no users for this. In general, I would rather not have > > magically-changing default GPU based on how broken your CUDA installation > > is. IMO it would be better to keep defaults static and fail if > > prerequisites are not met. > I would have thought that it is up to the compiler to select, as default, the > lowest viable compute capability. This is what this code aims to do (whether > it actually does that's a separate issue :) ). > The reason I added this code in the first place was to overcome the fact that something like a default of sm_30 may work on the K40 but once you go to newer Pascal, Volta GPUs then you need a new minimum compute capability that is supported. https://reviews.llvm.org/D38883 ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
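A hedged sketch of the default-selection behaviour being argued for here; the arch names and the fallback are assumptions, and the removed code may have differed in detail:

  // Default to the lowest compute capability for which the detected
  // CUDA installation actually ships a libdevice library.
  StringRef DefaultGpuArch = "sm_30"; // assumed baseline
  for (StringRef Arch : {"sm_30", "sm_35", "sm_50", "sm_60", "sm_70"})
    if (!LibDeviceMap.lookup(Arch).empty()) {
      DefaultGpuArch = Arch;
      break;
    }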
[PATCH] D38883: [CMake][OpenMP] Customize default offloading arch
gtbercea added a comment. LGTM https://reviews.llvm.org/D38883
[PATCH] D38976: [OpenMP] Add implicit data sharing support when offloading to NVIDIA GPUs using OpenMP device offloading
gtbercea created this revision. Herald added a subscriber: jholewinski.

This patch is part of the development effort to add support in the current OpenMP GPU offloading implementation for implicitly sharing variables between a target region executed by the team master thread and the worker threads within that team.

This patch is the first of three required for successfully performing the implicit sharing of master thread variables with the worker threads within a team. The remaining two patches are:

- a patch to the LLVM NVPTX backend which ensures the lowering of shared variables to a device memory type which allows the sharing of references;
- a runtime patch to libomptarget which ensures that a list of references to shared variables is properly maintained.

A simple code snippet which illustrates an implicit data sharing situation is as follows:

  #pragma omp target
  {
    // master thread only
    int v;
    #pragma omp parallel
    {
      // worker threads
      // use v
    }
  }

Variable v is implicitly shared from the team master thread, which executes the code in between the target and parallel directives. The worker threads must operate on the latest version of v, including any updates performed by the master.

The code generated in this patch relies on the LLVM NVPTX patch (mentioned above) which prevents v from being lowered into the thread-local memory of the master thread, which would make the reference to this variable un-shareable with the workers. This ensures that the code generated by this patch is correct.

Since the parallel region is outlined, the passing of arguments to the outlined region must preserve the original order of arguments. The runtime therefore maintains a list of references to shared variables, ensuring that they are passed in the correct order. The passing of arguments to the outlined parallel function is performed in a separate function which the data sharing infrastructure constructs in this patch. The function is inlined when optimizations are enabled.
Repository: rL LLVM https://reviews.llvm.org/D38976 Files: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp lib/CodeGen/CGOpenMPRuntimeNVPTX.h test/OpenMP/nvptx_data_sharing.cpp test/OpenMP/nvptx_parallel_codegen.cpp test/OpenMP/nvptx_target_teams_codegen.cpp Index: test/OpenMP/nvptx_target_teams_codegen.cpp === --- test/OpenMP/nvptx_target_teams_codegen.cpp +++ test/OpenMP/nvptx_target_teams_codegen.cpp @@ -60,7 +60,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args) // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], @@ -146,7 +146,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args) // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], Index: test/OpenMP/nvptx_parallel_codegen.cpp === --- test/OpenMP/nvptx_parallel_codegen.cpp +++ test/OpenMP/nvptx_parallel_codegen.cpp @@ -78,7 +78,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], @@ -92,20 +92,20 @@ // // CHECK: [[EXEC_PARALLEL]] // CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]], - // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i32*, i32*)* [[PARALLEL_FN1:@.+]] to i8*) + // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN1:@.+]]_wrapper to i8*) // CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]] // // CHECK: [[EXEC_PFN1]] - // CHECK: call void [[PARALLEL_FN1]]( + // CHECK: call void [[PARALLEL_FN1]]_wrapper( // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] // // CHECK: [[CHECK_NEXT1]] // CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]], - // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i32*, i32*)* [[PARALLEL_FN2:@.+]] to i8*) + // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN2:@.+]]_wrapper to i8*) // CHECK: br i1 [[WM2]], label {{%
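To make the new calling convention concrete, here is a sketch of a wrapper under this scheme; every name is illustrative, and only the (i16, i32, i8**) shape is taken from the CHECK lines above:

  // The outlined parallel region, as emitted by OpenMP codegen.
  extern void outlined_region(int *gtid, int *btid, int *v);

  // Worker-side wrapper: shared_args holds references to the master
  // thread's variables, in their original capture order.
  void outlined_region_wrapper(short parallel_level, int thread_id,
                               void **shared_args) {
    int *v = (int *)shared_args[0];
    outlined_region(&thread_id, &thread_id, v);
  }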
[PATCH] D38978: [OpenMP] Enable the lowering of implicitly shared variables in OpenMP GPU-offloaded target regions to the GPU shared memory
gtbercea created this revision. Herald added subscribers: mgorny, jholewinski. This patch is part of the development effort to add support in the current OpenMP GPU offloading implementation for implicitly sharing variables between a target region executed by the team master thread and the worker threads within that team. This patch is the second of three required for successfully performing the implicit sharing of master thread variables with the worker threads within a team: -Patch https://reviews.llvm.org/D38976 extends the CLANG code generation with code that handles shared variables. -Patch (coming soon) extends the functionality of libomptarget to maintain a list of references to shared variables. This patch adds a shared memory stack to the prolog of the kernel function representing the device offloaded OpenMP target region. The new passes along with the changes to existing ones, ensure that any OpenMP variable which needs to be shared across several threads will be allocated in this new stack, in the shared memory of the device. This patch covers the case of sharing variables from the master thread to the worker threads: #pragma omp target { // master thread only int v; #pragma omp parallel { // worker threads // use v } } Repository: rL LLVM https://reviews.llvm.org/D38978 Files: include/llvm/CodeGen/TargetPassConfig.h lib/CodeGen/TargetPassConfig.cpp lib/Target/NVPTX/CMakeLists.txt lib/Target/NVPTX/NVPTX.h lib/Target/NVPTX/NVPTXAsmPrinter.cpp lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp lib/Target/NVPTX/NVPTXFrameLowering.cpp lib/Target/NVPTX/NVPTXFunctionDataSharing.cpp lib/Target/NVPTX/NVPTXFunctionDataSharing.h lib/Target/NVPTX/NVPTXInstrInfo.td lib/Target/NVPTX/NVPTXLowerAlloca.cpp lib/Target/NVPTX/NVPTXLowerSharedFrameIndicesPass.cpp lib/Target/NVPTX/NVPTXRegisterInfo.cpp lib/Target/NVPTX/NVPTXRegisterInfo.h lib/Target/NVPTX/NVPTXRegisterInfo.td lib/Target/NVPTX/NVPTXTargetMachine.cpp lib/Target/NVPTX/NVPTXUtilities.cpp lib/Target/NVPTX/NVPTXUtilities.h Index: lib/Target/NVPTX/NVPTXUtilities.h === --- lib/Target/NVPTX/NVPTXUtilities.h +++ lib/Target/NVPTX/NVPTXUtilities.h @@ -14,6 +14,8 @@ #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H #define LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H +#include "NVPTXTargetMachine.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IntrinsicInst.h" @@ -60,6 +62,8 @@ bool getAlign(const Function &, unsigned index, unsigned &); bool getAlign(const CallInst &, unsigned index, unsigned &); +bool ptrIsStored(Value *Ptr); + } #endif Index: lib/Target/NVPTX/NVPTXUtilities.cpp === --- lib/Target/NVPTX/NVPTXUtilities.cpp +++ lib/Target/NVPTX/NVPTXUtilities.cpp @@ -28,6 +28,8 @@ namespace llvm { +#define DEBUG_TYPE "nvptx-utilities" + namespace { typedef std::map > key_val_pair_t; typedef std::map global_val_annot_t; @@ -314,4 +316,50 @@ return false; } +/// Returns true if there are any instructions storing +/// the address of this pointer. +bool ptrIsStored(Value *Ptr) { + SmallVector PointerAliases; + PointerAliases.push_back(Ptr); + + SmallVector Users; + for (const Use &U : Ptr->uses()) +Users.push_back(U.getUser()); + + for (unsigned I = 0; I < Users.size(); ++I) { +// Get pointer usage +const User *FU = Users[I]; + +// Check if Ptr or an alias to it is the destination of the store +auto SI = dyn_cast(FU); +if (SI) { + for (auto Alias: PointerAliases) +if (SI->getValueOperand() == Alias) + return true; + continue; +} + +// TODO: Can loads lead to address being taken? 
+// TODO: Can GEPs lead to address being taken? + +// Bitcasts increase aliases of the pointer +auto BI = dyn_cast<BitCastInst>(FU); +if (BI) { + for (const Use &U : BI->uses()) +Users.push_back(U.getUser()); + PointerAliases.push_back(BI); + continue; +} + +// TODO: +// There may be other instructions which increase the number +// of alias values ex. operations on the address of the alloca. +// The whole alloca'ed memory region needs to be shared if at +// least one of the values needs to be shared. + } + + // Address of the pointer has not been stored + return false; +} + } // namespace llvm Index: lib/Target/NVPTX/NVPTXTargetMachine.cpp === --- lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -54,6 +54,7 @@ void initializeNVPTXLowerAggrCopiesPass(PassRegistry &); void initializeNVPTXLowerArgsPass(PassRegistry &); void initializeNVPTXLowerAllocaPass(PassRegistry &); +void initializeNVPTXFunctionDataSharingPass(PassRegistry &); } // end namespace llvm
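An illustration (not from the patch) of the pattern ptrIsStored is after; the helper walks the uses of the alloca'ed value and of any bitcast aliases, and returns true as soon as one of them appears as the value operand of a store:

  void example() {
    int x;      // lowered to an alloca
    int *p;     // a second alloca
    p = &x;     // the address of x is stored somewhere: ptrIsStored
                // reports true, so x must live in memory the workers
                // can also see (not in thread-local storage).
    *p = 1;
  }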
[PATCH] D39005: [OpenMP] Clean up variable and function names for NVPTX backend
gtbercea created this revision. Herald added a subscriber: jholewinski.

Clean-up variable and function names.

Repository: rL LLVM

https://reviews.llvm.org/D39005

Files:
  lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp

Index: lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
===
--- lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
+++ lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
@@ -37,6 +37,8 @@
   /// \brief Clean up the name to remove symbols invalid in PTX.
   std::string cleanUpName(StringRef Name);
+
+  /// Set a clean name, ensuring collisions are avoided.
+  void generateCleanName(Value &V);
 };
 }
@@ -50,20 +52,31 @@
 "Assign valid PTX names to globals", false, false)
 
 bool NVPTXAssignValidGlobalNames::runOnModule(Module &M) {
-  for (GlobalVariable &GV : M.globals()) {
-    // We are only allowed to rename local symbols.
-    if (GV.hasLocalLinkage()) {
-      // setName doesn't do extra work if the name does not change.
-      // Note: this does not create collisions - if setName is asked to set the
-      // name to something that already exists, it adds a proper postfix to
-      // avoid collisions.
-      GV.setName(cleanUpName(GV.getName()));
-    }
-  }
+  // We are only allowed to rename local symbols.
+  for (GlobalVariable &GV : M.globals())
+    if (GV.hasLocalLinkage())
+      generateCleanName(GV);
+
+  // Clean function symbols.
+  for (auto &FN : M.functions())
+    if (FN.hasLocalLinkage())
+      generateCleanName(FN);
 
   return true;
 }
 
+void NVPTXAssignValidGlobalNames::generateCleanName(Value &V) {
+  std::string ValidName;
+  do {
+    ValidName = cleanUpName(V.getName());
+    // setName doesn't do extra work if the name does not change.
+    // Collisions are avoided by adding a suffix (which may yet be unclean in
+    // PTX).
+    V.setName(ValidName);
+    // If there are no collisions return, otherwise clean up the new name.
+  } while (!V.getName().equals(ValidName));
+}
+
 std::string NVPTXAssignValidGlobalNames::cleanUpName(StringRef Name) {
   std::string ValidName;
   raw_string_ostream ValidNameStream(ValidName);
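A worked example of why generateCleanName loops (the symbol names are invented, and the exact replacement cleanUpName emits for '.' is assumed):

  // cleanUpName("foo.bar")   -> "foo$bar"        first pass
  // setName("foo$bar") collides with an existing symbol, so LLVM renames
  // the value to "foo$bar.1", which is unclean again; the loop therefore
  // runs once more:
  // cleanUpName("foo$bar.1") -> "foo$bar$1"      and the name now sticks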
[PATCH] D38978: [OpenMP] Enable the lowering of implicitly shared variables in OpenMP GPU-offloaded target regions to the GPU shared memory
gtbercea updated this revision to Diff 119327. gtbercea added a comment. Eliminate variable and function name clean-up. That has been moved into a separate patch: https://reviews.llvm.org/D39005 Repository: rL LLVM https://reviews.llvm.org/D38978 Files: include/llvm/CodeGen/TargetPassConfig.h lib/CodeGen/TargetPassConfig.cpp lib/Target/NVPTX/CMakeLists.txt lib/Target/NVPTX/NVPTX.h lib/Target/NVPTX/NVPTXAsmPrinter.cpp lib/Target/NVPTX/NVPTXFrameLowering.cpp lib/Target/NVPTX/NVPTXFunctionDataSharing.cpp lib/Target/NVPTX/NVPTXFunctionDataSharing.h lib/Target/NVPTX/NVPTXInstrInfo.td lib/Target/NVPTX/NVPTXLowerAlloca.cpp lib/Target/NVPTX/NVPTXLowerSharedFrameIndicesPass.cpp lib/Target/NVPTX/NVPTXRegisterInfo.cpp lib/Target/NVPTX/NVPTXRegisterInfo.h lib/Target/NVPTX/NVPTXRegisterInfo.td lib/Target/NVPTX/NVPTXTargetMachine.cpp lib/Target/NVPTX/NVPTXUtilities.cpp lib/Target/NVPTX/NVPTXUtilities.h Index: lib/Target/NVPTX/NVPTXUtilities.h === --- lib/Target/NVPTX/NVPTXUtilities.h +++ lib/Target/NVPTX/NVPTXUtilities.h @@ -14,6 +14,8 @@ #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H #define LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H +#include "NVPTXTargetMachine.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IntrinsicInst.h" @@ -60,6 +62,8 @@ bool getAlign(const Function &, unsigned index, unsigned &); bool getAlign(const CallInst &, unsigned index, unsigned &); +bool ptrIsStored(Value *Ptr); + } #endif Index: lib/Target/NVPTX/NVPTXUtilities.cpp === --- lib/Target/NVPTX/NVPTXUtilities.cpp +++ lib/Target/NVPTX/NVPTXUtilities.cpp @@ -28,6 +28,8 @@ namespace llvm { +#define DEBUG_TYPE "nvptx-utilities" + namespace { typedef std::map > key_val_pair_t; typedef std::map global_val_annot_t; @@ -314,4 +316,50 @@ return false; } +/// Returns true if there are any instructions storing +/// the address of this pointer. +bool ptrIsStored(Value *Ptr) { + SmallVector PointerAliases; + PointerAliases.push_back(Ptr); + + SmallVector Users; + for (const Use &U : Ptr->uses()) +Users.push_back(U.getUser()); + + for (unsigned I = 0; I < Users.size(); ++I) { +// Get pointer usage +const User *FU = Users[I]; + +// Check if Ptr or an alias to it is the destination of the store +auto SI = dyn_cast(FU); +if (SI) { + for (auto Alias: PointerAliases) +if (SI->getValueOperand() == Alias) + return true; + continue; +} + +// TODO: Can loads lead to address being taken? +// TODO: Can GEPs lead to address being taken? + +// Bitcasts increase aliases of the pointer +auto BI = dyn_cast(FU); +if (BI) { + for (const Use &U : BI->uses()) +Users.push_back(U.getUser()); + PointerAliases.push_back(BI); + continue; +} + +// TODO: +// There may be other instructions which increase the number +// of alias values ex. operations on the address of the alloca. +// The whole alloca'ed memory region needs to be shared if at +// least one of the values needs to be shared. 
+ } + + // Address of the pointer has not been stored + return false; +} + } // namespace llvm Index: lib/Target/NVPTX/NVPTXTargetMachine.cpp === --- lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -54,6 +54,7 @@ void initializeNVPTXLowerAggrCopiesPass(PassRegistry &); void initializeNVPTXLowerArgsPass(PassRegistry &); void initializeNVPTXLowerAllocaPass(PassRegistry &); +void initializeNVPTXFunctionDataSharingPass(PassRegistry &); } // end namespace llvm @@ -72,6 +73,7 @@ initializeNVPTXAssignValidGlobalNamesPass(PR); initializeNVPTXLowerArgsPass(PR); initializeNVPTXLowerAllocaPass(PR); + initializeNVPTXFunctionDataSharingPass(PR); initializeNVPTXLowerAggrCopiesPass(PR); } @@ -148,6 +150,7 @@ bool addInstSelector() override; void addPostRegAlloc() override; void addMachineSSAOptimization() override; + void addMachineSSALowering() override; FunctionPass *createTargetRegisterAllocator(bool) override; void addFastRegAlloc(FunctionPass *RegAllocPass) override; @@ -248,10 +251,15 @@ // before the address space inference passes. addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine())); if (getOptLevel() != CodeGenOpt::None) { +// Add address space inference passes addAddressSpaceInferencePasses(); if (!DisableLoadStoreVectorizer) addPass(createLoadStoreVectorizerPass()); addStraightLineScalarOptimizationPasses(); + } else { +// Even when no optimizations are used, we need to lower certain +// alloca instructions to the appropriate memory type for correctness. +addPass(createNVPTXFunctionDataSharingPass(&getNVPTXTargetMachine()));
[PATCH] D39005: [OpenMP] Clean up variable and function names for NVPTX backend
gtbercea added a comment. Hi Artem, Justin, I see that this patch is the same as the patch Arpith wanted to post a while back i.e. https://reviews.llvm.org/D17738. Was there a consensus regarding what the right thing to do is in this case? I'd be interested to get the ball rolling in regard to coming up with a fix for this. I see some suggestions in past patches. Some help/clarification would be much appreciated. Thanks! Repository: rL LLVM https://reviews.llvm.org/D39005
[PATCH] D39005: [OpenMP] Clean up variable and function names for NVPTX backend
gtbercea added a comment. In https://reviews.llvm.org/D39005#900226, @jlebar wrote: > > I'd be interested to get the ball rolling in regard to coming up with a fix for this. I see some suggestions in past patches. Some help/clarification would be much appreciated. > Happy to help, but I'm not sure what to offer beyond the link in Art's previous comment. Thanks, Justin. Perhaps we could start by clarifying this statement: "One option is that we add a function to LLVM get an available separator character, which can default to '.', but we set to '$' for nvptx, and use that for generating new names at the IR level." as well as this statement: "This seems practical. Perhaps it could be part of the name mangling scheme already encoded in DataLayout?". The first question that comes to mind is: what is the link between the data layout and name mangling conventions? Repository: rL LLVM https://reviews.llvm.org/D39005
[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls
gtbercea added inline comments. Comment at: lib/Basic/Targets/NVPTX.cpp:232 + // getting inlined on the device. + Builder.defineMacro("__NO_MATH_INLINES"); } tra wrote: > This relies on an implementation detail of a particular variant of the header file you're assuming all compilations will include. This is a workaround of the real problem (attempting to use headers from machine X while targeting Y) at best. > D50845 is dealing with the issue of headers for target code. Hopefully, they'll find a way to provide device-specific headers, so you don't rely on host headers being parseable during device-side compilation. I agree. The proper fix would be what the other patch is attempting to do. Comment at: lib/Driver/ToolChains/Clang.cpp:4758 +// toolchain. +CmdArgs.push_back("-fno-math-builtin"); } tra wrote: > Could you elaborate on why you don't want the builtins? Builtins are enabled and are useful for CUDA. What makes their use different for OpenMP? Are you doing it to guarantee that math functions remain unresolved in IR so you could link them in from external bitcode? That's right. I don't particularly like this approach, as it leads to the OpenMP NVPTX toolchain missing out on optimizations such as replacing a math function call with basic operations (pow(a,2) -> a*a, for example). I am trying to fix this in a future patch by allowing intrinsics/builtins to propagate. Repository: rC Clang https://reviews.llvm.org/D47849
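A concrete instance of the optimization trade-off described above (illustrative only):

  #include <math.h>

  double square(double a) {
    // With math builtins enabled, clang may simplify this call to a * a.
    // Under -fno-math-builtin the call survives as a plain `pow`, which
    // is what allows it to be resolved later against the libdevice
    // bitcode that gets linked in.
    return pow(a, 2.0);
  }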
[PATCH] D51312: [OpenMP][NVPTX] Use appropriate _CALL_ELF macro when offloading
gtbercea created this revision. gtbercea added reviewers: Hahnfeld, ABataev, caomhin. Herald added subscribers: cfe-commits, guansong. When offloading to a device and using the powerpc64le version of the auxiliary triple, the _CALL_ELF macro is not set correctly to 2 resulting in the attempt to include a header that does not exist. This patch fixes this problem. Repository: rC Clang https://reviews.llvm.org/D51312 Files: lib/Frontend/InitPreprocessor.cpp Index: lib/Frontend/InitPreprocessor.cpp === --- lib/Frontend/InitPreprocessor.cpp +++ lib/Frontend/InitPreprocessor.cpp @@ -1112,8 +1112,12 @@ Builder.defineMacro("__x86_64__"); break; case llvm::Triple::ppc64: +Builder.defineMacro("__powerpc64__"); +Builder.defineMacro("_CALL_ELF", "1"); +break; case llvm::Triple::ppc64le: Builder.defineMacro("__powerpc64__"); +Builder.defineMacro("_CALL_ELF", "2"); break; default: break; Index: lib/Frontend/InitPreprocessor.cpp === --- lib/Frontend/InitPreprocessor.cpp +++ lib/Frontend/InitPreprocessor.cpp @@ -1112,8 +1112,12 @@ Builder.defineMacro("__x86_64__"); break; case llvm::Triple::ppc64: +Builder.defineMacro("__powerpc64__"); +Builder.defineMacro("_CALL_ELF", "1"); +break; case llvm::Triple::ppc64le: Builder.defineMacro("__powerpc64__"); +Builder.defineMacro("_CALL_ELF", "2"); break; default: break; ___ cfe-commits mailing list cfe-commits@lists.llvm.org http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
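For context, the macro matters because of the dispatch in glibc's gnu/stubs.h on PowerPC, which looks roughly like this (paraphrased from memory, not verbatim glibc):

  #if !defined __powerpc64__
  # include <gnu/stubs-32.h>
  #elif _CALL_ELF == 2
  # include <gnu/stubs-64-v2.h>
  #else
  # include <gnu/stubs-64-v1.h>
  #endif

Without _CALL_ELF defined during the device-side pass, the preprocessor falls into the wrong branch and tries to include a stubs variant that is not installed.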
[PATCH] D51312: [OpenMP][NVPTX] Use appropriate _CALL_ELF macro when offloading
gtbercea updated this revision to Diff 162708. gtbercea added a comment. Add test. Repository: rC Clang https://reviews.llvm.org/D51312 Files: lib/Frontend/InitPreprocessor.cpp test/Preprocessor/aux-triple.c Index: test/Preprocessor/aux-triple.c === --- test/Preprocessor/aux-triple.c +++ test/Preprocessor/aux-triple.c @@ -14,7 +14,7 @@ // RUN: %clang_cc1 -x cuda -E -dM -ffreestanding < /dev/null \ // RUN: -triple nvptx64-none-none -aux-triple powerpc64le-unknown-linux-gnu \ // RUN: | FileCheck -match-full-lines %s \ -// RUN: -check-prefixes NVPTX64,PPC64,LINUX,LINUX-CPP +// RUN: -check-prefixes NVPTX64,PPC64LE,LINUX,LINUX-CPP // RUN: %clang_cc1 -x cuda -E -dM -ffreestanding < /dev/null \ // RUN: -triple nvptx64-none-none -aux-triple x86_64-unknown-linux-gnu \ // RUN: | FileCheck -match-full-lines %s \ @@ -24,22 +24,24 @@ // RUN: %clang_cc1 -E -dM -ffreestanding < /dev/null \ // RUN: -fopenmp -fopenmp-is-device -triple nvptx64-none-none \ // RUN: -aux-triple powerpc64le-unknown-linux-gnu \ -// RUN: | FileCheck -match-full-lines -check-prefixes NVPTX64,PPC64,LINUX %s +// RUN: | FileCheck -match-full-lines -check-prefixes NVPTX64,PPC64LE,LINUX %s // RUN: %clang_cc1 -E -dM -ffreestanding < /dev/null \ // RUN: -fopenmp -fopenmp-is-device -triple nvptx64-none-none \ // RUN: -aux-triple x86_64-unknown-linux-gnu \ // RUN: | FileCheck -match-full-lines -check-prefixes NVPTX64,X86_64,LINUX %s // RUN: %clang_cc1 -x c++ -E -dM -ffreestanding < /dev/null \ // RUN: -fopenmp -fopenmp-is-device -triple nvptx64-none-none \ // RUN: -aux-triple powerpc64le-unknown-linux-gnu \ // RUN: | FileCheck -match-full-lines %s \ -// RUN: -check-prefixes NVPTX64,PPC64,LINUX,LINUX-CPP +// RUN: -check-prefixes NVPTX64,PPC64LE,LINUX,LINUX-CPP // RUN: %clang_cc1 -x c++ -E -dM -ffreestanding < /dev/null \ // RUN: -fopenmp -fopenmp-is-device -triple nvptx64-none-none \ // RUN: -aux-triple x86_64-unknown-linux-gnu \ // RUN: | FileCheck -match-full-lines %s \ // RUN: -check-prefixes NVPTX64,X86_64,LINUX,LINUX-CPP +// PPC64LE:#define _CALL_ELF 2 + // NONE-NOT:#define _GNU_SOURCE // LINUX-CPP:#define _GNU_SOURCE 1 @@ -56,7 +58,7 @@ // LINUX:#define __linux__ 1 // NONE-NOT:#define __powerpc64__ -// PPC64:#define __powerpc64__ 1 +// PPC64LE:#define __powerpc64__ 1 // NONE-NOT:#define __x86_64__ // X86_64:#define __x86_64__ 1 Index: lib/Frontend/InitPreprocessor.cpp === --- lib/Frontend/InitPreprocessor.cpp +++ lib/Frontend/InitPreprocessor.cpp @@ -1106,14 +1106,19 @@ auto AuxTriple = AuxTI.getTriple(); // Define basic target macros needed by at least bits/wordsize.h and - // bits/mathinline.h + // bits/mathinline.h. + // On PowerPC, explicitely set _CALL_ELF macro needed for gnu/stubs.h. 
switch (AuxTriple.getArch()) { case llvm::Triple::x86_64: Builder.defineMacro("__x86_64__"); break; case llvm::Triple::ppc64: +Builder.defineMacro("__powerpc64__"); +Builder.defineMacro("_CALL_ELF", "1"); +break; case llvm::Triple::ppc64le: Builder.defineMacro("__powerpc64__"); +Builder.defineMacro("_CALL_ELF", "2"); break; default: break;
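A quick way to sanity-check the fix, abridged from the RUN lines above (the grep usage is illustrative):

  clang -cc1 -E -dM -ffreestanding -fopenmp -fopenmp-is-device \
    -triple nvptx64-none-none -aux-triple powerpc64le-unknown-linux-gnu \
    < /dev/null | grep _CALL_ELF
  # expected on the device side: #define _CALL_ELF 2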
[PATCH] D51446: [OpenMP][bugfix] Add missing macros for Power
gtbercea created this revision. gtbercea added reviewers: ABataev, Hahnfeld, caomhin. Herald added subscribers: cfe-commits, guansong.

Add missing macros when the auxiliary triple points to the PPC architecture.

Repository: rC Clang

https://reviews.llvm.org/D51446

Files:
  lib/Frontend/InitPreprocessor.cpp

Index: lib/Frontend/InitPreprocessor.cpp
===
--- lib/Frontend/InitPreprocessor.cpp
+++ lib/Frontend/InitPreprocessor.cpp
@@ -1113,10 +1113,18 @@
     Builder.defineMacro("__x86_64__");
     break;
   case llvm::Triple::ppc64:
+    if (AuxTI.getLongDoubleWidth() == 128) {
+      Builder.defineMacro("__LONG_DOUBLE_128__");
+      Builder.defineMacro("__LONGDOUBLE128");
+    }
     Builder.defineMacro("__powerpc64__");
     Builder.defineMacro("_CALL_ELF", "1");
     break;
   case llvm::Triple::ppc64le:
+    if (AuxTI.getLongDoubleWidth() == 128) {
+      Builder.defineMacro("__LONG_DOUBLE_128__");
+      Builder.defineMacro("__LONGDOUBLE128");
+    }
     Builder.defineMacro("__powerpc64__");
     Builder.defineMacro("_CALL_ELF", "2");
     break;
[PATCH] D51446: [OpenMP][bugfix] Add missing macros for Power
gtbercea updated this revision to Diff 163377. gtbercea added a comment.

Add test.

Repository: rC Clang

https://reviews.llvm.org/D51446

Files:
  lib/Frontend/InitPreprocessor.cpp
  test/Preprocessor/aux-triple.c

Index: test/Preprocessor/aux-triple.c
===
--- test/Preprocessor/aux-triple.c
+++ test/Preprocessor/aux-triple.c
@@ -50,6 +50,9 @@
 // NONE-NOT:#define __ELF__
 // LINUX:#define __ELF__ 1
 
+// PPC64LE:#define __LONGDOUBLE128 1
+// PPC64LE:#define __LONG_DOUBLE_128__ 1
+
 // NVPTX64:#define __LP64__ 1
 // NVPTX64:#define __NVPTX__ 1
 // NVPTX64:#define __PTX__ 1

Index: lib/Frontend/InitPreprocessor.cpp
===
--- lib/Frontend/InitPreprocessor.cpp
+++ lib/Frontend/InitPreprocessor.cpp
@@ -1113,13 +1113,24 @@
     Builder.defineMacro("__x86_64__");
     break;
   case llvm::Triple::ppc64:
-    Builder.defineMacro("__powerpc64__");
-    Builder.defineMacro("_CALL_ELF", "1");
-    break;
   case llvm::Triple::ppc64le:
+  {
     Builder.defineMacro("__powerpc64__");
+
+    StringRef ABI = AuxTI.getABI();
+    // Set _CALL_ELF macro needed for gnu/stubs.h
+    if (ABI == "elfv1" || ABI == "elfv1-qpx")
+      Builder.defineMacro("_CALL_ELF", "1");
+    if (ABI == "elfv2")
+      Builder.defineMacro("_CALL_ELF", "2");
+
+    // Required by PowerPC host toolchain.
+    if (AuxTI.getLongDoubleWidth() == 128) {
+      Builder.defineMacro("__LONGDOUBLE128");
+      Builder.defineMacro("__LONG_DOUBLE_128__");
+    }
     break;
+  }
   default:
     break;
   }
[PATCH] D50845: [CUDA/OpenMP] Define only some host macros during device compilation
gtbercea added a comment. In https://reviews.llvm.org/D50845#1219709, @tra wrote: > FYI. This breaks our CUDA compilation. I haven't figured out what exactly is wrong yet. I may need to unroll the patch if the fix is not obvious. Agreed. Patches https://reviews.llvm.org/D51446 and https://reviews.llvm.org/D51312 apply fixes for the PPC64 toolchain. Similar fixes are probably needed for other architectures. In general, it looks like this patch leads to some host macros having to be defined again for the auxiliary-triple case. It is not clear to me how to exhaustively identify the missing macros; so far it has been just trial and error. Repository: rL LLVM https://reviews.llvm.org/D50845
[PATCH] D50845: [CUDA/OpenMP] Define only some host macros during device compilation
gtbercea added a comment. In https://reviews.llvm.org/D50845#1219746, @tra wrote: > Also, whatever macros we generate do not prevent headers from using x86 > inline assembly. I see quite a bit of inline asm code in the preprocessed output. > The headers are from libc ~2.19. Did you try adding Builder.defineMacro("__NO_MATH_INLINES")? Repository: rL LLVM https://reviews.llvm.org/D50845
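For reference, the suggestion amounts to one more entry next to the other auxiliary-triple macros, along these lines (a sketch; placement inside InitializePredefinedAuxMacros is assumed, this is not committed code):

// glibc's bits/mathinline.h falls back to plain prototypes when this is
// defined, instead of emitting x86 inline assembly.
Builder.defineMacro("__NO_MATH_INLINES");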
[PATCH] D50845: [CUDA/OpenMP] Define only some host macros during device compilation
gtbercea added a comment. In https://reviews.llvm.org/D50845#1219859, @Hahnfeld wrote: > removing `InitializePredefinedAuxMacros` and the new test completely should > do. Yep, they also contain https://reviews.llvm.org/D51312 in case you're rolling back individual commits. Repository: rL LLVM https://reviews.llvm.org/D50845
[PATCH] D47394: [OpenMP][Clang][NVPTX] Replace bundling with partial linking for the OpenMP NVPTX device offloading toolchain
gtbercea marked 3 inline comments as done. gtbercea added a comment. Answers to comments. Comment at: include/clang/Driver/Compilation.h:312 + /// \param skipBundler - bool value set once by the driver. + void setSkipOffloadBundler(bool skipBundler); + sfantao wrote: > gtbercea wrote: > > sfantao wrote: > > > Why is this a property of the compilation and not of a set of actions > > > referring to a given target? That would allow one to combine in the same > > > compilation targets requiring the bundler and targets that wouldn't. > > This was a way to pass this information to the OpenMP NVPTX device > > toolchain. > > > > Both the Driver and the OpenMP NVPTX toolchain need to agree on the usage of the > > new scheme (proposed in this patch) or the old scheme (the one that is in > > the compiler today). > > > I understand, but the way I see it is that it is the toolchain that skips the > bundler, not the compilation. I understand that as of this patch, you skip > only if there is a single nvptx target. If you have more than one target, as > some tests do, some toolchains will still need the bundler. So, we are making > what happens with the nvptx target dependent on other toolchains. Is this an > intended effect of this patch? The bundler is skipped only for the OpenMP NVPTX toolchain. I'm not sure what you mean by "other toolchain". Comment at: lib/Driver/Compilation.cpp:276 +void Compilation::setSkipOffloadBundler(bool skipBundler) { + skipOffloadBundler = skipBundler; +} sfantao wrote: > gtbercea wrote: > > sfantao wrote: > > > Given the logic you have below, you are assuming this is never set to > > > false. It would be wise to get an assertion here in case you end up having > > > toolchains skipping and others don't. If that is just not supported, a > > > diagnostic should be added instead. > > > > > > The convention is that local variables use CamelCase. > > The checks I added in the Driver will set this flag to true if all > > toolchains Clang offloads to support the skipping of the bundler/unbundler > > for object files. Currently only the NVPTX toolchain can skip the > > bundler/unbundler for object files, so the code path in this patch will be > > taken only for: > > > > -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda > Ok, if that is the case, just add an assertion here. If one of the toolchains in the list of toolchains can't skip then none of them skip. If all can skip then they all skip. What assertion would you like me to add? Comment at: lib/Driver/Driver.cpp:2943 +} + } + sfantao wrote: > gtbercea wrote: > > sfantao wrote: > > > Can you just implement this check in the definition of `Compilation: > > > canSkipClangOffloadBundler` and get rid of `setSkipOffloadBundler`? All > > > the required information is already in `Compilation` under > > > `C.getInputArgs()`. > > The driver needs to have the result of this check available. The flag is > > passed to the step which adds host-device dependencies. If the bundler can > > be skipped then the unbundling action is not required. > > > > I guess this could be implemented in Compilation. Even so, I would like it > > to happen only once, like it does here, and not every time someone queries > > the "can I skip the bundler" flag. > > > > I wanted this check to happen only once, which is why I put it on the driver > > side. The result of this check needs to be available in Driver.cpp and in > > Cuda.cpp (see usage in this patch).
Compilation keeps track of the > > flag because skipping the bundler is an all-or-nothing choice: you can skip > > the bundler/unbundler for object files if and only if all toolchains you > > are offloading to can skip it. > > > Right, in these circumstances "can skip bundler" is the same as "do I have a > single toolchain" and "is that toolchain nvptx". This is fairly inexpensive > to do, so I don't really see the need to record this state in the driver. It > will also be clearer under what conditions you skip the bundler. That is true for now, but if more toolchains get added to the list of toolchains that can skip the bundler then you want to factor the check out and make it happen only once, at a toolchain-independent point in the code. Otherwise you will carry that list of toolchains everywhere in the code where you need to do the check. Also, if you do this at the toolchain level you will not be able to check whether the other toolchains were able to skip or not. For now ALL toolchains must skip or ALL toolchains don't skip the bundler. Comment at: lib/Driver/ToolChains/Cuda.cpp:496 + ? CudaVirtualArchToString(VirtualArchForCudaArch(gpu_arch)) + : GPUArch.str().c_str(); + const char *PtxF = sfantao wrote: > Why not create fatbins instead of cubins in all cases. For the purposes of > OpenMP they are equ
[PATCH] D47394: [OpenMP][Clang][NVPTX] Replace bundling with partial linking for the OpenMP NVPTX device offloading toolchain
gtbercea added inline comments. Comment at: include/clang/Driver/Compilation.h:312 + /// \param skipBundler - bool value set once by the driver. + void setSkipOffloadBundler(bool skipBundler); + sfantao wrote: > gtbercea wrote: > > sfantao wrote: > > > gtbercea wrote: > > > > sfantao wrote: > > > > > Why is this a property of the compilation and not of a set of actions > > > > > referring to a given target? That would allow one to combine in the > > > > > same compilation targets requiring the bundler and targets that > > > > > wouldn't. > > > > This was a way to pass this information to the OpenMP NVPTX device > > > > toolchain. > > > > > > > > Both the Driver and the OpenMP NVPTX toolchain need to agree on the usage of > > > > the new scheme (proposed in this patch) or the old scheme (the one that > > > > is in the compiler today). > > > > > > > > > > > I understand, but the way I see it is that it is the toolchain that skips > > > the bundler, not the compilation. I understand that as of this patch, you > > > skip only if there is a single nvptx target. If you have more than one > > > target, as some tests do, some toolchains will still need the bundler. > > > So, we are making what happens with the nvptx target dependent on other > > > toolchains. Is this an intended effect of this patch? > > The bundler is skipped only for the OpenMP NVPTX toolchain. I'm not sure what > > you mean by "other toolchain". > It is skipped for the NVPTX toolchain if there are no "other" device toolchains > requested. Say I have a working pipeline that does static linking with nvptx > correctly. Then on top of that I add another device to `-fopenmp-targets`; > that pipeline will now fail even for nvptx, right? It's a choice between skipping the bundler and running the current, default mode with the bundler enabled. If targets other than NVPTX are present then we default to using the bundler for all toolchains. There is no hybrid mode where some targets use the bundler and some don't. Comment at: lib/Driver/Compilation.cpp:276 +void Compilation::setSkipOffloadBundler(bool skipBundler) { + skipOffloadBundler = skipBundler; +} sfantao wrote: > gtbercea wrote: > > sfantao wrote: > > > gtbercea wrote: > > > > sfantao wrote: > > > > > Given the logic you have below, you are assuming this is never set to > > > > > false. It would be wise to get an assertion here in case you end > > > > > up having toolchains skipping and others don't. If that is just not > > > > > supported, a diagnostic should be added instead. > > > > > > > > > > The convention is that local variables use CamelCase. > > > > The checks I added in the Driver will set this flag to true if all > > > > toolchains Clang offloads to support the skipping of the > > > > bundler/unbundler for object files. Currently only the NVPTX toolchain can > > > > skip the bundler/unbundler for object files, so the code path in this > > > > patch will be taken only for: > > > > > > > > -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda > > > Ok, if that is the case, just add an assertion here. > > If one of the toolchains in the list of toolchains can't skip then none of > > them skip. If all can skip then they all skip. What assertion would you > > like me to add? > If SkipOffloadBundler is set to true, you don't expect it to be set to false > afterwards, right? That should be asserted. That's correct, I can add that, sure.
Repository: rC Clang https://reviews.llvm.org/D47394
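To summarize the thread: the decision is a single all-or-nothing scan over the requested device toolchains. A minimal sketch of the check (the helper name and the toolchain iteration are hypothetical; the actual patch records the result once via setSkipOffloadBundler):

// True only if every offload toolchain can do without the
// clang-offload-bundler; one non-NVPTX target forces bundling for all.
static bool canSkipOffloadBundler(const Compilation &C) {
  for (const auto &TC : C.getOffloadToolChains<Action::OFK_OpenMP>()) // hypothetical accessor
    if (TC.second->getTriple().getArch() != llvm::Triple::nvptx64)
      return false;
  return true;
}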
[PATCH] D42841: [docs] Improve help for OpenMP options
gtbercea accepted this revision. gtbercea added a comment. LG https://reviews.llvm.org/D42841
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea created this revision. gtbercea added reviewers: Hahnfeld, ABataev, carlo.bertolli, caomhin, grokos. Herald added subscribers: cfe-commits, guansong. This patch adds an additional flag to the OpenMP device offloading toolchain to link in the runtime library bitcode. Repository: rC Clang https://reviews.llvm.org/D43197 Files: include/clang/Basic/DiagnosticDriverKinds.td lib/Driver/ToolChains/Cuda.cpp Index: lib/Driver/ToolChains/Cuda.cpp === --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -529,6 +529,35 @@ CC1Args.push_back("-target-feature"); CC1Args.push_back("+ptx42"); } + + if (DeviceOffloadingKind == Action::OFK_OpenMP) { +SmallVector LibraryPaths; +if (char *env = ::getenv("LIBRARY_PATH")) { + StringRef CompilerPath = env; + while (!CompilerPath.empty()) { +std::pair Split = +CompilerPath.split(llvm::sys::EnvPathSeparator); +LibraryPaths.push_back(Split.first); +CompilerPath = Split.second; + } +} + +std::string LibOmpTargetName = + "libomptarget-nvptx-" + GpuArch.str() + ".bc"; +bool FoundBCLibrary = false; +for (std::string LibraryPath : LibraryPaths) { + SmallString<128> LibOmpTargetFile(LibraryPath); + llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName); + if (llvm::sys::fs::exists(LibOmpTargetFile)) { +CC1Args.push_back("-mlink-cuda-bitcode"); +CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile)); +FoundBCLibrary = true; +break; + } +} +if (!FoundBCLibrary) + getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime); + } } void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs, Index: include/clang/Basic/DiagnosticDriverKinds.td === --- include/clang/Basic/DiagnosticDriverKinds.td +++ include/clang/Basic/DiagnosticDriverKinds.td @@ -196,6 +196,9 @@ def warn_drv_omp_offload_target_duplicate : Warning< "The OpenMP offloading target '%0' is similar to target '%1' already specified - will be ignored.">, InGroup; +def remark_drv_omp_offload_target_missingbcruntime : Warning< + "Expect degraded performance on the target device due to missing libomptarget-nvptx.bc in library path.">, + InGroup; def err_drv_bitcode_unsupported_on_toolchain : Error< "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
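Usage-wise, the new behavior is driven entirely by the environment. A minimal example (paths are illustrative):

export LIBRARY_PATH=/opt/llvm/lib/libomptarget
clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 app.c

The device-side compile then receives -mlink-cuda-bitcode <path>/libomptarget-nvptx-sm_60.bc; if no matching .bc file is found on LIBRARY_PATH, the new warning fires instead.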
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea updated this revision to Diff 133882. gtbercea added a comment. Fix warning message. Repository: rC Clang https://reviews.llvm.org/D43197 Files: include/clang/Basic/DiagnosticDriverKinds.td lib/Driver/ToolChains/Cuda.cpp Index: lib/Driver/ToolChains/Cuda.cpp === --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -529,6 +529,36 @@ CC1Args.push_back("-target-feature"); CC1Args.push_back("+ptx42"); } + + if (DeviceOffloadingKind == Action::OFK_OpenMP) { +SmallVector LibraryPaths; +if (char *env = ::getenv("LIBRARY_PATH")) { + StringRef CompilerPath = env; + while (!CompilerPath.empty()) { +std::pair Split = +CompilerPath.split(llvm::sys::EnvPathSeparator); +LibraryPaths.push_back(Split.first); +CompilerPath = Split.second; + } +} + +std::string LibOmpTargetName = + "libomptarget-nvptx-" + GpuArch.str() + ".bc"; +bool FoundBCLibrary = false; +for (std::string LibraryPath : LibraryPaths) { + SmallString<128> LibOmpTargetFile(LibraryPath); + llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName); + if (llvm::sys::fs::exists(LibOmpTargetFile)) { +CC1Args.push_back("-mlink-cuda-bitcode"); +CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile)); +FoundBCLibrary = true; +break; + } +} +if (!FoundBCLibrary) + getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime) + << LibOmpTargetName; + } } void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs, Index: include/clang/Basic/DiagnosticDriverKinds.td === --- include/clang/Basic/DiagnosticDriverKinds.td +++ include/clang/Basic/DiagnosticDriverKinds.td @@ -196,6 +196,9 @@ def warn_drv_omp_offload_target_duplicate : Warning< "The OpenMP offloading target '%0' is similar to target '%1' already specified - will be ignored.">, InGroup; +def remark_drv_omp_offload_target_missingbcruntime : Warning< + "Expect degraded performance on the target device due to missing '%0' in LIBRARY_PATH.">, + InGroup; def err_drv_bitcode_unsupported_on_toolchain : Error< "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea updated this revision to Diff 133919. gtbercea added a comment. Add regression tests. Repository: rC Clang https://reviews.llvm.org/D43197 Files: include/clang/Basic/DiagnosticDriverKinds.td lib/Driver/ToolChains/Cuda.cpp test/Driver/openmp-offload-gpu.c Index: test/Driver/openmp-offload-gpu.c === --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offload-gpu.c @@ -142,3 +142,24 @@ // RUN: | FileCheck -check-prefix=CHK-NOLIBDEVICE %s // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60 + +/// ### + +/// Check that the runtime bitcode library is part of the compile line. Create a bogus +/// bitcode library that will be found via the LIBRARY_PATH. +// RUN: touch /tmp/libomptarget-nvptx-sm_60.bc +// RUN: LIBRARY_PATH=/tmp %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps \ +// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s + +// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc + +/// ### + +/// Check that the warning is thrown when the libomptarget bitcode library is not found. +/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should never exist. +// RUN: LIBRARY_PATH= %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps \ +// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB-WARN %s + +// CHK-BCLIB-WARN: Expect degraded performance on the target device due to missing 'libomptarget-nvptx-sm_20.bc' in LIBRARY_PATH. Index: lib/Driver/ToolChains/Cuda.cpp === --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -529,6 +529,36 @@ CC1Args.push_back("-target-feature"); CC1Args.push_back("+ptx42"); } + + if (DeviceOffloadingKind == Action::OFK_OpenMP) { +SmallVector LibraryPaths; +if (char *env = ::getenv("LIBRARY_PATH")) { + StringRef CompilerPath = env; + while (!CompilerPath.empty()) { +std::pair Split = +CompilerPath.split(llvm::sys::EnvPathSeparator); +LibraryPaths.push_back(Split.first); +CompilerPath = Split.second; + } +} + +std::string LibOmpTargetName = + "libomptarget-nvptx-" + GpuArch.str() + ".bc"; +bool FoundBCLibrary = false; +for (std::string LibraryPath : LibraryPaths) { + SmallString<128> LibOmpTargetFile(LibraryPath); + llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName); + if (llvm::sys::fs::exists(LibOmpTargetFile)) { +CC1Args.push_back("-mlink-cuda-bitcode"); +CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile)); +FoundBCLibrary = true; +break; + } +} +if (!FoundBCLibrary) + getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime) + << LibOmpTargetName; + } } void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs, Index: include/clang/Basic/DiagnosticDriverKinds.td === --- include/clang/Basic/DiagnosticDriverKinds.td +++ include/clang/Basic/DiagnosticDriverKinds.td @@ -196,6 +196,9 @@ def warn_drv_omp_offload_target_duplicate : Warning< "The OpenMP offloading target '%0' is similar to target '%1' already specified - will be ignored.">, InGroup; +def remark_drv_omp_offload_target_missingbcruntime : Warning< + "Expect degraded performance on the target device due to missing '%0' in LIBRARY_PATH.">, + InGroup; def err_drv_bitcode_unsupported_on_toolchain : Error< "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea updated this revision to Diff 134235. gtbercea added a comment. Move unix specific test to new file. Repository: rC Clang https://reviews.llvm.org/D43197 Files: include/clang/Basic/DiagnosticDriverKinds.td lib/Driver/ToolChains/Cuda.cpp test/Driver/openmp-offload-gpu.c test/Driver/unix-openmp-offload-gpu.c Index: test/Driver/unix-openmp-offload-gpu.c === --- /dev/null +++ test/Driver/unix-openmp-offload-gpu.c @@ -0,0 +1,21 @@ +/// +/// Perform several driver tests for OpenMP offloading +/// + +// REQUIRES: linux +// REQUIRES: clang-driver +// REQUIRES: x86-registered-target +// REQUIRES: powerpc-registered-target +// REQUIRES: nvptx-registered-target + +/// ### + +/// Check that the runtime bitcode library is part of the compile line. Create a bogus +/// bitcode library that will be found via the LIBRARY_PATH. +// RUN: touch /tmp/libomptarget-nvptx-sm_60.bc +// RUN: env LIBRARY_PATH=/tmp +// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps \ +// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s + +// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc Index: test/Driver/openmp-offload-gpu.c === --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offload-gpu.c @@ -142,3 +142,14 @@ // RUN: | FileCheck -check-prefix=CHK-NOLIBDEVICE %s // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60 + +/// ### + +/// Check that the warning is thrown when the libomptarget bitcode library is not found. +/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should never exist. +// RUN: env LIBRARY_PATH="" +// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps \ +// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB-WARN %s + +// CHK-BCLIB-WARN: Expect degraded performance on the target device due to missing 'libomptarget-nvptx-sm_20.bc' in LIBRARY_PATH. 
Index: lib/Driver/ToolChains/Cuda.cpp === --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -529,6 +529,36 @@ CC1Args.push_back("-target-feature"); CC1Args.push_back("+ptx42"); } + + if (DeviceOffloadingKind == Action::OFK_OpenMP) { +SmallVector LibraryPaths; +if (char *env = ::getenv("LIBRARY_PATH")) { + StringRef CompilerPath = env; + while (!CompilerPath.empty()) { +std::pair Split = +CompilerPath.split(llvm::sys::EnvPathSeparator); +LibraryPaths.push_back(Split.first); +CompilerPath = Split.second; + } +} + +std::string LibOmpTargetName = + "libomptarget-nvptx-" + GpuArch.str() + ".bc"; +bool FoundBCLibrary = false; +for (std::string LibraryPath : LibraryPaths) { + SmallString<128> LibOmpTargetFile(LibraryPath); + llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName); + if (llvm::sys::fs::exists(LibOmpTargetFile)) { +CC1Args.push_back("-mlink-cuda-bitcode"); +CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile)); +FoundBCLibrary = true; +break; + } +} +if (!FoundBCLibrary) + getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime) + << LibOmpTargetName; + } } void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs, Index: include/clang/Basic/DiagnosticDriverKinds.td === --- include/clang/Basic/DiagnosticDriverKinds.td +++ include/clang/Basic/DiagnosticDriverKinds.td @@ -196,6 +196,9 @@ def warn_drv_omp_offload_target_duplicate : Warning< "The OpenMP offloading target '%0' is similar to target '%1' already specified - will be ignored.">, InGroup; +def remark_drv_omp_offload_target_missingbcruntime : Warning< + "Expect degraded performance on the target device due to missing '%0' in LIBRARY_PATH.">, + InGroup; def err_drv_bitcode_unsupported_on_toolchain : Error< "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea marked an inline comment as done. gtbercea added inline comments. Comment at: test/Driver/openmp-offload-gpu.c:150 +/// bitcode library that will be found via the LIBRARY_PATH. +// RUN: touch /tmp/libomptarget-nvptx-sm_60.bc +// RUN: LIBRARY_PATH=/tmp %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ Hahnfeld wrote: > This should not be in `/tmp` but probably `%T`. I don't think this would have worked, since I need to create a file with a specific name in a specific folder and the path separator is OS-specific. I moved the test to a new file that is limited to Linux. Repository: rC Clang https://reviews.llvm.org/D43197
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea updated this revision to Diff 134238. gtbercea added a comment. Fix tmp folder name. Repository: rC Clang https://reviews.llvm.org/D43197 Files: include/clang/Basic/DiagnosticDriverKinds.td lib/Driver/ToolChains/Cuda.cpp test/Driver/openmp-offload-gpu.c test/Driver/unix-openmp-offload-gpu.c Index: test/Driver/unix-openmp-offload-gpu.c === --- /dev/null +++ test/Driver/unix-openmp-offload-gpu.c @@ -0,0 +1,21 @@ +/// +/// Perform several driver tests for OpenMP offloading +/// + +// REQUIRES: linux +// REQUIRES: clang-driver +// REQUIRES: x86-registered-target +// REQUIRES: powerpc-registered-target +// REQUIRES: nvptx-registered-target + +/// ### + +/// Check that the runtime bitcode library is part of the compile line. Create a bogus +/// bitcode library that will be found via the LIBRARY_PATH. +// RUN: touch %t-dir/libomptarget-nvptx-sm_60.bc +// RUN: env LIBRARY_PATH=%t-dir +// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps \ +// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s + +// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc Index: test/Driver/openmp-offload-gpu.c === --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offload-gpu.c @@ -142,3 +142,14 @@ // RUN: | FileCheck -check-prefix=CHK-NOLIBDEVICE %s // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60 + +/// ### + +/// Check that the warning is thrown when the libomptarget bitcode library is not found. +/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should never exist. +// RUN: env LIBRARY_PATH="" +// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps \ +// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB-WARN %s + +// CHK-BCLIB-WARN: Expect degraded performance on the target device due to missing 'libomptarget-nvptx-sm_20.bc' in LIBRARY_PATH. 
Index: lib/Driver/ToolChains/Cuda.cpp === --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -529,6 +529,36 @@ CC1Args.push_back("-target-feature"); CC1Args.push_back("+ptx42"); } + + if (DeviceOffloadingKind == Action::OFK_OpenMP) { +SmallVector LibraryPaths; +if (char *env = ::getenv("LIBRARY_PATH")) { + StringRef CompilerPath = env; + while (!CompilerPath.empty()) { +std::pair Split = +CompilerPath.split(llvm::sys::EnvPathSeparator); +LibraryPaths.push_back(Split.first); +CompilerPath = Split.second; + } +} + +std::string LibOmpTargetName = + "libomptarget-nvptx-" + GpuArch.str() + ".bc"; +bool FoundBCLibrary = false; +for (std::string LibraryPath : LibraryPaths) { + SmallString<128> LibOmpTargetFile(LibraryPath); + llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName); + if (llvm::sys::fs::exists(LibOmpTargetFile)) { +CC1Args.push_back("-mlink-cuda-bitcode"); +CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile)); +FoundBCLibrary = true; +break; + } +} +if (!FoundBCLibrary) + getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime) + << LibOmpTargetName; + } } void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs, Index: include/clang/Basic/DiagnosticDriverKinds.td === --- include/clang/Basic/DiagnosticDriverKinds.td +++ include/clang/Basic/DiagnosticDriverKinds.td @@ -196,6 +196,9 @@ def warn_drv_omp_offload_target_duplicate : Warning< "The OpenMP offloading target '%0' is similar to target '%1' already specified - will be ignored.">, InGroup; +def remark_drv_omp_offload_target_missingbcruntime : Warning< + "Expect degraded performance on the target device due to missing '%0' in LIBRARY_PATH.">, + InGroup; def err_drv_bitcode_unsupported_on_toolchain : Error< "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea added inline comments. Comment at: test/Driver/unix-openmp-offload-gpu.c:15 +/// bitcode library that will be found via the LIBRARY_PATH. +// RUN: touch /tmp/libomptarget-nvptx-sm_60.bc +// RUN: env LIBRARY_PATH=/tmp Hahnfeld wrote: > Hahnfeld wrote: > > I don't see how that solves the problem of using `/tmp`?!? > (Interesting that this works with `%t`, the documentation mentions `%T` for a > directory. But as other test cases do the same...) %T works too, I just tried it. Any preference as to which one to use? Repository: rC Clang https://reviews.llvm.org/D43197
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea updated this revision to Diff 134278. gtbercea added a comment. Use %T. Repository: rC Clang https://reviews.llvm.org/D43197 Files: include/clang/Basic/DiagnosticDriverKinds.td lib/Driver/ToolChains/Cuda.cpp test/Driver/openmp-offload-gpu.c test/Driver/unix-openmp-offload-gpu.c Index: test/Driver/unix-openmp-offload-gpu.c === --- /dev/null +++ test/Driver/unix-openmp-offload-gpu.c @@ -0,0 +1,21 @@ +/// +/// Perform several driver tests for OpenMP offloading +/// + +// REQUIRES: linux +// REQUIRES: clang-driver +// REQUIRES: x86-registered-target +// REQUIRES: powerpc-registered-target +// REQUIRES: nvptx-registered-target + +/// ### + +/// Check that the runtime bitcode library is part of the compile line. Create a bogus +/// bitcode library that will be found via the LIBRARY_PATH. +// RUN: touch %T/libomptarget-nvptx-sm_60.bc +// RUN: env LIBRARY_PATH=%T +// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps \ +// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s + +// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc Index: test/Driver/openmp-offload-gpu.c === --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offload-gpu.c @@ -142,3 +142,14 @@ // RUN: | FileCheck -check-prefix=CHK-NOLIBDEVICE %s // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60 + +/// ### + +/// Check that the warning is thrown when the libomptarget bitcode library is not found. +/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should never exist. +// RUN: env LIBRARY_PATH="" +// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps \ +// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB-WARN %s + +// CHK-BCLIB-WARN: Expect degraded performance on the target device due to missing 'libomptarget-nvptx-sm_20.bc' in LIBRARY_PATH. 
Index: lib/Driver/ToolChains/Cuda.cpp === --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -529,6 +529,36 @@ CC1Args.push_back("-target-feature"); CC1Args.push_back("+ptx42"); } + + if (DeviceOffloadingKind == Action::OFK_OpenMP) { +SmallVector LibraryPaths; +if (char *env = ::getenv("LIBRARY_PATH")) { + StringRef CompilerPath = env; + while (!CompilerPath.empty()) { +std::pair Split = +CompilerPath.split(llvm::sys::EnvPathSeparator); +LibraryPaths.push_back(Split.first); +CompilerPath = Split.second; + } +} + +std::string LibOmpTargetName = + "libomptarget-nvptx-" + GpuArch.str() + ".bc"; +bool FoundBCLibrary = false; +for (std::string LibraryPath : LibraryPaths) { + SmallString<128> LibOmpTargetFile(LibraryPath); + llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName); + if (llvm::sys::fs::exists(LibOmpTargetFile)) { +CC1Args.push_back("-mlink-cuda-bitcode"); +CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile)); +FoundBCLibrary = true; +break; + } +} +if (!FoundBCLibrary) + getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime) + << LibOmpTargetName; + } } void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs, Index: include/clang/Basic/DiagnosticDriverKinds.td === --- include/clang/Basic/DiagnosticDriverKinds.td +++ include/clang/Basic/DiagnosticDriverKinds.td @@ -196,6 +196,9 @@ def warn_drv_omp_offload_target_duplicate : Warning< "The OpenMP offloading target '%0' is similar to target '%1' already specified - will be ignored.">, InGroup; +def remark_drv_omp_offload_target_missingbcruntime : Warning< + "Expect degraded performance on the target device due to missing '%0' in LIBRARY_PATH.">, + InGroup; def err_drv_bitcode_unsupported_on_toolchain : Error< "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea added a comment. In https://reviews.llvm.org/D43197#1007918, @Hahnfeld wrote: > I'm still not sure we can't run this test on Windows. I think lots of other > tests use `touch`, even some specific to Windows... Let me know what you'd like me to do. I can add the test back. I do see other tests not worrying about this, so maybe I can do the same here... Repository: rC Clang https://reviews.llvm.org/D43197
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea updated this revision to Diff 134292. gtbercea added a comment. Revert. Repository: rC Clang https://reviews.llvm.org/D43197 Files: include/clang/Basic/DiagnosticDriverKinds.td lib/Driver/ToolChains/Cuda.cpp test/Driver/openmp-offload-gpu.c Index: test/Driver/openmp-offload-gpu.c === --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offload-gpu.c @@ -142,3 +142,26 @@ // RUN: | FileCheck -check-prefix=CHK-NOLIBDEVICE %s // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60 + +/// ### + +/// Check that the runtime bitcode library is part of the compile line. Create a bogus +/// bitcode library that will be found via the LIBRARY_PATH. +// RUN: touch %T/libomptarget-nvptx-sm_60.bc +// RUN: env LIBRARY_PATH=%T +// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps \ +// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s + +// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc + +/// ### + +/// Check that the warning is thrown when the libomptarget bitcode library is not found. +/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should never exist. +// RUN: env LIBRARY_PATH="" +// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps \ +// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB-WARN %s + +// CHK-BCLIB-WARN: Expect degraded performance on the target device due to missing 'libomptarget-nvptx-sm_20.bc' in LIBRARY_PATH. Index: lib/Driver/ToolChains/Cuda.cpp === --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -529,6 +529,36 @@ CC1Args.push_back("-target-feature"); CC1Args.push_back("+ptx42"); } + + if (DeviceOffloadingKind == Action::OFK_OpenMP) { +SmallVector LibraryPaths; +if (char *env = ::getenv("LIBRARY_PATH")) { + StringRef CompilerPath = env; + while (!CompilerPath.empty()) { +std::pair Split = +CompilerPath.split(llvm::sys::EnvPathSeparator); +LibraryPaths.push_back(Split.first); +CompilerPath = Split.second; + } +} + +std::string LibOmpTargetName = + "libomptarget-nvptx-" + GpuArch.str() + ".bc"; +bool FoundBCLibrary = false; +for (std::string LibraryPath : LibraryPaths) { + SmallString<128> LibOmpTargetFile(LibraryPath); + llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName); + if (llvm::sys::fs::exists(LibOmpTargetFile)) { +CC1Args.push_back("-mlink-cuda-bitcode"); +CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile)); +FoundBCLibrary = true; +break; + } +} +if (!FoundBCLibrary) + getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime) + << LibOmpTargetName; + } } void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs, Index: include/clang/Basic/DiagnosticDriverKinds.td === --- include/clang/Basic/DiagnosticDriverKinds.td +++ include/clang/Basic/DiagnosticDriverKinds.td @@ -196,6 +196,9 @@ def warn_drv_omp_offload_target_duplicate : Warning< "The OpenMP offloading target '%0' is similar to target '%1' already specified - will be ignored.">, InGroup; +def remark_drv_omp_offload_target_missingbcruntime : Warning< + "Expect degraded performance on the target device due to missing '%0' in LIBRARY_PATH.">, + InGroup; def err_drv_bitcode_unsupported_on_toolchain : Error< "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea updated this revision to Diff 134295. gtbercea added a comment. Fix test. Repository: rC Clang https://reviews.llvm.org/D43197 Files: include/clang/Basic/DiagnosticDriverKinds.td lib/Driver/ToolChains/Cuda.cpp test/Driver/openmp-offload-gpu.c Index: test/Driver/openmp-offload-gpu.c === --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offload-gpu.c @@ -142,3 +142,24 @@ // RUN: | FileCheck -check-prefix=CHK-NOLIBDEVICE %s // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60 + +/// ### + +/// Check that the runtime bitcode library is part of the compile line. Create a bogus +/// bitcode library that will be found via the LIBRARY_PATH. +// RUN: touch %T/libomptarget-nvptx-sm_60.bc +// RUN: env LIBRARY_PATH=%T %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps \ +// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s + +// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc + +/// ### + +/// Check that the warning is thrown when the libomptarget bitcode library is not found. +/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should never exist. +// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps \ +// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB-WARN %s + +// CHK-BCLIB-WARN: Expect degraded performance on the target device due to missing 'libomptarget-nvptx-sm_20.bc' in LIBRARY_PATH. Index: lib/Driver/ToolChains/Cuda.cpp === --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -529,6 +529,36 @@ CC1Args.push_back("-target-feature"); CC1Args.push_back("+ptx42"); } + + if (DeviceOffloadingKind == Action::OFK_OpenMP) { +SmallVector LibraryPaths; +if (char *env = ::getenv("LIBRARY_PATH")) { + StringRef CompilerPath = env; + while (!CompilerPath.empty()) { +std::pair Split = +CompilerPath.split(llvm::sys::EnvPathSeparator); +LibraryPaths.push_back(Split.first); +CompilerPath = Split.second; + } +} + +std::string LibOmpTargetName = + "libomptarget-nvptx-" + GpuArch.str() + ".bc"; +bool FoundBCLibrary = false; +for (std::string LibraryPath : LibraryPaths) { + SmallString<128> LibOmpTargetFile(LibraryPath); + llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName); + if (llvm::sys::fs::exists(LibOmpTargetFile)) { +CC1Args.push_back("-mlink-cuda-bitcode"); +CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile)); +FoundBCLibrary = true; +break; + } +} +if (!FoundBCLibrary) + getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime) + << LibOmpTargetName; + } } void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs, Index: include/clang/Basic/DiagnosticDriverKinds.td === --- include/clang/Basic/DiagnosticDriverKinds.td +++ include/clang/Basic/DiagnosticDriverKinds.td @@ -196,6 +196,9 @@ def warn_drv_omp_offload_target_duplicate : Warning< "The OpenMP offloading target '%0' is similar to target '%1' already specified - will be ignored.">, InGroup; +def remark_drv_omp_offload_target_missingbcruntime : Warning< + "Expect degraded performance on the target device due to missing '%0' in LIBRARY_PATH.">, + InGroup; def err_drv_bitcode_unsupported_on_toolchain : Error< "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
[PATCH] D43625: [OpenMP] Remove implicit data sharing code gen that aims to use device shared memory
gtbercea created this revision. gtbercea added reviewers: ABataev, carlo.bertolli, caomhin. Herald added subscribers: cfe-commits, guansong, jholewinski. Remove this scheme for now since it will be covered by another more generic scheme using global memory. This code will be worked into an optimization for the generic data sharing scheme. Removing this completely and then adding it via future patches will make all future data sharing patches cleaner. Repository: rC Clang https://reviews.llvm.org/D43625 Files: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp lib/CodeGen/CGOpenMPRuntimeNVPTX.h test/OpenMP/nvptx_data_sharing.cpp test/OpenMP/nvptx_parallel_codegen.cpp test/OpenMP/nvptx_target_teams_codegen.cpp Index: test/OpenMP/nvptx_target_teams_codegen.cpp === --- test/OpenMP/nvptx_target_teams_codegen.cpp +++ test/OpenMP/nvptx_target_teams_codegen.cpp @@ -60,7 +60,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args, i16 1) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i16 1) // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], @@ -148,7 +148,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args, i16 1) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i16 1) // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], Index: test/OpenMP/nvptx_parallel_codegen.cpp === --- test/OpenMP/nvptx_parallel_codegen.cpp +++ test/OpenMP/nvptx_parallel_codegen.cpp @@ -78,7 +78,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]] // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], @@ -92,20 +92,20 @@ // // CHECK: [[EXEC_PARALLEL]] // CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]], - // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN1:@.+]]_wrapper to i8*) + // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i32*, i32*)* [[PARALLEL_FN1:@.+]] to i8*) // CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]] // // CHECK: [[EXEC_PFN1]] - // CHECK: call void [[PARALLEL_FN1]]_wrapper( + // CHECK: call void [[PARALLEL_FN1]]( // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] // // CHECK: [[CHECK_NEXT1]] // CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]], - // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN2:@.+]]_wrapper to i8*) + // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i32*, i32*)* [[PARALLEL_FN2:@.+]] to i8*) // CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]] // // CHECK: [[EXEC_PFN2]] - // CHECK: call void [[PARALLEL_FN2]]_wrapper( + // CHECK: call void [[PARALLEL_FN2]]( // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] // // CHECK: [[CHECK_NEXT2]] @@ -152,13 +152,13 @@ // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK: [[MTMP1:%.+]] = 
sub i32 [[MNTH]], [[MWS]] // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] - // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32, i8**)* [[PARALLEL_FN1]]_wrapper to i8*), + // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN1]] to i8*), // CHECK: call void @llvm.nvvm.barrier0() // CHECK: call void @llvm.nvvm.barrier0() // CHECK: call void @__kmpc_serialized_parallel( // CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]]( // CHECK: call void @__kmpc_end_serialized_parallel( - // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32, i8**)* [[PARALLEL_FN2]]_wrapper to i8*), + // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN2]] to i8*), // CHECK: call void @llvm.nvvm.barrier0() // CHECK: call void @llvm.nvvm.barrier0() // CHECK-64-DAG: load i32, i32* [[REF_A]] @@ -203,7 +203,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_ker
[PATCH] D43660: [OpenMP] Add OpenMP data sharing infrastructure using global memory
gtbercea created this revision. gtbercea added reviewers: ABataev, carlo.bertolli, caomhin, hfinkel, Hahnfeld. Herald added subscribers: cfe-commits, guansong, jholewinski. This patch handles the Clang code generation phase for the OpenMP data sharing infrastructure. TODO: add a more detailed description. Repository: rC Clang https://reviews.llvm.org/D43660 Files: lib/CodeGen/CGDecl.cpp lib/CodeGen/CGOpenMPRuntime.cpp lib/CodeGen/CGOpenMPRuntime.h lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp lib/CodeGen/CGOpenMPRuntimeNVPTX.h lib/CodeGen/CGStmtOpenMP.cpp lib/CodeGen/CodeGenFunction.cpp test/OpenMP/nvptx_parallel_codegen.cpp Index: test/OpenMP/nvptx_parallel_codegen.cpp === --- test/OpenMP/nvptx_parallel_codegen.cpp +++ test/OpenMP/nvptx_parallel_codegen.cpp @@ -92,20 +92,20 @@ // // CHECK: [[EXEC_PARALLEL]] // CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]], - // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i32*, i32*)* [[PARALLEL_FN1:@.+]] to i8*) + // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32)* [[PARALLEL_FN1:@.+]]_wrapper to i8*) // CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]] // // CHECK: [[EXEC_PFN1]] - // CHECK: call void [[PARALLEL_FN1]]( + // CHECK: call void [[PARALLEL_FN1]]_wrapper( // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] // // CHECK: [[CHECK_NEXT1]] // CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]], - // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i32*, i32*)* [[PARALLEL_FN2:@.+]] to i8*) + // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32)* [[PARALLEL_FN2:@.+]]_wrapper to i8*) // CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]] // // CHECK: [[EXEC_PFN2]] - // CHECK: call void [[PARALLEL_FN2]]( + // CHECK: call void [[PARALLEL_FN2]]_wrapper( // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] // // CHECK: [[CHECK_NEXT2]] @@ -152,13 +152,13 @@ // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] - // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN1]] to i8*), + // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN1]]_wrapper to i8*), // CHECK: call void @llvm.nvvm.barrier0() // CHECK: call void @llvm.nvvm.barrier0() // CHECK: call void @__kmpc_serialized_parallel( // CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]]( // CHECK: call void @__kmpc_end_serialized_parallel( - // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN2]] to i8*), + // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN2]]_wrapper to i8*), // CHECK: call void @llvm.nvvm.barrier0() // CHECK: call void @llvm.nvvm.barrier0() // CHECK-64-DAG: load i32, i32* [[REF_A]] @@ -203,7 +203,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]] + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], @@ -217,11 +217,11 @@ // // CHECK: [[EXEC_PARALLEL]] // CHECK: [[WF:%.+]] = load i8*, i8** [[OMP_WORK_FN]], - // CHECK: [[WM:%.+]] = icmp eq i8* [[WF]], bitcast (void (i32*, i32*)* [[PARALLEL_FN4:@.+]] to i8*) + // 
CHECK: [[WM:%.+]] = icmp eq i8* [[WF]], bitcast (void (i16, i32)* [[PARALLEL_FN4:@.+]]_wrapper to i8*) // CHECK: br i1 [[WM]], label {{%?}}[[EXEC_PFN:.+]], label {{%?}}[[CHECK_NEXT:.+]] // // CHECK: [[EXEC_PFN]] - // CHECK: call void [[PARALLEL_FN4]]( + // CHECK: call void [[PARALLEL_FN4]]_wrapper( // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] // // CHECK: [[CHECK_NEXT]] @@ -283,7 +283,7 @@ // CHECK: br i1 [[CMP]], label {{%?}}[[IF_THEN:.+]], label {{%?}}[[IF_ELSE:.+]] // // CHECK: [[IF_THEN]] - // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN4]] to i8*), + // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN4]]_wrapper to i8*), // CHECK: call void @llvm.nvvm.barrier0() // CHECK: call void @llvm.nvvm.barrier0() // CHECK: br label {{%?}}[[IF_END:.+]] Index: lib/CodeGen/CodeGenFunction.cpp === --- lib/CodeGen/CodeGenFunction.cpp +++ lib/CodeGen/CodeGenFunction.cpp @@ -1058,6 +1058,11 @@ EmitStartEHSpec(CurCodeDecl); PrologueCleanupDepth = EHStack.stable_begin(); + + // Emit OpenMP specific initialization of the dev
[PATCH] D41485: [OpenMP][libomptarget] Add data sharing support in libomptarget
gtbercea created this revision. gtbercea added reviewers: carlo.bertolli, ABataev, Hahnfeld, grokos, caomhin, hfinkel. This patch extends the libomptarget functionality in patch https://reviews.llvm.org/D14254 with support for the data sharing scheme for supporting implicitly shared variables. The runtime therefore maintains a list of references to shared variables. Repository: rL LLVM https://reviews.llvm.org/D41485 Files: libomptarget/deviceRTLs/nvptx/src/interface.h libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h libomptarget/deviceRTLs/nvptx/src/option.h libomptarget/deviceRTLs/nvptx/src/parallel.cu Index: libomptarget/deviceRTLs/nvptx/src/parallel.cu === --- libomptarget/deviceRTLs/nvptx/src/parallel.cu +++ libomptarget/deviceRTLs/nvptx/src/parallel.cu @@ -210,10 +210,16 @@ //} // // This routine is always called by the team master.. -EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, int16_t IsOMPRuntimeInitialized) { +EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, int16_t IsOMPRuntimeInitialized, + void ***SharedArgs, int32_t nArgs) { PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n"); omptarget_nvptx_workFn = WorkFn; + if (nArgs > 0) { +omptarget_nvptx_sharedArgs.EnsureSize(nArgs); +*SharedArgs = omptarget_nvptx_sharedArgs.GetArgs(); + } + if (!IsOMPRuntimeInitialized) return; // This routine is only called by the team master. The team master is @@ -310,11 +316,13 @@ // returns True if this thread is active, else False. // // Only the worker threads call this routine. -EXTERN bool __kmpc_kernel_parallel(void **WorkFn, int16_t IsOMPRuntimeInitialized) { +EXTERN bool __kmpc_kernel_parallel(void **WorkFn, int16_t IsOMPRuntimeInitialized, + void ***SharedArgs) { PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n"); // Work function and arguments for L1 parallel region. *WorkFn = omptarget_nvptx_workFn; + *SharedArgs = omptarget_nvptx_sharedArgs.GetArgs(); if (!IsOMPRuntimeInitialized) return true; Index: libomptarget/deviceRTLs/nvptx/src/option.h === --- libomptarget/deviceRTLs/nvptx/src/option.h +++ libomptarget/deviceRTLs/nvptx/src/option.h @@ -46,6 +46,10 @@ // to synchronize with each other. #define L1_BARRIER (1) +// Maximum number of preallocated arguments to an outlined parallel/simd function. +// Anything more requires dynamic memory allocation. +#define MAX_SHARED_ARGS 20 + // Maximum number of omp state objects per SM allocated statically in global memory. #if __CUDA_ARCH__ >= 600 #define OMP_STATE_COUNT 32 Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h === --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h @@ -60,6 +60,46 @@ #define __ACTIVEMASK() __ballot(1) #endif +// arguments needed for L0 parallelism only. +class omptarget_nvptx_SharedArgs { +public: + // All these methods must be called by the master thread only. + INLINE void Init() { +args = buffer; +nArgs = MAX_SHARED_ARGS; + } + INLINE void DeInit() { +// Free any memory allocated for outlined parallel function with a large +// number of arguments. +if (nArgs > MAX_SHARED_ARGS) { + SafeFree(args, (char *)"new extended args"); + Init(); +} + } + INLINE void EnsureSize(int size) { +if (size > nArgs) { + if (nArgs > MAX_SHARED_ARGS) { +SafeFree(args, (char *)"new extended args"); + } + args = (void **) SafeMalloc(size * sizeof(void *), + (char *)"new extended args"); + nArgs = size; +} + } + // Called by all threads. 
+ INLINE void **GetArgs() { return args; }; +private: + // buffer of pre-allocated arguments. + void *buffer[MAX_SHARED_ARGS]; + // pointer to arguments buffer. + // starts off as a pointer to 'buffer' but can be dynamically allocated. + void **args; + // starts off as MAX_SHARED_ARGS but can increase in size. + uint32_t nArgs; +}; + +extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_sharedArgs; + // Data sharing related quantities, need to match what is used in the compiler. enum DATA_SHARING_SIZES { // The maximum number of workers in a kernel. Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu === --- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu +++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu @@ -32,6 +32,7 @@ // extern volatile __device__ __shared__ omptarget_nvptx_WorkFn omptarget_nvptx_workFn; extern __device__ __shared__ uint32_t execution_param; +__device__ __shared__
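A minimal sketch of the pattern this scheme serves, mirroring the test_ds case in the Clang-side tests (illustrative code, not part of this diff): the team master owns 'a', and each worker of the L1 parallel region reaches it through the shared-arguments buffer that __kmpc_kernel_prepare_parallel publishes and __kmpc_kernel_parallel hands back.

void test_ds() {
  int a = 10;
#pragma omp target map(tofrom : a)
  {
#pragma omp parallel
    {
      // All workers store through the master-owned 'a'; the outlined
      // function receives &a via the shared-args buffer.
      a = 1000;
    }
  }
}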
[PATCH] D41486: [OpenMP][Clang] Add missing argument to runtime functions.
gtbercea created this revision. gtbercea added reviewers: ABataev, carlo.bertolli, hfinkel, Hahnfeld, caomhin. Herald added a subscriber: jholewinski. This patch adds a missing argument to the runtime interface. Tests are adjusted accordingly. Repository: rL LLVM https://reviews.llvm.org/D41486 Files: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp test/OpenMP/nvptx_data_sharing.cpp test/OpenMP/nvptx_target_teams_codegen.cpp Index: test/OpenMP/nvptx_target_teams_codegen.cpp === --- test/OpenMP/nvptx_target_teams_codegen.cpp +++ test/OpenMP/nvptx_target_teams_codegen.cpp @@ -60,7 +60,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], {{.*}} i8*** %shared_args) // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], @@ -148,7 +148,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], {{.*}} i8*** %shared_args) // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], Index: test/OpenMP/nvptx_data_sharing.cpp === --- test/OpenMP/nvptx_data_sharing.cpp +++ test/OpenMP/nvptx_data_sharing.cpp @@ -24,15 +24,15 @@ // CK1: define internal void @__omp_offloading_{{.*}}test_ds{{.*}}worker() [[ATTR1:#.*]] { // CK1: [[SHAREDARGS:%.+]] = alloca i8** -// CK1: call i1 @__kmpc_kernel_parallel(i8** %work_fn, i8*** [[SHAREDARGS]]) +// CK1: call i1 @__kmpc_kernel_parallel(i8** %work_fn, {{.*}} i8*** [[SHAREDARGS]]) // CK1: [[SHARGSTMP:%.+]] = load i8**, i8*** [[SHAREDARGS]] // CK1: call void @__omp_outlined___wrapper{{.*}}({{.*}}, i8** [[SHARGSTMP]]) /// = In the kernel function = /// // CK1: {{.*}}define void @__omp_offloading{{.*}}test_ds{{.*}}() [[ATTR2:#.*]] { // CK1: [[SHAREDARGS1:%.+]] = alloca i8** -// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}}, i8*** [[SHAREDARGS1]], i32 1) +// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}}, {{.*}} i8*** [[SHAREDARGS1]], i32 1) // CK1: [[SHARGSTMP1:%.+]] = load i8**, i8*** [[SHAREDARGS1]] // CK1: [[SHARGSTMP2:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP1]] // CK1: [[SHAREDVAR:%.+]] = bitcast i32* {{.*}} to i8* Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp === --- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -521,7 +521,8 @@ // Set up shared arguments Address SharedArgs = CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrPtrTy, "shared_args"); - llvm::Value *Args[] = {WorkFn.getPointer(), SharedArgs.getPointer()}; + llvm::Value *Args[] = {WorkFn.getPointer(), Bld.getInt16(1), + SharedArgs.getPointer()}; llvm::Value *Ret = CGF.EmitRuntimeCall( createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args); Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus); @@ -638,16 +639,16 @@ case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: { /// Build void __kmpc_kernel_prepare_parallel( /// void *outlined_function, void ***args, kmp_int32 nArgs); -llvm::Type *TypeParams[] = {CGM.Int8PtrTy, +llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty, CGM.Int8PtrPtrTy->getPointerTo(0), CGM.Int32Ty}; llvm::FunctionType *FnTy = 
llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel"); break; } case OMPRTL_NVPTX__kmpc_kernel_parallel: { /// Build bool __kmpc_kernel_parallel(void **outlined_function, void ***args); -llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, +llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty, CGM.Int8PtrPtrTy->getPointerTo(0)}; llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy); llvm::FunctionType *FnTy = @@ -949,7 +950,7 @@ CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_args"); llvm::Value *SharedArgsPtr = SharedArgs.getPointer(); - llvm::Value *Args[] = {ID, SharedArgsPtr, + llvm::Value *Args[] = {ID, Bld.getInt16(1), SharedArgsPtr, Bld.getInt32(CapturedVars.size())}; CGF.EmitRuntimeCall( @@ -970,7 +971,7 @@ Idx++; } } else { - llvm::Value *Args[] = {ID, + llvm::Value *
[PATCH] D41486: [OpenMP][Clang] Add missing argument to runtime functions.
gtbercea added a comment. In https://reviews.llvm.org/D41486#961981, @Hahnfeld wrote: > https://reviews.llvm.org/D41012? This patch doesn't update the documentation > with function signatures. OK, so I see that your patch uses a different order of the arguments. I've just added the data-sharing-related arguments at the end, which matches the libomptarget patch I just posted. Which way do we want to do this? Repository: rL LLVM https://reviews.llvm.org/D41486
[PATCH] D41486: [OpenMP][Clang] Add missing argument to runtime functions.
gtbercea updated this revision to Diff 127865. gtbercea added a comment. Address comments. Repository: rL LLVM https://reviews.llvm.org/D41486 Files: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp test/OpenMP/nvptx_data_sharing.cpp test/OpenMP/nvptx_target_teams_codegen.cpp Index: test/OpenMP/nvptx_target_teams_codegen.cpp === --- test/OpenMP/nvptx_target_teams_codegen.cpp +++ test/OpenMP/nvptx_target_teams_codegen.cpp @@ -60,7 +60,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i16 1, i8*** %shared_args) // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], @@ -148,7 +148,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i16 1, i8*** %shared_args) // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], Index: test/OpenMP/nvptx_data_sharing.cpp === --- test/OpenMP/nvptx_data_sharing.cpp +++ test/OpenMP/nvptx_data_sharing.cpp @@ -24,15 +24,15 @@ // CK1: define internal void @__omp_offloading_{{.*}}test_ds{{.*}}worker() [[ATTR1:#.*]] { // CK1: [[SHAREDARGS:%.+]] = alloca i8** -// CK1: call i1 @__kmpc_kernel_parallel(i8** %work_fn, i8*** [[SHAREDARGS]]) +// CK1: call i1 @__kmpc_kernel_parallel(i8** %work_fn, i16 1, i8*** [[SHAREDARGS]]) // CK1: [[SHARGSTMP:%.+]] = load i8**, i8*** [[SHAREDARGS]] // CK1: call void @__omp_outlined___wrapper{{.*}}({{.*}}, i8** [[SHARGSTMP]]) /// = In the kernel function = /// // CK1: {{.*}}define void @__omp_offloading{{.*}}test_ds{{.*}}() [[ATTR2:#.*]] { // CK1: [[SHAREDARGS1:%.+]] = alloca i8** -// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}}, i8*** [[SHAREDARGS1]], i32 1) +// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}}, i16 1, i8*** [[SHAREDARGS1]], i32 1) // CK1: [[SHARGSTMP1:%.+]] = load i8**, i8*** [[SHAREDARGS1]] // CK1: [[SHARGSTMP2:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP1]] // CK1: [[SHAREDVAR:%.+]] = bitcast i32* {{.*}} to i8* Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp === --- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -33,10 +33,11 @@ /// \brief Call to void __kmpc_spmd_kernel_deinit(); OMPRTL_NVPTX__kmpc_spmd_kernel_deinit, /// \brief Call to void __kmpc_kernel_prepare_parallel(void - /// *outlined_function, void ***args, kmp_int32 nArgs); + /// *outlined_function, int16_t IsOMPRuntimeInitialized, + /// void ***args, kmp_int32 nArgs); OMPRTL_NVPTX__kmpc_kernel_prepare_parallel, - /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function, void - /// ***args); + /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function, + /// int16_t IsOMPRuntimeInitialized, void ***args); OMPRTL_NVPTX__kmpc_kernel_parallel, /// \brief Call to void __kmpc_kernel_end_parallel(); OMPRTL_NVPTX__kmpc_kernel_end_parallel, @@ -521,7 +522,9 @@ // Set up shared arguments Address SharedArgs = CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrPtrTy, "shared_args"); - llvm::Value *Args[] = {WorkFn.getPointer(), SharedArgs.getPointer()}; + llvm::Value *Args[] = {WorkFn.getPointer(), + 
/*IsOMPRuntimeInitialized*/ Bld.getInt16(1), + SharedArgs.getPointer()}; llvm::Value *Ret = CGF.EmitRuntimeCall( createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args); Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus); @@ -638,16 +641,16 @@ case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: { /// Build void __kmpc_kernel_prepare_parallel( /// void *outlined_function, void ***args, kmp_int32 nArgs); -llvm::Type *TypeParams[] = {CGM.Int8PtrTy, +llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty, CGM.Int8PtrPtrTy->getPointerTo(0), CGM.Int32Ty}; llvm::FunctionType *FnTy = llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel"); break; } case OMPRTL_NVPTX__kmpc_kernel_parallel: { /// Build bool __kmpc_kernel_parallel(void **outlined_function, void ***args); -llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, +llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty, C
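For readability, the net entry-point signatures after this revision, reconstructed from the hunks above (EXTERN is the device RTL's linkage macro; argument names follow the matching libomptarget patch https://reviews.llvm.org/D41485):

EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
                                           int16_t IsOMPRuntimeInitialized,
                                           void ***SharedArgs, int32_t nArgs);
EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
                                   int16_t IsOMPRuntimeInitialized,
                                   void ***SharedArgs);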
[PATCH] D40451: [OpenMP] Add function attribute for triggering shared memory lowering in the LLVM backend
gtbercea closed this revision. gtbercea added a comment. Committed here https://reviews.llvm.org/D41123 Repository: rL LLVM https://reviews.llvm.org/D40451
[PATCH] D41486: [OpenMP][Clang] Add missing argument to runtime functions.
gtbercea abandoned this revision. gtbercea added a comment. Functionality already landed. See previous comment. Repository: rL LLVM https://reviews.llvm.org/D41486
[PATCH] D43660: [OpenMP] Add OpenMP data sharing infrastructure using global memory
gtbercea updated this revision to Diff 136528. Repository: rC Clang https://reviews.llvm.org/D43660 Files: lib/CodeGen/CGDecl.cpp lib/CodeGen/CGOpenMPRuntime.cpp lib/CodeGen/CGOpenMPRuntime.h lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp lib/CodeGen/CGOpenMPRuntimeNVPTX.h lib/CodeGen/CGStmtOpenMP.cpp lib/CodeGen/CodeGenFunction.cpp test/OpenMP/nvptx_data_sharing.cpp test/OpenMP/nvptx_parallel_codegen.cpp Index: test/OpenMP/nvptx_parallel_codegen.cpp === --- test/OpenMP/nvptx_parallel_codegen.cpp +++ test/OpenMP/nvptx_parallel_codegen.cpp @@ -64,254 +64,243 @@ // CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l17}}_worker() +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}_worker() +// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, +// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, +// CHECK: store i8* null, i8** [[OMP_WORK_FN]], +// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], +// CHECK: br label {{%?}}[[AWAIT_WORK:.+]] +// +// CHECK: [[AWAIT_WORK]] +// CHECK: call void @llvm.nvvm.barrier0() +// CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]] +// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 +// store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 +// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], +// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null +// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] +// +// CHECK: [[SEL_WORKERS]] +// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]] +// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 +// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] +// +// CHECK: [[EXEC_PARALLEL]] +// CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]], +// CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32)* [[PARALLEL_FN1:@.+]]_wrapper to i8*) +// CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]] +// +// CHECK: [[EXEC_PFN1]] +// CHECK: call void [[PARALLEL_FN1]]_wrapper( +// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] +// +// CHECK: [[CHECK_NEXT1]] +// CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]], +// CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32)* [[PARALLEL_FN2:@.+]]_wrapper to i8*) +// CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]] +// +// CHECK: [[EXEC_PFN2]] +// CHECK: call void [[PARALLEL_FN2]]_wrapper( +// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] +// +// CHECK: [[CHECK_NEXT2]] +// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] +// +// CHECK: [[TERM_PARALLEL]] +// CHECK: call void @__kmpc_kernel_end_parallel() +// CHECK: br label {{%?}}[[BAR_PARALLEL]] +// +// CHECK: [[BAR_PARALLEL]] +// CHECK: call void @llvm.nvvm.barrier0() +// CHECK: br label {{%?}}[[AWAIT_WORK]] +// +// CHECK: [[EXIT]] +// CHECK: ret void +// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l26]](i[[SZ:32|64]] +// Create local storage for each capture. +// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]], +// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]] +// Store captures in the context. 
+// CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32* +// +// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]] +// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] +// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] +// +// CHECK: [[WORKER]] +// CHECK: {{call|invoke}} void [[T6]]_worker() +// CHECK: br label {{%?}}[[EXIT:.+]] +// +// CHECK: [[CHECK_MASTER]] +// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], +// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] +// +// CHECK: [[MASTER]] +// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] +// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] +// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN1]]_wrapper to i8*), +// CHECK: call void @llvm.nvvm.barrier0() +// CHECK: call void @llvm.nvvm.barrier0() +// CHECK: call void @__kmpc_serialized_parallel( +// CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]]( +// CHECK: call void @__kmpc_end_serialized_parallel( +// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN2]]_wrapper to i8*), +// CHECK: call void @
[PATCH] D43625: [OpenMP] Remove implicit data sharing code gen that aims to use device shared memory
gtbercea updated this revision to Diff 136570. gtbercea added a comment. Add Source location. Repository: rC Clang https://reviews.llvm.org/D43625 Files: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp lib/CodeGen/CGOpenMPRuntimeNVPTX.h test/OpenMP/nvptx_data_sharing.cpp test/OpenMP/nvptx_parallel_codegen.cpp test/OpenMP/nvptx_target_teams_codegen.cpp Index: test/OpenMP/nvptx_target_teams_codegen.cpp === --- test/OpenMP/nvptx_target_teams_codegen.cpp +++ test/OpenMP/nvptx_target_teams_codegen.cpp @@ -60,7 +60,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args, i16 1) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i16 1) // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], @@ -146,7 +146,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args, i16 1) + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i16 1) // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], Index: test/OpenMP/nvptx_parallel_codegen.cpp === --- test/OpenMP/nvptx_parallel_codegen.cpp +++ test/OpenMP/nvptx_parallel_codegen.cpp @@ -78,7 +78,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]] // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], @@ -92,20 +92,20 @@ // // CHECK: [[EXEC_PARALLEL]] // CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]], - // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN1:@.+]]_wrapper to i8*) + // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i32*, i32*)* [[PARALLEL_FN1:@.+]] to i8*) // CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]] // // CHECK: [[EXEC_PFN1]] - // CHECK: call void [[PARALLEL_FN1]]_wrapper( + // CHECK: call void [[PARALLEL_FN1]]( // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] // // CHECK: [[CHECK_NEXT1]] // CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]], - // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN2:@.+]]_wrapper to i8*) + // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i32*, i32*)* [[PARALLEL_FN2:@.+]] to i8*) // CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]] // // CHECK: [[EXEC_PFN2]] - // CHECK: call void [[PARALLEL_FN2]]_wrapper( + // CHECK: call void [[PARALLEL_FN2]]( // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] // // CHECK: [[CHECK_NEXT2]] @@ -152,13 +152,13 @@ // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] - // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32, i8**)* [[PARALLEL_FN1]]_wrapper to i8*), + // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN1]] to i8*), // CHECK: call void @llvm.nvvm.barrier0() // 
CHECK: call void @llvm.nvvm.barrier0() // CHECK: call void @__kmpc_serialized_parallel( // CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]]( // CHECK: call void @__kmpc_end_serialized_parallel( - // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32, i8**)* [[PARALLEL_FN2]]_wrapper to i8*), + // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN2]] to i8*), // CHECK: call void @llvm.nvvm.barrier0() // CHECK: call void @llvm.nvvm.barrier0() // CHECK-64-DAG: load i32, i32* [[REF_A]] @@ -203,7 +203,7 @@ // // CHECK: [[AWAIT_WORK]] // CHECK: call void @llvm.nvvm.barrier0() - // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], + // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]] // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], @@ -217,11 +217,11 @@ // // CHECK: [[EXEC_PARALLEL]] // CHECK: [[WF:%.+]] = load i8*, i8** [[OMP_WORK_FN]], - // CHECK: [[WM:%.+]] = icmp eq i8* [[WF]],
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea added a comment. In https://reviews.llvm.org/D43197#1011256, @Hahnfeld wrote: > Looking more closely at the patch, this doesn't seem to look into the `lib` / > `lib64` next to the compiler. I'm not sure if `LIBRARY_PATH` is set for every > installation, so I think we should add this one to catch the obvious case. > This probably needs some attention for the tests because they'll find the > just-built libraries... The contract with the user is that the .bc lib needs to be in LIBRARY_PATH; this is what we require today. Repository: rC Clang https://reviews.llvm.org/D43197
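Concretely, that contract amounts to an invocation along these lines (the install path is hypothetical; sm_60 is just an example arch):

$ export LIBRARY_PATH=/opt/llvm/lib   # directory containing libomptarget-nvptx-sm_60.bc
$ clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda \
    -Xopenmp-target -march=sm_60 saxpy.c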
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea added inline comments. Comment at: lib/Driver/ToolChains/Cuda.cpp:536-542 + StringRef CompilerPath = env; + while (!CompilerPath.empty()) { +std::pair<StringRef, StringRef> Split = +CompilerPath.split(llvm::sys::EnvPathSeparator); +LibraryPaths.push_back(Split.first); +CompilerPath = Split.second; + } Hahnfeld wrote: > `tools::addDirectoryList` uses `StringRef::find`; I'm not sure if > `StringRef::split` creates real copies of the string... What is your suggestion? Repository: rC Clang https://reviews.llvm.org/D43197
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea updated this revision to Diff 137203. gtbercea added a comment. Address comments. Repository: rC Clang https://reviews.llvm.org/D43197 Files: include/clang/Basic/DiagnosticDriverKinds.td lib/Driver/ToolChains/Cuda.cpp test/Driver/openmp-offload-gpu.c Index: test/Driver/openmp-offload-gpu.c === --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offload-gpu.c @@ -142,3 +142,25 @@ // RUN: | FileCheck -check-prefix=CHK-NOLIBDEVICE %s // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60 + +/// ### + +/// Check that the runtime bitcode library is part of the compile line. Create a bogus +/// bitcode library and add it to the LIBRARY_PATH. +// RUN: touch %T/libomptarget-nvptx-sm_60.bc +// RUN: env LIBRARY_PATH=%T %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps \ +// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s +// RUN: rm %T/libomptarget-nvptx-sm_60.bc + +// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc + +/// ### + +/// Check that the warning is thrown when the libomptarget bitcode library is not found. +/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should never exist. +// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps \ +// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB-WARN %s + +// CHK-BCLIB-WARN: Expect degraded performance on the target device due to missing 'libomptarget-nvptx-sm_20.bc' in LIBRARY_PATH. Index: lib/Driver/ToolChains/Cuda.cpp === --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -21,6 +21,7 @@ #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" +#include "llvm/Support/Process.h" #include "llvm/Support/Program.h" #include @@ -580,6 +581,42 @@ CC1Args.push_back("-target-feature"); CC1Args.push_back("+ptx42"); } + + if (DeviceOffloadingKind == Action::OFK_OpenMP) { +SmallVector LibraryPaths; +// Add path to lib and/or lib64 folders. +SmallString<256> DefaultLibPath = + llvm::sys::path::parent_path(getDriver().Dir); +llvm::sys::path::append(DefaultLibPath, "lib" CLANG_LIBDIR_SUFFIX); +LibraryPaths.emplace_back(DriverArgs.MakeArgString(DefaultLibPath)); + +// Add user defined library paths from LIBRARY_PATH. 
+if (llvm::Optional LibPath = + llvm::sys::Process::GetEnv("LIBRARY_PATH")) { + SmallVector Frags; + llvm::SplitString(*LibPath, Frags, + StringRef(&(llvm::sys::EnvPathSeparator))); + for (auto Path : Frags) +LibraryPaths.emplace_back(Path.trim()); +} + +std::string LibOmpTargetName = + "libomptarget-nvptx-" + GpuArch.str() + ".bc"; +bool FoundBCLibrary = false; +for (const std::string &LibraryPath : LibraryPaths) { + SmallString<128> LibOmpTargetFile(LibraryPath); + llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName); + if (llvm::sys::fs::exists(LibOmpTargetFile)) { +CC1Args.push_back("-mlink-cuda-bitcode"); +CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile)); +FoundBCLibrary = true; +break; + } +} +if (!FoundBCLibrary) + getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime) + << LibOmpTargetName; + } } void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs, Index: include/clang/Basic/DiagnosticDriverKinds.td === --- include/clang/Basic/DiagnosticDriverKinds.td +++ include/clang/Basic/DiagnosticDriverKinds.td @@ -203,6 +203,9 @@ def warn_drv_omp_offload_target_duplicate : Warning< "The OpenMP offloading target '%0' is similar to target '%1' already specified - will be ignored.">, InGroup; +def remark_drv_omp_offload_target_missingbcruntime : Warning< + "Expect degraded performance on the target device due to missing '%0' in LIBRARY_PATH.">, + InGroup; def err_drv_bitcode_unsupported_on_toolchain : Error< "-fembed-bitcode is not supported on versions of iOS prior to 6.0">; Index: test/Driver/openmp-offload-gpu.c === --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offload-gpu.c @@ -142,3 +142,25 @@ // RUN: | FileCheck -check-prefix=CHK-NOLIBDEVICE %s // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60 + +///
[PATCH] D43660: [OpenMP] Add OpenMP data sharing infrastructure using global memory
gtbercea updated this revision to Diff 137210. gtbercea added a comment. Add init stack function. Repository: rC Clang https://reviews.llvm.org/D43660 Files: lib/CodeGen/CGDecl.cpp lib/CodeGen/CGOpenMPRuntime.cpp lib/CodeGen/CGOpenMPRuntime.h lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp lib/CodeGen/CGOpenMPRuntimeNVPTX.h lib/CodeGen/CGStmtOpenMP.cpp lib/CodeGen/CodeGenFunction.cpp test/OpenMP/nvptx_data_sharing.cpp test/OpenMP/nvptx_parallel_codegen.cpp Index: test/OpenMP/nvptx_parallel_codegen.cpp === --- test/OpenMP/nvptx_parallel_codegen.cpp +++ test/OpenMP/nvptx_parallel_codegen.cpp @@ -64,254 +64,243 @@ // CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l17}}_worker() +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}_worker() +// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, +// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, +// CHECK: store i8* null, i8** [[OMP_WORK_FN]], +// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], +// CHECK: br label {{%?}}[[AWAIT_WORK:.+]] +// +// CHECK: [[AWAIT_WORK]] +// CHECK: call void @llvm.nvvm.barrier0() +// CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]] +// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 +// store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 +// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], +// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null +// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] +// +// CHECK: [[SEL_WORKERS]] +// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]] +// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 +// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] +// +// CHECK: [[EXEC_PARALLEL]] +// CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]], +// CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32)* [[PARALLEL_FN1:@.+]]_wrapper to i8*) +// CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]] +// +// CHECK: [[EXEC_PFN1]] +// CHECK: call void [[PARALLEL_FN1]]_wrapper( +// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] +// +// CHECK: [[CHECK_NEXT1]] +// CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]], +// CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32)* [[PARALLEL_FN2:@.+]]_wrapper to i8*) +// CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]] +// +// CHECK: [[EXEC_PFN2]] +// CHECK: call void [[PARALLEL_FN2]]_wrapper( +// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] +// +// CHECK: [[CHECK_NEXT2]] +// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] +// +// CHECK: [[TERM_PARALLEL]] +// CHECK: call void @__kmpc_kernel_end_parallel() +// CHECK: br label {{%?}}[[BAR_PARALLEL]] +// +// CHECK: [[BAR_PARALLEL]] +// CHECK: call void @llvm.nvvm.barrier0() +// CHECK: br label {{%?}}[[AWAIT_WORK]] +// +// CHECK: [[EXIT]] +// CHECK: ret void +// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l26]](i[[SZ:32|64]] +// Create local storage for each capture. +// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]], +// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]] +// Store captures in the context. 
+// CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32* +// +// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]] +// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] +// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] +// +// CHECK: [[WORKER]] +// CHECK: {{call|invoke}} void [[T6]]_worker() +// CHECK: br label {{%?}}[[EXIT:.+]] +// +// CHECK: [[CHECK_MASTER]] +// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], +// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] +// +// CHECK: [[MASTER]] +// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] +// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] +// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN1]]_wrapper to i8*), +// CHECK: call void @llvm.nvvm.barrier0() +// CHECK: call void @llvm.nvvm.barrier0() +// CHECK: call void @__kmpc_serialized_parallel( +// CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]]( +// CHECK: call void @__kmpc_end_serialized_parallel( +// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[P
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea updated this revision to Diff 137219. gtbercea added a comment. Address comments. Repository: rC Clang https://reviews.llvm.org/D43197 Files: include/clang/Basic/DiagnosticDriverKinds.td lib/Driver/ToolChains/Cuda.cpp test/Driver/openmp-offload-gpu.c Index: test/Driver/openmp-offload-gpu.c === --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offload-gpu.c @@ -142,3 +142,25 @@ // RUN: | FileCheck -check-prefix=CHK-NOLIBDEVICE %s // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60 + +/// ### + +/// Check that the runtime bitcode library is part of the compile line. Create a bogus +/// bitcode library and add it to the LIBRARY_PATH. +// RUN: touch %T/libomptarget-nvptx-sm_60.bc +// RUN: env LIBRARY_PATH=%T %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps \ +// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s +// RUN: rm %T/libomptarget-nvptx-sm_60.bc + +// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc + +/// ### + +/// Check that the warning is thrown when the libomptarget bitcode library is not found. +/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should never exist. +// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps \ +// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB-WARN %s + +// CHK-BCLIB-WARN: Expect degraded performance on the target device due to missing 'libomptarget-nvptx-sm_20.bc' in LIBRARY_PATH. Index: lib/Driver/ToolChains/Cuda.cpp === --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -21,6 +21,7 @@ #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" +#include "llvm/Support/Process.h" #include "llvm/Support/Program.h" #include @@ -580,6 +581,43 @@ CC1Args.push_back("-target-feature"); CC1Args.push_back("+ptx42"); } + + if (DeviceOffloadingKind == Action::OFK_OpenMP) { +SmallVector LibraryPaths; +// Add path to lib and/or lib64 folders. +SmallString<256> DefaultLibPath = + llvm::sys::path::parent_path(getDriver().Dir); +llvm::sys::path::append(DefaultLibPath, +Twine("lib") + CLANG_LIBDIR_SUFFIX); +LibraryPaths.emplace_back(DefaultLibPath.c_str()); + +// Add user defined library paths from LIBRARY_PATH. 
+if (llvm::Optional LibPath = + llvm::sys::Process::GetEnv("LIBRARY_PATH")) { + SmallVector Frags; + const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator, '\0'}; + llvm::SplitString(*LibPath, Frags, EnvPathSeparatorStr); + for (auto Path : Frags) +LibraryPaths.emplace_back(Path.trim()); +} + +std::string LibOmpTargetName = + "libomptarget-nvptx-" + GpuArch.str() + ".bc"; +bool FoundBCLibrary = false; +for (const std::string &LibraryPath : LibraryPaths) { + SmallString<128> LibOmpTargetFile(LibraryPath); + llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName); + if (llvm::sys::fs::exists(LibOmpTargetFile)) { +CC1Args.push_back("-mlink-cuda-bitcode"); +CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile)); +FoundBCLibrary = true; +break; + } +} +if (!FoundBCLibrary) + getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime) + << LibOmpTargetName; + } } void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs, Index: include/clang/Basic/DiagnosticDriverKinds.td === --- include/clang/Basic/DiagnosticDriverKinds.td +++ include/clang/Basic/DiagnosticDriverKinds.td @@ -203,6 +203,9 @@ def warn_drv_omp_offload_target_duplicate : Warning< "The OpenMP offloading target '%0' is similar to target '%1' already specified - will be ignored.">, InGroup; +def remark_drv_omp_offload_target_missingbcruntime : Warning< + "Expect degraded performance on the target device due to missing '%0' in LIBRARY_PATH.">, + InGroup; def err_drv_bitcode_unsupported_on_toolchain : Error< "-fembed-bitcode is not supported on versions of iOS prior to 6.0">; Index: test/Driver/openmp-offload-gpu.c === --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offload-gpu.c @@ -142,3 +142,25 @@ // RUN: | FileCheck -check-prefix=CHK-NOLIBDEVICE %s // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60 + +/// ##
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea updated this revision to Diff 137226. gtbercea added a comment. Address comments. Repository: rC Clang https://reviews.llvm.org/D43197 Files: include/clang/Basic/DiagnosticDriverKinds.td lib/Driver/ToolChains/Cuda.cpp test/Driver/openmp-offload-gpu.c Index: test/Driver/openmp-offload-gpu.c === --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offload-gpu.c @@ -142,3 +142,25 @@ // RUN: | FileCheck -check-prefix=CHK-NOLIBDEVICE %s // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60 + +/// ### + +/// Check that the runtime bitcode library is part of the compile line. Create a bogus +/// bitcode library and add it to the LIBRARY_PATH. +// RUN: touch %T/libomptarget-nvptx-sm_60.bc +// RUN: env LIBRARY_PATH=%T %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps \ +// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s +// RUN: rm %T/libomptarget-nvptx-sm_60.bc + +// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc + +/// ### + +/// Check that the warning is thrown when the libomptarget bitcode library is not found. +/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should never exist. +// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps \ +// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB-WARN %s + +// CHK-BCLIB-WARN: Expect degraded performance on the target device due to missing 'libomptarget-nvptx-sm_20.bc' in LIBRARY_PATH. Index: lib/Driver/ToolChains/Cuda.cpp === --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -21,6 +21,7 @@ #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" +#include "llvm/Support/Process.h" #include "llvm/Support/Program.h" #include @@ -580,6 +581,43 @@ CC1Args.push_back("-target-feature"); CC1Args.push_back("+ptx42"); } + + if (DeviceOffloadingKind == Action::OFK_OpenMP) { +SmallVector LibraryPaths; +// Add path to lib and/or lib64 folders. +SmallString<256> DefaultLibPath = + llvm::sys::path::parent_path(getDriver().Dir); +llvm::sys::path::append(DefaultLibPath, +Twine("lib") + CLANG_LIBDIR_SUFFIX); +LibraryPaths.emplace_back(DefaultLibPath.c_str()); + +// Add user defined library paths from LIBRARY_PATH. 
+if (llvm::Optional LibPath = + llvm::sys::Process::GetEnv("LIBRARY_PATH")) { + SmallVector Frags; + const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator, '\0'}; + llvm::SplitString(*LibPath, Frags, EnvPathSeparatorStr); + for (auto Path : Frags) +LibraryPaths.emplace_back(Path.trim()); +} + +std::string LibOmpTargetName = + "libomptarget-nvptx-" + GpuArch.str() + ".bc"; +bool FoundBCLibrary = false; +for (const std::string &LibraryPath : LibraryPaths) { + SmallString<128> LibOmpTargetFile(LibraryPath); + llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName); + if (llvm::sys::fs::exists(LibOmpTargetFile)) { +CC1Args.push_back("-mlink-cuda-bitcode"); +CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile)); +FoundBCLibrary = true; +break; + } +} +if (!FoundBCLibrary) + getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime) + << LibOmpTargetName; + } } void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs, Index: include/clang/Basic/DiagnosticDriverKinds.td === --- include/clang/Basic/DiagnosticDriverKinds.td +++ include/clang/Basic/DiagnosticDriverKinds.td @@ -203,6 +203,9 @@ def warn_drv_omp_offload_target_duplicate : Warning< "The OpenMP offloading target '%0' is similar to target '%1' already specified - will be ignored.">, InGroup; +def remark_drv_omp_offload_target_missingbcruntime : Warning< + "No .bc library found in the default clang lib directory or in LIBRARY_PATH. Expect degraded performance due to no inlining of runtime functions on target devices.">, + InGroup; def err_drv_bitcode_unsupported_on_toolchain : Error< "-fembed-bitcode is not supported on versions of iOS prior to 6.0">; Index: test/Driver/openmp-offload-gpu.c === --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offload-gpu.c @@ -142,3 +142,25 @@ // RUN: | FileCheck -check-prefix=CHK-NOLIBDEVICE
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea updated this revision to Diff 137230. gtbercea added a comment. Fix test. Repository: rC Clang https://reviews.llvm.org/D43197 Files: include/clang/Basic/DiagnosticDriverKinds.td lib/Driver/ToolChains/Cuda.cpp test/Driver/openmp-offload-gpu.c Index: test/Driver/openmp-offload-gpu.c === --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offload-gpu.c @@ -142,3 +142,25 @@ // RUN: | FileCheck -check-prefix=CHK-NOLIBDEVICE %s // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60 + +/// ### + +/// Check that the runtime bitcode library is part of the compile line. Create a bogus +/// bitcode library and add it to the LIBRARY_PATH. +// RUN: touch %T/libomptarget-nvptx-sm_60.bc +// RUN: env LIBRARY_PATH=%T %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps \ +// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s +// RUN: rm %T/libomptarget-nvptx-sm_60.bc + +// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc + +/// ### + +/// Check that the warning is thrown when the libomptarget bitcode library is not found. +/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should never exist. +// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps \ +// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB-WARN %s + +// CHK-BCLIB-WARN: No .bc library 'libomptarget-nvptx-sm_20.bc' found in the default clang lib directory or in LIBRARY_PATH. Expect degraded performance due to no inlining of runtime functions on target devices. Index: lib/Driver/ToolChains/Cuda.cpp === --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -21,6 +21,7 @@ #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" +#include "llvm/Support/Process.h" #include "llvm/Support/Program.h" #include @@ -580,6 +581,43 @@ CC1Args.push_back("-target-feature"); CC1Args.push_back("+ptx42"); } + + if (DeviceOffloadingKind == Action::OFK_OpenMP) { +SmallVector LibraryPaths; +// Add path to lib and/or lib64 folders. +SmallString<256> DefaultLibPath = + llvm::sys::path::parent_path(getDriver().Dir); +llvm::sys::path::append(DefaultLibPath, +Twine("lib") + CLANG_LIBDIR_SUFFIX); +LibraryPaths.emplace_back(DefaultLibPath.c_str()); + +// Add user defined library paths from LIBRARY_PATH. 
+if (llvm::Optional LibPath = + llvm::sys::Process::GetEnv("LIBRARY_PATH")) { + SmallVector Frags; + const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator, '\0'}; + llvm::SplitString(*LibPath, Frags, EnvPathSeparatorStr); + for (auto Path : Frags) +LibraryPaths.emplace_back(Path.trim()); +} + +std::string LibOmpTargetName = + "libomptarget-nvptx-" + GpuArch.str() + ".bc"; +bool FoundBCLibrary = false; +for (const std::string &LibraryPath : LibraryPaths) { + SmallString<128> LibOmpTargetFile(LibraryPath); + llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName); + if (llvm::sys::fs::exists(LibOmpTargetFile)) { +CC1Args.push_back("-mlink-cuda-bitcode"); +CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile)); +FoundBCLibrary = true; +break; + } +} +if (!FoundBCLibrary) + getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime) + << LibOmpTargetName; + } } void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs, Index: include/clang/Basic/DiagnosticDriverKinds.td === --- include/clang/Basic/DiagnosticDriverKinds.td +++ include/clang/Basic/DiagnosticDriverKinds.td @@ -203,6 +203,9 @@ def warn_drv_omp_offload_target_duplicate : Warning< "The OpenMP offloading target '%0' is similar to target '%1' already specified - will be ignored.">, InGroup; +def remark_drv_omp_offload_target_missingbcruntime : Warning< + "No .bc library '%0' found in the default clang lib directory or in LIBRARY_PATH. Expect degraded performance due to no inlining of runtime functions on target devices.">, + InGroup; def err_drv_bitcode_unsupported_on_toolchain : Error< "-fembed-bitcode is not supported on versions of iOS prior to 6.0">; Index: test/Driver/openmp-offload-gpu.c === --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offload
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea updated this revision to Diff 137233. gtbercea added a comment. - Fix message and test. Repository: rC Clang https://reviews.llvm.org/D43197 Files: include/clang/Basic/DiagnosticDriverKinds.td lib/Driver/ToolChains/Cuda.cpp test/Driver/openmp-offload-gpu.c Index: test/Driver/openmp-offload-gpu.c === --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offload-gpu.c @@ -142,3 +142,25 @@ // RUN: | FileCheck -check-prefix=CHK-NOLIBDEVICE %s // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60 + +/// ### + +/// Check that the runtime bitcode library is part of the compile line. Create a bogus +/// bitcode library and add it to the LIBRARY_PATH. +// RUN: touch %T/libomptarget-nvptx-sm_60.bc +// RUN: env LIBRARY_PATH=%T %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps \ +// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s +// RUN: rm %T/libomptarget-nvptx-sm_60.bc + +// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc + +/// ### + +/// Check that the warning is thrown when the libomptarget bitcode library is not found. +/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should never exist. +// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \ +// RUN: -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps \ +// RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB-WARN %s + +// CHK-BCLIB-WARN: No library 'libomptarget-nvptx-sm_20.bc' found in the default clang lib directory or in LIBRARY_PATH. Expect degraded performance due to no inlining of runtime functions on target devices. Index: lib/Driver/ToolChains/Cuda.cpp === --- lib/Driver/ToolChains/Cuda.cpp +++ lib/Driver/ToolChains/Cuda.cpp @@ -21,6 +21,7 @@ #include "llvm/Option/ArgList.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Path.h" +#include "llvm/Support/Process.h" #include "llvm/Support/Program.h" #include @@ -580,6 +581,43 @@ CC1Args.push_back("-target-feature"); CC1Args.push_back("+ptx42"); } + + if (DeviceOffloadingKind == Action::OFK_OpenMP) { +SmallVector LibraryPaths; +// Add path to lib and/or lib64 folders. +SmallString<256> DefaultLibPath = + llvm::sys::path::parent_path(getDriver().Dir); +llvm::sys::path::append(DefaultLibPath, +Twine("lib") + CLANG_LIBDIR_SUFFIX); +LibraryPaths.emplace_back(DefaultLibPath.c_str()); + +// Add user defined library paths from LIBRARY_PATH. 
+if (llvm::Optional LibPath = + llvm::sys::Process::GetEnv("LIBRARY_PATH")) { + SmallVector Frags; + const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator, '\0'}; + llvm::SplitString(*LibPath, Frags, EnvPathSeparatorStr); + for (auto Path : Frags) +LibraryPaths.emplace_back(Path.trim()); +} + +std::string LibOmpTargetName = + "libomptarget-nvptx-" + GpuArch.str() + ".bc"; +bool FoundBCLibrary = false; +for (const std::string &LibraryPath : LibraryPaths) { + SmallString<128> LibOmpTargetFile(LibraryPath); + llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName); + if (llvm::sys::fs::exists(LibOmpTargetFile)) { +CC1Args.push_back("-mlink-cuda-bitcode"); +CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile)); +FoundBCLibrary = true; +break; + } +} +if (!FoundBCLibrary) + getDriver().Diag(diag::warn_drv_omp_offload_target_missingbcruntime) + << LibOmpTargetName; + } } void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs, Index: include/clang/Basic/DiagnosticDriverKinds.td === --- include/clang/Basic/DiagnosticDriverKinds.td +++ include/clang/Basic/DiagnosticDriverKinds.td @@ -203,6 +203,9 @@ def warn_drv_omp_offload_target_duplicate : Warning< "The OpenMP offloading target '%0' is similar to target '%1' already specified - will be ignored.">, InGroup; +def warn_drv_omp_offload_target_missingbcruntime : Warning< + "No library '%0' found in the default clang lib directory or in LIBRARY_PATH. Expect degraded performance due to no inlining of runtime functions on target devices.">, + InGroup; def err_drv_bitcode_unsupported_on_toolchain : Error< "-fembed-bitcode is not supported on versions of iOS prior to 6.0">; Index: test/Driver/openmp-offload-gpu.c === --- test/Driver/openmp-offload-gpu.c +++ test/Driver/openmp-offlo
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea added inline comments. Comment at: lib/Driver/ToolChains/Cuda.cpp:536-542 + StringRef CompilerPath = env; + while (!CompilerPath.empty()) { +std::pair<StringRef, StringRef> Split = +CompilerPath.split(llvm::sys::EnvPathSeparator); +LibraryPaths.push_back(Split.first); +CompilerPath = Split.second; + } Hahnfeld wrote: > gtbercea wrote: > > Hahnfeld wrote: > > > `tools::addDirectoryList` uses `StringRef::find`; I'm not sure if > > > `StringRef::split` creates real copies of the string... > > What is your suggestion? > IMO you should use whatever existing code does, in that case > `StringRef::find`. Is this comment still relevant in the light of the most recent changes? Repository: rC Clang https://reviews.llvm.org/D43197
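For comparison, a sketch of the StringRef::find-based walk that tools::addDirectoryList uses (illustrative code, not from any revision here); note that both find() and split() only produce views into the same underlying buffer, so neither copies string data by itself:

#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Program.h" // llvm::sys::EnvPathSeparator

// Split a PATH-like value into its components without copying characters.
static void splitPathList(llvm::StringRef Dirs,
                          llvm::SmallVectorImpl<llvm::StringRef> &Out) {
  while (!Dirs.empty()) {
    size_t Pos = Dirs.find(llvm::sys::EnvPathSeparator);
    if (Pos == llvm::StringRef::npos) {
      Out.push_back(Dirs); // last component
      break;
    }
    Out.push_back(Dirs.substr(0, Pos));
    Dirs = Dirs.substr(Pos + 1);
  }
}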
[PATCH] D43660: [OpenMP] Add OpenMP data sharing infrastructure using global memory
gtbercea updated this revision to Diff 137600. gtbercea added a comment. Patch splitting: limit support in this patch to standalone target regions only. Support for combined directives will be fully covered in a subsequent patch. Repository: rC Clang https://reviews.llvm.org/D43660 Files: lib/CodeGen/CGDecl.cpp lib/CodeGen/CGOpenMPRuntime.cpp lib/CodeGen/CGOpenMPRuntime.h lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp lib/CodeGen/CGOpenMPRuntimeNVPTX.h lib/CodeGen/CGStmtOpenMP.cpp lib/CodeGen/CodeGenFunction.cpp test/OpenMP/nvptx_data_sharing.cpp test/OpenMP/nvptx_parallel_codegen.cpp Index: test/OpenMP/nvptx_parallel_codegen.cpp === --- test/OpenMP/nvptx_parallel_codegen.cpp +++ test/OpenMP/nvptx_parallel_codegen.cpp @@ -64,254 +64,243 @@ // CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l17}}_worker() +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}_worker() +// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, +// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, +// CHECK: store i8* null, i8** [[OMP_WORK_FN]], +// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]], +// CHECK: br label {{%?}}[[AWAIT_WORK:.+]] +// +// CHECK: [[AWAIT_WORK]] +// CHECK: call void @llvm.nvvm.barrier0() +// CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]] +// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8 +// store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1 +// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]], +// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null +// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]] +// +// CHECK: [[SEL_WORKERS]] +// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]] +// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0 +// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]] +// +// CHECK: [[EXEC_PARALLEL]] +// CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]], +// CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32)* [[PARALLEL_FN1:@.+]]_wrapper to i8*) +// CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]] +// +// CHECK: [[EXEC_PFN1]] +// CHECK: call void [[PARALLEL_FN1]]_wrapper( +// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] +// +// CHECK: [[CHECK_NEXT1]] +// CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]], +// CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32)* [[PARALLEL_FN2:@.+]]_wrapper to i8*) +// CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]] +// +// CHECK: [[EXEC_PFN2]] +// CHECK: call void [[PARALLEL_FN2]]_wrapper( +// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] +// +// CHECK: [[CHECK_NEXT2]] +// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]] +// +// CHECK: [[TERM_PARALLEL]] +// CHECK: call void @__kmpc_kernel_end_parallel() +// CHECK: br label {{%?}}[[BAR_PARALLEL]] +// +// CHECK: [[BAR_PARALLEL]] +// CHECK: call void @llvm.nvvm.barrier0() +// CHECK: br label {{%?}}[[AWAIT_WORK]] +// +// CHECK: [[EXIT]] +// CHECK: ret void +// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l26]](i[[SZ:32|64]] +// Create local storage for each capture. +// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]], +// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]] +// Store captures in the context. 
+// CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32* +// +// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]] +// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]] +// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] +// +// CHECK: [[WORKER]] +// CHECK: {{call|invoke}} void [[T6]]_worker() +// CHECK: br label {{%?}}[[EXIT:.+]] +// +// CHECK: [[CHECK_MASTER]] +// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]], +// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]] +// +// CHECK: [[MASTER]] +// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]] +// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]] +// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN1]]_wrapper to i8*), +// CHECK: call void @llvm.nvvm.barrier0() +// CHECK: call void @llvm.nvvm.barrier0() +// CHECK: call void @__kmpc_serialized_parallel( +// CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]]( +// CHE
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea marked an inline comment as done. gtbercea added inline comments. Comment at: lib/Driver/ToolChains/Cuda.cpp:536-542 + StringRef CompilerPath = env; + while (!CompilerPath.empty()) { +std::pair<StringRef, StringRef> Split = +CompilerPath.split(llvm::sys::EnvPathSeparator); +LibraryPaths.push_back(Split.first); +CompilerPath = Split.second; + } Hahnfeld wrote: > gtbercea wrote: > > Hahnfeld wrote: > > > gtbercea wrote: > > > > Hahnfeld wrote: > > > > > `tools::addDirectoryList` uses `StringRef::find`; I'm not sure if > > > > > `StringRef::split` creates real copies of the string... > > > > What is your suggestion? > > > IMO you should use whatever existing code does, in that case > > > `StringRef::find`. > > Is this comment still relevant in the light of the most recent changes? > Probably not (although the code is now completely different from > `tools::addDirectoryList`) Gotcha, do let me know if you see any other issue with this version of the code. I will mark this one as done for now. Repository: rC Clang https://reviews.llvm.org/D43197
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea updated this revision to Diff 137754. gtbercea added a comment. Change test.

Repository: rC Clang https://reviews.llvm.org/D43197

Files: include/clang/Basic/DiagnosticDriverKinds.td lib/Driver/ToolChains/Cuda.cpp test/Driver/Inputs/lib/libomptarget-nvptx-sm_60.bc test/Driver/openmp-offload-gpu.c

Index: test/Driver/openmp-offload-gpu.c
===================================================================
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,23 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE %s
 // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60
+
+/// ###
+
+/// Check that the runtime bitcode library is part of the compile line. Create a bogus
+/// bitcode library and add it to the LIBRARY_PATH.
+// RUN: env LIBRARY_PATH=%S/Inputs/lib %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps \
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s
+
+// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc
+
+/// ###
+
+/// Check that the warning is thrown when the libomptarget bitcode library is not found.
+/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should never exist.
+// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps \
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB-WARN %s
+
+// CHK-BCLIB-WARN: No library 'libomptarget-nvptx-sm_20.bc' found in the default clang lib directory or in LIBRARY_PATH. Expect degraded performance due to no inlining of runtime functions on target devices.

Index: lib/Driver/ToolChains/Cuda.cpp
===================================================================
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
 #include "llvm/Support/Program.h"
 #include <system_error>
@@ -580,6 +581,43 @@
     CC1Args.push_back("-target-feature");
     CC1Args.push_back("+ptx42");
   }
+
+  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+    SmallVector<std::string, 8> LibraryPaths;
+    // Add path to lib and/or lib64 folders.
+    SmallString<256> DefaultLibPath =
+        llvm::sys::path::parent_path(getDriver().Dir);
+    llvm::sys::path::append(DefaultLibPath,
+                            Twine("lib") + CLANG_LIBDIR_SUFFIX);
+    LibraryPaths.emplace_back(DriverArgs.MakeArgString(DefaultLibPath));
+
+    // Add user defined library paths from LIBRARY_PATH.
+    if (llvm::Optional<std::string> LibPath =
+            llvm::sys::Process::GetEnv("LIBRARY_PATH")) {
+      SmallVector<StringRef, 8> Frags;
+      const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator, '\0'};
+      llvm::SplitString(*LibPath, Frags, EnvPathSeparatorStr);
+      for (auto Path : Frags)
+        LibraryPaths.emplace_back(Path.trim());
+    }
+
+    std::string LibOmpTargetName =
+        "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+    bool FoundBCLibrary = false;
+    for (const std::string &LibraryPath : LibraryPaths) {
+      SmallString<128> LibOmpTargetFile(LibraryPath);
+      llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
+      if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+        CC1Args.push_back("-mlink-cuda-bitcode");
+        CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+        FoundBCLibrary = true;
+        break;
+      }
+    }
+    if (!FoundBCLibrary)
+      getDriver().Diag(diag::warn_drv_omp_offload_target_missingbcruntime)
+          << LibOmpTargetName;
+  }
 }

 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,

Index: include/clang/Basic/DiagnosticDriverKinds.td
===================================================================
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -203,6 +203,9 @@
 def warn_drv_omp_offload_target_duplicate : Warning<
   "The OpenMP offloading target '%0' is similar to target '%1' already specified - will be ignored.">,
   InGroup<OpenMPTarget>;
+def warn_drv_omp_offload_target_missingbcruntime : Warning<
+  "No library '%0' found in the default clang lib directory or in LIBRARY_PATH. Expect degraded performance due to no inlining of runtime functions on target devices.">,
+  InGroup<OpenMPTarget>;
 def err_drv_bitcode_unsupported_on_toolchain : Error<
   "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
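As a side note for readers, a standalone sketch of how the `llvm::SplitString` call in the hunk above treats a LIBRARY_PATH value (the paths below are made up): the delimiter argument is a set of characters and empty fragments are skipped, so stray separators in the environment variable are harmless.

#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h" // llvm::SplitString
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"

int main() {
  // Made-up LIBRARY_PATH value; on Unix llvm::sys::EnvPathSeparator is ':'.
  llvm::StringRef LibPath = "/opt/llvm/lib:: /home/user/lib";
  llvm::SmallVector<llvm::StringRef, 8> Frags;
  // The delimiter string is treated as a set of characters and empty
  // fragments are dropped, so the doubled ':' yields exactly two entries.
  llvm::SplitString(LibPath, Frags, ":");
  for (llvm::StringRef Path : Frags)
    llvm::outs() << "'" << Path.trim() << "'\n"; // trim, as the patch does
  return 0;
}

This prints '/opt/llvm/lib' and '/home/user/lib', matching the list of directories the driver then probes for the bitcode library.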
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea added inline comments.

Comment at: lib/Driver/ToolChains/Cuda.cpp:592
+                            Twine("lib") + CLANG_LIBDIR_SUFFIX);
+    LibraryPaths.emplace_back(DefaultLibPath.c_str());

ABataev wrote:
> Do you still need `.c_str()` here?

It doesn't compile without it, but we could get there using Args.MakeArgString() as well.

Comment at: test/Driver/openmp-offload-gpu.c:150
+/// bitcode library and add it to the LIBRARY_PATH.
+// RUN: touch %T/libomptarget-nvptx-sm_60.bc
+// RUN: env LIBRARY_PATH=%T %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \

Hahnfeld wrote:
> grokos wrote:
> > ABataev wrote:
> > > Create an empty `libomptarget-nvptx-sm_60.bc` in the `Driver/lib` directory and
> > > use it in the test rather than creating/deleting it dynamically.
> > I'm also in favour of this approach. On some systems /tmp is not accessible
> > and the regression test fails.
> This test doesn't (and shouldn't!) use `/tmp`. The build directory and `%T`
> are always writable (if not, you have different issues on your system).
>
> Btw, you need to pay attention that the driver now finds files next to the
> compiler directory. You may want to make sure that the test always passes /
> doesn't fail for wrong reasons.

Just added this.

Repository: rC Clang https://reviews.llvm.org/D43197
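For illustration, a minimal compilable sketch of the two alternatives discussed in the `.c_str()` comment above (the path literal is made up; `MakeArgString` is only reachable through the driver's `ArgList`, so it is shown as a comment rather than called):

#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include <string>

int main() {
  llvm::SmallString<256> DefaultLibPath("/usr/lib/clang"); // made-up path
  llvm::SmallVector<std::string, 8> LibraryPaths;

  // A SmallString has no direct conversion to std::string, so the plain
  // form does not compile:
  //   LibraryPaths.emplace_back(DefaultLibPath); // error
  // Going through the null-terminated buffer works:
  LibraryPaths.emplace_back(DefaultLibPath.c_str());
  // Inside the driver, the other option is to let the ArgList own a copy
  // and store the returned const char *:
  //   LibraryPaths.emplace_back(Args.MakeArgString(DefaultLibPath));
  return 0;
}

The trade-off is small: `.c_str()` copies into the std::string element, while `MakeArgString` additionally keeps a copy alive for the lifetime of the ArgList.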
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea updated this revision to Diff 137755. gtbercea added a comment. Revert to c_str().

Repository: rC Clang https://reviews.llvm.org/D43197

Files: include/clang/Basic/DiagnosticDriverKinds.td lib/Driver/ToolChains/Cuda.cpp test/Driver/Inputs/lib/libomptarget-nvptx-sm_60.bc test/Driver/openmp-offload-gpu.c

Index: test/Driver/openmp-offload-gpu.c
===================================================================
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,23 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE %s
 // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60
+
+/// ###
+
+/// Check that the runtime bitcode library is part of the compile line. Create a bogus
+/// bitcode library and add it to the LIBRARY_PATH.
+// RUN: env LIBRARY_PATH=%S/Inputs/lib %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps \
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s
+
+// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc
+
+/// ###
+
+/// Check that the warning is thrown when the libomptarget bitcode library is not found.
+/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should never exist.
+// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps \
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB-WARN %s
+
+// CHK-BCLIB-WARN: No library 'libomptarget-nvptx-sm_20.bc' found in the default clang lib directory or in LIBRARY_PATH. Expect degraded performance due to no inlining of runtime functions on target devices.

Index: lib/Driver/ToolChains/Cuda.cpp
===================================================================
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
 #include "llvm/Support/Program.h"
 #include <system_error>
@@ -580,6 +581,43 @@
     CC1Args.push_back("-target-feature");
     CC1Args.push_back("+ptx42");
   }
+
+  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+    SmallVector<std::string, 8> LibraryPaths;
+    // Add path to lib and/or lib64 folders.
+    SmallString<256> DefaultLibPath =
+        llvm::sys::path::parent_path(getDriver().Dir);
+    llvm::sys::path::append(DefaultLibPath,
+                            Twine("lib") + CLANG_LIBDIR_SUFFIX);
+    LibraryPaths.emplace_back(DefaultLibPath.c_str());
+
+    // Add user defined library paths from LIBRARY_PATH.
+    if (llvm::Optional<std::string> LibPath =
+            llvm::sys::Process::GetEnv("LIBRARY_PATH")) {
+      SmallVector<StringRef, 8> Frags;
+      const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator, '\0'};
+      llvm::SplitString(*LibPath, Frags, EnvPathSeparatorStr);
+      for (auto Path : Frags)
+        LibraryPaths.emplace_back(Path.trim());
+    }
+
+    std::string LibOmpTargetName =
+        "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+    bool FoundBCLibrary = false;
+    for (const std::string &LibraryPath : LibraryPaths) {
+      SmallString<128> LibOmpTargetFile(LibraryPath);
+      llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
+      if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+        CC1Args.push_back("-mlink-cuda-bitcode");
+        CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+        FoundBCLibrary = true;
+        break;
+      }
+    }
+    if (!FoundBCLibrary)
+      getDriver().Diag(diag::warn_drv_omp_offload_target_missingbcruntime)
+          << LibOmpTargetName;
+  }
 }

 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,

Index: include/clang/Basic/DiagnosticDriverKinds.td
===================================================================
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -203,6 +203,9 @@
 def warn_drv_omp_offload_target_duplicate : Warning<
   "The OpenMP offloading target '%0' is similar to target '%1' already specified - will be ignored.">,
   InGroup<OpenMPTarget>;
+def warn_drv_omp_offload_target_missingbcruntime : Warning<
+  "No library '%0' found in the default clang lib directory or in LIBRARY_PATH. Expect degraded performance due to no inlining of runtime functions on target devices.">,
+  InGroup<OpenMPTarget>;
 def err_drv_bitcode_unsupported_on_toolchain : Error<
   "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea added inline comments.

Comment at: test/Driver/openmp-offload-gpu.c:150
+/// bitcode library and add it to the LIBRARY_PATH.
+// RUN: touch %T/libomptarget-nvptx-sm_60.bc
+// RUN: env LIBRARY_PATH=%T %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \

gtbercea wrote:
> Hahnfeld wrote:
> > grokos wrote:
> > > ABataev wrote:
> > > > Create an empty `libomptarget-nvptx-sm_60.bc` in the `Driver/lib` directory
> > > > and use it in the test rather than creating/deleting it dynamically.
> > > I'm also in favour of this approach. On some systems /tmp is not
> > > accessible and the regression test fails.
> > This test doesn't (and shouldn't!) use `/tmp`. The build directory and `%T`
> > are always writable (if not, you have different issues on your system).
> >
> > Btw, you need to pay attention that the driver now finds files next to the
> > compiler directory. You may want to make sure that the test always passes /
> > doesn't fail for wrong reasons.
> Just added this.

@Hahnfeld I've used %S instead. The only way the test can produce a false positive is if the clang lib folder already contains this .bc file, and there is no way to prevent that since we check DefaultLibPath first.

Repository: rC Clang https://reviews.llvm.org/D43197
[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library
gtbercea updated this revision to Diff 137769. gtbercea added a comment. Fix test.

Repository: rC Clang https://reviews.llvm.org/D43197

Files: include/clang/Basic/DiagnosticDriverKinds.td lib/Driver/ToolChains/Cuda.cpp test/Driver/Inputs/lib/libomptarget-nvptx-sm_20.bc test/Driver/openmp-offload-gpu.c

Index: test/Driver/openmp-offload-gpu.c
===================================================================
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,23 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE %s
 // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60
+
+/// ###
+
+/// Check that the runtime bitcode library is part of the compile line. Create a bogus
+/// bitcode library and add it to the LIBRARY_PATH.
+// RUN: env LIBRARY_PATH=%S/Inputs/lib %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps \
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s
+
+// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_20.bc
+
+/// ###
+
+/// Check that the warning is thrown when the libomptarget bitcode library is not found.
+/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should never exist.
+// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps \
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB-WARN %s
+
+// CHK-BCLIB-WARN: No library 'libomptarget-nvptx-sm_20.bc' found in the default clang lib directory or in LIBRARY_PATH. Expect degraded performance due to no inlining of runtime functions on target devices.

Index: lib/Driver/ToolChains/Cuda.cpp
===================================================================
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
 #include "llvm/Support/Program.h"
 #include <system_error>
@@ -580,6 +581,43 @@
     CC1Args.push_back("-target-feature");
     CC1Args.push_back("+ptx42");
   }
+
+  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+    SmallVector<std::string, 8> LibraryPaths;
+    // Add path to lib and/or lib64 folders.
+    SmallString<256> DefaultLibPath =
+        llvm::sys::path::parent_path(getDriver().Dir);
+    llvm::sys::path::append(DefaultLibPath,
+                            Twine("lib") + CLANG_LIBDIR_SUFFIX);
+    LibraryPaths.emplace_back(DefaultLibPath.c_str());
+
+    // Add user defined library paths from LIBRARY_PATH.
+    if (llvm::Optional<std::string> LibPath =
+            llvm::sys::Process::GetEnv("LIBRARY_PATH")) {
+      SmallVector<StringRef, 8> Frags;
+      const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator, '\0'};
+      llvm::SplitString(*LibPath, Frags, EnvPathSeparatorStr);
+      for (auto Path : Frags)
+        LibraryPaths.emplace_back(Path.trim());
+    }
+
+    std::string LibOmpTargetName =
+        "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+    bool FoundBCLibrary = false;
+    for (const std::string &LibraryPath : LibraryPaths) {
+      SmallString<128> LibOmpTargetFile(LibraryPath);
+      llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
+      if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+        CC1Args.push_back("-mlink-cuda-bitcode");
+        CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+        FoundBCLibrary = true;
+        break;
+      }
+    }
+    if (!FoundBCLibrary)
+      getDriver().Diag(diag::warn_drv_omp_offload_target_missingbcruntime)
+          << LibOmpTargetName;
+  }
 }

 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,

Index: include/clang/Basic/DiagnosticDriverKinds.td
===================================================================
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -203,6 +203,9 @@
 def warn_drv_omp_offload_target_duplicate : Warning<
   "The OpenMP offloading target '%0' is similar to target '%1' already specified - will be ignored.">,
   InGroup<OpenMPTarget>;
+def warn_drv_omp_offload_target_missingbcruntime : Warning<
+  "No library '%0' found in the default clang lib directory or in LIBRARY_PATH. Expect degraded performance due to no inlining of runtime functions on target devices.">,
+  InGroup<OpenMPTarget>;
 def err_drv_bitcode_unsupported_on_toolchain : Error<
   "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;