[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls

2018-08-06 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 159335.
gtbercea added a comment.

  Fix function call.


Repository:
  rC Clang

https://reviews.llvm.org/D47849

Files:
  include/clang/Driver/ToolChain.h
  lib/Driver/ToolChains/Clang.cpp
  lib/Driver/ToolChains/Cuda.cpp
  lib/Driver/ToolChains/Cuda.h
  lib/Headers/CMakeLists.txt
  lib/Headers/__clang_cuda_device_functions.h
  lib/Headers/__clang_cuda_libdevice_declares.h
  test/CodeGen/nvptx_device_math_functions.c
  test/Driver/openmp-offload-gpu.c

Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -76,9 +76,9 @@
 // RUN:  -no-canonical-prefixes -save-temps %t.o -fopenmp-use-target-bundling 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-CUBIN-UNBUNDLING-NVLINK %s
 
-/// Use DAG to ensure that cubin file has been unbundled.
+/// Use DAG to ensure that object file has not been unbundled.
 // CHK-CUBIN-UNBUNDLING-NVLINK-DAG: nvlink{{.*}}" {{.*}}"[[CUBIN:.*\.cubin]]"
-// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: clang-offload-bundler{{.*}}" "-type=o" {{.*}}"-outputs={{.*}}[[CUBIN]]
+// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: clang-offload-bundler{{.*}}" "-type=o" {{.*}}[[CUBIN]]
 // CHK-CUBIN-UNBUNDLING-NVLINK-DAG-SAME: "-unbundle"
 
 /// ###
Index: test/CodeGen/nvptx_device_math_functions.c
===
--- /dev/null
+++ test/CodeGen/nvptx_device_math_functions.c
@@ -0,0 +1,20 @@
+// Test calling of device math functions.
+///==///
+
+// RUN: %clang -fmath-errno -S -emit-llvm -o - %s -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda | FileCheck -check-prefix CHECK-YES %s
+
+void test_sqrt(double a1) {
+  #pragma omp target
+  {
+// CHECK-YES: call double @llvm.nvvm.sqrt.rn.d(double
+double l1 = sqrt(a1);
+  }
+}
+
+void test_pow(float a0, double a1, long double a2) {
+  #pragma omp target
+  {
+// CHECK-YES: call double @__internal_accurate_pow(double
+double l1 = pow(a1, a1);
+  }
+}
Index: lib/Headers/__clang_cuda_libdevice_declares.h
===
--- lib/Headers/__clang_cuda_libdevice_declares.h
+++ lib/Headers/__clang_cuda_libdevice_declares.h
@@ -24,443 +24,455 @@
 #ifndef __CLANG_CUDA_LIBDEVICE_DECLARES_H__
 #define __CLANG_CUDA_LIBDEVICE_DECLARES_H__
 
+#if defined(_OPENMP)
+#define __DEVICE__
+#elif defined(__CUDA__)
+#define __DEVICE__ __device__
+#endif
+
+#if defined(__cplusplus)
 extern "C" {
+#endif
 
-__device__ int __nv_abs(int __a);
-__device__ double __nv_acos(double __a);
-__device__ float __nv_acosf(float __a);
-__device__ double __nv_acosh(double __a);
-__device__ float __nv_acoshf(float __a);
-__device__ double __nv_asin(double __a);
-__device__ float __nv_asinf(float __a);
-__device__ double __nv_asinh(double __a);
-__device__ float __nv_asinhf(float __a);
-__device__ double __nv_atan2(double __a, double __b);
-__device__ float __nv_atan2f(float __a, float __b);
-__device__ double __nv_atan(double __a);
-__device__ float __nv_atanf(float __a);
-__device__ double __nv_atanh(double __a);
-__device__ float __nv_atanhf(float __a);
-__device__ int __nv_brev(int __a);
-__device__ long long __nv_brevll(long long __a);
-__device__ int __nv_byte_perm(int __a, int __b, int __c);
-__device__ double __nv_cbrt(double __a);
-__device__ float __nv_cbrtf(float __a);
-__device__ double __nv_ceil(double __a);
-__device__ float __nv_ceilf(float __a);
-__device__ int __nv_clz(int __a);
-__device__ int __nv_clzll(long long __a);
-__device__ double __nv_copysign(double __a, double __b);
-__device__ float __nv_copysignf(float __a, float __b);
-__device__ double __nv_cos(double __a);
-__device__ float __nv_cosf(float __a);
-__device__ double __nv_cosh(double __a);
-__device__ float __nv_coshf(float __a);
-__device__ double __nv_cospi(double __a);
-__device__ float __nv_cospif(float __a);
-__device__ double __nv_cyl_bessel_i0(double __a);
-__device__ float __nv_cyl_bessel_i0f(float __a);
-__device__ double __nv_cyl_bessel_i1(double __a);
-__device__ float __nv_cyl_bessel_i1f(float __a);
-__device__ double __nv_dadd_rd(double __a, double __b);
-__device__ double __nv_dadd_rn(double __a, double __b);
-__device__ double __nv_dadd_ru(double __a, double __b);
-__device__ double __nv_dadd_rz(double __a, double __b);
-__device__ double __nv_ddiv_rd(double __a, double __b);
-__device__ double __nv_ddiv_rn(double __a, double __b);
-__device__ double __nv_ddiv_ru(double __a, double __b);
-__device__ double __nv_ddiv_rz(double __a, double __b);
-__device__ double __nv_dmul_rd(double __a, double __b);
-__device__ double __nv_dmul_rn(double __a, double __b);
-__device__ double __nv_dmul_ru(double __a, double __b);
-__device__ double __nv_dmul_rz(double __a, double __b);
-__device__ float __nv_do

[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls

2018-08-07 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.

In https://reviews.llvm.org/D47849#1190903, @Hahnfeld wrote:

> Do we still need this? I think what we really need to solve is the problem of 
> (host) inline assembly in the header files...


Don't we want to use device specific math functions?
It's not just about avoiding some the host specific assembly, it's also about 
getting an implementation tailored to the device.


Repository:
  rC Clang

https://reviews.llvm.org/D47849



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D47394: [OpenMP][Clang][NVPTX] Replace bundling with partial linking for the OpenMP NVPTX device offloading toolchain

2018-08-07 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added inline comments.



Comment at: lib/Driver/ToolChains/Cuda.cpp:664
+  // Anything that's not a file name is potentially a static library
+  // so treat it as such.
+  if (C.canSkipOffloadBundler())

sfantao wrote:
> So, what if it is not a static library?
Can it be anything else at this point?


Repository:
  rC Clang

https://reviews.llvm.org/D47394



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D47394: [OpenMP][Clang][NVPTX] Replace bundling with partial linking for the OpenMP NVPTX device offloading toolchain

2018-08-07 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 159536.
gtbercea marked 3 inline comments as done.
gtbercea added a comment.

- Address comments.


Repository:
  rC Clang

https://reviews.llvm.org/D47394

Files:
  include/clang/Driver/Action.h
  include/clang/Driver/Compilation.h
  include/clang/Driver/Options.td
  include/clang/Driver/ToolChain.h
  lib/Driver/Action.cpp
  lib/Driver/Compilation.cpp
  lib/Driver/Driver.cpp
  lib/Driver/ToolChain.cpp
  lib/Driver/ToolChains/Clang.cpp
  lib/Driver/ToolChains/Clang.h
  lib/Driver/ToolChains/Cuda.cpp
  test/Driver/openmp-offload-gpu-linux.c
  test/Driver/openmp-offload-gpu.c
  test/Driver/openmp-offload.c

Index: test/Driver/openmp-offload.c
===
--- test/Driver/openmp-offload.c
+++ test/Driver/openmp-offload.c
@@ -480,13 +480,13 @@
 // Create host object and bundle.
 // CHK-BUJOBS: clang{{.*}}" "-cc1" "-triple" "powerpc64le--linux" "-emit-obj" {{.*}}"-fopenmp" {{.*}}"-o" "
 // CHK-BUJOBS-SAME: [[HOSTOBJ:[^\\/]+\.o]]" "-x" "ir" "{{.*}}[[HOSTBC]]"
-// CHK-BUJOBS: clang-offload-bundler{{.*}}" "-type=o" "-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs=
+// CHK-BUJOBS: clang-offload-bundler{{.*}}" "-type=o"{{.*}}"-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs=
 // CHK-BUJOBS-SAME: [[RES:[^\\/]+\.o]]" "-inputs={{.*}}[[T1OBJ]],{{.*}}[[T2OBJ]],{{.*}}[[HOSTOBJ]]"
 // CHK-BUJOBS-ST: clang{{.*}}" "-cc1" "-triple" "powerpc64le--linux" "-S" {{.*}}"-fopenmp" {{.*}}"-o" "
 // CHK-BUJOBS-ST-SAME: [[HOSTASM:[^\\/]+\.s]]" "-x" "ir" "{{.*}}[[HOSTBC]]"
 // CHK-BUJOBS-ST: clang{{.*}}" "-cc1as" "-triple" "powerpc64le--linux" "-filetype" "obj" {{.*}}"-o" "
 // CHK-BUJOBS-ST-SAME: [[HOSTOBJ:[^\\/]+\.o]]" "{{.*}}[[HOSTASM]]"
-// CHK-BUJOBS-ST: clang-offload-bundler{{.*}}" "-type=o" "-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs=
+// CHK-BUJOBS-ST: clang-offload-bundler{{.*}}" "-type=o"{{.*}}"-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs=
 // CHK-BUJOBS-ST-SAME: [[RES:[^\\/]+\.o]]" "-inputs={{.*}}[[T1OBJ]],{{.*}}[[T2OBJ]],{{.*}}[[HOSTOBJ]]"
 
 /// ###
Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -61,7 +61,7 @@
 
 /// Check cubin file generation and bundling
 // RUN:   %clang -### -target powerpc64le-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
-// RUN:  -no-canonical-prefixes -save-temps %s -c 2>&1 \
+// RUN:  -no-canonical-prefixes -save-temps %s -c -fopenmp-use-target-bundling 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-PTXAS-CUBIN-BUNDLING %s
 
 // CHK-PTXAS-CUBIN-BUNDLING: clang{{.*}}" "-o" "[[PTX:.*\.s]]"
@@ -73,7 +73,7 @@
 /// Check cubin file unbundling and usage by nvlink
 // RUN:   touch %t.o
 // RUN:   %clang -### -target powerpc64le-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
-// RUN:  -no-canonical-prefixes -save-temps %t.o 2>&1 \
+// RUN:  -no-canonical-prefixes -save-temps %t.o -fopenmp-use-target-bundling 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-CUBIN-UNBUNDLING-NVLINK %s
 
 /// Use DAG to ensure that cubin file has been unbundled.
@@ -87,11 +87,11 @@
 // RUN:   touch %t1.o
 // RUN:   touch %t2.o
 // RUN:   %clang -### -no-canonical-prefixes -target powerpc64le-unknown-linux-gnu -fopenmp=libomp \
-// RUN:  -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o 2>&1 \
+// RUN:  -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o -fopenmp-use-target-bundling 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-TWOCUBIN %s
 /// Check cubin file generation and usage by nvlink when toolchain has BindArchAction
 // RUN:   %clang -### -no-canonical-prefixes -target x86_64-apple-darwin17.0.0 -fopenmp=libomp \
-// RUN:  -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o 2>&1 \
+// RUN:  -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o -fopenmp-use-target-bundling 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-TWOCUBIN %s
 
 // CHK-TWOCUBIN: nvlink{{.*}}openmp-offload-{{.*}}.cubin" "{{.*}}openmp-offload-{{.*}}.cubin"
Index: test/Driver/openmp-offload-gpu-linux.c
===
--- /dev/null
+++ test/Driver/openmp-offload-gpu-linux.c
@@ -0,0 +1,52 @@
+///
+/// Perform driver tests for OpenMP offloading on Linux systems
+///
+
+// UNSUPPORTED: system-windows
+
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: powerpc-registered-target
+// REQUIRES: nvptx-registered-target
+
+/// Check cubin file generation and partial linking with ld
+// RUN:   %clang -### -target powerpc64le-unknown-linux-gnu -fopenmp=

[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls

2018-08-07 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 159574.
gtbercea added a comment.

Prevent math builtins from being used for nvptx toolchain.


Repository:
  rC Clang

https://reviews.llvm.org/D47849

Files:
  include/clang/Driver/ToolChain.h
  lib/Driver/ToolChains/Clang.cpp
  lib/Driver/ToolChains/Cuda.cpp
  lib/Driver/ToolChains/Cuda.h
  lib/Headers/CMakeLists.txt
  lib/Headers/__clang_cuda_device_functions.h
  lib/Headers/__clang_cuda_libdevice_declares.h
  test/CodeGen/nvptx_device_math_functions.c
  test/Driver/openmp-offload-gpu.c

Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -76,9 +76,9 @@
 // RUN:  -no-canonical-prefixes -save-temps %t.o -fopenmp-use-target-bundling 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-CUBIN-UNBUNDLING-NVLINK %s
 
-/// Use DAG to ensure that cubin file has been unbundled.
+/// Use DAG to ensure that object file has not been unbundled.
 // CHK-CUBIN-UNBUNDLING-NVLINK-DAG: nvlink{{.*}}" {{.*}}"[[CUBIN:.*\.cubin]]"
-// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: clang-offload-bundler{{.*}}" "-type=o" {{.*}}"-outputs={{.*}}[[CUBIN]]
+// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: clang-offload-bundler{{.*}}" "-type=o" {{.*}}[[CUBIN]]
 // CHK-CUBIN-UNBUNDLING-NVLINK-DAG-SAME: "-unbundle"
 
 /// ###
Index: test/CodeGen/nvptx_device_math_functions.c
===
--- /dev/null
+++ test/CodeGen/nvptx_device_math_functions.c
@@ -0,0 +1,20 @@
+// Test calling of device math functions.
+///==///
+
+// RUN: %clang -fmath-errno -S -emit-llvm -o - %s -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda | FileCheck -check-prefix CHECK-YES %s
+
+void test_sqrt(double a1) {
+  #pragma omp target
+  {
+// CHECK-YES: call double @llvm.nvvm.sqrt.rn.d(double
+double l1 = sqrt(a1);
+  }
+}
+
+void test_pow(float a0, double a1, long double a2) {
+  #pragma omp target
+  {
+// CHECK-YES: call double @__internal_accurate_pow(double
+double l1 = pow(a1, a1);
+  }
+}
Index: lib/Headers/__clang_cuda_libdevice_declares.h
===
--- lib/Headers/__clang_cuda_libdevice_declares.h
+++ lib/Headers/__clang_cuda_libdevice_declares.h
@@ -24,443 +24,455 @@
 #ifndef __CLANG_CUDA_LIBDEVICE_DECLARES_H__
 #define __CLANG_CUDA_LIBDEVICE_DECLARES_H__
 
+#if defined(_OPENMP)
+#define __DEVICE__
+#elif defined(__CUDA__)
+#define __DEVICE__ __device__
+#endif
+
+#if defined(__cplusplus)
 extern "C" {
+#endif
 
-__device__ int __nv_abs(int __a);
-__device__ double __nv_acos(double __a);
-__device__ float __nv_acosf(float __a);
-__device__ double __nv_acosh(double __a);
-__device__ float __nv_acoshf(float __a);
-__device__ double __nv_asin(double __a);
-__device__ float __nv_asinf(float __a);
-__device__ double __nv_asinh(double __a);
-__device__ float __nv_asinhf(float __a);
-__device__ double __nv_atan2(double __a, double __b);
-__device__ float __nv_atan2f(float __a, float __b);
-__device__ double __nv_atan(double __a);
-__device__ float __nv_atanf(float __a);
-__device__ double __nv_atanh(double __a);
-__device__ float __nv_atanhf(float __a);
-__device__ int __nv_brev(int __a);
-__device__ long long __nv_brevll(long long __a);
-__device__ int __nv_byte_perm(int __a, int __b, int __c);
-__device__ double __nv_cbrt(double __a);
-__device__ float __nv_cbrtf(float __a);
-__device__ double __nv_ceil(double __a);
-__device__ float __nv_ceilf(float __a);
-__device__ int __nv_clz(int __a);
-__device__ int __nv_clzll(long long __a);
-__device__ double __nv_copysign(double __a, double __b);
-__device__ float __nv_copysignf(float __a, float __b);
-__device__ double __nv_cos(double __a);
-__device__ float __nv_cosf(float __a);
-__device__ double __nv_cosh(double __a);
-__device__ float __nv_coshf(float __a);
-__device__ double __nv_cospi(double __a);
-__device__ float __nv_cospif(float __a);
-__device__ double __nv_cyl_bessel_i0(double __a);
-__device__ float __nv_cyl_bessel_i0f(float __a);
-__device__ double __nv_cyl_bessel_i1(double __a);
-__device__ float __nv_cyl_bessel_i1f(float __a);
-__device__ double __nv_dadd_rd(double __a, double __b);
-__device__ double __nv_dadd_rn(double __a, double __b);
-__device__ double __nv_dadd_ru(double __a, double __b);
-__device__ double __nv_dadd_rz(double __a, double __b);
-__device__ double __nv_ddiv_rd(double __a, double __b);
-__device__ double __nv_ddiv_rn(double __a, double __b);
-__device__ double __nv_ddiv_ru(double __a, double __b);
-__device__ double __nv_ddiv_rz(double __a, double __b);
-__device__ double __nv_dmul_rd(double __a, double __b);
-__device__ double __nv_dmul_rn(double __a, double __b);
-__device__ double __nv_dmul_ru(double __a, double __b);
-__device__ double __nv_dmul_rz(double __a, 

[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls

2018-08-08 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.



> Ok, so you are already talking about performance. I think we should fix 
> correctness first, in particular the compiler shouldn't complain whenever 
> `` is included.

This patch is concerned with calling device functions when you're on the 
device. The correctness issues you mention are orthogonal to this and should be 
handled by another patch. I don't think this patch should be held up any longer.


Repository:
  rC Clang

https://reviews.llvm.org/D47849



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls

2018-08-08 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.

In https://reviews.llvm.org/D47849#1192245, @Hahnfeld wrote:

> In https://reviews.llvm.org/D47849#1192134, @gtbercea wrote:
>
> > This patch is concerned with calling device functions when you're on the 
> > device. The correctness issues you mention are orthogonal to this and 
> > should be handled by another patch. I don't think this patch should be held 
> > up any longer.
>
>
> I'm confused by now, could you please highlight the point that I'm missing?


You're bringing up the correctness of the header files which is a detail that 
is orthogonal to this patch. Even if the header files worked correctly I would 
still want to use the libdevice functions. Fixing the header files themselves 
should be therefore done in a separate patch.
Using the libdevice functions guarantees correctness (no weird assembly 
instructions that the device doesn't recognize etc.) and may improve 
performance (if for example the libdevice contained device specific assembly).

The purpose of this patch is to call NVIDIA's libdevice math functions which 
should in principle be more efficient in terms of runtime and register usage. 
Not all of them may be more effecient today (like @tra suggested) but some of 
them will be. Maybe others will be improved in the future, maybe not, again 
that's an orthogonal point. The benefit of using libdevice functions is that 
any improvements NVIDIA makes we will be there to use them in the OpenMP NVPTX 
toolchain. The premise of the OpenMP NVPTX toolchain is that it will leverage 
as much of the CUDA toolchain as possible.

Another point is that users specifically ask for NVIDIA math functions to be 
called on the device when using OpenMP NVPTX device offloading. The libdevice 
library offers __nv_fast_* variants of some math functions. Users want to have 
access to those functions and other functions that the libdevice library 
contains.


Repository:
  rC Clang

https://reviews.llvm.org/D47849



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls

2018-08-08 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.



> IIRC you started to work on this to fix the problem with inline assembly (see 
> https://reviews.llvm.org/D47849#1125019). AFAICS this patch fixes 
> declarations of math functions but you still cannot include `math.h` which 
> most "correct" codes do.

I'm not sure what you mean by this. This patch enables me to include math.h.


Repository:
  rC Clang

https://reviews.llvm.org/D47849



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls

2018-08-08 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.

In https://reviews.llvm.org/D47849#1192368, @Hahnfeld wrote:

> In https://reviews.llvm.org/D47849#1192321, @gtbercea wrote:
>
> > > IIRC you started to work on this to fix the problem with inline assembly 
> > > (see https://reviews.llvm.org/D47849#1125019). AFAICS this patch fixes 
> > > declarations of math functions but you still cannot include `math.h` 
> > > which most "correct" codes do.
> >
> > I'm not sure what you mean by this. This patch enables me to include math.h.
>
>
> `math.c`:
>
>   #include 
>
>
> executed commands:
>
>$ clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -c math.c -O2
>   In file included from math.c:1:
>   In file included from /usr/include/math.h:413:
>   /usr/include/bits/mathinline.h:131:43: error: invalid input constraint 'x' 
> in asm
> __asm ("pmovmskb %1, %0" : "=r" (__m) : "x" (__x));
> ^
>   /usr/include/bits/mathinline.h:143:43: error: invalid input constraint 'x' 
> in asm
> __asm ("pmovmskb %1, %0" : "=r" (__m) : "x" (__x));
> ^
>   2 errors generated.
>


I do not get that error.


Repository:
  rC Clang

https://reviews.llvm.org/D47849



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls

2018-08-08 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.

In https://reviews.llvm.org/D47849#1192368, @Hahnfeld wrote:

> In https://reviews.llvm.org/D47849#1192321, @gtbercea wrote:
>
> > > IIRC you started to work on this to fix the problem with inline assembly 
> > > (see https://reviews.llvm.org/D47849#1125019). AFAICS this patch fixes 
> > > declarations of math functions but you still cannot include `math.h` 
> > > which most "correct" codes do.
> >
> > I'm not sure what you mean by this. This patch enables me to include math.h.
>
>
> `math.c`:
>
>   #include 
>
>
> executed commands:
>
>$ clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -c math.c -O2
>   In file included from math.c:1:
>   In file included from /usr/include/math.h:413:
>   /usr/include/bits/mathinline.h:131:43: error: invalid input constraint 'x' 
> in asm
> __asm ("pmovmskb %1, %0" : "=r" (__m) : "x" (__x));
> ^
>   /usr/include/bits/mathinline.h:143:43: error: invalid input constraint 'x' 
> in asm
> __asm ("pmovmskb %1, %0" : "=r" (__m) : "x" (__x));
> ^
>   2 errors generated.
>


We are probably linking against different math.h files. I don't seem to have a 
mathinline.h with those instructions. Perhaps this is an x86 specific error.

I think I know what's happening. I think the host math.h is still included but 
not necessarily used. Math functions resolve to math functions in the CUDA 
header first (that's what this patch does). This patch doesn't prevent math.h 
from being included.


Repository:
  rC Clang

https://reviews.llvm.org/D47849



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls

2018-08-08 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.

In https://reviews.llvm.org/D47849#1192383, @Hahnfeld wrote:

> In https://reviews.llvm.org/D47849#1192375, @gtbercea wrote:
>
> > I do not get that error.
>
>
> In the beginning you said that you were facing the same error. Did that go 
> away in the meantime?
>  Are you testing on x86 or Power? With optimizations enabled?


Since I'm running on Power I was facing a similar problem related to host 
assembly instructions on device but not exactly the same error.

The error you are seeing is that the NVPTX target doesn't regard "x" as a valid 
input constraint. x is an x86 specific constraint which I don't have on the 
Power side.

The problems I was having were related to the math functions on the device 
resolving to host math functions which contained host assembly instructions 
which were not recognized by NVPTX. This patch fixes that issue.

Perhaps the inclusion of the host math.h should just be prevented for device 
code?


Repository:
  rC Clang

https://reviews.llvm.org/D47849



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls

2018-08-08 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.

@Hahnfeld do you get the same error if you compile with clang++ instead of 
clang?


Repository:
  rC Clang

https://reviews.llvm.org/D47849



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls

2018-08-10 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.



> The downside of this approach is that LLVM doesn't recognize these function 
> calls and doesn't perform optimizations to fold libcalls. For example `pow(a, 
> 2)` is transformed into a multiplication but `__nv_pow(a, 2)` is not.

Doesn't CUDA have the same problem?


Repository:
  rC Clang

https://reviews.llvm.org/D47849



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls

2018-08-10 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.



>   I don't want to use a fast `pow(a, 2)`, I don't want to call a library 
> function for that at all.

I do believe you won't end up calling a function. If you're compiling with 
optimizations on this will be inlined.


Repository:
  rC Clang

https://reviews.llvm.org/D47849



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls

2018-08-14 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.

Thanks @Hahnfeld for your suggestions.

Unfortunately doing the lowering in the backend one would need to replace the 
math function calls with calls to libdevice function calls. I have not been 
able to do that in an elegant way. Encoding the interface to libdevice is just 
not a clean process not to mention that any changes to libdevice will have to 
be tracked manually with every new CUDA version. It does not make the code more 
maintainable, on the contrary I think it makes it harder to track libdevice 
changes.

On the same note, clang-cuda doesn't do the pow(a,2) -> a*a optimization, I 
checked. It is something that needs to be fixed for Clang-CUDA first before 
OpenMP can make use of it. OpenMP-NVPTX toolchain is designed to exist on top 
of the CUDA toolchain. It therefore inherits all the clang-cuda benefits and in 
this particular case, limitations.

As for the Sema check error you report (the one related to the x restriction), 
I think the fix you proposed is good and should be pushed in a separate patch.


Repository:
  rC Clang

https://reviews.llvm.org/D47849



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls

2018-08-14 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.

Just to address any generality concerns:

This patch fixes the problem of calling libdevice math functions for all 
platform combinations. It ensures that the OpenMP NVPTX target region will NOT 
call any host math functions (which ever host that may be) IF equivalent device 
functions are available.

I think there was a confusion regarding header file inclusion. This patch does 
not address any issues that might arise from the user including header files 
(be it math.h or some other header). Any failure related to header file 
inclusion (such as the reported x restriction issue on x86) is unrelated to 
what this patch aims to do. Before the functionality in this patch can kick in, 
any user-included headers must successfully pass all checks in place for the 
NVPTX toolchain. A fix in the direction of the one proposed in one of the 
comments above is probably required. The fix would also needs its own separate 
patch.


Repository:
  rC Clang

https://reviews.llvm.org/D47849



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls

2018-08-14 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 160598.
gtbercea added a comment.
Herald added a subscriber: jholewinski.

Add __NO_MATH_INLINES macro for the NVPTX toolchain to prevent any host 
assembly from seeping onto the device.


Repository:
  rC Clang

https://reviews.llvm.org/D47849

Files:
  include/clang/Driver/ToolChain.h
  lib/Basic/Targets/NVPTX.cpp
  lib/Driver/ToolChains/Clang.cpp
  lib/Driver/ToolChains/Cuda.cpp
  lib/Driver/ToolChains/Cuda.h
  lib/Headers/CMakeLists.txt
  lib/Headers/__clang_cuda_device_functions.h
  lib/Headers/__clang_cuda_libdevice_declares.h
  test/CodeGen/nvptx_device_math_functions.c
  test/Driver/openmp-offload-gpu.c

Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -76,9 +76,9 @@
 // RUN:  -no-canonical-prefixes -save-temps %t.o -fopenmp-use-target-bundling 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-CUBIN-UNBUNDLING-NVLINK %s
 
-/// Use DAG to ensure that cubin file has been unbundled.
+/// Use DAG to ensure that object file has not been unbundled.
 // CHK-CUBIN-UNBUNDLING-NVLINK-DAG: nvlink{{.*}}" {{.*}}"[[CUBIN:.*\.cubin]]"
-// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: clang-offload-bundler{{.*}}" "-type=o" {{.*}}"-outputs={{.*}}[[CUBIN]]
+// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: clang-offload-bundler{{.*}}" "-type=o" {{.*}}[[CUBIN]]
 // CHK-CUBIN-UNBUNDLING-NVLINK-DAG-SAME: "-unbundle"
 
 /// ###
Index: test/CodeGen/nvptx_device_math_functions.c
===
--- /dev/null
+++ test/CodeGen/nvptx_device_math_functions.c
@@ -0,0 +1,20 @@
+// Test calling of device math functions.
+///==///
+
+// RUN: %clang -fmath-errno -S -emit-llvm -o - %s -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda | FileCheck -check-prefix CHECK-YES %s
+
+void test_sqrt(double a1) {
+  #pragma omp target
+  {
+// CHECK-YES: call double @llvm.nvvm.sqrt.rn.d(double
+double l1 = sqrt(a1);
+  }
+}
+
+void test_pow(float a0, double a1, long double a2) {
+  #pragma omp target
+  {
+// CHECK-YES: call double @__internal_accurate_pow(double
+double l1 = pow(a1, a1);
+  }
+}
Index: lib/Headers/__clang_cuda_libdevice_declares.h
===
--- lib/Headers/__clang_cuda_libdevice_declares.h
+++ lib/Headers/__clang_cuda_libdevice_declares.h
@@ -24,443 +24,455 @@
 #ifndef __CLANG_CUDA_LIBDEVICE_DECLARES_H__
 #define __CLANG_CUDA_LIBDEVICE_DECLARES_H__
 
+#if defined(_OPENMP)
+#define __DEVICE__
+#elif defined(__CUDA__)
+#define __DEVICE__ __device__
+#endif
+
+#if defined(__cplusplus)
 extern "C" {
+#endif
 
-__device__ int __nv_abs(int __a);
-__device__ double __nv_acos(double __a);
-__device__ float __nv_acosf(float __a);
-__device__ double __nv_acosh(double __a);
-__device__ float __nv_acoshf(float __a);
-__device__ double __nv_asin(double __a);
-__device__ float __nv_asinf(float __a);
-__device__ double __nv_asinh(double __a);
-__device__ float __nv_asinhf(float __a);
-__device__ double __nv_atan2(double __a, double __b);
-__device__ float __nv_atan2f(float __a, float __b);
-__device__ double __nv_atan(double __a);
-__device__ float __nv_atanf(float __a);
-__device__ double __nv_atanh(double __a);
-__device__ float __nv_atanhf(float __a);
-__device__ int __nv_brev(int __a);
-__device__ long long __nv_brevll(long long __a);
-__device__ int __nv_byte_perm(int __a, int __b, int __c);
-__device__ double __nv_cbrt(double __a);
-__device__ float __nv_cbrtf(float __a);
-__device__ double __nv_ceil(double __a);
-__device__ float __nv_ceilf(float __a);
-__device__ int __nv_clz(int __a);
-__device__ int __nv_clzll(long long __a);
-__device__ double __nv_copysign(double __a, double __b);
-__device__ float __nv_copysignf(float __a, float __b);
-__device__ double __nv_cos(double __a);
-__device__ float __nv_cosf(float __a);
-__device__ double __nv_cosh(double __a);
-__device__ float __nv_coshf(float __a);
-__device__ double __nv_cospi(double __a);
-__device__ float __nv_cospif(float __a);
-__device__ double __nv_cyl_bessel_i0(double __a);
-__device__ float __nv_cyl_bessel_i0f(float __a);
-__device__ double __nv_cyl_bessel_i1(double __a);
-__device__ float __nv_cyl_bessel_i1f(float __a);
-__device__ double __nv_dadd_rd(double __a, double __b);
-__device__ double __nv_dadd_rn(double __a, double __b);
-__device__ double __nv_dadd_ru(double __a, double __b);
-__device__ double __nv_dadd_rz(double __a, double __b);
-__device__ double __nv_ddiv_rd(double __a, double __b);
-__device__ double __nv_ddiv_rn(double __a, double __b);
-__device__ double __nv_ddiv_ru(double __a, double __b);
-__device__ double __nv_ddiv_rz(double __a, double __b);
-__device__ double __nv_dmul_rd(double __a, double __b);
-__device__ double __nv_dmul_rn(dou

[PATCH] D50845: [CUDA/OpenMP] Define only some host macros during device compilation

2018-08-16 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.

In https://reviews.llvm.org/D50845#1202973, @ABataev wrote:

> >> If I understand it correctly, the root cause of this exercise is that we 
> >> want to compile for GPU using plain C. CUDA avoids this issue by 
> >> separating device and host code via target attributes and clang has few 
> >> special cases to ignore inline assembly errors in the host code if we're 
> >> compiling for device. For OpenMP there's no such separation, not in the 
> >> system headers, at least.
> > 
> > Yes, that's one of the nice properties of CUDA (for the compiler). There 
> > used to be the same restriction for OpenMP where all functions used in 
> > `target` regions needed to be put in `declare target`. However that was 
> > relaxed in favor of implicitly marking all **called** functions in that TU 
> > to be `declare target`.
> >  So ideally I think Clang should determine which functions are really 
> > `declare target` (either explicit or implicit) and only run semantical 
> > analysis on them. If a function is then found to be "broken" it's perfectly 
> > desirable to error back to the user.
>
> It is not possible for OpenMP because we support implicit declare target 
> functions. Clang cannot identify whether the function is going to be used on 
> the device or not during sema analysis.


Sounds like that is a recipe for just disabling sema analysis for all implicit 
declare target functions.


Repository:
  rC Clang

https://reviews.llvm.org/D50845



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D50845: [CUDA/OpenMP] Define only some host macros during device compilation

2018-08-16 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.

In https://reviews.llvm.org/D50845#1202991, @hfinkel wrote:

> In https://reviews.llvm.org/D50845#1202965, @Hahnfeld wrote:
>
> > In https://reviews.llvm.org/D50845#1202963, @hfinkel wrote:
> >
> > > As a result, we should really have a separate header that has those 
> > > actually-available functions. When targeting NVPTX, why don't we have the 
> > > included math.h be CUDA's math.h? In the end, those are the functions we 
> > > need to call when we generate code. Right?
> >
> >
> > That's what https://reviews.llvm.org/D47849 deals with.
>
>
> Yes, but it doesn't get CUDA's math.h. Maybe I misunderstand how this works 
> (and I very well might, because it's not clear that CUDA has a math.h by that 
> name), but that patch tries to avoid problems with the host's math.h and then 
> also injects __clang_cuda_device_functions.h into the device compilation. How 
> does this compare to when you include math.h in Clang's CUDA mode? It seems 
> to be that we want to somehow map standard includes, where applicable, to 
> include files in CUDA's include/crt directory (e.g., crt/math_functions.h and 
> crt/common_functions.h for stdio.h for printf), and nothing else ends up 
> being available (because it is, in fact, not available).


There's no CUDA specific math.h unless you want to regard 
clang_cuda_device_functions.h as a math header. The patch is using the same 
approach as CUDA and redirecting the function calls to device specific function 
calls. The parts of that patch which deal with host header compatibility would 
more naturally belong in a patch like this one so ultimately they won't be part 
of that patch. I'm currently working on improving the patch though by 
eliminating the clang_cuda_device_functions.h injection and elimintating the 
need to disable the built-ins.


Repository:
  rC Clang

https://reviews.llvm.org/D50845



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D47394: [OpenMP][Clang][NVPTX] Replace bundling with partial linking for the OpenMP NVPTX device offloading toolchain

2018-05-25 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea created this revision.
gtbercea added reviewers: Hahnfeld, hfinkel, caomhin, carlo.bertolli, tra.
Herald added subscribers: cfe-commits, guansong.

So far, the clang-offload-bundler has been the default tool for bundling 
together various files types produced by the different OpenMP offloading 
toolchains supported by Clang. It does a great job for file types such as .bc, 
.ll, .ii, .ast. It is also used for bundling object files. Object files are 
special, in this case object files which contain sections meant to be executed 
on devices other than the host (such is the case of the OpenMP NVPTX 
toolchain). The bundling of object files prevents:

- STATIC LINKING: These bundled object files can be part of static libraries 
which means that the object file requires an unbundling step. If an object file 
in a static library requires "unbundling" then we need to know the whereabouts 
of that library and of the files before the actual link step which makes it 
impossible to do static linking using the "-L/path/to/lib/folder -labc" flag.
- INTEROPERABILITY WITH OTHER COMPILERS: These bundled object files can end up 
being passed between Clang and other compilers which may lead to 
incompatibilities: passing a bundled file from Clang to another compiler would 
lead to that compiler not being able to unbundle it. Passing an unbundled 
object file to Clang and therefore Clang not knowing that it doesn't need to 
unbundle it.

**Goal:**
Disable the use of the clang-offload-bundler for bundling/unbundling object 
files which contain OpenMP NVPTX device offloaded code. This applies to the 
case where the following set of flags are passed to Clang:
-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda
When the above condition is not met the compiler works as it does today by 
invoking the clang-offload-bundler for bundling/unbundling object files (at the 
cost of static linking and interoperability).
The clang-offload-bundler usage on files other than object files is not 
affected by this patch.

**Extensibility**
Although this patch disables bundling/unbundling of object files via the 
clang-offload-bundler for the OpenMP NVPTX device offloading toolchain ONLY, 
this functionality can be extended to other platforms/system where:

- the device toolchain can produce a host-compatible object AND
- partial linking of host objects is supported.

**The solution:**
The solution enables the OpenMP NVPTX toolchain to produce an object file which 
is host-compatible (when compiling with -c). The host-compatible file is 
produced using several steps:
Step 1 (already exists): invoke PTXAS on the .s file to obtain a .cubin.
Step 2 (new step): invoke the FATBIN tool (this tool comes with every standard 
CUDA installation) which creates a CUDA fatbinary that contains both the PTX 
code (the .s file) and the .cubin file. This same tool can wrap the resulting 
.fatbin file in a C/C++ wrapper thus creating a .fatbin.c file.
Step 3 (new step): call clang++ on the .fatbin.c file to create a .o file which 
is host-compatible.

Once this device side host-compatible file is produced for the NVPTX toolchain 
then one further step is needed:
Step 4 (new step): invoke a linker supporting partial linking (currently using 
"ld -r") to link host-compatible object file against the original host file and 
end up with one single object file which I can now safely pass to another 
compiler or include in a static library (new step).

**Passing final object file to clang:**
This file doesn't require unbundling so call to "clang-offload-bundler 
--unbundle" is NOT required.
The compiler needs to be notified that the object file contains an "offloaded 
device part" by using: "-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda". This 
will invoke the OpenMP NVPTX toolchain and it will call only NVLINK on this 
file.

**Passing final object file to clang inside a static lib "libabc.a" passed to 
clang via: "-L/path/to/lib/folder -labc":**
Call clang with "-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda" to trigger 
NVPTX toolchain.
The -L path along with the -labc will be passed to NVLINK which will perform 
the "static linking".


Repository:
  rC Clang

https://reviews.llvm.org/D47394

Files:
  include/clang/Driver/Action.h
  include/clang/Driver/Compilation.h
  include/clang/Driver/Driver.h
  include/clang/Driver/ToolChain.h
  lib/Driver/Action.cpp
  lib/Driver/Compilation.cpp
  lib/Driver/Driver.cpp
  lib/Driver/ToolChain.cpp
  lib/Driver/ToolChains/Clang.cpp
  lib/Driver/ToolChains/Clang.h
  lib/Driver/ToolChains/Cuda.cpp
  test/Driver/openmp-offload-gpu.c
  test/Driver/openmp-offload.c

Index: test/Driver/openmp-offload.c
===
--- test/Driver/openmp-offload.c
+++ test/Driver/openmp-offload.c
@@ -480,13 +480,13 @@
 // Create host object and bundle.
 // CHK-BUJOBS: clang{{.*}}" "-cc1" "-triple" "powerpc64le--linux" "-emit-obj" {{.*}}"-fopenmp" {{.*}}"-o" "
 // CHK-BUJOBS-SAME: [[HOSTOBJ:[^\\/]+\

[PATCH] D47394: [OpenMP][Clang][NVPTX] Replace bundling with partial linking for the OpenMP NVPTX device offloading toolchain

2018-05-25 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 148677.

Repository:
  rC Clang

https://reviews.llvm.org/D47394

Files:
  include/clang/Driver/Action.h
  include/clang/Driver/Compilation.h
  include/clang/Driver/Driver.h
  include/clang/Driver/ToolChain.h
  lib/Driver/Action.cpp
  lib/Driver/Compilation.cpp
  lib/Driver/Driver.cpp
  lib/Driver/ToolChain.cpp
  lib/Driver/ToolChains/Clang.cpp
  lib/Driver/ToolChains/Clang.h
  lib/Driver/ToolChains/Cuda.cpp
  test/Driver/openmp-offload-gpu.c
  test/Driver/openmp-offload.c

Index: test/Driver/openmp-offload.c
===
--- test/Driver/openmp-offload.c
+++ test/Driver/openmp-offload.c
@@ -480,13 +480,13 @@
 // Create host object and bundle.
 // CHK-BUJOBS: clang{{.*}}" "-cc1" "-triple" "powerpc64le--linux" "-emit-obj" {{.*}}"-fopenmp" {{.*}}"-o" "
 // CHK-BUJOBS-SAME: [[HOSTOBJ:[^\\/]+\.o]]" "-x" "ir" "{{.*}}[[HOSTBC]]"
-// CHK-BUJOBS: clang-offload-bundler{{.*}}" "-type=o" "-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs=
+// CHK-BUJOBS: clang-offload-bundler{{.*}}" "-type=o"{{.*}}"-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs=
 // CHK-BUJOBS-SAME: [[RES:[^\\/]+\.o]]" "-inputs={{.*}}[[T1OBJ]],{{.*}}[[T2OBJ]],{{.*}}[[HOSTOBJ]]"
 // CHK-BUJOBS-ST: clang{{.*}}" "-cc1" "-triple" "powerpc64le--linux" "-S" {{.*}}"-fopenmp" {{.*}}"-o" "
 // CHK-BUJOBS-ST-SAME: [[HOSTASM:[^\\/]+\.s]]" "-x" "ir" "{{.*}}[[HOSTBC]]"
 // CHK-BUJOBS-ST: clang{{.*}}" "-cc1as" "-triple" "powerpc64le--linux" "-filetype" "obj" {{.*}}"-o" "
 // CHK-BUJOBS-ST-SAME: [[HOSTOBJ:[^\\/]+\.o]]" "{{.*}}[[HOSTASM]]"
-// CHK-BUJOBS-ST: clang-offload-bundler{{.*}}" "-type=o" "-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs=
+// CHK-BUJOBS-ST: clang-offload-bundler{{.*}}" "-type=o"{{.*}}"-targets=openmp-powerpc64le-ibm-linux-gnu,openmp-x86_64-pc-linux-gnu,host-powerpc64le--linux" "-outputs=
 // CHK-BUJOBS-ST-SAME: [[RES:[^\\/]+\.o]]" "-inputs={{.*}}[[T1OBJ]],{{.*}}[[T2OBJ]],{{.*}}[[HOSTOBJ]]"
 
 /// ###
Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -66,24 +66,29 @@
 
 // CHK-PTXAS-CUBIN-BUNDLING: clang{{.*}}" "-o" "[[PTX:.*\.s]]"
 // CHK-PTXAS-CUBIN-BUNDLING-NEXT: ptxas{{.*}}" "--output-file" "[[CUBIN:.*\.cubin]]" {{.*}}"[[PTX]]"
-// CHK-PTXAS-CUBIN-BUNDLING: clang-offload-bundler{{.*}}" "-type=o" {{.*}}"-inputs={{.*}}[[CUBIN]]
+// CHK-PTXAS-CUBIN-BUNDLING: fatbinary{{.*}}" "--create=[[FATBIN:.*\.fatbin]]" "
+// CHK-PTXAS-CUBIN-BUNDLING-SAME: --embedded-fatbin=[[FATBINC:.*\.fatbin.c]]" "
+// CHK-PTXAS-CUBIN-BUNDLING-SAME: --cmdline=--compile-only" "--image=profile={{.*}}[[PTX]]" "
+// CHK-PTXAS-CUBIN-BUNDLING-SAME: --image=profile={{.*}}file=[[CUBIN]]" "--cuda" "--device-c"
+// CHK-PTXAS-CUBIN-BUNDLING: clang++{{.*}}" "-c" "-o" "[[HOSTDEV:.*\.o]]"{{.*}}" "[[FATBINC]]" "-D__NV_MODULE_ID=
+// CHK-PTXAS-CUBIN-BUNDLING-NOT: clang-offload-bundler{{.*}}" "-type=o" {{.*}}"-inputs={{.*}}[[CUBIN]]
+// CHK-PTXAS-CUBIN-BUNDLING: ld" "-r" "[[HOSTDEV]]" "{{.*}}.o" "-o" "{{.*}}.o"
 
 /// ###
 
-/// Check cubin file unbundling and usage by nvlink
+/// Check object file unbundling is not happening when skipping bundler
 // RUN:   touch %t.o
 // RUN:   %clang -### -target powerpc64le-unknown-linux-gnu -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
 // RUN:  -no-canonical-prefixes -save-temps %t.o 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-CUBIN-UNBUNDLING-NVLINK %s
 
-/// Use DAG to ensure that cubin file has been unbundled.
-// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: nvlink{{.*}}" {{.*}}"[[CUBIN:.*\.cubin]]"
-// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: clang-offload-bundler{{.*}}" "-type=o" {{.*}}"-outputs={{.*}}[[CUBIN]]
-// CHK-CUBIN-UNBUNDLING-NVLINK-DAG-SAME: "-unbundle"
+/// Use DAG to ensure that object file has not been unbundled.
+// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: nvlink{{.*}}" {{.*}}"[[OBJ:.*\.o]]"
+// CHK-CUBIN-UNBUNDLING-NVLINK-DAG: ld{{.*}}" {{.*}}"[[OBJ]]"
 
 /// ###
 
-/// Check cubin file generation and usage by nvlink
+/// Check object file generation is not happening when skipping bundler
 // RUN:   touch %t1.o
 // RUN:   touch %t2.o
 // RUN:   %clang -### -no-canonical-prefixes -target powerpc64le-unknown-linux-gnu -fopenmp=libomp \
@@ -94,7 +99,7 @@
 // RUN:  -fopenmp-targets=nvptx64-nvidia-cuda %t1.o %t2.o 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-TWOCUBIN %s
 
-// CHK-TWOCUBIN: nvlink{{.*}}openmp-offload-{{.*}}.cubin" "{{.*}}openmp-offload-{{.*}}.cubin"
+// CHK-TWOCUBIN: nvlink{{.*}}openmp-offload-{{.*}}.o" "{{.*}}o

[PATCH] D47394: [OpenMP][Clang][NVPTX] Replace bundling with partial linking for the OpenMP NVPTX device offloading toolchain

2018-05-29 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added inline comments.



Comment at: test/Driver/openmp-offload.c:497
 // RUN:   %clang -###  -fopenmp=libomp -o %t.out -lsomelib -target 
powerpc64le-linux 
-fopenmp-targets=powerpc64le-ibm-linux-gnu,x86_64-pc-linux-gnu %t.i 
-no-canonical-prefixes 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-UBJOBS %s
 // RUN:   %clang -### -fopenmp=libomp -o %t.out -lsomelib -target 
powerpc64le-linux 
-fopenmp-targets=powerpc64le-ibm-linux-gnu,x86_64-pc-linux-gnu %t.i -save-temps 
-no-canonical-prefixes 2>&1 \

gtbercea wrote:
> sfantao wrote:
> > We need a test for the static linking. The host linker has to be nvcc in 
> > that case, right?
> The host linker is "ld". The "bundling" step is replaced (in the case of 
> OpenMP NVPTX device offloading only) by a call to "ld -r" to partially link 
> the 2 object files: the object file produced by the HOST toolchain and the 
> object file produced by the OpenMP NVPTX device offloading toolchain (because 
> we want to produce a single output).
nvcc is not called at all in this patch.


Repository:
  rC Clang

https://reviews.llvm.org/D47394



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D47394: [OpenMP][Clang][NVPTX] Replace bundling with partial linking for the OpenMP NVPTX device offloading toolchain

2018-05-29 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added inline comments.



Comment at: test/Driver/openmp-offload.c:497
 // RUN:   %clang -###  -fopenmp=libomp -o %t.out -lsomelib -target 
powerpc64le-linux 
-fopenmp-targets=powerpc64le-ibm-linux-gnu,x86_64-pc-linux-gnu %t.i 
-no-canonical-prefixes 2>&1 \
 // RUN:   | FileCheck -check-prefix=CHK-UBJOBS %s
 // RUN:   %clang -### -fopenmp=libomp -o %t.out -lsomelib -target 
powerpc64le-linux 
-fopenmp-targets=powerpc64le-ibm-linux-gnu,x86_64-pc-linux-gnu %t.i -save-temps 
-no-canonical-prefixes 2>&1 \

sfantao wrote:
> We need a test for the static linking. The host linker has to be nvcc in that 
> case, right?
The host linker is "ld". The "bundling" step is replaced (in the case of OpenMP 
NVPTX device offloading only) by a call to "ld -r" to partially link the 2 
object files: the object file produced by the HOST toolchain and the object 
file produced by the OpenMP NVPTX device offloading toolchain (because we want 
to produce a single output).


Repository:
  rC Clang

https://reviews.llvm.org/D47394



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D47394: [OpenMP][Clang][NVPTX] Replace bundling with partial linking for the OpenMP NVPTX device offloading toolchain

2018-05-29 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added inline comments.



Comment at: lib/Driver/ToolChains/Cuda.cpp:536
+  }
 }
 

sfantao wrote:
> What prevents all this from being done in the bundler? If I understand it 
> correctly, if the bundler implements this wrapping all the checks for 
> librariers wouldn't be required and, only two changes would be required in 
> the driver:
> 
> - generate fatbin instead of cubin. This is straightforward to do by changing 
> the device assembling job. In terms of the loading of the kernels by the 
> device API, doing it through fatbin or cubin should be equivalent except that 
> fatbin enables storing the PTX format and JIT for newer GPUs.
> - Use NVIDIA linker as host linker.
> 
> This last requirement could be problematic if we get two targets attempting  
> to use different (incompatible linkers). If we get this kind of 
> incompatibility we should get the appropriate diagnostic.
What prevents it is the fact that the bundler is called AFTER the HOST and 
DEVICE object files have been produced. The creation of the fatbin (FATBINARY + 
CALNG++) needs to happen within the NVPTX toolchain.



Repository:
  rC Clang

https://reviews.llvm.org/D47394



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D47394: [OpenMP][Clang][NVPTX] Replace bundling with partial linking for the OpenMP NVPTX device offloading toolchain

2018-05-29 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.

In https://reviews.llvm.org/D47394#1114848, @sfantao wrote:

> Just to clarify one thing in my last comment:
>
> When I say that we didn't aim at having clang compatible with other 
> compilers, I mean the OpenMP offloading descriptors, where all the variables 
> and offloading entry points are. Of course we want to allow the resulting 
> binaries to be compatible with linkers taking inputs of other compilers, so 
> that you can have, e.g., OpenMP and CUDA supported in the same executable, 
> even though working independently.


Today you will have trouble linking against a Clang object file in another 
compiler that doesn't know anything about the clang-offload-bundler.


Repository:
  rC Clang

https://reviews.llvm.org/D47394



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D37913: [OpenMP] Enable the existing nocudalib flag for OpenMP offloading toolchain.

2017-09-25 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 116621.
gtbercea added a comment.

Split line.


https://reviews.llvm.org/D37913

Files:
  lib/Driver/ToolChains/Cuda.cpp
  test/Driver/openmp-offload-gpu.c


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -125,3 +125,13 @@
 // RUN:   | FileCheck -check-prefix=CHK-PTXAS-RELO %s
 
 // CHK-PTXAS-RELO: ptxas{{.*}}" "-c"
+
+/// ###
+
+/// Check that error is not thrown by toolchain when no cuda lib flag is used.
+/// Check that the flag is passed when -fopenmp-relocatable-target is used.
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda 
-Xopenmp-target -march=sm_60 \
+// RUN:   -nocudalib -fopenmp-relocatable-target -save-temps 
-no-canonical-prefixes %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHK-FLAG-NOLIBDEVICE %s
+
+// CHK-FLAG-NOLIBDEVICE-NOT: error:{{.*}}sm_60
Index: lib/Driver/ToolChains/Cuda.cpp
===
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -492,11 +492,11 @@
 if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
options::OPT_fno_cuda_approx_transcendentals, 
false))
   CC1Args.push_back("-fcuda-approx-transcendentals");
-
-if (DriverArgs.hasArg(options::OPT_nocudalib))
-  return;
   }
 
+  if (DriverArgs.hasArg(options::OPT_nocudalib))
+return;
+
   std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch);
 
   if (LibDeviceFile.empty()) {


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -125,3 +125,13 @@
 // RUN:   | FileCheck -check-prefix=CHK-PTXAS-RELO %s
 
 // CHK-PTXAS-RELO: ptxas{{.*}}" "-c"
+
+/// ###
+
+/// Check that error is not thrown by toolchain when no cuda lib flag is used.
+/// Check that the flag is passed when -fopenmp-relocatable-target is used.
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda -Xopenmp-target -march=sm_60 \
+// RUN:   -nocudalib -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHK-FLAG-NOLIBDEVICE %s
+
+// CHK-FLAG-NOLIBDEVICE-NOT: error:{{.*}}sm_60
Index: lib/Driver/ToolChains/Cuda.cpp
===
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -492,11 +492,11 @@
 if (DriverArgs.hasFlag(options::OPT_fcuda_approx_transcendentals,
options::OPT_fno_cuda_approx_transcendentals, false))
   CC1Args.push_back("-fcuda-approx-transcendentals");
-
-if (DriverArgs.hasArg(options::OPT_nocudalib))
-  return;
   }
 
+  if (DriverArgs.hasArg(options::OPT_nocudalib))
+return;
+
   std::string LibDeviceFile = CudaInstallation.getLibDeviceFile(GpuArch);
 
   if (LibDeviceFile.empty()) {
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D37914: [OpenMP] Don't throw cudalib not found error if only front-end is required.

2017-09-25 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea reopened this revision.
gtbercea added a comment.
This revision is now accepted and ready to land.

Open.


Repository:
  rL LLVM

https://reviews.llvm.org/D37914



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38040: [OpenMP] Add an additional test for D34888

2017-09-26 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 116664.
gtbercea added a comment.

Fix test.


https://reviews.llvm.org/D38040

Files:
  test/OpenMP/target_map_codegen.cpp


Index: test/OpenMP/target_map_codegen.cpp
===
--- test/OpenMP/target_map_codegen.cpp
+++ test/OpenMP/target_map_codegen.cpp
@@ -4554,3 +4554,33 @@
 }
 #endif
 #endif
+
+///==///
+// RUN: %clang -DCK30 -std=c++11 -fopenmp -S -emit-llvm 
-fopenmp-targets=nvptx64-nvidia-cuda %s -o - 2>&1 \
+// RUN: | FileCheck -check-prefix=CK30 %s
+
+#ifdef CK30
+
+void target_maps_parallel_integer(int a){
+  int ParamToKernel = a;
+#pragma omp target map(tofrom: ParamToKernel)
+  {
+ParamToKernel += 1;
+  }
+}
+
+// CK30: {{.*}}void @__omp_offloading_{{.*}}(i32* dereferenceable(4) 
%ParamToKernel)
+
+// CK30: {{.*}}void {{.*}}target_maps_parallel_integer{{.*}} {
+
+// CK30: [[GEPOBP:%.+]] = getelementptr inbounds {{.*}}offload_baseptrs
+// CK30: [[GEPOBPBIT:%.+]] = bitcast i8** [[GEPOBP]]
+// CK30: store i32* %ParamToKernel, i32** [[GEPOBPBIT]]
+// CK30: [[GEPOP:%.+]] = getelementptr inbounds {{.*}}offload_ptrs
+// CK30: [[GEPOPBIT:%.+]] = bitcast i8** [[GEPOP]]
+// CK30: store i32* %ParamToKernel, i32** [[GEPOPBIT]]
+// CK30: [[GEPOBPARG:%.+]] = getelementptr inbounds {{.*}}offload_baseptrs
+// CK30: [[GEPOPARG:%.+]] = getelementptr inbounds {{.*}}offload_ptrs
+// CK30: call {{.*}}tgt_target({{.*}}i8** [[GEPOBPARG]], i8** [[GEPOPARG]]
+
+#endif


Index: test/OpenMP/target_map_codegen.cpp
===
--- test/OpenMP/target_map_codegen.cpp
+++ test/OpenMP/target_map_codegen.cpp
@@ -4554,3 +4554,33 @@
 }
 #endif
 #endif
+
+///==///
+// RUN: %clang -DCK30 -std=c++11 -fopenmp -S -emit-llvm -fopenmp-targets=nvptx64-nvidia-cuda %s -o - 2>&1 \
+// RUN: | FileCheck -check-prefix=CK30 %s
+
+#ifdef CK30
+
+void target_maps_parallel_integer(int a){
+  int ParamToKernel = a;
+#pragma omp target map(tofrom: ParamToKernel)
+  {
+ParamToKernel += 1;
+  }
+}
+
+// CK30: {{.*}}void @__omp_offloading_{{.*}}(i32* dereferenceable(4) %ParamToKernel)
+
+// CK30: {{.*}}void {{.*}}target_maps_parallel_integer{{.*}} {
+
+// CK30: [[GEPOBP:%.+]] = getelementptr inbounds {{.*}}offload_baseptrs
+// CK30: [[GEPOBPBIT:%.+]] = bitcast i8** [[GEPOBP]]
+// CK30: store i32* %ParamToKernel, i32** [[GEPOBPBIT]]
+// CK30: [[GEPOP:%.+]] = getelementptr inbounds {{.*}}offload_ptrs
+// CK30: [[GEPOPBIT:%.+]] = bitcast i8** [[GEPOP]]
+// CK30: store i32* %ParamToKernel, i32** [[GEPOPBIT]]
+// CK30: [[GEPOBPARG:%.+]] = getelementptr inbounds {{.*}}offload_baseptrs
+// CK30: [[GEPOPARG:%.+]] = getelementptr inbounds {{.*}}offload_ptrs
+// CK30: call {{.*}}tgt_target({{.*}}i8** [[GEPOBPARG]], i8** [[GEPOPARG]]
+
+#endif
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38040: [OpenMP] Add an additional test for D34888

2017-09-26 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 116671.
gtbercea added a comment.

Add nocudalib flag.


https://reviews.llvm.org/D38040

Files:
  test/OpenMP/target_map_codegen.cpp


Index: test/OpenMP/target_map_codegen.cpp
===
--- test/OpenMP/target_map_codegen.cpp
+++ test/OpenMP/target_map_codegen.cpp
@@ -4554,3 +4554,33 @@
 }
 #endif
 #endif
+
+///==///
+// RUN: %clang -DCK30 -std=c++11 -fopenmp -S -emit-llvm 
-fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN: -nocudalib %s -o - 2>&1 | FileCheck -check-prefix=CK30 %s
+
+#ifdef CK30
+
+void target_maps_parallel_integer(int a){
+  int ParamToKernel = a;
+#pragma omp target map(tofrom: ParamToKernel)
+  {
+ParamToKernel += 1;
+  }
+}
+
+// CK30: {{.*}}void @__omp_offloading_{{.*}}(i32* dereferenceable(4) 
%ParamToKernel)
+
+// CK30: {{.*}}void {{.*}}target_maps_parallel_integer{{.*}} {
+
+// CK30: [[GEPOBP:%.+]] = getelementptr inbounds {{.*}}offload_baseptrs
+// CK30: [[GEPOBPBIT:%.+]] = bitcast i8** [[GEPOBP]]
+// CK30: store i32* %ParamToKernel, i32** [[GEPOBPBIT]]
+// CK30: [[GEPOP:%.+]] = getelementptr inbounds {{.*}}offload_ptrs
+// CK30: [[GEPOPBIT:%.+]] = bitcast i8** [[GEPOP]]
+// CK30: store i32* %ParamToKernel, i32** [[GEPOPBIT]]
+// CK30: [[GEPOBPARG:%.+]] = getelementptr inbounds {{.*}}offload_baseptrs
+// CK30: [[GEPOPARG:%.+]] = getelementptr inbounds {{.*}}offload_ptrs
+// CK30: call {{.*}}tgt_target({{.*}}i8** [[GEPOBPARG]], i8** [[GEPOPARG]]
+
+#endif


Index: test/OpenMP/target_map_codegen.cpp
===
--- test/OpenMP/target_map_codegen.cpp
+++ test/OpenMP/target_map_codegen.cpp
@@ -4554,3 +4554,33 @@
 }
 #endif
 #endif
+
+///==///
+// RUN: %clang -DCK30 -std=c++11 -fopenmp -S -emit-llvm -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN: -nocudalib %s -o - 2>&1 | FileCheck -check-prefix=CK30 %s
+
+#ifdef CK30
+
+void target_maps_parallel_integer(int a){
+  int ParamToKernel = a;
+#pragma omp target map(tofrom: ParamToKernel)
+  {
+ParamToKernel += 1;
+  }
+}
+
+// CK30: {{.*}}void @__omp_offloading_{{.*}}(i32* dereferenceable(4) %ParamToKernel)
+
+// CK30: {{.*}}void {{.*}}target_maps_parallel_integer{{.*}} {
+
+// CK30: [[GEPOBP:%.+]] = getelementptr inbounds {{.*}}offload_baseptrs
+// CK30: [[GEPOBPBIT:%.+]] = bitcast i8** [[GEPOBP]]
+// CK30: store i32* %ParamToKernel, i32** [[GEPOBPBIT]]
+// CK30: [[GEPOP:%.+]] = getelementptr inbounds {{.*}}offload_ptrs
+// CK30: [[GEPOPBIT:%.+]] = bitcast i8** [[GEPOP]]
+// CK30: store i32* %ParamToKernel, i32** [[GEPOPBIT]]
+// CK30: [[GEPOBPARG:%.+]] = getelementptr inbounds {{.*}}offload_baseptrs
+// CK30: [[GEPOPARG:%.+]] = getelementptr inbounds {{.*}}offload_ptrs
+// CK30: call {{.*}}tgt_target({{.*}}i8** [[GEPOBPARG]], i8** [[GEPOPARG]]
+
+#endif
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38040: [OpenMP] Add an additional test for D34888

2017-09-26 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea reopened this revision.
gtbercea added a comment.
This revision is now accepted and ready to land.

Open


https://reviews.llvm.org/D38040



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D29660: [OpenMP] Add flag for overwriting default PTX version for OpenMP targets

2017-09-26 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea reopened this revision.
gtbercea added a comment.
This revision is now accepted and ready to land.

Open


Repository:
  rL LLVM

https://reviews.llvm.org/D29660



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38040: [OpenMP] Add an additional test for D34888

2017-09-26 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea reopened this revision.
gtbercea added a comment.
This revision is now accepted and ready to land.

Open


https://reviews.llvm.org/D38040



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38040: [OpenMP] Add an additional test for D34888

2017-09-26 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 116747.
gtbercea added a comment.

Fix test.


https://reviews.llvm.org/D38040

Files:
  test/OpenMP/openmp_offload_codegen.cpp


Index: test/OpenMP/openmp_offload_codegen.cpp
===
--- /dev/null
+++ test/OpenMP/openmp_offload_codegen.cpp
@@ -0,0 +1,36 @@
+// Test device for mapping codegen.
+///==///
+
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple 
powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -S -emit-llvm 
%s -o - 2>&1 \
+// RUN:   | FileCheck -check-prefix=CK1 %s
+
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple 
powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc 
%s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple 
powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s 
-fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck 
%s --check-prefix CK1-DEVICE
+
+// expected-no-diagnostics
+
+#ifdef CK1
+
+void target_maps_parallel_integer(int a){
+  int ParamToKernel = a;
+#pragma omp target map(tofrom: ParamToKernel)
+  {
+ParamToKernel += 1;
+  }
+}
+
+// CK1-DEVICE: {{.*}}void @__omp_offloading_{{.*}}(i32* 
dereferenceable(4){{.*}}
+
+// CK1: {{.*}}void {{.*}}target_maps_parallel_integer{{.*}} {
+
+// CK1: [[GEPOBP:%.+]] = getelementptr inbounds {{.*}}
+// CK1: [[GEPOBPBIT:%.+]] = bitcast i8** [[GEPOBP]]
+// CK1: store i32* %ParamToKernel, i32** [[GEPOBPBIT]]
+// CK1: [[GEPOP:%.+]] = getelementptr inbounds {{.*}}
+// CK1: [[GEPOPBIT:%.+]] = bitcast i8** [[GEPOP]]
+// CK1: store i32* %ParamToKernel, i32** [[GEPOPBIT]]
+// CK1: [[GEPOBPARG:%.+]] = getelementptr inbounds {{.*}}
+// CK1: [[GEPOPARG:%.+]] = getelementptr inbounds {{.*}}
+// CK1: call {{.*}}tgt_target({{.*}}i8** [[GEPOBPARG]], i8** [[GEPOPARG]]
+
+#endif
\ No newline at end of file


Index: test/OpenMP/openmp_offload_codegen.cpp
===
--- /dev/null
+++ test/OpenMP/openmp_offload_codegen.cpp
@@ -0,0 +1,36 @@
+// Test device for mapping codegen.
+///==///
+
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -S -emit-llvm %s -o - 2>&1 \
+// RUN:   | FileCheck -check-prefix=CK1 %s
+
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1-DEVICE
+
+// expected-no-diagnostics
+
+#ifdef CK1
+
+void target_maps_parallel_integer(int a){
+  int ParamToKernel = a;
+#pragma omp target map(tofrom: ParamToKernel)
+  {
+ParamToKernel += 1;
+  }
+}
+
+// CK1-DEVICE: {{.*}}void @__omp_offloading_{{.*}}(i32* dereferenceable(4){{.*}}
+
+// CK1: {{.*}}void {{.*}}target_maps_parallel_integer{{.*}} {
+
+// CK1: [[GEPOBP:%.+]] = getelementptr inbounds {{.*}}
+// CK1: [[GEPOBPBIT:%.+]] = bitcast i8** [[GEPOBP]]
+// CK1: store i32* %ParamToKernel, i32** [[GEPOBPBIT]]
+// CK1: [[GEPOP:%.+]] = getelementptr inbounds {{.*}}
+// CK1: [[GEPOPBIT:%.+]] = bitcast i8** [[GEPOP]]
+// CK1: store i32* %ParamToKernel, i32** [[GEPOPBIT]]
+// CK1: [[GEPOBPARG:%.+]] = getelementptr inbounds {{.*}}
+// CK1: [[GEPOPARG:%.+]] = getelementptr inbounds {{.*}}
+// CK1: call {{.*}}tgt_target({{.*}}i8** [[GEPOBPARG]], i8** [[GEPOPARG]]
+
+#endif
\ No newline at end of file
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38257: [OpenMP] Fix memory leak when translating arguments

2017-09-27 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea accepted this revision.
gtbercea added a comment.
This revision is now accepted and ready to land.

LGTM


https://reviews.llvm.org/D38257



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38258: [OpenMP] Fix passing of -m arguments to device toolchain

2017-09-27 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added inline comments.



Comment at: test/Driver/openmp-offload.c:89
+/// ###
+
 /// Check the phases graph when using a single target, different from the host.

Shouldn't these tests be in the gpu test file?


https://reviews.llvm.org/D38258



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38259: [OpenMP] Fix translation of target args

2017-09-27 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea accepted this revision.
gtbercea added a comment.
This revision is now accepted and ready to land.

LGTM


https://reviews.llvm.org/D38259



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38258: [OpenMP] Fix passing of -m arguments to device toolchain

2017-09-27 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea accepted this revision.
gtbercea added a comment.
This revision is now accepted and ready to land.

LGTM




Comment at: test/Driver/openmp-offload.c:89
+/// ###
+
 /// Check the phases graph when using a single target, different from the host.

Hahnfeld wrote:
> gtbercea wrote:
> > Shouldn't these tests be in the gpu test file?
> There is nothing specific to GPUs here IMO, that is why I moved the test back 
> to this file
Actually no, you're right! :)


https://reviews.llvm.org/D38258



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38883: [CMake][OpenMP] Customize default offloading arch

2017-10-13 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added inline comments.



Comment at: lib/Driver/ToolChains/Cuda.cpp:170-182
-// This code prevents IsValid from being set when
-// no libdevice has been found.
-bool allEmpty = true;
-std::string LibDeviceFile;
-for (auto key : LibDeviceMap.keys()) {
-  LibDeviceFile = LibDeviceMap.lookup(key);
-  if (!LibDeviceFile.empty())

Hahnfeld wrote:
> tra wrote:
> > Hahnfeld wrote:
> > > tra wrote:
> > > > I'd keep this code. It appears to serve useful purpose as it requires 
> > > > CUDA installation to have at least some libdevice library in it.  It 
> > > > gives us a change to find a valid installation, instead of ailing some 
> > > > time later when we ask for a libdevice file and fail because there are 
> > > > none.
> > > We had some internal discussions about this after I submitted the patch 
> > > here.
> > > 
> > > The main question is: Do we want to support CUDA installations without 
> > > libdevice and are there use cases for that? I'd say that the user should 
> > > be able to use a toolchain without libdevice together with `-nocudalib`.
> > Sounds reasonable. How about keeping the code but putting it under 
> > `if(!hasArg(nocudalib))`?
> > 
> Ok, I'll do that in a separate patch and keep the code here for now.
The problem with nocudalib is that if for example you write a test, which looks 
to verify some device facing feature that requires a libdevice to be found (so 
you don't want to use nocudalib), it will probably work on your machine which 
has the correct CUDA setup but fail on another machine which does not (which is 
where you want to use nocudalib). You can see the contradiction there.


https://reviews.llvm.org/D38883



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38883: [CMake][OpenMP] Customize default offloading arch

2017-10-13 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added inline comments.



Comment at: lib/Driver/ToolChains/Cuda.cpp:170-182
-// This code prevents IsValid from being set when
-// no libdevice has been found.
-bool allEmpty = true;
-std::string LibDeviceFile;
-for (auto key : LibDeviceMap.keys()) {
-  LibDeviceFile = LibDeviceMap.lookup(key);
-  if (!LibDeviceFile.empty())

gtbercea wrote:
> Hahnfeld wrote:
> > tra wrote:
> > > Hahnfeld wrote:
> > > > tra wrote:
> > > > > I'd keep this code. It appears to serve useful purpose as it requires 
> > > > > CUDA installation to have at least some libdevice library in it.  It 
> > > > > gives us a change to find a valid installation, instead of ailing 
> > > > > some time later when we ask for a libdevice file and fail because 
> > > > > there are none.
> > > > We had some internal discussions about this after I submitted the patch 
> > > > here.
> > > > 
> > > > The main question is: Do we want to support CUDA installations without 
> > > > libdevice and are there use cases for that? I'd say that the user 
> > > > should be able to use a toolchain without libdevice together with 
> > > > `-nocudalib`.
> > > Sounds reasonable. How about keeping the code but putting it under 
> > > `if(!hasArg(nocudalib))`?
> > > 
> > Ok, I'll do that in a separate patch and keep the code here for now.
> The problem with nocudalib is that if for example you write a test, which 
> looks to verify some device facing feature that requires a libdevice to be 
> found (so you don't want to use nocudalib), it will probably work on your 
> machine which has the correct CUDA setup but fail on another machine which 
> does not (which is where you want to use nocudalib). You can see the 
> contradiction there.
Just to be clear I am arguing for keeping this code :)


https://reviews.llvm.org/D38883



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38883: [CMake][OpenMP] Customize default offloading arch

2017-10-13 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added inline comments.



Comment at: lib/Driver/ToolChains/Cuda.h:90
-  }
 };
 

I would also like to keep the spirit of this code if not in this exact form at 
least something that performs the same functionality.


https://reviews.llvm.org/D38883



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38883: [CMake][OpenMP] Customize default offloading arch

2017-10-13 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added inline comments.



Comment at: lib/Driver/ToolChains/Cuda.cpp:170-182
-// This code prevents IsValid from being set when
-// no libdevice has been found.
-bool allEmpty = true;
-std::string LibDeviceFile;
-for (auto key : LibDeviceMap.keys()) {
-  LibDeviceFile = LibDeviceMap.lookup(key);
-  if (!LibDeviceFile.empty())

tra wrote:
> tra wrote:
> > gtbercea wrote:
> > > gtbercea wrote:
> > > > Hahnfeld wrote:
> > > > > tra wrote:
> > > > > > Hahnfeld wrote:
> > > > > > > tra wrote:
> > > > > > > > I'd keep this code. It appears to serve useful purpose as it 
> > > > > > > > requires CUDA installation to have at least some libdevice 
> > > > > > > > library in it.  It gives us a change to find a valid 
> > > > > > > > installation, instead of ailing some time later when we ask for 
> > > > > > > > a libdevice file and fail because there are none.
> > > > > > > We had some internal discussions about this after I submitted the 
> > > > > > > patch here.
> > > > > > > 
> > > > > > > The main question is: Do we want to support CUDA installations 
> > > > > > > without libdevice and are there use cases for that? I'd say that 
> > > > > > > the user should be able to use a toolchain without libdevice 
> > > > > > > together with `-nocudalib`.
> > > > > > Sounds reasonable. How about keeping the code but putting it under 
> > > > > > `if(!hasArg(nocudalib))`?
> > > > > > 
> > > > > Ok, I'll do that in a separate patch and keep the code here for now.
> > > > The problem with nocudalib is that if for example you write a test, 
> > > > which looks to verify some device facing feature that requires a 
> > > > libdevice to be found (so you don't want to use nocudalib), it will 
> > > > probably work on your machine which has the correct CUDA setup but fail 
> > > > on another machine which does not (which is where you want to use 
> > > > nocudalib). You can see the contradiction there.
> > > Just to be clear I am arguing for keeping this code :)
> > @gtbercea: I'm not sure I follow your example. If you're talking about 
> > clang tests, we do have fake CUDA installation setup under 
> > test/Driver/Inputs which removes dependency on whatever CUDA you may or may 
> > not have installed on your machine. I also don't see a contradiction -- you 
> > you do need libdevice, it makes no point picking a broken CUDA installation 
> > which does not have any libdevice files. If you explicitly tell compiler 
> > that you don't need libdevice, that would make CUDA w/o libdevice 
> > acceptable. With --cuda-path you do have a way to tell clang which 
> > installation you want it to use. What do I miss?
> > 
> > 
> Ah, you were arguing with Hahnfeld@'s -nocudalib example. Then I guess we're 
> in violent agreement.
I fully agree with this: "you do need libdevice, it makes no point picking a 
broken CUDA installation which does not have any libdevice files. If you 
explicitly tell compiler that you don't need libdevice, that would make CUDA 
w/o libdevice acceptable."

I was trying to show an example of a situation where you have your code 
compiled using nocudalib on one machine and then the same code will error on a 
machine which requires the nocudalib flag to be passed to make up for the 
absence of libdevice.




https://reviews.llvm.org/D38883



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38883: [CMake][OpenMP] Customize default offloading arch

2017-10-13 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added inline comments.



Comment at: lib/Driver/ToolChains/Cuda.cpp:170-182
-// This code prevents IsValid from being set when
-// no libdevice has been found.
-bool allEmpty = true;
-std::string LibDeviceFile;
-for (auto key : LibDeviceMap.keys()) {
-  LibDeviceFile = LibDeviceMap.lookup(key);
-  if (!LibDeviceFile.empty())

gtbercea wrote:
> tra wrote:
> > tra wrote:
> > > gtbercea wrote:
> > > > gtbercea wrote:
> > > > > Hahnfeld wrote:
> > > > > > tra wrote:
> > > > > > > Hahnfeld wrote:
> > > > > > > > tra wrote:
> > > > > > > > > I'd keep this code. It appears to serve useful purpose as it 
> > > > > > > > > requires CUDA installation to have at least some libdevice 
> > > > > > > > > library in it.  It gives us a change to find a valid 
> > > > > > > > > installation, instead of ailing some time later when we ask 
> > > > > > > > > for a libdevice file and fail because there are none.
> > > > > > > > We had some internal discussions about this after I submitted 
> > > > > > > > the patch here.
> > > > > > > > 
> > > > > > > > The main question is: Do we want to support CUDA installations 
> > > > > > > > without libdevice and are there use cases for that? I'd say 
> > > > > > > > that the user should be able to use a toolchain without 
> > > > > > > > libdevice together with `-nocudalib`.
> > > > > > > Sounds reasonable. How about keeping the code but putting it 
> > > > > > > under `if(!hasArg(nocudalib))`?
> > > > > > > 
> > > > > > Ok, I'll do that in a separate patch and keep the code here for now.
> > > > > The problem with nocudalib is that if for example you write a test, 
> > > > > which looks to verify some device facing feature that requires a 
> > > > > libdevice to be found (so you don't want to use nocudalib), it will 
> > > > > probably work on your machine which has the correct CUDA setup but 
> > > > > fail on another machine which does not (which is where you want to 
> > > > > use nocudalib). You can see the contradiction there.
> > > > Just to be clear I am arguing for keeping this code :)
> > > @gtbercea: I'm not sure I follow your example. If you're talking about 
> > > clang tests, we do have fake CUDA installation setup under 
> > > test/Driver/Inputs which removes dependency on whatever CUDA you may or 
> > > may not have installed on your machine. I also don't see a contradiction 
> > > -- you you do need libdevice, it makes no point picking a broken CUDA 
> > > installation which does not have any libdevice files. If you explicitly 
> > > tell compiler that you don't need libdevice, that would make CUDA w/o 
> > > libdevice acceptable. With --cuda-path you do have a way to tell clang 
> > > which installation you want it to use. What do I miss?
> > > 
> > > 
> > Ah, you were arguing with Hahnfeld@'s -nocudalib example. Then I guess 
> > we're in violent agreement.
> I fully agree with this: "you do need libdevice, it makes no point picking a 
> broken CUDA installation which does not have any libdevice files. If you 
> explicitly tell compiler that you don't need libdevice, that would make CUDA 
> w/o libdevice acceptable."
> 
> I was trying to show an example of a situation where you have your code 
> compiled using nocudalib on one machine and then the same code will error on 
> a machine which requires the nocudalib flag to be passed to make up for the 
> absence of libdevice.
> 
> 
Yes it was a counter argument to that! :) 


https://reviews.llvm.org/D38883



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38883: [CMake][OpenMP] Customize default offloading arch

2017-10-13 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added inline comments.



Comment at: lib/Driver/ToolChains/Cuda.h:90
-  }
 };
 

gtbercea wrote:
> I would also like to keep the spirit of this code if not in this exact form 
> at least something that performs the same functionality.
@tra what's your opinion on this code? Should this stay, stay but modified to 
be more robust or taken out completely?


https://reviews.llvm.org/D38883



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38883: [CMake][OpenMP] Customize default offloading arch

2017-10-13 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added inline comments.



Comment at: lib/Driver/ToolChains/Cuda.h:90
-  }
 };
 

tra wrote:
> gtbercea wrote:
> > gtbercea wrote:
> > > I would also like to keep the spirit of this code if not in this exact 
> > > form at least something that performs the same functionality.
> > @tra what's your opinion on this code? Should this stay, stay but modified 
> > to be more robust or taken out completely?
> There are currently no users for this. In general, I would rather not have 
> magically-changing default GPU based on how broken your CUDA installation is. 
> IMO it would be better to keep defaults static and fail if prerequisites are 
> not met.
I would have thought that it is up to the compiler to select, as default, the 
lowest viable compute capability. This is what this code aims to do (whether it 
actually does that's a separate issue :) ).



https://reviews.llvm.org/D38883



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38883: [CMake][OpenMP] Customize default offloading arch

2017-10-13 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added inline comments.



Comment at: lib/Driver/ToolChains/Cuda.h:90
-  }
 };
 

gtbercea wrote:
> tra wrote:
> > gtbercea wrote:
> > > gtbercea wrote:
> > > > I would also like to keep the spirit of this code if not in this exact 
> > > > form at least something that performs the same functionality.
> > > @tra what's your opinion on this code? Should this stay, stay but 
> > > modified to be more robust or taken out completely?
> > There are currently no users for this. In general, I would rather not have 
> > magically-changing default GPU based on how broken your CUDA installation 
> > is. IMO it would be better to keep defaults static and fail if 
> > prerequisites are not met.
> I would have thought that it is up to the compiler to select, as default, the 
> lowest viable compute capability. This is what this code aims to do (whether 
> it actually does that's a separate issue :) ).
> 
The reason I added this code in the first place was to overcome the fact that 
something like a default of sm_30 may work on the K40 but once you go to newer 
Pascal, Volta GPUs then you need a new minimum compute capability that is 
supported.


https://reviews.llvm.org/D38883



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38883: [CMake][OpenMP] Customize default offloading arch

2017-10-16 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.

LGTM


https://reviews.llvm.org/D38883



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38976: [OpenMP] Add implicit data sharing support when offloading to NVIDIA GPUs using OpenMP device offloading

2017-10-16 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea created this revision.
Herald added a subscriber: jholewinski.

This patch is part of the development effort to add support in the current 
OpenMP GPU offloading implementation for implicitly sharing variables between a 
target region executed by the team master thread and the worker threads within 
that team.

This patch is the first of three required for successfully performing the 
implicit sharing of master thread variables with the worker threads within a 
team. The remaining two patches are:

- a patch to the LLVM NVPTX backend which ensures the lowering of shared 
variables to an device memory which allows the sharing of references;
- a runtime patch to libomptarget which ensures that a list of references to 
shared variables is properly maintained.

A simple code snippet which illustrates an implicit data sharing situation is 
as follows:

  #pragma omp target
  {
 // master thread only
 int v;
 #pragma omp parallel
 {
// worker threads
// use v
 }
  }

Variable v is implicitly shared from the team master thread which executes the 
code in between the target and parallel directives. The worker threads must 
operate on the latest version of v, including any updates performed by the 
master.

The code generated in this patch relies on the LLVM NVPTX patch (mentioned 
above) which prevents v from being lowered in the thread local memory of the 
master thread thus making the reference to this variable un-shareable with the 
workers. This ensures that the code generated by this patch is correct.
Since the parallel region is outlined the passing of arguments to the outlined 
regions must preserve the original order of arguments. The runtime therefore 
maintains a list of references to shared variables thus ensuring their passing 
in the correct order. The passing of arguments to the outlined parallel 
function is performed in a separate function which the data sharing 
infrastructure constructs in this patch. The function is inlined when 
optimizations are enabled.


Repository:
  rL LLVM

https://reviews.llvm.org/D38976

Files:
  lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
  lib/CodeGen/CGOpenMPRuntimeNVPTX.h
  test/OpenMP/nvptx_data_sharing.cpp
  test/OpenMP/nvptx_parallel_codegen.cpp
  test/OpenMP/nvptx_target_teams_codegen.cpp

Index: test/OpenMP/nvptx_target_teams_codegen.cpp
===
--- test/OpenMP/nvptx_target_teams_codegen.cpp
+++ test/OpenMP/nvptx_target_teams_codegen.cpp
@@ -60,7 +60,7 @@
   //
   // CHECK: [[AWAIT_WORK]]
   // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]])
+  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args)
   // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
   // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
   // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
@@ -146,7 +146,7 @@
   //
   // CHECK: [[AWAIT_WORK]]
   // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]])
+  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args)
   // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
   // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
   // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
Index: test/OpenMP/nvptx_parallel_codegen.cpp
===
--- test/OpenMP/nvptx_parallel_codegen.cpp
+++ test/OpenMP/nvptx_parallel_codegen.cpp
@@ -78,7 +78,7 @@
   //
   // CHECK: [[AWAIT_WORK]]
   // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]])
+  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]],
   // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
   // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
   // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
@@ -92,20 +92,20 @@
   //
   // CHECK: [[EXEC_PARALLEL]]
   // CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
-  // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i32*, i32*)* [[PARALLEL_FN1:@.+]] to i8*)
+  // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN1:@.+]]_wrapper to i8*)
   // CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]]
   //
   // CHECK: [[EXEC_PFN1]]
-  // CHECK: call void [[PARALLEL_FN1]](
+  // CHECK: call void [[PARALLEL_FN1]]_wrapper(
   // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
   //
   // CHECK: [[CHECK_NEXT1]]
   // CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
-  // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i32*, i32*)* [[PARALLEL_FN2:@.+]] to i8*)
+  // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN2:@.+]]_wrapper to i8*)
   // CHECK: br i1 [[WM2]], label {{%

[PATCH] D38978: [OpenMP] Enable the lowering of implicitly shared variables in OpenMP GPU-offloaded target regions to the GPU shared memory

2017-10-16 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea created this revision.
Herald added subscribers: mgorny, jholewinski.

This patch is part of the development effort to add support in the current 
OpenMP GPU offloading implementation for implicitly sharing variables between a 
target region executed by the team master thread and the worker threads within 
that team.

This patch is the second of three required for successfully performing the 
implicit sharing of master thread variables with the worker threads within a 
team:
-Patch https://reviews.llvm.org/D38976 extends the CLANG code generation with 
code that handles shared variables.
-Patch (coming soon) extends the functionality of libomptarget to maintain a 
list of references to shared variables.

This patch adds a shared memory stack to the prolog of the kernel function 
representing the device offloaded OpenMP target region. The new passes along 
with the changes to existing ones, ensure that any OpenMP variable which needs 
to be shared across several threads will be allocated in this new stack, in the 
shared memory of the device. This patch covers the case of sharing variables 
from the master thread to the worker threads:

  #pragma omp target
  {
 // master thread only
 int v;
 #pragma omp parallel
 {
// worker threads
// use v
 }
  }


Repository:
  rL LLVM

https://reviews.llvm.org/D38978

Files:
  include/llvm/CodeGen/TargetPassConfig.h
  lib/CodeGen/TargetPassConfig.cpp
  lib/Target/NVPTX/CMakeLists.txt
  lib/Target/NVPTX/NVPTX.h
  lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
  lib/Target/NVPTX/NVPTXFrameLowering.cpp
  lib/Target/NVPTX/NVPTXFunctionDataSharing.cpp
  lib/Target/NVPTX/NVPTXFunctionDataSharing.h
  lib/Target/NVPTX/NVPTXInstrInfo.td
  lib/Target/NVPTX/NVPTXLowerAlloca.cpp
  lib/Target/NVPTX/NVPTXLowerSharedFrameIndicesPass.cpp
  lib/Target/NVPTX/NVPTXRegisterInfo.cpp
  lib/Target/NVPTX/NVPTXRegisterInfo.h
  lib/Target/NVPTX/NVPTXRegisterInfo.td
  lib/Target/NVPTX/NVPTXTargetMachine.cpp
  lib/Target/NVPTX/NVPTXUtilities.cpp
  lib/Target/NVPTX/NVPTXUtilities.h

Index: lib/Target/NVPTX/NVPTXUtilities.h
===
--- lib/Target/NVPTX/NVPTXUtilities.h
+++ lib/Target/NVPTX/NVPTXUtilities.h
@@ -14,6 +14,8 @@
 #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
 #define LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
 
+#include "NVPTXTargetMachine.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -60,6 +62,8 @@
 bool getAlign(const Function &, unsigned index, unsigned &);
 bool getAlign(const CallInst &, unsigned index, unsigned &);
 
+bool ptrIsStored(Value *Ptr);
+
 }
 
 #endif
Index: lib/Target/NVPTX/NVPTXUtilities.cpp
===
--- lib/Target/NVPTX/NVPTXUtilities.cpp
+++ lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -28,6 +28,8 @@
 
 namespace llvm {
 
+#define DEBUG_TYPE "nvptx-utilities"
+
 namespace {
 typedef std::map > key_val_pair_t;
 typedef std::map global_val_annot_t;
@@ -314,4 +316,50 @@
   return false;
 }
 
+/// Returns true if there are any instructions storing
+/// the address of this pointer.
+bool ptrIsStored(Value *Ptr) {
+  SmallVector PointerAliases;
+  PointerAliases.push_back(Ptr);
+
+  SmallVector Users;
+  for (const Use &U : Ptr->uses())
+Users.push_back(U.getUser());
+
+  for (unsigned I = 0; I < Users.size(); ++I) {
+// Get pointer usage
+const User *FU = Users[I];
+
+// Check if Ptr or an alias to it is the destination of the store
+auto SI = dyn_cast(FU);
+if (SI) {
+  for (auto Alias: PointerAliases)
+if (SI->getValueOperand() == Alias)
+  return true;
+  continue;
+}
+
+// TODO: Can loads lead to address being taken?
+// TODO: Can GEPs lead to address being taken?
+
+// Bitcasts increase aliases of the pointer
+auto BI = dyn_cast(FU);
+if (BI) {
+  for (const Use &U : BI->uses())
+Users.push_back(U.getUser());
+  PointerAliases.push_back(BI);
+  continue;
+}
+
+// TODO:
+// There may be other instructions which increase the number
+// of alias values ex. operations on the address of the alloca.
+// The whole alloca'ed memory region needs to be shared if at
+// least one of the values needs to be shared.
+  }
+
+  // Address of the pointer has been stored
+  return false;
+}
+
 } // namespace llvm
Index: lib/Target/NVPTX/NVPTXTargetMachine.cpp
===
--- lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -54,6 +54,7 @@
 void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
 void initializeNVPTXLowerArgsPass(PassRegistry &);
 void initializeNVPTXLowerAllocaPass(PassRegistry &);
+void initializeNVPTXFunctionDataSharingPass(PassRegistry &);
 
 } // end namespac

[PATCH] D39005: [OpenMP] Clean up variable and function names for NVPTX backend

2017-10-17 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea created this revision.
Herald added a subscriber: jholewinski.

Clean-up variable and function names.


Repository:
  rL LLVM

https://reviews.llvm.org/D39005

Files:
  lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp


Index: lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
===
--- lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
+++ lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
@@ -37,6 +37,8 @@
 
   /// \brief Clean up the name to remove symbols invalid in PTX.
   std::string cleanUpName(StringRef Name);
+  /// Set a clean name, ensuring collisions are avoided.
+  void generateCleanName(Value &V);
 };
 }
 
@@ -50,20 +52,31 @@
 "Assign valid PTX names to globals", false, false)
 
 bool NVPTXAssignValidGlobalNames::runOnModule(Module &M) {
-  for (GlobalVariable &GV : M.globals()) {
-// We are only allowed to rename local symbols.
-if (GV.hasLocalLinkage()) {
-  // setName doesn't do extra work if the name does not change.
-  // Note: this does not create collisions - if setName is asked to set the
-  // name to something that already exists, it adds a proper postfix to
-  // avoid collisions.
-  GV.setName(cleanUpName(GV.getName()));
-}
-  }
+  // We are only allowed to rename local symbols.
+  for (GlobalVariable &GV : M.globals())
+if (GV.hasLocalLinkage())
+  generateCleanName(GV);
+
+  // Clean function symbols.
+  for (auto &FN : M.functions())
+if (FN.hasLocalLinkage())
+  generateCleanName(FN);
 
   return true;
 }
 
+void NVPTXAssignValidGlobalNames::generateCleanName(Value &V) {
+  std::string ValidName;
+  do {
+ValidName = cleanUpName(V.getName());
+// setName doesn't do extra work if the name does not change.
+// Collisions are avoided by adding a suffix (which may yet be unclean in
+// PTX).
+V.setName(ValidName);
+// If there are no collisions return, otherwise clean up the new name.
+  } while (!V.getName().equals(ValidName));
+}
+
 std::string NVPTXAssignValidGlobalNames::cleanUpName(StringRef Name) {
   std::string ValidName;
   raw_string_ostream ValidNameStream(ValidName);


Index: lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
===
--- lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
+++ lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
@@ -37,6 +37,8 @@
 
   /// \brief Clean up the name to remove symbols invalid in PTX.
   std::string cleanUpName(StringRef Name);
+  /// Set a clean name, ensuring collisions are avoided.
+  void generateCleanName(Value &V);
 };
 }
 
@@ -50,20 +52,31 @@
 "Assign valid PTX names to globals", false, false)
 
 bool NVPTXAssignValidGlobalNames::runOnModule(Module &M) {
-  for (GlobalVariable &GV : M.globals()) {
-// We are only allowed to rename local symbols.
-if (GV.hasLocalLinkage()) {
-  // setName doesn't do extra work if the name does not change.
-  // Note: this does not create collisions - if setName is asked to set the
-  // name to something that already exists, it adds a proper postfix to
-  // avoid collisions.
-  GV.setName(cleanUpName(GV.getName()));
-}
-  }
+  // We are only allowed to rename local symbols.
+  for (GlobalVariable &GV : M.globals())
+if (GV.hasLocalLinkage())
+  generateCleanName(GV);
+
+  // Clean function symbols.
+  for (auto &FN : M.functions())
+if (FN.hasLocalLinkage())
+  generateCleanName(FN);
 
   return true;
 }
 
+void NVPTXAssignValidGlobalNames::generateCleanName(Value &V) {
+  std::string ValidName;
+  do {
+ValidName = cleanUpName(V.getName());
+// setName doesn't do extra work if the name does not change.
+// Collisions are avoided by adding a suffix (which may yet be unclean in
+// PTX).
+V.setName(ValidName);
+// If there are no collisions return, otherwise clean up the new name.
+  } while (!V.getName().equals(ValidName));
+}
+
 std::string NVPTXAssignValidGlobalNames::cleanUpName(StringRef Name) {
   std::string ValidName;
   raw_string_ostream ValidNameStream(ValidName);
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D38978: [OpenMP] Enable the lowering of implicitly shared variables in OpenMP GPU-offloaded target regions to the GPU shared memory

2017-10-17 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 119327.
gtbercea added a comment.

Eliminate variable and function name clean-up. That has been moved into a 
separate patch: https://reviews.llvm.org/D39005


Repository:
  rL LLVM

https://reviews.llvm.org/D38978

Files:
  include/llvm/CodeGen/TargetPassConfig.h
  lib/CodeGen/TargetPassConfig.cpp
  lib/Target/NVPTX/CMakeLists.txt
  lib/Target/NVPTX/NVPTX.h
  lib/Target/NVPTX/NVPTXAsmPrinter.cpp
  lib/Target/NVPTX/NVPTXFrameLowering.cpp
  lib/Target/NVPTX/NVPTXFunctionDataSharing.cpp
  lib/Target/NVPTX/NVPTXFunctionDataSharing.h
  lib/Target/NVPTX/NVPTXInstrInfo.td
  lib/Target/NVPTX/NVPTXLowerAlloca.cpp
  lib/Target/NVPTX/NVPTXLowerSharedFrameIndicesPass.cpp
  lib/Target/NVPTX/NVPTXRegisterInfo.cpp
  lib/Target/NVPTX/NVPTXRegisterInfo.h
  lib/Target/NVPTX/NVPTXRegisterInfo.td
  lib/Target/NVPTX/NVPTXTargetMachine.cpp
  lib/Target/NVPTX/NVPTXUtilities.cpp
  lib/Target/NVPTX/NVPTXUtilities.h

Index: lib/Target/NVPTX/NVPTXUtilities.h
===
--- lib/Target/NVPTX/NVPTXUtilities.h
+++ lib/Target/NVPTX/NVPTXUtilities.h
@@ -14,6 +14,8 @@
 #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
 #define LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
 
+#include "NVPTXTargetMachine.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -60,6 +62,8 @@
 bool getAlign(const Function &, unsigned index, unsigned &);
 bool getAlign(const CallInst &, unsigned index, unsigned &);
 
+bool ptrIsStored(Value *Ptr);
+
 }
 
 #endif
Index: lib/Target/NVPTX/NVPTXUtilities.cpp
===
--- lib/Target/NVPTX/NVPTXUtilities.cpp
+++ lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -28,6 +28,8 @@
 
 namespace llvm {
 
+#define DEBUG_TYPE "nvptx-utilities"
+
 namespace {
 typedef std::map > key_val_pair_t;
 typedef std::map global_val_annot_t;
@@ -314,4 +316,50 @@
   return false;
 }
 
+/// Returns true if there are any instructions storing
+/// the address of this pointer.
+bool ptrIsStored(Value *Ptr) {
+  SmallVector PointerAliases;
+  PointerAliases.push_back(Ptr);
+
+  SmallVector Users;
+  for (const Use &U : Ptr->uses())
+Users.push_back(U.getUser());
+
+  for (unsigned I = 0; I < Users.size(); ++I) {
+// Get pointer usage
+const User *FU = Users[I];
+
+// Check if Ptr or an alias to it is the destination of the store
+auto SI = dyn_cast(FU);
+if (SI) {
+  for (auto Alias: PointerAliases)
+if (SI->getValueOperand() == Alias)
+  return true;
+  continue;
+}
+
+// TODO: Can loads lead to address being taken?
+// TODO: Can GEPs lead to address being taken?
+
+// Bitcasts increase aliases of the pointer
+auto BI = dyn_cast(FU);
+if (BI) {
+  for (const Use &U : BI->uses())
+Users.push_back(U.getUser());
+  PointerAliases.push_back(BI);
+  continue;
+}
+
+// TODO:
+// There may be other instructions which increase the number
+// of alias values ex. operations on the address of the alloca.
+// The whole alloca'ed memory region needs to be shared if at
+// least one of the values needs to be shared.
+  }
+
+  // Address of the pointer has been stored
+  return false;
+}
+
 } // namespace llvm
Index: lib/Target/NVPTX/NVPTXTargetMachine.cpp
===
--- lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -54,6 +54,7 @@
 void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
 void initializeNVPTXLowerArgsPass(PassRegistry &);
 void initializeNVPTXLowerAllocaPass(PassRegistry &);
+void initializeNVPTXFunctionDataSharingPass(PassRegistry &);
 
 } // end namespace llvm
 
@@ -72,6 +73,7 @@
   initializeNVPTXAssignValidGlobalNamesPass(PR);
   initializeNVPTXLowerArgsPass(PR);
   initializeNVPTXLowerAllocaPass(PR);
+  initializeNVPTXFunctionDataSharingPass(PR);
   initializeNVPTXLowerAggrCopiesPass(PR);
 }
 
@@ -148,6 +150,7 @@
   bool addInstSelector() override;
   void addPostRegAlloc() override;
   void addMachineSSAOptimization() override;
+  void addMachineSSALowering() override;
 
   FunctionPass *createTargetRegisterAllocator(bool) override;
   void addFastRegAlloc(FunctionPass *RegAllocPass) override;
@@ -248,10 +251,15 @@
   // before the address space inference passes.
   addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
   if (getOptLevel() != CodeGenOpt::None) {
+// Add address space inference passes
 addAddressSpaceInferencePasses();
 if (!DisableLoadStoreVectorizer)
   addPass(createLoadStoreVectorizerPass());
 addStraightLineScalarOptimizationPasses();
+  } else {
+// Even when no optimizations are used, we need to lower certain
+// alloca instructions to the appropriate memory type for correctness.
+addPass(createNVPTXFunctionDataSharingPass(&getNVPTXTa

[PATCH] D39005: [OpenMP] Clean up variable and function names for NVPTX backend

2017-10-17 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.

Hi Artem, Justin,

I see that this patch is the same as the patch Arpith wanted to post a while 
back i.e. https://reviews.llvm.org/D17738.

Was there a consensus regarding what the right thing to do is in this case?

I'd be interested to get the ball rolling in regard to coming up with a fix for 
this. I see some suggestions in past patches. Some help/clarification  would be 
much appreciated.

Thanks!


Repository:
  rL LLVM

https://reviews.llvm.org/D39005



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D39005: [OpenMP] Clean up variable and function names for NVPTX backend

2017-10-18 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.

In https://reviews.llvm.org/D39005#900226, @jlebar wrote:

> > I'd be interested to get the ball rolling in regard to coming up with a fix 
> > for this. I see some suggestions in past patches. Some help/clarification 
> > would be much appreciated.
>
> Happy to help, but I'm not sure what to offer beyond the link in Art's 
> previous comment.


Thanks Justin. Perhaps if we could start by clarifying this statement "One 
option is that we add a function to LLVM get an available separator character, 
which can default to '.', but we set to '$' for nvptx, and use that for 
generating new names at the IR level."  as well as this statement "This seems 
practical. Perhaps it could be part of the name mangling scheme already encoded 
in DataLayout?".

The first question that comes to mind is what is the link between data layout 
and name mangling conventions?


Repository:
  rL LLVM

https://reviews.llvm.org/D39005



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D47849: [OpenMP][Clang][NVPTX] Enable math functions called in an OpenMP NVPTX target device region to be resolved as device-native function calls

2018-08-23 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added inline comments.



Comment at: lib/Basic/Targets/NVPTX.cpp:232
+  // getting inlined on the device.
+  Builder.defineMacro("__NO_MATH_INLINES");
 }

tra wrote:
> This relies on implementation detail of particular variant of the header file 
> you're assuming all compilations will include. This is a workaround of the 
> real problem (attempting to use headers from machine X while targeting Y) at 
> best.
> 
> D50845 is dealing with the issue of headers for target code. Hopefully, 
> they'll find a way to provide device-specific headers, so you don't rely on 
> host headers being parseable during device-side compilation.
I agree. The proper fix would be what the other patch is attempting to do.



Comment at: lib/Driver/ToolChains/Clang.cpp:4758
+// toolchain.
+CmdArgs.push_back("-fno-math-builtin");
   }

tra wrote:
> Could you elaborate on why you don't want the builtins?
> Builtins are enabled and are useful for CUDA. What makes their use different 
> for OpenMP?
> Are you doing it to guarantee that math functions remain unresolved in IR so 
> you could link them in from external bitcode?
> 
That's right. I don't particularly like this approach as this leads to 
OpenMP-NVPTX toolchain missing out on optimizations such as replacing math 
function call with basic operations ( pow(a,2) -> a*a for example).
I am trying to fix this in a future patch by allowing intrinsics/builtins to 
propagate.


Repository:
  rC Clang

https://reviews.llvm.org/D47849



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D51312: [OpenMP][NVPTX] Use appropriate _CALL_ELF macro when offloading

2018-08-27 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea created this revision.
gtbercea added reviewers: Hahnfeld, ABataev, caomhin.
Herald added subscribers: cfe-commits, guansong.

When offloading to a device and using the powerpc64le version of the auxiliary 
triple, the _CALL_ELF macro is not set correctly to 2 resulting in the attempt 
to include a header that does not exist. This patch fixes this problem.


Repository:
  rC Clang

https://reviews.llvm.org/D51312

Files:
  lib/Frontend/InitPreprocessor.cpp


Index: lib/Frontend/InitPreprocessor.cpp
===
--- lib/Frontend/InitPreprocessor.cpp
+++ lib/Frontend/InitPreprocessor.cpp
@@ -1112,8 +1112,12 @@
 Builder.defineMacro("__x86_64__");
 break;
   case llvm::Triple::ppc64:
+Builder.defineMacro("__powerpc64__");
+Builder.defineMacro("_CALL_ELF", "1");
+break;
   case llvm::Triple::ppc64le:
 Builder.defineMacro("__powerpc64__");
+Builder.defineMacro("_CALL_ELF", "2");
 break;
   default:
 break;


Index: lib/Frontend/InitPreprocessor.cpp
===
--- lib/Frontend/InitPreprocessor.cpp
+++ lib/Frontend/InitPreprocessor.cpp
@@ -1112,8 +1112,12 @@
 Builder.defineMacro("__x86_64__");
 break;
   case llvm::Triple::ppc64:
+Builder.defineMacro("__powerpc64__");
+Builder.defineMacro("_CALL_ELF", "1");
+break;
   case llvm::Triple::ppc64le:
 Builder.defineMacro("__powerpc64__");
+Builder.defineMacro("_CALL_ELF", "2");
 break;
   default:
 break;
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D51312: [OpenMP][NVPTX] Use appropriate _CALL_ELF macro when offloading

2018-08-27 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 162708.
gtbercea added a comment.

Add test.


Repository:
  rC Clang

https://reviews.llvm.org/D51312

Files:
  lib/Frontend/InitPreprocessor.cpp
  test/Preprocessor/aux-triple.c


Index: test/Preprocessor/aux-triple.c
===
--- test/Preprocessor/aux-triple.c
+++ test/Preprocessor/aux-triple.c
@@ -14,7 +14,7 @@
 // RUN: %clang_cc1 -x cuda -E -dM -ffreestanding < /dev/null \
 // RUN: -triple nvptx64-none-none -aux-triple 
powerpc64le-unknown-linux-gnu \
 // RUN:   | FileCheck -match-full-lines %s \
-// RUN: -check-prefixes NVPTX64,PPC64,LINUX,LINUX-CPP
+// RUN: -check-prefixes NVPTX64,PPC64LE,LINUX,LINUX-CPP
 // RUN: %clang_cc1 -x cuda -E -dM -ffreestanding < /dev/null \
 // RUN: -triple nvptx64-none-none -aux-triple x86_64-unknown-linux-gnu \
 // RUN:   | FileCheck -match-full-lines %s \
@@ -24,22 +24,24 @@
 // RUN: %clang_cc1 -E -dM -ffreestanding < /dev/null \
 // RUN: -fopenmp -fopenmp-is-device -triple nvptx64-none-none \
 // RUN: -aux-triple powerpc64le-unknown-linux-gnu \
-// RUN:   | FileCheck -match-full-lines -check-prefixes NVPTX64,PPC64,LINUX %s
+// RUN:   | FileCheck -match-full-lines -check-prefixes NVPTX64,PPC64LE,LINUX 
%s
 // RUN: %clang_cc1 -E -dM -ffreestanding < /dev/null \
 // RUN: -fopenmp -fopenmp-is-device -triple nvptx64-none-none \
 // RUN: -aux-triple x86_64-unknown-linux-gnu \
 // RUN:   | FileCheck -match-full-lines -check-prefixes NVPTX64,X86_64,LINUX %s
 // RUN: %clang_cc1 -x c++ -E -dM -ffreestanding < /dev/null \
 // RUN: -fopenmp -fopenmp-is-device -triple nvptx64-none-none \
 // RUN: -aux-triple powerpc64le-unknown-linux-gnu \
 // RUN:   | FileCheck -match-full-lines %s \
-// RUN: -check-prefixes NVPTX64,PPC64,LINUX,LINUX-CPP
+// RUN: -check-prefixes NVPTX64,PPC64LE,LINUX,LINUX-CPP
 // RUN: %clang_cc1 -x c++ -E -dM -ffreestanding < /dev/null \
 // RUN: -fopenmp -fopenmp-is-device -triple nvptx64-none-none \
 // RUN: -aux-triple x86_64-unknown-linux-gnu \
 // RUN:   | FileCheck -match-full-lines %s \
 // RUN: -check-prefixes NVPTX64,X86_64,LINUX,LINUX-CPP
 
+// PPC64LE:#define _CALL_ELF 2
+
 // NONE-NOT:#define _GNU_SOURCE
 // LINUX-CPP:#define _GNU_SOURCE 1
 
@@ -56,7 +58,7 @@
 // LINUX:#define __linux__ 1
 
 // NONE-NOT:#define __powerpc64__
-// PPC64:#define __powerpc64__ 1
+// PPC64LE:#define __powerpc64__ 1
 
 // NONE-NOT:#define __x86_64__
 // X86_64:#define __x86_64__ 1
Index: lib/Frontend/InitPreprocessor.cpp
===
--- lib/Frontend/InitPreprocessor.cpp
+++ lib/Frontend/InitPreprocessor.cpp
@@ -1106,14 +1106,19 @@
   auto AuxTriple = AuxTI.getTriple();
 
   // Define basic target macros needed by at least bits/wordsize.h and
-  // bits/mathinline.h
+  // bits/mathinline.h.
+  // On PowerPC, explicitely set _CALL_ELF macro needed for gnu/stubs.h.
   switch (AuxTriple.getArch()) {
   case llvm::Triple::x86_64:
 Builder.defineMacro("__x86_64__");
 break;
   case llvm::Triple::ppc64:
+Builder.defineMacro("__powerpc64__");
+Builder.defineMacro("_CALL_ELF", "1");
+break;
   case llvm::Triple::ppc64le:
 Builder.defineMacro("__powerpc64__");
+Builder.defineMacro("_CALL_ELF", "2");
 break;
   default:
 break;


Index: test/Preprocessor/aux-triple.c
===
--- test/Preprocessor/aux-triple.c
+++ test/Preprocessor/aux-triple.c
@@ -14,7 +14,7 @@
 // RUN: %clang_cc1 -x cuda -E -dM -ffreestanding < /dev/null \
 // RUN: -triple nvptx64-none-none -aux-triple powerpc64le-unknown-linux-gnu \
 // RUN:   | FileCheck -match-full-lines %s \
-// RUN: -check-prefixes NVPTX64,PPC64,LINUX,LINUX-CPP
+// RUN: -check-prefixes NVPTX64,PPC64LE,LINUX,LINUX-CPP
 // RUN: %clang_cc1 -x cuda -E -dM -ffreestanding < /dev/null \
 // RUN: -triple nvptx64-none-none -aux-triple x86_64-unknown-linux-gnu \
 // RUN:   | FileCheck -match-full-lines %s \
@@ -24,22 +24,24 @@
 // RUN: %clang_cc1 -E -dM -ffreestanding < /dev/null \
 // RUN: -fopenmp -fopenmp-is-device -triple nvptx64-none-none \
 // RUN: -aux-triple powerpc64le-unknown-linux-gnu \
-// RUN:   | FileCheck -match-full-lines -check-prefixes NVPTX64,PPC64,LINUX %s
+// RUN:   | FileCheck -match-full-lines -check-prefixes NVPTX64,PPC64LE,LINUX %s
 // RUN: %clang_cc1 -E -dM -ffreestanding < /dev/null \
 // RUN: -fopenmp -fopenmp-is-device -triple nvptx64-none-none \
 // RUN: -aux-triple x86_64-unknown-linux-gnu \
 // RUN:   | FileCheck -match-full-lines -check-prefixes NVPTX64,X86_64,LINUX %s
 // RUN: %clang_cc1 -x c++ -E -dM -ffreestanding < /dev/null \
 // RUN: -fopenmp -fopenmp-is-device -triple nvptx64-none-none \
 // RUN: -aux-triple powerpc64le-unknown-linux-gnu \
 // RUN:   | FileCheck -match-full-lines %s \
-// RUN: -check-prefixes NVPTX64,PPC64,LINUX,LINUX-CPP
+// RUN: -check-prefixes N

[PATCH] D51446: [OpenMP][bugfix] Add missing macros for Power

2018-08-29 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea created this revision.
gtbercea added reviewers: ABataev, Hahnfeld, caomhin.
Herald added subscribers: cfe-commits, guansong.

Add missing macros when the auxiliary triple points to the PPC architecture.


Repository:
  rC Clang

https://reviews.llvm.org/D51446

Files:
  lib/Frontend/InitPreprocessor.cpp


Index: lib/Frontend/InitPreprocessor.cpp
===
--- lib/Frontend/InitPreprocessor.cpp
+++ lib/Frontend/InitPreprocessor.cpp
@@ -1113,10 +1113,18 @@
 Builder.defineMacro("__x86_64__");
 break;
   case llvm::Triple::ppc64:
+if (AuxTI.getLongDoubleWidth() == 128) {
+  Builder.defineMacro("__LONG_DOUBLE_128__");
+  Builder.defineMacro("__LONGDOUBLE128");
+}
 Builder.defineMacro("__powerpc64__");
 Builder.defineMacro("_CALL_ELF", "1");
 break;
   case llvm::Triple::ppc64le:
+if (AuxTI.getLongDoubleWidth() == 128) {
+  Builder.defineMacro("__LONG_DOUBLE_128__");
+  Builder.defineMacro("__LONGDOUBLE128");
+}
 Builder.defineMacro("__powerpc64__");
 Builder.defineMacro("_CALL_ELF", "2");
 break;


Index: lib/Frontend/InitPreprocessor.cpp
===
--- lib/Frontend/InitPreprocessor.cpp
+++ lib/Frontend/InitPreprocessor.cpp
@@ -1113,10 +1113,18 @@
 Builder.defineMacro("__x86_64__");
 break;
   case llvm::Triple::ppc64:
+if (AuxTI.getLongDoubleWidth() == 128) {
+  Builder.defineMacro("__LONG_DOUBLE_128__");
+  Builder.defineMacro("__LONGDOUBLE128");
+}
 Builder.defineMacro("__powerpc64__");
 Builder.defineMacro("_CALL_ELF", "1");
 break;
   case llvm::Triple::ppc64le:
+if (AuxTI.getLongDoubleWidth() == 128) {
+  Builder.defineMacro("__LONG_DOUBLE_128__");
+  Builder.defineMacro("__LONGDOUBLE128");
+}
 Builder.defineMacro("__powerpc64__");
 Builder.defineMacro("_CALL_ELF", "2");
 break;
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D51446: [OpenMP][bugfix] Add missing macros for Power

2018-08-30 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 163377.
gtbercea added a comment.

  Add test.


Repository:
  rC Clang

https://reviews.llvm.org/D51446

Files:
  lib/Frontend/InitPreprocessor.cpp
  test/Preprocessor/aux-triple.c


Index: test/Preprocessor/aux-triple.c
===
--- test/Preprocessor/aux-triple.c
+++ test/Preprocessor/aux-triple.c
@@ -50,6 +50,9 @@
 // NONE-NOT:#define __ELF__
 // LINUX:#define __ELF__ 1
 
+// PPC64LE:#define __LONGDOUBLE128 1
+// PPC64LE:#define __LONG_DOUBLE_128__ 1
+
 // NVPTX64:#define __LP64__ 1
 // NVPTX64:#define __NVPTX__ 1
 // NVPTX64:#define __PTX__ 1
Index: lib/Frontend/InitPreprocessor.cpp
===
--- lib/Frontend/InitPreprocessor.cpp
+++ lib/Frontend/InitPreprocessor.cpp
@@ -1113,13 +1113,24 @@
 Builder.defineMacro("__x86_64__");
 break;
   case llvm::Triple::ppc64:
-Builder.defineMacro("__powerpc64__");
-Builder.defineMacro("_CALL_ELF", "1");
-break;
   case llvm::Triple::ppc64le:
+  {
 Builder.defineMacro("__powerpc64__");
-Builder.defineMacro("_CALL_ELF", "2");
+
+StringRef ABI = AuxTI.getABI();
+// Set _CALL_ELF macro needed for gnu/stubs.h
+if (ABI == "elfv1" || ABI == "elfv1-qpx")
+  Builder.defineMacro("_CALL_ELF", "1");
+if (ABI == "elfv2")
+  Builder.defineMacro("_CALL_ELF", "2");
+
+// Required by PowerPC host toolchain.
+if (AuxTI.getLongDoubleWidth() == 128) {
+  Builder.defineMacro("__LONGDOUBLE128");
+  Builder.defineMacro("__LONG_DOUBLE_128__");
+}
 break;
+  }
   default:
 break;
   }


Index: test/Preprocessor/aux-triple.c
===
--- test/Preprocessor/aux-triple.c
+++ test/Preprocessor/aux-triple.c
@@ -50,6 +50,9 @@
 // NONE-NOT:#define __ELF__
 // LINUX:#define __ELF__ 1
 
+// PPC64LE:#define __LONGDOUBLE128 1
+// PPC64LE:#define __LONG_DOUBLE_128__ 1
+
 // NVPTX64:#define __LP64__ 1
 // NVPTX64:#define __NVPTX__ 1
 // NVPTX64:#define __PTX__ 1
Index: lib/Frontend/InitPreprocessor.cpp
===
--- lib/Frontend/InitPreprocessor.cpp
+++ lib/Frontend/InitPreprocessor.cpp
@@ -1113,13 +1113,24 @@
 Builder.defineMacro("__x86_64__");
 break;
   case llvm::Triple::ppc64:
-Builder.defineMacro("__powerpc64__");
-Builder.defineMacro("_CALL_ELF", "1");
-break;
   case llvm::Triple::ppc64le:
+  {
 Builder.defineMacro("__powerpc64__");
-Builder.defineMacro("_CALL_ELF", "2");
+
+StringRef ABI = AuxTI.getABI();
+// Set _CALL_ELF macro needed for gnu/stubs.h
+if (ABI == "elfv1" || ABI == "elfv1-qpx")
+  Builder.defineMacro("_CALL_ELF", "1");
+if (ABI == "elfv2")
+  Builder.defineMacro("_CALL_ELF", "2");
+
+// Required by PowerPC host toolchain.
+if (AuxTI.getLongDoubleWidth() == 128) {
+  Builder.defineMacro("__LONGDOUBLE128");
+  Builder.defineMacro("__LONG_DOUBLE_128__");
+}
 break;
+  }
   default:
 break;
   }
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D50845: [CUDA/OpenMP] Define only some host macros during device compilation

2018-08-30 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.

In https://reviews.llvm.org/D50845#1219709, @tra wrote:

> FYI. This breaks our CUDA compilation. I haven't figured out what exactly is 
> wrong yet. I may need to unroll the patch if the fix is not obvious.


Agreed. Patches https://reviews.llvm.org/D51446 and 
https://reviews.llvm.org/D51312 apply fixes for the PPC64 toolchain. Similar 
fixes are needed for other architectures probably.
In general, it looks like this patch leads to some host macros having to be 
defined again for the auxiliary triple case. It is not clear to me how to 
exhaustively identify the missing macros, so far it's been just trial and error.


Repository:
  rL LLVM

https://reviews.llvm.org/D50845



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D50845: [CUDA/OpenMP] Define only some host macros during device compilation

2018-08-30 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.

In https://reviews.llvm.org/D50845#1219746, @tra wrote:

> Also, whatever macros we generate do not prevent headers from using x86 
> inline assembly. I see quite a few inline asm code in preprocessed output. 
> The headers are from libc ~2.19.


Did you try adding

  Builder.defineMacro("__NO_MATH_INLINES");




Repository:
  rL LLVM

https://reviews.llvm.org/D50845



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D50845: [CUDA/OpenMP] Define only some host macros during device compilation

2018-08-30 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.

In https://reviews.llvm.org/D50845#1219859, @Hahnfeld wrote:

> removing `InitializePredefinedAuxMacros` and the new test completely should 
> do.


Yep they also contain https://reviews.llvm.org/D51312 in case you're rolling 
back individual commits.


Repository:
  rL LLVM

https://reviews.llvm.org/D50845



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D47394: [OpenMP][Clang][NVPTX] Replace bundling with partial linking for the OpenMP NVPTX device offloading toolchain

2018-07-31 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea marked 3 inline comments as done.
gtbercea added a comment.

Answers to comments.




Comment at: include/clang/Driver/Compilation.h:312
+  /// \param skipBundler - bool value set once by the driver.
+  void setSkipOffloadBundler(bool skipBundler);
+

sfantao wrote:
> gtbercea wrote:
> > sfantao wrote:
> > > Why is this a property of the compilation and not of a set of actions 
> > > referring to a given target? That would allow one to combine in the same 
> > > compilation targets requiring the bundler and targets that wouldn't. 
> > This was a way to pass this information to the OpenMP NVPTX device 
> > toolchain.
> > 
> > Both the Driver OpenMP NVPTX toolchain need to agree on the usage of the 
> > new scheme (proposed in this patch) or the old scheme (the one that is in 
> > the compiler today).
> > 
> > 
> I understand, but the way I see it is that it is the toolchain that skips the 
> bundler not the compilation. I understand that as of this patch, you skip 
> only if there is a single nvptx target. If you have more than one target, as 
> some tests do, some toolchains will still need the bundler. So, we are making 
> what happens with the nvptx target dependent of other toolchains. Is this an 
> intended effect of this patch?
Bundler is skipped only for the OpenMP NVPTX toolchain. I'm not sure what you 
mean by "other toolchain".



Comment at: lib/Driver/Compilation.cpp:276
+void Compilation::setSkipOffloadBundler(bool skipBundler) {
+  skipOffloadBundler = skipBundler;
+}

sfantao wrote:
> gtbercea wrote:
> > sfantao wrote:
> > > Given the logic you have below, you are assuming this is not set to false 
> > > ever. It would be wise to get an assertion here in case you end up having 
> > > toolchains skipping and others don't. If that is just not supported a 
> > > diagnostic should be added instead.
> > > 
> > > The convention is that local variables use CamelCase.
> > The checks I added in the Driver will set this flag to true if all 
> > toolchains Clang offloads to support the skipping of the bundler/unbundler 
> > for object files. Currently only NVPTX toolchain can skip the 
> > bundler/unbundler for object files so the code path in this patch will be 
> > taken only for:
> > 
> > -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda
> Ok, if that is the case, just add an assertion here.
If one of the toolchains in the list of toolchains can't skip then none of them 
skip. If all can skip then they all skip. What assertion would you like me to 
add?



Comment at: lib/Driver/Driver.cpp:2943
+}
+  }
+

sfantao wrote:
> gtbercea wrote:
> > sfantao wrote:
> > > Can you just implement this check in the definition of `Compilation: 
> > > canSkipClangOffloadBundler` and get rid of `setSkipOffloadBundler`? All 
> > > the requirted information is already in `Compilation` under 
> > > `C.getInputArgs()`.
> > The driver needs to have the result of this check available. The flag is 
> > passed to the step which adds host-device dependencies. If the bundler can 
> > be skipped then the unbundling action is not required.
> > 
> > I guess this could be implemented in Compilation. Even so I would like it 
> > to happen only once like it does here and not every time someone queries 
> > the "can I skip the bundler" flag.
> > 
> > I wanted this check to happen only once hence why I put in on the driver 
> > side. The result of this check needs to be available in Driver.cpp and in 
> > Cuda.cpp files (see usage in this patch). Compilation keeps track of the 
> > flag because skipping the bundler is an all or nothing flag: you can skip 
> > the bundler/unbundler for object files if and only if all toolchains you 
> > are offloading to can skip it.
> > 
> Right, in these circumstances "can skip bundler" is the same as "do I have a 
> single toolchain" and "is that toolchain nvptx". This is fairly inexpensive 
> to do, so I don't really see the need to record this state in the driver. It 
> will also be clearer what are the conditions for which you skip the bundler.
That is true for now but if more toolchains get added to the list of toolchains 
that can skip the bundler then you want to factor it out and make it happen 
only once in a toolchain-independent point in the code. Otherwise you will 
carry that list of toolchains everywhere in the code where you need to do the 
check.

Also if you are to do this at toolchain level you will not be able to check if 
the other toolchains were able to skip or not. For now ALL toolchains must skip 
or ALL toolchains don't skip the bundler.



Comment at: lib/Driver/ToolChains/Cuda.cpp:496
+  ? CudaVirtualArchToString(VirtualArchForCudaArch(gpu_arch))
+  : GPUArch.str().c_str();
+  const char *PtxF =

sfantao wrote:
> Why don't create fatbins instead of cubins in all cases. For the purposes of 
> OpenMP they are equ

[PATCH] D47394: [OpenMP][Clang][NVPTX] Replace bundling with partial linking for the OpenMP NVPTX device offloading toolchain

2018-07-31 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added inline comments.



Comment at: include/clang/Driver/Compilation.h:312
+  /// \param skipBundler - bool value set once by the driver.
+  void setSkipOffloadBundler(bool skipBundler);
+

sfantao wrote:
> gtbercea wrote:
> > sfantao wrote:
> > > gtbercea wrote:
> > > > sfantao wrote:
> > > > > Why is this a property of the compilation and not of a set of actions 
> > > > > referring to a given target? That would allow one to combine in the 
> > > > > same compilation targets requiring the bundler and targets that 
> > > > > wouldn't. 
> > > > This was a way to pass this information to the OpenMP NVPTX device 
> > > > toolchain.
> > > > 
> > > > Both the Driver OpenMP NVPTX toolchain need to agree on the usage of 
> > > > the new scheme (proposed in this patch) or the old scheme (the one that 
> > > > is in the compiler today).
> > > > 
> > > > 
> > > I understand, but the way I see it is that it is the toolchain that skips 
> > > the bundler not the compilation. I understand that as of this patch, you 
> > > skip only if there is a single nvptx target. If you have more than one 
> > > target, as some tests do, some toolchains will still need the bundler. 
> > > So, we are making what happens with the nvptx target dependent of other 
> > > toolchains. Is this an intended effect of this patch?
> > Bundler is skipped only for the OpenMP NVPTX toolchain. I'm not sure what 
> > you mean by "other toolchain".
> Is skipped for the NVPTX toolchain if there are no "other" device toolchains 
> requested. Say I have a working pipeline that does static linking with nvptx 
> correctly. Then on top of that I add another device to `-fopenmp-targets`, 
> that pipeline will now fail even for nvptx, right?
It's a choice between skipping the bundler and running the current, default 
mode with the bundler enabled. If targets other than NVPTX are present then we 
default to using the bundler for all toolchains.
There is no hybrid mode enabled where some targets use the bundler and some 
don't. 



Comment at: lib/Driver/Compilation.cpp:276
+void Compilation::setSkipOffloadBundler(bool skipBundler) {
+  skipOffloadBundler = skipBundler;
+}

sfantao wrote:
> gtbercea wrote:
> > sfantao wrote:
> > > gtbercea wrote:
> > > > sfantao wrote:
> > > > > Given the logic you have below, you are assuming this is not set to 
> > > > > false ever. It would be wise to get an assertion here in case you end 
> > > > > up having toolchains skipping and others don't. If that is just not 
> > > > > supported a diagnostic should be added instead.
> > > > > 
> > > > > The convention is that local variables use CamelCase.
> > > > The checks I added in the Driver will set this flag to true if all 
> > > > toolchains Clang offloads to support the skipping of the 
> > > > bundler/unbundler for object files. Currently only NVPTX toolchain can 
> > > > skip the bundler/unbundler for object files so the code path in this 
> > > > patch will be taken only for:
> > > > 
> > > > -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda
> > > Ok, if that is the case, just add an assertion here.
> > If one of the toolchains in the list of toolchains can't skip then none of 
> > them skip. If all can skip then they all skip. What assertion would you 
> > like me to add?
> If SkipOffloadBundler is set to true you don't expect it to be set to false 
> afterwards, right? That should be asserted.
That's correct, I can add that sure.


Repository:
  rC Clang

https://reviews.llvm.org/D47394



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D42841: [docs] Improve help for OpenMP options

2018-02-08 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea accepted this revision.
gtbercea added a comment.

LG


https://reviews.llvm.org/D42841



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-02-12 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea created this revision.
gtbercea added reviewers: Hahnfeld, ABataev, carlo.bertolli, caomhin, grokos.
Herald added subscribers: cfe-commits, guansong.

This patch adds an additional flag to the OpenMP device offloading toolchain to 
link in the runtime library bitcode.


Repository:
  rC Clang

https://reviews.llvm.org/D43197

Files:
  include/clang/Basic/DiagnosticDriverKinds.td
  lib/Driver/ToolChains/Cuda.cpp


Index: lib/Driver/ToolChains/Cuda.cpp
===
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -529,6 +529,35 @@
 CC1Args.push_back("-target-feature");
 CC1Args.push_back("+ptx42");
   }
+
+  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+SmallVector LibraryPaths;
+if (char *env = ::getenv("LIBRARY_PATH")) {
+  StringRef CompilerPath = env;
+  while (!CompilerPath.empty()) {
+std::pair Split =
+CompilerPath.split(llvm::sys::EnvPathSeparator);
+LibraryPaths.push_back(Split.first);
+CompilerPath = Split.second;
+  }
+}
+
+std::string LibOmpTargetName =
+  "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+bool FoundBCLibrary = false;
+for (std::string LibraryPath : LibraryPaths) {
+  SmallString<128> LibOmpTargetFile(LibraryPath);
+  llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
+  if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+CC1Args.push_back("-mlink-cuda-bitcode");
+CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+FoundBCLibrary = true;
+break;
+  }
+}
+if (!FoundBCLibrary)
+  getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime);
+  }
 }
 
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
Index: include/clang/Basic/DiagnosticDriverKinds.td
===
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -196,6 +196,9 @@
 def warn_drv_omp_offload_target_duplicate : Warning<
   "The OpenMP offloading target '%0' is similar to target '%1' already 
specified - will be ignored.">, 
   InGroup;
+def remark_drv_omp_offload_target_missingbcruntime : Warning<
+  "Expect degraded performance on the target device due to missing 
libomptarget-nvptx.bc in library path.">,
+  InGroup;
 def err_drv_bitcode_unsupported_on_toolchain : Error<
   "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
 


Index: lib/Driver/ToolChains/Cuda.cpp
===
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -529,6 +529,35 @@
 CC1Args.push_back("-target-feature");
 CC1Args.push_back("+ptx42");
   }
+
+  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+SmallVector LibraryPaths;
+if (char *env = ::getenv("LIBRARY_PATH")) {
+  StringRef CompilerPath = env;
+  while (!CompilerPath.empty()) {
+std::pair Split =
+CompilerPath.split(llvm::sys::EnvPathSeparator);
+LibraryPaths.push_back(Split.first);
+CompilerPath = Split.second;
+  }
+}
+
+std::string LibOmpTargetName =
+  "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+bool FoundBCLibrary = false;
+for (std::string LibraryPath : LibraryPaths) {
+  SmallString<128> LibOmpTargetFile(LibraryPath);
+  llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
+  if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+CC1Args.push_back("-mlink-cuda-bitcode");
+CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+FoundBCLibrary = true;
+break;
+  }
+}
+if (!FoundBCLibrary)
+  getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime);
+  }
 }
 
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
Index: include/clang/Basic/DiagnosticDriverKinds.td
===
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -196,6 +196,9 @@
 def warn_drv_omp_offload_target_duplicate : Warning<
   "The OpenMP offloading target '%0' is similar to target '%1' already specified - will be ignored.">, 
   InGroup;
+def remark_drv_omp_offload_target_missingbcruntime : Warning<
+  "Expect degraded performance on the target device due to missing libomptarget-nvptx.bc in library path.">,
+  InGroup;
 def err_drv_bitcode_unsupported_on_toolchain : Error<
   "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
 
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-02-12 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 133882.
gtbercea added a comment.

Fix warning message.


Repository:
  rC Clang

https://reviews.llvm.org/D43197

Files:
  include/clang/Basic/DiagnosticDriverKinds.td
  lib/Driver/ToolChains/Cuda.cpp


Index: lib/Driver/ToolChains/Cuda.cpp
===
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -529,6 +529,36 @@
 CC1Args.push_back("-target-feature");
 CC1Args.push_back("+ptx42");
   }
+
+  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+SmallVector LibraryPaths;
+if (char *env = ::getenv("LIBRARY_PATH")) {
+  StringRef CompilerPath = env;
+  while (!CompilerPath.empty()) {
+std::pair Split =
+CompilerPath.split(llvm::sys::EnvPathSeparator);
+LibraryPaths.push_back(Split.first);
+CompilerPath = Split.second;
+  }
+}
+
+std::string LibOmpTargetName =
+  "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+bool FoundBCLibrary = false;
+for (std::string LibraryPath : LibraryPaths) {
+  SmallString<128> LibOmpTargetFile(LibraryPath);
+  llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
+  if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+CC1Args.push_back("-mlink-cuda-bitcode");
+CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+FoundBCLibrary = true;
+break;
+  }
+}
+if (!FoundBCLibrary)
+  getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime)
+  << LibOmpTargetName;
+  }
 }
 
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
Index: include/clang/Basic/DiagnosticDriverKinds.td
===
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -196,6 +196,9 @@
 def warn_drv_omp_offload_target_duplicate : Warning<
   "The OpenMP offloading target '%0' is similar to target '%1' already 
specified - will be ignored.">, 
   InGroup;
+def remark_drv_omp_offload_target_missingbcruntime : Warning<
+  "Expect degraded performance on the target device due to missing '%0' in 
LIBRARY_PATH.">,
+  InGroup;
 def err_drv_bitcode_unsupported_on_toolchain : Error<
   "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
 


Index: lib/Driver/ToolChains/Cuda.cpp
===
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -529,6 +529,36 @@
 CC1Args.push_back("-target-feature");
 CC1Args.push_back("+ptx42");
   }
+
+  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+SmallVector LibraryPaths;
+if (char *env = ::getenv("LIBRARY_PATH")) {
+  StringRef CompilerPath = env;
+  while (!CompilerPath.empty()) {
+std::pair Split =
+CompilerPath.split(llvm::sys::EnvPathSeparator);
+LibraryPaths.push_back(Split.first);
+CompilerPath = Split.second;
+  }
+}
+
+std::string LibOmpTargetName =
+  "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+bool FoundBCLibrary = false;
+for (std::string LibraryPath : LibraryPaths) {
+  SmallString<128> LibOmpTargetFile(LibraryPath);
+  llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
+  if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+CC1Args.push_back("-mlink-cuda-bitcode");
+CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+FoundBCLibrary = true;
+break;
+  }
+}
+if (!FoundBCLibrary)
+  getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime)
+  << LibOmpTargetName;
+  }
 }
 
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
Index: include/clang/Basic/DiagnosticDriverKinds.td
===
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -196,6 +196,9 @@
 def warn_drv_omp_offload_target_duplicate : Warning<
   "The OpenMP offloading target '%0' is similar to target '%1' already specified - will be ignored.">, 
   InGroup;
+def remark_drv_omp_offload_target_missingbcruntime : Warning<
+  "Expect degraded performance on the target device due to missing '%0' in LIBRARY_PATH.">,
+  InGroup;
 def err_drv_bitcode_unsupported_on_toolchain : Error<
   "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
 
___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-02-12 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 133919.
gtbercea added a comment.

Add regression tests.


Repository:
  rC Clang

https://reviews.llvm.org/D43197

Files:
  include/clang/Basic/DiagnosticDriverKinds.td
  lib/Driver/ToolChains/Cuda.cpp
  test/Driver/openmp-offload-gpu.c


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,24 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE %s
 
 // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60
+
+/// ###
+
+/// Check that the runtime bitcode library is part of the compile line. Create 
a bogus
+/// bitcode library that will be found via the LIBRARY_PATH.
+// RUN:   touch /tmp/libomptarget-nvptx-sm_60.bc
+// RUN:   LIBRARY_PATH=/tmp %clang -### -fopenmp=libomp 
-fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s
+
+// CHK-BCLIB: 
clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc
+
+/// ###
+
+/// Check that the warning is thrown when the libomptarget bitcode library is 
not found.
+/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should 
never exist.
+// RUN:   LIBRARY_PATH= %clang -### -fopenmp=libomp 
-fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck 
-check-prefix=CHK-BCLIB-WARN %s
+
+// CHK-BCLIB-WARN: Expect degraded performance on the target device due to 
missing 'libomptarget-nvptx-sm_20.bc' in LIBRARY_PATH.
Index: lib/Driver/ToolChains/Cuda.cpp
===
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -529,6 +529,36 @@
 CC1Args.push_back("-target-feature");
 CC1Args.push_back("+ptx42");
   }
+
+  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+SmallVector LibraryPaths;
+if (char *env = ::getenv("LIBRARY_PATH")) {
+  StringRef CompilerPath = env;
+  while (!CompilerPath.empty()) {
+std::pair Split =
+CompilerPath.split(llvm::sys::EnvPathSeparator);
+LibraryPaths.push_back(Split.first);
+CompilerPath = Split.second;
+  }
+}
+
+std::string LibOmpTargetName =
+  "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+bool FoundBCLibrary = false;
+for (std::string LibraryPath : LibraryPaths) {
+  SmallString<128> LibOmpTargetFile(LibraryPath);
+  llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
+  if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+CC1Args.push_back("-mlink-cuda-bitcode");
+CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+FoundBCLibrary = true;
+break;
+  }
+}
+if (!FoundBCLibrary)
+  getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime)
+  << LibOmpTargetName;
+  }
 }
 
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
Index: include/clang/Basic/DiagnosticDriverKinds.td
===
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -196,6 +196,9 @@
 def warn_drv_omp_offload_target_duplicate : Warning<
   "The OpenMP offloading target '%0' is similar to target '%1' already 
specified - will be ignored.">, 
   InGroup;
+def remark_drv_omp_offload_target_missingbcruntime : Warning<
+  "Expect degraded performance on the target device due to missing '%0' in 
LIBRARY_PATH.">,
+  InGroup;
 def err_drv_bitcode_unsupported_on_toolchain : Error<
   "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
 


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,24 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE %s
 
 // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60
+
+/// ###
+
+/// Check that the runtime bitcode library is part of the compile line. Create a bogus
+/// bitcode library that will be found via the LIBRARY_PATH.
+// RUN:   touch /tmp/libomptarget-nvptx-sm_60.bc
+// RUN:   LIBRARY_PATH=/tmp %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps \
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s
+
+// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}li

[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-02-14 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 134235.
gtbercea added a comment.

Move unix specific test to new file.


Repository:
  rC Clang

https://reviews.llvm.org/D43197

Files:
  include/clang/Basic/DiagnosticDriverKinds.td
  lib/Driver/ToolChains/Cuda.cpp
  test/Driver/openmp-offload-gpu.c
  test/Driver/unix-openmp-offload-gpu.c


Index: test/Driver/unix-openmp-offload-gpu.c
===
--- /dev/null
+++ test/Driver/unix-openmp-offload-gpu.c
@@ -0,0 +1,21 @@
+///
+/// Perform several driver tests for OpenMP offloading
+///
+
+// REQUIRES: linux
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: powerpc-registered-target
+// REQUIRES: nvptx-registered-target
+
+/// ###
+
+/// Check that the runtime bitcode library is part of the compile line. Create 
a bogus
+/// bitcode library that will be found via the LIBRARY_PATH.
+// RUN:   touch /tmp/libomptarget-nvptx-sm_60.bc
+// RUN:   env LIBRARY_PATH=/tmp
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s
+
+// CHK-BCLIB: 
clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc
Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,14 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE %s
 
 // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60
+
+/// ###
+
+/// Check that the warning is thrown when the libomptarget bitcode library is 
not found.
+/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should 
never exist.
+// RUN:   env LIBRARY_PATH=""
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck 
-check-prefix=CHK-BCLIB-WARN %s
+
+// CHK-BCLIB-WARN: Expect degraded performance on the target device due to 
missing 'libomptarget-nvptx-sm_20.bc' in LIBRARY_PATH.
Index: lib/Driver/ToolChains/Cuda.cpp
===
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -529,6 +529,36 @@
 CC1Args.push_back("-target-feature");
 CC1Args.push_back("+ptx42");
   }
+
+  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+SmallVector LibraryPaths;
+if (char *env = ::getenv("LIBRARY_PATH")) {
+  StringRef CompilerPath = env;
+  while (!CompilerPath.empty()) {
+std::pair Split =
+CompilerPath.split(llvm::sys::EnvPathSeparator);
+LibraryPaths.push_back(Split.first);
+CompilerPath = Split.second;
+  }
+}
+
+std::string LibOmpTargetName =
+  "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+bool FoundBCLibrary = false;
+for (std::string LibraryPath : LibraryPaths) {
+  SmallString<128> LibOmpTargetFile(LibraryPath);
+  llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
+  if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+CC1Args.push_back("-mlink-cuda-bitcode");
+CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+FoundBCLibrary = true;
+break;
+  }
+}
+if (!FoundBCLibrary)
+  getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime)
+  << LibOmpTargetName;
+  }
 }
 
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
Index: include/clang/Basic/DiagnosticDriverKinds.td
===
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -196,6 +196,9 @@
 def warn_drv_omp_offload_target_duplicate : Warning<
   "The OpenMP offloading target '%0' is similar to target '%1' already 
specified - will be ignored.">, 
   InGroup;
+def remark_drv_omp_offload_target_missingbcruntime : Warning<
+  "Expect degraded performance on the target device due to missing '%0' in 
LIBRARY_PATH.">,
+  InGroup;
 def err_drv_bitcode_unsupported_on_toolchain : Error<
   "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
 


Index: test/Driver/unix-openmp-offload-gpu.c
===
--- /dev/null
+++ test/Driver/unix-openmp-offload-gpu.c
@@ -0,0 +1,21 @@
+///
+/// Perform several driver tests for OpenMP offloading
+///
+
+// REQUIRES: linux
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: powerpc-registered-target
+// REQUIRES: nvptx-registered-target
+
+/// ###

[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-02-14 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea marked an inline comment as done.
gtbercea added inline comments.



Comment at: test/Driver/openmp-offload-gpu.c:150
+/// bitcode library that will be found via the LIBRARY_PATH.
+// RUN:   touch /tmp/libomptarget-nvptx-sm_60.bc
+// RUN:   LIBRARY_PATH=/tmp %clang -### -fopenmp=libomp 
-fopenmp-targets=nvptx64-nvidia-cuda \

Hahnfeld wrote:
> This should not be in `/tmp` but probably `%T`.
I don't think this would have worked since I need to create a file with a 
specific name in a folder somewhere and the separator is OS specific. I moved 
the test to a new file where I limit OS to linux.


Repository:
  rC Clang

https://reviews.llvm.org/D43197



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-02-14 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 134238.
gtbercea added a comment.

Fix tmp folder name.


Repository:
  rC Clang

https://reviews.llvm.org/D43197

Files:
  include/clang/Basic/DiagnosticDriverKinds.td
  lib/Driver/ToolChains/Cuda.cpp
  test/Driver/openmp-offload-gpu.c
  test/Driver/unix-openmp-offload-gpu.c


Index: test/Driver/unix-openmp-offload-gpu.c
===
--- /dev/null
+++ test/Driver/unix-openmp-offload-gpu.c
@@ -0,0 +1,21 @@
+///
+/// Perform several driver tests for OpenMP offloading
+///
+
+// REQUIRES: linux
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: powerpc-registered-target
+// REQUIRES: nvptx-registered-target
+
+/// ###
+
+/// Check that the runtime bitcode library is part of the compile line. Create 
a bogus
+/// bitcode library that will be found via the LIBRARY_PATH.
+// RUN:   touch %t-dir/libomptarget-nvptx-sm_60.bc
+// RUN:   env LIBRARY_PATH=%t-dir
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s
+
+// CHK-BCLIB: 
clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc
Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,14 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE %s
 
 // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60
+
+/// ###
+
+/// Check that the warning is thrown when the libomptarget bitcode library is 
not found.
+/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should 
never exist.
+// RUN:   env LIBRARY_PATH=""
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck 
-check-prefix=CHK-BCLIB-WARN %s
+
+// CHK-BCLIB-WARN: Expect degraded performance on the target device due to 
missing 'libomptarget-nvptx-sm_20.bc' in LIBRARY_PATH.
Index: lib/Driver/ToolChains/Cuda.cpp
===
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -529,6 +529,36 @@
 CC1Args.push_back("-target-feature");
 CC1Args.push_back("+ptx42");
   }
+
+  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+SmallVector LibraryPaths;
+if (char *env = ::getenv("LIBRARY_PATH")) {
+  StringRef CompilerPath = env;
+  while (!CompilerPath.empty()) {
+std::pair Split =
+CompilerPath.split(llvm::sys::EnvPathSeparator);
+LibraryPaths.push_back(Split.first);
+CompilerPath = Split.second;
+  }
+}
+
+std::string LibOmpTargetName =
+  "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+bool FoundBCLibrary = false;
+for (std::string LibraryPath : LibraryPaths) {
+  SmallString<128> LibOmpTargetFile(LibraryPath);
+  llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
+  if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+CC1Args.push_back("-mlink-cuda-bitcode");
+CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+FoundBCLibrary = true;
+break;
+  }
+}
+if (!FoundBCLibrary)
+  getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime)
+  << LibOmpTargetName;
+  }
 }
 
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
Index: include/clang/Basic/DiagnosticDriverKinds.td
===
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -196,6 +196,9 @@
 def warn_drv_omp_offload_target_duplicate : Warning<
   "The OpenMP offloading target '%0' is similar to target '%1' already 
specified - will be ignored.">, 
   InGroup;
+def remark_drv_omp_offload_target_missingbcruntime : Warning<
+  "Expect degraded performance on the target device due to missing '%0' in 
LIBRARY_PATH.">,
+  InGroup;
 def err_drv_bitcode_unsupported_on_toolchain : Error<
   "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
 


Index: test/Driver/unix-openmp-offload-gpu.c
===
--- /dev/null
+++ test/Driver/unix-openmp-offload-gpu.c
@@ -0,0 +1,21 @@
+///
+/// Perform several driver tests for OpenMP offloading
+///
+
+// REQUIRES: linux
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: powerpc-registered-target
+// REQUIRES: nvptx-registered-target
+
+/// ###

[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-02-14 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added inline comments.



Comment at: test/Driver/unix-openmp-offload-gpu.c:15
+/// bitcode library that will be found via the LIBRARY_PATH.
+// RUN:   touch /tmp/libomptarget-nvptx-sm_60.bc
+// RUN:   env LIBRARY_PATH=/tmp

Hahnfeld wrote:
> Hahnfeld wrote:
> > I don't see how that solves the problem of using `/tmp`?!?
> (Interesting that this works with `%t`, the documentation mentions `%T` for a 
> directory. But as other test cases do the same...)
%T works too I just tried it. Any preference as to which one to use?


Repository:
  rC Clang

https://reviews.llvm.org/D43197



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-02-14 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 134278.
gtbercea added a comment.

Use %T.


Repository:
  rC Clang

https://reviews.llvm.org/D43197

Files:
  include/clang/Basic/DiagnosticDriverKinds.td
  lib/Driver/ToolChains/Cuda.cpp
  test/Driver/openmp-offload-gpu.c
  test/Driver/unix-openmp-offload-gpu.c


Index: test/Driver/unix-openmp-offload-gpu.c
===
--- /dev/null
+++ test/Driver/unix-openmp-offload-gpu.c
@@ -0,0 +1,21 @@
+///
+/// Perform several driver tests for OpenMP offloading
+///
+
+// REQUIRES: linux
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: powerpc-registered-target
+// REQUIRES: nvptx-registered-target
+
+/// ###
+
+/// Check that the runtime bitcode library is part of the compile line. Create 
a bogus
+/// bitcode library that will be found via the LIBRARY_PATH.
+// RUN:   touch %T/libomptarget-nvptx-sm_60.bc
+// RUN:   env LIBRARY_PATH=%T
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s
+
+// CHK-BCLIB: 
clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc
Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,14 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE %s
 
 // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60
+
+/// ###
+
+/// Check that the warning is thrown when the libomptarget bitcode library is 
not found.
+/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should 
never exist.
+// RUN:   env LIBRARY_PATH=""
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck 
-check-prefix=CHK-BCLIB-WARN %s
+
+// CHK-BCLIB-WARN: Expect degraded performance on the target device due to 
missing 'libomptarget-nvptx-sm_20.bc' in LIBRARY_PATH.
Index: lib/Driver/ToolChains/Cuda.cpp
===
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -529,6 +529,36 @@
 CC1Args.push_back("-target-feature");
 CC1Args.push_back("+ptx42");
   }
+
+  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+SmallVector LibraryPaths;
+if (char *env = ::getenv("LIBRARY_PATH")) {
+  StringRef CompilerPath = env;
+  while (!CompilerPath.empty()) {
+std::pair Split =
+CompilerPath.split(llvm::sys::EnvPathSeparator);
+LibraryPaths.push_back(Split.first);
+CompilerPath = Split.second;
+  }
+}
+
+std::string LibOmpTargetName =
+  "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+bool FoundBCLibrary = false;
+for (std::string LibraryPath : LibraryPaths) {
+  SmallString<128> LibOmpTargetFile(LibraryPath);
+  llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
+  if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+CC1Args.push_back("-mlink-cuda-bitcode");
+CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+FoundBCLibrary = true;
+break;
+  }
+}
+if (!FoundBCLibrary)
+  getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime)
+  << LibOmpTargetName;
+  }
 }
 
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
Index: include/clang/Basic/DiagnosticDriverKinds.td
===
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -196,6 +196,9 @@
 def warn_drv_omp_offload_target_duplicate : Warning<
   "The OpenMP offloading target '%0' is similar to target '%1' already 
specified - will be ignored.">, 
   InGroup;
+def remark_drv_omp_offload_target_missingbcruntime : Warning<
+  "Expect degraded performance on the target device due to missing '%0' in 
LIBRARY_PATH.">,
+  InGroup;
 def err_drv_bitcode_unsupported_on_toolchain : Error<
   "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
 


Index: test/Driver/unix-openmp-offload-gpu.c
===
--- /dev/null
+++ test/Driver/unix-openmp-offload-gpu.c
@@ -0,0 +1,21 @@
+///
+/// Perform several driver tests for OpenMP offloading
+///
+
+// REQUIRES: linux
+// REQUIRES: clang-driver
+// REQUIRES: x86-registered-target
+// REQUIRES: powerpc-registered-target
+// REQUIRES: nvptx-registered-target
+
+/// 

[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-02-14 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.

In https://reviews.llvm.org/D43197#1007918, @Hahnfeld wrote:

> I'm still not sure we can't run this test on Windows. I think lots of other 
> tests use `touch`, even some specific to Windows...


Let me know what you'd like me to do. I can add the test back. I do see other 
tests not worrying about this so maybe I can do the same here...


Repository:
  rC Clang

https://reviews.llvm.org/D43197



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-02-14 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 134292.
gtbercea added a comment.

Revert.


Repository:
  rC Clang

https://reviews.llvm.org/D43197

Files:
  include/clang/Basic/DiagnosticDriverKinds.td
  lib/Driver/ToolChains/Cuda.cpp
  test/Driver/openmp-offload-gpu.c


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,26 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE %s
 
 // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60
+
+/// ###
+
+/// Check that the runtime bitcode library is part of the compile line. Create 
a bogus
+/// bitcode library that will be found via the LIBRARY_PATH.
+// RUN:   touch %T/libomptarget-nvptx-sm_60.bc
+// RUN:   env LIBRARY_PATH=%T
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s
+
+// CHK-BCLIB: 
clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc
+
+/// ###
+
+/// Check that the warning is thrown when the libomptarget bitcode library is 
not found.
+/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should 
never exist.
+// RUN:   env LIBRARY_PATH=""
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck 
-check-prefix=CHK-BCLIB-WARN %s
+
+// CHK-BCLIB-WARN: Expect degraded performance on the target device due to 
missing 'libomptarget-nvptx-sm_20.bc' in LIBRARY_PATH.
Index: lib/Driver/ToolChains/Cuda.cpp
===
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -529,6 +529,36 @@
 CC1Args.push_back("-target-feature");
 CC1Args.push_back("+ptx42");
   }
+
+  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+SmallVector LibraryPaths;
+if (char *env = ::getenv("LIBRARY_PATH")) {
+  StringRef CompilerPath = env;
+  while (!CompilerPath.empty()) {
+std::pair Split =
+CompilerPath.split(llvm::sys::EnvPathSeparator);
+LibraryPaths.push_back(Split.first);
+CompilerPath = Split.second;
+  }
+}
+
+std::string LibOmpTargetName =
+  "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+bool FoundBCLibrary = false;
+for (std::string LibraryPath : LibraryPaths) {
+  SmallString<128> LibOmpTargetFile(LibraryPath);
+  llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
+  if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+CC1Args.push_back("-mlink-cuda-bitcode");
+CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+FoundBCLibrary = true;
+break;
+  }
+}
+if (!FoundBCLibrary)
+  getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime)
+  << LibOmpTargetName;
+  }
 }
 
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
Index: include/clang/Basic/DiagnosticDriverKinds.td
===
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -196,6 +196,9 @@
 def warn_drv_omp_offload_target_duplicate : Warning<
   "The OpenMP offloading target '%0' is similar to target '%1' already 
specified - will be ignored.">, 
   InGroup;
+def remark_drv_omp_offload_target_missingbcruntime : Warning<
+  "Expect degraded performance on the target device due to missing '%0' in 
LIBRARY_PATH.">,
+  InGroup;
 def err_drv_bitcode_unsupported_on_toolchain : Error<
   "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
 


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,26 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE %s
 
 // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60
+
+/// ###
+
+/// Check that the runtime bitcode library is part of the compile line. Create a bogus
+/// bitcode library that will be found via the LIBRARY_PATH.
+// RUN:   touch %T/libomptarget-nvptx-sm_60.bc
+// RUN:   env LIBRARY_PATH=%T
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps \
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s
+
+// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mli

[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-02-14 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 134295.
gtbercea added a comment.

Fix test.


Repository:
  rC Clang

https://reviews.llvm.org/D43197

Files:
  include/clang/Basic/DiagnosticDriverKinds.td
  lib/Driver/ToolChains/Cuda.cpp
  test/Driver/openmp-offload-gpu.c


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,24 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE %s
 
 // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60
+
+/// ###
+
+/// Check that the runtime bitcode library is part of the compile line. Create 
a bogus
+/// bitcode library that will be found via the LIBRARY_PATH.
+// RUN:   touch %T/libomptarget-nvptx-sm_60.bc
+// RUN:   env LIBRARY_PATH=%T %clang -### -fopenmp=libomp 
-fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s
+
+// CHK-BCLIB: 
clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc
+
+/// ###
+
+/// Check that the warning is thrown when the libomptarget bitcode library is 
not found.
+/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should 
never exist.
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck 
-check-prefix=CHK-BCLIB-WARN %s
+
+// CHK-BCLIB-WARN: Expect degraded performance on the target device due to 
missing 'libomptarget-nvptx-sm_20.bc' in LIBRARY_PATH.
Index: lib/Driver/ToolChains/Cuda.cpp
===
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -529,6 +529,36 @@
 CC1Args.push_back("-target-feature");
 CC1Args.push_back("+ptx42");
   }
+
+  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+SmallVector LibraryPaths;
+if (char *env = ::getenv("LIBRARY_PATH")) {
+  StringRef CompilerPath = env;
+  while (!CompilerPath.empty()) {
+std::pair Split =
+CompilerPath.split(llvm::sys::EnvPathSeparator);
+LibraryPaths.push_back(Split.first);
+CompilerPath = Split.second;
+  }
+}
+
+std::string LibOmpTargetName =
+  "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+bool FoundBCLibrary = false;
+for (std::string LibraryPath : LibraryPaths) {
+  SmallString<128> LibOmpTargetFile(LibraryPath);
+  llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
+  if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+CC1Args.push_back("-mlink-cuda-bitcode");
+CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+FoundBCLibrary = true;
+break;
+  }
+}
+if (!FoundBCLibrary)
+  getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime)
+  << LibOmpTargetName;
+  }
 }
 
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
Index: include/clang/Basic/DiagnosticDriverKinds.td
===
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -196,6 +196,9 @@
 def warn_drv_omp_offload_target_duplicate : Warning<
   "The OpenMP offloading target '%0' is similar to target '%1' already 
specified - will be ignored.">, 
   InGroup;
+def remark_drv_omp_offload_target_missingbcruntime : Warning<
+  "Expect degraded performance on the target device due to missing '%0' in 
LIBRARY_PATH.">,
+  InGroup;
 def err_drv_bitcode_unsupported_on_toolchain : Error<
   "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
 


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,24 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE %s
 
 // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60
+
+/// ###
+
+/// Check that the runtime bitcode library is part of the compile line. Create a bogus
+/// bitcode library that will be found via the LIBRARY_PATH.
+// RUN:   touch %T/libomptarget-nvptx-sm_60.bc
+// RUN:   env LIBRARY_PATH=%T %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps \
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s
+
+// CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc
+

[PATCH] D43625: [OpenMP] Remove implicit data sharing code gen that aims to use device shared memory

2018-02-22 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea created this revision.
gtbercea added reviewers: ABataev, carlo.bertolli, caomhin.
Herald added subscribers: cfe-commits, guansong, jholewinski.

Remove this scheme for now since it will be covered by another more generic 
scheme using global memory. This code will be worked into an optimization for 
the generic data sharing scheme. Removing this completely and then adding it 
via future patches will make all future data sharing patches cleaner.


Repository:
  rC Clang

https://reviews.llvm.org/D43625

Files:
  lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
  lib/CodeGen/CGOpenMPRuntimeNVPTX.h
  test/OpenMP/nvptx_data_sharing.cpp
  test/OpenMP/nvptx_parallel_codegen.cpp
  test/OpenMP/nvptx_target_teams_codegen.cpp

Index: test/OpenMP/nvptx_target_teams_codegen.cpp
===
--- test/OpenMP/nvptx_target_teams_codegen.cpp
+++ test/OpenMP/nvptx_target_teams_codegen.cpp
@@ -60,7 +60,7 @@
   //
   // CHECK: [[AWAIT_WORK]]
   // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args, i16 1)
+  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i16 1)
   // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
   // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
   // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
@@ -148,7 +148,7 @@
   //
   // CHECK: [[AWAIT_WORK]]
   // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args, i16 1)
+  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i16 1)
   // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
   // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
   // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
Index: test/OpenMP/nvptx_parallel_codegen.cpp
===
--- test/OpenMP/nvptx_parallel_codegen.cpp
+++ test/OpenMP/nvptx_parallel_codegen.cpp
@@ -78,7 +78,7 @@
   //
   // CHECK: [[AWAIT_WORK]]
   // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]],
+  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]
   // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
   // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
   // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
@@ -92,20 +92,20 @@
   //
   // CHECK: [[EXEC_PARALLEL]]
   // CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
-  // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN1:@.+]]_wrapper to i8*)
+  // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i32*, i32*)* [[PARALLEL_FN1:@.+]] to i8*)
   // CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]]
   //
   // CHECK: [[EXEC_PFN1]]
-  // CHECK: call void [[PARALLEL_FN1]]_wrapper(
+  // CHECK: call void [[PARALLEL_FN1]](
   // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
   //
   // CHECK: [[CHECK_NEXT1]]
   // CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
-  // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN2:@.+]]_wrapper to i8*)
+  // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i32*, i32*)* [[PARALLEL_FN2:@.+]] to i8*)
   // CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]]
   //
   // CHECK: [[EXEC_PFN2]]
-  // CHECK: call void [[PARALLEL_FN2]]_wrapper(
+  // CHECK: call void [[PARALLEL_FN2]](
   // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
   //
   // CHECK: [[CHECK_NEXT2]]
@@ -152,13 +152,13 @@
   // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
   // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
   // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
-  // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32, i8**)* [[PARALLEL_FN1]]_wrapper to i8*),
+  // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN1]] to i8*),
   // CHECK: call void @llvm.nvvm.barrier0()
   // CHECK: call void @llvm.nvvm.barrier0()
   // CHECK: call void @__kmpc_serialized_parallel(
   // CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]](
   // CHECK: call void @__kmpc_end_serialized_parallel(
-  // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32, i8**)* [[PARALLEL_FN2]]_wrapper to i8*),
+  // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN2]] to i8*),
   // CHECK: call void @llvm.nvvm.barrier0()
   // CHECK: call void @llvm.nvvm.barrier0()
   // CHECK-64-DAG: load i32, i32* [[REF_A]]
@@ -203,7 +203,7 @@
   //
   // CHECK: [[AWAIT_WORK]]
   // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]],
+  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_ker

[PATCH] D43660: [OpenMP] Add OpenMP data sharing infrastructure using global memory

2018-02-22 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea created this revision.
gtbercea added reviewers: ABataev, carlo.bertolli, caomhin, hfinkel, Hahnfeld.
Herald added subscribers: cfe-commits, guansong, jholewinski.

This patch handles the Clang code generation phase for the OpenMP data sharing 
infrastructure.

TODO: add a more detailed description.


Repository:
  rC Clang

https://reviews.llvm.org/D43660

Files:
  lib/CodeGen/CGDecl.cpp
  lib/CodeGen/CGOpenMPRuntime.cpp
  lib/CodeGen/CGOpenMPRuntime.h
  lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
  lib/CodeGen/CGOpenMPRuntimeNVPTX.h
  lib/CodeGen/CGStmtOpenMP.cpp
  lib/CodeGen/CodeGenFunction.cpp
  test/OpenMP/nvptx_parallel_codegen.cpp

Index: test/OpenMP/nvptx_parallel_codegen.cpp
===
--- test/OpenMP/nvptx_parallel_codegen.cpp
+++ test/OpenMP/nvptx_parallel_codegen.cpp
@@ -92,20 +92,20 @@
   //
   // CHECK: [[EXEC_PARALLEL]]
   // CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
-  // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i32*, i32*)* [[PARALLEL_FN1:@.+]] to i8*)
+  // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32)* [[PARALLEL_FN1:@.+]]_wrapper to i8*)
   // CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]]
   //
   // CHECK: [[EXEC_PFN1]]
-  // CHECK: call void [[PARALLEL_FN1]](
+  // CHECK: call void [[PARALLEL_FN1]]_wrapper(
   // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
   //
   // CHECK: [[CHECK_NEXT1]]
   // CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
-  // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i32*, i32*)* [[PARALLEL_FN2:@.+]] to i8*)
+  // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32)* [[PARALLEL_FN2:@.+]]_wrapper to i8*)
   // CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]]
   //
   // CHECK: [[EXEC_PFN2]]
-  // CHECK: call void [[PARALLEL_FN2]](
+  // CHECK: call void [[PARALLEL_FN2]]_wrapper(
   // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
   //
   // CHECK: [[CHECK_NEXT2]]
@@ -152,13 +152,13 @@
   // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
   // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
   // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
-  // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN1]] to i8*),
+  // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN1]]_wrapper to i8*),
   // CHECK: call void @llvm.nvvm.barrier0()
   // CHECK: call void @llvm.nvvm.barrier0()
   // CHECK: call void @__kmpc_serialized_parallel(
   // CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]](
   // CHECK: call void @__kmpc_end_serialized_parallel(
-  // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN2]] to i8*),
+  // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN2]]_wrapper to i8*),
   // CHECK: call void @llvm.nvvm.barrier0()
   // CHECK: call void @llvm.nvvm.barrier0()
   // CHECK-64-DAG: load i32, i32* [[REF_A]]
@@ -203,7 +203,7 @@
   //
   // CHECK: [[AWAIT_WORK]]
   // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]
+  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]],
   // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
   // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
   // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
@@ -217,11 +217,11 @@
   //
   // CHECK: [[EXEC_PARALLEL]]
   // CHECK: [[WF:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
-  // CHECK: [[WM:%.+]] = icmp eq i8* [[WF]], bitcast (void (i32*, i32*)* [[PARALLEL_FN4:@.+]] to i8*)
+  // CHECK: [[WM:%.+]] = icmp eq i8* [[WF]], bitcast (void (i16, i32)* [[PARALLEL_FN4:@.+]]_wrapper to i8*)
   // CHECK: br i1 [[WM]], label {{%?}}[[EXEC_PFN:.+]], label {{%?}}[[CHECK_NEXT:.+]]
   //
   // CHECK: [[EXEC_PFN]]
-  // CHECK: call void [[PARALLEL_FN4]](
+  // CHECK: call void [[PARALLEL_FN4]]_wrapper(
   // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
   //
   // CHECK: [[CHECK_NEXT]]
@@ -283,7 +283,7 @@
   // CHECK: br i1 [[CMP]], label {{%?}}[[IF_THEN:.+]], label {{%?}}[[IF_ELSE:.+]]
   //
   // CHECK: [[IF_THEN]]
-  // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN4]] to i8*),
+  // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN4]]_wrapper to i8*),
   // CHECK: call void @llvm.nvvm.barrier0()
   // CHECK: call void @llvm.nvvm.barrier0()
   // CHECK: br label {{%?}}[[IF_END:.+]]
Index: lib/CodeGen/CodeGenFunction.cpp
===
--- lib/CodeGen/CodeGenFunction.cpp
+++ lib/CodeGen/CodeGenFunction.cpp
@@ -1058,6 +1058,11 @@
   EmitStartEHSpec(CurCodeDecl);
 
   PrologueCleanupDepth = EHStack.stable_begin();
+
+  // Emit OpenMP specific initialization of the dev

[PATCH] D41485: [OpenMP][libomptarget] Add data sharing support in libomptarget

2017-12-21 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea created this revision.
gtbercea added reviewers: carlo.bertolli, ABataev, Hahnfeld, grokos, caomhin, 
hfinkel.

This patch extends the libomptarget functionality in patch 
https://reviews.llvm.org/D14254 with support for the data sharing scheme for 
supporting implicitly shared variables. The runtime therefore maintains a list 
of references to shared variables.


Repository:
  rL LLVM

https://reviews.llvm.org/D41485

Files:
  libomptarget/deviceRTLs/nvptx/src/interface.h
  libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
  libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
  libomptarget/deviceRTLs/nvptx/src/option.h
  libomptarget/deviceRTLs/nvptx/src/parallel.cu

Index: libomptarget/deviceRTLs/nvptx/src/parallel.cu
===
--- libomptarget/deviceRTLs/nvptx/src/parallel.cu
+++ libomptarget/deviceRTLs/nvptx/src/parallel.cu
@@ -210,10 +210,16 @@
 //}
 //
 // This routine is always called by the team master..
-EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, int16_t IsOMPRuntimeInitialized) {
+EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, int16_t IsOMPRuntimeInitialized,
+   void ***SharedArgs, int32_t nArgs) {
   PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");
   omptarget_nvptx_workFn = WorkFn;
 
+  if (nArgs > 0) {
+omptarget_nvptx_sharedArgs.EnsureSize(nArgs);
+*SharedArgs = omptarget_nvptx_sharedArgs.GetArgs();
+  }
+
   if (!IsOMPRuntimeInitialized) return;
 
   // This routine is only called by the team master.  The team master is
@@ -310,11 +316,13 @@
 // returns True if this thread is active, else False.
 //
 // Only the worker threads call this routine.
-EXTERN bool __kmpc_kernel_parallel(void **WorkFn, int16_t IsOMPRuntimeInitialized) {
+EXTERN bool __kmpc_kernel_parallel(void **WorkFn, int16_t IsOMPRuntimeInitialized,
+   void ***SharedArgs) {
   PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");
 
   // Work function and arguments for L1 parallel region.
   *WorkFn   = omptarget_nvptx_workFn;
+  *SharedArgs = omptarget_nvptx_sharedArgs.GetArgs();
 
   if (!IsOMPRuntimeInitialized) return true;
 
Index: libomptarget/deviceRTLs/nvptx/src/option.h
===
--- libomptarget/deviceRTLs/nvptx/src/option.h
+++ libomptarget/deviceRTLs/nvptx/src/option.h
@@ -46,6 +46,10 @@
 // to synchronize with each other.
 #define L1_BARRIER (1)
 
+// Maximum number of preallocated arguments to an outlined parallel/simd function.
+// Anything more requires dynamic memory allocation.
+#define MAX_SHARED_ARGS 20
+
 // Maximum number of omp state objects per SM allocated statically in global memory.
 #if __CUDA_ARCH__ >= 600
 #define OMP_STATE_COUNT 32
Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
===
--- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -60,6 +60,46 @@
 #define __ACTIVEMASK() __ballot(1)
 #endif
 
+// arguments needed for L0 parallelism only.
+class omptarget_nvptx_SharedArgs {
+public:
+  // All these methods must be called by the master thread only.
+  INLINE void Init() {
+args  = buffer;
+nArgs = MAX_SHARED_ARGS;
+  }
+  INLINE void DeInit() {
+// Free any memory allocated for outlined parallel function with a large
+// number of arguments.
+if (nArgs > MAX_SHARED_ARGS) {
+  SafeFree(args, (char *)"new extended args");
+  Init();
+}
+  }
+  INLINE void EnsureSize(int size) {
+if (size > nArgs) {
+  if (nArgs > MAX_SHARED_ARGS) {
+SafeFree(args, (char *)"new extended args");
+  }
+  args = (void **) SafeMalloc(size * sizeof(void *),
+  (char *)"new extended args");
+  nArgs = size;
+}
+  }
+  // Called by all threads.
+  INLINE void **GetArgs() { return args; };
+private:
+  // buffer of pre-allocated arguments.
+  void *buffer[MAX_SHARED_ARGS];
+  // pointer to arguments buffer.
+  // starts off as a pointer to 'buffer' but can be dynamically allocated.
+  void **args;
+  // starts off as MAX_SHARED_ARGS but can increase in size.
+  uint32_t nArgs;
+};
+
+extern __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_sharedArgs;
+
 // Data sharing related quantities, need to match what is used in the compiler.
 enum DATA_SHARING_SIZES {
   // The maximum number of workers in a kernel.
Index: libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
===
--- libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -32,6 +32,7 @@
 //
 extern volatile __device__ __shared__ omptarget_nvptx_WorkFn   omptarget_nvptx_workFn;
 extern __device__ __shared__ uint32_t execution_param;
+__device__ __shared__ 

[PATCH] D41486: [OpenMP][Clang] Add missing argument to runtime functions.

2017-12-21 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea created this revision.
gtbercea added reviewers: ABataev, carlo.bertolli, hfinkel, Hahnfeld, caomhin.
Herald added a subscriber: jholewinski.

This patch adds a missing argument to the runtime interface. Tests are adjusted 
accordingly.


Repository:
  rL LLVM

https://reviews.llvm.org/D41486

Files:
  lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
  test/OpenMP/nvptx_data_sharing.cpp
  test/OpenMP/nvptx_target_teams_codegen.cpp


Index: test/OpenMP/nvptx_target_teams_codegen.cpp
===
--- test/OpenMP/nvptx_target_teams_codegen.cpp
+++ test/OpenMP/nvptx_target_teams_codegen.cpp
@@ -60,7 +60,7 @@
   //
   // CHECK: [[AWAIT_WORK]]
   // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** 
[[OMP_WORK_FN]], i8*** %shared_args)
+  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** 
[[OMP_WORK_FN]], {{.*}} i8*** %shared_args)
   // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
   // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
   // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
@@ -148,7 +148,7 @@
   //
   // CHECK: [[AWAIT_WORK]]
   // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** 
[[OMP_WORK_FN]], i8*** %shared_args)
+  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** 
[[OMP_WORK_FN]], {{.*}} i8*** %shared_args)
   // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
   // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
   // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
Index: test/OpenMP/nvptx_data_sharing.cpp
===
--- test/OpenMP/nvptx_data_sharing.cpp
+++ test/OpenMP/nvptx_data_sharing.cpp
@@ -24,15 +24,15 @@
 
 // CK1: define internal void @__omp_offloading_{{.*}}test_ds{{.*}}worker() 
[[ATTR1:#.*]] {
 // CK1: [[SHAREDARGS:%.+]] = alloca i8**
-// CK1: call i1 @__kmpc_kernel_parallel(i8** %work_fn, i8*** [[SHAREDARGS]])
+// CK1: call i1 @__kmpc_kernel_parallel(i8** %work_fn, {{.*}} i8*** 
[[SHAREDARGS]])
 // CK1: [[SHARGSTMP:%.+]] = load i8**, i8*** [[SHAREDARGS]]
 // CK1: call void @__omp_outlined___wrapper{{.*}}({{.*}}, i8** [[SHARGSTMP]])
 
 /// = In the kernel function = ///
 
 // CK1: {{.*}}define void @__omp_offloading{{.*}}test_ds{{.*}}() [[ATTR2:#.*]] 
{
 // CK1: [[SHAREDARGS1:%.+]] = alloca i8**
-// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}}, i8*** 
[[SHAREDARGS1]], i32 1)
+// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}}, {{.*}} i8*** 
[[SHAREDARGS1]], i32 1)
 // CK1: [[SHARGSTMP1:%.+]] = load i8**, i8*** [[SHAREDARGS1]]
 // CK1: [[SHARGSTMP2:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP1]]
 // CK1: [[SHAREDVAR:%.+]] = bitcast i32* {{.*}} to i8*
Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
===
--- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -521,7 +521,8 @@
   // Set up shared arguments
   Address SharedArgs =
   CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrPtrTy, "shared_args");
-  llvm::Value *Args[] = {WorkFn.getPointer(), SharedArgs.getPointer()};
+  llvm::Value *Args[] = {WorkFn.getPointer(), Bld.getInt16(1),
+ SharedArgs.getPointer()};
   llvm::Value *Ret = CGF.EmitRuntimeCall(
   createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
   Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);
@@ -638,16 +639,16 @@
   case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
 /// Build void __kmpc_kernel_prepare_parallel(
 /// void *outlined_function, void ***args, kmp_int32 nArgs);
-llvm::Type *TypeParams[] = {CGM.Int8PtrTy,
+llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty,
 CGM.Int8PtrPtrTy->getPointerTo(0), CGM.Int32Ty};
 llvm::FunctionType *FnTy =
 llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
 break;
   }
   case OMPRTL_NVPTX__kmpc_kernel_parallel: {
 /// Build bool __kmpc_kernel_parallel(void **outlined_function, void 
***args);
-llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy,
+llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty,
 CGM.Int8PtrPtrTy->getPointerTo(0)};
 llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
 llvm::FunctionType *FnTy =
@@ -949,7 +950,7 @@
   CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy,
   "shared_args");
   llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
-  llvm::Value *Args[] = {ID, SharedArgsPtr,
+  llvm::Value *Args[] = {ID, Bld.getInt16(1), SharedArgsPtr,
  Bld.getInt32(CapturedVars.size())};
 
   CGF.EmitRuntimeCall(
@@ -970,7 +971,7 @@
 Idx++;
   }
 } else {
-  llvm::Value *Args[] = {ID,
+  llvm::Value *

[PATCH] D41486: [OpenMP][Clang] Add missing argument to runtime functions.

2017-12-21 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.

In https://reviews.llvm.org/D41486#961981, @Hahnfeld wrote:

> https://reviews.llvm.org/D41012? This patch doesn't update the documentation 
> with function signatures.


Ok so I see that your patch uses a different order of the arguments. I've just 
added the data sharing related arguments at the end and this matches the 
libomptarget patch I just posted. Which way do we want to do this?


Repository:
  rL LLVM

https://reviews.llvm.org/D41486



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D41486: [OpenMP][Clang] Add missing argument to runtime functions.

2017-12-21 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 127865.
gtbercea added a comment.

Address comments.


Repository:
  rL LLVM

https://reviews.llvm.org/D41486

Files:
  lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
  test/OpenMP/nvptx_data_sharing.cpp
  test/OpenMP/nvptx_target_teams_codegen.cpp

Index: test/OpenMP/nvptx_target_teams_codegen.cpp
===
--- test/OpenMP/nvptx_target_teams_codegen.cpp
+++ test/OpenMP/nvptx_target_teams_codegen.cpp
@@ -60,7 +60,7 @@
   //
   // CHECK: [[AWAIT_WORK]]
   // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args)
+  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i16 1, i8*** %shared_args)
   // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
   // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
   // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
@@ -148,7 +148,7 @@
   //
   // CHECK: [[AWAIT_WORK]]
   // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args)
+  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i16 1, i8*** %shared_args)
   // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
   // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
   // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
Index: test/OpenMP/nvptx_data_sharing.cpp
===
--- test/OpenMP/nvptx_data_sharing.cpp
+++ test/OpenMP/nvptx_data_sharing.cpp
@@ -24,15 +24,15 @@
 
 // CK1: define internal void @__omp_offloading_{{.*}}test_ds{{.*}}worker() [[ATTR1:#.*]] {
 // CK1: [[SHAREDARGS:%.+]] = alloca i8**
-// CK1: call i1 @__kmpc_kernel_parallel(i8** %work_fn, i8*** [[SHAREDARGS]])
+// CK1: call i1 @__kmpc_kernel_parallel(i8** %work_fn, i16 1, i8*** [[SHAREDARGS]])
 // CK1: [[SHARGSTMP:%.+]] = load i8**, i8*** [[SHAREDARGS]]
 // CK1: call void @__omp_outlined___wrapper{{.*}}({{.*}}, i8** [[SHARGSTMP]])
 
 /// = In the kernel function = ///
 
 // CK1: {{.*}}define void @__omp_offloading{{.*}}test_ds{{.*}}() [[ATTR2:#.*]] {
 // CK1: [[SHAREDARGS1:%.+]] = alloca i8**
-// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}}, i8*** [[SHAREDARGS1]], i32 1)
+// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}}, i16 1, i8*** [[SHAREDARGS1]], i32 1)
 // CK1: [[SHARGSTMP1:%.+]] = load i8**, i8*** [[SHAREDARGS1]]
 // CK1: [[SHARGSTMP2:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP1]]
 // CK1: [[SHAREDVAR:%.+]] = bitcast i32* {{.*}} to i8*
Index: lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
===
--- lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -33,10 +33,11 @@
   /// \brief Call to void __kmpc_spmd_kernel_deinit();
   OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
   /// \brief Call to void __kmpc_kernel_prepare_parallel(void
-  /// *outlined_function, void ***args, kmp_int32 nArgs);
+  /// *outlined_function, int16_t IsOMPRuntimeInitialized,
+  /// void ***args, kmp_int32 nArgs);
   OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
-  /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function, void
-  /// ***args);
+  /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function,
+  /// int16_t IsOMPRuntimeInitialized, void ***args);
   OMPRTL_NVPTX__kmpc_kernel_parallel,
   /// \brief Call to void __kmpc_kernel_end_parallel();
   OMPRTL_NVPTX__kmpc_kernel_end_parallel,
@@ -521,7 +522,9 @@
   // Set up shared arguments
   Address SharedArgs =
   CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrPtrTy, "shared_args");
-  llvm::Value *Args[] = {WorkFn.getPointer(), SharedArgs.getPointer()};
+  llvm::Value *Args[] = {WorkFn.getPointer(),
+ /*IsOMPRuntimeInitialized*/ Bld.getInt16(1),
+ SharedArgs.getPointer()};
   llvm::Value *Ret = CGF.EmitRuntimeCall(
   createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
   Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);
@@ -638,16 +641,16 @@
   case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
 /// Build void __kmpc_kernel_prepare_parallel(
 /// void *outlined_function, void ***args, kmp_int32 nArgs);
-llvm::Type *TypeParams[] = {CGM.Int8PtrTy,
+llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty,
 CGM.Int8PtrPtrTy->getPointerTo(0), CGM.Int32Ty};
 llvm::FunctionType *FnTy =
 llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
 break;
   }
   case OMPRTL_NVPTX__kmpc_kernel_parallel: {
 /// Build bool __kmpc_kernel_parallel(void **outlined_function, void ***args);
-llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy,
+llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty,
 C

[PATCH] D40451: [OpenMP] Add function attribute for triggering shared memory lowering in the LLVM backend

2017-12-21 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea closed this revision.
gtbercea added a comment.

Committed here https://reviews.llvm.org/D41123


Repository:
  rL LLVM

https://reviews.llvm.org/D40451



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D41486: [OpenMP][Clang] Add missing argument to runtime functions.

2018-01-03 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea abandoned this revision.
gtbercea added a comment.

Functionality already landed. See previous comment.


Repository:
  rL LLVM

https://reviews.llvm.org/D41486



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D43660: [OpenMP] Add OpenMP data sharing infrastructure using global memory

2018-03-01 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 136528.

Repository:
  rC Clang

https://reviews.llvm.org/D43660

Files:
  lib/CodeGen/CGDecl.cpp
  lib/CodeGen/CGOpenMPRuntime.cpp
  lib/CodeGen/CGOpenMPRuntime.h
  lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
  lib/CodeGen/CGOpenMPRuntimeNVPTX.h
  lib/CodeGen/CGStmtOpenMP.cpp
  lib/CodeGen/CodeGenFunction.cpp
  test/OpenMP/nvptx_data_sharing.cpp
  test/OpenMP/nvptx_parallel_codegen.cpp

Index: test/OpenMP/nvptx_parallel_codegen.cpp
===
--- test/OpenMP/nvptx_parallel_codegen.cpp
+++ test/OpenMP/nvptx_parallel_codegen.cpp
@@ -64,254 +64,243 @@
 
   // CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l17}}_worker()
 
+// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}_worker()
+// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
+// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
+// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
+// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
+// CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
+//
+// CHECK: [[AWAIT_WORK]]
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]
+// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
+// store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
+// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
+// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
+//
+// CHECK: [[SEL_WORKERS]]
+// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]]
+// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
+// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
+//
+// CHECK: [[EXEC_PARALLEL]]
+// CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+// CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32)* [[PARALLEL_FN1:@.+]]_wrapper to i8*)
+// CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]]
+//
+// CHECK: [[EXEC_PFN1]]
+// CHECK: call void [[PARALLEL_FN1]]_wrapper(
+// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+//
+// CHECK: [[CHECK_NEXT1]]
+// CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+// CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32)* [[PARALLEL_FN2:@.+]]_wrapper to i8*)
+// CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]]
+//
+// CHECK: [[EXEC_PFN2]]
+// CHECK: call void [[PARALLEL_FN2]]_wrapper(
+// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+//
+// CHECK: [[CHECK_NEXT2]]
+// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+//
+// CHECK: [[TERM_PARALLEL]]
+// CHECK: call void @__kmpc_kernel_end_parallel()
+// CHECK: br label {{%?}}[[BAR_PARALLEL]]
+//
+// CHECK: [[BAR_PARALLEL]]
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: br label {{%?}}[[AWAIT_WORK]]
+//
+// CHECK: [[EXIT]]
+// CHECK: ret void
 
+// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l26]](i[[SZ:32|64]]
+// Create local storage for each capture.
+// CHECK:  [[LOCAL_A:%.+]] = alloca i[[SZ]],
+// CHECK-DAG:  store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
+// Store captures in the context.
+// CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
+//
+// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
+// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
+// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
+//
+// CHECK: [[WORKER]]
+// CHECK: {{call|invoke}} void [[T6]]_worker()
+// CHECK: br label {{%?}}[[EXIT:.+]]
+//
+// CHECK: [[CHECK_MASTER]]
+// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
+// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
+//
+// CHECK: [[MASTER]]
+// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
+// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
+// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN1]]_wrapper to i8*),
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @__kmpc_serialized_parallel(
+// CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]](
+// CHECK: call void @__kmpc_end_serialized_parallel(
+// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN2]]_wrapper to i8*),
+// CHECK: call void @

[PATCH] D43625: [OpenMP] Remove implicit data sharing code gen that aims to use device shared memory

2018-03-01 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 136570.
gtbercea added a comment.

Add Source location.


Repository:
  rC Clang

https://reviews.llvm.org/D43625

Files:
  lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
  lib/CodeGen/CGOpenMPRuntimeNVPTX.h
  test/OpenMP/nvptx_data_sharing.cpp
  test/OpenMP/nvptx_parallel_codegen.cpp
  test/OpenMP/nvptx_target_teams_codegen.cpp

Index: test/OpenMP/nvptx_target_teams_codegen.cpp
===
--- test/OpenMP/nvptx_target_teams_codegen.cpp
+++ test/OpenMP/nvptx_target_teams_codegen.cpp
@@ -60,7 +60,7 @@
   //
   // CHECK: [[AWAIT_WORK]]
   // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args, i16 1)
+  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i16 1)
   // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
   // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
   // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
@@ -146,7 +146,7 @@
   //
   // CHECK: [[AWAIT_WORK]]
   // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i8*** %shared_args, i16 1)
+  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]], i16 1)
   // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
   // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
   // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
Index: test/OpenMP/nvptx_parallel_codegen.cpp
===
--- test/OpenMP/nvptx_parallel_codegen.cpp
+++ test/OpenMP/nvptx_parallel_codegen.cpp
@@ -78,7 +78,7 @@
   //
   // CHECK: [[AWAIT_WORK]]
   // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]],
+  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]
   // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
   // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
   // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
@@ -92,20 +92,20 @@
   //
   // CHECK: [[EXEC_PARALLEL]]
   // CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
-  // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN1:@.+]]_wrapper to i8*)
+  // CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i32*, i32*)* [[PARALLEL_FN1:@.+]] to i8*)
   // CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]]
   //
   // CHECK: [[EXEC_PFN1]]
-  // CHECK: call void [[PARALLEL_FN1]]_wrapper(
+  // CHECK: call void [[PARALLEL_FN1]](
   // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
   //
   // CHECK: [[CHECK_NEXT1]]
   // CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
-  // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32, i8**)* [[PARALLEL_FN2:@.+]]_wrapper to i8*)
+  // CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i32*, i32*)* [[PARALLEL_FN2:@.+]] to i8*)
   // CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]]
   //
   // CHECK: [[EXEC_PFN2]]
-  // CHECK: call void [[PARALLEL_FN2]]_wrapper(
+  // CHECK: call void [[PARALLEL_FN2]](
   // CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
   //
   // CHECK: [[CHECK_NEXT2]]
@@ -152,13 +152,13 @@
   // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
   // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
   // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
-  // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32, i8**)* [[PARALLEL_FN1]]_wrapper to i8*),
+  // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN1]] to i8*),
   // CHECK: call void @llvm.nvvm.barrier0()
   // CHECK: call void @llvm.nvvm.barrier0()
   // CHECK: call void @__kmpc_serialized_parallel(
   // CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]](
   // CHECK: call void @__kmpc_end_serialized_parallel(
-  // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32, i8**)* [[PARALLEL_FN2]]_wrapper to i8*),
+  // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i32*, i32*)* [[PARALLEL_FN2]] to i8*),
   // CHECK: call void @llvm.nvvm.barrier0()
   // CHECK: call void @llvm.nvvm.barrier0()
   // CHECK-64-DAG: load i32, i32* [[REF_A]]
@@ -203,7 +203,7 @@
   //
   // CHECK: [[AWAIT_WORK]]
   // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]],
+  // CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]
   // CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
   // store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
   // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
@@ -217,11 +217,11 @@
   //
   // CHECK: [[EXEC_PARALLEL]]
   // CHECK: [[WF:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
-  // CHECK: [[WM:%.+]] = icmp eq i8* [[WF]],

[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-03-05 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added a comment.

In https://reviews.llvm.org/D43197#1011256, @Hahnfeld wrote:

> Looking more closely at the patch, this doesn't seem to look into the `lib` / 
> `lib64` next to the compiler. I'm not sure if `LIBRARY_PATH` is set for every 
> installation, so I think we should add this one to catch the obvious case. 
> This probably needs some attention for the tests because they'll find the 
> just-built libraries...


The contract with the user us that the .bc lib needs to be in LIBRARY_PATH, 
this is what we require today.


Repository:
  rC Clang

https://reviews.llvm.org/D43197



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-03-05 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added inline comments.



Comment at: lib/Driver/ToolChains/Cuda.cpp:536-542
+  StringRef CompilerPath = env;
+  while (!CompilerPath.empty()) {
+std::pair Split =
+CompilerPath.split(llvm::sys::EnvPathSeparator);
+LibraryPaths.push_back(Split.first);
+CompilerPath = Split.second;
+  }

Hahnfeld wrote:
> `tools::addDirectoryList` uses `StringRef::find`, I'm not sure if 
> `StringRef::split` creates real copies of the string...
What is your suggestion?


Repository:
  rC Clang

https://reviews.llvm.org/D43197



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-03-06 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 137203.
gtbercea added a comment.

Address comments.


Repository:
  rC Clang

https://reviews.llvm.org/D43197

Files:
  include/clang/Basic/DiagnosticDriverKinds.td
  lib/Driver/ToolChains/Cuda.cpp
  test/Driver/openmp-offload-gpu.c


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,25 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE %s
 
 // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60
+
+/// ###
+
+/// Check that the runtime bitcode library is part of the compile line. Create 
a bogus
+/// bitcode library and add it to the LIBRARY_PATH.
+// RUN:   touch %T/libomptarget-nvptx-sm_60.bc
+// RUN:   env LIBRARY_PATH=%T %clang -### -fopenmp=libomp 
-fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s
+// RUN:   rm %T/libomptarget-nvptx-sm_60.bc
+
+// CHK-BCLIB: 
clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc
+
+/// ###
+
+/// Check that the warning is thrown when the libomptarget bitcode library is 
not found.
+/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should 
never exist.
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck 
-check-prefix=CHK-BCLIB-WARN %s
+
+// CHK-BCLIB-WARN: Expect degraded performance on the target device due to 
missing 'libomptarget-nvptx-sm_20.bc' in LIBRARY_PATH.
Index: lib/Driver/ToolChains/Cuda.cpp
===
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
 #include "llvm/Support/Program.h"
 #include 
 
@@ -580,6 +581,42 @@
 CC1Args.push_back("-target-feature");
 CC1Args.push_back("+ptx42");
   }
+
+  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+SmallVector LibraryPaths;
+// Add path to lib and/or lib64 folders.
+SmallString<256> DefaultLibPath =
+  llvm::sys::path::parent_path(getDriver().Dir);
+llvm::sys::path::append(DefaultLibPath, "lib" CLANG_LIBDIR_SUFFIX);
+LibraryPaths.emplace_back(DriverArgs.MakeArgString(DefaultLibPath));
+
+// Add user defined library paths from LIBRARY_PATH.
+if (llvm::Optional LibPath =
+  llvm::sys::Process::GetEnv("LIBRARY_PATH")) {
+  SmallVector Frags;
+  llvm::SplitString(*LibPath, Frags,
+  StringRef(&(llvm::sys::EnvPathSeparator)));
+  for (auto Path : Frags)
+LibraryPaths.emplace_back(Path.trim());
+}
+
+std::string LibOmpTargetName =
+  "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+bool FoundBCLibrary = false;
+for (const std::string &LibraryPath : LibraryPaths) {
+  SmallString<128> LibOmpTargetFile(LibraryPath);
+  llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
+  if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+CC1Args.push_back("-mlink-cuda-bitcode");
+CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+FoundBCLibrary = true;
+break;
+  }
+}
+if (!FoundBCLibrary)
+  getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime)
+  << LibOmpTargetName;
+  }
 }
 
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
Index: include/clang/Basic/DiagnosticDriverKinds.td
===
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -203,6 +203,9 @@
 def warn_drv_omp_offload_target_duplicate : Warning<
   "The OpenMP offloading target '%0' is similar to target '%1' already 
specified - will be ignored.">, 
   InGroup;
+def remark_drv_omp_offload_target_missingbcruntime : Warning<
+  "Expect degraded performance on the target device due to missing '%0' in 
LIBRARY_PATH.">,
+  InGroup;
 def err_drv_bitcode_unsupported_on_toolchain : Error<
   "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
 


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,25 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE %s
 
 // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60
+
+/// 

[PATCH] D43660: [OpenMP] Add OpenMP data sharing infrastructure using global memory

2018-03-06 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 137210.
gtbercea added a comment.

Add init stack function.


Repository:
  rC Clang

https://reviews.llvm.org/D43660

Files:
  lib/CodeGen/CGDecl.cpp
  lib/CodeGen/CGOpenMPRuntime.cpp
  lib/CodeGen/CGOpenMPRuntime.h
  lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
  lib/CodeGen/CGOpenMPRuntimeNVPTX.h
  lib/CodeGen/CGStmtOpenMP.cpp
  lib/CodeGen/CodeGenFunction.cpp
  test/OpenMP/nvptx_data_sharing.cpp
  test/OpenMP/nvptx_parallel_codegen.cpp

Index: test/OpenMP/nvptx_parallel_codegen.cpp
===
--- test/OpenMP/nvptx_parallel_codegen.cpp
+++ test/OpenMP/nvptx_parallel_codegen.cpp
@@ -64,254 +64,243 @@
 
   // CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l17}}_worker()
 
+// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}_worker()
+// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
+// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
+// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
+// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
+// CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
+//
+// CHECK: [[AWAIT_WORK]]
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]
+// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
+// store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
+// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
+// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
+//
+// CHECK: [[SEL_WORKERS]]
+// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]]
+// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
+// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
+//
+// CHECK: [[EXEC_PARALLEL]]
+// CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+// CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32)* [[PARALLEL_FN1:@.+]]_wrapper to i8*)
+// CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]]
+//
+// CHECK: [[EXEC_PFN1]]
+// CHECK: call void [[PARALLEL_FN1]]_wrapper(
+// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+//
+// CHECK: [[CHECK_NEXT1]]
+// CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+// CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32)* [[PARALLEL_FN2:@.+]]_wrapper to i8*)
+// CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]]
+//
+// CHECK: [[EXEC_PFN2]]
+// CHECK: call void [[PARALLEL_FN2]]_wrapper(
+// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+//
+// CHECK: [[CHECK_NEXT2]]
+// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+//
+// CHECK: [[TERM_PARALLEL]]
+// CHECK: call void @__kmpc_kernel_end_parallel()
+// CHECK: br label {{%?}}[[BAR_PARALLEL]]
+//
+// CHECK: [[BAR_PARALLEL]]
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: br label {{%?}}[[AWAIT_WORK]]
+//
+// CHECK: [[EXIT]]
+// CHECK: ret void
 
+// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l26]](i[[SZ:32|64]]
+// Create local storage for each capture.
+// CHECK:  [[LOCAL_A:%.+]] = alloca i[[SZ]],
+// CHECK-DAG:  store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
+// Store captures in the context.
+// CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
+//
+// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
+// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
+// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
+//
+// CHECK: [[WORKER]]
+// CHECK: {{call|invoke}} void [[T6]]_worker()
+// CHECK: br label {{%?}}[[EXIT:.+]]
+//
+// CHECK: [[CHECK_MASTER]]
+// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
+// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
+//
+// CHECK: [[MASTER]]
+// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
+// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
+// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN1]]_wrapper to i8*),
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @__kmpc_serialized_parallel(
+// CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]](
+// CHECK: call void @__kmpc_end_serialized_parallel(
+// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[P

[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-03-06 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 137219.
gtbercea added a comment.

Address comments.


Repository:
  rC Clang

https://reviews.llvm.org/D43197

Files:
  include/clang/Basic/DiagnosticDriverKinds.td
  lib/Driver/ToolChains/Cuda.cpp
  test/Driver/openmp-offload-gpu.c


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,25 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE %s
 
 // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60
+
+/// ###
+
+/// Check that the runtime bitcode library is part of the compile line. Create 
a bogus
+/// bitcode library and add it to the LIBRARY_PATH.
+// RUN:   touch %T/libomptarget-nvptx-sm_60.bc
+// RUN:   env LIBRARY_PATH=%T %clang -### -fopenmp=libomp 
-fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s
+// RUN:   rm %T/libomptarget-nvptx-sm_60.bc
+
+// CHK-BCLIB: 
clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc
+
+/// ###
+
+/// Check that the warning is thrown when the libomptarget bitcode library is 
not found.
+/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should 
never exist.
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck 
-check-prefix=CHK-BCLIB-WARN %s
+
+// CHK-BCLIB-WARN: Expect degraded performance on the target device due to 
missing 'libomptarget-nvptx-sm_20.bc' in LIBRARY_PATH.
Index: lib/Driver/ToolChains/Cuda.cpp
===
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
 #include "llvm/Support/Program.h"
 #include 
 
@@ -580,6 +581,43 @@
 CC1Args.push_back("-target-feature");
 CC1Args.push_back("+ptx42");
   }
+
+  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+SmallVector LibraryPaths;
+// Add path to lib and/or lib64 folders.
+SmallString<256> DefaultLibPath =
+  llvm::sys::path::parent_path(getDriver().Dir);
+llvm::sys::path::append(DefaultLibPath,
+Twine("lib") + CLANG_LIBDIR_SUFFIX);
+LibraryPaths.emplace_back(DefaultLibPath.c_str());
+
+// Add user defined library paths from LIBRARY_PATH.
+if (llvm::Optional LibPath =
+  llvm::sys::Process::GetEnv("LIBRARY_PATH")) {
+  SmallVector Frags;
+  const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator, '\0'};
+  llvm::SplitString(*LibPath, Frags, EnvPathSeparatorStr);
+  for (auto Path : Frags)
+LibraryPaths.emplace_back(Path.trim());
+}
+
+std::string LibOmpTargetName =
+  "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+bool FoundBCLibrary = false;
+for (const std::string &LibraryPath : LibraryPaths) {
+  SmallString<128> LibOmpTargetFile(LibraryPath);
+  llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
+  if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+CC1Args.push_back("-mlink-cuda-bitcode");
+CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+FoundBCLibrary = true;
+break;
+  }
+}
+if (!FoundBCLibrary)
+  getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime)
+  << LibOmpTargetName;
+  }
 }
 
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
Index: include/clang/Basic/DiagnosticDriverKinds.td
===
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -203,6 +203,9 @@
 def warn_drv_omp_offload_target_duplicate : Warning<
   "The OpenMP offloading target '%0' is similar to target '%1' already 
specified - will be ignored.">, 
   InGroup;
+def remark_drv_omp_offload_target_missingbcruntime : Warning<
+  "Expect degraded performance on the target device due to missing '%0' in 
LIBRARY_PATH.">,
+  InGroup;
 def err_drv_bitcode_unsupported_on_toolchain : Error<
   "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
 


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,25 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE %s
 
 // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60
+
+/// ##

[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-03-06 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 137226.
gtbercea added a comment.

Address comments.


Repository:
  rC Clang

https://reviews.llvm.org/D43197

Files:
  include/clang/Basic/DiagnosticDriverKinds.td
  lib/Driver/ToolChains/Cuda.cpp
  test/Driver/openmp-offload-gpu.c


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,25 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE %s
 
 // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60
+
+/// ###
+
+/// Check that the runtime bitcode library is part of the compile line. Create 
a bogus
+/// bitcode library and add it to the LIBRARY_PATH.
+// RUN:   touch %T/libomptarget-nvptx-sm_60.bc
+// RUN:   env LIBRARY_PATH=%T %clang -### -fopenmp=libomp 
-fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s
+// RUN:   rm %T/libomptarget-nvptx-sm_60.bc
+
+// CHK-BCLIB: 
clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc
+
+/// ###
+
+/// Check that the warning is thrown when the libomptarget bitcode library is 
not found.
+/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should 
never exist.
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck 
-check-prefix=CHK-BCLIB-WARN %s
+
+// CHK-BCLIB-WARN: Expect degraded performance on the target device due to 
missing 'libomptarget-nvptx-sm_20.bc' in LIBRARY_PATH.
Index: lib/Driver/ToolChains/Cuda.cpp
===
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
 #include "llvm/Support/Program.h"
 #include 
 
@@ -580,6 +581,43 @@
 CC1Args.push_back("-target-feature");
 CC1Args.push_back("+ptx42");
   }
+
+  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+SmallVector LibraryPaths;
+// Add path to lib and/or lib64 folders.
+SmallString<256> DefaultLibPath =
+  llvm::sys::path::parent_path(getDriver().Dir);
+llvm::sys::path::append(DefaultLibPath,
+Twine("lib") + CLANG_LIBDIR_SUFFIX);
+LibraryPaths.emplace_back(DefaultLibPath.c_str());
+
+// Add user defined library paths from LIBRARY_PATH.
+if (llvm::Optional LibPath =
+  llvm::sys::Process::GetEnv("LIBRARY_PATH")) {
+  SmallVector Frags;
+  const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator, '\0'};
+  llvm::SplitString(*LibPath, Frags, EnvPathSeparatorStr);
+  for (auto Path : Frags)
+LibraryPaths.emplace_back(Path.trim());
+}
+
+std::string LibOmpTargetName =
+  "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+bool FoundBCLibrary = false;
+for (const std::string &LibraryPath : LibraryPaths) {
+  SmallString<128> LibOmpTargetFile(LibraryPath);
+  llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
+  if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+CC1Args.push_back("-mlink-cuda-bitcode");
+CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+FoundBCLibrary = true;
+break;
+  }
+}
+if (!FoundBCLibrary)
+  getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime)
+  << LibOmpTargetName;
+  }
 }
 
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
Index: include/clang/Basic/DiagnosticDriverKinds.td
===
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -203,6 +203,9 @@
 def warn_drv_omp_offload_target_duplicate : Warning<
   "The OpenMP offloading target '%0' is similar to target '%1' already 
specified - will be ignored.">, 
   InGroup;
+def remark_drv_omp_offload_target_missingbcruntime : Warning<
+  "No .bc library found in the default clang lib directory or in LIBRARY_PATH. 
Expect degraded performance due to no inlining of runtime functions on target 
devices.">,
+  InGroup;
 def err_drv_bitcode_unsupported_on_toolchain : Error<
   "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
 


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,25 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE

[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-03-06 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 137230.
gtbercea added a comment.

Fix test.


Repository:
  rC Clang

https://reviews.llvm.org/D43197

Files:
  include/clang/Basic/DiagnosticDriverKinds.td
  lib/Driver/ToolChains/Cuda.cpp
  test/Driver/openmp-offload-gpu.c


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,25 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE %s
 
 // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60
+
+/// ###
+
+/// Check that the runtime bitcode library is part of the compile line. Create 
a bogus
+/// bitcode library and add it to the LIBRARY_PATH.
+// RUN:   touch %T/libomptarget-nvptx-sm_60.bc
+// RUN:   env LIBRARY_PATH=%T %clang -### -fopenmp=libomp 
-fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s
+// RUN:   rm %T/libomptarget-nvptx-sm_60.bc
+
+// CHK-BCLIB: 
clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc
+
+/// ###
+
+/// Check that the warning is thrown when the libomptarget bitcode library is 
not found.
+/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should 
never exist.
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck 
-check-prefix=CHK-BCLIB-WARN %s
+
+// CHK-BCLIB-WARN: No .bc library 'libomptarget-nvptx-sm_20.bc' found in the 
default clang lib directory or in LIBRARY_PATH. Expect degraded performance due 
to no inlining of runtime functions on target devices.
Index: lib/Driver/ToolChains/Cuda.cpp
===
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
 #include "llvm/Support/Program.h"
 #include 
 
@@ -580,6 +581,43 @@
 CC1Args.push_back("-target-feature");
 CC1Args.push_back("+ptx42");
   }
+
+  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+SmallVector LibraryPaths;
+// Add path to lib and/or lib64 folders.
+SmallString<256> DefaultLibPath =
+  llvm::sys::path::parent_path(getDriver().Dir);
+llvm::sys::path::append(DefaultLibPath,
+Twine("lib") + CLANG_LIBDIR_SUFFIX);
+LibraryPaths.emplace_back(DefaultLibPath.c_str());
+
+// Add user defined library paths from LIBRARY_PATH.
+if (llvm::Optional LibPath =
+  llvm::sys::Process::GetEnv("LIBRARY_PATH")) {
+  SmallVector Frags;
+  const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator, '\0'};
+  llvm::SplitString(*LibPath, Frags, EnvPathSeparatorStr);
+  for (auto Path : Frags)
+LibraryPaths.emplace_back(Path.trim());
+}
+
+std::string LibOmpTargetName =
+  "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+bool FoundBCLibrary = false;
+for (const std::string &LibraryPath : LibraryPaths) {
+  SmallString<128> LibOmpTargetFile(LibraryPath);
+  llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
+  if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+CC1Args.push_back("-mlink-cuda-bitcode");
+CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+FoundBCLibrary = true;
+break;
+  }
+}
+if (!FoundBCLibrary)
+  getDriver().Diag(diag::remark_drv_omp_offload_target_missingbcruntime)
+  << LibOmpTargetName;
+  }
 }
 
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
Index: include/clang/Basic/DiagnosticDriverKinds.td
===
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -203,6 +203,9 @@
 def warn_drv_omp_offload_target_duplicate : Warning<
   "The OpenMP offloading target '%0' is similar to target '%1' already 
specified - will be ignored.">, 
   InGroup;
+def remark_drv_omp_offload_target_missingbcruntime : Warning<
+  "No .bc library '%0' found in the default clang lib directory or in 
LIBRARY_PATH. Expect degraded performance due to no inlining of runtime 
functions on target devices.">,
+  InGroup;
 def err_drv_bitcode_unsupported_on_toolchain : Error<
   "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
 


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload

[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-03-06 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 137233.
gtbercea added a comment.

- Fix message and test.


Repository:
  rC Clang

https://reviews.llvm.org/D43197

Files:
  include/clang/Basic/DiagnosticDriverKinds.td
  lib/Driver/ToolChains/Cuda.cpp
  test/Driver/openmp-offload-gpu.c


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,25 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE %s
 
 // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60
+
+/// ###
+
+/// Check that the runtime bitcode library is part of the compile line. Create 
a bogus
+/// bitcode library and add it to the LIBRARY_PATH.
+// RUN:   touch %T/libomptarget-nvptx-sm_60.bc
+// RUN:   env LIBRARY_PATH=%T %clang -### -fopenmp=libomp 
-fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s
+// RUN:   rm %T/libomptarget-nvptx-sm_60.bc
+
+// CHK-BCLIB: 
clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc
+
+/// ###
+
+/// Check that the warning is thrown when the libomptarget bitcode library is 
not found.
+/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should 
never exist.
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck 
-check-prefix=CHK-BCLIB-WARN %s
+
+// CHK-BCLIB-WARN: No library 'libomptarget-nvptx-sm_20.bc' found in the 
default clang lib directory or in LIBRARY_PATH. Expect degraded performance due 
to no inlining of runtime functions on target devices.
Index: lib/Driver/ToolChains/Cuda.cpp
===
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
 #include "llvm/Support/Program.h"
 #include 
 
@@ -580,6 +581,43 @@
 CC1Args.push_back("-target-feature");
 CC1Args.push_back("+ptx42");
   }
+
+  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+SmallVector LibraryPaths;
+// Add path to lib and/or lib64 folders.
+SmallString<256> DefaultLibPath =
+  llvm::sys::path::parent_path(getDriver().Dir);
+llvm::sys::path::append(DefaultLibPath,
+Twine("lib") + CLANG_LIBDIR_SUFFIX);
+LibraryPaths.emplace_back(DefaultLibPath.c_str());
+
+// Add user defined library paths from LIBRARY_PATH.
+if (llvm::Optional LibPath =
+  llvm::sys::Process::GetEnv("LIBRARY_PATH")) {
+  SmallVector Frags;
+  const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator, '\0'};
+  llvm::SplitString(*LibPath, Frags, EnvPathSeparatorStr);
+  for (auto Path : Frags)
+LibraryPaths.emplace_back(Path.trim());
+}
+
+std::string LibOmpTargetName =
+  "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+bool FoundBCLibrary = false;
+for (const std::string &LibraryPath : LibraryPaths) {
+  SmallString<128> LibOmpTargetFile(LibraryPath);
+  llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
+  if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+CC1Args.push_back("-mlink-cuda-bitcode");
+CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+FoundBCLibrary = true;
+break;
+  }
+}
+if (!FoundBCLibrary)
+  getDriver().Diag(diag::warn_drv_omp_offload_target_missingbcruntime)
+  << LibOmpTargetName;
+  }
 }
 
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
Index: include/clang/Basic/DiagnosticDriverKinds.td
===
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -203,6 +203,9 @@
 def warn_drv_omp_offload_target_duplicate : Warning<
   "The OpenMP offloading target '%0' is similar to target '%1' already 
specified - will be ignored.">, 
   InGroup;
+def warn_drv_omp_offload_target_missingbcruntime : Warning<
+  "No library '%0' found in the default clang lib directory or in 
LIBRARY_PATH. Expect degraded performance due to no inlining of runtime 
functions on target devices.">,
+  InGroup;
 def err_drv_bitcode_unsupported_on_toolchain : Error<
   "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
 


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offlo

[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-03-08 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added inline comments.



Comment at: lib/Driver/ToolChains/Cuda.cpp:536-542
+  StringRef CompilerPath = env;
+  while (!CompilerPath.empty()) {
+std::pair Split =
+CompilerPath.split(llvm::sys::EnvPathSeparator);
+LibraryPaths.push_back(Split.first);
+CompilerPath = Split.second;
+  }

Hahnfeld wrote:
> gtbercea wrote:
> > Hahnfeld wrote:
> > > `tools::addDirectoryList` uses `StringRef::find`, I'm not sure if 
> > > `StringRef::split` creates real copies of the string...
> > What is your suggestion?
> IMO you should use whatever existing code does, in that case 
> `StringRef::find`.
Is this comment still relevant in the light of the most recent changes?


Repository:
  rC Clang

https://reviews.llvm.org/D43197



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D43660: [OpenMP] Add OpenMP data sharing infrastructure using global memory

2018-03-08 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 137600.
gtbercea added a comment.

Patch splitting: limit support in this patch to standalone target regions only. 
Support for combined directives will be fully covered in a subsequent patch.


Repository:
  rC Clang

https://reviews.llvm.org/D43660

Files:
  lib/CodeGen/CGDecl.cpp
  lib/CodeGen/CGOpenMPRuntime.cpp
  lib/CodeGen/CGOpenMPRuntime.h
  lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
  lib/CodeGen/CGOpenMPRuntimeNVPTX.h
  lib/CodeGen/CGStmtOpenMP.cpp
  lib/CodeGen/CodeGenFunction.cpp
  test/OpenMP/nvptx_data_sharing.cpp
  test/OpenMP/nvptx_parallel_codegen.cpp

Index: test/OpenMP/nvptx_parallel_codegen.cpp
===
--- test/OpenMP/nvptx_parallel_codegen.cpp
+++ test/OpenMP/nvptx_parallel_codegen.cpp
@@ -64,254 +64,243 @@
 
   // CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l17}}_worker()
 
+// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}_worker()
+// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
+// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
+// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
+// CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
+// CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
+//
+// CHECK: [[AWAIT_WORK]]
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: [[KPR:%.+]] = call i1 @__kmpc_kernel_parallel(i8** [[OMP_WORK_FN]]
+// CHECK: [[KPRB:%.+]] = zext i1 [[KPR]] to i8
+// store i8 [[KPRB]], i8* [[OMP_EXEC_STATUS]], align 1
+// CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+// CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
+// CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
+//
+// CHECK: [[SEL_WORKERS]]
+// CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]]
+// CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
+// CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
+//
+// CHECK: [[EXEC_PARALLEL]]
+// CHECK: [[WF1:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+// CHECK: [[WM1:%.+]] = icmp eq i8* [[WF1]], bitcast (void (i16, i32)* [[PARALLEL_FN1:@.+]]_wrapper to i8*)
+// CHECK: br i1 [[WM1]], label {{%?}}[[EXEC_PFN1:.+]], label {{%?}}[[CHECK_NEXT1:.+]]
+//
+// CHECK: [[EXEC_PFN1]]
+// CHECK: call void [[PARALLEL_FN1]]_wrapper(
+// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+//
+// CHECK: [[CHECK_NEXT1]]
+// CHECK: [[WF2:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+// CHECK: [[WM2:%.+]] = icmp eq i8* [[WF2]], bitcast (void (i16, i32)* [[PARALLEL_FN2:@.+]]_wrapper to i8*)
+// CHECK: br i1 [[WM2]], label {{%?}}[[EXEC_PFN2:.+]], label {{%?}}[[CHECK_NEXT2:.+]]
+//
+// CHECK: [[EXEC_PFN2]]
+// CHECK: call void [[PARALLEL_FN2]]_wrapper(
+// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+//
+// CHECK: [[CHECK_NEXT2]]
+// CHECK: br label {{%?}}[[TERM_PARALLEL:.+]]
+//
+// CHECK: [[TERM_PARALLEL]]
+// CHECK: call void @__kmpc_kernel_end_parallel()
+// CHECK: br label {{%?}}[[BAR_PARALLEL]]
+//
+// CHECK: [[BAR_PARALLEL]]
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: br label {{%?}}[[AWAIT_WORK]]
+//
+// CHECK: [[EXIT]]
+// CHECK: ret void
 
+// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l26]](i[[SZ:32|64]]
+// Create local storage for each capture.
+// CHECK:  [[LOCAL_A:%.+]] = alloca i[[SZ]],
+// CHECK-DAG:  store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
+// Store captures in the context.
+// CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
+//
+// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
+// CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
+// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
+//
+// CHECK: [[WORKER]]
+// CHECK: {{call|invoke}} void [[T6]]_worker()
+// CHECK: br label {{%?}}[[EXIT:.+]]
+//
+// CHECK: [[CHECK_MASTER]]
+// CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
+// CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
+//
+// CHECK: [[MASTER]]
+// CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
+// CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
+// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* [[PARALLEL_FN1]]_wrapper to i8*),
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @llvm.nvvm.barrier0()
+// CHECK: call void @__kmpc_serialized_parallel(
+// CHECK: {{call|invoke}} void [[PARALLEL_FN3:@.+]](
+// CHE

[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-03-09 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea marked an inline comment as done.
gtbercea added inline comments.



Comment at: lib/Driver/ToolChains/Cuda.cpp:536-542
+  StringRef CompilerPath = env;
+  while (!CompilerPath.empty()) {
+std::pair Split =
+CompilerPath.split(llvm::sys::EnvPathSeparator);
+LibraryPaths.push_back(Split.first);
+CompilerPath = Split.second;
+  }

Hahnfeld wrote:
> gtbercea wrote:
> > Hahnfeld wrote:
> > > gtbercea wrote:
> > > > Hahnfeld wrote:
> > > > > `tools::addDirectoryList` uses `StringRef::find`, I'm not sure if 
> > > > > `StringRef::split` creates real copies of the string...
> > > > What is your suggestion?
> > > IMO you should use whatever existing code does, in that case 
> > > `StringRef::find`.
> > Is this comment still relevant in the light of the most recent changes?
> Probably not (although the code is now completely different from 
> `tools::addDirectoryList`)
Gotcha, do let me know if you see any other issue with this version of the 
code. I will mark this one as done for now.


Repository:
  rC Clang

https://reviews.llvm.org/D43197



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-03-09 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 137754.
gtbercea added a comment.

Change test.


Repository:
  rC Clang

https://reviews.llvm.org/D43197

Files:
  include/clang/Basic/DiagnosticDriverKinds.td
  lib/Driver/ToolChains/Cuda.cpp
  test/Driver/Inputs/lib/libomptarget-nvptx-sm_60.bc
  test/Driver/openmp-offload-gpu.c


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,23 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE %s
 
 // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60
+
+/// ###
+
+/// Check that the runtime bitcode library is part of the compile line. Create 
a bogus
+/// bitcode library and add it to the LIBRARY_PATH.
+// RUN:   env LIBRARY_PATH=%S/Inputs/lib %clang -### -fopenmp=libomp 
-fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s
+
+// CHK-BCLIB: 
clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc
+
+/// ###
+
+/// Check that the warning is thrown when the libomptarget bitcode library is 
not found.
+/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should 
never exist.
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck 
-check-prefix=CHK-BCLIB-WARN %s
+
+// CHK-BCLIB-WARN: No library 'libomptarget-nvptx-sm_20.bc' found in the 
default clang lib directory or in LIBRARY_PATH. Expect degraded performance due 
to no inlining of runtime functions on target devices.
Index: lib/Driver/ToolChains/Cuda.cpp
===
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
 #include "llvm/Support/Program.h"
 #include 
 
@@ -580,6 +581,43 @@
 CC1Args.push_back("-target-feature");
 CC1Args.push_back("+ptx42");
   }
+
+  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+SmallVector LibraryPaths;
+// Add path to lib and/or lib64 folders.
+SmallString<256> DefaultLibPath =
+  llvm::sys::path::parent_path(getDriver().Dir);
+llvm::sys::path::append(DefaultLibPath,
+Twine("lib") + CLANG_LIBDIR_SUFFIX);
+LibraryPaths.emplace_back(DriverArgs.MakeArgString(DefaultLibPath));
+
+// Add user defined library paths from LIBRARY_PATH.
+if (llvm::Optional LibPath =
+  llvm::sys::Process::GetEnv("LIBRARY_PATH")) {
+  SmallVector Frags;
+  const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator, '\0'};
+  llvm::SplitString(*LibPath, Frags, EnvPathSeparatorStr);
+  for (auto Path : Frags)
+LibraryPaths.emplace_back(Path.trim());
+}
+
+std::string LibOmpTargetName =
+  "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+bool FoundBCLibrary = false;
+for (const std::string &LibraryPath : LibraryPaths) {
+  SmallString<128> LibOmpTargetFile(LibraryPath);
+  llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
+  if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+CC1Args.push_back("-mlink-cuda-bitcode");
+CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+FoundBCLibrary = true;
+break;
+  }
+}
+if (!FoundBCLibrary)
+  getDriver().Diag(diag::warn_drv_omp_offload_target_missingbcruntime)
+  << LibOmpTargetName;
+  }
 }
 
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
Index: include/clang/Basic/DiagnosticDriverKinds.td
===
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -203,6 +203,9 @@
 def warn_drv_omp_offload_target_duplicate : Warning<
   "The OpenMP offloading target '%0' is similar to target '%1' already 
specified - will be ignored.">, 
   InGroup;
+def warn_drv_omp_offload_target_missingbcruntime : Warning<
+  "No library '%0' found in the default clang lib directory or in 
LIBRARY_PATH. Expect degraded performance due to no inlining of runtime 
functions on target devices.">,
+  InGroup;
 def err_drv_bitcode_unsupported_on_toolchain : Error<
   "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
 


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +14

[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-03-09 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added inline comments.



Comment at: lib/Driver/ToolChains/Cuda.cpp:592
+Twine("lib") + CLANG_LIBDIR_SUFFIX);
+LibraryPaths.emplace_back(DefaultLibPath.c_str());
+

ABataev wrote:
> Do you still need `.c_str()` here?
Doesn't compile without it but we can get there using Args.MakeArgString() also.



Comment at: test/Driver/openmp-offload-gpu.c:150
+/// bitcode library and add it to the LIBRARY_PATH.
+// RUN:   touch %T/libomptarget-nvptx-sm_60.bc
+// RUN:   env LIBRARY_PATH=%T %clang -### -fopenmp=libomp 
-fopenmp-targets=nvptx64-nvidia-cuda \

Hahnfeld wrote:
> grokos wrote:
> > ABataev wrote:
> > > Create empty `libomptarget-nvptx-sm_60.bc` in `Driver/lib` directory and 
> > > use it in the test rather create|delete it dynamically.
> > I'm also in favour of this approach. On some systems /tmp is not accessible 
> > and the regression test fails.
> This test doesn't (and shouldn't!) use `/tmp`. The build directory and `%T` 
> are always writable (if not, you have different issues on your system).
> 
> Btw you need to pay attention that the driver now finds files next to the 
> compiler directory. You may want to make sure that the test always passes / 
> doesn't fail for wrong reasons.
Just added this.


Repository:
  rC Clang

https://reviews.llvm.org/D43197



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-03-09 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 137755.
gtbercea added a comment.

Revert to c_str().


Repository:
  rC Clang

https://reviews.llvm.org/D43197

Files:
  include/clang/Basic/DiagnosticDriverKinds.td
  lib/Driver/ToolChains/Cuda.cpp
  test/Driver/Inputs/lib/libomptarget-nvptx-sm_60.bc
  test/Driver/openmp-offload-gpu.c


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,23 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE %s
 
 // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60
+
+/// ###
+
+/// Check that the runtime bitcode library is part of the compile line. Create 
a bogus
+/// bitcode library and add it to the LIBRARY_PATH.
+// RUN:   env LIBRARY_PATH=%S/Inputs/lib %clang -### -fopenmp=libomp 
-fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_60 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s
+
+// CHK-BCLIB: 
clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_60.bc
+
+/// ###
+
+/// Check that the warning is thrown when the libomptarget bitcode library is 
not found.
+/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should 
never exist.
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck 
-check-prefix=CHK-BCLIB-WARN %s
+
+// CHK-BCLIB-WARN: No library 'libomptarget-nvptx-sm_20.bc' found in the 
default clang lib directory or in LIBRARY_PATH. Expect degraded performance due 
to no inlining of runtime functions on target devices.
Index: lib/Driver/ToolChains/Cuda.cpp
===
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
 #include "llvm/Support/Program.h"
 #include 
 
@@ -580,6 +581,43 @@
 CC1Args.push_back("-target-feature");
 CC1Args.push_back("+ptx42");
   }
+
+  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+SmallVector LibraryPaths;
+// Add path to lib and/or lib64 folders.
+SmallString<256> DefaultLibPath =
+  llvm::sys::path::parent_path(getDriver().Dir);
+llvm::sys::path::append(DefaultLibPath,
+Twine("lib") + CLANG_LIBDIR_SUFFIX);
+LibraryPaths.emplace_back(DefaultLibPath.c_str());
+
+// Add user defined library paths from LIBRARY_PATH.
+if (llvm::Optional LibPath =
+  llvm::sys::Process::GetEnv("LIBRARY_PATH")) {
+  SmallVector Frags;
+  const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator, '\0'};
+  llvm::SplitString(*LibPath, Frags, EnvPathSeparatorStr);
+  for (auto Path : Frags)
+LibraryPaths.emplace_back(Path.trim());
+}
+
+std::string LibOmpTargetName =
+  "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+bool FoundBCLibrary = false;
+for (const std::string &LibraryPath : LibraryPaths) {
+  SmallString<128> LibOmpTargetFile(LibraryPath);
+  llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
+  if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+CC1Args.push_back("-mlink-cuda-bitcode");
+CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+FoundBCLibrary = true;
+break;
+  }
+}
+if (!FoundBCLibrary)
+  getDriver().Diag(diag::warn_drv_omp_offload_target_missingbcruntime)
+  << LibOmpTargetName;
+  }
 }
 
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
Index: include/clang/Basic/DiagnosticDriverKinds.td
===
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -203,6 +203,9 @@
 def warn_drv_omp_offload_target_duplicate : Warning<
   "The OpenMP offloading target '%0' is similar to target '%1' already 
specified - will be ignored.">, 
   InGroup;
+def warn_drv_omp_offload_target_missingbcruntime : Warning<
+  "No library '%0' found in the default clang lib directory or in 
LIBRARY_PATH. Expect degraded performance due to no inlining of runtime 
functions on target devices.">,
+  InGroup;
 def err_drv_bitcode_unsupported_on_toolchain : Error<
   "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
 


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,23 @@
 // 

[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-03-09 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea added inline comments.



Comment at: test/Driver/openmp-offload-gpu.c:150
+/// bitcode library and add it to the LIBRARY_PATH.
+// RUN:   touch %T/libomptarget-nvptx-sm_60.bc
+// RUN:   env LIBRARY_PATH=%T %clang -### -fopenmp=libomp 
-fopenmp-targets=nvptx64-nvidia-cuda \

gtbercea wrote:
> Hahnfeld wrote:
> > grokos wrote:
> > > ABataev wrote:
> > > > Create empty `libomptarget-nvptx-sm_60.bc` in `Driver/lib` directory 
> > > > and use it in the test rather create|delete it dynamically.
> > > I'm also in favour of this approach. On some systems /tmp is not 
> > > accessible and the regression test fails.
> > This test doesn't (and shouldn't!) use `/tmp`. The build directory and `%T` 
> > are always writable (if not, you have different issues on your system).
> > 
> > Btw you need to pay attention that the driver now finds files next to the 
> > compiler directory. You may want to make sure that the test always passes / 
> > doesn't fail for wrong reasons.
> Just added this.
@Hahnfeld I've used %S instead.

The only way in which the test can be a false positive is when the lib folder 
contains this .bc file. But there's no way to stop this from happening since we 
check DefaultLibPath first.


Repository:
  rC Clang

https://reviews.llvm.org/D43197



___
cfe-commits mailing list
cfe-commits@lists.llvm.org
http://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D43197: [OpenMP] Add flag for linking runtime bitcode library

2018-03-09 Thread Gheorghe-Teodor Bercea via Phabricator via cfe-commits
gtbercea updated this revision to Diff 137769.
gtbercea added a comment.

Fix test.


Repository:
  rC Clang

https://reviews.llvm.org/D43197

Files:
  include/clang/Basic/DiagnosticDriverKinds.td
  lib/Driver/ToolChains/Cuda.cpp
  test/Driver/Inputs/lib/libomptarget-nvptx-sm_20.bc
  test/Driver/openmp-offload-gpu.c


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,23 @@
 // RUN:   | FileCheck -check-prefix=CHK-NOLIBDEVICE %s
 
 // CHK-NOLIBDEVICE-NOT: error:{{.*}}sm_60
+
+/// ###
+
+/// Check that the runtime bitcode library is part of the compile line. Create 
a bogus
+/// bitcode library and add it to the LIBRARY_PATH.
+// RUN:   env LIBRARY_PATH=%S/Inputs/lib %clang -### -fopenmp=libomp 
-fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB %s
+
+// CHK-BCLIB: 
clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-cuda-bitcode{{.*}}libomptarget-nvptx-sm_20.bc
+
+/// ###
+
+/// Check that the warning is thrown when the libomptarget bitcode library is 
not found.
+/// Libomptarget requires sm_35 or newer so an sm_20 bitcode library should 
never exist.
+// RUN:   %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
+// RUN:   -Xopenmp-target -march=sm_20 -fopenmp-relocatable-target -save-temps 
\
+// RUN:   -no-canonical-prefixes %s 2>&1 | FileCheck 
-check-prefix=CHK-BCLIB-WARN %s
+
+// CHK-BCLIB-WARN: No library 'libomptarget-nvptx-sm_20.bc' found in the 
default clang lib directory or in LIBRARY_PATH. Expect degraded performance due 
to no inlining of runtime functions on target devices.
Index: lib/Driver/ToolChains/Cuda.cpp
===
--- lib/Driver/ToolChains/Cuda.cpp
+++ lib/Driver/ToolChains/Cuda.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Option/ArgList.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
 #include "llvm/Support/Program.h"
 #include 
 
@@ -580,6 +581,43 @@
 CC1Args.push_back("-target-feature");
 CC1Args.push_back("+ptx42");
   }
+
+  if (DeviceOffloadingKind == Action::OFK_OpenMP) {
+SmallVector LibraryPaths;
+// Add path to lib and/or lib64 folders.
+SmallString<256> DefaultLibPath =
+  llvm::sys::path::parent_path(getDriver().Dir);
+llvm::sys::path::append(DefaultLibPath,
+Twine("lib") + CLANG_LIBDIR_SUFFIX);
+LibraryPaths.emplace_back(DefaultLibPath.c_str());
+
+// Add user defined library paths from LIBRARY_PATH.
+if (llvm::Optional LibPath =
+  llvm::sys::Process::GetEnv("LIBRARY_PATH")) {
+  SmallVector Frags;
+  const char EnvPathSeparatorStr[] = {llvm::sys::EnvPathSeparator, '\0'};
+  llvm::SplitString(*LibPath, Frags, EnvPathSeparatorStr);
+  for (auto Path : Frags)
+LibraryPaths.emplace_back(Path.trim());
+}
+
+std::string LibOmpTargetName =
+  "libomptarget-nvptx-" + GpuArch.str() + ".bc";
+bool FoundBCLibrary = false;
+for (const std::string &LibraryPath : LibraryPaths) {
+  SmallString<128> LibOmpTargetFile(LibraryPath);
+  llvm::sys::path::append(LibOmpTargetFile, LibOmpTargetName);
+  if (llvm::sys::fs::exists(LibOmpTargetFile)) {
+CC1Args.push_back("-mlink-cuda-bitcode");
+CC1Args.push_back(DriverArgs.MakeArgString(LibOmpTargetFile));
+FoundBCLibrary = true;
+break;
+  }
+}
+if (!FoundBCLibrary)
+  getDriver().Diag(diag::warn_drv_omp_offload_target_missingbcruntime)
+  << LibOmpTargetName;
+  }
 }
 
 void CudaToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs,
Index: include/clang/Basic/DiagnosticDriverKinds.td
===
--- include/clang/Basic/DiagnosticDriverKinds.td
+++ include/clang/Basic/DiagnosticDriverKinds.td
@@ -203,6 +203,9 @@
 def warn_drv_omp_offload_target_duplicate : Warning<
   "The OpenMP offloading target '%0' is similar to target '%1' already 
specified - will be ignored.">, 
   InGroup;
+def warn_drv_omp_offload_target_missingbcruntime : Warning<
+  "No library '%0' found in the default clang lib directory or in 
LIBRARY_PATH. Expect degraded performance due to no inlining of runtime 
functions on target devices.">,
+  InGroup;
 def err_drv_bitcode_unsupported_on_toolchain : Error<
   "-fembed-bitcode is not supported on versions of iOS prior to 6.0">;
 


Index: test/Driver/openmp-offload-gpu.c
===
--- test/Driver/openmp-offload-gpu.c
+++ test/Driver/openmp-offload-gpu.c
@@ -142,3 +142,23 @@
 // RUN:   | 

  1   2   3   4   5   6   7   >