On 03/02/2018 09:47 PM, Cesar Philippidis wrote:
libgomp/
* plugin/plugin-nvptx.c (nvptx_exec): Adjust calculations of
workers and vectors.
I wrote a test case that triggers this code, and added it to the patch.
Built on x86_64 with the nvptx accelerator and tested libgomp.
Committed.
Thanks,
- Tom
[nvptx] Handle large vectors in libgomp
2018-04-05 Cesar Philippidis <[email protected]>
Tom de Vries <[email protected]>
* plugin/plugin-nvptx.c (nvptx_exec): Adjust calculations of
workers and vectors.
* testsuite/libgomp.oacc-c-c++-common/vector-length-128-7.c: New test.
---
libgomp/plugin/plugin-nvptx.c | 10 +++---
.../vector-length-128-7.c | 41 ++++++++++++++++++++++
2 files changed, 47 insertions(+), 4 deletions(-)
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index bdc0c30..9b4768f 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -734,8 +734,6 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
int threads_per_block = threads_per_sm > block_size
? block_size : threads_per_sm;
- threads_per_block /= warp_size;
-
if (threads_per_sm > cpu_size)
threads_per_sm = cpu_size;
@@ -802,6 +800,10 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
if (seen_zero)
{
+ int vectors = dims[GOMP_DIM_VECTOR] > 0
+ ? dims[GOMP_DIM_VECTOR] : warp_size;
+ int workers = threads_per_block / vectors;
+
for (i = 0; i != GOMP_DIM_MAX; i++)
if (!dims[i])
{
@@ -819,10 +821,10 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
: 2 * dev_size;
break;
case GOMP_DIM_WORKER:
- dims[i] = threads_per_block;
+ dims[i] = workers;
break;
case GOMP_DIM_VECTOR:
- dims[i] = warp_size;
+ dims[i] = vectors;
break;
default:
abort ();
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-7.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-7.c
new file mode 100644
index 0000000..60c264c
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/vector-length-128-7.c
@@ -0,0 +1,41 @@
+/* { dg-do run { target openacc_nvidia_accel_selected } } */
+/* { dg-additional-options "-foffload=-fdump-tree-oaccdevlow" } */
+/* { dg-additional-options "-foffload=-mlong-vector-in-workers" } */
+/* { dg-set-target-env-var "GOMP_DEBUG" "1" } */
+
+#include <stdlib.h>
+
+#define N 1024
+
+unsigned int a[N];
+unsigned int b[N];
+unsigned int c[N];
+unsigned int n = N;
+
+int
+main (void)
+{
+ for (unsigned int i = 0; i < n; ++i)
+ {
+ a[i] = i % 3;
+ b[i] = i % 5;
+ }
+
+#pragma acc parallel vector_length (128) copyin (a,b) copyout (c)
+ {
+#pragma acc loop worker
+ for (unsigned int i = 0; i < 4; i++)
+#pragma acc loop vector
+ for (unsigned int j = 0; j < n / 4; j++)
+ c[(i * N / 4) + j] = a[(i * N / 4) + j] + b[(i * N / 4) + j];
+ }
+
+ for (unsigned int i = 0; i < n; ++i)
+ if (c[i] != (i % 3) + (i % 5))
+ abort ();
+
+ return 0;
+}
+
+/* { dg-final { scan-offload-tree-dump "__attribute__\\(\\(oacc function \\(1, 0, 128\\)" "oaccdevlow" } } */
+/* { dg-output "nvptx_exec: kernel main\\\$_omp_fn\\\$0: launch gangs=1, workers=8, vectors=128" } */