Hi, at the moment parallel-dims.c fails on og7 with Titan V due to insufficient resources.
This patch reduces the amount of resources used by one offloading region, and moves another to a link-only test-case.
This allows the test-case to pass. Committed to og7. Thanks, - Tom
[openacc, testsuite] Reduce resource usage for Titan V in parallel-dims.c 2018-04-30 Tom de Vries <t...@codesourcery.com> * testsuite/libgomp.oacc-c-c++-common/parallel-dims-compile.c: New test, factored out of ... * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c (main): ... here. Limit num_workers to avoid insufficient-resources-to-launch fatal error. --- .../parallel-dims-compile.c | 100 +++++++++++++++++++++ .../libgomp.oacc-c-c++-common/parallel-dims.c | 44 ++------- 2 files changed, 107 insertions(+), 37 deletions(-) diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims-compile.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims-compile.c new file mode 100644 index 0000000..2d7fdbd --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims-compile.c @@ -0,0 +1,100 @@ +/* { dg-do "link" } */ +/* { dg-additional-options "-foffload-force" } */ + +#include <limits.h> +#include <openacc.h> + +/* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper + not behaving as expected for -O0. 
*/ +#pragma acc routine seq +static unsigned int __attribute__ ((optimize ("O2"))) acc_gang () +{ + if (acc_on_device ((int) acc_device_host)) + return 0; + else if (acc_on_device ((int) acc_device_nvidia)) + { + unsigned int r; + asm volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (r)); + return r; + } + else + __builtin_abort (); +} + +#pragma acc routine seq +static unsigned int __attribute__ ((optimize ("O2"))) acc_worker () +{ + if (acc_on_device ((int) acc_device_host)) + return 0; + else if (acc_on_device ((int) acc_device_nvidia)) + { + unsigned int r; + asm volatile ("mov.u32 %0,%%tid.y;" : "=r" (r)); + return r; + } + else + __builtin_abort (); +} + +#pragma acc routine seq +static unsigned int __attribute__ ((optimize ("O2"))) acc_vector () +{ + if (acc_on_device ((int) acc_device_host)) + return 0; + else if (acc_on_device ((int) acc_device_nvidia)) + { + unsigned int r; + asm volatile ("mov.u32 %0,%%tid.x;" : "=r" (r)); + return r; + } + else + __builtin_abort (); +} + + +int main () +{ + acc_init (acc_device_default); + + /* GR, WP, VS. */ + { + /* We try with an outrageously large value. */ +#define WORKERS 2 << 20 + int workers_actual = WORKERS; + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; + gangs_min = workers_min = vectors_min = INT_MAX; + gangs_max = workers_max = vectors_max = INT_MIN; +#pragma acc parallel copy (workers_actual) /* { dg-warning "using num_workers \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_selected } } */ \ + num_workers (WORKERS) + { + if (acc_on_device (acc_device_host)) + { + /* We're actually executing with num_workers (1). */ + workers_actual = 1; + } + else if (acc_on_device (acc_device_nvidia)) + { + /* The GCC nvptx back end enforces num_workers (32). 
*/ + workers_actual = 32; + } + else + __builtin_abort (); +#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) + for (int i = 100 * workers_actual; i > -100 * workers_actual; --i) + { + gangs_min = gangs_max = acc_gang (); + workers_min = workers_max = acc_worker (); + vectors_min = vectors_max = acc_vector (); + } + } + if (workers_actual < 1) + __builtin_abort (); + if (gangs_min != 0 || gangs_max != 0 + || workers_min != 0 || workers_max != workers_actual - 1 + || vectors_min != 0 || vectors_max != 0) + __builtin_abort (); +#undef WORKERS + } + + return 0; +} diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c index 1dd6353..1498fb4 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c @@ -273,42 +273,11 @@ int main () /* GR, WP, VS. */ { - /* We try with an outrageously large value. */ -#define WORKERS 2 << 20 - int workers_actual = WORKERS; - int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; - gangs_min = workers_min = vectors_min = INT_MAX; - gangs_max = workers_max = vectors_max = INT_MIN; -#pragma acc parallel copy (workers_actual) /* { dg-warning "using num_workers \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_selected } } */ \ - num_workers (WORKERS) - { - if (acc_on_device (acc_device_host)) - { - /* We're actually executing with num_workers (1). */ - workers_actual = 1; - } - else if (acc_on_device (acc_device_nvidia)) - { - /* The GCC nvptx back end enforces num_workers (32). 
*/ - workers_actual = 32; - } - else - __builtin_abort (); -#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) - for (int i = 100 * workers_actual; i > -100 * workers_actual; --i) - { - gangs_min = gangs_max = acc_gang (); - workers_min = workers_max = acc_worker (); - vectors_min = vectors_max = acc_vector (); - } - } - if (workers_actual < 1) - __builtin_abort (); - if (gangs_min != 0 || gangs_max != 0 - || workers_min != 0 || workers_max != workers_actual - 1 - || vectors_min != 0 || vectors_max != 0) - __builtin_abort (); -#undef WORKERS + /* Factored out to parallel-dims-compile.c. The maximum num_workers for + Titan V for this kernel is 28, so using 32 at runtime will make the + execution fail. OTOH, we want to test the "using num_workers (32), + ignoring <n>" warning, which means defaulting to 32 workers. So, we skip + execution for this region. */ } /* GR, WP, VS. */ @@ -320,7 +289,8 @@ int main () "num_workers (workers)", which will run into "libgomp: cuLaunchKernel error: invalid argument". So, limit ourselves here. */ if (acc_get_device_type () == acc_device_nvidia) - workers = 32; + // Limit to 28 for Titan V. + workers = 28; int workers_actual = workers; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX;