Hi, at the moment parallel-dims.c fails on og7 with Titan V due to insufficient resources.
This patch reduces the amount of resources used by one offloading region, and moves another to a link-only test-case.
This allows the test-case to pass. Committed to og7. Thanks, - Tom
[openacc, testsuite] Reduce resource usage for Titan V in parallel-dims.c 2018-04-30 Tom de Vries <t...@codesourcery.com> * testsuite/libgomp.oacc-c-c++-common/parallel-dims-compile.c: New test, factored out of ... * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c (main): ... here. Limit num_workers to avoid insufficient-resources-to-launch fatal error. --- .../parallel-dims-compile.c | 100 +++++++++++++++++++++ .../libgomp.oacc-c-c++-common/parallel-dims.c | 44 ++------- 2 files changed, 107 insertions(+), 37 deletions(-) diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims-compile.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims-compile.c new file mode 100644 index 0000000..2d7fdbd --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims-compile.c @@ -0,0 +1,100 @@ +/* { dg-do "link" } */ +/* { dg-additional-options "-foffload-force" } */ + +#include <limits.h> +#include <openacc.h> + +/* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper + not behaving as expected for -O0. 
*/ +#pragma acc routine seq +static unsigned int __attribute__ ((optimize ("O2"))) acc_gang () +{ + if (acc_on_device ((int) acc_device_host)) + return 0; + else if (acc_on_device ((int) acc_device_nvidia)) + { + unsigned int r; + asm volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (r)); + return r; + } + else + __builtin_abort (); +} + +#pragma acc routine seq +static unsigned int __attribute__ ((optimize ("O2"))) acc_worker () +{ + if (acc_on_device ((int) acc_device_host)) + return 0; + else if (acc_on_device ((int) acc_device_nvidia)) + { + unsigned int r; + asm volatile ("mov.u32 %0,%%tid.y;" : "=r" (r)); + return r; + } + else + __builtin_abort (); +} + +#pragma acc routine seq +static unsigned int __attribute__ ((optimize ("O2"))) acc_vector () +{ + if (acc_on_device ((int) acc_device_host)) + return 0; + else if (acc_on_device ((int) acc_device_nvidia)) + { + unsigned int r; + asm volatile ("mov.u32 %0,%%tid.x;" : "=r" (r)); + return r; + } + else + __builtin_abort (); +} + + +int main () +{ + acc_init (acc_device_default); + + /* GR, WP, VS. */ + { + /* We try with an outrageously large value. */ +#define WORKERS 2 << 20 + int workers_actual = WORKERS; + int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; + gangs_min = workers_min = vectors_min = INT_MAX; + gangs_max = workers_max = vectors_max = INT_MIN; +#pragma acc parallel copy (workers_actual) /* { dg-warning "using num_workers \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_selected } } */ \ + num_workers (WORKERS) + { + if (acc_on_device (acc_device_host)) + { + /* We're actually executing with num_workers (1). */ + workers_actual = 1; + } + else if (acc_on_device (acc_device_nvidia)) + { + /* The GCC nvptx back end enforces num_workers (32). 
*/ + workers_actual = 32; + } + else + __builtin_abort (); +#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) + for (int i = 100 * workers_actual; i > -100 * workers_actual; --i) + { + gangs_min = gangs_max = acc_gang (); + workers_min = workers_max = acc_worker (); + vectors_min = vectors_max = acc_vector (); + } + } + if (workers_actual < 1) + __builtin_abort (); + if (gangs_min != 0 || gangs_max != 0 + || workers_min != 0 || workers_max != workers_actual - 1 + || vectors_min != 0 || vectors_max != 0) + __builtin_abort (); +#undef WORKERS + } + + return 0; +} diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c index 1dd6353..1498fb4 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c @@ -273,42 +273,11 @@ int main () /* GR, WP, VS. */ { - /* We try with an outrageously large value. */ -#define WORKERS 2 << 20 - int workers_actual = WORKERS; - int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; - gangs_min = workers_min = vectors_min = INT_MAX; - gangs_max = workers_max = vectors_max = INT_MIN; -#pragma acc parallel copy (workers_actual) /* { dg-warning "using num_workers \\(32\\), ignoring 2097152" "" { target openacc_nvidia_accel_selected } } */ \ - num_workers (WORKERS) - { - if (acc_on_device (acc_device_host)) - { - /* We're actually executing with num_workers (1). */ - workers_actual = 1; - } - else if (acc_on_device (acc_device_nvidia)) - { - /* The GCC nvptx back end enforces num_workers (32). 
*/ - workers_actual = 32; - } - else - __builtin_abort (); -#pragma acc loop worker reduction (min: gangs_min, workers_min, vectors_min) reduction (max: gangs_max, workers_max, vectors_max) - for (int i = 100 * workers_actual; i > -100 * workers_actual; --i) - { - gangs_min = gangs_max = acc_gang (); - workers_min = workers_max = acc_worker (); - vectors_min = vectors_max = acc_vector (); - } - } - if (workers_actual < 1) - __builtin_abort (); - if (gangs_min != 0 || gangs_max != 0 - || workers_min != 0 || workers_max != workers_actual - 1 - || vectors_min != 0 || vectors_max != 0) - __builtin_abort (); -#undef WORKERS + /* Factored out to parallel-dims-compile.c. The maximum num_workers for + Titan V for this kernel is 28, so using 32 at runtime will make the + execution fail. OTOH, we want to test the "using num_workers (32), + ignoring <n>" warning, which means defaulting to 32 workers. So, we skip + execution for this region. */ } /* GR, WP, VS. */ @@ -320,7 +289,8 @@ int main () "num_workers (workers)", which will run into "libgomp: cuLaunchKernel error: invalid argument". So, limit ourselves here. */ if (acc_get_device_type () == acc_device_nvidia) - workers = 32; + // Limit to 28 for Titan V. + workers = 28; int workers_actual = workers; int gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max; gangs_min = workers_min = vectors_min = INT_MAX;