Hi!
On Thu, 29 Oct 2015 17:22:46 -0700, Nathan Sidwell <[email protected]> wrote:
> acc_on_device and it's builtin had a conflict. The function formally takes
> an
> enum argument, but the builtin takes an int -- primarily to avoid the
> compiler
> having to generate the enum type internally.
>
> This works fine for C, where the external declaration of the function (in
> openacc.h) matches up with the builtin, and we optimize the builtin as
> expected.
>
> It fails for C++ where the builtin doesn't match the declaration in the
> header.
> We end up with emitting a call to acc_on_device, which is resolved by
> libgomp. Unfortunately that means we fail to optimize. [...]
> [Nathan's trunk r229562] leaves things unchanged for C -- declare a function
> with an enum arg.
> But for C++ we the extern "C" declaration takes an int -- and therefore
> matches the builtin. We insert an inline wrapper that takes an enum
> argument.
> Because of C++'s overload resolution both the wrapper and the int-taking
> declaration can have the same source name.
> --- libgomp/openacc.h (revision 229535)
> +++ libgomp/openacc.h (working copy)
> -int acc_on_device (acc_device_t) __GOACC_NOTHROW;
> +#ifdef __cplusplus
> +int acc_on_device (int __arg) __GOACC_NOTHROW;
> +#else
> +int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW;
> +#endif
> #ifdef __cplusplus
> }
> +
> +/* Forwarding function with correctly typed arg. */
> +
> +inline int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW
> +{
> + return acc_on_device ((int) __arg);
> +}
> #endif
> --- libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c
> (revision 0)
> +++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c
> (working copy)
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O2" } */
> +
> +#include <openacc.h>
> +
> +int Foo (acc_device_t x)
> +{
> + return acc_on_device (x);
> +}
> +
> +/* { dg-final { scan-assembler-not "acc_on_device" } } */
As a user, I'd expect that when compiling such code with "-O0" instead of
"-O2", but adding "__attribute__ ((optimize ("O2")))" to "Foo", that I'd
then get "acc_on_device" expanded as a builtin, and no calls to the
"acc_on_device library function. In C++ that is currently not working,
because the "Forwarding function with correctly typed arg" (cited above)
doesn't "inherit" that "optimize" attribute. Making that one "always
inline" resolves the problem. Also I cleaned up and extended testing
some more. OK for trunk?
commit 9cc3a384c17e9f692f7864c604d2e2f9fbf0bac9
Author: Thomas Schwinge <[email protected]>
Date: Tue May 23 13:21:14 2017 +0200
Make the OpenACC C++ acc_on_device wrapper "always inline"
libgomp/
* openacc.h [__cplusplus] (acc_on_device): Mark as "always
inline".
* testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c: Remove
file; test cases already present...
* testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c: ... in
this file. Update.
* testsuite/libgomp.oacc-c-c++-common/acc-on-device.c: Remove
file; test cases now present...
* testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c: ... in
this new file.
* testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Update.
---
libgomp/openacc.h | 3 +-
.../libgomp.oacc-c-c++-common/acc-on-device-2.c | 22 -------------
.../libgomp.oacc-c-c++-common/acc-on-device.c | 12 -------
.../libgomp.oacc-c-c++-common/acc_on_device-1.c | 38 +++++++++++++---------
.../libgomp.oacc-c-c++-common/acc_on_device-2.c | 21 ++++++++++++
.../libgomp.oacc-c-c++-common/parallel-dims.c | 14 ++++----
6 files changed, 52 insertions(+), 58 deletions(-)
diff --git libgomp/openacc.h libgomp/openacc.h
index 137e2c1..266f559 100644
--- libgomp/openacc.h
+++ libgomp/openacc.h
@@ -121,7 +121,8 @@ int acc_set_cuda_stream (int, void *) __GOACC_NOTHROW;
/* Forwarding function with correctly typed arg. */
#pragma acc routine seq
-inline int acc_on_device (acc_device_t __arg) __GOACC_NOTHROW
+inline __attribute__ ((__always_inline__)) int
+acc_on_device (acc_device_t __arg) __GOACC_NOTHROW
{
return acc_on_device ((int) __arg);
}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c
libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c
deleted file mode 100644
index bfcb67d..0000000
--- libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device-2.c
+++ /dev/null
@@ -1,22 +0,0 @@
-/* Test the acc_on_device library function. */
-/* { dg-additional-options "-fno-builtin-acc_on_device" } */
-
-#include <openacc.h>
-
-int main ()
-{
- int dev;
-
-#pragma acc parallel copyout (dev)
- {
- dev = acc_on_device (acc_device_not_host);
- }
-
- int expect = 1;
-
-#if ACC_DEVICE_TYPE_host
- expect = 0;
-#endif
-
- return dev != expect;
-}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c
libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c
deleted file mode 100644
index e0d8710..0000000
--- libgomp/testsuite/libgomp.oacc-c-c++-common/acc-on-device.c
+++ /dev/null
@@ -1,12 +0,0 @@
-/* { dg-do compile } */
-/* We don't expect this to work with optimizations disabled.
- { dg-skip-if "" { *-*-* } { "-O0" } { "" } } */
-
-#include <openacc.h>
-
-int Foo (acc_device_t x)
-{
- return acc_on_device (x);
-}
-
-/* { dg-final { scan-assembler-not "acc_on_device" } } */
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
index 8112745..eb962e4 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-1.c
@@ -1,6 +1,9 @@
/* Disable the acc_on_device builtin; we want to test the libgomp library
function. */
+/* { dg-additional-options "-DACC_ON_DEVICE=acc_on_device" } */
/* { dg-additional-options "-fno-builtin-acc_on_device" } */
+/* { dg-additional-options "-fdump-rtl-expand" }
+ { dg-final { scan-rtl-dump-times "\\\(call \[^\\n\]* acc_on_device" 12
"expand" } } */
#include <stdlib.h>
#include <openacc.h>
@@ -11,13 +14,13 @@ main (int argc, char *argv[])
/* Host. */
{
- if (!acc_on_device (acc_device_none))
+ if (!ACC_ON_DEVICE (acc_device_none))
abort ();
- if (!acc_on_device (acc_device_host))
+ if (!ACC_ON_DEVICE (acc_device_host))
abort ();
- if (acc_on_device (acc_device_not_host))
+ if (ACC_ON_DEVICE (acc_device_not_host))
abort ();
- if (acc_on_device (acc_device_nvidia))
+ if (ACC_ON_DEVICE (acc_device_nvidia))
abort ();
}
@@ -26,39 +29,44 @@ main (int argc, char *argv[])
#pragma acc parallel if(0)
{
- if (!acc_on_device (acc_device_none))
+ if (!ACC_ON_DEVICE (acc_device_none))
abort ();
- if (!acc_on_device (acc_device_host))
+ if (!ACC_ON_DEVICE (acc_device_host))
abort ();
- if (acc_on_device (acc_device_not_host))
+ if (ACC_ON_DEVICE (acc_device_not_host))
abort ();
- if (acc_on_device (acc_device_nvidia))
+ if (ACC_ON_DEVICE (acc_device_nvidia))
abort ();
}
-#if !ACC_DEVICE_TYPE_host
+ int on_host_p;
+#if ACC_DEVICE_TYPE_host
+ on_host_p = 1;
+#else
+ on_host_p = 0;
+#endif
/* Offloaded. */
#pragma acc parallel
{
- if (acc_on_device (acc_device_none))
+ if (on_host_p != ACC_ON_DEVICE (acc_device_none))
abort ();
- if (acc_on_device (acc_device_host))
+ if (on_host_p != ACC_ON_DEVICE (acc_device_host))
abort ();
- if (!acc_on_device (acc_device_not_host))
+ if (on_host_p == ACC_ON_DEVICE (acc_device_not_host))
abort ();
+
#if ACC_DEVICE_TYPE_nvidia
- if (!acc_on_device (acc_device_nvidia))
+ if (!ACC_ON_DEVICE (acc_device_nvidia))
abort ();
#else
- if (acc_on_device (acc_device_nvidia))
+ if (ACC_ON_DEVICE (acc_device_nvidia))
abort ();
#endif
}
-#endif
return 0;
}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c
libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c
new file mode 100644
index 0000000..c3b3378
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_on_device-2.c
@@ -0,0 +1,21 @@
+/* With the acc_on_device builtin enabled, we don't expect any calls to the
+ libgomp library function. */
+/* { dg-additional-options "-fdump-rtl-expand" }
+ { dg-final { scan-rtl-dump-times "\\\(call \[^\\n\]* acc_on_device" 0
"expand" } } */
+
+#include <openacc.h>
+
+#ifdef __OPTIMIZE__
+# define ACC_ON_DEVICE acc_on_device
+#else
+/* Without optimizations enabled, we're not expecting the acc_on_device builtin
+ to be used, so use here a "-O2" wrapper. */
+#pragma acc routine seq
+static int __attribute__ ((optimize ("O2")))
+ACC_ON_DEVICE (acc_device_t arg)
+{
+ return acc_on_device (arg);
+}
+#endif
+
+#include "acc_on_device-1.c"
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
index 8308f7c..1c48ab3 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
@@ -4,14 +4,12 @@
#include <limits.h>
#include <openacc.h>
-/* TODO: "(int) acc_device_*" casts because of the C++ acc_on_device wrapper
- not behaving as expected for -O0. */
#pragma acc routine seq
static unsigned int __attribute__ ((optimize ("O2"))) acc_gang ()
{
- if (acc_on_device ((int) acc_device_host))
+ if (acc_on_device (acc_device_host))
return 0;
- else if (acc_on_device ((int) acc_device_nvidia))
+ else if (acc_on_device (acc_device_nvidia))
{
unsigned int r;
asm volatile ("mov.u32 %0,%%ctaid.x;" : "=r" (r));
@@ -24,9 +22,9 @@ static unsigned int __attribute__ ((optimize ("O2")))
acc_gang ()
#pragma acc routine seq
static unsigned int __attribute__ ((optimize ("O2"))) acc_worker ()
{
- if (acc_on_device ((int) acc_device_host))
+ if (acc_on_device (acc_device_host))
return 0;
- else if (acc_on_device ((int) acc_device_nvidia))
+ else if (acc_on_device (acc_device_nvidia))
{
unsigned int r;
asm volatile ("mov.u32 %0,%%tid.y;" : "=r" (r));
@@ -39,9 +37,9 @@ static unsigned int __attribute__ ((optimize ("O2")))
acc_worker ()
#pragma acc routine seq
static unsigned int __attribute__ ((optimize ("O2"))) acc_vector ()
{
- if (acc_on_device ((int) acc_device_host))
+ if (acc_on_device (acc_device_host))
return 0;
- else if (acc_on_device ((int) acc_device_nvidia))
+ else if (acc_on_device (acc_device_nvidia))
{
unsigned int r;
asm volatile ("mov.u32 %0,%%tid.x;" : "=r" (r));
Grüße
Thomas