OpenACC 2.5 Profiling Interface (incomplete)

Thomas Schwinge Tue, 28 Feb 2017 09:44:47 -0800

Hi!

The 2.5 versions of the OpenACC standard added a new chapter "Profiling
Interface".  In r245784, I committed incomplete support to
gomp-4_0-branch.  I plan to continue working on this, but wanted to
synchronize at this point.


commit b22a85fe7f3daeb48460e7aa28606d0cdb799f69
Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4>
Date:   Tue Feb 28 17:36:03 2017 +0000

    OpenACC 2.5 Profiling Interface (incomplete)
    
            libgomp/
            * acc_prof.h: New file.
            * oacc-profiling-acc_register_library.c: Likewise.
            * oacc-profiling.c: Likewise.
            * Makefile.am (nodist_libsubinclude_HEADERS, libgomp_la_SOURCES):
            Add these, respectively.
            * Makefile.in: Regenerate.
            * libgomp/config/nvptx/oacc-profiling-acc_register_library.c:
            New empty file.
            * libgomp/config/nvptx/oacc-profiling.c: Likewise.
            * env.c (initialize_env): Call goacc_profiling_initialize.
            * libgomp-plugin.c: New function
            GOMP_PLUGIN_goacc_profiling_dispatch.
            * libgomp-plugin.h: Declare function
            GOMP_PLUGIN_goacc_profiling_dispatch.
            * oacc-plugin.c: New function GOMP_PLUGIN_goacc_thread.
            * oacc-plugin.h: Declare function GOMP_PLUGIN_goacc_thread.
            * libgomp.map (OACC_2.5): Add acc_prof_lookup, acc_prof_register,
            acc_prof_unregister, and acc_register_library.
            Add GOMP_PLUGIN_goacc_profiling_dispatch, and
            GOMP_PLUGIN_goacc_thread with new GOMP_PLUGIN_1.3 symbol version.
            * oacc-int.h (struct goacc_thread): Add "acc_prof_info
            *prof_info", "acc_api_info *api_info", and "bool
            prof_callbacks_enabled" members.
            Declare functions goacc_profiling_initialize,
            goacc_profiling_dispatch_p, and goacc_profiling_dispatch.
            * oacc-init.c (acc_init_1): Add "acc_construct_t", and "int"
            formal parameters.  Adjust all users.
            (acc_init_1, goacc_attach_host_thread_to_device, acc_init)
            (goacc_lazy_initialize): Update for OpenACC Profiling Interface.
            * oacc-parallel.c (GOACC_parallel_keyed): Likewise.
            * plugin/plugin-nvptx.c (cuda_map_create, cuda_map_destroy)
            (map_init, map_fini, map_pop, map_push): Add "struct goacc_thread
            *" formal parameter.  Adjust all users.
            (select_stream_for_async, event_gc, nvptx_exec, nvptx_host2dev)
            (nvptx_dev2host, nvptx_set_cuda_stream): Call
            GOMP_PLUGIN_goacc_thread instead of nvptx_thread.
            (cuda_map_create, cuda_map_destroy, nvptx_exec, nvptx_alloc)
            (nvptx_free, nvptx_host2dev, nvptx_dev2host): Update for OpenACC
            Profiling Interface.
            * libgomp.texi: New chapter "OpenACC Profiling Interface".
            * testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c: New
            file.
            * testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c: Likewise.
            * testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c:
            Likewise.
            * testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c:
            Likewise.
    
    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@245784 
138bc75d-0d04-0410-961f-82ee72b054a4
---
 libgomp/ChangeLog.gomp                             |  50 ++
 libgomp/Makefile.am                                |   5 +-
 libgomp/Makefile.in                                |  10 +-
 libgomp/acc_prof.h                                 | 237 +++++++
 .../nvptx/oacc-profiling-acc_register_library.c    |   0
 libgomp/config/nvptx/oacc-profiling.c              |   0
 libgomp/env.c                                      |   3 +-
 libgomp/libgomp-plugin.c                           |   9 +
 libgomp/libgomp-plugin.h                           |   6 +
 libgomp/libgomp.map                                |  11 +
 libgomp/libgomp.texi                               | 246 +++++++
 libgomp/oacc-init.c                                |  68 +-
 libgomp/oacc-int.h                                 |  12 +
 libgomp/oacc-parallel.c                            | 126 +++-
 libgomp/oacc-plugin.c                              |  13 +
 libgomp/oacc-plugin.h                              |   3 +
 ...gin.h => oacc-profiling-acc_register_library.c} |  19 +-
 libgomp/oacc-profiling.c                           | 576 +++++++++++++++++
 libgomp/plugin/plugin-nvptx.c                      | 315 ++++++++-
 .../acc_prof-dispatch-1.c                          | 344 ++++++++++
 .../libgomp.oacc-c-c++-common/acc_prof-init-1.c    | 306 +++++++++
 .../acc_prof-parallel-1.c                          | 703 +++++++++++++++++++++
 .../acc_prof-valid_bytes-1.c                       | 172 +++++
 .../libgomp.oacc-c-c++-common/acc_prof-version-1.c |  55 ++
 24 files changed, 3243 insertions(+), 46 deletions(-)

diff --git libgomp/ChangeLog.gomp libgomp/ChangeLog.gomp
index af6a28b..acdb004 100644
--- libgomp/ChangeLog.gomp
+++ libgomp/ChangeLog.gomp
@@ -1,5 +1,55 @@
 2017-02-28  Thomas Schwinge  <tho...@codesourcery.com>
 
+       * acc_prof.h: New file.
+       * oacc-profiling-acc_register_library.c: Likewise.
+       * oacc-profiling.c: Likewise.
+       * Makefile.am (nodist_libsubinclude_HEADERS, libgomp_la_SOURCES):
+       Add these, respectively.
+       * Makefile.in: Regenerate.
+       * libgomp/config/nvptx/oacc-profiling-acc_register_library.c:
+       New empty file.
+       * libgomp/config/nvptx/oacc-profiling.c: Likewise.
+       * env.c (initialize_env): Call goacc_profiling_initialize.
+       * libgomp-plugin.c: New function
+       GOMP_PLUGIN_goacc_profiling_dispatch.
+       * libgomp-plugin.h: Declare function
+       GOMP_PLUGIN_goacc_profiling_dispatch.
+       * oacc-plugin.c: New function GOMP_PLUGIN_goacc_thread.
+       * oacc-plugin.h: Declare function GOMP_PLUGIN_goacc_thread.
+       * libgomp.map (OACC_2.5): Add acc_prof_lookup, acc_prof_register,
+       acc_prof_unregister, and acc_register_library.
+       Add GOMP_PLUGIN_goacc_profiling_dispatch, and
+       GOMP_PLUGIN_goacc_thread with new GOMP_PLUGIN_1.3 symbol version.
+       * oacc-int.h (struct goacc_thread): Add "acc_prof_info
+       *prof_info", "acc_api_info *api_info", and "bool
+       prof_callbacks_enabled" members.
+       Declare functions goacc_profiling_initialize,
+       goacc_profiling_dispatch_p, and goacc_profiling_dispatch.
+       * oacc-init.c (acc_init_1): Add "acc_construct_t", and "int"
+       formal parameters.  Adjust all users.
+       (acc_init_1, goacc_attach_host_thread_to_device, acc_init)
+       (goacc_lazy_initialize): Update for OpenACC Profiling Interface.
+       * oacc-parallel.c (GOACC_parallel_keyed): Likewise.
+       * plugin/plugin-nvptx.c (cuda_map_create, cuda_map_destroy)
+       (map_init, map_fini, map_pop, map_push): Add "struct goacc_thread
+       *" formal parameter.  Adjust all users.
+       (select_stream_for_async, event_gc, nvptx_exec, nvptx_host2dev)
+       (nvptx_dev2host, nvptx_set_cuda_stream): Call
+       GOMP_PLUGIN_goacc_thread instead of nvptx_thread.
+       (cuda_map_create, cuda_map_destroy, nvptx_exec, nvptx_alloc)
+       (nvptx_free, nvptx_host2dev, nvptx_dev2host): Update for OpenACC
+       Profiling Interface.
+       * libgomp.texi: New chapter "OpenACC Profiling Interface".
+       * testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c: New
+       file.
+       * testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c: Likewise.
+       * testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c:
+       Likewise.
+       * testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c:
+       Likewise.
+       * testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c:
+       Likewise.
+
        * oacc-init.c (goacc_register, acc_init, goacc_lazy_initialize):
        Fix locking of cached_base_dev (guarded by acc_device_lock).
        (goacc_lazy_initialize): Don't call acc_init; copy code here,
diff --git libgomp/Makefile.am libgomp/Makefile.am
index a3e1c2b..9a75c48 100644
--- libgomp/Makefile.am
+++ libgomp/Makefile.am
@@ -63,7 +63,8 @@ libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c 
error.c iter.c \
        task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \
        time.c fortran.c affinity.c target.c splay-tree.c libgomp-plugin.c \
        oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c oacc-async.c \
-       oacc-plugin.c oacc-cuda.c priority_queue.c
+       oacc-plugin.c oacc-cuda.c priority_queue.c \
+       oacc-profiling.c oacc-profiling-acc_register_library.c
 
 include $(top_srcdir)/plugin/Makefrag.am
 
@@ -72,7 +73,7 @@ libgomp_la_SOURCES += openacc.f90
 endif
 
 nodist_noinst_HEADERS = libgomp_f.h
-nodist_libsubinclude_HEADERS = omp.h openacc.h
+nodist_libsubinclude_HEADERS = acc_prof.h omp.h openacc.h
 if USE_FORTRAN
 nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod \
        openacc_lib.h openacc.f90 openacc.mod openacc_kinds.mod
diff --git libgomp/Makefile.in libgomp/Makefile.in
index 88c8517..dac2b4e 100644
--- libgomp/Makefile.in
+++ libgomp/Makefile.in
@@ -180,7 +180,8 @@ am_libgomp_la_OBJECTS = alloc.lo barrier.lo critical.lo 
env.lo \
        fortran.lo affinity.lo target.lo splay-tree.lo \
        libgomp-plugin.lo oacc-parallel.lo oacc-host.lo oacc-init.lo \
        oacc-mem.lo oacc-async.lo oacc-plugin.lo oacc-cuda.lo \
-       priority_queue.lo $(am__objects_1)
+       priority_queue.lo oacc-profiling.lo \
+       oacc-profiling-acc_register_library.lo $(am__objects_1)
 libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS)
 DEFAULT_INCLUDES = -I.@am__isrc@
 depcomp = $(SHELL) $(top_srcdir)/../depcomp
@@ -435,7 +436,8 @@ libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c 
error.c iter.c \
        bar.c ptrlock.c time.c fortran.c affinity.c target.c \
        splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c \
        oacc-init.c oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c \
-       priority_queue.c $(am__append_3)
+       priority_queue.c oacc-profiling.c \
+       oacc-profiling-acc_register_library.c $(am__append_3)
 
 # Nvidia PTX OpenACC plugin.
 @PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info 
$(libtool_VERSION)
@@ -457,7 +459,7 @@ libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c 
error.c iter.c \
 @PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_LIBADD = libgomp.la $(PLUGIN_HSA_LIBS)
 @PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_LIBTOOLFLAGS = --tag=disable-static
 nodist_noinst_HEADERS = libgomp_f.h
-nodist_libsubinclude_HEADERS = omp.h openacc.h
+nodist_libsubinclude_HEADERS = acc_prof.h omp.h openacc.h
 @USE_FORTRAN_TRUE@nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod 
omp_lib_kinds.mod \
 @USE_FORTRAN_TRUE@     openacc_lib.h openacc.f90 openacc.mod openacc_kinds.mod
 
@@ -620,6 +622,8 @@ distclean-compile:
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-mem.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-parallel.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-plugin.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ 
@am__quote@./$(DEPDIR)/oacc-profiling-acc_register_library.Plo@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-profiling.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ordered.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/parallel.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/priority_queue.Plo@am__quote@
diff --git libgomp/acc_prof.h libgomp/acc_prof.h
new file mode 100644
index 0000000..9247790
--- /dev/null
+++ libgomp/acc_prof.h
@@ -0,0 +1,237 @@
+/* OpenACC Runtime Library: Profiling Interface
+
+   Copyright (C) 2017 Free Software Foundation, Inc.
+
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _ACC_PROF_H
+#define _ACC_PROF_H 1
+
+/* The OpenACC standard doesn't say so explicitly, but as its Profiling
+   Interface makes use of, for example, <openacc.h>'s acc_device_t, we
+   supposedly are to #include that file here.  */
+#include <openacc.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* OpenACC 2.5, 5. Profiling Interface, 5.1. Events.  */
+
+typedef enum acc_event_t
+{
+  acc_ev_none = 0,
+  acc_ev_device_init_start,
+  acc_ev_device_init_end,
+  acc_ev_device_shutdown_start,
+  acc_ev_device_shutdown_end,
+  acc_ev_runtime_shutdown,
+  acc_ev_create,
+  acc_ev_delete,
+  acc_ev_alloc,
+  acc_ev_free,
+  acc_ev_enter_data_start,
+  acc_ev_enter_data_end,
+  acc_ev_exit_data_start,
+  acc_ev_exit_data_end,
+  acc_ev_update_start,
+  acc_ev_update_end,
+  acc_ev_compute_construct_start,
+  acc_ev_compute_construct_end,
+  acc_ev_enqueue_launch_start,
+  acc_ev_enqueue_launch_end,
+  acc_ev_enqueue_upload_start,
+  acc_ev_enqueue_upload_end,
+  acc_ev_enqueue_download_start,
+  acc_ev_enqueue_download_end,
+  acc_ev_wait_start,
+  acc_ev_wait_end,
+  acc_ev_last
+} acc_event_t;
+
+
+/* OpenACC 2.5, 5. Profiling Interface, 5.2. Callbacks Signature.  */
+
+//TODO
+/* 'In all cases, a datatype of "size_t" means a 32-bit integer for a 32-bit
+   binary and a 64-bit integer for a 64-bit binary, and a datatype "int" means
+   a 32-bit integer for both 32-bit and 64-bit binaries'.  */
+typedef long int _acc_prof_size_t;
+typedef int _acc_prof_int_t;
+
+/* Internal helpers: a struct's "valid_bytes" may be less than its "sizeof".  
*/
+#define _ACC_PROF_VALID_BYTES_STRUCT(_struct, _lastfield, 
_valid_bytes_lastfield) \
+  offsetof (_struct, _lastfield) + (_valid_bytes_lastfield)
+#if 0 /* Untested.  */
+#define _ACC_PROF_VALID_BYTES_TYPE_N(_type, _n, _valid_bytes_type) \
+  ((_n - 1) * sizeof (_type) + (_valid_bytes_type))
+#endif
+#define _ACC_PROF_VALID_BYTES_BASICTYPE(_basictype) \
+  (sizeof (_basictype))
+
+typedef struct acc_prof_info
+{
+  acc_event_t event_type;
+  _acc_prof_int_t valid_bytes;
+  _acc_prof_int_t version;
+  acc_device_t device_type;
+  _acc_prof_int_t device_number;
+  _acc_prof_int_t thread_id;
+  _acc_prof_size_t async;
+  _acc_prof_size_t async_queue;
+  char *src_file;
+  char *func_name;
+  _acc_prof_int_t line_no, end_line_no;
+  _acc_prof_int_t func_line_no, func_end_line_no;
+#define _ACC_PROF_INFO_VALID_BYTES \
+  _ACC_PROF_VALID_BYTES_STRUCT (acc_prof_info, func_end_line_no, \
+                               _ACC_PROF_VALID_BYTES_BASICTYPE 
(_acc_prof_int_t))
+} acc_prof_info;
+
+/* We implement the OpenACC 2.5 Profiling Interface.  */
+#define _ACC_PROF_INFO_VERSION 201510
+
+typedef enum acc_construct_t
+{
+  acc_construct_parallel = 0,
+  acc_construct_kernels,
+  acc_construct_loop,
+  acc_construct_data,
+  acc_construct_enter_data,
+  acc_construct_exit_data,
+  acc_construct_host_data,
+  acc_construct_atomic,
+  acc_construct_declare,
+  acc_construct_init,
+  acc_construct_shutdown,
+  acc_construct_set,
+  acc_construct_update,
+  acc_construct_routine,
+  acc_construct_wait,
+  acc_construct_runtime_api
+} acc_construct_t;
+
+typedef struct acc_data_event_info
+{
+  acc_event_t event_type;
+  _acc_prof_int_t valid_bytes;
+  acc_construct_t parent_construct;
+  _acc_prof_int_t implicit;
+  void *tool_info;
+  char *var_name;
+  _acc_prof_size_t bytes;
+  void *host_ptr;
+  void *device_ptr;
+#define _ACC_DATA_EVENT_INFO_VALID_BYTES \
+  _ACC_PROF_VALID_BYTES_STRUCT (acc_data_event_info, device_ptr, \
+                               _ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} acc_data_event_info;
+
+typedef struct acc_launch_event_info
+{
+  acc_event_t event_type;
+  _acc_prof_int_t valid_bytes;
+  acc_construct_t parent_construct;
+  _acc_prof_int_t implicit;
+  void *tool_info;
+  char *kernel_name;
+  _acc_prof_size_t num_gangs, num_workers, vector_length;
+#define _ACC_LAUNCH_EVENT_INFO_VALID_BYTES \
+  _ACC_PROF_VALID_BYTES_STRUCT (acc_launch_event_info, vector_length, \
+                               _ACC_PROF_VALID_BYTES_BASICTYPE 
(_acc_prof_size_t))
+} acc_launch_event_info;
+
+typedef struct acc_other_event_info
+{
+  acc_event_t event_type;
+  _acc_prof_int_t valid_bytes;
+  acc_construct_t parent_construct;
+  _acc_prof_int_t implicit;
+  void *tool_info;
+#define _ACC_OTHER_EVENT_INFO_VALID_BYTES \
+  _ACC_PROF_VALID_BYTES_STRUCT (acc_other_event_info, tool_info, \
+                               _ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} acc_other_event_info;
+
+typedef union acc_event_info
+{
+  acc_event_t event_type;
+  acc_data_event_info data_event;
+  acc_launch_event_info launch_event;
+  acc_other_event_info other_event;
+} acc_event_info;
+
+//TODO: should these relate to acc_device_t values?
+typedef enum acc_device_api
+{
+  acc_device_api_none = 0,
+  acc_device_api_cuda,
+  acc_device_api_opencl,
+  acc_device_api_coi,
+  acc_device_api_other
+} acc_device_api;
+
+typedef struct acc_api_info
+{
+  acc_device_api device_api;
+  _acc_prof_int_t valid_bytes;
+  acc_device_t device_type;
+  _acc_prof_int_t vendor;
+  void *device_handle;
+  void *context_handle;
+  void *async_handle;
+#define _ACC_API_INFO_VALID_BYTES \
+  _ACC_PROF_VALID_BYTES_STRUCT (acc_api_info, async_handle, \
+                               _ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} acc_api_info;
+
+typedef void (*acc_prof_callback) (acc_prof_info *, acc_event_info *,
+                                  acc_api_info *);
+
+
+/* OpenACC 2.5, 5. Profiling Interface, 5.3. Loading the Library.  */
+
+typedef enum acc_register_t
+{
+  acc_reg = 0,
+  acc_toggle = 1,
+  acc_toggle_per_thread = 2
+} acc_register_t;
+
+typedef void (*acc_prof_reg) (acc_event_t, acc_prof_callback, acc_register_t);
+extern void acc_prof_register (acc_event_t, acc_prof_callback, acc_register_t) 
__GOACC_NOTHROW;
+extern void acc_prof_unregister (acc_event_t, acc_prof_callback, 
acc_register_t) __GOACC_NOTHROW;
+typedef void (*acc_query_fn) ();
+typedef acc_query_fn (*acc_prof_lookup_func) (const char *);
+extern acc_query_fn acc_prof_lookup (const char *) __GOACC_NOTHROW;
+/* Don't tag "acc_register_library" as "__GOACC_NOTHROW": this function can be
+   overridden by the application, and must be expected to do "everything".  */
+extern void acc_register_library (acc_prof_reg, acc_prof_reg, 
acc_prof_lookup_func);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ACC_PROF_H */
diff --git libgomp/config/nvptx/oacc-profiling-acc_register_library.c 
libgomp/config/nvptx/oacc-profiling-acc_register_library.c
new file mode 100644
index 0000000..e69de29
diff --git libgomp/config/nvptx/oacc-profiling.c 
libgomp/config/nvptx/oacc-profiling.c
new file mode 100644
index 0000000..e69de29
diff --git libgomp/env.c libgomp/env.c
index ac05c3b..75f8272 100644
--- libgomp/env.c
+++ libgomp/env.c
@@ -1301,8 +1301,9 @@ initialize_env (void)
   parse_acc_device_type ();
 
   goacc_runtime_initialize ();
-}
 
+  goacc_profiling_initialize ();
+}
 
 /* The public OpenMP API routines that access these variables.  */
 
diff --git libgomp/libgomp-plugin.c libgomp/libgomp-plugin.c
index 9bd4047..f834306 100644
--- libgomp/libgomp-plugin.c
+++ libgomp/libgomp-plugin.c
@@ -29,6 +29,7 @@
 #include <stdlib.h>
 
 #include "libgomp.h"
+#include "oacc-int.h"
 #include "libgomp-plugin.h"
 
 void *
@@ -78,3 +79,11 @@ GOMP_PLUGIN_fatal (const char *msg, ...)
   gomp_vfatal (msg, ap);
   va_end (ap);
 }
+
+void
+GOMP_PLUGIN_goacc_profiling_dispatch (acc_prof_info *prof_info,
+                                     acc_event_info *event_info,
+                                     acc_api_info *api_info)
+{
+  goacc_profiling_dispatch (prof_info, event_info, api_info);
+}
diff --git libgomp/libgomp-plugin.h libgomp/libgomp-plugin.h
index ff81350..37d9d23 100644
--- libgomp/libgomp-plugin.h
+++ libgomp/libgomp-plugin.h
@@ -33,6 +33,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include "acc_prof.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -74,6 +76,10 @@ extern void GOMP_PLUGIN_error (const char *, ...)
 extern void GOMP_PLUGIN_fatal (const char *, ...)
        __attribute__ ((noreturn, format (printf, 1, 2)));
 
+extern void GOMP_PLUGIN_goacc_profiling_dispatch (acc_prof_info *,
+                                                 acc_event_info *,
+                                                 acc_api_info *);
+
 /* Prototypes for functions implemented by libgomp plugins.  */
 extern const char *GOMP_OFFLOAD_get_name (void);
 extern unsigned int GOMP_OFFLOAD_get_caps (void);
diff --git libgomp/libgomp.map libgomp/libgomp.map
index 2c9a13d..b76a5dd 100644
--- libgomp/libgomp.map
+++ libgomp/libgomp.map
@@ -400,6 +400,10 @@ OACC_2.5 {
        acc_get_default_async_h_;
        acc_memcpy_from_device_async;
        acc_memcpy_to_device_async;
+       acc_prof_lookup;
+       acc_prof_register;
+       acc_prof_unregister;
+       acc_register_library;
        acc_set_default_async;
        acc_set_default_async_h_;
        acc_update_device_async;
@@ -456,3 +460,10 @@ GOMP_PLUGIN_1.2 {
   global:
        GOMP_PLUGIN_acc_thread_default_async;
 } GOMP_PLUGIN_1.1;
+
+# TODO
+GOMP_PLUGIN_1.3 {
+  global:
+       GOMP_PLUGIN_goacc_profiling_dispatch;
+       GOMP_PLUGIN_goacc_thread;
+} GOMP_PLUGIN_1.2;
diff --git libgomp/libgomp.texi libgomp/libgomp.texi
index 7cb677c..93365cd 100644
--- libgomp/libgomp.texi
+++ libgomp/libgomp.texi
@@ -111,6 +111,7 @@ changed to GNU Offloading and Multi Processing Runtime 
Library.
                                asynchronous operations.
 * OpenACC Library Interoperability:: OpenACC library interoperability with the
                                NVIDIA CUBLAS library.
+* OpenACC Profiling Interface::
 * The libgomp ABI::            Notes on the external ABI presented by libgomp.
 * Reporting Bugs::             How to report bugs in the GNU Offloading and
                                Multi Processing Runtime Library.
@@ -3085,6 +3086,251 @@ Application Programming Interface”, Version 2.0.}
 
 
 @c ---------------------------------------------------------------------
+@c OpenACC Profiling Interface
+@c ---------------------------------------------------------------------
+
+@node OpenACC Profiling Interface
+@chapter OpenACC Profiling Interface
+
+@section Implementation Status and Implementation-Defined Behavior
+
+We're not yet implementing the whole Profiling Interface as defined by
+the OpenACC 2.5 specification.  Also, the specification doesn't
+clearly define some aspects of its Profiling Interface, so we're
+clarifying these as @emph{implementation-defined behavior} here.  We
+already have reported to the OpenACC Technical Committee some issues,
+and will report more, later on.
+
+This implementation of the OpenACC Profiling Interface is tuned to
+keep the performance impact as low as possible when it's not in use.
+This is relevant, as the Profiling Interface affects all the
+@emph{hot} code paths (in the target code, not in the offloaded code).
+Users of the OpenACC Profiling Interface can be expected to understand
+that performance will always be impacted to some degree: for example,
+because of the @emph{runtime} (libgomp) calling into a third-party
+@emph{library} for every event that has been registered.
+
+This implementation of the OpenACC Profiling Interface has not yet
+been validated for use in multi-threaded code.  This is a more general
+issue; see CSTS-110 @cite{Make sure all OpenACC entry points in
+libgomp are thread-safe}.
+
+The @code{acc_prof_lookup} interface is not implemented, and
+@code{acc_register_library} will receive @code{NULL} for its
+@code{lookup} parameter.
+
+Remarks about data provided to callbacks:
+
+@table @asis
+
+@item @code{acc_prof_info.event_type}
+It is not clear if for @emph{nested} event callbacks (for example,
+@code{acc_ev_enqueue_launch_start} as part of a parent compute
+construct), this should be set for the nested event
+(@code{acc_ev_enqueue_launch_start}), or if the value of the parent
+construct should remain (@code{acc_ev_compute_construct_start}).  In
+this implementation, the value will generally correspond to the
+innermost nested event type.
+
+@item @code{acc_prof_info.device_type}
+@itemize
+
+@item
+For @code{acc_ev_compute_construct_start}, and in presence of an
+@code{if} clause with @emph{false} argument, this will still refer to
+the offloading device type; unsure whether that's the expected
+behavior.
+
+@item
+Complementary to the item before, for
+@code{acc_ev_compute_construct_end}, this is set to
+@code{acc_device_host} in presence of an @code{if} clause with
+@emph{false} argument, unsure whether that's the expected behavior.
+
+@end itemize
+
+@item @code{acc_prof_info.thread_id}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_prof_info.async}
+@itemize
+
+@item
+Not yet implemented correctly for
+@code{acc_ev_compute_construct_start}.
+
+@item
+In a compute construct, for host-fallback
+execution/@code{acc_device_host} it will always be
+@code{acc_async_sync}; unsure if that is the expected behavior.
+
+@item
+For @code{acc_ev_device_init_start} and @code{acc_ev_device_init_end},
+it will always be @code{acc_async_sync}; unsure if that is the
+expected behavior.
+
+@end itemize
+
+@item @code{acc_prof_info.async_queue}
+There is no @cite{limited number of asynchronous queues} in libgomp.
+We define this to always have the same value as
+@code{acc_prof_info.async}.
+
+@item @code{acc_prof_info.file}
+Always @code{NULL}; not yet implemented.
+
+@item @code{acc_prof_info.func_name}
+Always @code{NULL}; not yet implemented.
+
+@item @code{acc_prof_info.line_no}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_prof_info.end_line_no}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_prof_info.func_line_no}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_prof_info.func_end_line_no}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_event_info.event_type}, @code{acc_event_info.*.event_type}
+Relating to @code{acc_prof_info.event_type} discussed above, in this
+implementation, this will always be the same value as
+@code{acc_prof_info.event_type}.
+
+@item @code{acc_event_info.*.parent_construct}
+@itemize
+
+@item
+Will be @code{acc_construct_parallel} for OpenACC kernels constructs;
+should be @code{acc_construct_kernels}.
+
+@item
+For implicit @code{acc_ev_device_init_start},
+@code{acc_ev_device_init_end}, and explicit as well as implicit
+@code{acc_ev_alloc}, @code{acc_ev_free},
+@code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end},
+@code{acc_ev_enqueue_download_start}, and
+@code{acc_ev_enqueue_download_end}, will be
+@code{acc_construct_parallel}; should reflect the real parent
+construct.
+
+@end itemize
+
+@item @code{acc_event_info.*.implicit}
+For @code{acc_ev_alloc}, @code{acc_ev_free},
+@code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end},
+@code{acc_ev_enqueue_download_start}, and
+@code{acc_ev_enqueue_download_end}, this currently will be @code{1}
+also for explicit usage.
+
+@item @code{acc_event_info.data_event.var_name}
+Always @code{NULL}; not yet implemented.
+
+@item @code{acc_event_info.data_event.host_ptr}
+For @code{acc_ev_alloc}, and @code{acc_ev_free}, this is always
+@code{NULL}.
+
+@item @code{typedef union acc_api_info}
+@dots{} as printed in @cite{5.2.3. Third Argument: API-Specific
+Information}, should obviously be @code{typedef @emph{struct}
+acc_api_info}.
+
+@item @code{acc_api_info.device_api}
+Possibly not yet implemented correctly for
+@code{acc_ev_compute_construct_start},
+@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}:
+will always be @code{acc_device_api_none} for these event types.
+For @code{acc_ev_enter_data_start}, it will be
+@code{acc_device_api_none} in some cases.
+
+@item @code{acc_api_info.device_type}
+Always the same as @code{acc_prof_info.device_type}.
+
+@item @code{acc_api_info.vendor}
+Always @code{-1}; not yet implemented.
+
+@item @code{acc_api_info.device_handle}
+Always @code{NULL}; not yet implemented.
+
+@item @code{acc_api_info.context_handle}
+Always @code{NULL}; not yet implemented.
+
+@item @code{acc_api_info.async_handle}
+Always @code{NULL}; not yet implemented.
+
+@end table
+
+Remarks about certain event types:
+
+@table @asis
+
+@item @code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}
+@itemize
+
+@item
+@c See DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT in
+@c libgomp.oacc-c-c++-common/acc_prof-parallel-1.c.
+Whan a compute construct triggers implicit
+@code{acc_ev_device_init_start} and @code{acc_ev_device_init_end}
+events, they currently aren't @emph{nested within} the corresponding
+@code{acc_ev_compute_construct_start} and
+@code{acc_ev_compute_construct_end}, but they're currently observed
+@emph{before} @code{acc_ev_compute_construct_start}.  It is not clear
+what to do: the standard asks us provide a lot of details to the
+@code{acc_ev_compute_construct_start} callback, without (implicitly)
+initializing a device before?
+
+@item
+Callbacks for these event types will not be invoked for calls to the
+@code{acc_set_device_type} and @code{acc_set_device_num} functions;
+it's not clear if they should be.
+
+@end itemize
+
+@end table
+
+Callbacks for the following event types will be invoked, but dispatch
+and information provided therein has not yet been thoroughly reviewed:
+
+@itemize
+@item @code{acc_ev_alloc}
+@item @code{acc_ev_free}
+@item @code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end}
+@item @code{acc_ev_enqueue_download_start}, @code{acc_ev_enqueue_download_end}
+@end itemize
+
+During device initialization, and finalization, respectively,
+callbacks for the following event types will not yet be invoked:
+
+@itemize
+@item @code{acc_ev_alloc}
+@item @code{acc_ev_free}
+@end itemize
+
+Callbacks for the following event types will currently only be invoked
+for (implicit) events within compute constructs:
+
+@itemize
+@item @code{acc_ev_enter_data_start}, @code{acc_ev_enter_data_end}
+@item @code{acc_ev_exit_data_start}, @code{acc_ev_exit_data_end}
+@end itemize
+
+Callbacks for the following event types have not yet been implemented,
+so currently won't be invoked:
+
+@itemize
+@item @code{acc_ev_device_shutdown_start}, @code{acc_ev_device_shutdown_end}
+@item @code{acc_ev_runtime_shutdown}
+@item @code{acc_ev_create}, @code{acc_ev_delete}
+@item @code{acc_ev_update_start}, @code{acc_ev_update_end}
+@item @code{acc_ev_wait_start}, @code{acc_ev_wait_end}
+@end itemize
+
+
+
+@c ---------------------------------------------------------------------
 @c The libgomp ABI
 @c ---------------------------------------------------------------------
 
diff --git libgomp/oacc-init.c libgomp/oacc-init.c
index 05bb663..415c0fa 100644
--- libgomp/oacc-init.c
+++ libgomp/oacc-init.c
@@ -218,8 +218,55 @@ acc_dev_num_out_of_range (acc_device_t d, int ord, int 
ndevs)
    held before calling this function.  */
 
 static struct gomp_device_descr *
-acc_init_1 (acc_device_t d)
+acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit)
 {
+  bool profiling_dispatch_p
+    = __builtin_expect (goacc_profiling_dispatch_p (), false);
+
+  acc_prof_info prof_info;
+  if (profiling_dispatch_p)
+    {
+      prof_info.event_type = acc_ev_device_init_start;
+      prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+      prof_info.version = _ACC_PROF_INFO_VERSION;
+      prof_info.device_type = d;
+      prof_info.device_number = goacc_device_num;
+      prof_info.thread_id = -1; //TODO
+      prof_info.async = acc_async_sync; //TODO
+      /* See <https://github.com/OpenACC/openacc-spec/issues/71>.  */
+      prof_info.async_queue = prof_info.async;
+      prof_info.src_file = NULL; //TODO
+      prof_info.func_name = NULL; //TODO
+      prof_info.line_no = -1; //TODO
+      prof_info.end_line_no = -1; //TODO
+      prof_info.func_line_no = -1; //TODO
+      prof_info.func_end_line_no = -1; //TODO
+    }
+  acc_event_info device_init_event_info;
+  if (profiling_dispatch_p)
+    {
+      device_init_event_info.other_event.event_type = prof_info.event_type;
+      device_init_event_info.other_event.valid_bytes
+       = _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      device_init_event_info.other_event.parent_construct = parent_construct;
+      device_init_event_info.other_event.implicit = implicit;
+      device_init_event_info.other_event.tool_info = NULL;
+    }
+  acc_api_info api_info;
+  if (profiling_dispatch_p)
+    {
+      api_info.device_api = acc_device_api_none; //TODO
+      api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+      api_info.device_type = prof_info.device_type;
+      api_info.vendor = -1; //TODO
+      api_info.device_handle = NULL; //TODO
+      api_info.context_handle = NULL; //TODO
+      api_info.async_handle = NULL; //TODO
+    }
+
+  if (profiling_dispatch_p)
+    goacc_profiling_dispatch (&prof_info, &device_init_event_info, &api_info);
+
   struct gomp_device_descr *base_dev, *acc_dev;
   int ndevs;
 
@@ -242,6 +289,14 @@ acc_init_1 (acc_device_t d)
   gomp_init_device (acc_dev);
   gomp_mutex_unlock (&acc_dev->lock);
 
+  if (profiling_dispatch_p)
+    {
+      prof_info.event_type = acc_ev_device_init_end;
+      device_init_event_info.other_event.event_type = prof_info.event_type;
+      goacc_profiling_dispatch (&prof_info, &device_init_event_info,
+                               &api_info);
+    }
+
   return base_dev;
 }
 
@@ -434,7 +489,11 @@ goacc_attach_host_thread_to_device (int ord)
   thr->dev = acc_dev = &base_dev[ord];
   thr->saved_bound_dev = NULL;
   thr->mapped_data = NULL;
-  
+  thr->prof_info = NULL;
+  thr->api_info = NULL;
+  /* Initially, all callbacks for all events are enabled.  */
+  thr->prof_callbacks_enabled = true;
+
   thr->target_tls
     = acc_dev->openacc.create_thread_data_func (ord);
 
@@ -452,7 +511,7 @@ acc_init (acc_device_t d)
   gomp_mutex_lock (&acc_device_lock);
   if (!cached_base_dev)
     gomp_init_targets_once ();
-  cached_base_dev = acc_init_1 (d);
+  cached_base_dev = acc_init_1 (d, acc_construct_runtime_api, 0);
   gomp_mutex_unlock (&acc_device_lock);
   
   goacc_attach_host_thread_to_device (-1);
@@ -708,7 +767,8 @@ goacc_lazy_initialize (void)
   if (!cached_base_dev)
     {
       gomp_init_targets_once ();
-      cached_base_dev = acc_init_1 (acc_device_default);
+      cached_base_dev = acc_init_1 (acc_device_default,
+                                   /* TODO */ acc_construct_parallel, 1);
     }
   gomp_mutex_unlock (&acc_device_lock);
 
diff --git libgomp/oacc-int.h libgomp/oacc-int.h
index 1f7adb4..8a62029 100644
--- libgomp/oacc-int.h
+++ libgomp/oacc-int.h
@@ -40,6 +40,7 @@
 
 #include "openacc.h"
 #include "config.h"
+#include "acc_prof.h"
 #include <stddef.h>
 #include <stdbool.h>
 #include <stdarg.h>
@@ -68,6 +69,12 @@ struct goacc_thread
      strictly push/pop semantics according to lexical scope.  */
   struct target_mem_desc *mapped_data;
 
+  /* Data of the OpenACC Profiling Interface.  */
+  acc_prof_info *prof_info;
+  acc_api_info *api_info;
+  /* Per-thread toggle of OpenACC Profiling Interface callbacks.  */
+  bool prof_callbacks_enabled;
+
   /* These structures form a list: this is the next thread in that list.  */
   struct goacc_thread *next;
 
@@ -102,6 +109,11 @@ void goacc_restore_bind (void);
 void goacc_lazy_initialize (void);
 void goacc_host_init (void);
 
+void goacc_profiling_initialize (void);
+bool goacc_profiling_dispatch_p (void);
+void goacc_profiling_dispatch (acc_prof_info *, acc_event_info *,
+                              acc_api_info *);
+
 #ifdef HAVE_ATTRIBUTE_VISIBILITY
 # pragma GCC visibility pop
 #endif
diff --git libgomp/oacc-parallel.c libgomp/oacc-parallel.c
index deab4b3..36e2431 100644
--- libgomp/oacc-parallel.c
+++ libgomp/oacc-parallel.c
@@ -142,21 +142,78 @@ GOACC_parallel_keyed (int device, void (*fn) (void *),
   thr = goacc_thread ();
   acc_dev = thr->dev;
 
+  bool profiling_dispatch_p
+    = __builtin_expect (goacc_profiling_dispatch_p (), false);
+
+  acc_prof_info prof_info;
+  if (profiling_dispatch_p)
+    {
+      thr->prof_info = &prof_info;
+
+      prof_info.event_type = acc_ev_compute_construct_start;
+      prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES;
+      prof_info.version = _ACC_PROF_INFO_VERSION;
+      prof_info.device_type = acc_device_type (acc_dev->type);
+      prof_info.device_number = acc_dev->target_id;
+      prof_info.thread_id = -1; //TODO
+      prof_info.async = async;
+      /* See <https://github.com/OpenACC/openacc-spec/issues/71>.  */
+      prof_info.async_queue = prof_info.async;
+      prof_info.src_file = NULL; //TODO
+      prof_info.func_name = NULL; //TODO
+      prof_info.line_no = -1; //TODO
+      prof_info.end_line_no = -1; //TODO
+      prof_info.func_line_no = -1; //TODO
+      prof_info.func_end_line_no = -1; //TODO
+    }
+  acc_event_info compute_construct_event_info;
+  if (profiling_dispatch_p)
+    {
+      compute_construct_event_info.other_event.event_type
+       = prof_info.event_type;
+      compute_construct_event_info.other_event.valid_bytes
+       = _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      compute_construct_event_info.other_event.parent_construct
+       = acc_construct_parallel; //TODO: kernels...
+      compute_construct_event_info.other_event.implicit = 0;
+      compute_construct_event_info.other_event.tool_info = NULL;
+    }
+  acc_api_info api_info;
+  if (profiling_dispatch_p)
+    {
+      thr->api_info = &api_info;
+
+      api_info.device_api = acc_device_api_none;
+      api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES;
+      api_info.device_type = prof_info.device_type;
+      api_info.vendor = -1; //TODO
+      api_info.device_handle = NULL; //TODO
+      api_info.context_handle = NULL; //TODO
+      api_info.async_handle = NULL; //TODO
+    }
+
+  if (profiling_dispatch_p)
+    goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
+                             &api_info);
+
   handle_ftn_pointers (mapnum, hostaddrs, sizes, kinds);
 
   /* Host fallback if "if" clause is false or if the current device is set to
      the host.  */
   if (host_fallback)
     {
+      //TODO
+      prof_info.device_type = acc_device_host;
+      api_info.device_type = prof_info.device_type;
       goacc_save_and_set_bind (acc_device_host);
       fn (hostaddrs);
       goacc_restore_bind ();
-      return;
+      goto out;
     }
   else if (acc_device_type (acc_dev->type) == acc_device_host)
     {
       fn (hostaddrs);
-      return;
+      goto out;
     }
 
   /* Default: let the runtime choose.  */
@@ -190,6 +247,14 @@ GOACC_parallel_keyed (int device, void (*fn) (void *),
 
            if (async == GOMP_LAUNCH_OP_MAX)
              async = va_arg (ap, unsigned);
+
+           if (profiling_dispatch_p)
+             {
+               prof_info.async = async;
+               /* See <https://github.com/OpenACC/openacc-spec/issues/71>.  */
+               prof_info.async_queue = prof_info.async;
+             }
+
            break;
          }
 
@@ -227,8 +292,31 @@ GOACC_parallel_keyed (int device, void (*fn) (void *),
   else
     tgt_fn = (void (*)) fn;
 
+  acc_event_info enter_exit_data_event_info;
+  if (profiling_dispatch_p)
+    {
+      prof_info.event_type = acc_ev_enter_data_start;
+      enter_exit_data_event_info.other_event.event_type
+       = prof_info.event_type;
+      enter_exit_data_event_info.other_event.valid_bytes
+       = _ACC_OTHER_EVENT_INFO_VALID_BYTES;
+      enter_exit_data_event_info.other_event.parent_construct
+       = compute_construct_event_info.other_event.parent_construct;
+      enter_exit_data_event_info.other_event.implicit = 1;
+      enter_exit_data_event_info.other_event.tool_info = NULL;
+      goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+                               &api_info);
+    }
   tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, NULL, sizes, kinds, true,
                       GOMP_MAP_VARS_OPENACC);
+  if (profiling_dispatch_p)
+    {
+      prof_info.event_type = acc_ev_enter_data_end;
+      enter_exit_data_event_info.other_event.event_type
+       = prof_info.event_type;
+      goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+                               &api_info);
+    }
 
   devaddrs = gomp_alloca (sizeof (void *) * mapnum);
   for (i = 0; i < mapnum; i++)
@@ -246,11 +334,43 @@ GOACC_parallel_keyed (int device, void (*fn) (void *),
 
   /* If running synchronously, unmap immediately.  */
   if (async < acc_async_noval)
-    gomp_unmap_vars (tgt, true);
+    {
+      if (profiling_dispatch_p)
+       {
+         prof_info.event_type = acc_ev_exit_data_start;
+         enter_exit_data_event_info.other_event.event_type
+           = prof_info.event_type;
+         enter_exit_data_event_info.other_event.tool_info = NULL;
+         goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+                                   &api_info);
+       }
+      gomp_unmap_vars (tgt, true);
+      if (profiling_dispatch_p)
+       {
+         prof_info.event_type = acc_ev_exit_data_end;
+         enter_exit_data_event_info.other_event.event_type
+           = prof_info.event_type;
+         goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info,
+                                   &api_info);
+       }
+    }
   else
     tgt->device_descr->openacc.register_async_cleanup_func (tgt, async);
 
   acc_dev->openacc.async_set_async_func (acc_async_sync);
+
+ out:
+  if (profiling_dispatch_p)
+    {
+      prof_info.event_type = acc_ev_compute_construct_end;
+      compute_construct_event_info.other_event.event_type
+       = prof_info.event_type;
+      goacc_profiling_dispatch (&prof_info, &compute_construct_event_info,
+                               &api_info);
+
+      thr->prof_info = NULL;
+      thr->api_info = NULL;
+    }
 }
 
 /* Legacy entry point, only provide host execution.  */
diff --git libgomp/oacc-plugin.c libgomp/oacc-plugin.c
index 3f82c07..9707b48 100644
--- libgomp/oacc-plugin.c
+++ libgomp/oacc-plugin.c
@@ -50,6 +50,19 @@ GOMP_PLUGIN_acc_thread (void)
   return thr ? thr->target_tls : NULL;
 }
 
+/* Return the TLS data for the current thread.  */
+/* TODO.  Should we be able to directly call (the static inline function)
+   goacc_thread from within plugin code?  I didn't manage to get the
+   "goacc_tls_data" symbol configured correctly: "[...]/ld:
+   .libs/libgomp-plugin-nvptx.so.1.0.0: hidden symbol `goacc_tls_data' isn't
+   defined".  */
+
+struct goacc_thread *
+GOMP_PLUGIN_goacc_thread (void)
+{
+  return goacc_thread ();
+}
+
 /* Return the default async number from the TLS data for the current thread.  
*/
 
 int
diff --git libgomp/oacc-plugin.h libgomp/oacc-plugin.h
index ff46ed8..5a842a2 100644
--- libgomp/oacc-plugin.h
+++ libgomp/oacc-plugin.h
@@ -27,8 +27,11 @@
 #ifndef OACC_PLUGIN_H
 #define OACC_PLUGIN_H 1
 
+#include "oacc-int.h"
+
 extern void GOMP_PLUGIN_async_unmap_vars (void *, int);
 extern void *GOMP_PLUGIN_acc_thread (void);
+extern struct goacc_thread *GOMP_PLUGIN_goacc_thread (void);
 extern int GOMP_PLUGIN_acc_thread_default_async (void);
 
 #endif
diff --git libgomp/oacc-plugin.h libgomp/oacc-profiling-acc_register_library.c
similarity index 71%
copy from libgomp/oacc-plugin.h
copy to libgomp/oacc-profiling-acc_register_library.c
index ff46ed8..f6b482b 100644
--- libgomp/oacc-plugin.h
+++ libgomp/oacc-profiling-acc_register_library.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2014-2016 Free Software Foundation, Inc.
+/* Copyright (C) 2017 Free Software Foundation, Inc.
 
    Contributed by Mentor Embedded.
 
@@ -24,11 +24,16 @@
    see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifndef OACC_PLUGIN_H
-#define OACC_PLUGIN_H 1
+/* This file provides an stub acc_register_library function.  It's in a
+   separate file so that this function can easily be overridden when linking
+   statically.  */
 
-extern void GOMP_PLUGIN_async_unmap_vars (void *, int);
-extern void *GOMP_PLUGIN_acc_thread (void);
-extern int GOMP_PLUGIN_acc_thread_default_async (void);
+#include "libgomp.h"
+#include "acc_prof.h"
 
-#endif
+void
+acc_register_library (acc_prof_reg reg, acc_prof_reg unreg,
+                     acc_prof_lookup_func lookup)
+{
+  gomp_debug (0, "dummy %s\n", __FUNCTION__);
+}
diff --git libgomp/oacc-profiling.c libgomp/oacc-profiling.c
new file mode 100644
index 0000000..a4671f9
--- /dev/null
+++ libgomp/oacc-profiling.c
@@ -0,0 +1,576 @@
+/* Copyright (C) 2017 Free Software Foundation, Inc.
+
+   Contributed by Mentor Embedded.
+
+   This file is part of the GNU Offloading and Multi Processing Library
+   (libgomp).
+
+   Libgomp is free software; you can redistribute it and/or modify it
+   under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY
+   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+   more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* OpenACC Profiling Interface.  */
+
+#include "libgomp.h"
+#include "oacc-int.h"
+#include "acc_prof.h"
+#include <assert.h>
+#ifdef HAVE_STRING_H
+# include <string.h>
+#endif
+#ifdef PLUGIN_SUPPORT
+# include <dlfcn.h>
+#endif
+
+#define STATIC_ASSERT(expr) _Static_assert (expr, "!(" #expr ")")
+
+/* Statically assert that the layout of the common fields in the
+   "acc_event_info" variants matches.  */
+/* event_type */
+STATIC_ASSERT (offsetof (acc_event_info, event_type)
+              == offsetof (acc_event_info, data_event.event_type));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.event_type)
+              == offsetof (acc_event_info, launch_event.event_type));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.event_type)
+              == offsetof (acc_event_info, other_event.event_type));
+/* valid_bytes */
+STATIC_ASSERT (offsetof (acc_event_info, data_event.valid_bytes)
+              == offsetof (acc_event_info, launch_event.valid_bytes));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.valid_bytes)
+              == offsetof (acc_event_info, other_event.valid_bytes));
+/* parent_construct */
+STATIC_ASSERT (offsetof (acc_event_info, data_event.parent_construct)
+              == offsetof (acc_event_info, launch_event.parent_construct));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.parent_construct)
+              == offsetof (acc_event_info, other_event.parent_construct));
+/* implicit */
+STATIC_ASSERT (offsetof (acc_event_info, data_event.implicit)
+              == offsetof (acc_event_info, launch_event.implicit));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.implicit)
+              == offsetof (acc_event_info, other_event.implicit));
+/* tool_info */
+STATIC_ASSERT (offsetof (acc_event_info, data_event.tool_info)
+              == offsetof (acc_event_info, launch_event.tool_info));
+STATIC_ASSERT (offsetof (acc_event_info, data_event.tool_info)
+              == offsetof (acc_event_info, other_event.tool_info));
+
+struct goacc_prof_callback_entry
+{
+  acc_prof_callback cb;
+  int ref;
+  bool enabled;
+  struct goacc_prof_callback_entry *next;
+};
+  
+/* goacc_prof_callbacks_enabled[acc_ev_none] acts as a global toggle.  */
+static bool goacc_prof_callbacks_enabled[acc_ev_last];
+static struct goacc_prof_callback_entry 
*goacc_prof_callback_entries[acc_ev_last];
+
+/* This lock is used to protect access to goacc_prof_callbacks_enabled, and
+   goacc_prof_callback_entries.  */
+static gomp_mutex_t goacc_prof_lock;
+
+void
+goacc_profiling_initialize (void)
+{
+  gomp_mutex_init (&goacc_prof_lock);
+
+  /* Initially, all callbacks for all events are enabled.  */
+  for (int i = 0; i < acc_ev_last; ++i)
+    goacc_prof_callbacks_enabled[i] = true;
+
+  /* We are to invoke an external acc_register_library routine, defaulting to
+     our stub oacc-profiling-acc_register_library.c:acc_register_library
+     implementation.  */
+  gomp_debug (0, "%s: calling acc_register_library\n", __FUNCTION__);
+  //TODO.
+  acc_register_library (acc_prof_register, acc_prof_unregister, NULL);
+#ifdef PLUGIN_SUPPORT
+  char *acc_proflibs = getenv ("ACC_PROFLIB");
+  while (acc_proflibs != NULL && acc_proflibs[0] != '\0')
+    {
+      char *acc_proflibs_sep = strchr (acc_proflibs, ';');
+      char *acc_proflib;
+      if (acc_proflibs_sep == acc_proflibs)
+       {
+         /* Stray ";" separator: make sure we don't dlopen the main
+            program.  */
+         acc_proflib = NULL;
+       }
+      else
+       {
+         if (acc_proflibs_sep != NULL)
+           {
+             /* Single out the first library.  */
+             acc_proflib = gomp_malloc (acc_proflibs_sep - acc_proflibs + 1);
+             memcpy (acc_proflib, acc_proflibs,
+                     acc_proflibs_sep - acc_proflibs);
+             acc_proflib[acc_proflibs_sep - acc_proflibs] = '\0';
+           }
+         else
+           {
+             /* No ";" separator, so only one library.  */
+             acc_proflib = acc_proflibs;
+           }
+
+         gomp_debug (0, "%s: dlopen(%s)\n", __FUNCTION__, acc_proflib);
+         void *dl_handle = dlopen (acc_proflib, RTLD_LAZY);
+         if (dl_handle != NULL)
+           {
+             typeof (&acc_register_library) a_r_l
+               = dlsym (dl_handle, "acc_register_library");
+             if (a_r_l == NULL)
+               goto dl_fail;
+             /* Avoid duplicate registration, for example if the same shared
+                library is specified in LD_PRELOAD and ACC_PROFLIB -- which
+                TAU 2.26 does when using "tau_exec -openacc".  */
+             if (a_r_l != acc_register_library)
+               {
+                 gomp_debug (0, "  %s: calling %s:acc_register_library\n",
+                             __FUNCTION__, acc_proflib);
+                 //TODO.
+                 a_r_l (acc_prof_register, acc_prof_unregister, NULL);
+               }
+             else
+               gomp_debug (0, "  %s: skipping duplicate"
+                           " %s:acc_register_library\n",
+                           __FUNCTION__, acc_proflib);
+           }
+         else
+           {
+           dl_fail:
+             gomp_error ("while loading ACC_PROFLIB %s: %s",
+                         acc_proflib, dlerror ());
+             if (dl_handle != NULL)
+               {
+                 int err = dlclose (dl_handle);
+                 dl_handle = NULL;
+                 if (err != 0)
+                   goto dl_fail;
+               }
+           }
+       }
+
+      if (acc_proflib != acc_proflibs)
+       {
+         free (acc_proflib);
+
+         acc_proflibs = acc_proflibs_sep + 1;
+       }
+      else
+       acc_proflibs = NULL;
+    }
+#endif /* PLUGIN_SUPPORT */
+}
+
+void
+acc_prof_register (acc_event_t ev, acc_prof_callback cb, acc_register_t reg)
+{
+  //TODO
+  gomp_debug (0, "%s: ev=%d, cb=%p, reg=%d\n",
+             __FUNCTION__, (int) ev, (void *) cb, (int) reg);
+
+  enum
+  {
+    EVENT_KIND_BOGUS,
+    EVENT_KIND_NORMAL,
+    /* As end events invoke callbacks in the reverse order, we register these
+       in the reverse order here.  */
+    EVENT_KIND_END,
+  } event_kind = EVENT_KIND_BOGUS;  
+  switch (ev)
+    {
+    case acc_ev_none:
+    case acc_ev_device_init_start:
+    case acc_ev_device_shutdown_start:
+    case acc_ev_runtime_shutdown:
+    case acc_ev_create:
+    case acc_ev_delete:
+    case acc_ev_alloc:
+    case acc_ev_free:
+    case acc_ev_enter_data_start:
+    case acc_ev_exit_data_start:
+    case acc_ev_update_start:
+    case acc_ev_compute_construct_start:
+    case acc_ev_enqueue_launch_start:
+    case acc_ev_enqueue_upload_start:
+    case acc_ev_enqueue_download_start:
+    case acc_ev_wait_start:
+      event_kind = EVENT_KIND_NORMAL;
+      break;
+    case acc_ev_device_init_end:
+    case acc_ev_device_shutdown_end:
+    case acc_ev_enter_data_end:
+    case acc_ev_exit_data_end:
+    case acc_ev_update_end:
+    case acc_ev_compute_construct_end:
+    case acc_ev_enqueue_launch_end:
+    case acc_ev_enqueue_upload_end:
+    case acc_ev_enqueue_download_end:
+    case acc_ev_wait_end:
+      event_kind = EVENT_KIND_END;
+      break;
+    case acc_ev_last:
+      break;
+    }
+  if (event_kind == EVENT_KIND_BOGUS)
+    {
+      //TODO: should this be a fatal error?  Or, should we (silently?) ignore 
these, for forward compatibility?
+      gomp_error ("ignoring %s request for TODOinvalid acc_event_t %d",
+                 __FUNCTION__, /* TODO */ (int) ev);
+      return;
+    }
+
+  bool bogus = true;
+  switch (reg)
+    {
+    case acc_reg:
+    case acc_toggle:
+    case acc_toggle_per_thread:
+      bogus = false;
+      break;
+    }
+  if (bogus)
+    {
+      //TODO: should this be a fatal error?  Or, should we (silently?) ignore 
these, for forward compatibility?
+      gomp_error ("ignoring %s request with TODOinvalid acc_register_t %d",
+                 __FUNCTION__, /* TODO */ (int) reg);
+      return;
+    }
+
+  /* Special cases.  */
+  if (reg == acc_toggle)
+    {
+      if (cb == NULL)
+       {
+         gomp_debug (0, "  globally enabling callbacks\n");
+         gomp_mutex_lock (&goacc_prof_lock);
+         /* goacc_prof_callbacks_enabled[acc_ev_none] acts as a global
+            toggle.  */
+         goacc_prof_callbacks_enabled[ev] = true;
+         gomp_mutex_unlock (&goacc_prof_lock);
+         return;
+       }
+      else if (ev == acc_ev_none && cb != NULL)
+       {
+         gomp_debug (0, "  ignoring request\n");
+         /* Silently ignore request.  */
+         return;
+       }
+    }
+  else if (reg == acc_toggle_per_thread)
+    {
+      if (ev == acc_ev_none && cb == NULL)
+       {
+         gomp_debug (0, "  thread: enabling callbacks\n");
+         goacc_lazy_initialize ();
+         struct goacc_thread *thr = goacc_thread ();
+         thr->prof_callbacks_enabled = true;
+         return;
+       }
+      //TODO: should this be a fatal error?  Or, should we (silently?) ignore 
these?
+      gomp_error ("ignoring %s request for acc_toggle_per_thread with TODO",
+                 __FUNCTION__);
+      return;
+    }
+
+  gomp_mutex_lock (&goacc_prof_lock);
+
+  struct goacc_prof_callback_entry *it, *it_p;
+  it = goacc_prof_callback_entries[ev];
+  it_p = NULL;
+  while (it)
+    {
+      if (it->cb == cb)
+       break;
+      it_p = it;
+      it = it->next;
+    }
+
+  switch (reg)
+    {
+    case acc_reg:
+      /* If we already have this callback registered, just increment its ref
+        count.  */
+      if (it != NULL)
+       {
+         it->ref++;
+         gomp_debug (0, "  already registered;"
+                     " incrementing ref count to: %d\n", it->ref);
+       }
+      else
+       {
+         struct goacc_prof_callback_entry *e
+           = gomp_malloc (sizeof (struct goacc_prof_callback_entry));
+         e->cb = cb;
+         e->ref = 1;
+         e->enabled = true;
+         bool prepend = (event_kind == EVENT_KIND_END);
+         /* If we don't have any callback registered yet, also use the
+            "prepend" code path.  */
+         if (it_p == NULL)
+           prepend = true;
+         if (prepend)
+           {
+             gomp_debug (0, "  prepending\n");
+             e->next = goacc_prof_callback_entries[ev];
+             goacc_prof_callback_entries[ev] = e;
+           }
+         else
+           {
+             gomp_debug (0, "  appending\n");
+             e->next = NULL;
+             it_p->next = e;
+           }
+       }
+      break;
+
+    case acc_toggle:
+      if (it == NULL)
+       {
+         /* Silently ignore acc_toggle request if not registered.  */
+         gomp_debug (0, "  not enabling; not registered\n");
+       }
+      else
+       {
+         gomp_debug (0, "  enabling\n");
+         it->enabled = true;
+       }
+      break;
+
+    case acc_toggle_per_thread:
+      __builtin_unreachable ();
+    }
+
+  gomp_mutex_unlock (&goacc_prof_lock);
+}
+
+void
+acc_prof_unregister (acc_event_t ev, acc_prof_callback cb, acc_register_t reg)
+{
+  //TODO
+  gomp_debug (0, "%s: ev=%d, cb=%p, reg=%d\n",
+             __FUNCTION__, (int) ev, (void *) cb, (int) reg);
+
+  if (ev < acc_ev_none
+      || ev >= acc_ev_last)
+    {
+      //TODO: should this be a fatal error?  Or, should we (silently?) ignore 
these, for forward compatibility?
+      gomp_error ("ignoring %s request for TODOinvalid acc_event_t %d",
+                 __FUNCTION__, /* TODO */ (int) ev);
+      return;
+    }
+
+  bool bogus = true;
+  switch (reg)
+    {
+    case acc_reg:
+    case acc_toggle:
+    case acc_toggle_per_thread:
+      bogus = false;
+      break;
+    }
+  if (bogus)
+    {
+      //TODO: should this be a fatal error?  Or, should we (silently?) ignore 
these, for forward compatibility?
+      gomp_error ("ignoring %s request with TODOinvalid acc_register_t %d",
+                 __FUNCTION__, /* TODO */ (int) reg);
+      return;
+    }
+
+  /* Special cases.  */
+  if (reg == acc_toggle)
+    {
+      if (cb == NULL)
+       {
+         gomp_debug (0, "  globally disabling callbacks\n");
+         gomp_mutex_lock (&goacc_prof_lock);
+         /* goacc_prof_callbacks_enabled[acc_ev_none] acts as a global
+            toggle.  */
+         goacc_prof_callbacks_enabled[ev] = false;
+         gomp_mutex_unlock (&goacc_prof_lock);
+         return;
+       }
+      else if (ev == acc_ev_none && cb != NULL)
+       {
+         gomp_debug (0, "  ignoring request\n");
+         /* Silently ignore request.  */
+         return;
+       }
+    }
+  else if (reg == acc_toggle_per_thread)
+    {
+      if (ev == acc_ev_none && cb == NULL)
+       {
+         gomp_debug (0, "  thread: disabling callbacks\n");
+         goacc_lazy_initialize ();
+         struct goacc_thread *thr = goacc_thread ();
+         thr->prof_callbacks_enabled = false;
+         return;
+       }
+      //TODO: should this be a fatal error?  Or, should we (silently?) ignore 
these?
+      gomp_error ("ignoring %s request for acc_toggle_per_thread with TODO",
+                 __FUNCTION__);
+      return;
+    }
+
+  gomp_mutex_lock (&goacc_prof_lock);
+
+  struct goacc_prof_callback_entry *it, *it_p;
+  it = goacc_prof_callback_entries[ev];
+  it_p = NULL;
+  while (it)
+    {
+      if (it->cb == cb)
+       break;
+      it_p = it;
+      it = it->next;
+    }
+
+  switch (reg)
+    {
+    case acc_reg:
+      if (it == NULL)
+       {
+         //TODO: should this be a fatal error?  Or, should we (silently?) 
ignore these?
+         gomp_error ("ignoring %s request for acc_event_t %d: not registered",
+                     __FUNCTION__, /* TODO */ (int) ev);
+         gomp_mutex_unlock (&goacc_prof_lock);
+         return;
+       }
+      it->ref--;
+      gomp_debug (0, "  decrementing ref count to: %d\n", it->ref);
+      if (it->ref == 0)
+       {
+         if (it_p == NULL)
+           goacc_prof_callback_entries[ev] = it->next;
+         else
+           it_p->next = it->next;
+         free (it);
+       }
+      break;
+
+    case acc_toggle:
+      if (it == NULL)
+       {
+         /* Silently ignore acc_toggle request if not registered.  */
+         gomp_debug (0, "  not disabling; not registered\n");
+       }
+      else
+       {
+         gomp_debug (0, "  disabling\n");
+         it->enabled = false;
+       }
+      break;
+
+    case acc_toggle_per_thread:
+      __builtin_unreachable ();
+    }
+
+  gomp_mutex_unlock (&goacc_prof_lock);
+}
+
+/* Prepare to dispatch events?  */
+
+bool
+goacc_profiling_dispatch_p (void)
+{
+  //TODO
+  gomp_debug (0, "%s\n", __FUNCTION__);
+
+  struct goacc_thread *thr = goacc_thread ();
+  if (__builtin_expect (thr == NULL, false))
+    {
+      /* If we don't have any per-thread state yet, that means that per-thread
+        callback dispatch has not been explicitly disabled (which only a call
+        to acc_prof_unregister with acc_toggle_per_thread will do, and that
+        would have allocated per-thread state via goacc_lazy_initialize);
+        initially, all callbacks for all events are enabled.  */
+      //TODO
+      gomp_debug (0, "  %s: don't have any per-thread state yet\n", 
__FUNCTION__);
+    }
+  else if (__builtin_expect (!thr->prof_callbacks_enabled, true))
+    {
+      //TODO
+      gomp_debug (0, "  %s: disabled for this thread\n", __FUNCTION__);
+      return false;
+    }
+
+  gomp_mutex_lock (&goacc_prof_lock);
+
+  /* goacc_prof_callbacks_enabled[acc_ev_none] acts as a global toggle.  */
+  if (__builtin_expect (!goacc_prof_callbacks_enabled[acc_ev_none], true))
+    {
+      //TODO
+      gomp_debug (0, "  %s: disabled globally\n", __FUNCTION__);
+      gomp_mutex_unlock (&goacc_prof_lock);
+      return false;
+    }
+
+  gomp_mutex_unlock (&goacc_prof_lock);
+
+  return true;
+}
+
+/* Dispatch events.
+
+   TODO.
+   This must only be called if goacc_profiling_dispatch_p returned a true
+   result.  */
+
+void
+goacc_profiling_dispatch (acc_prof_info *prof_info, acc_event_info *event_info,
+                         acc_api_info *apt_info)
+{
+  acc_event_t event_type = event_info->event_type;
+  //TODO
+  gomp_debug (0, "%s: event_type=%d\n", __FUNCTION__, (int) event_type);
+  //TODO
+  assert (event_type > acc_ev_none
+         && event_type < acc_ev_last);
+
+  gomp_mutex_lock (&goacc_prof_lock);
+
+  if (!goacc_prof_callbacks_enabled[event_type])
+    {
+      //TODO
+      gomp_debug (0, "  %s: disabled for this event type\n", __FUNCTION__);
+      gomp_mutex_unlock (&goacc_prof_lock);
+      return;
+    }
+
+  for (struct goacc_prof_callback_entry *e
+        = goacc_prof_callback_entries[event_type];
+       e != NULL;
+       e = e->next)
+    {
+      if (!e->enabled)
+       {
+         //TODO
+         gomp_debug (0, "  %s: disabled for callback %p\n",
+                     __FUNCTION__, e->cb);
+         continue;
+       }
+
+      //TODO
+      gomp_debug (0, "  %s: calling callback %p\n", __FUNCTION__, e->cb);
+      e->cb (prof_info, event_info, apt_info);
+    }
+
+  gomp_mutex_unlock (&goacc_prof_lock);
+}
diff --git libgomp/plugin/plugin-nvptx.c libgomp/plugin/plugin-nvptx.c
index 51000f3..dbea9da 100644
--- libgomp/plugin/plugin-nvptx.c
+++ libgomp/plugin/plugin-nvptx.c
@@ -36,6 +36,7 @@
 #include "libgomp-plugin.h"
 #include "oacc-plugin.h"
 #include "gomp-constants.h"
+#include "oacc-int.h"
 
 #include <pthread.h>
 #include <cuda.h>
@@ -121,7 +122,7 @@ struct nvptx_thread
 };
 
 static struct cuda_map *
-cuda_map_create (size_t size)
+cuda_map_create (struct goacc_thread *thr, size_t size)
 {
   struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map));
 
@@ -134,13 +135,72 @@ cuda_map_create (size_t size)
   CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size);
   assert (map->d);
 
+  bool profiling_dispatch_p
+    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
+  if (profiling_dispatch_p)
+    {
+      acc_prof_info *prof_info = thr->prof_info;
+      acc_event_info data_event_info;
+      acc_api_info *api_info = thr->api_info;
+
+      prof_info->event_type = acc_ev_alloc;
+
+      data_event_info.data_event.event_type = prof_info->event_type;
+      data_event_info.data_event.valid_bytes
+       = _ACC_DATA_EVENT_INFO_VALID_BYTES;
+      data_event_info.data_event.parent_construct
+       = acc_construct_parallel; //TODO
+      /* Always implicit for "data mapping arguments for cuLaunchKernel".  */
+      data_event_info.data_event.implicit = 1;
+      data_event_info.data_event.tool_info = NULL;
+      data_event_info.data_event.var_name = NULL; //TODO
+      data_event_info.data_event.bytes = size;
+      data_event_info.data_event.host_ptr = NULL;
+      data_event_info.data_event.device_ptr = (void *) map->d;
+
+      api_info->device_api = acc_device_api_cuda;
+
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+                                           api_info);
+    }
+
   return map;
 }
 
 static void
-cuda_map_destroy (struct cuda_map *map)
+cuda_map_destroy (struct goacc_thread *thr, struct cuda_map *map)
 {
   CUDA_CALL_ASSERT (cuMemFree, map->d);
+
+  bool profiling_dispatch_p
+    = __builtin_expect (thr != NULL && thr->prof_info != NULL, false);
+  if (profiling_dispatch_p)
+    {
+      acc_prof_info *prof_info = thr->prof_info;
+      acc_event_info data_event_info;
+      acc_api_info *api_info = thr->api_info;
+
+      prof_info->event_type = acc_ev_free;
+
+      data_event_info.data_event.event_type = prof_info->event_type;
+      data_event_info.data_event.valid_bytes
+       = _ACC_DATA_EVENT_INFO_VALID_BYTES;
+      data_event_info.data_event.parent_construct
+       = acc_construct_parallel; //TODO
+      /* Always implicit for "data mapping arguments for cuLaunchKernel".  */
+      data_event_info.data_event.implicit = 1;
+      data_event_info.data_event.tool_info = NULL;
+      data_event_info.data_event.var_name = NULL; //TODO
+      data_event_info.data_event.bytes = map->size;
+      data_event_info.data_event.host_ptr = NULL;
+      data_event_info.data_event.device_ptr = (void *) map->d;
+
+      api_info->device_api = acc_device_api_cuda;
+
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+                                           api_info);
+    }
+
   free (map);
 }
 
@@ -156,30 +216,30 @@ cuda_map_destroy (struct cuda_map *map)
    GOMP_OFFLOAD_fini_device, respectively.  */
 
 static bool
-map_init (struct ptx_stream *s)
+map_init (struct goacc_thread *thr, struct ptx_stream *s)
 {
   int size = getpagesize ();
 
   assert (s);
 
-  s->map = cuda_map_create (size);
+  s->map = cuda_map_create (thr, size);
 
   return true;
 }
 
 static bool
-map_fini (struct ptx_stream *s)
+map_fini (struct goacc_thread *thr, struct ptx_stream *s)
 {
   assert (s->map->next == NULL);
   assert (!s->map->active);
 
-  cuda_map_destroy (s->map);
+  cuda_map_destroy (thr, s->map);
 
   return true;
 }
 
 static void
-map_pop (struct ptx_stream *s)
+map_pop (struct goacc_thread *thr, struct ptx_stream *s)
 {
   struct cuda_map *next;
 
@@ -192,12 +252,12 @@ map_pop (struct ptx_stream *s)
     }
 
   next = s->map->next;
-  cuda_map_destroy (s->map);
+  cuda_map_destroy (thr, s->map);
   s->map = next;
 }
 
 static CUdeviceptr
-map_push (struct ptx_stream *s, size_t size)
+map_push (struct goacc_thread *thr, struct ptx_stream *s, size_t size)
 {
   struct cuda_map *map = NULL, *t = NULL;
 
@@ -209,7 +269,7 @@ map_push (struct ptx_stream *s, size_t size)
      cuda_map and push it to the end of the list.  */
   if (s->map->active)
     {
-      map = cuda_map_create (size);
+      map = cuda_map_create (thr, size);
 
       for (t = s->map; t->next != NULL; t = t->next)
        ;
@@ -218,8 +278,8 @@ map_push (struct ptx_stream *s, size_t size)
     }
   else if (s->map->size < size)
     {
-      cuda_map_destroy (s->map);
-      map = cuda_map_create (size);
+      cuda_map_destroy (thr, s->map);
+      map = cuda_map_create (thr, size);
     }
   else
     map = s->map;
@@ -365,7 +425,7 @@ init_streams_for_device (struct ptx_device *ptx_dev, int 
concurrency)
   null_stream->stream = NULL;
   null_stream->host_thread = pthread_self ();
   null_stream->multithreaded = true;
-  if (!map_init (null_stream))
+  if (!map_init (NULL, null_stream))
     return false;
 
   ptx_dev->null_stream = null_stream;
@@ -399,7 +459,7 @@ fini_streams_for_device (struct ptx_device *ptx_dev)
       struct ptx_stream *s = ptx_dev->active_streams;
       ptx_dev->active_streams = ptx_dev->active_streams->next;
 
-      ret &= map_fini (s);
+      ret &= map_fini (NULL, s);
 
       CUresult r = cuStreamDestroy (s->stream);
       if (r != CUDA_SUCCESS)
@@ -410,7 +470,7 @@ fini_streams_for_device (struct ptx_device *ptx_dev)
       free (s);
     }
 
-  ret &= map_fini (ptx_dev->null_stream);
+  ret &= map_fini (NULL, ptx_dev->null_stream);
   free (ptx_dev->null_stream);
   return ret;
 }
@@ -425,7 +485,8 @@ static struct ptx_stream *
 select_stream_for_async (int async, pthread_t thread, bool create,
                         CUstream existing)
 {
-  struct nvptx_thread *nvthd = nvptx_thread ();
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  struct nvptx_thread *nvthd = (struct nvptx_thread *) thr->target_tls;
   /* Local copy of TLS variable.  */
   struct ptx_device *ptx_dev = nvthd->ptx_dev;
   struct ptx_stream *stream = NULL;
@@ -495,7 +556,7 @@ select_stream_for_async (int async, pthread_t thread, bool 
create,
          s->host_thread = thread;
          s->multithreaded = false;
 
-         if (!map_init (s))
+         if (!map_init (thr, s))
            {
              pthread_mutex_unlock (&ptx_dev->stream_lock);
              GOMP_PLUGIN_fatal ("map_init fail");
@@ -840,7 +901,8 @@ event_gc (bool memmap_lockable)
 {
   struct ptx_event *ptx_event = ptx_events;
   struct ptx_event *async_cleanups = NULL;
-  struct nvptx_thread *nvthd = nvptx_thread ();
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  struct nvptx_thread *nvthd = (struct nvptx_thread *) thr->target_tls;
 
   pthread_mutex_lock (&ptx_event_lock);
 
@@ -869,7 +931,7 @@ event_gc (bool memmap_lockable)
              break;
 
            case PTX_EVT_KNL:
-             map_pop (e->addr);
+             map_pop (thr, e->addr);
              break;
 
            case PTX_EVT_ASYNC_CLEANUP:
@@ -960,7 +1022,8 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, 
void **devaddrs,
   void *kargs[1];
   void *hp;
   CUdeviceptr dp;
-  struct nvptx_thread *nvthd = nvptx_thread ();
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  struct nvptx_thread *nvthd = (struct nvptx_thread *) thr->target_tls;
   const char *maybe_abort_msg = "(perhaps abort was called)";
   int cpu_size = nvptx_thread ()->ptx_dev->max_threads_per_multiprocessor;
   int block_size = nvptx_thread ()->ptx_dev->max_threads_per_block;
@@ -1108,7 +1171,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, 
void **devaddrs,
      the host and the device. HP is a host pointer to the new chunk, and DP is
      the corresponding device pointer.  */
   pthread_mutex_lock (&ptx_event_lock);
-  dp = map_push (dev_str, mapnum * sizeof (void *));
+  dp = map_push (thr, dev_str, mapnum * sizeof (void *));
   pthread_mutex_unlock (&ptx_event_lock);
 
   GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);
@@ -1120,8 +1183,45 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, 
void **devaddrs,
 
   /* Copy the (device) pointers to arguments to the device (dp and hp might in
      fact have the same value on a unified-memory system).  */
+
+  acc_prof_info *prof_info = thr->prof_info;
+  acc_event_info data_event_info;
+  acc_api_info *api_info = thr->api_info;
+  bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
+  if (profiling_dispatch_p)
+    {
+      prof_info->event_type = acc_ev_enqueue_upload_start;
+
+      data_event_info.data_event.event_type = prof_info->event_type;
+      data_event_info.data_event.valid_bytes
+       = _ACC_DATA_EVENT_INFO_VALID_BYTES;
+      data_event_info.data_event.parent_construct
+       = acc_construct_parallel; //TODO
+      /* Always implicit for "data mapping arguments for cuLaunchKernel".  */
+      data_event_info.data_event.implicit = 1;
+      data_event_info.data_event.tool_info = NULL;
+      data_event_info.data_event.var_name = NULL; //TODO
+      data_event_info.data_event.bytes = mapnum * sizeof (void *);
+      data_event_info.data_event.host_ptr = hp;
+      data_event_info.data_event.device_ptr = (void *) dp;
+
+      api_info->device_api = acc_device_api_cuda;
+
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+                                           api_info);
+    }
+
   CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp,
                    mapnum * sizeof (void *));
+
+  if (profiling_dispatch_p)
+    {
+      prof_info->event_type = acc_ev_enqueue_upload_end;
+      data_event_info.data_event.event_type = prof_info->event_type;
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+                                           api_info);
+    }
+
   GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
                     " gangs=%u, workers=%u, vectors=%u\n",
                     __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG],
@@ -1133,11 +1233,47 @@ nvptx_exec (void (*fn), size_t mapnum, void 
**hostaddrs, void **devaddrs,
   // num_workers       ntid.y
   // vector length     ntid.x
 
+  acc_event_info enqueue_launch_event_info;
+  if (profiling_dispatch_p)
+    {
+      prof_info->event_type = acc_ev_enqueue_launch_start;
+
+      enqueue_launch_event_info.launch_event.event_type
+       = prof_info->event_type;
+      enqueue_launch_event_info.launch_event.valid_bytes
+       = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES;
+      enqueue_launch_event_info.launch_event.parent_construct
+       /* TODO = compute_construct_event_info.other_event.parent_construct */
+       = acc_construct_parallel; //TODO: kernels...
+      enqueue_launch_event_info.launch_event.implicit = 1;
+      enqueue_launch_event_info.launch_event.tool_info = NULL;
+      enqueue_launch_event_info.launch_event.kernel_name
+       = /* TODO */ (char *) /* TODO */ targ_fn->launch->fn;
+      enqueue_launch_event_info.launch_event.num_gangs
+       = dims[GOMP_DIM_GANG];
+      enqueue_launch_event_info.launch_event.num_workers
+       = dims[GOMP_DIM_WORKER];
+      enqueue_launch_event_info.launch_event.vector_length
+       = dims[GOMP_DIM_VECTOR];
+
+      api_info->device_api = acc_device_api_cuda;
+
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, 
&enqueue_launch_event_info,
+                                           api_info);
+    }
   kargs[0] = &dp;
   CUDA_CALL_ASSERT (cuLaunchKernel, function,
                    dims[GOMP_DIM_GANG], 1, 1,
                    dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
                    0, dev_str->stream, kargs, 0);
+  if (profiling_dispatch_p)
+    {
+      prof_info->event_type = acc_ev_enqueue_launch_end;
+      enqueue_launch_event_info.launch_event.event_type
+       = prof_info->event_type;
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, 
&enqueue_launch_event_info,
+                                           api_info);
+    }
 
 #ifndef DISABLE_ASYNC
   if (async < acc_async_noval)
@@ -1183,7 +1319,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, 
void **devaddrs,
 #ifndef DISABLE_ASYNC
   if (async < acc_async_noval)
 #endif
-    map_pop (dev_str);
+    map_pop (thr, dev_str);
 }
 
 void * openacc_get_current_cuda_context (void);
@@ -1194,6 +1330,34 @@ nvptx_alloc (size_t s)
   CUdeviceptr d;
 
   CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s);
+
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  acc_prof_info *prof_info = thr->prof_info;
+  acc_api_info *api_info = thr->api_info;
+  bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
+  if (profiling_dispatch_p)
+    {
+      prof_info->event_type = acc_ev_alloc;
+
+      acc_event_info data_event_info;
+      data_event_info.data_event.event_type = prof_info->event_type;
+      data_event_info.data_event.valid_bytes
+       = _ACC_DATA_EVENT_INFO_VALID_BYTES;
+      data_event_info.data_event.parent_construct
+       = acc_construct_parallel; //TODO
+      data_event_info.data_event.implicit = 1; //TODO
+      data_event_info.data_event.tool_info = NULL;
+      data_event_info.data_event.var_name = NULL; //TODO
+      data_event_info.data_event.bytes = s;
+      data_event_info.data_event.host_ptr = NULL;
+      data_event_info.data_event.device_ptr = (void *) d;
+
+      api_info->device_api = acc_device_api_cuda;
+
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+                                           api_info);
+    }
+
   return (void *) d;
 }
 
@@ -1211,6 +1375,34 @@ nvptx_free (void *p)
     }
 
   CUDA_CALL (cuMemFree, (CUdeviceptr) p);
+
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  acc_prof_info *prof_info = thr->prof_info;
+  acc_api_info *api_info = thr->api_info;
+  bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
+  if (profiling_dispatch_p)
+    {
+      prof_info->event_type = acc_ev_free;
+
+      acc_event_info data_event_info;
+      data_event_info.data_event.event_type = prof_info->event_type;
+      data_event_info.data_event.valid_bytes
+       = _ACC_DATA_EVENT_INFO_VALID_BYTES;
+      data_event_info.data_event.parent_construct
+       = acc_construct_parallel; //TODO
+      data_event_info.data_event.implicit = 1; //TODO
+      data_event_info.data_event.tool_info = NULL;
+      data_event_info.data_event.var_name = NULL; //TODO
+      data_event_info.data_event.bytes = ps;
+      data_event_info.data_event.host_ptr = NULL;
+      data_event_info.data_event.device_ptr = p;
+
+      api_info->device_api = acc_device_api_cuda;
+
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+                                           api_info);
+    }
+
   return true;
 }
 
@@ -1220,7 +1412,8 @@ nvptx_host2dev (void *d, const void *h, size_t s)
 {
   CUdeviceptr pb;
   size_t ps;
-  struct nvptx_thread *nvthd = nvptx_thread ();
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  struct nvptx_thread *nvthd = (struct nvptx_thread *) thr->target_tls;
 
   if (!s)
     return true;
@@ -1253,6 +1446,32 @@ nvptx_host2dev (void *d, const void *h, size_t s)
       return false;
     }
 
+  acc_prof_info *prof_info = thr->prof_info;
+  acc_event_info data_event_info;
+  acc_api_info *api_info = thr->api_info;
+  bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
+  if (profiling_dispatch_p)
+    {
+      prof_info->event_type = acc_ev_enqueue_upload_start;
+
+      data_event_info.data_event.event_type = prof_info->event_type;
+      data_event_info.data_event.valid_bytes
+       = _ACC_DATA_EVENT_INFO_VALID_BYTES;
+      data_event_info.data_event.parent_construct
+       = acc_construct_parallel; //TODO
+      data_event_info.data_event.implicit = 1; //TODO
+      data_event_info.data_event.tool_info = NULL;
+      data_event_info.data_event.var_name = NULL; //TODO
+      data_event_info.data_event.bytes = s;
+      data_event_info.data_event.host_ptr = /* TODO */ (void *) h;
+      data_event_info.data_event.device_ptr = d;
+
+      api_info->device_api = acc_device_api_cuda;
+
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+                                           api_info);
+    }
+
 #ifndef DISABLE_ASYNC
   if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
     {
@@ -1268,6 +1487,14 @@ nvptx_host2dev (void *d, const void *h, size_t s)
 #endif
     CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s);
 
+  if (profiling_dispatch_p)
+    {
+      prof_info->event_type = acc_ev_enqueue_upload_end;
+      data_event_info.data_event.event_type = prof_info->event_type;
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+                                           api_info);
+    }
+
   return true;
 }
 
@@ -1276,7 +1503,8 @@ nvptx_dev2host (void *h, const void *d, size_t s)
 {
   CUdeviceptr pb;
   size_t ps;
-  struct nvptx_thread *nvthd = nvptx_thread ();
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  struct nvptx_thread *nvthd = (struct nvptx_thread *) thr->target_tls;
 
   if (!s)
     return true;
@@ -1309,6 +1537,32 @@ nvptx_dev2host (void *h, const void *d, size_t s)
       return false;
     }
 
+  acc_prof_info *prof_info = thr->prof_info;
+  acc_event_info data_event_info;
+  acc_api_info *api_info = thr->api_info;
+  bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false);
+  if (profiling_dispatch_p)
+    {
+      prof_info->event_type = acc_ev_enqueue_download_start;
+
+      data_event_info.data_event.event_type = prof_info->event_type;
+      data_event_info.data_event.valid_bytes
+       = _ACC_DATA_EVENT_INFO_VALID_BYTES;
+      data_event_info.data_event.parent_construct
+       = acc_construct_parallel; //TODO
+      data_event_info.data_event.implicit = 1; //TODO
+      data_event_info.data_event.tool_info = NULL;
+      data_event_info.data_event.var_name = NULL; //TODO
+      data_event_info.data_event.bytes = s;
+      data_event_info.data_event.host_ptr = h;
+      data_event_info.data_event.device_ptr = /* TODO */ (void *) d;
+
+      api_info->device_api = acc_device_api_cuda;
+
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+                                           api_info);
+    }
+
 #ifndef DISABLE_ASYNC
   if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
     {
@@ -1324,6 +1578,14 @@ nvptx_dev2host (void *h, const void *d, size_t s)
 #endif
     CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s);
 
+  if (profiling_dispatch_p)
+    {
+      prof_info->event_type = acc_ev_enqueue_download_end;
+      data_event_info.data_event.event_type = prof_info->event_type;
+      GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info,
+                                           api_info);
+    }
+
   return true;
 }
 
@@ -1555,7 +1817,8 @@ nvptx_set_cuda_stream (int async, void *stream)
 {
   struct ptx_stream *oldstream;
   pthread_t self = pthread_self ();
-  struct nvptx_thread *nvthd = nvptx_thread ();
+  struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread ();
+  struct nvptx_thread *nvthd = (struct nvptx_thread *) thr->target_tls;
 
   if (async < 0)
     GOMP_PLUGIN_fatal ("bad async %d", async);
@@ -1586,7 +1849,7 @@ nvptx_set_cuda_stream (int async, void *stream)
 
       CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream);
 
-      if (!map_fini (oldstream))
+      if (!map_fini (thr, oldstream))
        GOMP_PLUGIN_fatal ("error when freeing host memory");
 
       free (oldstream);
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c 
libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c
new file mode 100644
index 0000000..4c1f2bb
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c
@@ -0,0 +1,344 @@
+/* Test dispatch of events to callbacks.  */
+
+#undef NDEBUG
+#include <assert.h>
+
+#include <acc_prof.h>
+
+/* Use explicit copyin clauses, to work around firstprivate optimizations,
+   which will cause the value at the point of call to be used (*before* any
+   potential modifications done in callbacks), as opposed to its address being
+   taken, which then later gets dereferenced (*after* any modifications done in
+   callbacks).  */
+#define COPYIN(...) copyin(__VA_ARGS__)
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+static int state = -1;
+#define STATE_OP(state, op)\
+  do \
+    { \
+      typeof (state) state_o = (state); \
+      (void) state_o; \
+      (state)op; \
+      DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
+    } \
+  while (0)
+
+void cb_compute_construct_start_1 (acc_prof_info *prof_info, acc_event_info 
*event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 0
+         || state == 10
+         || state == 30
+         || state == 41
+         || state == 51
+         || state == 91
+         || state == 101
+         || state == 151);
+  STATE_OP (state, ++);
+}
+
+void cb_compute_construct_start_2 (acc_prof_info *prof_info, acc_event_info 
*event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 1
+         || state == 11
+         || state == 40
+         || state == 50
+         || state == 90
+         || state == 100
+         || state == 150);
+  STATE_OP (state, ++);
+}
+
+void cb_compute_construct_end_1 (acc_prof_info *prof_info, acc_event_info 
*event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 14
+         || state == 21
+         || state == 32
+         || state == 42
+         || state == 80
+         || state == 103
+         || state == 152);
+  STATE_OP (state, ++);
+}
+
+void cb_compute_construct_end_2 (acc_prof_info *prof_info, acc_event_info 
*event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 13
+         || state == 43
+         || state == 102
+         || state == 154);
+  STATE_OP (state, ++);
+}
+
+void cb_compute_construct_end_3 (acc_prof_info *prof_info, acc_event_info 
*event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 12
+         || state == 20
+         || state == 31
+         || state == 44
+         || state == 81
+         || state == 104
+         || state == 153);
+  STATE_OP (state, ++);
+}
+
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, 
acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg = reg_;
+  unreg = unreg_;
+  lookup = lookup_;
+}
+
+int main()
+{
+  STATE_OP (state, = 0);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 2);
+  }
+  assert (state == 2);
+
+  STATE_OP (state, = 10);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 12);
+  }
+  assert (state == 15);
+
+  STATE_OP (state, = 20);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, 
acc_toggle);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_2, 
acc_toggle);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, 
acc_reg);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_2, 
acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_toggle);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_toggle);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 20);
+  }
+  assert (state == 20);
+
+  STATE_OP (state, = 30);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, 
acc_toggle);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, 
acc_toggle);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_toggle);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 31);
+  }
+  assert (state == 33);
+
+  STATE_OP (state, = 40);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, 
acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 42);
+  }
+  assert (state == 45);
+
+  STATE_OP (state, = 50);
+  unreg (acc_ev_compute_construct_end, NULL, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 52);
+  }
+  assert (state == 52);
+
+  STATE_OP (state, = 60);
+  unreg (acc_ev_compute_construct_end, NULL, acc_toggle);
+  unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 60);
+  }
+  assert (state == 60);
+
+  STATE_OP (state, = 70);
+  unreg (acc_ev_compute_construct_start, NULL, acc_toggle);
+  reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 70);
+  }
+  assert (state == 70);
+
+  STATE_OP (state, = 80);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  reg (acc_ev_compute_construct_end, NULL, acc_toggle);
+  reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 80);
+  }
+  assert (state == 82);
+
+  STATE_OP (state, = 90);
+  reg (acc_ev_compute_construct_start, NULL, acc_toggle);
+  unreg (acc_ev_compute_construct_end, NULL, acc_toggle);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 92);
+  }
+  assert (state == 92);
+
+  STATE_OP (state, = 100);
+  reg (acc_ev_compute_construct_end, NULL, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 102);
+  }
+  assert (state == 105);
+
+  STATE_OP (state, = 110);
+  unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle);
+  unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 110);
+  }
+  assert (state == 110);
+
+  STATE_OP (state, = 120);
+  unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 120);
+  }
+  assert (state == 120);
+
+  STATE_OP (state, = 130);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg);
+  reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 130);
+  }
+  assert (state == 130);
+
+  STATE_OP (state, = 140);
+  unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, 
acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg);
+  unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 140);
+  }
+  assert (state == 140);
+
+  STATE_OP (state, = 150);
+  reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread);
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 152);
+  }
+  assert (state == 155);
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c 
libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
new file mode 100644
index 0000000..436f436
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c
@@ -0,0 +1,306 @@
+/* Test dispatch of events to callbacks.  */
+
+#undef NDEBUG
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <acc_prof.h>
+
+/* Use explicit copyin clauses, to work around firstprivate optimizations,
+   which will cause the value at the point of call to be used (*before* any
+   potential modifications done in callbacks), as opposed to its address being
+   taken, which then later gets dereferenced (*after* any modifications done in
+   callbacks).  */
+#define COPYIN(...) copyin(__VA_ARGS__)
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+static int state = -1;
+#define STATE_OP(state, op)\
+  do \
+    { \
+      typeof (state) state_o = (state); \
+      (void) state_o; \
+      (state)op; \
+      DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
+    } \
+  while (0)
+
+static acc_device_t acc_device_type;
+static int acc_device_num;
+static int acc_async;
+
+struct tool_info
+{
+  acc_event_info event_info;
+  struct tool_info *nested;
+};
+struct tool_info *tool_info;
+
+void cb_device_init_start (acc_prof_info *prof_info, acc_event_info 
*event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 0
+         || state == 100);
+  STATE_OP (state, ++);
+
+  assert (tool_info == NULL);
+  tool_info = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info != NULL);
+  tool_info->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_device_init_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  if (state == 1)
+    assert (prof_info->device_type == acc_device_host);
+  else
+    assert (prof_info->device_type == acc_device_default);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == 
_ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == 
acc_construct_runtime_api);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == NULL);
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->event_info.other_event.event_type = 
event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info;
+}
+
+void cb_device_init_end (acc_prof_info *prof_info, acc_event_info *event_info, 
acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 1
+         || state == 101);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == 
acc_ev_device_init_start);
+
+  assert (prof_info->event_type == acc_ev_device_init_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  if (state == 2)
+    assert (prof_info->device_type == acc_device_host);
+  else
+    assert (prof_info->device_type == acc_device_default);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == 
_ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == 
acc_construct_runtime_api);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == tool_info);
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free (tool_info);
+  tool_info = NULL;
+}
+
+void cb_compute_construct_start (acc_prof_info *prof_info, acc_event_info 
*event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 10
+         || state == 110);
+  STATE_OP (state, ++);
+
+  assert (tool_info == NULL);
+  tool_info = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info != NULL);
+  tool_info->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_compute_construct_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == /* TODO acc_async */ acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == 
_ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == NULL);
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->event_info.other_event.event_type = 
event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info;
+}
+
+void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info 
*event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 11
+         || state == 111);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == 
acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+
+  assert (prof_info->event_type == acc_ev_compute_construct_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  if (acc_device_type == acc_device_host)
+    assert (prof_info->async == acc_async_sync);
+  else
+    assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == 
_ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == tool_info);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free (tool_info);
+  tool_info = NULL;
+}
+
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, 
acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg = reg_;
+  unreg = unreg_;
+  lookup = lookup_;
+}
+
+int main()
+{
+  STATE_OP (state, = 0);
+  reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
+  reg (acc_ev_device_init_end, cb_device_init_end, acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end, acc_reg);
+  assert (state == 0);
+
+  acc_init (acc_device_host);
+  assert (state == 2);
+
+  STATE_OP (state, = 10);
+
+  acc_device_type = acc_get_device_type ();
+  acc_device_num = acc_get_device_num (acc_device_type);
+  acc_async = 12;
+
+  {
+    int state_init;
+#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+#pragma acc wait
+    assert (state_init == 11);
+  }
+  assert (state == 12);
+
+  STATE_OP (state, = 90);
+  acc_shutdown (acc_device_host);
+  assert (state == 90);
+
+
+  STATE_OP (state, = 100);
+  acc_init (acc_device_default);
+  assert (state == 102);
+
+  STATE_OP (state, = 110);
+
+  acc_device_type = acc_get_device_type ();
+  acc_device_num = acc_get_device_num (acc_device_type);
+  acc_async = 12;
+
+  {
+    int state_init;
+#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+#pragma acc wait
+    assert (state_init == 111);
+  }
+  assert (state == 112);
+
+  STATE_OP (state, = 190);
+  acc_shutdown (acc_device_default);
+  assert (state == 190);
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c 
libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
new file mode 100644
index 0000000..de26323
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c
@@ -0,0 +1,703 @@
+/* Test dispatch of events to callbacks.  */
+
+#undef NDEBUG
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <acc_prof.h>
+
+/* Use explicit copyin clauses, to work around firstprivate optimizations,
+   which will cause the value at the point of call to be used (*before* any
+   potential modifications done in callbacks), as opposed to its address being
+   taken, which then later gets dereferenced (*after* any modifications done in
+   callbacks).  */
+#define COPYIN(...) copyin(__VA_ARGS__)
+
+/* See the "DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT" reference in
+   libgomp.texi.  */
+#define DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT 0
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+static int state = -1;
+#define STATE_OP(state, op)\
+  do \
+    { \
+      typeof (state) state_o = (state); \
+      (void) state_o; \
+      (state)op; \
+      DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
+    } \
+  while (0)
+
+static acc_device_t acc_device_type;
+static int acc_device_num;
+static int acc_async;
+
+struct tool_info
+{
+  acc_event_info event_info;
+  struct tool_info *nested;
+};
+struct tool_info *tool_info;
+
+void cb_device_init_start (acc_prof_info *prof_info, acc_event_info 
*event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  assert (state == 1
+         || state == 101);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == 
acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+  tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info->nested != NULL);
+  tool_info->nested->nested = NULL;
+#else
+  assert (state == 0
+         || state == 100);
+  STATE_OP (state, ++);
+
+  assert (tool_info == NULL);
+  tool_info = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info != NULL);
+  tool_info->nested = NULL;
+#endif
+
+  assert (prof_info->event_type == acc_ev_device_init_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_default);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == 
_ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+  assert (event_info->other_event.tool_info == NULL);
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  tool_info->nested->event_info.other_event.event_type = 
event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info->nested;
+#else
+  tool_info->event_info.other_event.event_type = 
event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info;
+#endif
+}
+
+void cb_device_init_end (acc_prof_info *prof_info, acc_event_info *event_info, 
acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  assert (state == 2
+         || state == 102);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == 
acc_ev_compute_construct_start);
+  assert (tool_info->nested != NULL);
+  assert (tool_info->nested->event_info.other_event.event_type == 
acc_ev_device_init_start);
+#else
+  assert (state == 1
+         || state == 101);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == 
acc_ev_device_init_start);
+#endif
+
+  assert (prof_info->event_type == acc_ev_device_init_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_default);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == 
_ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  assert (event_info->other_event.tool_info == tool_info->nested);
+#else
+  assert (event_info->other_event.tool_info == tool_info);
+#endif
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  free (tool_info->nested);
+  tool_info->nested = NULL;
+#else
+  free (tool_info);
+  tool_info = NULL;
+#endif
+}
+
+void cb_enter_data_start (acc_prof_info *prof_info, acc_event_info 
*event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 3
+         || state == 103);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == 
acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+  tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info->nested != NULL);
+  tool_info->nested->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_enter_data_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == 
_ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+  assert (event_info->other_event.tool_info == NULL);
+
+  if (acc_device_type == acc_device_host
+      || state < 100) //TODO
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->nested->event_info.other_event.event_type = 
event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info->nested;
+}
+
+void cb_enter_data_end (acc_prof_info *prof_info, acc_event_info *event_info, 
acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 4
+         || state == 104);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == 
acc_ev_compute_construct_start);
+  assert (tool_info->nested != NULL);
+  assert (tool_info->nested->event_info.other_event.event_type == 
acc_ev_enter_data_start);
+
+  assert (prof_info->event_type == acc_ev_enter_data_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == 
_ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+  assert (event_info->other_event.tool_info == tool_info->nested);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free (tool_info->nested);
+  tool_info->nested = NULL;
+}
+
+void cb_exit_data_start (acc_prof_info *prof_info, acc_event_info *event_info, 
acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 7);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == 
acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+  tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info->nested != NULL);
+  tool_info->nested->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_exit_data_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == 
_ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+  assert (event_info->other_event.tool_info == NULL);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->nested->event_info.other_event.event_type = 
event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info->nested;
+}
+
+void cb_exit_data_end (acc_prof_info *prof_info, acc_event_info *event_info, 
acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (state == 8);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == 
acc_ev_compute_construct_start);
+  assert (tool_info->nested != NULL);
+  assert (tool_info->nested->event_info.other_event.event_type == 
acc_ev_exit_data_start);
+
+  assert (prof_info->event_type == acc_ev_exit_data_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == 
_ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 1);
+  assert (event_info->other_event.tool_info == tool_info->nested);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free (tool_info->nested);
+  tool_info->nested = NULL;
+}
+
+void cb_compute_construct_start (acc_prof_info *prof_info, acc_event_info 
*event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT
+  assert (state == 0
+         || state == 100);
+  if (state == 100)
+    {
+      /* Compensate for the missing acc_ev_device_init_start and
+        acc_ev_device_init_end.  */
+      state += 2;
+    }
+#else
+  if (state == 100)
+    {
+      /* Compensate for the missing acc_ev_device_init_start and
+        acc_ev_device_init_end.  */
+      state += 2;
+    }
+  assert (state == 2
+         || state == 102);
+#endif
+  STATE_OP (state, ++);
+
+  assert (tool_info == NULL);
+  tool_info = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info != NULL);
+  tool_info->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_compute_construct_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == /* TODO acc_async */ acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == 
_ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == NULL);
+
+  assert (api_info->device_api == acc_device_api_none);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->event_info.other_event.event_type = 
event_info->other_event.event_type;
+  event_info->other_event.tool_info = tool_info;
+
+  if (acc_device_type == acc_device_host)
+    {
+      /* Compensate for the missing acc_ev_enter_data_start.  */
+      state += 1;
+    }
+}
+
+void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info 
*event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  if (acc_device_type == acc_device_host)
+    {
+      /* Compensate for the missing acc_ev_enter_data_end.  */
+      state += 1;
+      /* Compensate for the missing acc_ev_enqueue_launch_start and
+        acc_ev_enqueue_launch_end.  */
+      state += 2;
+      /* Compensate for the missing acc_ev_exit_data_start and
+        acc_ev_exit_data_end.  */
+      state += 2;
+    }
+  else if (acc_async != acc_async_sync)
+    {
+      /* Compensate for the missing acc_ev_exit_data_start and
+        acc_ev_exit_data_end.  */
+      state += 2;
+    }
+  assert (state == 9
+         || state == 109);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == 
acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+
+  assert (prof_info->event_type == acc_ev_compute_construct_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  if (acc_device_type == acc_device_host)
+    assert (prof_info->async == acc_async_sync);
+  else
+    assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->other_event.event_type == prof_info->event_type);
+  assert (event_info->other_event.valid_bytes == 
_ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (event_info->other_event.parent_construct == acc_construct_parallel);
+  assert (event_info->other_event.implicit == 0);
+  assert (event_info->other_event.tool_info == tool_info);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free (tool_info);
+  tool_info = NULL;
+}
+
+void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info 
*event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (acc_device_type != acc_device_host);
+
+  assert (state == 5
+         || state == 105);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == 
acc_ev_compute_construct_start);
+  assert (tool_info->nested == NULL);
+  tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info);
+  assert (tool_info->nested != NULL);
+  tool_info->nested->nested = NULL;
+
+  assert (prof_info->event_type == acc_ev_enqueue_launch_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->launch_event.event_type == prof_info->event_type);
+  assert (event_info->launch_event.valid_bytes == 
_ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
+  assert (event_info->launch_event.parent_construct == acc_construct_parallel);
+  assert (event_info->launch_event.implicit == 1);
+  assert (event_info->launch_event.tool_info == NULL);
+  assert (event_info->launch_event.kernel_name != NULL);
+  {
+    char *s = strstr (event_info->launch_event.kernel_name, "main");
+    assert (s != NULL);
+    s = strstr (s, "omp_fn");
+    assert (s != NULL);
+  }
+  assert (event_info->launch_event.num_gangs >= 1);
+  assert (event_info->launch_event.num_workers >= 1);
+  assert (event_info->launch_event.vector_length >= 1);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  tool_info->nested->event_info.launch_event.event_type = 
event_info->launch_event.event_type;
+  tool_info->nested->event_info.launch_event.kernel_name = strdup 
(event_info->launch_event.kernel_name);
+  tool_info->nested->event_info.launch_event.num_gangs = 
event_info->launch_event.num_gangs;
+  tool_info->nested->event_info.launch_event.num_workers = 
event_info->launch_event.num_workers;
+  tool_info->nested->event_info.launch_event.vector_length = 
event_info->launch_event.vector_length;
+  event_info->other_event.tool_info = tool_info->nested;
+}
+
+void cb_enqueue_launch_end (acc_prof_info *prof_info, acc_event_info 
*event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (acc_device_type != acc_device_host);
+
+  assert (state == 6
+         || state == 106);
+  STATE_OP (state, ++);
+
+  assert (tool_info != NULL);
+  assert (tool_info->event_info.other_event.event_type == 
acc_ev_compute_construct_start);
+  assert (tool_info->nested != NULL);
+  assert (tool_info->nested->event_info.launch_event.event_type == 
acc_ev_enqueue_launch_start);
+  assert (tool_info->nested->event_info.launch_event.kernel_name != NULL);
+  assert (tool_info->nested->event_info.launch_event.num_gangs >= 1);
+  assert (tool_info->nested->event_info.launch_event.num_workers >= 1);
+  assert (tool_info->nested->event_info.launch_event.vector_length >= 1);
+
+  assert (prof_info->event_type == acc_ev_enqueue_launch_end);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->launch_event.event_type == prof_info->event_type);
+  assert (event_info->launch_event.valid_bytes == 
_ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
+  assert (event_info->launch_event.parent_construct == acc_construct_parallel);
+  assert (event_info->launch_event.implicit == 1);
+  assert (event_info->launch_event.tool_info == tool_info->nested);
+  assert (event_info->launch_event.kernel_name != NULL);
+  assert (strcmp (event_info->launch_event.kernel_name, 
tool_info->nested->event_info.launch_event.kernel_name) == 0);
+  assert (event_info->launch_event.num_gangs == 
tool_info->nested->event_info.launch_event.num_gangs);
+  assert (event_info->launch_event.num_workers == 
tool_info->nested->event_info.launch_event.num_workers);
+  assert (event_info->launch_event.vector_length == 
tool_info->nested->event_info.launch_event.vector_length);
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+
+  free (tool_info->nested->event_info.launch_event.kernel_name);
+  free (tool_info->nested);
+  tool_info->nested = NULL;
+}
+
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, 
acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg = reg_;
+  unreg = unreg_;
+  lookup = lookup_;
+}
+
+int main()
+{
+  STATE_OP (state, = 0);
+  reg (acc_ev_device_init_start, cb_device_init_start, acc_reg);
+  reg (acc_ev_device_init_end, cb_device_init_end, acc_reg);
+  reg (acc_ev_enter_data_start, cb_enter_data_start, acc_reg);
+  reg (acc_ev_enter_data_end, cb_enter_data_end, acc_reg);
+  reg (acc_ev_exit_data_start, cb_exit_data_start, acc_reg);
+  reg (acc_ev_exit_data_end, cb_exit_data_end, acc_reg);
+  reg (acc_ev_compute_construct_start, cb_compute_construct_start, acc_reg);
+  reg (acc_ev_compute_construct_end, cb_compute_construct_end, acc_reg);
+  reg (acc_ev_enqueue_launch_start, cb_enqueue_launch_start, acc_reg);
+  reg (acc_ev_enqueue_launch_end, cb_enqueue_launch_end, acc_reg);
+  assert (state == 0);
+
+  acc_device_type = acc_get_device_type ();
+  acc_device_num = acc_get_device_num (acc_device_type);
+  acc_async = acc_async_sync;
+  assert (state == 0);
+
+  {
+    int state_init;
+#pragma acc parallel COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+    assert (state_init == 4);
+  }
+#ifdef __OPTIMIZE__
+  /* TODO.  With -O2 optimizations enabled, the compiler believes that here
+     "state == 0" still holds.  It's not yet clear what's going on.
+     Mis-optimization across the GOMP function call boundary?  Per its
+     gcc/omp-builtins.def definition, BUILT_IN_GOACC_PARALLEL
+     "GOACC_parallel_keyed" doesn't have a "leaf" attribute, so the compiler
+     must expect calls back into this compilation unit?  */
+  asm volatile ("" : : : "memory");
+#endif
+  assert (state == 10);
+
+  STATE_OP (state, = 100);
+
+  acc_async = 12;
+  {
+    int state_init;
+#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init)
+    {
+      state_init = state;
+    }
+#pragma acc wait
+    assert (state_init == 104);
+  }
+  assert (state == 110);
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c 
libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c
new file mode 100644
index 0000000..a952c7a
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c
@@ -0,0 +1,172 @@
+/* Test the "valid_bytes" magic.  */
+
+#undef NDEBUG
+#include <assert.h>
+
+#include <acc_prof.h>
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+void cb_data_event (acc_prof_info *prof_info, acc_event_info *event_info, 
acc_api_info *api_info)
+{
+  DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
+
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (event_info->data_event.valid_bytes == 
_ACC_DATA_EVENT_INFO_VALID_BYTES);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+}
+
+void cb_launch_event (acc_prof_info *prof_info, acc_event_info *event_info, 
acc_api_info *api_info)
+{
+  DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
+
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (event_info->launch_event.valid_bytes == 
_ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+}
+
+void cb_other_event (acc_prof_info *prof_info, acc_event_info *event_info, 
acc_api_info *api_info)
+{
+  DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
+
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (event_info->other_event.valid_bytes == 
_ACC_OTHER_EVENT_INFO_VALID_BYTES);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+}
+
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, 
acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg_ (acc_ev_device_init_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_device_init_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_device_shutdown_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_device_shutdown_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_runtime_shutdown, cb_other_event, acc_reg);
+  reg_ (acc_ev_create, cb_data_event, acc_reg);
+  reg_ (acc_ev_delete, cb_data_event, acc_reg);
+  reg_ (acc_ev_alloc, cb_data_event, acc_reg);
+  reg_ (acc_ev_free, cb_data_event, acc_reg);
+  reg_ (acc_ev_enter_data_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_enter_data_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_exit_data_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_exit_data_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_update_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_update_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_compute_construct_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_compute_construct_end, cb_other_event, acc_reg);
+  reg_ (acc_ev_enqueue_launch_start, cb_launch_event, acc_reg);
+  reg_ (acc_ev_enqueue_launch_end, cb_launch_event, acc_reg);
+  reg_ (acc_ev_enqueue_upload_start, cb_data_event, acc_reg);
+  reg_ (acc_ev_enqueue_upload_end, cb_data_event, acc_reg);
+  reg_ (acc_ev_enqueue_download_start, cb_data_event, acc_reg);
+  reg_ (acc_ev_enqueue_download_end, cb_data_event, acc_reg);
+  reg_ (acc_ev_wait_start, cb_other_event, acc_reg);
+  reg_ (acc_ev_wait_end, cb_other_event, acc_reg);
+}
+
+/* Basic struct.  */
+typedef struct A
+{
+  int a;
+  int b;
+#define VALID_BYTES_A \
+  _ACC_PROF_VALID_BYTES_STRUCT (A, b, \
+                               _ACC_PROF_VALID_BYTES_BASICTYPE (int))
+} A;
+
+/* Add a "char" field.  */
+typedef struct B
+{
+  int a;
+  int b;
+  char c;
+#define VALID_BYTES_B \
+  _ACC_PROF_VALID_BYTES_STRUCT (B, c, \
+                               _ACC_PROF_VALID_BYTES_BASICTYPE (char))
+} B;
+
+/* Add another "char" field.  */
+typedef struct C
+{
+  int a;
+  int b;
+  char c, d;
+#define VALID_BYTES_C \
+  _ACC_PROF_VALID_BYTES_STRUCT (C, d, \
+                               _ACC_PROF_VALID_BYTES_BASICTYPE (char))
+} C;
+
+/* Add two "void *" fields.  */
+typedef struct D
+{
+  int a;
+  int b;
+  char c, d;
+  void *e;
+  void *f;
+#define VALID_BYTES_D \
+  _ACC_PROF_VALID_BYTES_STRUCT (D, f, \
+                               _ACC_PROF_VALID_BYTES_BASICTYPE (void *))
+} D;
+
+/* Add another three "char" fields.  */
+typedef struct E
+{
+  int a;
+  int b;
+  char c, d;
+  void *e;
+  void *f;
+  char g, h, i;
+#define VALID_BYTES_E \
+  _ACC_PROF_VALID_BYTES_STRUCT (E, i, \
+                               _ACC_PROF_VALID_BYTES_BASICTYPE (char))
+} E;
+
+int main()
+{
+  A A1;
+  DEBUG_printf ("s=%zd, vb=%zd\n", sizeof A1, VALID_BYTES_A);
+  assert (VALID_BYTES_A <= sizeof A1);
+  DEBUG_printf ("&A1=%p, &A1.b=%p\n", &A1, &A1.b);
+  assert (((char *) &A1) + VALID_BYTES_A == (char *) (&A1.b + 1));
+
+  B B1;
+  DEBUG_printf ("s=%zd, vb=%zd\n", sizeof B1, VALID_BYTES_B);
+  assert (VALID_BYTES_B <= sizeof B1);
+  DEBUG_printf ("&B1=%p, &B1.c=%p\n", &B1, &B1.c);
+  assert (((char *) &B1) + VALID_BYTES_B == (char *) (&B1.c + 1));
+
+  assert (VALID_BYTES_B == VALID_BYTES_A + 1 * sizeof (char));
+
+  C C1;
+  DEBUG_printf ("s=%zd, vb=%zd\n", sizeof C1, VALID_BYTES_C);
+  assert (VALID_BYTES_C <= sizeof C1);
+  DEBUG_printf ("&C1=%p, &C1.d=%p\n", &C1, &C1.d);
+  assert (((char *) &C1) + VALID_BYTES_C == (char *) (&C1.d + 1));
+
+  assert (VALID_BYTES_C == VALID_BYTES_B + 1 * sizeof (char));
+
+  D D1;
+  DEBUG_printf ("s=%zd, vb=%zd\n", sizeof D1, VALID_BYTES_D);
+  assert (VALID_BYTES_D <= sizeof D1);
+  DEBUG_printf ("&D1=%p, &D1.f=%p\n", &D1, &D1.f);
+  assert (((char *) &D1) + VALID_BYTES_D == (char *) (&D1.f + 1));
+
+  assert (VALID_BYTES_D > VALID_BYTES_C);
+
+  E E1;
+  DEBUG_printf ("s=%zd, vb=%zd\n", sizeof E1, VALID_BYTES_E);
+  assert (VALID_BYTES_E <= sizeof E1);
+  DEBUG_printf ("&E1=%p, &E1.i=%p\n", &E1, &E1.i);
+  assert (((char *) &E1) + VALID_BYTES_E == (char *) (&E1.i + 1));
+
+  assert (VALID_BYTES_E == VALID_BYTES_D + 3 * sizeof (char));
+
+#pragma acc parallel
+  {
+  }
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c 
libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c
new file mode 100644
index 0000000..b0b8934
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c
@@ -0,0 +1,55 @@
+/* Test "acc_prof_info"'s  "version" field.  */
+
+#undef NDEBUG
+#include <assert.h>
+
+#include <acc_prof.h>
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+void cb_any_event (acc_prof_info *prof_info, acc_event_info *event_info, 
acc_api_info *api_info)
+{
+  DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type);
+
+  assert (prof_info->version == 201510);
+}
+
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, 
acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg_ (acc_ev_device_init_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_device_init_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_device_shutdown_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_device_shutdown_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_runtime_shutdown, cb_any_event, acc_reg);
+  reg_ (acc_ev_create, cb_any_event, acc_reg);
+  reg_ (acc_ev_delete, cb_any_event, acc_reg);
+  reg_ (acc_ev_alloc, cb_any_event, acc_reg);
+  reg_ (acc_ev_free, cb_any_event, acc_reg);
+  reg_ (acc_ev_enter_data_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_enter_data_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_exit_data_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_exit_data_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_update_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_update_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_compute_construct_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_compute_construct_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_launch_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_launch_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_upload_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_upload_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_download_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_enqueue_download_end, cb_any_event, acc_reg);
+  reg_ (acc_ev_wait_start, cb_any_event, acc_reg);
+  reg_ (acc_ev_wait_end, cb_any_event, acc_reg);
+}
+
+int main()
+{
+#pragma acc parallel
+  {
+  }
+
+  return 0;
+}


Grüße
 Thomas

OpenACC 2.5 Profiling Interface (incomplete)

Reply via email to