Hi! The 2.5 versions of the OpenACC standard added a new chapter "Profiling Interface". In r245784, I committed incomplete support to gomp-4_0-branch. I plan to continue working on this, but wanted to synchronize at this point.
commit b22a85fe7f3daeb48460e7aa28606d0cdb799f69 Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4> Date: Tue Feb 28 17:36:03 2017 +0000 OpenACC 2.5 Profiling Interface (incomplete) libgomp/ * acc_prof.h: New file. * oacc-profiling-acc_register_library.c: Likewise. * oacc-profiling.c: Likewise. * Makefile.am (nodist_libsubinclude_HEADERS, libgomp_la_SOURCES): Add these, respectively. * Makefile.in: Regenerate. * libgomp/config/nvptx/oacc-profiling-acc_register_library.c: New empty file. * libgomp/config/nvptx/oacc-profiling.c: Likewise. * env.c (initialize_env): Call goacc_profiling_initialize. * libgomp-plugin.c: New function GOMP_PLUGIN_goacc_profiling_dispatch. * libgomp-plugin.h: Declare function GOMP_PLUGIN_goacc_profiling_dispatch. * oacc-plugin.c: New function GOMP_PLUGIN_goacc_thread. * oacc-plugin.h: Declare function GOMP_PLUGIN_goacc_thread. * libgomp.map (OACC_2.5): Add acc_prof_lookup, acc_prof_register, acc_prof_unregister, and acc_register_library. Add GOMP_PLUGIN_goacc_profiling_dispatch, and GOMP_PLUGIN_goacc_thread with new GOMP_PLUGIN_1.3 symbol version. * oacc-int.h (struct goacc_thread): Add "acc_prof_info *prof_info", "acc_api_info *api_info", and "bool prof_callbacks_enabled" members. Declare functions goacc_profiling_initialize, goacc_profiling_dispatch_p, and goacc_profiling_dispatch. * oacc-init.c (acc_init_1): Add "acc_construct_t", and "int" formal parameters. Adjust all users. (acc_init_1, goacc_attach_host_thread_to_device, acc_init) (goacc_lazy_initialize): Update for OpenACC Profiling Interface. * oacc-parallel.c (GOACC_parallel_keyed): Likewise. * plugin/plugin-nvptx.c (cuda_map_create, cuda_map_destroy) (map_init, map_fini, map_pop, map_push): Add "struct goacc_thread *" formal parameter. Adjust all users. (select_stream_for_async, event_gc, nvptx_exec, nvptx_host2dev) (nvptx_dev2host, nvptx_set_cuda_stream): Call GOMP_PLUGIN_goacc_thread instead of nvptx_thread. (cuda_map_create, cuda_map_destroy, nvptx_exec, nvptx_alloc) (nvptx_free, nvptx_host2dev, nvptx_dev2host): Update for OpenACC Profiling Interface. * libgomp.texi: New chapter "OpenACC Profiling Interface". * testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c: New file. * testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c: Likewise. * testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c: Likewise. git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@245784 138bc75d-0d04-0410-961f-82ee72b054a4 --- libgomp/ChangeLog.gomp | 50 ++ libgomp/Makefile.am | 5 +- libgomp/Makefile.in | 10 +- libgomp/acc_prof.h | 237 +++++++ .../nvptx/oacc-profiling-acc_register_library.c | 0 libgomp/config/nvptx/oacc-profiling.c | 0 libgomp/env.c | 3 +- libgomp/libgomp-plugin.c | 9 + libgomp/libgomp-plugin.h | 6 + libgomp/libgomp.map | 11 + libgomp/libgomp.texi | 246 +++++++ libgomp/oacc-init.c | 68 +- libgomp/oacc-int.h | 12 + libgomp/oacc-parallel.c | 126 +++- libgomp/oacc-plugin.c | 13 + libgomp/oacc-plugin.h | 3 + ...gin.h => oacc-profiling-acc_register_library.c} | 19 +- libgomp/oacc-profiling.c | 576 +++++++++++++++++ libgomp/plugin/plugin-nvptx.c | 315 ++++++++- .../acc_prof-dispatch-1.c | 344 ++++++++++ .../libgomp.oacc-c-c++-common/acc_prof-init-1.c | 306 +++++++++ .../acc_prof-parallel-1.c | 703 +++++++++++++++++++++ .../acc_prof-valid_bytes-1.c | 172 +++++ .../libgomp.oacc-c-c++-common/acc_prof-version-1.c | 55 ++ 24 files changed, 3243 insertions(+), 46 deletions(-) diff --git libgomp/ChangeLog.gomp libgomp/ChangeLog.gomp index af6a28b..acdb004 100644 --- libgomp/ChangeLog.gomp +++ libgomp/ChangeLog.gomp @@ -1,5 +1,55 @@ 2017-02-28 Thomas Schwinge <tho...@codesourcery.com> + * acc_prof.h: New file. + * oacc-profiling-acc_register_library.c: Likewise. + * oacc-profiling.c: Likewise. + * Makefile.am (nodist_libsubinclude_HEADERS, libgomp_la_SOURCES): + Add these, respectively. + * Makefile.in: Regenerate. + * libgomp/config/nvptx/oacc-profiling-acc_register_library.c: + New empty file. + * libgomp/config/nvptx/oacc-profiling.c: Likewise. + * env.c (initialize_env): Call goacc_profiling_initialize. + * libgomp-plugin.c: New function + GOMP_PLUGIN_goacc_profiling_dispatch. + * libgomp-plugin.h: Declare function + GOMP_PLUGIN_goacc_profiling_dispatch. + * oacc-plugin.c: New function GOMP_PLUGIN_goacc_thread. + * oacc-plugin.h: Declare function GOMP_PLUGIN_goacc_thread. + * libgomp.map (OACC_2.5): Add acc_prof_lookup, acc_prof_register, + acc_prof_unregister, and acc_register_library. + Add GOMP_PLUGIN_goacc_profiling_dispatch, and + GOMP_PLUGIN_goacc_thread with new GOMP_PLUGIN_1.3 symbol version. + * oacc-int.h (struct goacc_thread): Add "acc_prof_info + *prof_info", "acc_api_info *api_info", and "bool + prof_callbacks_enabled" members. + Declare functions goacc_profiling_initialize, + goacc_profiling_dispatch_p, and goacc_profiling_dispatch. + * oacc-init.c (acc_init_1): Add "acc_construct_t", and "int" + formal parameters. Adjust all users. + (acc_init_1, goacc_attach_host_thread_to_device, acc_init) + (goacc_lazy_initialize): Update for OpenACC Profiling Interface. + * oacc-parallel.c (GOACC_parallel_keyed): Likewise. + * plugin/plugin-nvptx.c (cuda_map_create, cuda_map_destroy) + (map_init, map_fini, map_pop, map_push): Add "struct goacc_thread + *" formal parameter. Adjust all users. + (select_stream_for_async, event_gc, nvptx_exec, nvptx_host2dev) + (nvptx_dev2host, nvptx_set_cuda_stream): Call + GOMP_PLUGIN_goacc_thread instead of nvptx_thread. + (cuda_map_create, cuda_map_destroy, nvptx_exec, nvptx_alloc) + (nvptx_free, nvptx_host2dev, nvptx_dev2host): Update for OpenACC + Profiling Interface. + * libgomp.texi: New chapter "OpenACC Profiling Interface". + * testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c: New + file. + * testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c: Likewise. + * testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c: + Likewise. + * testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c: + Likewise. + * testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c: + Likewise. + * oacc-init.c (goacc_register, acc_init, goacc_lazy_initialize): Fix locking of cached_base_dev (guarded by acc_device_lock). (goacc_lazy_initialize): Don't call acc_init; copy code here, diff --git libgomp/Makefile.am libgomp/Makefile.am index a3e1c2b..9a75c48 100644 --- libgomp/Makefile.am +++ libgomp/Makefile.am @@ -63,7 +63,8 @@ libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \ task.c team.c work.c lock.c mutex.c proc.c sem.c bar.c ptrlock.c \ time.c fortran.c affinity.c target.c splay-tree.c libgomp-plugin.c \ oacc-parallel.c oacc-host.c oacc-init.c oacc-mem.c oacc-async.c \ - oacc-plugin.c oacc-cuda.c priority_queue.c + oacc-plugin.c oacc-cuda.c priority_queue.c \ + oacc-profiling.c oacc-profiling-acc_register_library.c include $(top_srcdir)/plugin/Makefrag.am @@ -72,7 +73,7 @@ libgomp_la_SOURCES += openacc.f90 endif nodist_noinst_HEADERS = libgomp_f.h -nodist_libsubinclude_HEADERS = omp.h openacc.h +nodist_libsubinclude_HEADERS = acc_prof.h omp.h openacc.h if USE_FORTRAN nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod \ openacc_lib.h openacc.f90 openacc.mod openacc_kinds.mod diff --git libgomp/Makefile.in libgomp/Makefile.in index 88c8517..dac2b4e 100644 --- libgomp/Makefile.in +++ libgomp/Makefile.in @@ -180,7 +180,8 @@ am_libgomp_la_OBJECTS = alloc.lo barrier.lo critical.lo env.lo \ fortran.lo affinity.lo target.lo splay-tree.lo \ libgomp-plugin.lo oacc-parallel.lo oacc-host.lo oacc-init.lo \ oacc-mem.lo oacc-async.lo oacc-plugin.lo oacc-cuda.lo \ - priority_queue.lo $(am__objects_1) + priority_queue.lo oacc-profiling.lo \ + oacc-profiling-acc_register_library.lo $(am__objects_1) libgomp_la_OBJECTS = $(am_libgomp_la_OBJECTS) DEFAULT_INCLUDES = -I.@am__isrc@ depcomp = $(SHELL) $(top_srcdir)/../depcomp @@ -435,7 +436,8 @@ libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \ bar.c ptrlock.c time.c fortran.c affinity.c target.c \ splay-tree.c libgomp-plugin.c oacc-parallel.c oacc-host.c \ oacc-init.c oacc-mem.c oacc-async.c oacc-plugin.c oacc-cuda.c \ - priority_queue.c $(am__append_3) + priority_queue.c oacc-profiling.c \ + oacc-profiling-acc_register_library.c $(am__append_3) # Nvidia PTX OpenACC plugin. @PLUGIN_NVPTX_TRUE@libgomp_plugin_nvptx_version_info = -version-info $(libtool_VERSION) @@ -457,7 +459,7 @@ libgomp_la_SOURCES = alloc.c barrier.c critical.c env.c error.c iter.c \ @PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_LIBADD = libgomp.la $(PLUGIN_HSA_LIBS) @PLUGIN_HSA_TRUE@libgomp_plugin_hsa_la_LIBTOOLFLAGS = --tag=disable-static nodist_noinst_HEADERS = libgomp_f.h -nodist_libsubinclude_HEADERS = omp.h openacc.h +nodist_libsubinclude_HEADERS = acc_prof.h omp.h openacc.h @USE_FORTRAN_TRUE@nodist_finclude_HEADERS = omp_lib.h omp_lib.f90 omp_lib.mod omp_lib_kinds.mod \ @USE_FORTRAN_TRUE@ openacc_lib.h openacc.f90 openacc.mod openacc_kinds.mod @@ -620,6 +622,8 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-mem.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-parallel.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-plugin.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-profiling-acc_register_library.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/oacc-profiling.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/ordered.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/parallel.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/priority_queue.Plo@am__quote@ diff --git libgomp/acc_prof.h libgomp/acc_prof.h new file mode 100644 index 0000000..9247790 --- /dev/null +++ libgomp/acc_prof.h @@ -0,0 +1,237 @@ +/* OpenACC Runtime Library: Profiling Interface + + Copyright (C) 2017 Free Software Foundation, Inc. + + Contributed by Mentor Embedded. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _ACC_PROF_H +#define _ACC_PROF_H 1 + +/* The OpenACC standard doesn't say so explicitly, but as its Profiling + Interface makes use of, for example, <openacc.h>'s acc_device_t, we + supposedly are to #include that file here. */ +#include <openacc.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* OpenACC 2.5, 5. Profiling Interface, 5.1. Events. */ + +typedef enum acc_event_t +{ + acc_ev_none = 0, + acc_ev_device_init_start, + acc_ev_device_init_end, + acc_ev_device_shutdown_start, + acc_ev_device_shutdown_end, + acc_ev_runtime_shutdown, + acc_ev_create, + acc_ev_delete, + acc_ev_alloc, + acc_ev_free, + acc_ev_enter_data_start, + acc_ev_enter_data_end, + acc_ev_exit_data_start, + acc_ev_exit_data_end, + acc_ev_update_start, + acc_ev_update_end, + acc_ev_compute_construct_start, + acc_ev_compute_construct_end, + acc_ev_enqueue_launch_start, + acc_ev_enqueue_launch_end, + acc_ev_enqueue_upload_start, + acc_ev_enqueue_upload_end, + acc_ev_enqueue_download_start, + acc_ev_enqueue_download_end, + acc_ev_wait_start, + acc_ev_wait_end, + acc_ev_last +} acc_event_t; + + +/* OpenACC 2.5, 5. Profiling Interface, 5.2. Callbacks Signature. */ + +//TODO +/* 'In all cases, a datatype of "size_t" means a 32-bit integer for a 32-bit + binary and a 64-bit integer for a 64-bit binary, and a datatype "int" means + a 32-bit integer for both 32-bit and 64-bit binaries'. */ +typedef long int _acc_prof_size_t; +typedef int _acc_prof_int_t; + +/* Internal helpers: a struct's "valid_bytes" may be less than its "sizeof". */ +#define _ACC_PROF_VALID_BYTES_STRUCT(_struct, _lastfield, _valid_bytes_lastfield) \ + offsetof (_struct, _lastfield) + (_valid_bytes_lastfield) +#if 0 /* Untested. */ +#define _ACC_PROF_VALID_BYTES_TYPE_N(_type, _n, _valid_bytes_type) \ + ((_n - 1) * sizeof (_type) + (_valid_bytes_type)) +#endif +#define _ACC_PROF_VALID_BYTES_BASICTYPE(_basictype) \ + (sizeof (_basictype)) + +typedef struct acc_prof_info +{ + acc_event_t event_type; + _acc_prof_int_t valid_bytes; + _acc_prof_int_t version; + acc_device_t device_type; + _acc_prof_int_t device_number; + _acc_prof_int_t thread_id; + _acc_prof_size_t async; + _acc_prof_size_t async_queue; + char *src_file; + char *func_name; + _acc_prof_int_t line_no, end_line_no; + _acc_prof_int_t func_line_no, func_end_line_no; +#define _ACC_PROF_INFO_VALID_BYTES \ + _ACC_PROF_VALID_BYTES_STRUCT (acc_prof_info, func_end_line_no, \ + _ACC_PROF_VALID_BYTES_BASICTYPE (_acc_prof_int_t)) +} acc_prof_info; + +/* We implement the OpenACC 2.5 Profiling Interface. */ +#define _ACC_PROF_INFO_VERSION 201510 + +typedef enum acc_construct_t +{ + acc_construct_parallel = 0, + acc_construct_kernels, + acc_construct_loop, + acc_construct_data, + acc_construct_enter_data, + acc_construct_exit_data, + acc_construct_host_data, + acc_construct_atomic, + acc_construct_declare, + acc_construct_init, + acc_construct_shutdown, + acc_construct_set, + acc_construct_update, + acc_construct_routine, + acc_construct_wait, + acc_construct_runtime_api +} acc_construct_t; + +typedef struct acc_data_event_info +{ + acc_event_t event_type; + _acc_prof_int_t valid_bytes; + acc_construct_t parent_construct; + _acc_prof_int_t implicit; + void *tool_info; + char *var_name; + _acc_prof_size_t bytes; + void *host_ptr; + void *device_ptr; +#define _ACC_DATA_EVENT_INFO_VALID_BYTES \ + _ACC_PROF_VALID_BYTES_STRUCT (acc_data_event_info, device_ptr, \ + _ACC_PROF_VALID_BYTES_BASICTYPE (void *)) +} acc_data_event_info; + +typedef struct acc_launch_event_info +{ + acc_event_t event_type; + _acc_prof_int_t valid_bytes; + acc_construct_t parent_construct; + _acc_prof_int_t implicit; + void *tool_info; + char *kernel_name; + _acc_prof_size_t num_gangs, num_workers, vector_length; +#define _ACC_LAUNCH_EVENT_INFO_VALID_BYTES \ + _ACC_PROF_VALID_BYTES_STRUCT (acc_launch_event_info, vector_length, \ + _ACC_PROF_VALID_BYTES_BASICTYPE (_acc_prof_size_t)) +} acc_launch_event_info; + +typedef struct acc_other_event_info +{ + acc_event_t event_type; + _acc_prof_int_t valid_bytes; + acc_construct_t parent_construct; + _acc_prof_int_t implicit; + void *tool_info; +#define _ACC_OTHER_EVENT_INFO_VALID_BYTES \ + _ACC_PROF_VALID_BYTES_STRUCT (acc_other_event_info, tool_info, \ + _ACC_PROF_VALID_BYTES_BASICTYPE (void *)) +} acc_other_event_info; + +typedef union acc_event_info +{ + acc_event_t event_type; + acc_data_event_info data_event; + acc_launch_event_info launch_event; + acc_other_event_info other_event; +} acc_event_info; + +//TODO: should these relate to acc_device_t values? +typedef enum acc_device_api +{ + acc_device_api_none = 0, + acc_device_api_cuda, + acc_device_api_opencl, + acc_device_api_coi, + acc_device_api_other +} acc_device_api; + +typedef struct acc_api_info +{ + acc_device_api device_api; + _acc_prof_int_t valid_bytes; + acc_device_t device_type; + _acc_prof_int_t vendor; + void *device_handle; + void *context_handle; + void *async_handle; +#define _ACC_API_INFO_VALID_BYTES \ + _ACC_PROF_VALID_BYTES_STRUCT (acc_api_info, async_handle, \ + _ACC_PROF_VALID_BYTES_BASICTYPE (void *)) +} acc_api_info; + +typedef void (*acc_prof_callback) (acc_prof_info *, acc_event_info *, + acc_api_info *); + + +/* OpenACC 2.5, 5. Profiling Interface, 5.3. Loading the Library. */ + +typedef enum acc_register_t +{ + acc_reg = 0, + acc_toggle = 1, + acc_toggle_per_thread = 2 +} acc_register_t; + +typedef void (*acc_prof_reg) (acc_event_t, acc_prof_callback, acc_register_t); +extern void acc_prof_register (acc_event_t, acc_prof_callback, acc_register_t) __GOACC_NOTHROW; +extern void acc_prof_unregister (acc_event_t, acc_prof_callback, acc_register_t) __GOACC_NOTHROW; +typedef void (*acc_query_fn) (); +typedef acc_query_fn (*acc_prof_lookup_func) (const char *); +extern acc_query_fn acc_prof_lookup (const char *) __GOACC_NOTHROW; +/* Don't tag "acc_register_library" as "__GOACC_NOTHROW": this function can be + overridden by the application, and must be expected to do "everything". */ +extern void acc_register_library (acc_prof_reg, acc_prof_reg, acc_prof_lookup_func); + +#ifdef __cplusplus +} +#endif + +#endif /* _ACC_PROF_H */ diff --git libgomp/config/nvptx/oacc-profiling-acc_register_library.c libgomp/config/nvptx/oacc-profiling-acc_register_library.c new file mode 100644 index 0000000..e69de29 diff --git libgomp/config/nvptx/oacc-profiling.c libgomp/config/nvptx/oacc-profiling.c new file mode 100644 index 0000000..e69de29 diff --git libgomp/env.c libgomp/env.c index ac05c3b..75f8272 100644 --- libgomp/env.c +++ libgomp/env.c @@ -1301,8 +1301,9 @@ initialize_env (void) parse_acc_device_type (); goacc_runtime_initialize (); -} + goacc_profiling_initialize (); +} /* The public OpenMP API routines that access these variables. */ diff --git libgomp/libgomp-plugin.c libgomp/libgomp-plugin.c index 9bd4047..f834306 100644 --- libgomp/libgomp-plugin.c +++ libgomp/libgomp-plugin.c @@ -29,6 +29,7 @@ #include <stdlib.h> #include "libgomp.h" +#include "oacc-int.h" #include "libgomp-plugin.h" void * @@ -78,3 +79,11 @@ GOMP_PLUGIN_fatal (const char *msg, ...) gomp_vfatal (msg, ap); va_end (ap); } + +void +GOMP_PLUGIN_goacc_profiling_dispatch (acc_prof_info *prof_info, + acc_event_info *event_info, + acc_api_info *api_info) +{ + goacc_profiling_dispatch (prof_info, event_info, api_info); +} diff --git libgomp/libgomp-plugin.h libgomp/libgomp-plugin.h index ff81350..37d9d23 100644 --- libgomp/libgomp-plugin.h +++ libgomp/libgomp-plugin.h @@ -33,6 +33,8 @@ #include <stddef.h> #include <stdint.h> +#include "acc_prof.h" + #ifdef __cplusplus extern "C" { #endif @@ -74,6 +76,10 @@ extern void GOMP_PLUGIN_error (const char *, ...) extern void GOMP_PLUGIN_fatal (const char *, ...) __attribute__ ((noreturn, format (printf, 1, 2))); +extern void GOMP_PLUGIN_goacc_profiling_dispatch (acc_prof_info *, + acc_event_info *, + acc_api_info *); + /* Prototypes for functions implemented by libgomp plugins. */ extern const char *GOMP_OFFLOAD_get_name (void); extern unsigned int GOMP_OFFLOAD_get_caps (void); diff --git libgomp/libgomp.map libgomp/libgomp.map index 2c9a13d..b76a5dd 100644 --- libgomp/libgomp.map +++ libgomp/libgomp.map @@ -400,6 +400,10 @@ OACC_2.5 { acc_get_default_async_h_; acc_memcpy_from_device_async; acc_memcpy_to_device_async; + acc_prof_lookup; + acc_prof_register; + acc_prof_unregister; + acc_register_library; acc_set_default_async; acc_set_default_async_h_; acc_update_device_async; @@ -456,3 +460,10 @@ GOMP_PLUGIN_1.2 { global: GOMP_PLUGIN_acc_thread_default_async; } GOMP_PLUGIN_1.1; + +# TODO +GOMP_PLUGIN_1.3 { + global: + GOMP_PLUGIN_goacc_profiling_dispatch; + GOMP_PLUGIN_goacc_thread; +} GOMP_PLUGIN_1.2; diff --git libgomp/libgomp.texi libgomp/libgomp.texi index 7cb677c..93365cd 100644 --- libgomp/libgomp.texi +++ libgomp/libgomp.texi @@ -111,6 +111,7 @@ changed to GNU Offloading and Multi Processing Runtime Library. asynchronous operations. * OpenACC Library Interoperability:: OpenACC library interoperability with the NVIDIA CUBLAS library. +* OpenACC Profiling Interface:: * The libgomp ABI:: Notes on the external ABI presented by libgomp. * Reporting Bugs:: How to report bugs in the GNU Offloading and Multi Processing Runtime Library. @@ -3085,6 +3086,251 @@ Application Programming Interface”, Version 2.0.} @c --------------------------------------------------------------------- +@c OpenACC Profiling Interface +@c --------------------------------------------------------------------- + +@node OpenACC Profiling Interface +@chapter OpenACC Profiling Interface + +@section Implementation Status and Implementation-Defined Behavior + +We're not yet implementing the whole Profiling Interface as defined by +the OpenACC 2.5 specification. Also, the specification doesn't +clearly define some aspects of its Profiling Interface, so we're +clarifying these as @emph{implementation-defined behavior} here. We +already have reported to the OpenACC Technical Committee some issues, +and will report more, later on. + +This implementation of the OpenACC Profiling Interface is tuned to +keep the performance impact as low as possible when it's not in use. +This is relevant, as the Profiling Interface affects all the +@emph{hot} code paths (in the target code, not in the offloaded code). +Users of the OpenACC Profiling Interface can be expected to understand +that performance will always be impacted to some degree: for example, +because of the @emph{runtime} (libgomp) calling into a third-party +@emph{library} for every event that has been registered. + +This implementation of the OpenACC Profiling Interface has not yet +been validated for use in multi-threaded code. This is a more general +issue; see CSTS-110 @cite{Make sure all OpenACC entry points in +libgomp are thread-safe}. + +The @code{acc_prof_lookup} interface is not implemented, and +@code{acc_register_library} will receive @code{NULL} for its +@code{lookup} parameter. + +Remarks about data provided to callbacks: + +@table @asis + +@item @code{acc_prof_info.event_type} +It is not clear if for @emph{nested} event callbacks (for example, +@code{acc_ev_enqueue_launch_start} as part of a parent compute +construct), this should be set for the nested event +(@code{acc_ev_enqueue_launch_start}), or if the value of the parent +construct should remain (@code{acc_ev_compute_construct_start}). In +this implementation, the value will generally correspond to the +innermost nested event type. + +@item @code{acc_prof_info.device_type} +@itemize + +@item +For @code{acc_ev_compute_construct_start}, and in presence of an +@code{if} clause with @emph{false} argument, this will still refer to +the offloading device type; unsure whether that's the expected +behavior. + +@item +Complementary to the item before, for +@code{acc_ev_compute_construct_end}, this is set to +@code{acc_device_host} in presence of an @code{if} clause with +@emph{false} argument, unsure whether that's the expected behavior. + +@end itemize + +@item @code{acc_prof_info.thread_id} +Always @code{-1}; not yet implemented. + +@item @code{acc_prof_info.async} +@itemize + +@item +Not yet implemented correctly for +@code{acc_ev_compute_construct_start}. + +@item +In a compute construct, for host-fallback +execution/@code{acc_device_host} it will always be +@code{acc_async_sync}; unsure if that is the expected behavior. + +@item +For @code{acc_ev_device_init_start} and @code{acc_ev_device_init_end}, +it will always be @code{acc_async_sync}; unsure if that is the +expected behavior. + +@end itemize + +@item @code{acc_prof_info.async_queue} +There is no @cite{limited number of asynchronous queues} in libgomp. +We define this to always have the same value as +@code{acc_prof_info.async}. + +@item @code{acc_prof_info.file} +Always @code{NULL}; not yet implemented. + +@item @code{acc_prof_info.func_name} +Always @code{NULL}; not yet implemented. + +@item @code{acc_prof_info.line_no} +Always @code{-1}; not yet implemented. + +@item @code{acc_prof_info.end_line_no} +Always @code{-1}; not yet implemented. + +@item @code{acc_prof_info.func_line_no} +Always @code{-1}; not yet implemented. + +@item @code{acc_prof_info.func_end_line_no} +Always @code{-1}; not yet implemented. + +@item @code{acc_event_info.event_type}, @code{acc_event_info.*.event_type} +Relating to @code{acc_prof_info.event_type} discussed above, in this +implementation, this will always be the same value as +@code{acc_prof_info.event_type}. + +@item @code{acc_event_info.*.parent_construct} +@itemize + +@item +Will be @code{acc_construct_parallel} for OpenACC kernels constructs; +should be @code{acc_construct_kernels}. + +@item +For implicit @code{acc_ev_device_init_start}, +@code{acc_ev_device_init_end}, and explicit as well as implicit +@code{acc_ev_alloc}, @code{acc_ev_free}, +@code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end}, +@code{acc_ev_enqueue_download_start}, and +@code{acc_ev_enqueue_download_end}, will be +@code{acc_construct_parallel}; should reflect the real parent +construct. + +@end itemize + +@item @code{acc_event_info.*.implicit} +For @code{acc_ev_alloc}, @code{acc_ev_free}, +@code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end}, +@code{acc_ev_enqueue_download_start}, and +@code{acc_ev_enqueue_download_end}, this currently will be @code{1} +also for explicit usage. + +@item @code{acc_event_info.data_event.var_name} +Always @code{NULL}; not yet implemented. + +@item @code{acc_event_info.data_event.host_ptr} +For @code{acc_ev_alloc}, and @code{acc_ev_free}, this is always +@code{NULL}. + +@item @code{typedef union acc_api_info} +@dots{} as printed in @cite{5.2.3. Third Argument: API-Specific +Information}, should obviously be @code{typedef @emph{struct} +acc_api_info}. + +@item @code{acc_api_info.device_api} +Possibly not yet implemented correctly for +@code{acc_ev_compute_construct_start}, +@code{acc_ev_device_init_start}, @code{acc_ev_device_init_end}: +will always be @code{acc_device_api_none} for these event types. +For @code{acc_ev_enter_data_start}, it will be +@code{acc_device_api_none} in some cases. + +@item @code{acc_api_info.device_type} +Always the same as @code{acc_prof_info.device_type}. + +@item @code{acc_api_info.vendor} +Always @code{-1}; not yet implemented. + +@item @code{acc_api_info.device_handle} +Always @code{NULL}; not yet implemented. + +@item @code{acc_api_info.context_handle} +Always @code{NULL}; not yet implemented. + +@item @code{acc_api_info.async_handle} +Always @code{NULL}; not yet implemented. + +@end table + +Remarks about certain event types: + +@table @asis + +@item @code{acc_ev_device_init_start}, @code{acc_ev_device_init_end} +@itemize + +@item +@c See DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT in +@c libgomp.oacc-c-c++-common/acc_prof-parallel-1.c. +Whan a compute construct triggers implicit +@code{acc_ev_device_init_start} and @code{acc_ev_device_init_end} +events, they currently aren't @emph{nested within} the corresponding +@code{acc_ev_compute_construct_start} and +@code{acc_ev_compute_construct_end}, but they're currently observed +@emph{before} @code{acc_ev_compute_construct_start}. It is not clear +what to do: the standard asks us provide a lot of details to the +@code{acc_ev_compute_construct_start} callback, without (implicitly) +initializing a device before? + +@item +Callbacks for these event types will not be invoked for calls to the +@code{acc_set_device_type} and @code{acc_set_device_num} functions; +it's not clear if they should be. + +@end itemize + +@end table + +Callbacks for the following event types will be invoked, but dispatch +and information provided therein has not yet been thoroughly reviewed: + +@itemize +@item @code{acc_ev_alloc} +@item @code{acc_ev_free} +@item @code{acc_ev_enqueue_upload_start}, @code{acc_ev_enqueue_upload_end} +@item @code{acc_ev_enqueue_download_start}, @code{acc_ev_enqueue_download_end} +@end itemize + +During device initialization, and finalization, respectively, +callbacks for the following event types will not yet be invoked: + +@itemize +@item @code{acc_ev_alloc} +@item @code{acc_ev_free} +@end itemize + +Callbacks for the following event types will currently only be invoked +for (implicit) events within compute constructs: + +@itemize +@item @code{acc_ev_enter_data_start}, @code{acc_ev_enter_data_end} +@item @code{acc_ev_exit_data_start}, @code{acc_ev_exit_data_end} +@end itemize + +Callbacks for the following event types have not yet been implemented, +so currently won't be invoked: + +@itemize +@item @code{acc_ev_device_shutdown_start}, @code{acc_ev_device_shutdown_end} +@item @code{acc_ev_runtime_shutdown} +@item @code{acc_ev_create}, @code{acc_ev_delete} +@item @code{acc_ev_update_start}, @code{acc_ev_update_end} +@item @code{acc_ev_wait_start}, @code{acc_ev_wait_end} +@end itemize + + + +@c --------------------------------------------------------------------- @c The libgomp ABI @c --------------------------------------------------------------------- diff --git libgomp/oacc-init.c libgomp/oacc-init.c index 05bb663..415c0fa 100644 --- libgomp/oacc-init.c +++ libgomp/oacc-init.c @@ -218,8 +218,55 @@ acc_dev_num_out_of_range (acc_device_t d, int ord, int ndevs) held before calling this function. */ static struct gomp_device_descr * -acc_init_1 (acc_device_t d) +acc_init_1 (acc_device_t d, acc_construct_t parent_construct, int implicit) { + bool profiling_dispatch_p + = __builtin_expect (goacc_profiling_dispatch_p (), false); + + acc_prof_info prof_info; + if (profiling_dispatch_p) + { + prof_info.event_type = acc_ev_device_init_start; + prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES; + prof_info.version = _ACC_PROF_INFO_VERSION; + prof_info.device_type = d; + prof_info.device_number = goacc_device_num; + prof_info.thread_id = -1; //TODO + prof_info.async = acc_async_sync; //TODO + /* See <https://github.com/OpenACC/openacc-spec/issues/71>. */ + prof_info.async_queue = prof_info.async; + prof_info.src_file = NULL; //TODO + prof_info.func_name = NULL; //TODO + prof_info.line_no = -1; //TODO + prof_info.end_line_no = -1; //TODO + prof_info.func_line_no = -1; //TODO + prof_info.func_end_line_no = -1; //TODO + } + acc_event_info device_init_event_info; + if (profiling_dispatch_p) + { + device_init_event_info.other_event.event_type = prof_info.event_type; + device_init_event_info.other_event.valid_bytes + = _ACC_OTHER_EVENT_INFO_VALID_BYTES; + device_init_event_info.other_event.parent_construct = parent_construct; + device_init_event_info.other_event.implicit = implicit; + device_init_event_info.other_event.tool_info = NULL; + } + acc_api_info api_info; + if (profiling_dispatch_p) + { + api_info.device_api = acc_device_api_none; //TODO + api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES; + api_info.device_type = prof_info.device_type; + api_info.vendor = -1; //TODO + api_info.device_handle = NULL; //TODO + api_info.context_handle = NULL; //TODO + api_info.async_handle = NULL; //TODO + } + + if (profiling_dispatch_p) + goacc_profiling_dispatch (&prof_info, &device_init_event_info, &api_info); + struct gomp_device_descr *base_dev, *acc_dev; int ndevs; @@ -242,6 +289,14 @@ acc_init_1 (acc_device_t d) gomp_init_device (acc_dev); gomp_mutex_unlock (&acc_dev->lock); + if (profiling_dispatch_p) + { + prof_info.event_type = acc_ev_device_init_end; + device_init_event_info.other_event.event_type = prof_info.event_type; + goacc_profiling_dispatch (&prof_info, &device_init_event_info, + &api_info); + } + return base_dev; } @@ -434,7 +489,11 @@ goacc_attach_host_thread_to_device (int ord) thr->dev = acc_dev = &base_dev[ord]; thr->saved_bound_dev = NULL; thr->mapped_data = NULL; - + thr->prof_info = NULL; + thr->api_info = NULL; + /* Initially, all callbacks for all events are enabled. */ + thr->prof_callbacks_enabled = true; + thr->target_tls = acc_dev->openacc.create_thread_data_func (ord); @@ -452,7 +511,7 @@ acc_init (acc_device_t d) gomp_mutex_lock (&acc_device_lock); if (!cached_base_dev) gomp_init_targets_once (); - cached_base_dev = acc_init_1 (d); + cached_base_dev = acc_init_1 (d, acc_construct_runtime_api, 0); gomp_mutex_unlock (&acc_device_lock); goacc_attach_host_thread_to_device (-1); @@ -708,7 +767,8 @@ goacc_lazy_initialize (void) if (!cached_base_dev) { gomp_init_targets_once (); - cached_base_dev = acc_init_1 (acc_device_default); + cached_base_dev = acc_init_1 (acc_device_default, + /* TODO */ acc_construct_parallel, 1); } gomp_mutex_unlock (&acc_device_lock); diff --git libgomp/oacc-int.h libgomp/oacc-int.h index 1f7adb4..8a62029 100644 --- libgomp/oacc-int.h +++ libgomp/oacc-int.h @@ -40,6 +40,7 @@ #include "openacc.h" #include "config.h" +#include "acc_prof.h" #include <stddef.h> #include <stdbool.h> #include <stdarg.h> @@ -68,6 +69,12 @@ struct goacc_thread strictly push/pop semantics according to lexical scope. */ struct target_mem_desc *mapped_data; + /* Data of the OpenACC Profiling Interface. */ + acc_prof_info *prof_info; + acc_api_info *api_info; + /* Per-thread toggle of OpenACC Profiling Interface callbacks. */ + bool prof_callbacks_enabled; + /* These structures form a list: this is the next thread in that list. */ struct goacc_thread *next; @@ -102,6 +109,11 @@ void goacc_restore_bind (void); void goacc_lazy_initialize (void); void goacc_host_init (void); +void goacc_profiling_initialize (void); +bool goacc_profiling_dispatch_p (void); +void goacc_profiling_dispatch (acc_prof_info *, acc_event_info *, + acc_api_info *); + #ifdef HAVE_ATTRIBUTE_VISIBILITY # pragma GCC visibility pop #endif diff --git libgomp/oacc-parallel.c libgomp/oacc-parallel.c index deab4b3..36e2431 100644 --- libgomp/oacc-parallel.c +++ libgomp/oacc-parallel.c @@ -142,21 +142,78 @@ GOACC_parallel_keyed (int device, void (*fn) (void *), thr = goacc_thread (); acc_dev = thr->dev; + bool profiling_dispatch_p + = __builtin_expect (goacc_profiling_dispatch_p (), false); + + acc_prof_info prof_info; + if (profiling_dispatch_p) + { + thr->prof_info = &prof_info; + + prof_info.event_type = acc_ev_compute_construct_start; + prof_info.valid_bytes = _ACC_PROF_INFO_VALID_BYTES; + prof_info.version = _ACC_PROF_INFO_VERSION; + prof_info.device_type = acc_device_type (acc_dev->type); + prof_info.device_number = acc_dev->target_id; + prof_info.thread_id = -1; //TODO + prof_info.async = async; + /* See <https://github.com/OpenACC/openacc-spec/issues/71>. */ + prof_info.async_queue = prof_info.async; + prof_info.src_file = NULL; //TODO + prof_info.func_name = NULL; //TODO + prof_info.line_no = -1; //TODO + prof_info.end_line_no = -1; //TODO + prof_info.func_line_no = -1; //TODO + prof_info.func_end_line_no = -1; //TODO + } + acc_event_info compute_construct_event_info; + if (profiling_dispatch_p) + { + compute_construct_event_info.other_event.event_type + = prof_info.event_type; + compute_construct_event_info.other_event.valid_bytes + = _ACC_OTHER_EVENT_INFO_VALID_BYTES; + compute_construct_event_info.other_event.parent_construct + = acc_construct_parallel; //TODO: kernels... + compute_construct_event_info.other_event.implicit = 0; + compute_construct_event_info.other_event.tool_info = NULL; + } + acc_api_info api_info; + if (profiling_dispatch_p) + { + thr->api_info = &api_info; + + api_info.device_api = acc_device_api_none; + api_info.valid_bytes = _ACC_API_INFO_VALID_BYTES; + api_info.device_type = prof_info.device_type; + api_info.vendor = -1; //TODO + api_info.device_handle = NULL; //TODO + api_info.context_handle = NULL; //TODO + api_info.async_handle = NULL; //TODO + } + + if (profiling_dispatch_p) + goacc_profiling_dispatch (&prof_info, &compute_construct_event_info, + &api_info); + handle_ftn_pointers (mapnum, hostaddrs, sizes, kinds); /* Host fallback if "if" clause is false or if the current device is set to the host. */ if (host_fallback) { + //TODO + prof_info.device_type = acc_device_host; + api_info.device_type = prof_info.device_type; goacc_save_and_set_bind (acc_device_host); fn (hostaddrs); goacc_restore_bind (); - return; + goto out; } else if (acc_device_type (acc_dev->type) == acc_device_host) { fn (hostaddrs); - return; + goto out; } /* Default: let the runtime choose. */ @@ -190,6 +247,14 @@ GOACC_parallel_keyed (int device, void (*fn) (void *), if (async == GOMP_LAUNCH_OP_MAX) async = va_arg (ap, unsigned); + + if (profiling_dispatch_p) + { + prof_info.async = async; + /* See <https://github.com/OpenACC/openacc-spec/issues/71>. */ + prof_info.async_queue = prof_info.async; + } + break; } @@ -227,8 +292,31 @@ GOACC_parallel_keyed (int device, void (*fn) (void *), else tgt_fn = (void (*)) fn; + acc_event_info enter_exit_data_event_info; + if (profiling_dispatch_p) + { + prof_info.event_type = acc_ev_enter_data_start; + enter_exit_data_event_info.other_event.event_type + = prof_info.event_type; + enter_exit_data_event_info.other_event.valid_bytes + = _ACC_OTHER_EVENT_INFO_VALID_BYTES; + enter_exit_data_event_info.other_event.parent_construct + = compute_construct_event_info.other_event.parent_construct; + enter_exit_data_event_info.other_event.implicit = 1; + enter_exit_data_event_info.other_event.tool_info = NULL; + goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info, + &api_info); + } tgt = gomp_map_vars (acc_dev, mapnum, hostaddrs, NULL, sizes, kinds, true, GOMP_MAP_VARS_OPENACC); + if (profiling_dispatch_p) + { + prof_info.event_type = acc_ev_enter_data_end; + enter_exit_data_event_info.other_event.event_type + = prof_info.event_type; + goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info, + &api_info); + } devaddrs = gomp_alloca (sizeof (void *) * mapnum); for (i = 0; i < mapnum; i++) @@ -246,11 +334,43 @@ GOACC_parallel_keyed (int device, void (*fn) (void *), /* If running synchronously, unmap immediately. */ if (async < acc_async_noval) - gomp_unmap_vars (tgt, true); + { + if (profiling_dispatch_p) + { + prof_info.event_type = acc_ev_exit_data_start; + enter_exit_data_event_info.other_event.event_type + = prof_info.event_type; + enter_exit_data_event_info.other_event.tool_info = NULL; + goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info, + &api_info); + } + gomp_unmap_vars (tgt, true); + if (profiling_dispatch_p) + { + prof_info.event_type = acc_ev_exit_data_end; + enter_exit_data_event_info.other_event.event_type + = prof_info.event_type; + goacc_profiling_dispatch (&prof_info, &enter_exit_data_event_info, + &api_info); + } + } else tgt->device_descr->openacc.register_async_cleanup_func (tgt, async); acc_dev->openacc.async_set_async_func (acc_async_sync); + + out: + if (profiling_dispatch_p) + { + prof_info.event_type = acc_ev_compute_construct_end; + compute_construct_event_info.other_event.event_type + = prof_info.event_type; + goacc_profiling_dispatch (&prof_info, &compute_construct_event_info, + &api_info); + + thr->prof_info = NULL; + thr->api_info = NULL; + } } /* Legacy entry point, only provide host execution. */ diff --git libgomp/oacc-plugin.c libgomp/oacc-plugin.c index 3f82c07..9707b48 100644 --- libgomp/oacc-plugin.c +++ libgomp/oacc-plugin.c @@ -50,6 +50,19 @@ GOMP_PLUGIN_acc_thread (void) return thr ? thr->target_tls : NULL; } +/* Return the TLS data for the current thread. */ +/* TODO. Should we be able to directly call (the static inline function) + goacc_thread from within plugin code? I didn't manage to get the + "goacc_tls_data" symbol configured correctly: "[...]/ld: + .libs/libgomp-plugin-nvptx.so.1.0.0: hidden symbol `goacc_tls_data' isn't + defined". */ + +struct goacc_thread * +GOMP_PLUGIN_goacc_thread (void) +{ + return goacc_thread (); +} + /* Return the default async number from the TLS data for the current thread. */ int diff --git libgomp/oacc-plugin.h libgomp/oacc-plugin.h index ff46ed8..5a842a2 100644 --- libgomp/oacc-plugin.h +++ libgomp/oacc-plugin.h @@ -27,8 +27,11 @@ #ifndef OACC_PLUGIN_H #define OACC_PLUGIN_H 1 +#include "oacc-int.h" + extern void GOMP_PLUGIN_async_unmap_vars (void *, int); extern void *GOMP_PLUGIN_acc_thread (void); +extern struct goacc_thread *GOMP_PLUGIN_goacc_thread (void); extern int GOMP_PLUGIN_acc_thread_default_async (void); #endif diff --git libgomp/oacc-plugin.h libgomp/oacc-profiling-acc_register_library.c similarity index 71% copy from libgomp/oacc-plugin.h copy to libgomp/oacc-profiling-acc_register_library.c index ff46ed8..f6b482b 100644 --- libgomp/oacc-plugin.h +++ libgomp/oacc-profiling-acc_register_library.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2014-2016 Free Software Foundation, Inc. +/* Copyright (C) 2017 Free Software Foundation, Inc. Contributed by Mentor Embedded. @@ -24,11 +24,16 @@ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see <http://www.gnu.org/licenses/>. */ -#ifndef OACC_PLUGIN_H -#define OACC_PLUGIN_H 1 +/* This file provides an stub acc_register_library function. It's in a + separate file so that this function can easily be overridden when linking + statically. */ -extern void GOMP_PLUGIN_async_unmap_vars (void *, int); -extern void *GOMP_PLUGIN_acc_thread (void); -extern int GOMP_PLUGIN_acc_thread_default_async (void); +#include "libgomp.h" +#include "acc_prof.h" -#endif +void +acc_register_library (acc_prof_reg reg, acc_prof_reg unreg, + acc_prof_lookup_func lookup) +{ + gomp_debug (0, "dummy %s\n", __FUNCTION__); +} diff --git libgomp/oacc-profiling.c libgomp/oacc-profiling.c new file mode 100644 index 0000000..a4671f9 --- /dev/null +++ libgomp/oacc-profiling.c @@ -0,0 +1,576 @@ +/* Copyright (C) 2017 Free Software Foundation, Inc. + + Contributed by Mentor Embedded. + + This file is part of the GNU Offloading and Multi Processing Library + (libgomp). + + Libgomp is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + Libgomp is distributed in the hope that it will be useful, but WITHOUT ANY + WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + FOR A PARTICULAR PURPOSE. See the GNU General Public License for + more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +/* OpenACC Profiling Interface. */ + +#include "libgomp.h" +#include "oacc-int.h" +#include "acc_prof.h" +#include <assert.h> +#ifdef HAVE_STRING_H +# include <string.h> +#endif +#ifdef PLUGIN_SUPPORT +# include <dlfcn.h> +#endif + +#define STATIC_ASSERT(expr) _Static_assert (expr, "!(" #expr ")") + +/* Statically assert that the layout of the common fields in the + "acc_event_info" variants matches. */ +/* event_type */ +STATIC_ASSERT (offsetof (acc_event_info, event_type) + == offsetof (acc_event_info, data_event.event_type)); +STATIC_ASSERT (offsetof (acc_event_info, data_event.event_type) + == offsetof (acc_event_info, launch_event.event_type)); +STATIC_ASSERT (offsetof (acc_event_info, data_event.event_type) + == offsetof (acc_event_info, other_event.event_type)); +/* valid_bytes */ +STATIC_ASSERT (offsetof (acc_event_info, data_event.valid_bytes) + == offsetof (acc_event_info, launch_event.valid_bytes)); +STATIC_ASSERT (offsetof (acc_event_info, data_event.valid_bytes) + == offsetof (acc_event_info, other_event.valid_bytes)); +/* parent_construct */ +STATIC_ASSERT (offsetof (acc_event_info, data_event.parent_construct) + == offsetof (acc_event_info, launch_event.parent_construct)); +STATIC_ASSERT (offsetof (acc_event_info, data_event.parent_construct) + == offsetof (acc_event_info, other_event.parent_construct)); +/* implicit */ +STATIC_ASSERT (offsetof (acc_event_info, data_event.implicit) + == offsetof (acc_event_info, launch_event.implicit)); +STATIC_ASSERT (offsetof (acc_event_info, data_event.implicit) + == offsetof (acc_event_info, other_event.implicit)); +/* tool_info */ +STATIC_ASSERT (offsetof (acc_event_info, data_event.tool_info) + == offsetof (acc_event_info, launch_event.tool_info)); +STATIC_ASSERT (offsetof (acc_event_info, data_event.tool_info) + == offsetof (acc_event_info, other_event.tool_info)); + +struct goacc_prof_callback_entry +{ + acc_prof_callback cb; + int ref; + bool enabled; + struct goacc_prof_callback_entry *next; +}; + +/* goacc_prof_callbacks_enabled[acc_ev_none] acts as a global toggle. */ +static bool goacc_prof_callbacks_enabled[acc_ev_last]; +static struct goacc_prof_callback_entry *goacc_prof_callback_entries[acc_ev_last]; + +/* This lock is used to protect access to goacc_prof_callbacks_enabled, and + goacc_prof_callback_entries. */ +static gomp_mutex_t goacc_prof_lock; + +void +goacc_profiling_initialize (void) +{ + gomp_mutex_init (&goacc_prof_lock); + + /* Initially, all callbacks for all events are enabled. */ + for (int i = 0; i < acc_ev_last; ++i) + goacc_prof_callbacks_enabled[i] = true; + + /* We are to invoke an external acc_register_library routine, defaulting to + our stub oacc-profiling-acc_register_library.c:acc_register_library + implementation. */ + gomp_debug (0, "%s: calling acc_register_library\n", __FUNCTION__); + //TODO. + acc_register_library (acc_prof_register, acc_prof_unregister, NULL); +#ifdef PLUGIN_SUPPORT + char *acc_proflibs = getenv ("ACC_PROFLIB"); + while (acc_proflibs != NULL && acc_proflibs[0] != '\0') + { + char *acc_proflibs_sep = strchr (acc_proflibs, ';'); + char *acc_proflib; + if (acc_proflibs_sep == acc_proflibs) + { + /* Stray ";" separator: make sure we don't dlopen the main + program. */ + acc_proflib = NULL; + } + else + { + if (acc_proflibs_sep != NULL) + { + /* Single out the first library. */ + acc_proflib = gomp_malloc (acc_proflibs_sep - acc_proflibs + 1); + memcpy (acc_proflib, acc_proflibs, + acc_proflibs_sep - acc_proflibs); + acc_proflib[acc_proflibs_sep - acc_proflibs] = '\0'; + } + else + { + /* No ";" separator, so only one library. */ + acc_proflib = acc_proflibs; + } + + gomp_debug (0, "%s: dlopen(%s)\n", __FUNCTION__, acc_proflib); + void *dl_handle = dlopen (acc_proflib, RTLD_LAZY); + if (dl_handle != NULL) + { + typeof (&acc_register_library) a_r_l + = dlsym (dl_handle, "acc_register_library"); + if (a_r_l == NULL) + goto dl_fail; + /* Avoid duplicate registration, for example if the same shared + library is specified in LD_PRELOAD and ACC_PROFLIB -- which + TAU 2.26 does when using "tau_exec -openacc". */ + if (a_r_l != acc_register_library) + { + gomp_debug (0, " %s: calling %s:acc_register_library\n", + __FUNCTION__, acc_proflib); + //TODO. + a_r_l (acc_prof_register, acc_prof_unregister, NULL); + } + else + gomp_debug (0, " %s: skipping duplicate" + " %s:acc_register_library\n", + __FUNCTION__, acc_proflib); + } + else + { + dl_fail: + gomp_error ("while loading ACC_PROFLIB %s: %s", + acc_proflib, dlerror ()); + if (dl_handle != NULL) + { + int err = dlclose (dl_handle); + dl_handle = NULL; + if (err != 0) + goto dl_fail; + } + } + } + + if (acc_proflib != acc_proflibs) + { + free (acc_proflib); + + acc_proflibs = acc_proflibs_sep + 1; + } + else + acc_proflibs = NULL; + } +#endif /* PLUGIN_SUPPORT */ +} + +void +acc_prof_register (acc_event_t ev, acc_prof_callback cb, acc_register_t reg) +{ + //TODO + gomp_debug (0, "%s: ev=%d, cb=%p, reg=%d\n", + __FUNCTION__, (int) ev, (void *) cb, (int) reg); + + enum + { + EVENT_KIND_BOGUS, + EVENT_KIND_NORMAL, + /* As end events invoke callbacks in the reverse order, we register these + in the reverse order here. */ + EVENT_KIND_END, + } event_kind = EVENT_KIND_BOGUS; + switch (ev) + { + case acc_ev_none: + case acc_ev_device_init_start: + case acc_ev_device_shutdown_start: + case acc_ev_runtime_shutdown: + case acc_ev_create: + case acc_ev_delete: + case acc_ev_alloc: + case acc_ev_free: + case acc_ev_enter_data_start: + case acc_ev_exit_data_start: + case acc_ev_update_start: + case acc_ev_compute_construct_start: + case acc_ev_enqueue_launch_start: + case acc_ev_enqueue_upload_start: + case acc_ev_enqueue_download_start: + case acc_ev_wait_start: + event_kind = EVENT_KIND_NORMAL; + break; + case acc_ev_device_init_end: + case acc_ev_device_shutdown_end: + case acc_ev_enter_data_end: + case acc_ev_exit_data_end: + case acc_ev_update_end: + case acc_ev_compute_construct_end: + case acc_ev_enqueue_launch_end: + case acc_ev_enqueue_upload_end: + case acc_ev_enqueue_download_end: + case acc_ev_wait_end: + event_kind = EVENT_KIND_END; + break; + case acc_ev_last: + break; + } + if (event_kind == EVENT_KIND_BOGUS) + { + //TODO: should this be a fatal error? Or, should we (silently?) ignore these, for forward compatibility? + gomp_error ("ignoring %s request for TODOinvalid acc_event_t %d", + __FUNCTION__, /* TODO */ (int) ev); + return; + } + + bool bogus = true; + switch (reg) + { + case acc_reg: + case acc_toggle: + case acc_toggle_per_thread: + bogus = false; + break; + } + if (bogus) + { + //TODO: should this be a fatal error? Or, should we (silently?) ignore these, for forward compatibility? + gomp_error ("ignoring %s request with TODOinvalid acc_register_t %d", + __FUNCTION__, /* TODO */ (int) reg); + return; + } + + /* Special cases. */ + if (reg == acc_toggle) + { + if (cb == NULL) + { + gomp_debug (0, " globally enabling callbacks\n"); + gomp_mutex_lock (&goacc_prof_lock); + /* goacc_prof_callbacks_enabled[acc_ev_none] acts as a global + toggle. */ + goacc_prof_callbacks_enabled[ev] = true; + gomp_mutex_unlock (&goacc_prof_lock); + return; + } + else if (ev == acc_ev_none && cb != NULL) + { + gomp_debug (0, " ignoring request\n"); + /* Silently ignore request. */ + return; + } + } + else if (reg == acc_toggle_per_thread) + { + if (ev == acc_ev_none && cb == NULL) + { + gomp_debug (0, " thread: enabling callbacks\n"); + goacc_lazy_initialize (); + struct goacc_thread *thr = goacc_thread (); + thr->prof_callbacks_enabled = true; + return; + } + //TODO: should this be a fatal error? Or, should we (silently?) ignore these? + gomp_error ("ignoring %s request for acc_toggle_per_thread with TODO", + __FUNCTION__); + return; + } + + gomp_mutex_lock (&goacc_prof_lock); + + struct goacc_prof_callback_entry *it, *it_p; + it = goacc_prof_callback_entries[ev]; + it_p = NULL; + while (it) + { + if (it->cb == cb) + break; + it_p = it; + it = it->next; + } + + switch (reg) + { + case acc_reg: + /* If we already have this callback registered, just increment its ref + count. */ + if (it != NULL) + { + it->ref++; + gomp_debug (0, " already registered;" + " incrementing ref count to: %d\n", it->ref); + } + else + { + struct goacc_prof_callback_entry *e + = gomp_malloc (sizeof (struct goacc_prof_callback_entry)); + e->cb = cb; + e->ref = 1; + e->enabled = true; + bool prepend = (event_kind == EVENT_KIND_END); + /* If we don't have any callback registered yet, also use the + "prepend" code path. */ + if (it_p == NULL) + prepend = true; + if (prepend) + { + gomp_debug (0, " prepending\n"); + e->next = goacc_prof_callback_entries[ev]; + goacc_prof_callback_entries[ev] = e; + } + else + { + gomp_debug (0, " appending\n"); + e->next = NULL; + it_p->next = e; + } + } + break; + + case acc_toggle: + if (it == NULL) + { + /* Silently ignore acc_toggle request if not registered. */ + gomp_debug (0, " not enabling; not registered\n"); + } + else + { + gomp_debug (0, " enabling\n"); + it->enabled = true; + } + break; + + case acc_toggle_per_thread: + __builtin_unreachable (); + } + + gomp_mutex_unlock (&goacc_prof_lock); +} + +void +acc_prof_unregister (acc_event_t ev, acc_prof_callback cb, acc_register_t reg) +{ + //TODO + gomp_debug (0, "%s: ev=%d, cb=%p, reg=%d\n", + __FUNCTION__, (int) ev, (void *) cb, (int) reg); + + if (ev < acc_ev_none + || ev >= acc_ev_last) + { + //TODO: should this be a fatal error? Or, should we (silently?) ignore these, for forward compatibility? + gomp_error ("ignoring %s request for TODOinvalid acc_event_t %d", + __FUNCTION__, /* TODO */ (int) ev); + return; + } + + bool bogus = true; + switch (reg) + { + case acc_reg: + case acc_toggle: + case acc_toggle_per_thread: + bogus = false; + break; + } + if (bogus) + { + //TODO: should this be a fatal error? Or, should we (silently?) ignore these, for forward compatibility? + gomp_error ("ignoring %s request with TODOinvalid acc_register_t %d", + __FUNCTION__, /* TODO */ (int) reg); + return; + } + + /* Special cases. */ + if (reg == acc_toggle) + { + if (cb == NULL) + { + gomp_debug (0, " globally disabling callbacks\n"); + gomp_mutex_lock (&goacc_prof_lock); + /* goacc_prof_callbacks_enabled[acc_ev_none] acts as a global + toggle. */ + goacc_prof_callbacks_enabled[ev] = false; + gomp_mutex_unlock (&goacc_prof_lock); + return; + } + else if (ev == acc_ev_none && cb != NULL) + { + gomp_debug (0, " ignoring request\n"); + /* Silently ignore request. */ + return; + } + } + else if (reg == acc_toggle_per_thread) + { + if (ev == acc_ev_none && cb == NULL) + { + gomp_debug (0, " thread: disabling callbacks\n"); + goacc_lazy_initialize (); + struct goacc_thread *thr = goacc_thread (); + thr->prof_callbacks_enabled = false; + return; + } + //TODO: should this be a fatal error? Or, should we (silently?) ignore these? + gomp_error ("ignoring %s request for acc_toggle_per_thread with TODO", + __FUNCTION__); + return; + } + + gomp_mutex_lock (&goacc_prof_lock); + + struct goacc_prof_callback_entry *it, *it_p; + it = goacc_prof_callback_entries[ev]; + it_p = NULL; + while (it) + { + if (it->cb == cb) + break; + it_p = it; + it = it->next; + } + + switch (reg) + { + case acc_reg: + if (it == NULL) + { + //TODO: should this be a fatal error? Or, should we (silently?) ignore these? + gomp_error ("ignoring %s request for acc_event_t %d: not registered", + __FUNCTION__, /* TODO */ (int) ev); + gomp_mutex_unlock (&goacc_prof_lock); + return; + } + it->ref--; + gomp_debug (0, " decrementing ref count to: %d\n", it->ref); + if (it->ref == 0) + { + if (it_p == NULL) + goacc_prof_callback_entries[ev] = it->next; + else + it_p->next = it->next; + free (it); + } + break; + + case acc_toggle: + if (it == NULL) + { + /* Silently ignore acc_toggle request if not registered. */ + gomp_debug (0, " not disabling; not registered\n"); + } + else + { + gomp_debug (0, " disabling\n"); + it->enabled = false; + } + break; + + case acc_toggle_per_thread: + __builtin_unreachable (); + } + + gomp_mutex_unlock (&goacc_prof_lock); +} + +/* Prepare to dispatch events? */ + +bool +goacc_profiling_dispatch_p (void) +{ + //TODO + gomp_debug (0, "%s\n", __FUNCTION__); + + struct goacc_thread *thr = goacc_thread (); + if (__builtin_expect (thr == NULL, false)) + { + /* If we don't have any per-thread state yet, that means that per-thread + callback dispatch has not been explicitly disabled (which only a call + to acc_prof_unregister with acc_toggle_per_thread will do, and that + would have allocated per-thread state via goacc_lazy_initialize); + initially, all callbacks for all events are enabled. */ + //TODO + gomp_debug (0, " %s: don't have any per-thread state yet\n", __FUNCTION__); + } + else if (__builtin_expect (!thr->prof_callbacks_enabled, true)) + { + //TODO + gomp_debug (0, " %s: disabled for this thread\n", __FUNCTION__); + return false; + } + + gomp_mutex_lock (&goacc_prof_lock); + + /* goacc_prof_callbacks_enabled[acc_ev_none] acts as a global toggle. */ + if (__builtin_expect (!goacc_prof_callbacks_enabled[acc_ev_none], true)) + { + //TODO + gomp_debug (0, " %s: disabled globally\n", __FUNCTION__); + gomp_mutex_unlock (&goacc_prof_lock); + return false; + } + + gomp_mutex_unlock (&goacc_prof_lock); + + return true; +} + +/* Dispatch events. + + TODO. + This must only be called if goacc_profiling_dispatch_p returned a true + result. */ + +void +goacc_profiling_dispatch (acc_prof_info *prof_info, acc_event_info *event_info, + acc_api_info *apt_info) +{ + acc_event_t event_type = event_info->event_type; + //TODO + gomp_debug (0, "%s: event_type=%d\n", __FUNCTION__, (int) event_type); + //TODO + assert (event_type > acc_ev_none + && event_type < acc_ev_last); + + gomp_mutex_lock (&goacc_prof_lock); + + if (!goacc_prof_callbacks_enabled[event_type]) + { + //TODO + gomp_debug (0, " %s: disabled for this event type\n", __FUNCTION__); + gomp_mutex_unlock (&goacc_prof_lock); + return; + } + + for (struct goacc_prof_callback_entry *e + = goacc_prof_callback_entries[event_type]; + e != NULL; + e = e->next) + { + if (!e->enabled) + { + //TODO + gomp_debug (0, " %s: disabled for callback %p\n", + __FUNCTION__, e->cb); + continue; + } + + //TODO + gomp_debug (0, " %s: calling callback %p\n", __FUNCTION__, e->cb); + e->cb (prof_info, event_info, apt_info); + } + + gomp_mutex_unlock (&goacc_prof_lock); +} diff --git libgomp/plugin/plugin-nvptx.c libgomp/plugin/plugin-nvptx.c index 51000f3..dbea9da 100644 --- libgomp/plugin/plugin-nvptx.c +++ libgomp/plugin/plugin-nvptx.c @@ -36,6 +36,7 @@ #include "libgomp-plugin.h" #include "oacc-plugin.h" #include "gomp-constants.h" +#include "oacc-int.h" #include <pthread.h> #include <cuda.h> @@ -121,7 +122,7 @@ struct nvptx_thread }; static struct cuda_map * -cuda_map_create (size_t size) +cuda_map_create (struct goacc_thread *thr, size_t size) { struct cuda_map *map = GOMP_PLUGIN_malloc (sizeof (struct cuda_map)); @@ -134,13 +135,72 @@ cuda_map_create (size_t size) CUDA_CALL_ERET (NULL, cuMemAlloc, &map->d, size); assert (map->d); + bool profiling_dispatch_p + = __builtin_expect (thr != NULL && thr->prof_info != NULL, false); + if (profiling_dispatch_p) + { + acc_prof_info *prof_info = thr->prof_info; + acc_event_info data_event_info; + acc_api_info *api_info = thr->api_info; + + prof_info->event_type = acc_ev_alloc; + + data_event_info.data_event.event_type = prof_info->event_type; + data_event_info.data_event.valid_bytes + = _ACC_DATA_EVENT_INFO_VALID_BYTES; + data_event_info.data_event.parent_construct + = acc_construct_parallel; //TODO + /* Always implicit for "data mapping arguments for cuLaunchKernel". */ + data_event_info.data_event.implicit = 1; + data_event_info.data_event.tool_info = NULL; + data_event_info.data_event.var_name = NULL; //TODO + data_event_info.data_event.bytes = size; + data_event_info.data_event.host_ptr = NULL; + data_event_info.data_event.device_ptr = (void *) map->d; + + api_info->device_api = acc_device_api_cuda; + + GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, + api_info); + } + return map; } static void -cuda_map_destroy (struct cuda_map *map) +cuda_map_destroy (struct goacc_thread *thr, struct cuda_map *map) { CUDA_CALL_ASSERT (cuMemFree, map->d); + + bool profiling_dispatch_p + = __builtin_expect (thr != NULL && thr->prof_info != NULL, false); + if (profiling_dispatch_p) + { + acc_prof_info *prof_info = thr->prof_info; + acc_event_info data_event_info; + acc_api_info *api_info = thr->api_info; + + prof_info->event_type = acc_ev_free; + + data_event_info.data_event.event_type = prof_info->event_type; + data_event_info.data_event.valid_bytes + = _ACC_DATA_EVENT_INFO_VALID_BYTES; + data_event_info.data_event.parent_construct + = acc_construct_parallel; //TODO + /* Always implicit for "data mapping arguments for cuLaunchKernel". */ + data_event_info.data_event.implicit = 1; + data_event_info.data_event.tool_info = NULL; + data_event_info.data_event.var_name = NULL; //TODO + data_event_info.data_event.bytes = map->size; + data_event_info.data_event.host_ptr = NULL; + data_event_info.data_event.device_ptr = (void *) map->d; + + api_info->device_api = acc_device_api_cuda; + + GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, + api_info); + } + free (map); } @@ -156,30 +216,30 @@ cuda_map_destroy (struct cuda_map *map) GOMP_OFFLOAD_fini_device, respectively. */ static bool -map_init (struct ptx_stream *s) +map_init (struct goacc_thread *thr, struct ptx_stream *s) { int size = getpagesize (); assert (s); - s->map = cuda_map_create (size); + s->map = cuda_map_create (thr, size); return true; } static bool -map_fini (struct ptx_stream *s) +map_fini (struct goacc_thread *thr, struct ptx_stream *s) { assert (s->map->next == NULL); assert (!s->map->active); - cuda_map_destroy (s->map); + cuda_map_destroy (thr, s->map); return true; } static void -map_pop (struct ptx_stream *s) +map_pop (struct goacc_thread *thr, struct ptx_stream *s) { struct cuda_map *next; @@ -192,12 +252,12 @@ map_pop (struct ptx_stream *s) } next = s->map->next; - cuda_map_destroy (s->map); + cuda_map_destroy (thr, s->map); s->map = next; } static CUdeviceptr -map_push (struct ptx_stream *s, size_t size) +map_push (struct goacc_thread *thr, struct ptx_stream *s, size_t size) { struct cuda_map *map = NULL, *t = NULL; @@ -209,7 +269,7 @@ map_push (struct ptx_stream *s, size_t size) cuda_map and push it to the end of the list. */ if (s->map->active) { - map = cuda_map_create (size); + map = cuda_map_create (thr, size); for (t = s->map; t->next != NULL; t = t->next) ; @@ -218,8 +278,8 @@ map_push (struct ptx_stream *s, size_t size) } else if (s->map->size < size) { - cuda_map_destroy (s->map); - map = cuda_map_create (size); + cuda_map_destroy (thr, s->map); + map = cuda_map_create (thr, size); } else map = s->map; @@ -365,7 +425,7 @@ init_streams_for_device (struct ptx_device *ptx_dev, int concurrency) null_stream->stream = NULL; null_stream->host_thread = pthread_self (); null_stream->multithreaded = true; - if (!map_init (null_stream)) + if (!map_init (NULL, null_stream)) return false; ptx_dev->null_stream = null_stream; @@ -399,7 +459,7 @@ fini_streams_for_device (struct ptx_device *ptx_dev) struct ptx_stream *s = ptx_dev->active_streams; ptx_dev->active_streams = ptx_dev->active_streams->next; - ret &= map_fini (s); + ret &= map_fini (NULL, s); CUresult r = cuStreamDestroy (s->stream); if (r != CUDA_SUCCESS) @@ -410,7 +470,7 @@ fini_streams_for_device (struct ptx_device *ptx_dev) free (s); } - ret &= map_fini (ptx_dev->null_stream); + ret &= map_fini (NULL, ptx_dev->null_stream); free (ptx_dev->null_stream); return ret; } @@ -425,7 +485,8 @@ static struct ptx_stream * select_stream_for_async (int async, pthread_t thread, bool create, CUstream existing) { - struct nvptx_thread *nvthd = nvptx_thread (); + struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); + struct nvptx_thread *nvthd = (struct nvptx_thread *) thr->target_tls; /* Local copy of TLS variable. */ struct ptx_device *ptx_dev = nvthd->ptx_dev; struct ptx_stream *stream = NULL; @@ -495,7 +556,7 @@ select_stream_for_async (int async, pthread_t thread, bool create, s->host_thread = thread; s->multithreaded = false; - if (!map_init (s)) + if (!map_init (thr, s)) { pthread_mutex_unlock (&ptx_dev->stream_lock); GOMP_PLUGIN_fatal ("map_init fail"); @@ -840,7 +901,8 @@ event_gc (bool memmap_lockable) { struct ptx_event *ptx_event = ptx_events; struct ptx_event *async_cleanups = NULL; - struct nvptx_thread *nvthd = nvptx_thread (); + struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); + struct nvptx_thread *nvthd = (struct nvptx_thread *) thr->target_tls; pthread_mutex_lock (&ptx_event_lock); @@ -869,7 +931,7 @@ event_gc (bool memmap_lockable) break; case PTX_EVT_KNL: - map_pop (e->addr); + map_pop (thr, e->addr); break; case PTX_EVT_ASYNC_CLEANUP: @@ -960,7 +1022,8 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, void *kargs[1]; void *hp; CUdeviceptr dp; - struct nvptx_thread *nvthd = nvptx_thread (); + struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); + struct nvptx_thread *nvthd = (struct nvptx_thread *) thr->target_tls; const char *maybe_abort_msg = "(perhaps abort was called)"; int cpu_size = nvptx_thread ()->ptx_dev->max_threads_per_multiprocessor; int block_size = nvptx_thread ()->ptx_dev->max_threads_per_block; @@ -1108,7 +1171,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, the host and the device. HP is a host pointer to the new chunk, and DP is the corresponding device pointer. */ pthread_mutex_lock (&ptx_event_lock); - dp = map_push (dev_str, mapnum * sizeof (void *)); + dp = map_push (thr, dev_str, mapnum * sizeof (void *)); pthread_mutex_unlock (&ptx_event_lock); GOMP_PLUGIN_debug (0, " %s: prepare mappings\n", __FUNCTION__); @@ -1120,8 +1183,45 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, /* Copy the (device) pointers to arguments to the device (dp and hp might in fact have the same value on a unified-memory system). */ + + acc_prof_info *prof_info = thr->prof_info; + acc_event_info data_event_info; + acc_api_info *api_info = thr->api_info; + bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false); + if (profiling_dispatch_p) + { + prof_info->event_type = acc_ev_enqueue_upload_start; + + data_event_info.data_event.event_type = prof_info->event_type; + data_event_info.data_event.valid_bytes + = _ACC_DATA_EVENT_INFO_VALID_BYTES; + data_event_info.data_event.parent_construct + = acc_construct_parallel; //TODO + /* Always implicit for "data mapping arguments for cuLaunchKernel". */ + data_event_info.data_event.implicit = 1; + data_event_info.data_event.tool_info = NULL; + data_event_info.data_event.var_name = NULL; //TODO + data_event_info.data_event.bytes = mapnum * sizeof (void *); + data_event_info.data_event.host_ptr = hp; + data_event_info.data_event.device_ptr = (void *) dp; + + api_info->device_api = acc_device_api_cuda; + + GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, + api_info); + } + CUDA_CALL_ASSERT (cuMemcpyHtoD, dp, hp, mapnum * sizeof (void *)); + + if (profiling_dispatch_p) + { + prof_info->event_type = acc_ev_enqueue_upload_end; + data_event_info.data_event.event_type = prof_info->event_type; + GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, + api_info); + } + GOMP_PLUGIN_debug (0, " %s: kernel %s: launch" " gangs=%u, workers=%u, vectors=%u\n", __FUNCTION__, targ_fn->launch->fn, dims[GOMP_DIM_GANG], @@ -1133,11 +1233,47 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, // num_workers ntid.y // vector length ntid.x + acc_event_info enqueue_launch_event_info; + if (profiling_dispatch_p) + { + prof_info->event_type = acc_ev_enqueue_launch_start; + + enqueue_launch_event_info.launch_event.event_type + = prof_info->event_type; + enqueue_launch_event_info.launch_event.valid_bytes + = _ACC_LAUNCH_EVENT_INFO_VALID_BYTES; + enqueue_launch_event_info.launch_event.parent_construct + /* TODO = compute_construct_event_info.other_event.parent_construct */ + = acc_construct_parallel; //TODO: kernels... + enqueue_launch_event_info.launch_event.implicit = 1; + enqueue_launch_event_info.launch_event.tool_info = NULL; + enqueue_launch_event_info.launch_event.kernel_name + = /* TODO */ (char *) /* TODO */ targ_fn->launch->fn; + enqueue_launch_event_info.launch_event.num_gangs + = dims[GOMP_DIM_GANG]; + enqueue_launch_event_info.launch_event.num_workers + = dims[GOMP_DIM_WORKER]; + enqueue_launch_event_info.launch_event.vector_length + = dims[GOMP_DIM_VECTOR]; + + api_info->device_api = acc_device_api_cuda; + + GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info, + api_info); + } kargs[0] = &dp; CUDA_CALL_ASSERT (cuLaunchKernel, function, dims[GOMP_DIM_GANG], 1, 1, dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1, 0, dev_str->stream, kargs, 0); + if (profiling_dispatch_p) + { + prof_info->event_type = acc_ev_enqueue_launch_end; + enqueue_launch_event_info.launch_event.event_type + = prof_info->event_type; + GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &enqueue_launch_event_info, + api_info); + } #ifndef DISABLE_ASYNC if (async < acc_async_noval) @@ -1183,7 +1319,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs, #ifndef DISABLE_ASYNC if (async < acc_async_noval) #endif - map_pop (dev_str); + map_pop (thr, dev_str); } void * openacc_get_current_cuda_context (void); @@ -1194,6 +1330,34 @@ nvptx_alloc (size_t s) CUdeviceptr d; CUDA_CALL_ERET (NULL, cuMemAlloc, &d, s); + + struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); + acc_prof_info *prof_info = thr->prof_info; + acc_api_info *api_info = thr->api_info; + bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false); + if (profiling_dispatch_p) + { + prof_info->event_type = acc_ev_alloc; + + acc_event_info data_event_info; + data_event_info.data_event.event_type = prof_info->event_type; + data_event_info.data_event.valid_bytes + = _ACC_DATA_EVENT_INFO_VALID_BYTES; + data_event_info.data_event.parent_construct + = acc_construct_parallel; //TODO + data_event_info.data_event.implicit = 1; //TODO + data_event_info.data_event.tool_info = NULL; + data_event_info.data_event.var_name = NULL; //TODO + data_event_info.data_event.bytes = s; + data_event_info.data_event.host_ptr = NULL; + data_event_info.data_event.device_ptr = (void *) d; + + api_info->device_api = acc_device_api_cuda; + + GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, + api_info); + } + return (void *) d; } @@ -1211,6 +1375,34 @@ nvptx_free (void *p) } CUDA_CALL (cuMemFree, (CUdeviceptr) p); + + struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); + acc_prof_info *prof_info = thr->prof_info; + acc_api_info *api_info = thr->api_info; + bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false); + if (profiling_dispatch_p) + { + prof_info->event_type = acc_ev_free; + + acc_event_info data_event_info; + data_event_info.data_event.event_type = prof_info->event_type; + data_event_info.data_event.valid_bytes + = _ACC_DATA_EVENT_INFO_VALID_BYTES; + data_event_info.data_event.parent_construct + = acc_construct_parallel; //TODO + data_event_info.data_event.implicit = 1; //TODO + data_event_info.data_event.tool_info = NULL; + data_event_info.data_event.var_name = NULL; //TODO + data_event_info.data_event.bytes = ps; + data_event_info.data_event.host_ptr = NULL; + data_event_info.data_event.device_ptr = p; + + api_info->device_api = acc_device_api_cuda; + + GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, + api_info); + } + return true; } @@ -1220,7 +1412,8 @@ nvptx_host2dev (void *d, const void *h, size_t s) { CUdeviceptr pb; size_t ps; - struct nvptx_thread *nvthd = nvptx_thread (); + struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); + struct nvptx_thread *nvthd = (struct nvptx_thread *) thr->target_tls; if (!s) return true; @@ -1253,6 +1446,32 @@ nvptx_host2dev (void *d, const void *h, size_t s) return false; } + acc_prof_info *prof_info = thr->prof_info; + acc_event_info data_event_info; + acc_api_info *api_info = thr->api_info; + bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false); + if (profiling_dispatch_p) + { + prof_info->event_type = acc_ev_enqueue_upload_start; + + data_event_info.data_event.event_type = prof_info->event_type; + data_event_info.data_event.valid_bytes + = _ACC_DATA_EVENT_INFO_VALID_BYTES; + data_event_info.data_event.parent_construct + = acc_construct_parallel; //TODO + data_event_info.data_event.implicit = 1; //TODO + data_event_info.data_event.tool_info = NULL; + data_event_info.data_event.var_name = NULL; //TODO + data_event_info.data_event.bytes = s; + data_event_info.data_event.host_ptr = /* TODO */ (void *) h; + data_event_info.data_event.device_ptr = d; + + api_info->device_api = acc_device_api_cuda; + + GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, + api_info); + } + #ifndef DISABLE_ASYNC if (nvthd->current_stream != nvthd->ptx_dev->null_stream) { @@ -1268,6 +1487,14 @@ nvptx_host2dev (void *d, const void *h, size_t s) #endif CUDA_CALL (cuMemcpyHtoD, (CUdeviceptr) d, h, s); + if (profiling_dispatch_p) + { + prof_info->event_type = acc_ev_enqueue_upload_end; + data_event_info.data_event.event_type = prof_info->event_type; + GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, + api_info); + } + return true; } @@ -1276,7 +1503,8 @@ nvptx_dev2host (void *h, const void *d, size_t s) { CUdeviceptr pb; size_t ps; - struct nvptx_thread *nvthd = nvptx_thread (); + struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); + struct nvptx_thread *nvthd = (struct nvptx_thread *) thr->target_tls; if (!s) return true; @@ -1309,6 +1537,32 @@ nvptx_dev2host (void *h, const void *d, size_t s) return false; } + acc_prof_info *prof_info = thr->prof_info; + acc_event_info data_event_info; + acc_api_info *api_info = thr->api_info; + bool profiling_dispatch_p = __builtin_expect (prof_info != NULL, false); + if (profiling_dispatch_p) + { + prof_info->event_type = acc_ev_enqueue_download_start; + + data_event_info.data_event.event_type = prof_info->event_type; + data_event_info.data_event.valid_bytes + = _ACC_DATA_EVENT_INFO_VALID_BYTES; + data_event_info.data_event.parent_construct + = acc_construct_parallel; //TODO + data_event_info.data_event.implicit = 1; //TODO + data_event_info.data_event.tool_info = NULL; + data_event_info.data_event.var_name = NULL; //TODO + data_event_info.data_event.bytes = s; + data_event_info.data_event.host_ptr = h; + data_event_info.data_event.device_ptr = /* TODO */ (void *) d; + + api_info->device_api = acc_device_api_cuda; + + GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, + api_info); + } + #ifndef DISABLE_ASYNC if (nvthd->current_stream != nvthd->ptx_dev->null_stream) { @@ -1324,6 +1578,14 @@ nvptx_dev2host (void *h, const void *d, size_t s) #endif CUDA_CALL (cuMemcpyDtoH, h, (CUdeviceptr) d, s); + if (profiling_dispatch_p) + { + prof_info->event_type = acc_ev_enqueue_download_end; + data_event_info.data_event.event_type = prof_info->event_type; + GOMP_PLUGIN_goacc_profiling_dispatch (prof_info, &data_event_info, + api_info); + } + return true; } @@ -1555,7 +1817,8 @@ nvptx_set_cuda_stream (int async, void *stream) { struct ptx_stream *oldstream; pthread_t self = pthread_self (); - struct nvptx_thread *nvthd = nvptx_thread (); + struct goacc_thread *thr = GOMP_PLUGIN_goacc_thread (); + struct nvptx_thread *nvthd = (struct nvptx_thread *) thr->target_tls; if (async < 0) GOMP_PLUGIN_fatal ("bad async %d", async); @@ -1586,7 +1849,7 @@ nvptx_set_cuda_stream (int async, void *stream) CUDA_CALL_ASSERT (cuStreamDestroy, oldstream->stream); - if (!map_fini (oldstream)) + if (!map_fini (thr, oldstream)) GOMP_PLUGIN_fatal ("error when freeing host memory"); free (oldstream); diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c new file mode 100644 index 0000000..4c1f2bb --- /dev/null +++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-dispatch-1.c @@ -0,0 +1,344 @@ +/* Test dispatch of events to callbacks. */ + +#undef NDEBUG +#include <assert.h> + +#include <acc_prof.h> + +/* Use explicit copyin clauses, to work around firstprivate optimizations, + which will cause the value at the point of call to be used (*before* any + potential modifications done in callbacks), as opposed to its address being + taken, which then later gets dereferenced (*after* any modifications done in + callbacks). */ +#define COPYIN(...) copyin(__VA_ARGS__) + +#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__) + +static int state = -1; +#define STATE_OP(state, op)\ + do \ + { \ + typeof (state) state_o = (state); \ + (void) state_o; \ + (state)op; \ + DEBUG_printf("state: %d -> %d\n", state_o, (state)); \ + } \ + while (0) + +void cb_compute_construct_start_1 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + + assert (state == 0 + || state == 10 + || state == 30 + || state == 41 + || state == 51 + || state == 91 + || state == 101 + || state == 151); + STATE_OP (state, ++); +} + +void cb_compute_construct_start_2 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + + assert (state == 1 + || state == 11 + || state == 40 + || state == 50 + || state == 90 + || state == 100 + || state == 150); + STATE_OP (state, ++); +} + +void cb_compute_construct_end_1 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + + assert (state == 14 + || state == 21 + || state == 32 + || state == 42 + || state == 80 + || state == 103 + || state == 152); + STATE_OP (state, ++); +} + +void cb_compute_construct_end_2 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + + assert (state == 13 + || state == 43 + || state == 102 + || state == 154); + STATE_OP (state, ++); +} + +void cb_compute_construct_end_3 (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + + assert (state == 12 + || state == 20 + || state == 31 + || state == 44 + || state == 81 + || state == 104 + || state == 153); + STATE_OP (state, ++); +} + +static acc_prof_reg reg; +static acc_prof_reg unreg; +static acc_prof_lookup_func lookup; +void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + + reg = reg_; + unreg = unreg_; + lookup = lookup_; +} + +int main() +{ + STATE_OP (state, = 0); + reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg); + reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg); + reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg); + { + int state_init; +#pragma acc parallel COPYIN(state) copyout(state_init) + { + state_init = state; + } + assert (state_init == 2); + } + assert (state == 2); + + STATE_OP (state, = 10); + reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg); + reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg); + reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg); + reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg); + reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg); + reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg); + { + int state_init; +#pragma acc parallel COPYIN(state) copyout(state_init) + { + state_init = state; + } + assert (state_init == 12); + } + assert (state == 15); + + STATE_OP (state, = 20); + unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_toggle); + unreg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_toggle); + unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg); + unreg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg); + unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_toggle); + unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle); + unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_toggle); + unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg); + unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg); + unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle); + reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle); + { + int state_init; +#pragma acc parallel COPYIN(state) copyout(state_init) + { + state_init = state; + } + assert (state_init == 20); + } + assert (state == 20); + + STATE_OP (state, = 30); + reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_toggle); + reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_toggle); + reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_toggle); + reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_toggle); + reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_toggle); + { + int state_init; +#pragma acc parallel COPYIN(state) copyout(state_init) + { + state_init = state; + } + assert (state_init == 31); + } + assert (state == 33); + + STATE_OP (state, = 40); + reg (acc_ev_compute_construct_start, cb_compute_construct_start_2, acc_reg); + unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg); + reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg); + unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg); + unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg); + reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg); + unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg); + reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg); + { + int state_init; +#pragma acc parallel COPYIN(state) copyout(state_init) + { + state_init = state; + } + assert (state_init == 42); + } + assert (state == 45); + + STATE_OP (state, = 50); + unreg (acc_ev_compute_construct_end, NULL, acc_toggle); + { + int state_init; +#pragma acc parallel COPYIN(state) copyout(state_init) + { + state_init = state; + } + assert (state_init == 52); + } + assert (state == 52); + + STATE_OP (state, = 60); + unreg (acc_ev_compute_construct_end, NULL, acc_toggle); + unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread); + unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread); + { + int state_init; +#pragma acc parallel COPYIN(state) copyout(state_init) + { + state_init = state; + } + assert (state_init == 60); + } + assert (state == 60); + + STATE_OP (state, = 70); + unreg (acc_ev_compute_construct_start, NULL, acc_toggle); + reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread); + { + int state_init; +#pragma acc parallel COPYIN(state) copyout(state_init) + { + state_init = state; + } + assert (state_init == 70); + } + assert (state == 70); + + STATE_OP (state, = 80); + unreg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg); + reg (acc_ev_compute_construct_end, NULL, acc_toggle); + reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread); + { + int state_init; +#pragma acc parallel COPYIN(state) copyout(state_init) + { + state_init = state; + } + assert (state_init == 80); + } + assert (state == 82); + + STATE_OP (state, = 90); + reg (acc_ev_compute_construct_start, NULL, acc_toggle); + unreg (acc_ev_compute_construct_end, NULL, acc_toggle); + reg (acc_ev_compute_construct_end, cb_compute_construct_end_2, acc_reg); + { + int state_init; +#pragma acc parallel COPYIN(state) copyout(state_init) + { + state_init = state; + } + assert (state_init == 92); + } + assert (state == 92); + + STATE_OP (state, = 100); + reg (acc_ev_compute_construct_end, NULL, acc_toggle); + { + int state_init; +#pragma acc parallel COPYIN(state) copyout(state_init) + { + state_init = state; + } + assert (state_init == 102); + } + assert (state == 105); + + STATE_OP (state, = 110); + unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle); + unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle); + { + int state_init; +#pragma acc parallel COPYIN(state) copyout(state_init) + { + state_init = state; + } + assert (state_init == 110); + } + assert (state == 110); + + STATE_OP (state, = 120); + unreg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread); + { + int state_init; +#pragma acc parallel COPYIN(state) copyout(state_init) + { + state_init = state; + } + assert (state_init == 120); + } + assert (state == 120); + + STATE_OP (state, = 130); + unreg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg); + reg (acc_ev_compute_construct_end, cb_compute_construct_end_3, acc_reg); + reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle); + { + int state_init; +#pragma acc parallel COPYIN(state) copyout(state_init) + { + state_init = state; + } + assert (state_init == 130); + } + assert (state == 130); + + STATE_OP (state, = 140); + unreg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg); + reg (acc_ev_compute_construct_start, cb_compute_construct_start_1, acc_reg); + unreg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg); + reg (acc_ev_compute_construct_end, cb_compute_construct_end_1, acc_reg); + { + int state_init; +#pragma acc parallel COPYIN(state) copyout(state_init) + { + state_init = state; + } + assert (state_init == 140); + } + assert (state == 140); + + STATE_OP (state, = 150); + reg (/* TODO */ (acc_event_t) 0, NULL, acc_toggle_per_thread); + { + int state_init; +#pragma acc parallel COPYIN(state) copyout(state_init) + { + state_init = state; + } + assert (state_init == 152); + } + assert (state == 155); + + return 0; +} diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c new file mode 100644 index 0000000..436f436 --- /dev/null +++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-init-1.c @@ -0,0 +1,306 @@ +/* Test dispatch of events to callbacks. */ + +#undef NDEBUG +#include <assert.h> +#include <stdlib.h> +#include <string.h> + +#include <acc_prof.h> + +/* Use explicit copyin clauses, to work around firstprivate optimizations, + which will cause the value at the point of call to be used (*before* any + potential modifications done in callbacks), as opposed to its address being + taken, which then later gets dereferenced (*after* any modifications done in + callbacks). */ +#define COPYIN(...) copyin(__VA_ARGS__) + +#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__) + +static int state = -1; +#define STATE_OP(state, op)\ + do \ + { \ + typeof (state) state_o = (state); \ + (void) state_o; \ + (state)op; \ + DEBUG_printf("state: %d -> %d\n", state_o, (state)); \ + } \ + while (0) + +static acc_device_t acc_device_type; +static int acc_device_num; +static int acc_async; + +struct tool_info +{ + acc_event_info event_info; + struct tool_info *nested; +}; +struct tool_info *tool_info; + +void cb_device_init_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + + assert (state == 0 + || state == 100); + STATE_OP (state, ++); + + assert (tool_info == NULL); + tool_info = (struct tool_info *) malloc(sizeof *tool_info); + assert (tool_info != NULL); + tool_info->nested = NULL; + + assert (prof_info->event_type == acc_ev_device_init_start); + assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES); + assert (prof_info->version == _ACC_PROF_INFO_VERSION); + if (state == 1) + assert (prof_info->device_type == acc_device_host); + else + assert (prof_info->device_type == acc_device_default); + assert (prof_info->device_number == acc_device_num); + assert (prof_info->thread_id == -1); + assert (prof_info->async == acc_async_sync); + assert (prof_info->async_queue == prof_info->async); + assert (prof_info->src_file == NULL); + assert (prof_info->func_name == NULL); + assert (prof_info->line_no == -1); + assert (prof_info->end_line_no == -1); + assert (prof_info->func_line_no == -1); + assert (prof_info->func_end_line_no == -1); + + assert (event_info->other_event.event_type == prof_info->event_type); + assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES); + assert (event_info->other_event.parent_construct == acc_construct_runtime_api); + assert (event_info->other_event.implicit == 0); + assert (event_info->other_event.tool_info == NULL); + + assert (api_info->device_api == acc_device_api_none); + assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES); + assert (api_info->device_type == prof_info->device_type); + assert (api_info->vendor == -1); + assert (api_info->device_handle == NULL); + assert (api_info->context_handle == NULL); + assert (api_info->async_handle == NULL); + + tool_info->event_info.other_event.event_type = event_info->other_event.event_type; + event_info->other_event.tool_info = tool_info; +} + +void cb_device_init_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + + assert (state == 1 + || state == 101); + STATE_OP (state, ++); + + assert (tool_info != NULL); + assert (tool_info->event_info.other_event.event_type == acc_ev_device_init_start); + + assert (prof_info->event_type == acc_ev_device_init_end); + assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES); + assert (prof_info->version == _ACC_PROF_INFO_VERSION); + if (state == 2) + assert (prof_info->device_type == acc_device_host); + else + assert (prof_info->device_type == acc_device_default); + assert (prof_info->device_number == acc_device_num); + assert (prof_info->thread_id == -1); + assert (prof_info->async == acc_async_sync); + assert (prof_info->async_queue == prof_info->async); + assert (prof_info->src_file == NULL); + assert (prof_info->func_name == NULL); + assert (prof_info->line_no == -1); + assert (prof_info->end_line_no == -1); + assert (prof_info->func_line_no == -1); + assert (prof_info->func_end_line_no == -1); + + assert (event_info->other_event.event_type == prof_info->event_type); + assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES); + assert (event_info->other_event.parent_construct == acc_construct_runtime_api); + assert (event_info->other_event.implicit == 0); + assert (event_info->other_event.tool_info == tool_info); + + assert (api_info->device_api == acc_device_api_none); + assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES); + assert (api_info->device_type == prof_info->device_type); + assert (api_info->vendor == -1); + assert (api_info->device_handle == NULL); + assert (api_info->context_handle == NULL); + assert (api_info->async_handle == NULL); + + free (tool_info); + tool_info = NULL; +} + +void cb_compute_construct_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + + assert (state == 10 + || state == 110); + STATE_OP (state, ++); + + assert (tool_info == NULL); + tool_info = (struct tool_info *) malloc(sizeof *tool_info); + assert (tool_info != NULL); + tool_info->nested = NULL; + + assert (prof_info->event_type == acc_ev_compute_construct_start); + assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES); + assert (prof_info->version == _ACC_PROF_INFO_VERSION); + assert (prof_info->device_type == acc_device_type); + assert (prof_info->device_number == acc_device_num); + assert (prof_info->thread_id == -1); + assert (prof_info->async == /* TODO acc_async */ acc_async_sync); + assert (prof_info->async_queue == prof_info->async); + assert (prof_info->src_file == NULL); + assert (prof_info->func_name == NULL); + assert (prof_info->line_no == -1); + assert (prof_info->end_line_no == -1); + assert (prof_info->func_line_no == -1); + assert (prof_info->func_end_line_no == -1); + + assert (event_info->other_event.event_type == prof_info->event_type); + assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES); + assert (event_info->other_event.parent_construct == acc_construct_parallel); + assert (event_info->other_event.implicit == 0); + assert (event_info->other_event.tool_info == NULL); + + assert (api_info->device_api == acc_device_api_none); + assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES); + assert (api_info->device_type == prof_info->device_type); + assert (api_info->vendor == -1); + assert (api_info->device_handle == NULL); + assert (api_info->context_handle == NULL); + assert (api_info->async_handle == NULL); + + tool_info->event_info.other_event.event_type = event_info->other_event.event_type; + event_info->other_event.tool_info = tool_info; +} + +void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + + assert (state == 11 + || state == 111); + STATE_OP (state, ++); + + assert (tool_info != NULL); + assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start); + assert (tool_info->nested == NULL); + + assert (prof_info->event_type == acc_ev_compute_construct_end); + assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES); + assert (prof_info->version == _ACC_PROF_INFO_VERSION); + assert (prof_info->device_type == acc_device_type); + assert (prof_info->device_number == acc_device_num); + assert (prof_info->thread_id == -1); + if (acc_device_type == acc_device_host) + assert (prof_info->async == acc_async_sync); + else + assert (prof_info->async == acc_async); + assert (prof_info->async_queue == prof_info->async); + assert (prof_info->src_file == NULL); + assert (prof_info->func_name == NULL); + assert (prof_info->line_no == -1); + assert (prof_info->end_line_no == -1); + assert (prof_info->func_line_no == -1); + assert (prof_info->func_end_line_no == -1); + + assert (event_info->other_event.event_type == prof_info->event_type); + assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES); + assert (event_info->other_event.parent_construct == acc_construct_parallel); + assert (event_info->other_event.implicit == 0); + assert (event_info->other_event.tool_info == tool_info); + + if (acc_device_type == acc_device_host) + assert (api_info->device_api == acc_device_api_none); + else + assert (api_info->device_api == acc_device_api_cuda); + assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES); + assert (api_info->device_type == prof_info->device_type); + assert (api_info->vendor == -1); + assert (api_info->device_handle == NULL); + assert (api_info->context_handle == NULL); + assert (api_info->async_handle == NULL); + + free (tool_info); + tool_info = NULL; +} + +static acc_prof_reg reg; +static acc_prof_reg unreg; +static acc_prof_lookup_func lookup; +void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + + reg = reg_; + unreg = unreg_; + lookup = lookup_; +} + +int main() +{ + STATE_OP (state, = 0); + reg (acc_ev_device_init_start, cb_device_init_start, acc_reg); + reg (acc_ev_device_init_end, cb_device_init_end, acc_reg); + reg (acc_ev_compute_construct_start, cb_compute_construct_start, acc_reg); + reg (acc_ev_compute_construct_end, cb_compute_construct_end, acc_reg); + assert (state == 0); + + acc_init (acc_device_host); + assert (state == 2); + + STATE_OP (state, = 10); + + acc_device_type = acc_get_device_type (); + acc_device_num = acc_get_device_num (acc_device_type); + acc_async = 12; + + { + int state_init; +#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init) + { + state_init = state; + } +#pragma acc wait + assert (state_init == 11); + } + assert (state == 12); + + STATE_OP (state, = 90); + acc_shutdown (acc_device_host); + assert (state == 90); + + + STATE_OP (state, = 100); + acc_init (acc_device_default); + assert (state == 102); + + STATE_OP (state, = 110); + + acc_device_type = acc_get_device_type (); + acc_device_num = acc_get_device_num (acc_device_type); + acc_async = 12; + + { + int state_init; +#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init) + { + state_init = state; + } +#pragma acc wait + assert (state_init == 111); + } + assert (state == 112); + + STATE_OP (state, = 190); + acc_shutdown (acc_device_default); + assert (state == 190); + + return 0; +} diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c new file mode 100644 index 0000000..de26323 --- /dev/null +++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-parallel-1.c @@ -0,0 +1,703 @@ +/* Test dispatch of events to callbacks. */ + +#undef NDEBUG +#include <assert.h> +#include <stdlib.h> +#include <string.h> + +#include <acc_prof.h> + +/* Use explicit copyin clauses, to work around firstprivate optimizations, + which will cause the value at the point of call to be used (*before* any + potential modifications done in callbacks), as opposed to its address being + taken, which then later gets dereferenced (*after* any modifications done in + callbacks). */ +#define COPYIN(...) copyin(__VA_ARGS__) + +/* See the "DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT" reference in + libgomp.texi. */ +#define DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT 0 + +#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__) + +static int state = -1; +#define STATE_OP(state, op)\ + do \ + { \ + typeof (state) state_o = (state); \ + (void) state_o; \ + (state)op; \ + DEBUG_printf("state: %d -> %d\n", state_o, (state)); \ + } \ + while (0) + +static acc_device_t acc_device_type; +static int acc_device_num; +static int acc_async; + +struct tool_info +{ + acc_event_info event_info; + struct tool_info *nested; +}; +struct tool_info *tool_info; + +void cb_device_init_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + +#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT + assert (state == 1 + || state == 101); + STATE_OP (state, ++); + + assert (tool_info != NULL); + assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start); + assert (tool_info->nested == NULL); + tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info); + assert (tool_info->nested != NULL); + tool_info->nested->nested = NULL; +#else + assert (state == 0 + || state == 100); + STATE_OP (state, ++); + + assert (tool_info == NULL); + tool_info = (struct tool_info *) malloc(sizeof *tool_info); + assert (tool_info != NULL); + tool_info->nested = NULL; +#endif + + assert (prof_info->event_type == acc_ev_device_init_start); + assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES); + assert (prof_info->version == _ACC_PROF_INFO_VERSION); + assert (prof_info->device_type == acc_device_default); + assert (prof_info->device_number == acc_device_num); + assert (prof_info->thread_id == -1); + assert (prof_info->async == acc_async_sync); + assert (prof_info->async_queue == prof_info->async); + assert (prof_info->src_file == NULL); + assert (prof_info->func_name == NULL); + assert (prof_info->line_no == -1); + assert (prof_info->end_line_no == -1); + assert (prof_info->func_line_no == -1); + assert (prof_info->func_end_line_no == -1); + + assert (event_info->other_event.event_type == prof_info->event_type); + assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES); + assert (event_info->other_event.parent_construct == acc_construct_parallel); + assert (event_info->other_event.implicit == 1); + assert (event_info->other_event.tool_info == NULL); + + assert (api_info->device_api == acc_device_api_none); + assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES); + assert (api_info->device_type == prof_info->device_type); + assert (api_info->vendor == -1); + assert (api_info->device_handle == NULL); + assert (api_info->context_handle == NULL); + assert (api_info->async_handle == NULL); + +#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT + tool_info->nested->event_info.other_event.event_type = event_info->other_event.event_type; + event_info->other_event.tool_info = tool_info->nested; +#else + tool_info->event_info.other_event.event_type = event_info->other_event.event_type; + event_info->other_event.tool_info = tool_info; +#endif +} + +void cb_device_init_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + +#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT + assert (state == 2 + || state == 102); + STATE_OP (state, ++); + + assert (tool_info != NULL); + assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start); + assert (tool_info->nested != NULL); + assert (tool_info->nested->event_info.other_event.event_type == acc_ev_device_init_start); +#else + assert (state == 1 + || state == 101); + STATE_OP (state, ++); + + assert (tool_info != NULL); + assert (tool_info->event_info.other_event.event_type == acc_ev_device_init_start); +#endif + + assert (prof_info->event_type == acc_ev_device_init_end); + assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES); + assert (prof_info->version == _ACC_PROF_INFO_VERSION); + assert (prof_info->device_type == acc_device_default); + assert (prof_info->device_number == acc_device_num); + assert (prof_info->thread_id == -1); + assert (prof_info->async == acc_async_sync); + assert (prof_info->async_queue == prof_info->async); + assert (prof_info->src_file == NULL); + assert (prof_info->func_name == NULL); + assert (prof_info->line_no == -1); + assert (prof_info->end_line_no == -1); + assert (prof_info->func_line_no == -1); + assert (prof_info->func_end_line_no == -1); + + assert (event_info->other_event.event_type == prof_info->event_type); + assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES); + assert (event_info->other_event.parent_construct == acc_construct_parallel); + assert (event_info->other_event.implicit == 1); +#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT + assert (event_info->other_event.tool_info == tool_info->nested); +#else + assert (event_info->other_event.tool_info == tool_info); +#endif + + assert (api_info->device_api == acc_device_api_none); + assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES); + assert (api_info->device_type == prof_info->device_type); + assert (api_info->vendor == -1); + assert (api_info->device_handle == NULL); + assert (api_info->context_handle == NULL); + assert (api_info->async_handle == NULL); + +#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT + free (tool_info->nested); + tool_info->nested = NULL; +#else + free (tool_info); + tool_info = NULL; +#endif +} + +void cb_enter_data_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + + assert (state == 3 + || state == 103); + STATE_OP (state, ++); + + assert (tool_info != NULL); + assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start); + assert (tool_info->nested == NULL); + tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info); + assert (tool_info->nested != NULL); + tool_info->nested->nested = NULL; + + assert (prof_info->event_type == acc_ev_enter_data_start); + assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES); + assert (prof_info->version == _ACC_PROF_INFO_VERSION); + assert (prof_info->device_type == acc_device_type); + assert (prof_info->device_number == acc_device_num); + assert (prof_info->thread_id == -1); + assert (prof_info->async == acc_async); + assert (prof_info->async_queue == prof_info->async); + assert (prof_info->src_file == NULL); + assert (prof_info->func_name == NULL); + assert (prof_info->line_no == -1); + assert (prof_info->end_line_no == -1); + assert (prof_info->func_line_no == -1); + assert (prof_info->func_end_line_no == -1); + + assert (event_info->other_event.event_type == prof_info->event_type); + assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES); + assert (event_info->other_event.parent_construct == acc_construct_parallel); + assert (event_info->other_event.implicit == 1); + assert (event_info->other_event.tool_info == NULL); + + if (acc_device_type == acc_device_host + || state < 100) //TODO + assert (api_info->device_api == acc_device_api_none); + else + assert (api_info->device_api == acc_device_api_cuda); + assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES); + assert (api_info->device_type == prof_info->device_type); + assert (api_info->vendor == -1); + assert (api_info->device_handle == NULL); + assert (api_info->context_handle == NULL); + assert (api_info->async_handle == NULL); + + tool_info->nested->event_info.other_event.event_type = event_info->other_event.event_type; + event_info->other_event.tool_info = tool_info->nested; +} + +void cb_enter_data_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + + assert (state == 4 + || state == 104); + STATE_OP (state, ++); + + assert (tool_info != NULL); + assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start); + assert (tool_info->nested != NULL); + assert (tool_info->nested->event_info.other_event.event_type == acc_ev_enter_data_start); + + assert (prof_info->event_type == acc_ev_enter_data_end); + assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES); + assert (prof_info->version == _ACC_PROF_INFO_VERSION); + assert (prof_info->device_type == acc_device_type); + assert (prof_info->device_number == acc_device_num); + assert (prof_info->thread_id == -1); + assert (prof_info->async == acc_async); + assert (prof_info->async_queue == prof_info->async); + assert (prof_info->src_file == NULL); + assert (prof_info->func_name == NULL); + assert (prof_info->line_no == -1); + assert (prof_info->end_line_no == -1); + assert (prof_info->func_line_no == -1); + assert (prof_info->func_end_line_no == -1); + + assert (event_info->other_event.event_type == prof_info->event_type); + assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES); + assert (event_info->other_event.parent_construct == acc_construct_parallel); + assert (event_info->other_event.implicit == 1); + assert (event_info->other_event.tool_info == tool_info->nested); + + if (acc_device_type == acc_device_host) + assert (api_info->device_api == acc_device_api_none); + else + assert (api_info->device_api == acc_device_api_cuda); + assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES); + assert (api_info->device_type == prof_info->device_type); + assert (api_info->vendor == -1); + assert (api_info->device_handle == NULL); + assert (api_info->context_handle == NULL); + assert (api_info->async_handle == NULL); + + free (tool_info->nested); + tool_info->nested = NULL; +} + +void cb_exit_data_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + + assert (state == 7); + STATE_OP (state, ++); + + assert (tool_info != NULL); + assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start); + assert (tool_info->nested == NULL); + tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info); + assert (tool_info->nested != NULL); + tool_info->nested->nested = NULL; + + assert (prof_info->event_type == acc_ev_exit_data_start); + assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES); + assert (prof_info->version == _ACC_PROF_INFO_VERSION); + assert (prof_info->device_type == acc_device_type); + assert (prof_info->device_number == acc_device_num); + assert (prof_info->thread_id == -1); + assert (prof_info->async == acc_async); + assert (prof_info->async_queue == prof_info->async); + assert (prof_info->src_file == NULL); + assert (prof_info->func_name == NULL); + assert (prof_info->line_no == -1); + assert (prof_info->end_line_no == -1); + assert (prof_info->func_line_no == -1); + assert (prof_info->func_end_line_no == -1); + + assert (event_info->other_event.event_type == prof_info->event_type); + assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES); + assert (event_info->other_event.parent_construct == acc_construct_parallel); + assert (event_info->other_event.implicit == 1); + assert (event_info->other_event.tool_info == NULL); + + if (acc_device_type == acc_device_host) + assert (api_info->device_api == acc_device_api_none); + else + assert (api_info->device_api == acc_device_api_cuda); + assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES); + assert (api_info->device_type == prof_info->device_type); + assert (api_info->vendor == -1); + assert (api_info->device_handle == NULL); + assert (api_info->context_handle == NULL); + assert (api_info->async_handle == NULL); + + tool_info->nested->event_info.other_event.event_type = event_info->other_event.event_type; + event_info->other_event.tool_info = tool_info->nested; +} + +void cb_exit_data_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + + assert (state == 8); + STATE_OP (state, ++); + + assert (tool_info != NULL); + assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start); + assert (tool_info->nested != NULL); + assert (tool_info->nested->event_info.other_event.event_type == acc_ev_exit_data_start); + + assert (prof_info->event_type == acc_ev_exit_data_end); + assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES); + assert (prof_info->version == _ACC_PROF_INFO_VERSION); + assert (prof_info->device_type == acc_device_type); + assert (prof_info->device_number == acc_device_num); + assert (prof_info->thread_id == -1); + assert (prof_info->async == acc_async); + assert (prof_info->async_queue == prof_info->async); + assert (prof_info->src_file == NULL); + assert (prof_info->func_name == NULL); + assert (prof_info->line_no == -1); + assert (prof_info->end_line_no == -1); + assert (prof_info->func_line_no == -1); + assert (prof_info->func_end_line_no == -1); + + assert (event_info->other_event.event_type == prof_info->event_type); + assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES); + assert (event_info->other_event.parent_construct == acc_construct_parallel); + assert (event_info->other_event.implicit == 1); + assert (event_info->other_event.tool_info == tool_info->nested); + + if (acc_device_type == acc_device_host) + assert (api_info->device_api == acc_device_api_none); + else + assert (api_info->device_api == acc_device_api_cuda); + assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES); + assert (api_info->device_type == prof_info->device_type); + assert (api_info->vendor == -1); + assert (api_info->device_handle == NULL); + assert (api_info->context_handle == NULL); + assert (api_info->async_handle == NULL); + + free (tool_info->nested); + tool_info->nested = NULL; +} + +void cb_compute_construct_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + +#if DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT + assert (state == 0 + || state == 100); + if (state == 100) + { + /* Compensate for the missing acc_ev_device_init_start and + acc_ev_device_init_end. */ + state += 2; + } +#else + if (state == 100) + { + /* Compensate for the missing acc_ev_device_init_start and + acc_ev_device_init_end. */ + state += 2; + } + assert (state == 2 + || state == 102); +#endif + STATE_OP (state, ++); + + assert (tool_info == NULL); + tool_info = (struct tool_info *) malloc(sizeof *tool_info); + assert (tool_info != NULL); + tool_info->nested = NULL; + + assert (prof_info->event_type == acc_ev_compute_construct_start); + assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES); + assert (prof_info->version == _ACC_PROF_INFO_VERSION); + assert (prof_info->device_type == acc_device_type); + assert (prof_info->device_number == acc_device_num); + assert (prof_info->thread_id == -1); + assert (prof_info->async == /* TODO acc_async */ acc_async_sync); + assert (prof_info->async_queue == prof_info->async); + assert (prof_info->src_file == NULL); + assert (prof_info->func_name == NULL); + assert (prof_info->line_no == -1); + assert (prof_info->end_line_no == -1); + assert (prof_info->func_line_no == -1); + assert (prof_info->func_end_line_no == -1); + + assert (event_info->other_event.event_type == prof_info->event_type); + assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES); + assert (event_info->other_event.parent_construct == acc_construct_parallel); + assert (event_info->other_event.implicit == 0); + assert (event_info->other_event.tool_info == NULL); + + assert (api_info->device_api == acc_device_api_none); + assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES); + assert (api_info->device_type == prof_info->device_type); + assert (api_info->vendor == -1); + assert (api_info->device_handle == NULL); + assert (api_info->context_handle == NULL); + assert (api_info->async_handle == NULL); + + tool_info->event_info.other_event.event_type = event_info->other_event.event_type; + event_info->other_event.tool_info = tool_info; + + if (acc_device_type == acc_device_host) + { + /* Compensate for the missing acc_ev_enter_data_start. */ + state += 1; + } +} + +void cb_compute_construct_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + + if (acc_device_type == acc_device_host) + { + /* Compensate for the missing acc_ev_enter_data_end. */ + state += 1; + /* Compensate for the missing acc_ev_enqueue_launch_start and + acc_ev_enqueue_launch_end. */ + state += 2; + /* Compensate for the missing acc_ev_exit_data_start and + acc_ev_exit_data_end. */ + state += 2; + } + else if (acc_async != acc_async_sync) + { + /* Compensate for the missing acc_ev_exit_data_start and + acc_ev_exit_data_end. */ + state += 2; + } + assert (state == 9 + || state == 109); + STATE_OP (state, ++); + + assert (tool_info != NULL); + assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start); + assert (tool_info->nested == NULL); + + assert (prof_info->event_type == acc_ev_compute_construct_end); + assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES); + assert (prof_info->version == _ACC_PROF_INFO_VERSION); + assert (prof_info->device_type == acc_device_type); + assert (prof_info->device_number == acc_device_num); + assert (prof_info->thread_id == -1); + if (acc_device_type == acc_device_host) + assert (prof_info->async == acc_async_sync); + else + assert (prof_info->async == acc_async); + assert (prof_info->async_queue == prof_info->async); + assert (prof_info->src_file == NULL); + assert (prof_info->func_name == NULL); + assert (prof_info->line_no == -1); + assert (prof_info->end_line_no == -1); + assert (prof_info->func_line_no == -1); + assert (prof_info->func_end_line_no == -1); + + assert (event_info->other_event.event_type == prof_info->event_type); + assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES); + assert (event_info->other_event.parent_construct == acc_construct_parallel); + assert (event_info->other_event.implicit == 0); + assert (event_info->other_event.tool_info == tool_info); + + if (acc_device_type == acc_device_host) + assert (api_info->device_api == acc_device_api_none); + else + assert (api_info->device_api == acc_device_api_cuda); + assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES); + assert (api_info->device_type == prof_info->device_type); + assert (api_info->vendor == -1); + assert (api_info->device_handle == NULL); + assert (api_info->context_handle == NULL); + assert (api_info->async_handle == NULL); + + free (tool_info); + tool_info = NULL; +} + +void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + + assert (acc_device_type != acc_device_host); + + assert (state == 5 + || state == 105); + STATE_OP (state, ++); + + assert (tool_info != NULL); + assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start); + assert (tool_info->nested == NULL); + tool_info->nested = (struct tool_info *) malloc(sizeof *tool_info); + assert (tool_info->nested != NULL); + tool_info->nested->nested = NULL; + + assert (prof_info->event_type == acc_ev_enqueue_launch_start); + assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES); + assert (prof_info->version == _ACC_PROF_INFO_VERSION); + assert (prof_info->device_type == acc_device_type); + assert (prof_info->device_number == acc_device_num); + assert (prof_info->thread_id == -1); + assert (prof_info->async == acc_async); + assert (prof_info->async_queue == prof_info->async); + assert (prof_info->src_file == NULL); + assert (prof_info->func_name == NULL); + assert (prof_info->line_no == -1); + assert (prof_info->end_line_no == -1); + assert (prof_info->func_line_no == -1); + assert (prof_info->func_end_line_no == -1); + + assert (event_info->launch_event.event_type == prof_info->event_type); + assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES); + assert (event_info->launch_event.parent_construct == acc_construct_parallel); + assert (event_info->launch_event.implicit == 1); + assert (event_info->launch_event.tool_info == NULL); + assert (event_info->launch_event.kernel_name != NULL); + { + char *s = strstr (event_info->launch_event.kernel_name, "main"); + assert (s != NULL); + s = strstr (s, "omp_fn"); + assert (s != NULL); + } + assert (event_info->launch_event.num_gangs >= 1); + assert (event_info->launch_event.num_workers >= 1); + assert (event_info->launch_event.vector_length >= 1); + + if (acc_device_type == acc_device_host) + assert (api_info->device_api == acc_device_api_none); + else + assert (api_info->device_api == acc_device_api_cuda); + assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES); + assert (api_info->device_type == prof_info->device_type); + assert (api_info->vendor == -1); + assert (api_info->device_handle == NULL); + assert (api_info->context_handle == NULL); + assert (api_info->async_handle == NULL); + + tool_info->nested->event_info.launch_event.event_type = event_info->launch_event.event_type; + tool_info->nested->event_info.launch_event.kernel_name = strdup (event_info->launch_event.kernel_name); + tool_info->nested->event_info.launch_event.num_gangs = event_info->launch_event.num_gangs; + tool_info->nested->event_info.launch_event.num_workers = event_info->launch_event.num_workers; + tool_info->nested->event_info.launch_event.vector_length = event_info->launch_event.vector_length; + event_info->other_event.tool_info = tool_info->nested; +} + +void cb_enqueue_launch_end (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + + assert (acc_device_type != acc_device_host); + + assert (state == 6 + || state == 106); + STATE_OP (state, ++); + + assert (tool_info != NULL); + assert (tool_info->event_info.other_event.event_type == acc_ev_compute_construct_start); + assert (tool_info->nested != NULL); + assert (tool_info->nested->event_info.launch_event.event_type == acc_ev_enqueue_launch_start); + assert (tool_info->nested->event_info.launch_event.kernel_name != NULL); + assert (tool_info->nested->event_info.launch_event.num_gangs >= 1); + assert (tool_info->nested->event_info.launch_event.num_workers >= 1); + assert (tool_info->nested->event_info.launch_event.vector_length >= 1); + + assert (prof_info->event_type == acc_ev_enqueue_launch_end); + assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES); + assert (prof_info->version == _ACC_PROF_INFO_VERSION); + assert (prof_info->device_type == acc_device_type); + assert (prof_info->device_number == acc_device_num); + assert (prof_info->thread_id == -1); + assert (prof_info->async == acc_async); + assert (prof_info->async_queue == prof_info->async); + assert (prof_info->src_file == NULL); + assert (prof_info->func_name == NULL); + assert (prof_info->line_no == -1); + assert (prof_info->end_line_no == -1); + assert (prof_info->func_line_no == -1); + assert (prof_info->func_end_line_no == -1); + + assert (event_info->launch_event.event_type == prof_info->event_type); + assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES); + assert (event_info->launch_event.parent_construct == acc_construct_parallel); + assert (event_info->launch_event.implicit == 1); + assert (event_info->launch_event.tool_info == tool_info->nested); + assert (event_info->launch_event.kernel_name != NULL); + assert (strcmp (event_info->launch_event.kernel_name, tool_info->nested->event_info.launch_event.kernel_name) == 0); + assert (event_info->launch_event.num_gangs == tool_info->nested->event_info.launch_event.num_gangs); + assert (event_info->launch_event.num_workers == tool_info->nested->event_info.launch_event.num_workers); + assert (event_info->launch_event.vector_length == tool_info->nested->event_info.launch_event.vector_length); + + if (acc_device_type == acc_device_host) + assert (api_info->device_api == acc_device_api_none); + else + assert (api_info->device_api == acc_device_api_cuda); + assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES); + assert (api_info->device_type == prof_info->device_type); + assert (api_info->vendor == -1); + assert (api_info->device_handle == NULL); + assert (api_info->context_handle == NULL); + assert (api_info->async_handle == NULL); + + free (tool_info->nested->event_info.launch_event.kernel_name); + free (tool_info->nested); + tool_info->nested = NULL; +} + +static acc_prof_reg reg; +static acc_prof_reg unreg; +static acc_prof_lookup_func lookup; +void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + + reg = reg_; + unreg = unreg_; + lookup = lookup_; +} + +int main() +{ + STATE_OP (state, = 0); + reg (acc_ev_device_init_start, cb_device_init_start, acc_reg); + reg (acc_ev_device_init_end, cb_device_init_end, acc_reg); + reg (acc_ev_enter_data_start, cb_enter_data_start, acc_reg); + reg (acc_ev_enter_data_end, cb_enter_data_end, acc_reg); + reg (acc_ev_exit_data_start, cb_exit_data_start, acc_reg); + reg (acc_ev_exit_data_end, cb_exit_data_end, acc_reg); + reg (acc_ev_compute_construct_start, cb_compute_construct_start, acc_reg); + reg (acc_ev_compute_construct_end, cb_compute_construct_end, acc_reg); + reg (acc_ev_enqueue_launch_start, cb_enqueue_launch_start, acc_reg); + reg (acc_ev_enqueue_launch_end, cb_enqueue_launch_end, acc_reg); + assert (state == 0); + + acc_device_type = acc_get_device_type (); + acc_device_num = acc_get_device_num (acc_device_type); + acc_async = acc_async_sync; + assert (state == 0); + + { + int state_init; +#pragma acc parallel COPYIN(state) copyout(state_init) + { + state_init = state; + } + assert (state_init == 4); + } +#ifdef __OPTIMIZE__ + /* TODO. With -O2 optimizations enabled, the compiler believes that here + "state == 0" still holds. It's not yet clear what's going on. + Mis-optimization across the GOMP function call boundary? Per its + gcc/omp-builtins.def definition, BUILT_IN_GOACC_PARALLEL + "GOACC_parallel_keyed" doesn't have a "leaf" attribute, so the compiler + must expect calls back into this compilation unit? */ + asm volatile ("" : : : "memory"); +#endif + assert (state == 10); + + STATE_OP (state, = 100); + + acc_async = 12; + { + int state_init; +#pragma acc parallel async(acc_async) COPYIN(state) copyout(state_init) + { + state_init = state; + } +#pragma acc wait + assert (state_init == 104); + } + assert (state == 110); + + return 0; +} diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c new file mode 100644 index 0000000..a952c7a --- /dev/null +++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-valid_bytes-1.c @@ -0,0 +1,172 @@ +/* Test the "valid_bytes" magic. */ + +#undef NDEBUG +#include <assert.h> + +#include <acc_prof.h> + +#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__) + +void cb_data_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type); + + assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES); + assert (event_info->data_event.valid_bytes == _ACC_DATA_EVENT_INFO_VALID_BYTES); + assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES); +} + +void cb_launch_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type); + + assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES); + assert (event_info->launch_event.valid_bytes == _ACC_LAUNCH_EVENT_INFO_VALID_BYTES); + assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES); +} + +void cb_other_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type); + + assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES); + assert (event_info->other_event.valid_bytes == _ACC_OTHER_EVENT_INFO_VALID_BYTES); + assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES); +} + +void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + + reg_ (acc_ev_device_init_start, cb_other_event, acc_reg); + reg_ (acc_ev_device_init_end, cb_other_event, acc_reg); + reg_ (acc_ev_device_shutdown_start, cb_other_event, acc_reg); + reg_ (acc_ev_device_shutdown_end, cb_other_event, acc_reg); + reg_ (acc_ev_runtime_shutdown, cb_other_event, acc_reg); + reg_ (acc_ev_create, cb_data_event, acc_reg); + reg_ (acc_ev_delete, cb_data_event, acc_reg); + reg_ (acc_ev_alloc, cb_data_event, acc_reg); + reg_ (acc_ev_free, cb_data_event, acc_reg); + reg_ (acc_ev_enter_data_start, cb_other_event, acc_reg); + reg_ (acc_ev_enter_data_end, cb_other_event, acc_reg); + reg_ (acc_ev_exit_data_start, cb_other_event, acc_reg); + reg_ (acc_ev_exit_data_end, cb_other_event, acc_reg); + reg_ (acc_ev_update_start, cb_other_event, acc_reg); + reg_ (acc_ev_update_end, cb_other_event, acc_reg); + reg_ (acc_ev_compute_construct_start, cb_other_event, acc_reg); + reg_ (acc_ev_compute_construct_end, cb_other_event, acc_reg); + reg_ (acc_ev_enqueue_launch_start, cb_launch_event, acc_reg); + reg_ (acc_ev_enqueue_launch_end, cb_launch_event, acc_reg); + reg_ (acc_ev_enqueue_upload_start, cb_data_event, acc_reg); + reg_ (acc_ev_enqueue_upload_end, cb_data_event, acc_reg); + reg_ (acc_ev_enqueue_download_start, cb_data_event, acc_reg); + reg_ (acc_ev_enqueue_download_end, cb_data_event, acc_reg); + reg_ (acc_ev_wait_start, cb_other_event, acc_reg); + reg_ (acc_ev_wait_end, cb_other_event, acc_reg); +} + +/* Basic struct. */ +typedef struct A +{ + int a; + int b; +#define VALID_BYTES_A \ + _ACC_PROF_VALID_BYTES_STRUCT (A, b, \ + _ACC_PROF_VALID_BYTES_BASICTYPE (int)) +} A; + +/* Add a "char" field. */ +typedef struct B +{ + int a; + int b; + char c; +#define VALID_BYTES_B \ + _ACC_PROF_VALID_BYTES_STRUCT (B, c, \ + _ACC_PROF_VALID_BYTES_BASICTYPE (char)) +} B; + +/* Add another "char" field. */ +typedef struct C +{ + int a; + int b; + char c, d; +#define VALID_BYTES_C \ + _ACC_PROF_VALID_BYTES_STRUCT (C, d, \ + _ACC_PROF_VALID_BYTES_BASICTYPE (char)) +} C; + +/* Add two "void *" fields. */ +typedef struct D +{ + int a; + int b; + char c, d; + void *e; + void *f; +#define VALID_BYTES_D \ + _ACC_PROF_VALID_BYTES_STRUCT (D, f, \ + _ACC_PROF_VALID_BYTES_BASICTYPE (void *)) +} D; + +/* Add another three "char" fields. */ +typedef struct E +{ + int a; + int b; + char c, d; + void *e; + void *f; + char g, h, i; +#define VALID_BYTES_E \ + _ACC_PROF_VALID_BYTES_STRUCT (E, i, \ + _ACC_PROF_VALID_BYTES_BASICTYPE (char)) +} E; + +int main() +{ + A A1; + DEBUG_printf ("s=%zd, vb=%zd\n", sizeof A1, VALID_BYTES_A); + assert (VALID_BYTES_A <= sizeof A1); + DEBUG_printf ("&A1=%p, &A1.b=%p\n", &A1, &A1.b); + assert (((char *) &A1) + VALID_BYTES_A == (char *) (&A1.b + 1)); + + B B1; + DEBUG_printf ("s=%zd, vb=%zd\n", sizeof B1, VALID_BYTES_B); + assert (VALID_BYTES_B <= sizeof B1); + DEBUG_printf ("&B1=%p, &B1.c=%p\n", &B1, &B1.c); + assert (((char *) &B1) + VALID_BYTES_B == (char *) (&B1.c + 1)); + + assert (VALID_BYTES_B == VALID_BYTES_A + 1 * sizeof (char)); + + C C1; + DEBUG_printf ("s=%zd, vb=%zd\n", sizeof C1, VALID_BYTES_C); + assert (VALID_BYTES_C <= sizeof C1); + DEBUG_printf ("&C1=%p, &C1.d=%p\n", &C1, &C1.d); + assert (((char *) &C1) + VALID_BYTES_C == (char *) (&C1.d + 1)); + + assert (VALID_BYTES_C == VALID_BYTES_B + 1 * sizeof (char)); + + D D1; + DEBUG_printf ("s=%zd, vb=%zd\n", sizeof D1, VALID_BYTES_D); + assert (VALID_BYTES_D <= sizeof D1); + DEBUG_printf ("&D1=%p, &D1.f=%p\n", &D1, &D1.f); + assert (((char *) &D1) + VALID_BYTES_D == (char *) (&D1.f + 1)); + + assert (VALID_BYTES_D > VALID_BYTES_C); + + E E1; + DEBUG_printf ("s=%zd, vb=%zd\n", sizeof E1, VALID_BYTES_E); + assert (VALID_BYTES_E <= sizeof E1); + DEBUG_printf ("&E1=%p, &E1.i=%p\n", &E1, &E1.i); + assert (((char *) &E1) + VALID_BYTES_E == (char *) (&E1.i + 1)); + + assert (VALID_BYTES_E == VALID_BYTES_D + 3 * sizeof (char)); + +#pragma acc parallel + { + } + + return 0; +} diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c new file mode 100644 index 0000000..b0b8934 --- /dev/null +++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-version-1.c @@ -0,0 +1,55 @@ +/* Test "acc_prof_info"'s "version" field. */ + +#undef NDEBUG +#include <assert.h> + +#include <acc_prof.h> + +#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__) + +void cb_any_event (acc_prof_info *prof_info, acc_event_info *event_info, acc_api_info *api_info) +{ + DEBUG_printf ("%s %d\n", __FUNCTION__, prof_info->event_type); + + assert (prof_info->version == 201510); +} + +void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, acc_prof_lookup_func lookup_) +{ + DEBUG_printf ("%s\n", __FUNCTION__); + + reg_ (acc_ev_device_init_start, cb_any_event, acc_reg); + reg_ (acc_ev_device_init_end, cb_any_event, acc_reg); + reg_ (acc_ev_device_shutdown_start, cb_any_event, acc_reg); + reg_ (acc_ev_device_shutdown_end, cb_any_event, acc_reg); + reg_ (acc_ev_runtime_shutdown, cb_any_event, acc_reg); + reg_ (acc_ev_create, cb_any_event, acc_reg); + reg_ (acc_ev_delete, cb_any_event, acc_reg); + reg_ (acc_ev_alloc, cb_any_event, acc_reg); + reg_ (acc_ev_free, cb_any_event, acc_reg); + reg_ (acc_ev_enter_data_start, cb_any_event, acc_reg); + reg_ (acc_ev_enter_data_end, cb_any_event, acc_reg); + reg_ (acc_ev_exit_data_start, cb_any_event, acc_reg); + reg_ (acc_ev_exit_data_end, cb_any_event, acc_reg); + reg_ (acc_ev_update_start, cb_any_event, acc_reg); + reg_ (acc_ev_update_end, cb_any_event, acc_reg); + reg_ (acc_ev_compute_construct_start, cb_any_event, acc_reg); + reg_ (acc_ev_compute_construct_end, cb_any_event, acc_reg); + reg_ (acc_ev_enqueue_launch_start, cb_any_event, acc_reg); + reg_ (acc_ev_enqueue_launch_end, cb_any_event, acc_reg); + reg_ (acc_ev_enqueue_upload_start, cb_any_event, acc_reg); + reg_ (acc_ev_enqueue_upload_end, cb_any_event, acc_reg); + reg_ (acc_ev_enqueue_download_start, cb_any_event, acc_reg); + reg_ (acc_ev_enqueue_download_end, cb_any_event, acc_reg); + reg_ (acc_ev_wait_start, cb_any_event, acc_reg); + reg_ (acc_ev_wait_end, cb_any_event, acc_reg); +} + +int main() +{ +#pragma acc parallel + { + } + + return 0; +} Grüße Thomas