This patch implements a malloc optimization to reduce the startup and shutdown overhead for each OpenMP team.
New malloc functions are created, "team_malloc" and "team_free", that take memory from a per-team memory arena provided by the plugin, rather than the shared heap space, which is slow, and gets worse the more teams are trying to allocate at once. These new functions are used both in the gcn/team.c file and in selected places elsewhere in libgomp. Arena-space is limited (and larger sizes have greater overhead at launch time) so this should not be a global search and replace. Dummy pass-through definitions are provided for other targets. OK to commit? Thanks Andrew 2019-11-12 Andrew Stubbs <a...@codesourcery.com> libgomp/ * config/gcn/team.c (gomp_gcn_enter_kernel): Set up the team arena and use team_malloc variants. (gomp_gcn_exit_kernel): Use team_free. * libgomp.h (TEAM_ARENA_SIZE): Define. (TEAM_ARENA_FREE): Define. (TEAM_ARENA_END): Define. (team_malloc): New function. (team_malloc_cleared): New function. (team_free): New function. * team.c (gomp_new_team): Use team_malloc. (free_team): Use team_free. (gomp_free_thread): Use team_free. (gomp_pause_host): Use team_free. * work.c (gomp_init_work_share): Use team_malloc. (gomp_fini_work_share): Use team_free. --- libgomp/config/gcn/team.c | 18 ++++++++++--- libgomp/libgomp.h | 56 +++++++++++++++++++++++++++++++++++++++ libgomp/team.c | 12 ++++----- libgomp/work.c | 4 +-- 4 files changed, 78 insertions(+), 12 deletions(-)
diff --git a/libgomp/config/gcn/team.c b/libgomp/config/gcn/team.c index c566482bda2..063571fc751 100644 --- a/libgomp/config/gcn/team.c +++ b/libgomp/config/gcn/team.c @@ -57,16 +57,26 @@ gomp_gcn_enter_kernel (void) /* Starting additional threads is not supported. */ gomp_global_icv.dyn_var = true; + /* Initialize the team arena for optimized memory allocation. + The arena has been allocated on the host side, and the address + passed in via the kernargs. Each team takes a small slice of it. */ + register void **kernargs asm("s8"); + void *team_arena = (kernargs[4] + TEAM_ARENA_SIZE*teamid); + void * __lds *arena_free = (void * __lds *)TEAM_ARENA_FREE; + void * __lds *arena_end = (void * __lds *)TEAM_ARENA_END; + *arena_free = team_arena; + *arena_end = team_arena + TEAM_ARENA_SIZE; + /* Allocate and initialize the team-local-storage data. */ - struct gomp_thread *thrs = gomp_malloc_cleared (sizeof (*thrs) + struct gomp_thread *thrs = team_malloc_cleared (sizeof (*thrs) * numthreads); set_gcn_thrs (thrs); /* Allocate and initailize a pool of threads in the team. The threads are already running, of course, we just need to manage the communication between them. */ - struct gomp_thread_pool *pool = gomp_malloc (sizeof (*pool)); - pool->threads = gomp_malloc (sizeof (void *) * numthreads); + struct gomp_thread_pool *pool = team_malloc (sizeof (*pool)); + pool->threads = team_malloc (sizeof (void *) * numthreads); for (int tid = 0; tid < numthreads; tid++) pool->threads[tid] = &thrs[tid]; pool->threads_size = numthreads; @@ -91,7 +101,7 @@ void gomp_gcn_exit_kernel (void) { gomp_free_thread (gcn_thrs ()); - free (gcn_thrs ()); + team_free (gcn_thrs ()); } /* This function contains the idle loop in which a thread waits diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h index 19e1241ee4c..659aeb95ffe 100644 --- a/libgomp/libgomp.h +++ b/libgomp/libgomp.h @@ -106,6 +106,62 @@ extern void gomp_aligned_free (void *); GCC's builtin alloca(). 
 */ #define gomp_alloca(x) __builtin_alloca(x) +/* Optimized allocators for team-specific data that will die with the team. */ + +#ifdef __AMDGCN__ +/* The arena is initialized in config/gcn/team.c. */ +#define TEAM_ARENA_SIZE 64*1024 /* Must match the value in plugin-gcn.c. */ +#define TEAM_ARENA_FREE 16 /* LDS offset of free pointer. */ +#define TEAM_ARENA_END 24 /* LDS offset of end pointer. */ + +static inline void * __attribute__((malloc)) +team_malloc (size_t size) +{ + /* 4-byte align the size. */ + size = (size + 3) & ~3; + + /* Allocate directly from the arena. + The compiler does not support DS atomics, yet. */ + void *result; + asm ("ds_add_rtn_u64 %0, %1, %2\n\ts_waitcnt 0" + : "=v"(result) : "v"(TEAM_ARENA_FREE), "v"(size), "e"(1L) : "memory"); + + /* Handle OOM. */ + if (result + size > *(void * __lds *)TEAM_ARENA_END) + { + const char msg[] = "GCN team arena exhausted\n"; + write (2, msg, sizeof(msg)-1); + /* It's better to continue with reduced performance than to abort. + Beware that this won't get freed, which might cause more problems. */ + result = gomp_malloc (size); + } + return result; +} + +static inline void * __attribute__((malloc)) __attribute__((optimize("-O3"))) +team_malloc_cleared (size_t size) +{ + char *result = team_malloc (size); + + /* Clear the allocated memory. + This should vectorize. The allocation has been rounded up to the next + 4-byte boundary, so this is safe. */ + for (int i = 0; i<size; i+=4) + *(int*)(result+i) = 0; + return result; +} + +static inline void +team_free (void *ptr) +{ + /* The whole arena is freed when the kernel exits. */ +} +#else +#define team_malloc(...) gomp_malloc (__VA_ARGS__) +#define team_malloc_cleared(...) gomp_malloc_cleared (__VA_ARGS__) +#define team_free(...) 
free (__VA_ARGS__) +#endif + /* error.c */ extern void gomp_vdebug (int, const char *, va_list); diff --git a/libgomp/team.c b/libgomp/team.c index b26caaaaec6..cdfb9ba6c98 100644 --- a/libgomp/team.c +++ b/libgomp/team.c @@ -171,7 +171,7 @@ gomp_new_team (unsigned nthreads) { size_t extra = sizeof (team->ordered_release[0]) + sizeof (team->implicit_task[0]); - team = gomp_malloc (sizeof (*team) + nthreads * extra); + team = team_malloc (sizeof (*team) + nthreads * extra); #ifndef HAVE_SYNC_BUILTINS gomp_mutex_init (&team->work_share_list_free_lock); @@ -221,7 +221,7 @@ free_team (struct gomp_team *team) gomp_barrier_destroy (&team->barrier); gomp_mutex_destroy (&team->task_lock); priority_queue_free (&team->task_queue); - free (team); + team_free (team); } static void @@ -285,8 +285,8 @@ gomp_free_thread (void *arg __attribute__((unused))) if (pool->last_team) free_team (pool->last_team); #ifndef __nvptx__ - free (pool->threads); - free (pool); + team_free (pool->threads); + team_free (pool); #endif thr->thread_pool = NULL; } @@ -1082,8 +1082,8 @@ gomp_pause_host (void) if (pool->last_team) free_team (pool->last_team); #ifndef __nvptx__ - free (pool->threads); - free (pool); + team_free (pool->threads); + team_free (pool); #endif thr->thread_pool = NULL; } diff --git a/libgomp/work.c b/libgomp/work.c index a589b8b5231..28bb0c11255 100644 --- a/libgomp/work.c +++ b/libgomp/work.c @@ -120,7 +120,7 @@ gomp_init_work_share (struct gomp_work_share *ws, size_t ordered, else ordered = nthreads * sizeof (*ws->ordered_team_ids); if (ordered > INLINE_ORDERED_TEAM_IDS_SIZE) - ws->ordered_team_ids = gomp_malloc (ordered); + ws->ordered_team_ids = team_malloc (ordered); else ws->ordered_team_ids = ws->inline_ordered_team_ids; memset (ws->ordered_team_ids, '\0', ordered); @@ -142,7 +142,7 @@ gomp_fini_work_share (struct gomp_work_share *ws) { gomp_mutex_destroy (&ws->lock); if (ws->ordered_team_ids != ws->inline_ordered_team_ids) - free (ws->ordered_team_ids); + team_free 
(ws->ordered_team_ids); gomp_ptrlock_destroy (&ws->next_ws); }