On 12/11/2019 13:56, Jakub Jelinek wrote:
s/reeduced/reduced/

Done.

Not really sure if it is a good idea to print anything, at least not when
in some debugging mode.  I mean, it is fairly easy to write code that will
trigger this.  And, what is the reason why you can't free the
gomp_malloced memory, like comparing if the team_freed pointer is in between
TEAM_ARENA_START and TEAM_ARENA_END or similar, don't do anything in that
case, otherwise use free?

Falling back to malloc is a big performance hit. There's a global lock affecting all teams in all running kernels. If we're running into this then a) I want to know about it so I can tune the arena size, and b) I want the user to know why performance is suddenly worse.

At least for now, I want to keep the message. I've updated the comment though.

+  /* Clear the allocated memory.
+     This should vectorize.  The allocation has been rounded up to the next
+     4-byte boundary, so this is safe.  */
+  for (int i = 0; i<size; i+=4)
+    *(int*)(result+i) = 0;

Formatting (spaces around <, +=, +, between int and *.  Shouldn't 4 be
sizeof (int)?  And wouldn't memset (result, 0, size); do the same job?

I wanted to ensure that the loop would vectorize inline, but I don't think it was doing so anyway. I need to look into that; in the meantime, how does this version look?

Andrew
Optimize GCN OpenMP malloc performance

2019-11-12  Andrew Stubbs  <a...@codesourcery.com>

	libgomp/
	* config/gcn/team.c (gomp_gcn_enter_kernel): Set up the team arena
	and use team_malloc variants.
	(gomp_gcn_exit_kernel): Use team_free.
	* libgomp.h (TEAM_ARENA_SIZE): Define.
	(TEAM_ARENA_START): Define.
	(TEAM_ARENA_FREE): Define.
	(TEAM_ARENA_END): Define.
	(team_malloc): New function.
	(team_malloc_cleared): New function.
	(team_free): New function.
	* team.c (gomp_new_team): Initialize and use team_malloc.
	(free_team): Use team_free.
	(gomp_free_thread): Use team_free.
	(gomp_pause_host): Use team_free.
	* work.c (gomp_init_work_share): Use team_malloc.
	(gomp_fini_work_share): Use team_free.

diff --git a/libgomp/config/gcn/team.c b/libgomp/config/gcn/team.c
index c566482bda2..20d419198e0 100644
--- a/libgomp/config/gcn/team.c
+++ b/libgomp/config/gcn/team.c
@@ -57,16 +57,28 @@ gomp_gcn_enter_kernel (void)
       /* Starting additional threads is not supported.  */
       gomp_global_icv.dyn_var = true;
 
+      /* Initialize the team arena for optimized memory allocation.
+         The arena has been allocated on the host side, and the address
+         passed in via the kernargs.  Each team takes a small slice of it.  */
+      register void **kernargs asm("s8");
+      void *team_arena = (kernargs[4] + TEAM_ARENA_SIZE*teamid);
+      void * __lds *arena_start = (void * __lds *)TEAM_ARENA_START;
+      void * __lds *arena_free = (void * __lds *)TEAM_ARENA_FREE;
+      void * __lds *arena_end = (void * __lds *)TEAM_ARENA_END;
+      *arena_start = team_arena;
+      *arena_free = team_arena;
+      *arena_end = team_arena + TEAM_ARENA_SIZE;
+
       /* Allocate and initialize the team-local-storage data.  */
-      struct gomp_thread *thrs = gomp_malloc_cleared (sizeof (*thrs)
+      struct gomp_thread *thrs = team_malloc_cleared (sizeof (*thrs)
 						      * numthreads);
       set_gcn_thrs (thrs);
 
       /* Allocate and initailize a pool of threads in the team.
          The threads are already running, of course, we just need to manage
          the communication between them.  */
-      struct gomp_thread_pool *pool = gomp_malloc (sizeof (*pool));
-      pool->threads = gomp_malloc (sizeof (void *) * numthreads);
+      struct gomp_thread_pool *pool = team_malloc (sizeof (*pool));
+      pool->threads = team_malloc (sizeof (void *) * numthreads);
       for (int tid = 0; tid < numthreads; tid++)
 	pool->threads[tid] = &thrs[tid];
       pool->threads_size = numthreads;
@@ -91,7 +103,7 @@ void
 gomp_gcn_exit_kernel (void)
 {
   gomp_free_thread (gcn_thrs ());
-  free (gcn_thrs ());
+  team_free (gcn_thrs ());
 }
 
 /* This function contains the idle loop in which a thread waits
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index 19e1241ee4c..bab733d2b2d 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -106,6 +106,69 @@ extern void gomp_aligned_free (void *);
    GCC's builtin alloca().  */
 #define gomp_alloca(x)  __builtin_alloca(x)
 
+/* Optimized allocators for team-specific data that will die with the team.  */
+
+#ifdef __AMDGCN__
+/* The arena is initialized in config/gcn/team.c.  */
+#define TEAM_ARENA_SIZE  64*1024  /* Must match the value in plugin-gcn.c.  */
+#define TEAM_ARENA_START 16  /* LDS offset of start pointer.  */
+#define TEAM_ARENA_FREE  24  /* LDS offset of free pointer.  */
+#define TEAM_ARENA_END   32  /* LDS offset of end pointer.  */
+
+static inline void * __attribute__((malloc))
+team_malloc (size_t size)
+{
+  /* 4-byte align the size.  */
+  size = (size + 3) & ~3;
+
+  /* Allocate directly from the arena.
+     The compiler does not support DS atomics, yet.  */
+  void *result;
+  asm ("ds_add_rtn_u64 %0, %1, %2\n\ts_waitcnt 0"
+       : "=v"(result) : "v"(TEAM_ARENA_FREE), "v"(size), "e"(1L) : "memory");
+
+  /* Handle OOM.  */
+  if (result + size > *(void * __lds *)TEAM_ARENA_END)
+    {
+      /* While this is experimental, let's make sure we know when OOM
+	 happens.  */
+      const char msg[] = "GCN team arena exhausted\n";
+      write (2, msg, sizeof(msg)-1);
+
+      /* Fall back to using the heap (slowly).  */
+      result = gomp_malloc (size);
+    }
+  return result;
+}
+
+static inline void * __attribute__((malloc))
+team_malloc_cleared (size_t size)
+{
+  char *result = team_malloc (size);
+
+  /* Clear the allocated memory.  */
+  __builtin_memset (result, 0, size);
+
+  return result;
+}
+
+static inline void
+team_free (void *ptr)
+{
+  /* The whole arena is freed when the kernel exits.
+     However, if we fell back to using the heap then we should free it.
+     It would be better if this function could be a no-op, but at least
+     LDS loads are cheap.  */
+  if (ptr < *(void * __lds *)TEAM_ARENA_START
+      || ptr >= *(void * __lds *)TEAM_ARENA_END)
+    free (ptr);
+}
+#else
+#define team_malloc(...) gomp_malloc (__VA_ARGS__)
+#define team_malloc_cleared(...) gomp_malloc_cleared (__VA_ARGS__)
+#define team_free(...) free (__VA_ARGS__)
+#endif
+
 /* error.c */
 
 extern void gomp_vdebug (int, const char *, va_list);
diff --git a/libgomp/team.c b/libgomp/team.c
index b26caaaaec6..cdfb9ba6c98 100644
--- a/libgomp/team.c
+++ b/libgomp/team.c
@@ -171,7 +171,7 @@ gomp_new_team (unsigned nthreads)
     {
       size_t extra = sizeof (team->ordered_release[0])
 		     + sizeof (team->implicit_task[0]);
-      team = gomp_malloc (sizeof (*team) + nthreads * extra);
+      team = team_malloc (sizeof (*team) + nthreads * extra);
 
 #ifndef HAVE_SYNC_BUILTINS
       gomp_mutex_init (&team->work_share_list_free_lock);
@@ -221,7 +221,7 @@ free_team (struct gomp_team *team)
   gomp_barrier_destroy (&team->barrier);
   gomp_mutex_destroy (&team->task_lock);
   priority_queue_free (&team->task_queue);
-  free (team);
+  team_free (team);
 }
 
 static void
@@ -285,8 +285,8 @@ gomp_free_thread (void *arg __attribute__((unused)))
       if (pool->last_team)
 	free_team (pool->last_team);
 #ifndef __nvptx__
-      free (pool->threads);
-      free (pool);
+      team_free (pool->threads);
+      team_free (pool);
 #endif
       thr->thread_pool = NULL;
     }
@@ -1082,8 +1082,8 @@ gomp_pause_host (void)
       if (pool->last_team)
 	free_team (pool->last_team);
 #ifndef __nvptx__
-      free (pool->threads);
-      free (pool);
+      team_free (pool->threads);
+      team_free (pool);
 #endif
       thr->thread_pool = NULL;
     }
diff --git a/libgomp/work.c b/libgomp/work.c
index a589b8b5231..28bb0c11255 100644
--- a/libgomp/work.c
+++ b/libgomp/work.c
@@ -120,7 +120,7 @@ gomp_init_work_share (struct gomp_work_share *ws, size_t ordered,
       else
 	ordered = nthreads * sizeof (*ws->ordered_team_ids);
       if (ordered > INLINE_ORDERED_TEAM_IDS_SIZE)
-	ws->ordered_team_ids = gomp_malloc (ordered);
+	ws->ordered_team_ids = team_malloc (ordered);
       else
 	ws->ordered_team_ids = ws->inline_ordered_team_ids;
       memset (ws->ordered_team_ids, '\0', ordered);
@@ -142,7 +142,7 @@ gomp_fini_work_share (struct gomp_work_share *ws)
 {
   gomp_mutex_destroy (&ws->lock);
   if (ws->ordered_team_ids != ws->inline_ordered_team_ids)
-    free (ws->ordered_team_ids);
+    team_free (ws->ordered_team_ids);
   gomp_ptrlock_destroy (&ws->next_ws);
 }
 

Reply via email to