Hi!

On 2021-11-12T18:58:04+0100, Jakub Jelinek via Gcc-patches 
<gcc-patches@gcc.gnu.org> wrote:
> And finally here is a third version, [...]

... which became commit 9fa72756d90e0d9edadf6e6f5f56476029925788
"libgomp, nvptx: Honor OpenMP 5.1 num_teams lower bound".

Attached here is "GCN: Honor OpenMP 5.1 'num_teams' lower bound", which
are exactly the corresponding changes for GCN (see below Jakub's nvptx
changes for reference); OK to push?


Grüße
 Thomas


> 2021-11-12  Jakub Jelinek  <ja...@redhat.com>
>
>       * config/nvptx/team.c (__gomp_team_num): Define as
>       __attribute__((shared)) var.
>       (gomp_nvptx_main): Initialize __gomp_team_num to 0.
>       * config/nvptx/target.c (__gomp_team_num): Declare as
>       extern __attribute__((shared)) var.
>       (GOMP_teams4): Use __gomp_team_num as the team number instead of
>       %ctaid.x.  If first, initialize it to %ctaid.x.  If num_teams_lower
>       is bigger than num_blocks, use num_teams_lower teams and arrange for
>       bumping of __gomp_team_num if !first and returning false once we run
>       out of teams.
>       * config/nvptx/teams.c (__gomp_team_num): Declare as
>       extern __attribute__((shared)) var.
>       (omp_get_team_num): Return __gomp_team_num value instead of %ctaid.x.
>
> --- libgomp/config/nvptx/team.c.jj    2021-05-25 13:43:02.793121350 +0200
> +++ libgomp/config/nvptx/team.c       2021-11-12 17:49:02.847341650 +0100
> @@ -32,6 +32,7 @@
>  #include <string.h>
>  
>  struct gomp_thread *nvptx_thrs __attribute__((shared,nocommon));
> +int __gomp_team_num __attribute__((shared));
>  
>  static void gomp_thread_start (struct gomp_thread_pool *);
>  
> @@ -57,6 +58,7 @@ gomp_nvptx_main (void (*fn) (void *), vo
>        /* Starting additional threads is not supported.  */
>        gomp_global_icv.dyn_var = true;
>  
> +      __gomp_team_num = 0;
>        nvptx_thrs = alloca (ntids * sizeof (*nvptx_thrs));
>        memset (nvptx_thrs, 0, ntids * sizeof (*nvptx_thrs));
>  
> --- libgomp/config/nvptx/target.c.jj  2021-11-12 15:57:29.400632875 +0100
> +++ libgomp/config/nvptx/target.c     2021-11-12 17:47:39.499533296 +0100
> @@ -26,28 +26,41 @@
>  #include "libgomp.h"
>  #include <limits.h>
>  
> +extern int __gomp_team_num __attribute__((shared));
> +
>  bool
>  GOMP_teams4 (unsigned int num_teams_lower, unsigned int num_teams_upper,
>            unsigned int thread_limit, bool first)
>  {
> +  unsigned int num_blocks, block_id;
> +  asm ("mov.u32 %0, %%nctaid.x;" : "=r" (num_blocks));
>    if (!first)
> -    return false;
> +    {
> +      unsigned int team_num;
> +      if (num_blocks > gomp_num_teams_var)
> +     return false;
> +      team_num = __gomp_team_num;
> +      if (team_num > gomp_num_teams_var - num_blocks)
> +     return false;
> +      __gomp_team_num = team_num + num_blocks;
> +      return true;
> +    }
>    if (thread_limit)
>      {
>        struct gomp_task_icv *icv = gomp_icv (true);
>        icv->thread_limit_var
>       = thread_limit > INT_MAX ? UINT_MAX : thread_limit;
>      }
> -  unsigned int num_blocks, block_id;
> -  asm ("mov.u32 %0, %%nctaid.x;" : "=r" (num_blocks));
> -  asm ("mov.u32 %0, %%ctaid.x;" : "=r" (block_id));
> -  /* FIXME: If num_teams_lower > num_blocks, we want to loop multiple
> -     times for some CTAs.  */
> -  (void) num_teams_lower;
> -  if (!num_teams_upper || num_teams_upper >= num_blocks)
> +  if (!num_teams_upper)
>      num_teams_upper = num_blocks;
> -  else if (block_id >= num_teams_upper)
> +  else if (num_blocks < num_teams_lower)
> +    num_teams_upper = num_teams_lower;
> +  else if (num_blocks < num_teams_upper)
> +    num_teams_upper = num_blocks;
> +  asm ("mov.u32 %0, %%ctaid.x;" : "=r" (block_id));
> +  if (block_id >= num_teams_upper)
>      return false;
> +  __gomp_team_num = block_id;
>    gomp_num_teams_var = num_teams_upper - 1;
>    return true;
>  }
> --- libgomp/config/nvptx/teams.c.jj   2021-05-25 13:43:02.793121350 +0200
> +++ libgomp/config/nvptx/teams.c      2021-11-12 17:37:18.933361024 +0100
> @@ -28,6 +28,8 @@
>  
>  #include "libgomp.h"
>  
> +extern int __gomp_team_num __attribute__((shared));
> +
>  void
>  GOMP_teams_reg (void (*fn) (void *), void *data, unsigned int num_teams,
>               unsigned int thread_limit, unsigned int flags)
> @@ -48,9 +50,7 @@ omp_get_num_teams (void)
>  int
>  omp_get_team_num (void)
>  {
> -  int ctaid;
> -  asm ("mov.u32 %0, %%ctaid.x;" : "=r" (ctaid));
> -  return ctaid;
> +  return __gomp_team_num;
>  }
>  
>  ialias (omp_get_num_teams)
>
>
>       Jakub


>From f078b635f033dcb80ce8cd48de3bf62ad5e285bf Mon Sep 17 00:00:00 2001
From: Thomas Schwinge <tschwi...@baylibre.com>
Date: Mon, 15 Jul 2024 11:19:28 +0200
Subject: [PATCH] GCN: Honor OpenMP 5.1 'num_teams' lower bound

Corresponding to commit 9fa72756d90e0d9edadf6e6f5f56476029925788
"libgomp, nvptx: Honor OpenMP 5.1 num_teams lower bound", these are the
GCN offloading changes to fix:

    PASS: libgomp.c/../libgomp.c-c++-common/teams-2.c (test for excess errors)
    [-FAIL:-]{+PASS:+} libgomp.c/../libgomp.c-c++-common/teams-2.c execution test

    PASS: libgomp.c++/../libgomp.c-c++-common/teams-2.c (test for excess errors)
    [-FAIL:-]{+PASS:+} libgomp.c++/../libgomp.c-c++-common/teams-2.c execution test

..., and omptests' 't-critical' test case.  I've cross checked that those test
cases are the ones that regress for nvptx offloading, if I locally revert the
"libgomp, nvptx: Honor OpenMP 5.1 num_teams lower bound" changes.

	libgomp/
	* config/gcn/libgomp-gcn.h (GOMP_TEAM_NUM): Inject.
	* config/gcn/target.c (GOMP_teams4): Handle.
	* config/gcn/team.c (gomp_gcn_enter_kernel): Initialize.
	* config/gcn/teams.c (omp_get_team_num): Adjust.
---
 libgomp/config/gcn/libgomp-gcn.h |  9 +++++----
 libgomp/config/gcn/target.c      | 29 ++++++++++++++++++++---------
 libgomp/config/gcn/team.c        |  3 +++
 libgomp/config/gcn/teams.c       |  5 +++--
 4 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/libgomp/config/gcn/libgomp-gcn.h b/libgomp/config/gcn/libgomp-gcn.h
index e94f0c7ae68..48a3741b04d 100644
--- a/libgomp/config/gcn/libgomp-gcn.h
+++ b/libgomp/config/gcn/libgomp-gcn.h
@@ -34,10 +34,11 @@
 #define DEFAULT_TEAM_ARENA_SIZE (64*1024)
 
 /* These define the LDS location of data needed by OpenMP.  */
-#define TEAM_ARENA_START 16  /* LDS offset of free pointer.  */
-#define TEAM_ARENA_FREE  24  /* LDS offset of free pointer.  */
-#define TEAM_ARENA_END   32  /* LDS offset of end pointer.  */
-#define GCN_LOWLAT_HEAP  40  /* LDS offset of the OpenMP low-latency heap.  */
+#define GOMP_TEAM_NUM    16
+#define TEAM_ARENA_START 24  /* LDS offset of free pointer.  */
+#define TEAM_ARENA_FREE  32  /* LDS offset of free pointer.  */
+#define TEAM_ARENA_END   40  /* LDS offset of end pointer.  */
+#define GCN_LOWLAT_HEAP  48  /* LDS offset of the OpenMP low-latency heap.  */
 
 struct heap
 {
diff --git a/libgomp/config/gcn/target.c b/libgomp/config/gcn/target.c
index 1d4a23cb8d2..e57d2e5f93f 100644
--- a/libgomp/config/gcn/target.c
+++ b/libgomp/config/gcn/target.c
@@ -33,26 +33,37 @@ bool
 GOMP_teams4 (unsigned int num_teams_lower, unsigned int num_teams_upper,
 	     unsigned int thread_limit, bool first)
 {
+  int __lds *gomp_team_num = (int __lds *) GOMP_TEAM_NUM;
+  unsigned int num_workgroups = __builtin_gcn_dim_size (0);
   if (!first)
-    return false;
+    {
+      unsigned int team_num;
+      if (num_workgroups > gomp_num_teams_var)
+	return false;
+      team_num = *gomp_team_num;
+      if (team_num > gomp_num_teams_var - num_workgroups)
+	return false;
+      *gomp_team_num = team_num + num_workgroups;
+      return true;
+    }
   if (thread_limit)
     {
       struct gomp_task_icv *icv = gomp_icv (true);
       icv->thread_limit_var
 	= thread_limit > INT_MAX ? UINT_MAX : thread_limit;
     }
-  unsigned int num_workgroups, workgroup_id;
-  num_workgroups = __builtin_gcn_dim_size (0);
-  workgroup_id = __builtin_gcn_dim_pos (0);
-  /* FIXME: If num_teams_lower > num_workgroups, we want to loop
-     multiple times at least for some workgroups.  */
-  (void) num_teams_lower;
-  if (!num_teams_upper || num_teams_upper >= num_workgroups)
+  if (!num_teams_upper)
     num_teams_upper = ((GOMP_ADDITIONAL_ICVS.nteams > 0
 			&& num_workgroups > GOMP_ADDITIONAL_ICVS.nteams)
 		       ? GOMP_ADDITIONAL_ICVS.nteams : num_workgroups);
-  else if (workgroup_id >= num_teams_upper)
+  else if (num_workgroups < num_teams_lower)
+    num_teams_upper = num_teams_lower;
+  else if (num_workgroups < num_teams_upper)
+    num_teams_upper = num_workgroups;
+  unsigned int workgroup_id = __builtin_gcn_dim_pos (0);
+  if (workgroup_id >= num_teams_upper)
     return false;
+  *gomp_team_num = workgroup_id;
   gomp_num_teams_var = num_teams_upper - 1;
   return true;
 }
diff --git a/libgomp/config/gcn/team.c b/libgomp/config/gcn/team.c
index bd3df448b52..aa68b3abe0b 100644
--- a/libgomp/config/gcn/team.c
+++ b/libgomp/config/gcn/team.c
@@ -68,6 +68,9 @@ gomp_gcn_enter_kernel (void)
       /* Starting additional threads is not supported.  */
       gomp_global_icv.dyn_var = true;
 
+      int __lds *gomp_team_num = (int __lds *) GOMP_TEAM_NUM;
+      *gomp_team_num = 0;
+
       /* Initialize the team arena for optimized memory allocation.
          The arena has been allocated on the host side, and the address
          passed in via the kernargs.  Each team takes a small slice of it.  */
diff --git a/libgomp/config/gcn/teams.c b/libgomp/config/gcn/teams.c
index 8a91ba8f5c1..57404184c89 100644
--- a/libgomp/config/gcn/teams.c
+++ b/libgomp/config/gcn/teams.c
@@ -44,10 +44,11 @@ omp_get_num_teams (void)
   return gomp_num_teams_var + 1;
 }
 
-int __attribute__ ((__optimize__ ("O2")))
+int
 omp_get_team_num (void)
 {
-  return __builtin_gcn_dim_pos (0);
+  int __lds *gomp_team_num = (int __lds *) GOMP_TEAM_NUM;
+  return *gomp_team_num;
 }
 
 ialias (omp_get_num_teams)
-- 
2.34.1

Reply via email to