[PATCH] [amdgcn] Scale number of threads/workers with VGPR usage

Kwok Cheung Yeung Fri, 31 Jan 2020 05:57:38 -0800

The GCN architecture has 4 SIMD units per compute unit, with 256 VGPRs per SIMD unit. OpenMP threads or OpenACC workers must be distributed across the SIMD units, with each thread/worker fitting entirely within a single SIMD unit. VGPRs are shared by the kernels running in a SIMD unit, so we can have 4 workers that use up to 256 VGPRs, 8 workers that use up to 128 VGPRs, 16 workers that use up to 64 VGPRs and so on.

If more threads/workers are requested than can be supported, then the runtime fails with the message:


libgomp: GCN fatal error: Asynchronous queue error

Runtime message: HSA_STATUS_ERROR_INVALID_ISA: The instruction set architecture is invalid.

This patch adds code to mkoffload such that the number of VGPRs (and SGPRs for good measure) requested by a kernel is reported to libgomp at runtime. When launching a kernel, if libgomp detects that the number of threads/workers exceeds what can be supported by the hardware, it automatically scales down the number to the maximum supported value.

This behaviour can be overridden using environment variables to set an explicit number of threads/workers (GCN_NUM_THREADS/GCN_NUM_WORKERS), but there is not much point IMO as the kernel will just fail to run.

Tested on a GCN3 accelerator with 6 new passes and no regressions noted in libgomp. Okay for trunk?


Kwok

    gcc/
    * config/gcn/mkoffload.c (process_asm): Add sgpr_count and vgpr_count to
    definition of hsa_kernel_description.  Parse assembly to find SGPR and
    VGPR count of kernel and store in hsa_kernel_description.

    libgomp/
    * plugin/plugin-gcn.c (struct hsa_kernel_description): Add sgpr_count
    and vgpr_count fields.
    (struct kernel_info): Add a field for a hsa_kernel_description.
    (run_kernel): Reduce the number of threads/workers if the requested
    number would require too many VGPRs.
    (init_basic_kernel_info): Initialize description field with
    the hsa_kernel_description entry for the kernel.

diff --git a/gcc/config/gcn/mkoffload.c b/gcc/config/gcn/mkoffload.c
index 0062f15..723da10 100644
--- a/gcc/config/gcn/mkoffload.c
+++ b/gcc/config/gcn/mkoffload.c
@@ -211,12 +211,13 @@ access_check (const char *name, int mode)
 static void
 process_asm (FILE *in, FILE *out, FILE *cfile)
 {
-  int fn_count = 0, var_count = 0, dims_count = 0;
-  struct obstack fns_os, vars_os, varsizes_os, dims_os;
+  int fn_count = 0, var_count = 0, dims_count = 0, regcount_count = 0;
+  struct obstack fns_os, vars_os, varsizes_os, dims_os, regcounts_os;
   obstack_init (&fns_os);
   obstack_init (&vars_os);
   obstack_init (&varsizes_os);
   obstack_init (&dims_os);
+  obstack_init (&regcounts_os);
 
   struct oaccdims
   {
@@ -224,13 +225,20 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
     char *name;
   } dim;
 
+  struct regcount
+  {
+    int sgpr_count;
+    int vgpr_count;
+    char *kernel_name;
+  } regcount;
+
   /* Always add _init_array and _fini_array as kernels.  */
   obstack_ptr_grow (&fns_os, xstrdup ("_init_array"));
   obstack_ptr_grow (&fns_os, xstrdup ("_fini_array"));
   fn_count += 2;
 
   char buf[1000];
-  enum { IN_CODE, IN_VARS, IN_FUNCS } state = IN_CODE;
+  enum { IN_CODE, IN_AMD_KERNEL_CODE_T, IN_VARS, IN_FUNCS } state = IN_CODE;
   while (fgets (buf, sizeof (buf), in))
     {
       switch (state)
@@ -243,6 +251,22 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
                obstack_grow (&dims_os, &dim, sizeof (dim));
                dims_count++;
              }
+           else if (sscanf (buf, " .amdgpu_hsa_kernel %ms\n",
+                            &regcount.kernel_name) == 1)
+             break;
+
+           break;
+         }
+       case IN_AMD_KERNEL_CODE_T:
+         {
+           gcc_assert (regcount.kernel_name);
+           if (sscanf (buf, " wavefront_sgpr_count = %d\n",
+                       &regcount.sgpr_count) == 1)
+             break;
+           else if (sscanf (buf, " workitem_vgpr_count = %d\n",
+                            &regcount.vgpr_count) == 1)
+             break;
+
            break;
          }
        case IN_VARS:
@@ -282,19 +306,36 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
        state = IN_VARS;
       else if (sscanf (buf, " .section .gnu.offload_funcs%c", &dummy) > 0)
        state = IN_FUNCS;
+      else if (sscanf (buf, " .amd_kernel_code_%c", &dummy) > 0)
+       {
+         state = IN_AMD_KERNEL_CODE_T;
+         regcount.sgpr_count = regcount.vgpr_count = -1;
+       }
       else if (sscanf (buf, " .section %c", &dummy) > 0
               || sscanf (buf, " .text%c", &dummy) > 0
               || sscanf (buf, " .bss%c", &dummy) > 0
               || sscanf (buf, " .data%c", &dummy) > 0
               || sscanf (buf, " .ident %c", &dummy) > 0)
        state = IN_CODE;
+      else if (sscanf (buf, " .end_amd_kernel_code_%c", &dummy) > 0)
+       {
+         state = IN_CODE;
+         gcc_assert (regcount.kernel_name != NULL
+                     && regcount.sgpr_count >= 0
+                     && regcount.vgpr_count >= 0);
+         obstack_grow (&regcounts_os, &regcount, sizeof (regcount));
+         regcount_count++;
+         regcount.kernel_name = NULL;
+         regcount.sgpr_count = regcount.vgpr_count = -1;
+       }
 
-      if (state == IN_CODE)
+      if (state == IN_CODE || state == IN_AMD_KERNEL_CODE_T)
        fputs (buf, out);
     }
 
   char **fns = XOBFINISH (&fns_os, char **);
   struct oaccdims *dims = XOBFINISH (&dims_os, struct oaccdims *);
+  struct regcount *regcounts = XOBFINISH (&regcounts_os, struct regcount *);
 
   fprintf (cfile, "#include <stdlib.h>\n");
   fprintf (cfile, "#include <stdbool.h>\n\n");
@@ -322,6 +363,8 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
   fprintf (cfile, "static const struct hsa_kernel_description {\n"
           "  const char *name;\n"
           "  int oacc_dims[3];\n"
+          "  int sgpr_count;\n"
+          "  int vgpr_count;\n"
           "} gcn_kernels[] = {\n  ");
   dim.d[0] = dim.d[1] = dim.d[2] = 0;
   const char *comma;
@@ -329,15 +372,24 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
     {
       /* Find if we recorded dimensions for this function.  */
       int *d = dim.d;          /* Previously zeroed.  */
+      int sgpr_count = 0;
+      int vgpr_count = 0;
       for (int j = 0; j < dims_count; j++)
        if (strcmp (fns[i], dims[j].name) == 0)
          {
            d = dims[j].d;
            break;
          }
+      for (int j = 0; j < regcount_count; j++)
+       if (strcmp (fns[i], regcounts[j].kernel_name) == 0)
+         {
+           sgpr_count = regcounts[j].sgpr_count;
+           vgpr_count = regcounts[j].vgpr_count;
+           break;
+         }
 
-      fprintf (cfile, "%s{\"%s\", {%d, %d, %d}}", comma,
-              fns[i], d[0], d[1], d[2]);
+      fprintf (cfile, "%s{\"%s\", {%d, %d, %d}, %d, %d}", comma,
+              fns[i], d[0], d[1], d[2], sgpr_count, vgpr_count);
 
       free (fns[i]);
     }
@@ -346,7 +398,10 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
   obstack_free (&fns_os, NULL);
   for (i = 0; i < dims_count; i++)
     free (dims[i].name);
+  for (i = 0; i < regcount_count; i++)
+    free (regcounts[i].kernel_name);
   obstack_free (&dims_os, NULL);
+  obstack_free (&regcounts_os, NULL);
 }
 
 /* Embed an object file into a C source file.  */
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 22676b4..25547ef 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -371,6 +371,8 @@ struct hsa_kernel_description
 {
   const char *name;
   int oacc_dims[3];  /* Only present for GCN kernels.  */
+  int sgpr_count;
+  int vpgr_count;
 };
 
 /* Mkoffload uses this structure to describe an offload variable.  */
@@ -478,6 +480,8 @@ struct kernel_info
   struct agent_info *agent;
   /* The specific module where the kernel takes place.  */
   struct module_info *module;
+  /* Information provided by mkoffload associated with the kernel.  */
+  struct hsa_kernel_description *description;
   /* Mutex enforcing that at most once thread ever initializes a kernel for
      use.  A thread should have locked agent->module_rwlock for reading before
      acquiring it.  */
@@ -2102,6 +2106,24 @@ run_kernel (struct kernel_info *kernel, void *vars,
            struct GOMP_kernel_launch_attributes *kla,
            struct goacc_asyncqueue *aq, bool module_locked)
 {
+  GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
+            kernel->description->vpgr_count);
+
+  /* Reduce the number of threads/workers if there are insufficient
+     VGPRs available to run the kernels together.  */
+  if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
+    {
+      int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
+      int max_threads = (256 / granulated_vgprs) * 4;
+      if (kla->gdims[2] > max_threads)
+       {
+         GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
+                      " per team/gang - reducing to %d threads/workers.\n",
+                      kla->gdims[2], max_threads);
+         kla->gdims[2] = max_threads;
+       }
+    }
+
   GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
             (aq ? aq->id : 0));
   GCN_DEBUG ("GCN launch attribs: gdims:[");
@@ -2303,6 +2325,7 @@ init_basic_kernel_info (struct kernel_info *kernel,
   kernel->agent = agent;
   kernel->module = module;
   kernel->name = d->name;
+  kernel->description = d;
   if (pthread_mutex_init (&kernel->init_mutex, NULL))
     {
       GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");

[PATCH] [amdgcn] Scale number of threads/workers with VGPR usage

Reply via email to