The GCN architecture has 4 SIMD units per compute unit, with 256 VGPRs per SIMD
unit. OpenMP threads or OpenACC workers must be distributed across the SIMD
units, with each thread/worker fitting entirely within a single SIMD unit. VGPRs
are shared by the kernels running in a SIMD unit, so we can have 4 workers that
each use up to 256 VGPRs, 8 workers that each use up to 128 VGPRs, 16 workers
that each use up to 64 VGPRs, and so on.
If more threads/workers are requested than can be supported, then the runtime
fails with the message:
libgomp: GCN fatal error: Asynchronous queue error
Runtime message: HSA_STATUS_ERROR_INVALID_ISA: The instruction set architecture
is invalid.
This patch adds code to mkoffload such that the number of VGPRs (and SGPRs for
good measure) requested by a kernel is reported to libgomp at runtime. When
launching a kernel, if libgomp detects that the number of threads/workers
exceeds what can be supported by the hardware, it automatically scales down the
number to the maximum supported value.
This behaviour can be overridden using environment variables to set an explicit
number of threads/workers (GCN_NUM_THREADS/GCN_NUM_WORKERS), but there is not
much point IMO, as a kernel launched with more threads/workers than the
hardware can support will just fail to run.
Tested on a GCN3 accelerator with 6 new passes and no regressions noted in
libgomp. Okay for trunk?
Kwok
gcc/
* config/gcn/mkoffload.c (process_asm): Add sgpr_count and vgpr_count to
definition of hsa_kernel_description. Parse assembly to find SGPR and
VGPR count of kernel and store in hsa_kernel_description.
libgomp/
* plugin/plugin-gcn.c (struct hsa_kernel_description): Add sgpr_count
and vgpr_count fields.
(struct kernel_info): Add a field for a hsa_kernel_description.
(run_kernel): Reduce the number of threads/workers if the requested
number would require too many VGPRs.
(init_basic_kernel_info): Initialize description field with
the hsa_kernel_description entry for the kernel.
diff --git a/gcc/config/gcn/mkoffload.c b/gcc/config/gcn/mkoffload.c
index 0062f15..723da10 100644
--- a/gcc/config/gcn/mkoffload.c
+++ b/gcc/config/gcn/mkoffload.c
@@ -211,12 +211,13 @@ access_check (const char *name, int mode)
static void
process_asm (FILE *in, FILE *out, FILE *cfile)
{
- int fn_count = 0, var_count = 0, dims_count = 0;
- struct obstack fns_os, vars_os, varsizes_os, dims_os;
+ int fn_count = 0, var_count = 0, dims_count = 0, regcount_count = 0;
+ struct obstack fns_os, vars_os, varsizes_os, dims_os, regcounts_os;
obstack_init (&fns_os);
obstack_init (&vars_os);
obstack_init (&varsizes_os);
obstack_init (&dims_os);
+ obstack_init (&regcounts_os);
struct oaccdims
{
@@ -224,13 +225,20 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
char *name;
} dim;
+ struct regcount
+ {
+ int sgpr_count;
+ int vgpr_count;
+ char *kernel_name;
+ } regcount;
+
/* Always add _init_array and _fini_array as kernels. */
obstack_ptr_grow (&fns_os, xstrdup ("_init_array"));
obstack_ptr_grow (&fns_os, xstrdup ("_fini_array"));
fn_count += 2;
char buf[1000];
- enum { IN_CODE, IN_VARS, IN_FUNCS } state = IN_CODE;
+ enum { IN_CODE, IN_AMD_KERNEL_CODE_T, IN_VARS, IN_FUNCS } state = IN_CODE;
while (fgets (buf, sizeof (buf), in))
{
switch (state)
@@ -243,6 +251,22 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
obstack_grow (&dims_os, &dim, sizeof (dim));
dims_count++;
}
+ else if (sscanf (buf, " .amdgpu_hsa_kernel %ms\n",
+ &regcount.kernel_name) == 1)
+ break;
+
+ break;
+ }
+ case IN_AMD_KERNEL_CODE_T:
+ {
+ gcc_assert (regcount.kernel_name);
+ if (sscanf (buf, " wavefront_sgpr_count = %d\n",
+ &regcount.sgpr_count) == 1)
+ break;
+ else if (sscanf (buf, " workitem_vgpr_count = %d\n",
+ &regcount.vgpr_count) == 1)
+ break;
+
break;
}
case IN_VARS:
@@ -282,19 +306,36 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
state = IN_VARS;
else if (sscanf (buf, " .section .gnu.offload_funcs%c", &dummy) > 0)
state = IN_FUNCS;
+ else if (sscanf (buf, " .amd_kernel_code_%c", &dummy) > 0)
+ {
+ state = IN_AMD_KERNEL_CODE_T;
+ regcount.sgpr_count = regcount.vgpr_count = -1;
+ }
else if (sscanf (buf, " .section %c", &dummy) > 0
|| sscanf (buf, " .text%c", &dummy) > 0
|| sscanf (buf, " .bss%c", &dummy) > 0
|| sscanf (buf, " .data%c", &dummy) > 0
|| sscanf (buf, " .ident %c", &dummy) > 0)
state = IN_CODE;
+ else if (sscanf (buf, " .end_amd_kernel_code_%c", &dummy) > 0)
+ {
+ state = IN_CODE;
+ gcc_assert (regcount.kernel_name != NULL
+ && regcount.sgpr_count >= 0
+ && regcount.vgpr_count >= 0);
+ obstack_grow (&regcounts_os, &regcount, sizeof (regcount));
+ regcount_count++;
+ regcount.kernel_name = NULL;
+ regcount.sgpr_count = regcount.vgpr_count = -1;
+ }
- if (state == IN_CODE)
+ if (state == IN_CODE || state == IN_AMD_KERNEL_CODE_T)
fputs (buf, out);
}
char **fns = XOBFINISH (&fns_os, char **);
struct oaccdims *dims = XOBFINISH (&dims_os, struct oaccdims *);
+ struct regcount *regcounts = XOBFINISH (&regcounts_os, struct regcount *);
fprintf (cfile, "#include <stdlib.h>\n");
fprintf (cfile, "#include <stdbool.h>\n\n");
@@ -322,6 +363,8 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
fprintf (cfile, "static const struct hsa_kernel_description {\n"
" const char *name;\n"
" int oacc_dims[3];\n"
+ " int sgpr_count;\n"
+ " int vgpr_count;\n"
"} gcn_kernels[] = {\n ");
dim.d[0] = dim.d[1] = dim.d[2] = 0;
const char *comma;
@@ -329,15 +372,24 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
{
/* Find if we recorded dimensions for this function. */
int *d = dim.d; /* Previously zeroed. */
+ int sgpr_count = 0;
+ int vgpr_count = 0;
for (int j = 0; j < dims_count; j++)
if (strcmp (fns[i], dims[j].name) == 0)
{
d = dims[j].d;
break;
}
+ for (int j = 0; j < regcount_count; j++)
+ if (strcmp (fns[i], regcounts[j].kernel_name) == 0)
+ {
+ sgpr_count = regcounts[j].sgpr_count;
+ vgpr_count = regcounts[j].vgpr_count;
+ break;
+ }
- fprintf (cfile, "%s{\"%s\", {%d, %d, %d}}", comma,
- fns[i], d[0], d[1], d[2]);
+ fprintf (cfile, "%s{\"%s\", {%d, %d, %d}, %d, %d}", comma,
+ fns[i], d[0], d[1], d[2], sgpr_count, vgpr_count);
free (fns[i]);
}
@@ -346,7 +398,10 @@ process_asm (FILE *in, FILE *out, FILE *cfile)
obstack_free (&fns_os, NULL);
for (i = 0; i < dims_count; i++)
free (dims[i].name);
+ for (i = 0; i < regcount_count; i++)
+ free (regcounts[i].kernel_name);
obstack_free (&dims_os, NULL);
+ obstack_free (&regcounts_os, NULL);
}
/* Embed an object file into a C source file. */
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 22676b4..25547ef 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -371,6 +371,8 @@ struct hsa_kernel_description
{
const char *name;
int oacc_dims[3]; /* Only present for GCN kernels. */
+ int sgpr_count;
+ int vpgr_count;
};
/* Mkoffload uses this structure to describe an offload variable. */
@@ -478,6 +480,8 @@ struct kernel_info
struct agent_info *agent;
/* The specific module where the kernel takes place. */
struct module_info *module;
+ /* Information provided by mkoffload associated with the kernel. */
+ struct hsa_kernel_description *description;
/* Mutex enforcing that at most once thread ever initializes a kernel for
use. A thread should have locked agent->module_rwlock for reading before
acquiring it. */
@@ -2102,6 +2106,24 @@ run_kernel (struct kernel_info *kernel, void *vars,
struct GOMP_kernel_launch_attributes *kla,
struct goacc_asyncqueue *aq, bool module_locked)
{
+ GCN_DEBUG ("SGPRs: %d, VGPRs: %d\n", kernel->description->sgpr_count,
+ kernel->description->vpgr_count);
+
+ /* Reduce the number of threads/workers if there are insufficient
+ VGPRs available to run the kernels together. */
+ if (kla->ndim == 3 && kernel->description->vpgr_count > 0)
+ {
+ int granulated_vgprs = (kernel->description->vpgr_count + 3) & ~3;
+ int max_threads = (256 / granulated_vgprs) * 4;
+ if (kla->gdims[2] > max_threads)
+ {
+ GCN_WARNING ("Too many VGPRs required to support %d threads/workers"
+ " per team/gang - reducing to %d threads/workers.\n",
+ kla->gdims[2], max_threads);
+ kla->gdims[2] = max_threads;
+ }
+ }
+
GCN_DEBUG ("GCN launch on queue: %d:%d\n", kernel->agent->device_id,
(aq ? aq->id : 0));
GCN_DEBUG ("GCN launch attribs: gdims:[");
@@ -2303,6 +2325,7 @@ init_basic_kernel_info (struct kernel_info *kernel,
kernel->agent = agent;
kernel->module = module;
kernel->name = d->name;
+ kernel->description = d;
if (pthread_mutex_init (&kernel->init_mutex, NULL))
{
GOMP_PLUGIN_error ("Failed to initialize a GCN kernel mutex");