[PATCH 3/4] amdgcn: Enable OpenACC worker partitioning for AMD GCN

Julian Brown Tue, 02 Mar 2021 04:20:53 -0800

This patch enables worker-partitioning support via gimple rewriting for
AMD GCN. Older (and currently unused) parts of this support are already
present in the AMD GCN backend: those vestigial parts are enabled or
updated, as appropriate.


I can probably self-approve this -- I will commit if/when the other
patches in the series are committed in stage 1.

Julian

2021-03-02  Julian Brown  <jul...@codesourcery.com>
            Kwok Cheung Yeung  <k...@codesourcery.com>

gcc/
        * config/gcn/gcn-protos.h (gcn_goacc_adjust_propagation_record):
        Rename prototype to...
        (gcn_goacc_create_propagation_record): This.
        * config/gcn/gcn-tree.c (gcn_goacc_adjust_propagation_record): Rename
        function to...
        (gcn_goacc_create_propagation_record): This.  Adjust comment.
        * config/gcn/gcn.c (gcn_init_builtins): Override decls for
        BUILT_IN_GOACC_SINGLE_START, BUILT_IN_GOACC_SINGLE_COPY_START,
        BUILT_IN_GOACC_SINGLE_COPY_END and BUILT_IN_GOACC_BARRIER.
        (gcn_goacc_validate_dims): Turn on worker partitioning unconditionally.
        (gcn_fork_join): Update comment.
        (TARGET_GOACC_ADJUST_PROPAGATION_RECORD): Rename to...
        (TARGET_GOACC_CREATE_PROPAGATION_RECORD): This.
        (TARGET_GOACC_WORKER_PARTITIONING): Define target hook.
        * config/gcn/gcn.opt (flag_worker_partitioning): Remove.
        (macc_experimental_workers): Remove unused option.

libgomp/
        * plugin/plugin-gcn.c (gcn_exec): Change default number of workers to
        16.
        * testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c (check): Skip
        vector dimension test for AMD GCN.  Enable multiple workers.
        * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Enable multiple
        workers.  Update line numbers for scan tests.
        * testsuite/libgomp.oacc-fortran/parallel-dims-aux.c: Support AMD GCN.
---
 gcc/config/gcn/gcn-protos.h                   |  2 +-
 gcc/config/gcn/gcn-tree.c                     |  6 ++---
 gcc/config/gcn/gcn.c                          | 23 +++++++------------
 gcc/config/gcn/gcn.opt                        |  5 ----
 libgomp/plugin/plugin-gcn.c                   |  4 +---
 .../loop-dim-default.c                        | 11 +++++----
 .../libgomp.oacc-c-c++-common/parallel-dims.c | 13 ++++-------
 .../libgomp.oacc-fortran/parallel-dims-aux.c  |  9 +++++---
 8 files changed, 31 insertions(+), 42 deletions(-)

diff --git a/gcc/config/gcn/gcn-protos.h b/gcc/config/gcn/gcn-protos.h
index 7ef7ae8af46..6238bdc8a96 100644
--- a/gcc/config/gcn/gcn-protos.h
+++ b/gcc/config/gcn/gcn-protos.h
@@ -38,7 +38,7 @@ extern rtx gcn_full_exec ();
 extern rtx gcn_full_exec_reg ();
 extern rtx gcn_gen_undef (machine_mode);
 extern bool gcn_global_address_p (rtx);
-extern tree gcn_goacc_adjust_propagation_record (tree record_type, bool sender,
+extern tree gcn_goacc_create_propagation_record (tree record_type, bool sender,
                                                 const char *name);
 extern tree gcn_goacc_adjust_private_decl (tree var, int level);
 extern void gcn_goacc_reduction (gcall *call);
diff --git a/gcc/config/gcn/gcn-tree.c b/gcc/config/gcn/gcn-tree.c
index 75ea50c59dd..a457121c72b 100644
--- a/gcc/config/gcn/gcn-tree.c
+++ b/gcc/config/gcn/gcn-tree.c
@@ -548,12 +548,12 @@ gcn_goacc_reduction (gcall *call)
     }
 }
 
-/* Implement TARGET_GOACC_ADJUST_PROPAGATION_RECORD.
+/* Implement TARGET_GOACC_CREATE_PROPAGATION_RECORD.
  
-   Tweak (worker) propagation record, e.g. to put it in shared memory.  */
+   Create (worker) propagation record in shared memory.  */
 
 tree
-gcn_goacc_adjust_propagation_record (tree record_type, bool sender,
+gcn_goacc_create_propagation_record (tree record_type, bool sender,
                                     const char *name)
 {
   tree type = record_type;
diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c
index 1ea919bf058..fe4fa68f4ce 100644
--- a/gcc/config/gcn/gcn.c
+++ b/gcc/config/gcn/gcn.c
@@ -3588,8 +3588,6 @@ gcn_init_builtins (void)
       TREE_NOTHROW (gcn_builtin_decls[i]) = 1;
     }
 
-/* FIXME: remove the ifdef once OpenACC support is merged upstream.  */
-#ifdef BUILT_IN_GOACC_SINGLE_START
   /* These builtins need to take/return an LDS pointer: override the generic
      versions here.  */
 
@@ -3606,7 +3604,6 @@ gcn_init_builtins (void)
 
   set_builtin_decl (BUILT_IN_GOACC_BARRIER,
                    gcn_builtin_decls[GCN_BUILTIN_ACC_BARRIER], false);
-#endif
 }
 
 /* Expand the CMP_SWAP GCN builtins.  We have our own versions that do
@@ -4865,11 +4862,7 @@ gcn_goacc_validate_dims (tree decl, int dims[], int 
fn_level,
                         unsigned /*used*/)
 {
   bool changed = false;
-
-  /* FIXME: remove -facc-experimental-workers when they're ready.  */
-  int max_workers = flag_worker_partitioning ? 16 : 1;
-
-  gcc_assert (!flag_worker_partitioning);
+  const int max_workers = 16;
 
   /* The vector size must appear to be 64, to the user, unless this is a
      SEQ routine.  The real, internal value is always 1, which means use
@@ -4906,8 +4899,7 @@ gcn_goacc_validate_dims (tree decl, int dims[], int 
fn_level,
     {
       dims[GOMP_DIM_VECTOR] = GCN_DEFAULT_VECTORS;
       if (dims[GOMP_DIM_WORKER] < 0)
-       dims[GOMP_DIM_WORKER] = (flag_worker_partitioning
-                                ? GCN_DEFAULT_WORKERS : 1);
+       dims[GOMP_DIM_WORKER] = GCN_DEFAULT_WORKERS;
       if (dims[GOMP_DIM_GANG] < 0)
        dims[GOMP_DIM_GANG] = GCN_DEFAULT_GANGS;
       changed = true;
@@ -4972,8 +4964,7 @@ static bool
 gcn_fork_join (gcall *ARG_UNUSED (call), const int *ARG_UNUSED (dims),
               bool ARG_UNUSED (is_fork))
 {
-  /* GCN does not use the fork/join concept invented for NVPTX.
-     Instead we use standard autovectorization.  */
+  /* GCN does not need to expand fork/join markers at the RTL level.  */
   return false;
 }
 
@@ -6314,9 +6305,9 @@ gcn_dwarf_register_span (rtx rtl)
 #define TARGET_GIMPLIFY_VA_ARG_EXPR gcn_gimplify_va_arg_expr
 #undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
 #define TARGET_OMP_DEVICE_KIND_ARCH_ISA gcn_omp_device_kind_arch_isa
-#undef  TARGET_GOACC_ADJUST_PROPAGATION_RECORD
-#define TARGET_GOACC_ADJUST_PROPAGATION_RECORD \
-  gcn_goacc_adjust_propagation_record
+#undef  TARGET_GOACC_CREATE_PROPAGATION_RECORD
+#define TARGET_GOACC_CREATE_PROPAGATION_RECORD \
+  gcn_goacc_create_propagation_record
 #undef  TARGET_GOACC_ADJUST_PRIVATE_DECL
 #define TARGET_GOACC_ADJUST_PRIVATE_DECL gcn_goacc_adjust_private_decl
 #undef  TARGET_GOACC_FORK_JOIN
@@ -6325,6 +6316,8 @@ gcn_dwarf_register_span (rtx rtl)
 #define TARGET_GOACC_REDUCTION gcn_goacc_reduction
 #undef  TARGET_GOACC_VALIDATE_DIMS
 #define TARGET_GOACC_VALIDATE_DIMS gcn_goacc_validate_dims
+#undef  TARGET_GOACC_WORKER_PARTITIONING
+#define TARGET_GOACC_WORKER_PARTITIONING true
 #undef  TARGET_HARD_REGNO_MODE_OK
 #define TARGET_HARD_REGNO_MODE_OK gcn_hard_regno_mode_ok
 #undef  TARGET_HARD_REGNO_NREGS
diff --git a/gcc/config/gcn/gcn.opt b/gcc/config/gcn/gcn.opt
index 767d45826c2..41cc49095b1 100644
--- a/gcc/config/gcn/gcn.opt
+++ b/gcc/config/gcn/gcn.opt
@@ -62,11 +62,6 @@ bool flag_bypass_init_error = false
 mbypass-init-error
 Target RejectNegative Var(flag_bypass_init_error)
 
-bool flag_worker_partitioning = false
-
-macc-experimental-workers
-Target Var(flag_worker_partitioning) Init(0)
-
 int stack_size_opt = -1
 
 mstack-size=
diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 8e6af69988e..b89470199cb 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -3041,10 +3041,8 @@ gcn_exec (struct kernel_info *kernel, size_t mapnum, 
void **hostaddrs,
      problem size, so let's do a reasonable number of single-worker gangs.
      64 gangs matches a typical Fiji device.  */
 
-  /* NOTE: Until support for middle-end worker partitioning is merged, use 1
-     for the default number of workers.  */
   if (dims[0] == 0) dims[0] = get_cu_count (kernel->agent); /* Gangs.  */
-  if (dims[1] == 0) dims[1] = 1;  /* Workers.  */
+  if (dims[1] == 0) dims[1] = 16; /* Workers.  */
 
   /* The incoming dimensions are expressed in terms of gangs, workers, and
      vectors.  The HSA dimensions are expressed in terms of "work-items",
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c 
b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c
index ca771646655..ddf0a29d304 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c
@@ -79,13 +79,18 @@ int check (const int *ary, int size, int gp, int wp, int vp)
        exit = 1;
       }
   
+#ifndef ACC_DEVICE_TYPE_radeon
+  /* AMD GCN uses the autovectorizer for the vector dimension: the use
+     of a function call in vector-partitioned code in this test is not
+     currently supported.  */
   for (ix = 0; ix < vp; ix++)
     if (vectors[ix] != vectors[0])
       {
        printf ("vector %d not used %d times\n", ix, vectors[0]);
        exit = 1;
       }
-  
+#endif
+
   return exit;
 }
 
@@ -132,9 +137,7 @@ int main ()
   /* AMD GCN uses the autovectorizer for the vector dimension: the use
      of a function call in vector-partitioned code in this test is not
      currently supported.  */
-  /* AMD GCN does not currently support multiple workers.  This should be
-     set to 16 when that changes.  */
-  return test_1 (16, 1, 1);
+  return test_1 (16, 16, 64);
 #else
   return test_1 (16, 16, 32);
 #endif
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c 
b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
index 003bcac2413..10bb7b61f50 100644
--- a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
@@ -288,9 +288,8 @@ int main ()
        }
       else if (acc_on_device (acc_device_radeon))
        {
-         /* The GCC GCN back end is limited to num_workers (16).
-            Temporarily set this to 1 until multiple workers are permitted. */
-         workers_actual = 1; // 16;
+         /* The GCC GCN back end is limited to num_workers (16).  */
+         workers_actual = 16;
        }
       else
        __builtin_abort ();
@@ -491,8 +490,6 @@ int main ()
        }
       else if (acc_on_device (acc_device_radeon))
        {
-         /* Temporary setting, until multiple workers are permitted.  */
-         workers_actual = 1;
          /* See above comments about GCN vectors_actual.  */
          vectors_actual = 1;
        }
@@ -618,9 +615,9 @@ int main ()
     gangs_max = workers_max = vectors_max = INT_MIN;
 #pragma acc serial copy (vectors_actual) /* { dg-warning "using vector_length 
\\(32\\), ignoring 1" "" { target openacc_nvidia_accel_selected } } */ \
   copy (gangs_min, gangs_max, workers_min, workers_max, vectors_min, 
vectors_max)
-/* { dg-warning "not gang partitioned" "" { target *-*-* } 619 } */
-/* { dg-warning "not worker partitioned" "" { target *-*-* } 619 } */
-/* { dg-warning "not vector partitioned" "" { target *-*-* } 619 } */
+/* { dg-warning "not gang partitioned" "" { target *-*-* } 616 } */
+/* { dg-warning "not worker partitioned" "" { target *-*-* } 616 } */
+/* { dg-warning "not vector partitioned" "" { target *-*-* } 616 } */
     {
       if (acc_on_device (acc_device_nvidia))
        {
diff --git a/libgomp/testsuite/libgomp.oacc-fortran/parallel-dims-aux.c 
b/libgomp/testsuite/libgomp.oacc-fortran/parallel-dims-aux.c
index b5986f4afef..9810a259f2a 100644
--- a/libgomp/testsuite/libgomp.oacc-fortran/parallel-dims-aux.c
+++ b/libgomp/testsuite/libgomp.oacc-fortran/parallel-dims-aux.c
@@ -16,7 +16,8 @@
 {
   if (acc_on_device ((int) acc_device_host))
     return 0;
-  else if (acc_on_device ((int) acc_device_nvidia))
+  else if (acc_on_device ((int) acc_device_nvidia)
+          || acc_on_device ((int) acc_device_radeon))
     return __builtin_goacc_parlevel_id (GOMP_DIM_GANG);
   else
     __builtin_abort ();
@@ -27,7 +28,8 @@
 {
   if (acc_on_device ((int) acc_device_host))
     return 0;
-  else if (acc_on_device ((int) acc_device_nvidia))
+  else if (acc_on_device ((int) acc_device_nvidia)
+          || acc_on_device ((int) acc_device_radeon))
     return __builtin_goacc_parlevel_id (GOMP_DIM_WORKER);
   else
     __builtin_abort ();
@@ -38,7 +40,8 @@
 {
   if (acc_on_device ((int) acc_device_host))
     return 0;
-  else if (acc_on_device ((int) acc_device_nvidia))
+  else if (acc_on_device ((int) acc_device_nvidia)
+          || acc_on_device ((int) acc_device_radeon))
     return __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR);
   else
     __builtin_abort ();
-- 
2.29.2

[PATCH 3/4] amdgcn: Enable OpenACC worker partitioning for AMD GCN

Reply via email to