This patch enables worker-partitioning support via gimple rewriting for AMD GCN. Older (and currently unused) parts of this support are already present in the AMD GCN backend: those vestigial parts are enabled or updated, as appropriate.
I can probably self-approve this -- I will commit if/when the other patches in the series are committed in stage 1. Julian 2021-03-02 Julian Brown <jul...@codesourcery.com> Kwok Cheung Yeung <k...@codesourcery.com> gcc/ * config/gcn/gcn-protos.h (gcn_goacc_adjust_propagation_record): Rename prototype to... (gcn_goacc_create_propagation_record): This. * config/gcn/gcn-tree.c (gcn_goacc_adjust_propagation_record): Rename function to... (gcn_goacc_create_propagation_record): This. Adjust comment. * config/gcn/gcn.c (gcn_init_builtins): Override decls for BUILT_IN_GOACC_SINGLE_START, BUILT_IN_GOACC_SINGLE_COPY_START, BUILT_IN_GOACC_SINGLE_COPY_END and BUILT_IN_GOACC_BARRIER. (gcn_goacc_validate_dims): Turn on worker partitioning unconditionally. (gcn_fork_join): Update comment. (TARGET_GOACC_ADJUST_PROPAGATION_RECORD): Rename to... (TARGET_GOACC_CREATE_PROPAGATION_RECORD): This. (TARGET_GOACC_WORKER_PARTITIONING): Define target hook. * config/gcn/gcn.opt (flag_worker_partitioning): Remove. (macc_experimental_workers): Remove unused option. libgomp/ * plugin/plugin-gcn.c (gcn_exec): Change default number of workers to 16. * testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c (check): Skip vector dimension test for AMD GCN. Enable multiple workers. * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Enable multiple workers. Update line numbers for scan tests. * testsuite/libgomp.oacc-fortran/parallel-dims-aux.c: Support AMD GCN. --- gcc/config/gcn/gcn-protos.h | 2 +- gcc/config/gcn/gcn-tree.c | 6 ++--- gcc/config/gcn/gcn.c | 23 +++++++------------ gcc/config/gcn/gcn.opt | 5 ---- libgomp/plugin/plugin-gcn.c | 4 +--- .../loop-dim-default.c | 11 +++++---- .../libgomp.oacc-c-c++-common/parallel-dims.c | 13 ++++------- .../libgomp.oacc-fortran/parallel-dims-aux.c | 9 +++++--- 8 files changed, 31 insertions(+), 42 deletions(-) diff --git a/gcc/config/gcn/gcn-protos.h b/gcc/config/gcn/gcn-protos.h index 7ef7ae8af46..6238bdc8a96 100644 --- a/gcc/config/gcn/gcn-protos.h +++ b/gcc/config/gcn/gcn-protos.h @@ -38,7 +38,7 @@ extern rtx gcn_full_exec (); extern rtx gcn_full_exec_reg (); extern rtx gcn_gen_undef (machine_mode); extern bool gcn_global_address_p (rtx); -extern tree gcn_goacc_adjust_propagation_record (tree record_type, bool sender, +extern tree gcn_goacc_create_propagation_record (tree record_type, bool sender, const char *name); extern tree gcn_goacc_adjust_private_decl (tree var, int level); extern void gcn_goacc_reduction (gcall *call); diff --git a/gcc/config/gcn/gcn-tree.c b/gcc/config/gcn/gcn-tree.c index 75ea50c59dd..a457121c72b 100644 --- a/gcc/config/gcn/gcn-tree.c +++ b/gcc/config/gcn/gcn-tree.c @@ -548,12 +548,12 @@ gcn_goacc_reduction (gcall *call) } } -/* Implement TARGET_GOACC_ADJUST_PROPAGATION_RECORD. +/* Implement TARGET_GOACC_CREATE_PROPAGATION_RECORD. - Tweak (worker) propagation record, e.g. to put it in shared memory. */ + Create (worker) propagation record in shared memory. */ tree -gcn_goacc_adjust_propagation_record (tree record_type, bool sender, +gcn_goacc_create_propagation_record (tree record_type, bool sender, const char *name) { tree type = record_type; diff --git a/gcc/config/gcn/gcn.c b/gcc/config/gcn/gcn.c index 1ea919bf058..fe4fa68f4ce 100644 --- a/gcc/config/gcn/gcn.c +++ b/gcc/config/gcn/gcn.c @@ -3588,8 +3588,6 @@ gcn_init_builtins (void) TREE_NOTHROW (gcn_builtin_decls[i]) = 1; } -/* FIXME: remove the ifdef once OpenACC support is merged upstream. */ -#ifdef BUILT_IN_GOACC_SINGLE_START /* These builtins need to take/return an LDS pointer: override the generic versions here. */ @@ -3606,7 +3604,6 @@ gcn_init_builtins (void) set_builtin_decl (BUILT_IN_GOACC_BARRIER, gcn_builtin_decls[GCN_BUILTIN_ACC_BARRIER], false); -#endif } /* Expand the CMP_SWAP GCN builtins. We have our own versions that do @@ -4865,11 +4862,7 @@ gcn_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned /*used*/) { bool changed = false; - - /* FIXME: remove -facc-experimental-workers when they're ready. */ - int max_workers = flag_worker_partitioning ? 16 : 1; - - gcc_assert (!flag_worker_partitioning); + const int max_workers = 16; /* The vector size must appear to be 64, to the user, unless this is a SEQ routine. The real, internal value is always 1, which means use @@ -4906,8 +4899,7 @@ gcn_goacc_validate_dims (tree decl, int dims[], int fn_level, { dims[GOMP_DIM_VECTOR] = GCN_DEFAULT_VECTORS; if (dims[GOMP_DIM_WORKER] < 0) - dims[GOMP_DIM_WORKER] = (flag_worker_partitioning - ? GCN_DEFAULT_WORKERS : 1); + dims[GOMP_DIM_WORKER] = GCN_DEFAULT_WORKERS; if (dims[GOMP_DIM_GANG] < 0) dims[GOMP_DIM_GANG] = GCN_DEFAULT_GANGS; changed = true; @@ -4972,8 +4964,7 @@ static bool gcn_fork_join (gcall *ARG_UNUSED (call), const int *ARG_UNUSED (dims), bool ARG_UNUSED (is_fork)) { - /* GCN does not use the fork/join concept invented for NVPTX. - Instead we use standard autovectorization. */ + /* GCN does not need to expand fork/join markers at the RTL level. */ return false; } @@ -6314,9 +6305,9 @@ gcn_dwarf_register_span (rtx rtl) #define TARGET_GIMPLIFY_VA_ARG_EXPR gcn_gimplify_va_arg_expr #undef TARGET_OMP_DEVICE_KIND_ARCH_ISA #define TARGET_OMP_DEVICE_KIND_ARCH_ISA gcn_omp_device_kind_arch_isa -#undef TARGET_GOACC_ADJUST_PROPAGATION_RECORD -#define TARGET_GOACC_ADJUST_PROPAGATION_RECORD \ - gcn_goacc_adjust_propagation_record +#undef TARGET_GOACC_CREATE_PROPAGATION_RECORD +#define TARGET_GOACC_CREATE_PROPAGATION_RECORD \ + gcn_goacc_create_propagation_record #undef TARGET_GOACC_ADJUST_PRIVATE_DECL #define TARGET_GOACC_ADJUST_PRIVATE_DECL gcn_goacc_adjust_private_decl #undef TARGET_GOACC_FORK_JOIN @@ -6325,6 +6316,8 @@ gcn_dwarf_register_span (rtx rtl) #define TARGET_GOACC_REDUCTION gcn_goacc_reduction #undef TARGET_GOACC_VALIDATE_DIMS #define TARGET_GOACC_VALIDATE_DIMS gcn_goacc_validate_dims +#undef TARGET_GOACC_WORKER_PARTITIONING +#define TARGET_GOACC_WORKER_PARTITIONING true #undef TARGET_HARD_REGNO_MODE_OK #define TARGET_HARD_REGNO_MODE_OK gcn_hard_regno_mode_ok #undef TARGET_HARD_REGNO_NREGS diff --git a/gcc/config/gcn/gcn.opt b/gcc/config/gcn/gcn.opt index 767d45826c2..41cc49095b1 100644 --- a/gcc/config/gcn/gcn.opt +++ b/gcc/config/gcn/gcn.opt @@ -62,11 +62,6 @@ bool flag_bypass_init_error = false mbypass-init-error Target RejectNegative Var(flag_bypass_init_error) -bool flag_worker_partitioning = false - -macc-experimental-workers -Target Var(flag_worker_partitioning) Init(0) - int stack_size_opt = -1 mstack-size= diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c index 8e6af69988e..b89470199cb 100644 --- a/libgomp/plugin/plugin-gcn.c +++ b/libgomp/plugin/plugin-gcn.c @@ -3041,10 +3041,8 @@ gcn_exec (struct kernel_info *kernel, size_t mapnum, void **hostaddrs, problem size, so let's do a reasonable number of single-worker gangs. 64 gangs matches a typical Fiji device. */ - /* NOTE: Until support for middle-end worker partitioning is merged, use 1 - for the default number of workers. */ if (dims[0] == 0) dims[0] = get_cu_count (kernel->agent); /* Gangs. */ - if (dims[1] == 0) dims[1] = 1; /* Workers. */ + if (dims[1] == 0) dims[1] = 16; /* Workers. */ /* The incoming dimensions are expressed in terms of gangs, workers, and vectors. The HSA dimensions are expressed in terms of "work-items", diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c index ca771646655..ddf0a29d304 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/loop-dim-default.c @@ -79,13 +79,18 @@ int check (const int *ary, int size, int gp, int wp, int vp) exit = 1; } +#ifndef ACC_DEVICE_TYPE_radeon + /* AMD GCN uses the autovectorizer for the vector dimension: the use + of a function call in vector-partitioned code in this test is not + currently supported. */ for (ix = 0; ix < vp; ix++) if (vectors[ix] != vectors[0]) { printf ("vector %d not used %d times\n", ix, vectors[0]); exit = 1; } - +#endif + return exit; } @@ -132,9 +137,7 @@ int main () /* AMD GCN uses the autovectorizer for the vector dimension: the use of a function call in vector-partitioned code in this test is not currently supported. */ - /* AMD GCN does not currently support multiple workers. This should be - set to 16 when that changes. */ - return test_1 (16, 1, 1); + return test_1 (16, 16, 64); #else return test_1 (16, 16, 32); #endif diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c index 003bcac2413..10bb7b61f50 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c @@ -288,9 +288,8 @@ int main () } else if (acc_on_device (acc_device_radeon)) { - /* The GCC GCN back end is limited to num_workers (16). - Temporarily set this to 1 until multiple workers are permitted. */ - workers_actual = 1; // 16; + /* The GCC GCN back end is limited to num_workers (16). */ + workers_actual = 16; } else __builtin_abort (); @@ -491,8 +490,6 @@ int main () } else if (acc_on_device (acc_device_radeon)) { - /* Temporary setting, until multiple workers are permitted. */ - workers_actual = 1; /* See above comments about GCN vectors_actual. */ vectors_actual = 1; } @@ -618,9 +615,9 @@ int main () gangs_max = workers_max = vectors_max = INT_MIN; #pragma acc serial copy (vectors_actual) /* { dg-warning "using vector_length \\(32\\), ignoring 1" "" { target openacc_nvidia_accel_selected } } */ \ copy (gangs_min, gangs_max, workers_min, workers_max, vectors_min, vectors_max) -/* { dg-warning "not gang partitioned" "" { target *-*-* } 619 } */ -/* { dg-warning "not worker partitioned" "" { target *-*-* } 619 } */ -/* { dg-warning "not vector partitioned" "" { target *-*-* } 619 } */ +/* { dg-warning "not gang partitioned" "" { target *-*-* } 616 } */ +/* { dg-warning "not worker partitioned" "" { target *-*-* } 616 } */ +/* { dg-warning "not vector partitioned" "" { target *-*-* } 616 } */ { if (acc_on_device (acc_device_nvidia)) { diff --git a/libgomp/testsuite/libgomp.oacc-fortran/parallel-dims-aux.c b/libgomp/testsuite/libgomp.oacc-fortran/parallel-dims-aux.c index b5986f4afef..9810a259f2a 100644 --- a/libgomp/testsuite/libgomp.oacc-fortran/parallel-dims-aux.c +++ b/libgomp/testsuite/libgomp.oacc-fortran/parallel-dims-aux.c @@ -16,7 +16,8 @@ { if (acc_on_device ((int) acc_device_host)) return 0; - else if (acc_on_device ((int) acc_device_nvidia)) + else if (acc_on_device ((int) acc_device_nvidia) + || acc_on_device ((int) acc_device_radeon)) return __builtin_goacc_parlevel_id (GOMP_DIM_GANG); else __builtin_abort (); @@ -27,7 +28,8 @@ { if (acc_on_device ((int) acc_device_host)) return 0; - else if (acc_on_device ((int) acc_device_nvidia)) + else if (acc_on_device ((int) acc_device_nvidia) + || acc_on_device ((int) acc_device_radeon)) return __builtin_goacc_parlevel_id (GOMP_DIM_WORKER); else __builtin_abort (); @@ -38,7 +40,8 @@ { if (acc_on_device ((int) acc_device_host)) return 0; - else if (acc_on_device ((int) acc_device_nvidia)) + else if (acc_on_device ((int) acc_device_nvidia) + || acc_on_device ((int) acc_device_radeon)) return __builtin_goacc_parlevel_id (GOMP_DIM_VECTOR); else __builtin_abort (); -- 2.29.2