This patch, which is largely implemented by Chung-Lin, is a first step towards teaching the c and c++ FEs how to allocate shared memory for gang local variables. E.g.
#pragma acc parallel { int some_array[N], some_var; Both some_array and some_var will be stored in shared memory with this patch. Shared memory is allocated for local variables in a similar fashion to worker reductions. The nvptx BE maintains a global __gangprivate_shared variable for all of the local variables that require shared memory. During RTL expansion, decls are checked for an "oacc gangprivate" attribute, then those decls are remapped to a pointer within __gangprivate_shared via the new expand_accel_var target hook. That hook is also responsible for reserving shared memory for each decl in the offloaded program. The c and c++ FEs attach "oacc gangprivate" attributes to decls immediately after they process OpenACC kernels and parallel regions. This implementation still has a number of limitations, which will be addressed in follow up patches at some later date: * Currently variables in private clauses inside acc loops will not utilize shared memory. * OpenACC routines don't use shared memory, except for reductions and worker state propagation. * Variables local to worker loops don't use shared memory. * Variables local to automatically partitioned gang and worker loops don't use shared memory. * Shared memory is allocated globally, not locally on a per-function basis. We're not sure if that matters though. This patch has been applied to gomp-4_0-branch. Cesar
2017-02-27 Chung-Lin Tang <clt...@codesourcery.com> Cesar Philippidis <ce...@codesourcery.com> gcc/c/ * c-parser.c (mark_vars_oacc_gangprivate): New function. (c_parser_oacc_kernels_parallel): Call it to mark gang local variables with attribute "oacc gangprivate". gcc/cp/ * cp-tree.h (mark_vars_oacc_gangprivate): Declare. * parser.c (mark_vars_oacc_gangprivate): New function. (cp_parser_oacc_kernels_parallel): Call it to mark gang local variables with attribute "oacc gangprivate". * pt.c (tsubst_expr): Likewise. gcc/ * config/nvptx/nvptx.c (gangprivate_shared_size): New global variable. (gangprivate_shared_align): Likewise. (gangprivate_shared_sym): Likewise. (gangprivate_shared_hmap): Likewise. (nvptx_option_override): Initialize gangprivate_shared_sym. (nvptx_file_end): Output gangprivate_shared_sym. (nvptx_goacc_expand_accel_var): New function. (nvptx_set_current_function): New function. (TARGET_SET_CURRENT_FUNCTION): Define hook. (TARGET_GOACC_EXPAND_ACCEL): Likewise. * doc/tm.texi (TARGET_GOACC_EXPAND_ACCEL_VAR): Document new hook. * doc/tm.texi.in (TARGET_GOACC_EXPAND_ACCEL_VAR): Likewise. * expr.c (expand_expr_real_1): Remap decls marked with the "oacc gangprivate" atttribute. * omp-low.c (scan_sharing_clauses): Strip out any "oacc gangprivate" attributes from acc loop private clauses. * target.def (expand_accel_var): New hook. libgomp/ * testsuite/libgomp.oacc-c-c++-common/gang-private-1.c: New test. diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c index 3f994e3..728c31b 100644 --- a/gcc/c/c-parser.c +++ b/gcc/c/c-parser.c @@ -14086,6 +14086,32 @@ c_parser_oacc_loop (location_t loc, c_parser *parser, char *p_name, | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_WAIT) ) static tree +mark_vars_oacc_gangprivate (tree *tp, + int *walk_subtrees ATTRIBUTE_UNUSED, + void *data ATTRIBUTE_UNUSED) +{ + /* We back away from nested OpenACC non-gang loop directives. */ + if (TREE_CODE (*tp) == OACC_LOOP + && find_omp_clause (OMP_FOR_CLAUSES (*tp), OMP_CLAUSE_GANG) == NULL_TREE) + { + return *tp; + } + if (TREE_CODE (*tp) == BIND_EXPR) + { + tree block = BIND_EXPR_BLOCK (*tp); + for (tree var = BLOCK_VARS (block); var; var = DECL_CHAIN (var)) + { + gcc_assert (TREE_CODE (var) == VAR_DECL); + DECL_ATTRIBUTES (var) + = tree_cons (get_identifier ("oacc gangprivate"), + NULL, DECL_ATTRIBUTES (var)); + c_mark_addressable (var); + } + } + return NULL; +} + +static tree c_parser_oacc_kernels_parallel (location_t loc, c_parser *parser, enum pragma_kind p_kind, char *p_name, bool *if_p) @@ -14119,7 +14145,9 @@ c_parser_oacc_kernels_parallel (location_t loc, c_parser *parser, tree block = c_begin_omp_parallel (); tree clauses; c_parser_oacc_loop (loc, parser, p_name, mask, &clauses, if_p); - return c_finish_omp_construct (loc, code, block, clauses); + block = c_finish_omp_construct (loc, code, block, clauses); + walk_tree_1 (&block, mark_vars_oacc_gangprivate, NULL, NULL, NULL); + return block; } } @@ -14128,7 +14156,9 @@ c_parser_oacc_kernels_parallel (location_t loc, c_parser *parser, tree block = c_begin_omp_parallel (); add_stmt (c_parser_omp_structured_block (parser, if_p)); - return c_finish_omp_construct (loc, code, block, clauses); + block = c_finish_omp_construct (loc, code, block, clauses); + walk_tree_1 (&block, mark_vars_oacc_gangprivate, NULL, NULL, NULL); + return block; } /* OpenACC 2.0: diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c index a9822e268..f790728 100644 --- a/gcc/config/nvptx/nvptx.c +++ b/gcc/config/nvptx/nvptx.c @@ -66,6 +66,7 @@ #include "tree-phinodes.h" #include "cfgloop.h" #include "fold-const.h" +#include "tree-hash-traits.h" /* This file should be included last. */ #include "target-def.h" @@ -136,6 +137,12 @@ static unsigned worker_red_size; static unsigned worker_red_align; static GTY(()) rtx worker_red_sym; +/* Shared memory block for gang-private variables. */ +static unsigned gangprivate_shared_size; +static unsigned gangprivate_shared_align; +static GTY(()) rtx gangprivate_shared_sym; +static hash_map<tree_decl_hash, unsigned int> gangprivate_shared_hmap; + /* Global lock variable, needed for 128bit worker & gang reductions. */ static GTY(()) tree global_lock_var; @@ -167,7 +174,7 @@ nvptx_option_override (void) needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17); declared_libfuncs_htab = hash_table<declared_libfunc_hasher>::create_ggc (17); - + worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast"); SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED); worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; @@ -175,6 +182,11 @@ nvptx_option_override (void) worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red"); SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED); worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; + + gangprivate_shared_sym + = gen_rtx_SYMBOL_REF (Pmode, "__gangprivate_shared"); + SET_SYMBOL_DATA_AREA (gangprivate_shared_sym, DATA_AREA_SHARED); + gangprivate_shared_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT; } /* Return a ptx type for MODE. If PROMOTE, then use .u32 for QImode to @@ -4048,6 +4060,10 @@ nvptx_file_end (void) if (worker_red_size) write_worker_buffer (asm_out_file, worker_red_sym, worker_red_align, worker_red_size); + + if (gangprivate_shared_size) + write_worker_buffer (asm_out_file, gangprivate_shared_sym, + gangprivate_shared_align, gangprivate_shared_size); } /* Expander for the shuffle builtins. */ @@ -5073,6 +5089,47 @@ nvptx_goacc_reduction (gcall *call) } } +static rtx +nvptx_goacc_expand_accel_var (tree var) +{ + if (TREE_CODE (var) == VAR_DECL + && lookup_attribute ("oacc gangprivate", DECL_ATTRIBUTES (var))) + { + unsigned int offset, *poffset; + poffset = gangprivate_shared_hmap.get (var); + if (poffset) + offset = *poffset; + else + { + unsigned HOST_WIDE_INT align = DECL_ALIGN (var); + gangprivate_shared_size = + (gangprivate_shared_size + align - 1) & ~(align - 1); + if (gangprivate_shared_align < align) + gangprivate_shared_align = align; + + offset = gangprivate_shared_size; + bool existed = gangprivate_shared_hmap.put (var, offset); + gcc_assert (!existed); + gangprivate_shared_size += tree_to_uhwi (DECL_SIZE_UNIT (var)); + } + rtx addr = plus_constant (Pmode, gangprivate_shared_sym, offset); + return gen_rtx_MEM (TYPE_MODE (TREE_TYPE (var)), addr); + } + return NULL_RTX; +} + +static GTY(()) tree nvptx_previous_fndecl; + +static void +nvptx_set_current_function (tree fndecl) +{ + if (!fndecl || fndecl == nvptx_previous_fndecl) + return; + + gangprivate_shared_hmap.empty (); + nvptx_previous_fndecl = fndecl; +} + #undef TARGET_OPTION_OVERRIDE #define TARGET_OPTION_OVERRIDE nvptx_option_override @@ -5169,6 +5226,9 @@ nvptx_goacc_reduction (gcall *call) #undef TARGET_BUILTIN_DECL #define TARGET_BUILTIN_DECL nvptx_builtin_decl +#undef TARGET_SET_CURRENT_FUNCTION +#define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function + #undef TARGET_GOACC_VALIDATE_DIMS #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims @@ -5181,6 +5241,9 @@ nvptx_goacc_reduction (gcall *call) #undef TARGET_GOACC_REDUCTION #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction +#undef TARGET_GOACC_EXPAND_ACCEL_VAR +#define TARGET_GOACC_EXPAND_ACCEL_VAR nvptx_goacc_expand_accel_var + struct gcc_target targetm = TARGET_INITIALIZER; #include "gt-nvptx.h" diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h index 8a635ba..7bd337a 100644 --- a/gcc/cp/cp-tree.h +++ b/gcc/cp/cp-tree.h @@ -6015,6 +6015,8 @@ extern bool maybe_clone_body (tree); extern tree cp_convert_range_for (tree, tree, tree, bool); extern bool parsing_nsdmi (void); extern void inject_this_parameter (tree, cp_cv_quals); +extern tree mark_vars_oacc_gangprivate (tree *, int *, void *); + /* in pt.c */ extern bool check_template_shadow (tree); diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c index ddb0ab1..6dcc099 100644 --- a/gcc/cp/parser.c +++ b/gcc/cp/parser.c @@ -35757,6 +35757,34 @@ cp_parser_oacc_loop (cp_parser *parser, cp_token *pragma_tok, char *p_name, | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_VECTOR_LENGTH) \ | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_WAIT)) +tree +mark_vars_oacc_gangprivate (tree *tp, + int *walk_subtrees ATTRIBUTE_UNUSED, + void *data ATTRIBUTE_UNUSED) +{ + /* We back away from nested OpenACC non-gang loop directives. */ + if (TREE_CODE (*tp) == OACC_LOOP + && find_omp_clause (OMP_FOR_CLAUSES (*tp), OMP_CLAUSE_GANG) == NULL_TREE) + { + return *tp; + } + if (TREE_CODE (*tp) == BIND_EXPR) + { + tree block = BIND_EXPR_BLOCK (*tp); + if (block == NULL) + return NULL; + for (tree var = BLOCK_VARS (block); var; var = DECL_CHAIN (var)) + { + gcc_assert (TREE_CODE (var) == VAR_DECL); + DECL_ATTRIBUTES (var) + = tree_cons (get_identifier ("oacc gangprivate"), + NULL, DECL_ATTRIBUTES (var)); + cxx_mark_addressable (var); + } + } + return NULL; +} + static tree cp_parser_oacc_kernels_parallel (cp_parser *parser, cp_token *pragma_tok, char *p_name, bool *if_p) @@ -35793,7 +35821,9 @@ cp_parser_oacc_kernels_parallel (cp_parser *parser, cp_token *pragma_tok, tree stmt = cp_parser_oacc_loop (parser, pragma_tok, p_name, mask, &clauses, if_p); protected_set_expr_location (stmt, pragma_tok->location); - return finish_omp_construct (code, block, clauses); + block = finish_omp_construct (code, block, clauses); + walk_tree_1 (&block, mark_vars_oacc_gangprivate, NULL, NULL, NULL); + return block; } } @@ -35804,7 +35834,9 @@ cp_parser_oacc_kernels_parallel (cp_parser *parser, cp_token *pragma_tok, unsigned int save = cp_parser_begin_omp_structured_block (parser); cp_parser_statement (parser, NULL_TREE, false, if_p); cp_parser_end_omp_structured_block (parser, save); - return finish_omp_construct (code, block, clauses); + block = finish_omp_construct (code, block, clauses); + walk_tree_1 (&block, mark_vars_oacc_gangprivate, NULL, NULL, NULL); + return block; } /* OpenACC 2.0: diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c index 2e13a01..56758d6 100644 --- a/gcc/cp/pt.c +++ b/gcc/cp/pt.c @@ -15530,6 +15530,7 @@ tsubst_expr (tree t, tree args, tsubst_flags_t complain, tree in_decl, stmt = begin_omp_parallel (); RECUR (OMP_BODY (t)); finish_omp_construct (TREE_CODE (t), stmt, tmp); + walk_tree_1 (&OMP_BODY (t), mark_vars_oacc_gangprivate, NULL, NULL, NULL); break; case OMP_PARALLEL: diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi index 3de3554..0ab7231 100644 --- a/gcc/doc/tm.texi +++ b/gcc/doc/tm.texi @@ -5801,6 +5801,14 @@ expanded sequence has been inserted. This hook is also responsible for allocating any storage for reductions when necessary. @end deftypefn +@deftypefn {Target Hook} rtx TARGET_GOACC_EXPAND_ACCEL_VAR (tree @var{var}) +This hook, if defined, is used by accelerator target back-ends to expand +specially handled kinds of VAR_DECL expressions. A particular use is to +place variables with specific attributes inside special accelarator +memories. A return value of NULL indicates that the target does not +handle this VAR_DECL, and normal RTL expanding is resumed. +@end deftypefn + @node Anchored Addresses @section Anchored Addresses @cindex anchored addresses diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in index f31c763..3b66a1d 100644 --- a/gcc/doc/tm.texi.in +++ b/gcc/doc/tm.texi.in @@ -4271,6 +4271,8 @@ address; but often a machine-dependent strategy can generate better code. @hook TARGET_GOACC_REDUCTION +@hook TARGET_GOACC_EXPAND_ACCEL_VAR + @node Anchored Addresses @section Anchored Addresses @cindex anchored addresses diff --git a/gcc/expr.c b/gcc/expr.c index 70540f0..79e7ce5 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -9591,8 +9591,19 @@ expand_expr_real_1 (tree exp, rtx target, machine_mode tmode, exp = SSA_NAME_VAR (ssa_name); goto expand_decl_rtl; - case PARM_DECL: case VAR_DECL: + /* Allow accel compiler to handle specific cases of variables, + specifically those tagged with the "oacc gangprivate" attribute, + which may intended to be placed in special memory in GPUs. */ + if (flag_openacc && targetm.goacc.expand_accel_var) + { + temp = targetm.goacc.expand_accel_var (exp); + if (temp) + return temp; + } + /* ... fall through ... */ + + case PARM_DECL: /* If a static var's type was incomplete when the decl was written, but the type is complete now, lay out the decl now. */ if (DECL_SIZE (exp) == 0 diff --git a/gcc/omp-low.c b/gcc/omp-low.c index 40f2003..73666d4 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -2061,7 +2061,19 @@ scan_sharing_clauses (tree clauses, omp_context *ctx, if (OMP_CLAUSE_PRIVATE_OUTER_REF (c)) goto do_private; else if (!is_variable_sized (decl)) - install_var_local (decl, ctx); + { + tree new_decl = install_var_local (decl, ctx); + /* FIXME: The "oacc gangprivate" attribute conflicts with + the privatization of acc loops. Remove that attribute, + if present. */ + if (!is_oacc_parallel (ctx)) + { + tree attributes = DECL_ATTRIBUTES (new_decl); + attributes = remove_attribute ("oacc gangprivate", + attributes); + DECL_ATTRIBUTES (new_decl) = attributes; + } + } break; case OMP_CLAUSE_SHARED: diff --git a/gcc/target.def b/gcc/target.def index bf8b7d8..c25f30b 100644 --- a/gcc/target.def +++ b/gcc/target.def @@ -1689,6 +1689,16 @@ for allocating any storage for reductions when necessary.", void, (gcall *call), default_goacc_reduction) +DEFHOOK +(expand_accel_var, +"This hook, if defined, is used by accelerator target back-ends to expand\n\ +specially handled kinds of VAR_DECL expressions. A particular use is to\n\ +place variables with specific attributes inside special accelarator\n\ +memories. A return value of NULL indicates that the target does not\n\ +handle this VAR_DECL, and normal RTL expanding is resumed.", +rtx, (tree var), +NULL) + HOOK_VECTOR_END (goacc) /* Functions relating to vectorization. */ diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/gang-private-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/gang-private-1.c new file mode 100644 index 0000000..40f8b91 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/gang-private-1.c @@ -0,0 +1,38 @@ +#include <assert.h> + +int main (void) +{ + int ret; + + #pragma acc parallel num_gangs(1) num_workers(32) copyout(ret) + { + int w = 0; + + #pragma acc loop worker + for (int i = 0; i < 32; i++) + { + #pragma acc atomic update + w++; + } + + ret = (w == 32); + } + assert (ret); + + #pragma acc parallel num_gangs(1) vector_length(32) copyout(ret) + { + int v = 0; + + #pragma acc loop vector + for (int i = 0; i < 32; i++) + { + #pragma acc atomic update + v++; + } + + ret = (v == 32); + } + assert (ret); + + return 0; +}