This patch, which is largely implemented by Chung-Lin, is a first step
towards teaching the c and c++ FEs how to allocate shared memory for
gang local variables. E.g.

  #pragma acc parallel
  {
    int some_array[N], some_var;

Both some_array and some_var will be stored in shared memory with this
patch.

Shared memory is allocated for local variables in a similar fashion to
worker reductions. The nvptx BE maintains a global __gangprivate_shared
variable for all of the local variables that require shared memory.
During RTL expansion, decls are checked for an "oacc gangprivate"
attribute, then those decls are remapped to a pointer within
__gangprivate_shared via the new expand_accel_var target hook. That hook
is also responsible for reserving shared memory for each decl in the
offloaded program. The c and c++ FEs attach "oacc gangprivate"
attributes to decls immediately after they process OpenACC kernels and
parallel regions.

This implementation still has a number of limitations, which will be
addressed in follow up patches at some later date:

 * Currently variables in private clauses inside acc loops will not
   utilize shared memory.
 * OpenACC routines don't use shared memory, except for reductions and
   worker state propagation.
 * Variables local to worker loops don't use shared memory.
 * Variables local to automatically partitioned gang and worker loops
   don't use shared memory.
 * Shared memory is allocated globally, not locally on a per-function
   basis. We're not sure if that matters though.

This patch has been applied to gomp-4_0-branch.

Cesar
2017-02-27  Chung-Lin Tang  <clt...@codesourcery.com>
	    Cesar Philippidis  <ce...@codesourcery.com>

	gcc/c/
	* c-parser.c (mark_vars_oacc_gangprivate): New function.
	(c_parser_oacc_kernels_parallel): Call it to mark gang local variables
	with attribute "oacc gangprivate".

	gcc/cp/
	* cp-tree.h (mark_vars_oacc_gangprivate): Declare.
	* parser.c (mark_vars_oacc_gangprivate): New function.
	(cp_parser_oacc_kernels_parallel): Call it to mark gang local variables
	with attribute "oacc gangprivate".
	* pt.c (tsubst_expr): Likewise.

	gcc/
	* config/nvptx/nvptx.c (gangprivate_shared_size): New global variable.
	(gangprivate_shared_align): Likewise.
	(gangprivate_shared_sym): Likewise.
	(gangprivate_shared_hmap): Likewise.
	(nvptx_option_override): Initialize gangprivate_shared_sym.
	(nvptx_file_end): Output gangprivate_shared_sym.
	(nvptx_goacc_expand_accel_var): New function.
	(nvptx_set_current_function): New function.
	(TARGET_SET_CURRENT_FUNCTION): Define hook.
	(TARGET_GOACC_EXPAND_ACCEL): Likewise.
	* doc/tm.texi (TARGET_GOACC_EXPAND_ACCEL_VAR): Document new hook.
	* doc/tm.texi.in (TARGET_GOACC_EXPAND_ACCEL_VAR): Likewise.
	* expr.c (expand_expr_real_1): Remap decls marked with the
	"oacc gangprivate" atttribute.
	* omp-low.c (scan_sharing_clauses): Strip out any "oacc gangprivate"
	attributes from acc loop private clauses.
	* target.def (expand_accel_var): New hook.

	libgomp/
	* testsuite/libgomp.oacc-c-c++-common/gang-private-1.c: New test.


diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
index 3f994e3..728c31b 100644
--- a/gcc/c/c-parser.c
+++ b/gcc/c/c-parser.c
@@ -14086,6 +14086,32 @@ c_parser_oacc_loop (location_t loc, c_parser *parser, char *p_name,
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_WAIT) )
 
 static tree
+mark_vars_oacc_gangprivate (tree *tp,
+			    int *walk_subtrees ATTRIBUTE_UNUSED,
+			    void *data ATTRIBUTE_UNUSED)
+{
+  /* We back away from nested OpenACC non-gang loop directives.  */
+  if (TREE_CODE (*tp) == OACC_LOOP
+      && find_omp_clause (OMP_FOR_CLAUSES (*tp), OMP_CLAUSE_GANG) == NULL_TREE)
+    {
+      return *tp;
+    }
+  if (TREE_CODE (*tp) == BIND_EXPR)
+    {
+      tree block = BIND_EXPR_BLOCK (*tp);
+      for (tree var = BLOCK_VARS (block); var; var = DECL_CHAIN (var))
+	{
+	  gcc_assert (TREE_CODE (var) == VAR_DECL);
+	  DECL_ATTRIBUTES (var)
+	    = tree_cons (get_identifier ("oacc gangprivate"),
+			 NULL, DECL_ATTRIBUTES (var));
+	  c_mark_addressable (var);
+	}
+    }
+  return NULL;
+}
+
+static tree
 c_parser_oacc_kernels_parallel (location_t loc, c_parser *parser,
 				enum pragma_kind p_kind, char *p_name,
 				bool *if_p)
@@ -14119,7 +14145,9 @@ c_parser_oacc_kernels_parallel (location_t loc, c_parser *parser,
 	  tree block = c_begin_omp_parallel ();
 	  tree clauses;
 	  c_parser_oacc_loop (loc, parser, p_name, mask, &clauses, if_p);
-	  return c_finish_omp_construct (loc, code, block, clauses);
+	  block = c_finish_omp_construct (loc, code, block, clauses);
+	  walk_tree_1 (&block, mark_vars_oacc_gangprivate, NULL, NULL, NULL);
+	  return block;
 	}
     }
 
@@ -14128,7 +14156,9 @@ c_parser_oacc_kernels_parallel (location_t loc, c_parser *parser,
   tree block = c_begin_omp_parallel ();
   add_stmt (c_parser_omp_structured_block (parser, if_p));
 
-  return c_finish_omp_construct (loc, code, block, clauses);
+  block = c_finish_omp_construct (loc, code, block, clauses);
+  walk_tree_1 (&block, mark_vars_oacc_gangprivate, NULL, NULL, NULL);
+  return block;
 }
 
 /* OpenACC 2.0:
diff --git a/gcc/config/nvptx/nvptx.c b/gcc/config/nvptx/nvptx.c
index a9822e268..f790728 100644
--- a/gcc/config/nvptx/nvptx.c
+++ b/gcc/config/nvptx/nvptx.c
@@ -66,6 +66,7 @@
 #include "tree-phinodes.h"
 #include "cfgloop.h"
 #include "fold-const.h"
+#include "tree-hash-traits.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -136,6 +137,12 @@ static unsigned worker_red_size;
 static unsigned worker_red_align;
 static GTY(()) rtx worker_red_sym;
 
+/* Shared memory block for gang-private variables.  */
+static unsigned gangprivate_shared_size;
+static unsigned gangprivate_shared_align;
+static GTY(()) rtx gangprivate_shared_sym;
+static hash_map<tree_decl_hash, unsigned int> gangprivate_shared_hmap;
+
 /* Global lock variable, needed for 128bit worker & gang reductions.  */
 static GTY(()) tree global_lock_var;
 
@@ -167,7 +174,7 @@ nvptx_option_override (void)
   needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
   declared_libfuncs_htab
     = hash_table<declared_libfunc_hasher>::create_ggc (17);
-
+  
   worker_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_bcast");
   SET_SYMBOL_DATA_AREA (worker_bcast_sym, DATA_AREA_SHARED);
   worker_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
@@ -175,6 +182,11 @@ nvptx_option_override (void)
   worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
   SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
   worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
+
+  gangprivate_shared_sym
+    = gen_rtx_SYMBOL_REF (Pmode, "__gangprivate_shared");
+  SET_SYMBOL_DATA_AREA (gangprivate_shared_sym, DATA_AREA_SHARED);
+  gangprivate_shared_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
 }
 
 /* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
@@ -4048,6 +4060,10 @@ nvptx_file_end (void)
   if (worker_red_size)
     write_worker_buffer (asm_out_file, worker_red_sym,
 			 worker_red_align, worker_red_size);
+
+  if (gangprivate_shared_size)
+    write_worker_buffer (asm_out_file, gangprivate_shared_sym,
+			 gangprivate_shared_align, gangprivate_shared_size);
 }
 
 /* Expander for the shuffle builtins.  */
@@ -5073,6 +5089,47 @@ nvptx_goacc_reduction (gcall *call)
     }
 }
 
+static rtx
+nvptx_goacc_expand_accel_var (tree var)
+{
+  if (TREE_CODE (var) == VAR_DECL
+      && lookup_attribute ("oacc gangprivate", DECL_ATTRIBUTES (var)))
+    {
+      unsigned int offset, *poffset;
+      poffset = gangprivate_shared_hmap.get (var);
+      if (poffset)
+	offset = *poffset;
+      else
+	{
+	  unsigned HOST_WIDE_INT align = DECL_ALIGN (var);
+	  gangprivate_shared_size =
+	    (gangprivate_shared_size + align - 1) & ~(align - 1);
+	  if (gangprivate_shared_align < align)
+	    gangprivate_shared_align = align;
+
+	  offset = gangprivate_shared_size;
+	  bool existed = gangprivate_shared_hmap.put (var, offset);
+	  gcc_assert (!existed);
+	  gangprivate_shared_size += tree_to_uhwi (DECL_SIZE_UNIT (var));
+	}
+      rtx addr = plus_constant (Pmode, gangprivate_shared_sym, offset);
+      return gen_rtx_MEM (TYPE_MODE (TREE_TYPE (var)), addr);
+    }
+  return NULL_RTX;
+}
+
+static GTY(()) tree nvptx_previous_fndecl;
+
+static void
+nvptx_set_current_function (tree fndecl)
+{
+  if (!fndecl || fndecl == nvptx_previous_fndecl)
+    return;
+  
+  gangprivate_shared_hmap.empty ();
+  nvptx_previous_fndecl = fndecl;
+}
+
 #undef TARGET_OPTION_OVERRIDE
 #define TARGET_OPTION_OVERRIDE nvptx_option_override
 
@@ -5169,6 +5226,9 @@ nvptx_goacc_reduction (gcall *call)
 #undef  TARGET_BUILTIN_DECL
 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
 
+#undef TARGET_SET_CURRENT_FUNCTION
+#define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
+
 #undef TARGET_GOACC_VALIDATE_DIMS
 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
 
@@ -5181,6 +5241,9 @@ nvptx_goacc_reduction (gcall *call)
 #undef TARGET_GOACC_REDUCTION
 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
 
+#undef TARGET_GOACC_EXPAND_ACCEL_VAR
+#define TARGET_GOACC_EXPAND_ACCEL_VAR nvptx_goacc_expand_accel_var
+
 struct gcc_target targetm = TARGET_INITIALIZER;
 
 #include "gt-nvptx.h"
diff --git a/gcc/cp/cp-tree.h b/gcc/cp/cp-tree.h
index 8a635ba..7bd337a 100644
--- a/gcc/cp/cp-tree.h
+++ b/gcc/cp/cp-tree.h
@@ -6015,6 +6015,8 @@ extern bool maybe_clone_body			(tree);
 extern tree cp_convert_range_for (tree, tree, tree, bool);
 extern bool parsing_nsdmi (void);
 extern void inject_this_parameter (tree, cp_cv_quals);
+extern tree mark_vars_oacc_gangprivate (tree *, int *, void *);
+
 
 /* in pt.c */
 extern bool check_template_shadow		(tree);
diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c
index ddb0ab1..6dcc099 100644
--- a/gcc/cp/parser.c
+++ b/gcc/cp/parser.c
@@ -35757,6 +35757,34 @@ cp_parser_oacc_loop (cp_parser *parser, cp_token *pragma_tok, char *p_name,
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_VECTOR_LENGTH)	\
 	| (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_WAIT))
 
+tree
+mark_vars_oacc_gangprivate (tree *tp,
+			    int *walk_subtrees ATTRIBUTE_UNUSED,
+			    void *data ATTRIBUTE_UNUSED)
+{
+  /* We back away from nested OpenACC non-gang loop directives.  */
+  if (TREE_CODE (*tp) == OACC_LOOP
+      && find_omp_clause (OMP_FOR_CLAUSES (*tp), OMP_CLAUSE_GANG) == NULL_TREE)
+    {
+      return *tp;
+    }
+  if (TREE_CODE (*tp) == BIND_EXPR)
+    {
+      tree block = BIND_EXPR_BLOCK (*tp);
+      if (block == NULL)
+	return NULL;
+      for (tree var = BLOCK_VARS (block); var; var = DECL_CHAIN (var))
+	{
+	  gcc_assert (TREE_CODE (var) == VAR_DECL);
+	  DECL_ATTRIBUTES (var)
+	    = tree_cons (get_identifier ("oacc gangprivate"),
+			 NULL, DECL_ATTRIBUTES (var));
+	  cxx_mark_addressable (var);
+	}
+    }
+  return NULL;
+}
+
 static tree
 cp_parser_oacc_kernels_parallel (cp_parser *parser, cp_token *pragma_tok,
 				 char *p_name, bool *if_p)
@@ -35793,7 +35821,9 @@ cp_parser_oacc_kernels_parallel (cp_parser *parser, cp_token *pragma_tok,
 	  tree stmt = cp_parser_oacc_loop (parser, pragma_tok, p_name, mask,
 					   &clauses, if_p);
 	  protected_set_expr_location (stmt, pragma_tok->location);
-	  return finish_omp_construct (code, block, clauses);
+	  block =  finish_omp_construct (code, block, clauses);
+	  walk_tree_1 (&block, mark_vars_oacc_gangprivate, NULL, NULL, NULL);
+	  return block;
 	}
     }
 
@@ -35804,7 +35834,9 @@ cp_parser_oacc_kernels_parallel (cp_parser *parser, cp_token *pragma_tok,
   unsigned int save = cp_parser_begin_omp_structured_block (parser);
   cp_parser_statement (parser, NULL_TREE, false, if_p);
   cp_parser_end_omp_structured_block (parser, save);
-  return finish_omp_construct (code, block, clauses);
+  block = finish_omp_construct (code, block, clauses);
+  walk_tree_1 (&block, mark_vars_oacc_gangprivate, NULL, NULL, NULL);
+  return block;
 }
 
 /* OpenACC 2.0:
diff --git a/gcc/cp/pt.c b/gcc/cp/pt.c
index 2e13a01..56758d6 100644
--- a/gcc/cp/pt.c
+++ b/gcc/cp/pt.c
@@ -15530,6 +15530,7 @@ tsubst_expr (tree t, tree args, tsubst_flags_t complain, tree in_decl,
       stmt = begin_omp_parallel ();
       RECUR (OMP_BODY (t));
       finish_omp_construct (TREE_CODE (t), stmt, tmp);
+      walk_tree_1 (&OMP_BODY (t), mark_vars_oacc_gangprivate, NULL, NULL, NULL);
       break;
 
     case OMP_PARALLEL:
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 3de3554..0ab7231 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -5801,6 +5801,14 @@ expanded sequence has been inserted.  This hook is also responsible
 for allocating any storage for reductions when necessary.
 @end deftypefn
 
+@deftypefn {Target Hook} rtx TARGET_GOACC_EXPAND_ACCEL_VAR (tree @var{var})
+This hook, if defined, is used by accelerator target back-ends to expand
+specially handled kinds of VAR_DECL expressions.  A particular use is to
+place variables with specific attributes inside special accelarator
+memories.  A return value of NULL indicates that the target does not
+handle this VAR_DECL, and normal RTL expanding is resumed.
+@end deftypefn
+
 @node Anchored Addresses
 @section Anchored Addresses
 @cindex anchored addresses
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index f31c763..3b66a1d 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4271,6 +4271,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_GOACC_REDUCTION
 
+@hook TARGET_GOACC_EXPAND_ACCEL_VAR
+
 @node Anchored Addresses
 @section Anchored Addresses
 @cindex anchored addresses
diff --git a/gcc/expr.c b/gcc/expr.c
index 70540f0..79e7ce5 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -9591,8 +9591,19 @@ expand_expr_real_1 (tree exp, rtx target, machine_mode tmode,
       exp = SSA_NAME_VAR (ssa_name);
       goto expand_decl_rtl;
 
-    case PARM_DECL:
     case VAR_DECL:
+      /* Allow accel compiler to handle specific cases of variables,
+	 specifically those tagged with the "oacc gangprivate" attribute,
+	 which may intended to be placed in special memory in GPUs.  */
+      if (flag_openacc && targetm.goacc.expand_accel_var)
+	{
+	  temp = targetm.goacc.expand_accel_var (exp);
+	  if (temp)
+	    return temp;
+	}
+      /* ... fall through ...  */
+
+    case PARM_DECL:
       /* If a static var's type was incomplete when the decl was written,
 	 but the type is complete now, lay out the decl now.  */
       if (DECL_SIZE (exp) == 0
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 40f2003..73666d4 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -2061,7 +2061,19 @@ scan_sharing_clauses (tree clauses, omp_context *ctx,
 	  if (OMP_CLAUSE_PRIVATE_OUTER_REF (c))
 	    goto do_private;
 	  else if (!is_variable_sized (decl))
-	    install_var_local (decl, ctx);
+	    {
+	      tree new_decl = install_var_local (decl, ctx);
+	      /* FIXME: The "oacc gangprivate" attribute conflicts with
+		 the privatization of acc loops.  Remove that attribute,
+		 if present.  */
+	      if (!is_oacc_parallel (ctx))
+		{
+		  tree attributes = DECL_ATTRIBUTES (new_decl);
+		  attributes = remove_attribute ("oacc gangprivate",
+						 attributes);
+		  DECL_ATTRIBUTES (new_decl) = attributes;
+		}
+	    }
 	  break;
 
 	case OMP_CLAUSE_SHARED:
diff --git a/gcc/target.def b/gcc/target.def
index bf8b7d8..c25f30b 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1689,6 +1689,16 @@ for allocating any storage for reductions when necessary.",
 void, (gcall *call),
 default_goacc_reduction)
 
+DEFHOOK
+(expand_accel_var,
+"This hook, if defined, is used by accelerator target back-ends to expand\n\
+specially handled kinds of VAR_DECL expressions.  A particular use is to\n\
+place variables with specific attributes inside special accelarator\n\
+memories.  A return value of NULL indicates that the target does not\n\
+handle this VAR_DECL, and normal RTL expanding is resumed.",
+rtx, (tree var),
+NULL)
+
 HOOK_VECTOR_END (goacc)
 
 /* Functions relating to vectorization.  */
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/gang-private-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/gang-private-1.c
new file mode 100644
index 0000000..40f8b91
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/gang-private-1.c
@@ -0,0 +1,38 @@
+#include <assert.h>
+
+int main (void)
+{
+  int ret;
+  
+  #pragma acc parallel num_gangs(1) num_workers(32) copyout(ret)
+  {
+    int w = 0;
+    
+    #pragma acc loop worker
+    for (int i = 0; i < 32; i++)
+      {
+        #pragma acc atomic update
+	w++;
+      }
+
+    ret = (w == 32);
+  }
+  assert (ret);
+  
+  #pragma acc parallel num_gangs(1) vector_length(32) copyout(ret)
+  {
+    int v = 0;
+
+    #pragma acc loop vector
+    for (int i = 0; i < 32; i++)
+      {
+        #pragma acc atomic update
+	v++;
+      }
+
+    ret = (v == 32);
+  }
+  assert (ret);
+
+  return 0;
+}

Reply via email to