I've committed this to gomp4. It gets partitioned routines working by surrounding the call by fork & join instructions such that the right number of active threads get to the call.

nathan
2015-08-22  Nathan Sidwell  <nat...@codesourcery.com>

	gcc/
	* omp-low.c (build_oacc_routine_dims): Size non-partitioned
	levels to 1.
	* config/nvptx/nvptx.c (nvptx_expand_call): Move later, emit
	forking and joining instructions.
	(nvptx_process_pars): Cope with both Worker and Vector being set.

	libgomp/
	*  testsuite/libgomp.oacc-c-c++-common/routine-vec-1.c: New.
	*  testsuite/libgomp.oacc-c-c++-common/routine-work-1.c: New.

Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c	(revision 227089)
+++ gcc/omp-low.c	(working copy)
@@ -9374,7 +9374,10 @@ set_oacc_fn_attrib (tree fn, tree clause
     (dynamic).  TREE_PURPOSE is set to indicate whether that dimension
     can have a loop partitioned on it.  non-zero indicates
     yes, zero indicates no.  By construction once a non-zero has been
-    reached, further inner dimensions must also be non-zero.  */
+    reached, further inner dimensions must also be non-zero.  We set
+    TREE_VALUE to zero for the dimensions that may be partitioned and
+    1 for the other ones -- if a loop is (erroneously) spawned at
+    an outer level, we don't want to try and partition it.  */
 
 tree
 build_oacc_routine_dims (tree clauses)
@@ -9404,7 +9407,7 @@ build_oacc_routine_dims (tree clauses)
 
   for (ix = GOMP_DIM_MAX; ix--;)
     dims = tree_cons (build_int_cst (boolean_type_node, ix >= level),
-		      integer_zero_node, dims);
+		      build_int_cst (integer_type_node, ix < level), dims);
 
   return dims;
 }
Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c	(revision 227089)
+++ gcc/config/nvptx/nvptx.c	(working copy)
@@ -840,127 +840,6 @@ nvptx_end_call_args (void)
   free_EXPR_LIST_list (&cfun->machine->call_args);
 }
 
-/* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
-   track of whether calls involving static chains or varargs were seen
-   in the current function.
-   For libcalls, maintain a hash table of decls we have seen, and
-   record a function decl for later when encountering a new one.  */
-
-void
-nvptx_expand_call (rtx retval, rtx address)
-{
-  int nargs = 0;
-  rtx callee = XEXP (address, 0);
-  rtx pat, t;
-  rtvec vec;
-  bool external_decl = false;
-  rtx varargs = NULL_RTX;
-  tree decl_type = NULL_TREE;
-  unsigned parallel = 0;
-
-  for (t = cfun->machine->call_args; t; t = XEXP (t, 1))
-    nargs++;
-
-  if (!call_insn_operand (callee, Pmode))
-    {
-      callee = force_reg (Pmode, callee);
-      address = change_address (address, QImode, callee);
-    }
-
-  if (GET_CODE (callee) == SYMBOL_REF)
-    {
-      tree decl = SYMBOL_REF_DECL (callee);
-      if (decl != NULL_TREE)
-	{
-	  decl_type = TREE_TYPE (decl);
-	  if (DECL_STATIC_CHAIN (decl))
-	    cfun->machine->has_call_with_sc = true;
-	  if (DECL_EXTERNAL (decl))
-	    external_decl = true;
-	  tree attr = get_oacc_fn_attrib (decl);
-	  if (attr)
-	    {
-	      tree dims = TREE_VALUE (attr);
-
-	      for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
-		{
-		  if (TREE_PURPOSE (dims)
-		      && !integer_zerop (TREE_PURPOSE (dims)))
-		    {
-		      parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
-		      if (ix)
-			parallel ^= GOMP_DIM_MASK (ix - 1) - 1;
-		      break;
-		    }
-		  dims = TREE_CHAIN (dims);
-		}
-	    }
-	}
-    }
-
-  if (cfun->machine->funtype
-      /* It's possible to construct testcases where we call a variable.
-	 See compile/20020129-1.c.  stdarg_p will crash so avoid calling it
-	 in such a case.  */
-      && (TREE_CODE (cfun->machine->funtype) == FUNCTION_TYPE
-	  || TREE_CODE (cfun->machine->funtype) == METHOD_TYPE)
-      && stdarg_p (cfun->machine->funtype))
-    {
-      varargs = gen_reg_rtx (Pmode);
-      if (Pmode == DImode)
-	emit_move_insn (varargs, stack_pointer_rtx);
-      else
-	emit_move_insn (varargs, stack_pointer_rtx);
-      cfun->machine->has_call_with_varargs = true;
-    }
-  vec = rtvec_alloc (nargs + 1 + (varargs ? 1 : 0));
-  pat = gen_rtx_PARALLEL (VOIDmode, vec);
-
-  int vec_pos = 0;
-  
-  rtx tmp_retval = retval;
-  t = gen_rtx_CALL (VOIDmode, address, const0_rtx);
-  if (retval != NULL_RTX)
-    {
-      if (!nvptx_register_operand (retval, GET_MODE (retval)))
-	tmp_retval = gen_reg_rtx (GET_MODE (retval));
-      t = gen_rtx_SET (tmp_retval, t);
-    }
-  XVECEXP (pat, 0, vec_pos++) = t;
-
-  /* Construct the call insn, including a USE for each argument pseudo
-     register.  These will be used when printing the insn.  */
-  for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
-    {
-      rtx this_arg = XEXP (arg, 0);
-      XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, this_arg);
-    }
-
-  if (varargs)
-      XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
-
-  gcc_assert (vec_pos = XVECLEN (pat, 0));
-
-  /* If this is a libcall, decl_type is NULL. For a call to a non-libcall
-     undeclared function, we'll have an external decl without arg types.
-     In either case we have to try to construct a ptx declaration from one of
-     the calls to the function.  */
-  if (!REG_P (callee)
-      && (decl_type == NULL_TREE
-	  || (external_decl && TYPE_ARG_TYPES (decl_type) == NULL_TREE)))
-    {
-      rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
-      if (*slot == NULL)
-	{
-	  *slot = callee;
-	  write_func_decl_from_insn (func_decls, retval, pat, callee);
-	}
-    }
-  emit_call_insn (pat);
-  if (tmp_retval != retval)
-    emit_move_insn (retval, tmp_retval);
-}
-
 /* Implement TARGET_FUNCTION_ARG.  */
 
 static rtx
@@ -1200,6 +1079,129 @@ nvptx_emit_joining (unsigned mask)
     }
 }
 
+/* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
+   track of whether calls involving static chains or varargs were seen
+   in the current function.
+   For libcalls, maintain a hash table of decls we have seen, and
+   record a function decl for later when encountering a new one.  */
+
+void
+nvptx_expand_call (rtx retval, rtx address)
+{
+  int nargs = 0;
+  rtx callee = XEXP (address, 0);
+  rtx pat, t;
+  rtvec vec;
+  bool external_decl = false;
+  rtx varargs = NULL_RTX;
+  tree decl_type = NULL_TREE;
+  unsigned parallel = 0;
+
+  for (t = cfun->machine->call_args; t; t = XEXP (t, 1))
+    nargs++;
+
+  if (!call_insn_operand (callee, Pmode))
+    {
+      callee = force_reg (Pmode, callee);
+      address = change_address (address, QImode, callee);
+    }
+
+  if (GET_CODE (callee) == SYMBOL_REF)
+    {
+      tree decl = SYMBOL_REF_DECL (callee);
+      if (decl != NULL_TREE)
+	{
+	  decl_type = TREE_TYPE (decl);
+	  if (DECL_STATIC_CHAIN (decl))
+	    cfun->machine->has_call_with_sc = true;
+	  if (DECL_EXTERNAL (decl))
+	    external_decl = true;
+	  tree attr = get_oacc_fn_attrib (decl);
+	  if (attr)
+	    {
+	      tree dims = TREE_VALUE (attr);
+
+	      parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
+	      for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
+		{
+		  if (TREE_PURPOSE (dims)
+		      && !integer_zerop (TREE_PURPOSE (dims)))
+		    break;
+		  /* Not on this axis.  */
+		  parallel ^= GOMP_DIM_MASK (ix);
+		  dims = TREE_CHAIN (dims);
+		}
+	    }
+	}
+    }
+
+  nvptx_emit_forking (parallel);
+
+  if (cfun->machine->funtype
+      /* It's possible to construct testcases where we call a variable.
+	 See compile/20020129-1.c.  stdarg_p will crash so avoid calling it
+	 in such a case.  */
+      && (TREE_CODE (cfun->machine->funtype) == FUNCTION_TYPE
+	  || TREE_CODE (cfun->machine->funtype) == METHOD_TYPE)
+      && stdarg_p (cfun->machine->funtype))
+    {
+      varargs = gen_reg_rtx (Pmode);
+      if (Pmode == DImode)
+	emit_move_insn (varargs, stack_pointer_rtx);
+      else
+	emit_move_insn (varargs, stack_pointer_rtx);
+      cfun->machine->has_call_with_varargs = true;
+    }
+  vec = rtvec_alloc (nargs + 1 + (varargs ? 1 : 0));
+  pat = gen_rtx_PARALLEL (VOIDmode, vec);
+
+  int vec_pos = 0;
+  
+  rtx tmp_retval = retval;
+  t = gen_rtx_CALL (VOIDmode, address, const0_rtx);
+  if (retval != NULL_RTX)
+    {
+      if (!nvptx_register_operand (retval, GET_MODE (retval)))
+	tmp_retval = gen_reg_rtx (GET_MODE (retval));
+      t = gen_rtx_SET (tmp_retval, t);
+    }
+  XVECEXP (pat, 0, vec_pos++) = t;
+
+  /* Construct the call insn, including a USE for each argument pseudo
+     register.  These will be used when printing the insn.  */
+  for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
+    {
+      rtx this_arg = XEXP (arg, 0);
+      XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, this_arg);
+    }
+
+  if (varargs)
+      XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
+
+  gcc_assert (vec_pos = XVECLEN (pat, 0));
+
+  /* If this is a libcall, decl_type is NULL. For a call to a non-libcall
+     undeclared function, we'll have an external decl without arg types.
+     In either case we have to try to construct a ptx declaration from one of
+     the calls to the function.  */
+  if (!REG_P (callee)
+      && (decl_type == NULL_TREE
+	  || (external_decl && TYPE_ARG_TYPES (decl_type) == NULL_TREE)))
+    {
+      rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
+      if (*slot == NULL)
+	{
+	  *slot = callee;
+	  write_func_decl_from_insn (func_decls, retval, pat, callee);
+	}
+    }
+  emit_call_insn (pat);
+  if (tmp_retval != retval)
+    emit_move_insn (retval, tmp_retval);
+
+  nvptx_emit_joining (parallel);
+}
+
 /* Expand the oacc fork & join primitive into ptx-required unspecs.  */
 
 void
@@ -3010,30 +3012,17 @@ nvptx_process_pars (parallel *par)
       par->inner_mask = nvptx_process_pars (par->inner);
       inner_mask |= par->inner_mask;
     }
-  
-  switch (par->mask)
-    {
-    case 0:
-      /* Dummy parallel.  */
-      break;
 
-    case GOMP_DIM_MASK (GOMP_DIM_VECTOR):
-      nvptx_vpropagate (par->forked_block, par->forked_insn);
-      break;
-      
-    case GOMP_DIM_MASK (GOMP_DIM_WORKER):
-      {
-	nvptx_wpropagate (false, par->forked_block,
-			  par->forked_insn);
-	nvptx_wpropagate (true, par->forked_block, par->fork_insn);
-	/* Insert begin and end synchronizations.  */
-	emit_insn_after (nvptx_wsync (false), par->forked_insn);
-	emit_insn_before (nvptx_wsync (true), par->joining_insn);
-      }
-      break;
-
-    default:gcc_unreachable ();
+  if (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+    {
+      nvptx_wpropagate (false, par->forked_block, par->forked_insn);
+      nvptx_wpropagate (true, par->forked_block, par->fork_insn);
+      /* Insert begin and end synchronizations.  */
+      emit_insn_after (nvptx_wsync (false), par->forked_insn);
+      emit_insn_before (nvptx_wsync (true), par->joining_insn);
     }
+  else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
+    nvptx_vpropagate (par->forked_block, par->forked_insn);
 
   /* Now do siblings.  */
   if (par->next)
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/routine-work-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/routine-work-1.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/routine-work-1.c	(revision 0)
@@ -0,0 +1,53 @@
+/* { dg-do run } */
+/* { dg-additional-options "-O1" } */
+
+#include <stdio.h>
+#include <openacc.h>
+
+#define WORK_ID(I,N)						\
+  (acc_on_device (acc_device_nvidia)				\
+   ? ({unsigned __r;						\
+       __asm__ volatile ("mov.u32 %0,%%tid.y;" : "=r" (__r));	\
+       __r; }) : (I % N))
+
+#pragma acc routine worker
+void Work (int *ptr, int lim, int N)
+{
+#pragma acc loop worker
+  for (int i = 0; i < lim; i++)
+    ptr[i] = WORK_ID(i, N);
+}
+
+#define LEN 32
+
+int DoWork (int err, int N)
+{
+  int ary[LEN];
+
+  for (int ix = 0; ix != LEN; ix++)
+    ary[ix] = 0xdeadbeef;
+  
+#pragma acc parallel num_workers(N) copy (ary)
+  {
+    Work (ary, LEN, N);
+  }
+
+  for (int ix = 0; ix != LEN; ix++)
+    if (ary[ix] != ix % N)
+      {
+	printf ("ary[%d] = %d expected %d\n", ix, ary[ix], ix % N);
+	err = 1;
+      }
+  return err;
+}
+
+
+int main ()
+{
+  int err = 0;
+
+  for (int W = 1; W <= LEN; W <<= 1)
+    err = DoWork (err, W);
+
+  return err;
+}
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/routine-vec-1.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/routine-vec-1.c	(revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/routine-vec-1.c	(revision 0)
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-additional-options "-O1" } */
+
+#include <stdio.h>
+#include <openacc.h>
+
+#define VEC_ID(I, N)						\
+  (acc_on_device (acc_device_nvidia)				\
+   ? ({unsigned __r;						\
+       __asm__ volatile ("mov.u32 %0,%%tid.x;" : "=r" (__r));	\
+       __r; }) : (I % N))
+
+#pragma acc routine vector
+void Vec (int *ptr, int lim, int N)
+{
+#pragma acc loop vector
+  for (int i = 0; i < lim; i++)
+    ptr[i] = VEC_ID(i, N);
+}
+
+#define LEN 32
+
+int main ()
+{
+  int ary[LEN];
+  int err = 0;
+
+  for (int ix = 0; ix != LEN; ix++)
+    ary[ix] = 0xdeadbeef;
+  
+#pragma acc parallel vector_length(32) copy (ary)
+  {
+    Vec (ary, LEN, 32);
+  }
+
+  for (int ix = 0; ix != LEN; ix++)
+    {
+      if (ary[ix] != ix % 32)
+	{
+	  printf ("ary[%d] = %d expected %d\n", ix, ary[ix], ix % 32);
+	  err = 1;
+	}
+    }
+
+  return err;
+}

Reply via email to