I've committed this to gomp4 to reduce divergence from trunk.
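
For anyone reading the moved code out of context: nvptx_emit_forking and
nvptx_emit_joining take the partitioning mask, keep only the worker and
vector axes, and tack a call flag on just above GOMP_DIM_MAX before emitting
the fork/join unspecs.  A rough standalone sketch of that operand arithmetic
follows (the enum values and GOMP_DIM_MASK are assumed to mirror
include/gomp-constants.h, and fork_join_operand is a made-up helper for
illustration, not part of the patch):

/* Sketch of the operand computed by nvptx_emit_forking/joining.  */
#include <stdio.h>

enum { GOMP_DIM_GANG, GOMP_DIM_WORKER, GOMP_DIM_VECTOR, GOMP_DIM_MAX };
#define GOMP_DIM_MASK(X) (1u << (X))

/* Hypothetical helper: return the integer operand passed to the
   fork/forked/joining/join unspecs, or 0 if nothing is emitted.  */
static unsigned
fork_join_operand (unsigned mask, int is_call)
{
  /* Only worker and vector partitioning generate fork/join markers.  */
  mask &= GOMP_DIM_MASK (GOMP_DIM_WORKER) | GOMP_DIM_MASK (GOMP_DIM_VECTOR);
  if (!mask)
    return 0;
  /* Calls set an extra bit just above the dimension bits.  */
  return mask | ((unsigned) is_call << GOMP_DIM_MAX);
}

int
main (void)
{
  /* A call partitioned over worker and vector: 0x2 | 0x4 | 0x8 = 0xe.  */
  unsigned op = fork_join_operand (GOMP_DIM_MASK (GOMP_DIM_WORKER)
                                   | GOMP_DIM_MASK (GOMP_DIM_VECTOR), 1);
  printf ("0x%x\n", op);
  return 0;
}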

nathan
2015-09-10  Nathan Sidwell  <nat...@codesourcery.com>

        * config/nvptx/nvptx.c: Move df.h include earlier.
        (nvptx_emit_forking, nvptx_emit_joining): Move earlier.
        (nvptx_expand_call): Restore to earlier position in file.  Emit
        joining before copying back the return value.
        (nvptx_reorg_subreg): Pass PATTERN to asm_noperands.
        (nvptx_reorg): Expand some comments.

Index: gcc/config/nvptx/nvptx.c
===================================================================
--- gcc/config/nvptx/nvptx.c    (revision 227665)
+++ gcc/config/nvptx/nvptx.c    (working copy)
@@ -26,6 +26,7 @@
 #include "cfghooks.h"
 #include "tree.h"
 #include "rtl.h"
+#include "df.h"
 #include "alias.h"
 #include "insn-flags.h"
 #include "output.h"
@@ -56,7 +57,6 @@
 #include "cfghooks.h"
 #include "cfgloop.h"
 #include "stor-layout.h"
-#include "df.h"
 #include "dumpfile.h"
 #include "builtins.h"
 #include "dominance.h"
@@ -297,6 +297,44 @@ nvptx_split_reg_p (machine_mode mode)
   return false;
 }
 
+/* Emit forking instructions for MASK.  */
+
+static void
+nvptx_emit_forking (unsigned mask, bool is_call)
+{
+  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
+          | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
+  if (mask)
+    {
+      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
+      
+      /* Emit fork for worker level.  */
+      if (!is_call && mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+       emit_insn (gen_nvptx_fork (op));
+      emit_insn (gen_nvptx_forked (op));
+    }
+}
+
+/* Emit joining instructions for MASK.  */
+
+static void
+nvptx_emit_joining (unsigned mask, bool is_call)
+{
+  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
+          | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
+  if (mask)
+    {
+      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
+
+      /* Emit joining for all non-call pars to ensure there's a single
+        predecessor for the block the join insn ends up in.  This is
+        needed for skipping entire loops.  */
+      if (!is_call)
+       emit_insn (gen_nvptx_joining (op));
+      emit_insn (gen_nvptx_join (op));
+    }
+}
+
 #define PASS_IN_REG_P(MODE, TYPE)                              \
   ((GET_MODE_CLASS (MODE) == MODE_INT                          \
     || GET_MODE_CLASS (MODE) == MODE_FLOAT                     \
@@ -893,6 +931,128 @@ nvptx_end_call_args (void)
   free_EXPR_LIST_list (&cfun->machine->call_args);
 }
 
+/* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
+   track of whether calls involving static chains or varargs were seen
+   in the current function.
+   For libcalls, maintain a hash table of decls we have seen, and
+   record a function decl for later when encountering a new one.  */
+
+void
+nvptx_expand_call (rtx retval, rtx address)
+{
+  int nargs = 0;
+  rtx callee = XEXP (address, 0);
+  rtx pat, t;
+  rtvec vec;
+  bool external_decl = false;
+  rtx varargs = NULL_RTX;
+  tree decl_type = NULL_TREE;
+  unsigned parallel = 0;
+
+  for (t = cfun->machine->call_args; t; t = XEXP (t, 1))
+    nargs++;
+
+  if (!call_insn_operand (callee, Pmode))
+    {
+      callee = force_reg (Pmode, callee);
+      address = change_address (address, QImode, callee);
+    }
+
+  if (GET_CODE (callee) == SYMBOL_REF)
+    {
+      tree decl = SYMBOL_REF_DECL (callee);
+      if (decl != NULL_TREE)
+       {
+         decl_type = TREE_TYPE (decl);
+         if (DECL_STATIC_CHAIN (decl))
+           cfun->machine->has_call_with_sc = true;
+         if (DECL_EXTERNAL (decl))
+           external_decl = true;
+         tree attr = get_oacc_fn_attrib (decl);
+         if (attr)
+           {
+             tree dims = TREE_VALUE (attr);
+
+             parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
+             for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
+               {
+                 if (TREE_PURPOSE (dims)
+                     && !integer_zerop (TREE_PURPOSE (dims)))
+                   break;
+                 /* Not on this axis.  */
+                 parallel ^= GOMP_DIM_MASK (ix);
+                 dims = TREE_CHAIN (dims);
+               }
+           }
+       }
+    }
+
+  if (cfun->machine->funtype
+      /* It's possible to construct testcases where we call a variable.
+        See compile/20020129-1.c.  stdarg_p will crash so avoid calling it
+        in such a case.  */
+      && (TREE_CODE (cfun->machine->funtype) == FUNCTION_TYPE
+         || TREE_CODE (cfun->machine->funtype) == METHOD_TYPE)
+      && stdarg_p (cfun->machine->funtype))
+    {
+      varargs = gen_reg_rtx (Pmode);
+      if (Pmode == DImode)
+       emit_move_insn (varargs, stack_pointer_rtx);
+      else
+       emit_move_insn (varargs, stack_pointer_rtx);
+      cfun->machine->has_call_with_varargs = true;
+    }
+  vec = rtvec_alloc (nargs + 1 + (varargs ? 1 : 0));
+  pat = gen_rtx_PARALLEL (VOIDmode, vec);
+
+  int vec_pos = 0;
+  
+  rtx tmp_retval = retval;
+  t = gen_rtx_CALL (VOIDmode, address, const0_rtx);
+  if (retval != NULL_RTX)
+    {
+      if (!nvptx_register_operand (retval, GET_MODE (retval)))
+       tmp_retval = gen_reg_rtx (GET_MODE (retval));
+      t = gen_rtx_SET (tmp_retval, t);
+    }
+  XVECEXP (pat, 0, vec_pos++) = t;
+
+  /* Construct the call insn, including a USE for each argument pseudo
+     register.  These will be used when printing the insn.  */
+  for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
+    {
+      rtx this_arg = XEXP (arg, 0);
+      XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, this_arg);
+    }
+
+  if (varargs)
+      XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
+
+  gcc_assert (vec_pos = XVECLEN (pat, 0));
+
+  /* If this is a libcall, decl_type is NULL. For a call to a non-libcall
+     undeclared function, we'll have an external decl without arg types.
+     In either case we have to try to construct a ptx declaration from one of
+     the calls to the function.  */
+  if (!REG_P (callee)
+      && (decl_type == NULL_TREE
+         || (external_decl && TYPE_ARG_TYPES (decl_type) == NULL_TREE)))
+    {
+      rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
+      if (*slot == NULL)
+       {
+         *slot = callee;
+         write_func_decl_from_insn (func_decls, retval, pat, callee);
+       }
+    }
+  nvptx_emit_forking (parallel, true);
+  emit_call_insn (pat);
+  nvptx_emit_joining (parallel, true);
+
+  if (tmp_retval != retval)
+    emit_move_insn (retval, tmp_retval);
+}
+
 /* Implement TARGET_FUNCTION_ARG.  */
 
 static rtx
@@ -1097,166 +1257,6 @@ nvptx_expand_compare (rtx compare)
   return gen_rtx_NE (BImode, pred, const0_rtx);
 }
 
-/* Emit forking instructions for MASK.  */
-
-static void
-nvptx_emit_forking (unsigned mask, bool is_call)
-{
-  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
-          | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
-  if (mask)
-    {
-      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
-      
-      /* Emit fork for worker level.  */
-      if (!is_call && mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
-       emit_insn (gen_nvptx_fork (op));
-      emit_insn (gen_nvptx_forked (op));
-    }
-}
-
-/* Emit joining instructions for MASK.  */
-
-static void
-nvptx_emit_joining (unsigned mask, bool is_call)
-{
-  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
-          | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
-  if (mask)
-    {
-      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
-
-      /* Emit joining for all non-call pars to ensure there's a single
-        predecessor for the block the join insn ends up in.  This is
-        needed for skipping entire loops.  */
-      if (!is_call)
-       emit_insn (gen_nvptx_joining (op));
-      emit_insn (gen_nvptx_join (op));
-    }
-}
-
-/* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
-   track of whether calls involving static chains or varargs were seen
-   in the current function.
-   For libcalls, maintain a hash table of decls we have seen, and
-   record a function decl for later when encountering a new one.  */
-
-void
-nvptx_expand_call (rtx retval, rtx address)
-{
-  int nargs = 0;
-  rtx callee = XEXP (address, 0);
-  rtx pat, t;
-  rtvec vec;
-  bool external_decl = false;
-  rtx varargs = NULL_RTX;
-  tree decl_type = NULL_TREE;
-  unsigned parallel = 0;
-
-  for (t = cfun->machine->call_args; t; t = XEXP (t, 1))
-    nargs++;
-
-  if (!call_insn_operand (callee, Pmode))
-    {
-      callee = force_reg (Pmode, callee);
-      address = change_address (address, QImode, callee);
-    }
-
-  if (GET_CODE (callee) == SYMBOL_REF)
-    {
-      tree decl = SYMBOL_REF_DECL (callee);
-      if (decl != NULL_TREE)
-       {
-         decl_type = TREE_TYPE (decl);
-         if (DECL_STATIC_CHAIN (decl))
-           cfun->machine->has_call_with_sc = true;
-         if (DECL_EXTERNAL (decl))
-           external_decl = true;
-         tree attr = get_oacc_fn_attrib (decl);
-         if (attr)
-           {
-             tree dims = TREE_VALUE (attr);
-
-             parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
-             for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
-               {
-                 if (TREE_PURPOSE (dims)
-                     && !integer_zerop (TREE_PURPOSE (dims)))
-                   break;
-                 /* Not on this axis.  */
-                 parallel ^= GOMP_DIM_MASK (ix);
-                 dims = TREE_CHAIN (dims);
-               }
-           }
-       }
-    }
-
-  if (cfun->machine->funtype
-      /* It's possible to construct testcases where we call a variable.
-        See compile/20020129-1.c.  stdarg_p will crash so avoid calling it
-        in such a case.  */
-      && (TREE_CODE (cfun->machine->funtype) == FUNCTION_TYPE
-         || TREE_CODE (cfun->machine->funtype) == METHOD_TYPE)
-      && stdarg_p (cfun->machine->funtype))
-    {
-      varargs = gen_reg_rtx (Pmode);
-      if (Pmode == DImode)
-       emit_move_insn (varargs, stack_pointer_rtx);
-      else
-       emit_move_insn (varargs, stack_pointer_rtx);
-      cfun->machine->has_call_with_varargs = true;
-    }
-  vec = rtvec_alloc (nargs + 1 + (varargs ? 1 : 0));
-  pat = gen_rtx_PARALLEL (VOIDmode, vec);
-
-  int vec_pos = 0;
-  
-  rtx tmp_retval = retval;
-  t = gen_rtx_CALL (VOIDmode, address, const0_rtx);
-  if (retval != NULL_RTX)
-    {
-      if (!nvptx_register_operand (retval, GET_MODE (retval)))
-       tmp_retval = gen_reg_rtx (GET_MODE (retval));
-      t = gen_rtx_SET (tmp_retval, t);
-    }
-  XVECEXP (pat, 0, vec_pos++) = t;
-
-  /* Construct the call insn, including a USE for each argument pseudo
-     register.  These will be used when printing the insn.  */
-  for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
-    {
-      rtx this_arg = XEXP (arg, 0);
-      XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, this_arg);
-    }
-
-  if (varargs)
-      XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
-
-  gcc_assert (vec_pos = XVECLEN (pat, 0));
-
-  /* If this is a libcall, decl_type is NULL. For a call to a non-libcall
-     undeclared function, we'll have an external decl without arg types.
-     In either case we have to try to construct a ptx declaration from one of
-     the calls to the function.  */
-  if (!REG_P (callee)
-      && (decl_type == NULL_TREE
-         || (external_decl && TYPE_ARG_TYPES (decl_type) == NULL_TREE)))
-    {
-      rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
-      if (*slot == NULL)
-       {
-         *slot = callee;
-         write_func_decl_from_insn (func_decls, retval, pat, callee);
-       }
-    }
-  nvptx_emit_forking (parallel, true);
-  emit_call_insn (pat);
-  if (tmp_retval != retval)
-    emit_move_insn (retval, tmp_retval);
-
-  nvptx_emit_joining (parallel, true);
-}
-
 /* Expand the oacc fork & join primitive into ptx-required unspecs.  */
 
 void
@@ -2145,7 +2145,7 @@ nvptx_print_operand (FILE *file, rtx x,
        fprintf (file, ".%s", kinds[kind]);
       }
       break;
-      
+
     case 't':
       op_mode = nvptx_underlying_object_mode (x);
       fprintf (file, "%s", nvptx_ptx_type_from_mode (op_mode, true));
@@ -2340,7 +2340,7 @@ get_replacement (struct reg_replace *r)
    conversion copyin/copyout instructions.  */
 
 static void
-nvptx_reorg_subreg ()
+nvptx_reorg_subreg (void)
 {
   struct reg_replace qiregs, hiregs, siregs, diregs;
   rtx_insn *insn, *next;
@@ -2358,7 +2358,7 @@ nvptx_reorg_subreg ()
     {
       next = NEXT_INSN (insn);
       if (!NONDEBUG_INSN_P (insn)
-         || asm_noperands (insn) >= 0
+         || asm_noperands (PATTERN (insn)) >= 0
          || GET_CODE (PATTERN (insn)) == USE
          || GET_CODE (PATTERN (insn)) == CLOBBER)
        continue;
@@ -2368,8 +2368,8 @@ nvptx_reorg_subreg ()
       siregs.n_in_use = 0;
       diregs.n_in_use = 0;
       extract_insn (insn);
-
       enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
+
       for (int i = 0; i < recog_data.n_operands; i++)
        {
          rtx op = recog_data.operand[i];
@@ -3219,9 +3219,16 @@ nvptx_reorg_reductions (void)
     }
 }
 
-/* NVPTX machine dependent reorg.
-   Insert vector and worker single neutering code and state
-   propagation when entering partioned mode.  Fixup subregs.  */
+/* PTX-specific reorganization
+   - Scan and release reduction buffers
+   - Split blocks at fork and join instructions
+   - Compute live registers
+   - Mark now-unused registers, so function begin doesn't declare
+   unused registers.
+   - Insert state propagation when entering partitioned mode
+   - Insert neutering instructions when in single mode
+   - Replace subregs with suitable sequences.
+*/
 
 static void
 nvptx_reorg (void)
@@ -3281,6 +3288,7 @@ nvptx_reorg (void)
       gcc_assert (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
                  || (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
 
+      /* Discover & process partitioned regions.  */
       parallel *pars = nvptx_discover_pars (&bb_insn_map);
       nvptx_process_pars (pars);
       nvptx_neuter_pars (pars, mask, 0);
