https://gcc.gnu.org/g:0626769c2203460e61fe3c5e4083549723453f3c

commit 0626769c2203460e61fe3c5e4083549723453f3c
Author: Julian Brown <jul...@codesourcery.com>
Date:   Tue Feb 12 15:06:55 2019 -0800

    Various OpenACC reduction enhancements - ME and nvptx changes
    
    Parts of the first posting got lost in the second posting, above.
    This version hopefully contains everything.
    
    2018-10-30  Cesar Philippidis  <ce...@codesourcery.com>
    
            gcc/
            * config/nvptx/nvptx.cc (nvptx_propagate_unified): New.
            (nvptx_split_blocks): Call it for cond_uni insn.
            (nvptx_expand_cond_uni): New.
            (enum nvptx_builtins): Add NVPTX_BUILTIN_COND_UNI.
            (nvptx_init_builtins): Initialize it.
            (nvptx_expand_builtin): Handle it.
            (nvptx_generate_vector_shuffle): Change integral SHIFT operand to
            tree BITS operand.
            (nvptx_vector_reduction): New.
            (nvptx_adjust_reduction_type): New.
            (nvptx_goacc_reduction_setup): Use it to adjust the type of ref_to_res.
            (nvptx_goacc_reduction_init): Don't update LHS if it doesn't exist.
            (nvptx_goacc_reduction_fini): Call nvptx_vector_reduction for vector.
            Use it to adjust the type of ref_to_res.
            (nvptx_goacc_reduction_teardown): Likewise.
            * config/nvptx/nvptx.md (cond_uni): New pattern.
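
For context, the reductions in question are OpenACC vector-level
reductions offloaded to nvptx.  A minimal sketch of such a reduction,
assuming an OpenACC-enabled GCC (the example and its values are
illustrative only, not taken from this commit's testsuite):

/* Illustrative only -- not a testcase from this commit.  A vector-level
   OpenACC sum reduction whose device-side finalization the nvptx back end
   now emits as a shuffle loop.  Build with e.g. "gcc -fopenacc -O2"; without
   offloading it still runs on the host.  */
#include <stdio.h>

int
main (void)
{
  int n = 1024, sum = 0;
  static int a[1024];

  for (int i = 0; i < n; i++)
    a[i] = i + 1;

#pragma acc parallel loop vector_length(32) reduction(+:sum) copyin(a[0:n])
  for (int i = 0; i < n; i++)
    sum += a[i];

  printf ("%d\n", sum);  /* 1 + 2 + ... + 1024 = 524800.  */
  return 0;
}

With vector_length equal to the warp size, the finalization of "sum" is what
the nvptx_goacc_reduction_fini change below now emits as a real loop rather
than an unrolled sequence of shuffles.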

Diff:
---
 gcc/ChangeLog.omp         |  19 ++++
 gcc/config/nvptx/nvptx.cc | 233 +++++++++++++++++++++++++++++++++++++++++-----
 gcc/config/nvptx/nvptx.md |   7 ++
 gcc/gimplify.cc           |   8 +-
 gcc/omp-general.h         |   2 +-
 gcc/omp-low.cc            |  29 +++++-
 gcc/omp-offload.cc        |  11 +++
 7 files changed, 278 insertions(+), 31 deletions(-)

diff --git a/gcc/ChangeLog.omp b/gcc/ChangeLog.omp
index 4f4d1dbfad4..4ba97e30d5c 100644
--- a/gcc/ChangeLog.omp
+++ b/gcc/ChangeLog.omp
@@ -1,3 +1,22 @@
+2018-10-30  Cesar Philippidis  <ce...@codesourcery.com>
+
+       * config/nvptx/nvptx.cc (nvptx_propagate_unified): New.
+       (nvptx_split_blocks): Call it for cond_uni insn.
+       (nvptx_expand_cond_uni): New.
+       (enum nvptx_builtins): Add NVPTX_BUILTIN_COND_UNI.
+       (nvptx_init_builtins): Initialize it.
+       (nvptx_expand_builtin): Handle it.
+       (nvptx_generate_vector_shuffle): Change integral SHIFT operand to
+       tree BITS operand.
+       (nvptx_vector_reduction): New.
+       (nvptx_adjust_reduction_type): New.
+       (nvptx_goacc_reduction_setup): Use it to adjust the type of ref_to_res.
+       (nvptx_goacc_reduction_init): Don't update LHS if it doesn't exist.
+       (nvptx_goacc_reduction_fini): Call nvptx_vector_reduction for vector.
+       Use it to adjust the type of ref_to_res.
+       (nvptx_goacc_reduction_teardown): Likewise.
+       * config/nvptx/nvptx.md (cond_uni): New pattern.
+
 2018-06-29  Cesar Philippidis  <ce...@codesourcery.com>
            James Norris  <jnor...@codesourcery.com>
 
diff --git a/gcc/config/nvptx/nvptx.cc b/gcc/config/nvptx/nvptx.cc
index 2a8f713c680..3e52354bd12 100644
--- a/gcc/config/nvptx/nvptx.cc
+++ b/gcc/config/nvptx/nvptx.cc
@@ -1,5 +1,5 @@
 /* Target code for NVPTX.
-   Copyright (C) 2014-2024 Free Software Foundation, Inc.
+   Copyright (C) 2014-2023 Free Software Foundation, Inc.
    Contributed by Bernd Schmidt <ber...@codesourcery.com>
 
    This file is part of GCC.
@@ -3487,6 +3487,52 @@ nvptx_mach_vector_length ()
   return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
 }
 
+/* UNIFIED is a cond_uni insn.  Find the branch insn it affects, and
+   mark that as unified.  We expect to be in a single block.  */
+
+static void
+nvptx_propagate_unified (rtx_insn *unified)
+{
+  rtx_insn *probe = unified;
+  rtx cond_reg = SET_DEST (PATTERN (unified));
+  rtx pat = NULL_RTX;
+
+  /* Find the comparison.  (We could skip this and simply scan to the
+     block's terminating branch, if we didn't care for self
+     checking.)  */
+  for (;;)
+    {
+      probe = next_real_insn (probe);
+      if (!probe)
+       break;
+      pat = PATTERN (probe);
+
+      if (GET_CODE (pat) == SET
+         && GET_RTX_CLASS (GET_CODE (SET_SRC (pat))) == RTX_COMPARE
+         && XEXP (SET_SRC (pat), 0) == cond_reg)
+       break;
+      gcc_assert (NONJUMP_INSN_P (probe));
+    }
+  gcc_assert (pat);
+  rtx pred_reg = SET_DEST (pat);
+
+  /* Find the branch.  */
+  do
+    probe = NEXT_INSN (probe);
+  while (!JUMP_P (probe));
+
+  pat = PATTERN (probe);
+  rtx itec = XEXP (SET_SRC (pat), 0);
+  gcc_assert (XEXP (itec, 0) == pred_reg);
+
+  /* Mark the branch's condition as unified.  */
+  rtx unspec = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pred_reg),
+                              UNSPEC_BR_UNIFIED);
+  bool ok = validate_change (probe, &XEXP (itec, 0), unspec, false);
+
+  gcc_assert (ok);
+}
+
 /* Loop structure of the function.  The entire function is described as
    a NULL loop.  */
 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:struct parallel_g'.  */
@@ -3590,6 +3636,9 @@ nvptx_split_blocks (bb_insn_map_t *map)
            continue;
          switch (recog_memoized (insn))
            {
+           case CODE_FOR_cond_uni:
+             nvptx_propagate_unified (insn);
+             /* FALLTHROUGH */
            default:
              seen_insn = true;
              continue;
@@ -6178,6 +6227,20 @@ nvptx_expand_cmp_swap (tree exp, rtx target,
   return target;
 }
 
+/* Expander for the compare unified builtin.  */
+
+static rtx
+nvptx_expand_cond_uni (tree exp, rtx target, machine_mode mode, int ignore)
+{
+  if (ignore)
+    return target;
+
+  rtx src = expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, mode, EXPAND_NORMAL);
+
+  emit_insn (gen_cond_uni (target, src));
+
+  return target;
+}
 
 /* Codes for all the NVPTX builtins.  */
 enum nvptx_builtins
@@ -6195,6 +6258,7 @@ enum nvptx_builtins
   NVPTX_BUILTIN_BAR_RED_POPC,
   NVPTX_BUILTIN_BREV,
   NVPTX_BUILTIN_BREVLL,
+  NVPTX_BUILTIN_COND_UNI,
   NVPTX_BUILTIN_MAX
 };
 
@@ -6315,6 +6379,7 @@ nvptx_init_builtins (void)
   DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
   DEF (MEMBAR_GL, "membar_gl", (VOID, VOID, NULL_TREE));
   DEF (MEMBAR_CTA, "membar_cta", (VOID, VOID, NULL_TREE));
+  DEF (COND_UNI, "cond_uni", (integer_type_node, integer_type_node, NULL_TREE));
 
   DEF (BAR_RED_AND, "bar_red_and",
        (UINT, UINT, UINT, UINT, UINT, NULL_TREE));
@@ -6377,6 +6442,9 @@ nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
     case NVPTX_BUILTIN_BREVLL:
       return nvptx_expand_brev (exp, target, mode, ignore);
 
+    case NVPTX_BUILTIN_COND_UNI:
+      return nvptx_expand_cond_uni (exp, target, mode, ignore);
+
     default: gcc_unreachable ();
     }
 }
@@ -6726,7 +6794,7 @@ nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
 
 static void
 nvptx_generate_vector_shuffle (location_t loc,
-                              tree dest_var, tree var, unsigned shift,
+                              tree dest_var, tree var, tree bits,
                               gimple_seq *seq)
 {
   unsigned fn = NVPTX_BUILTIN_SHUFFLE;
@@ -6749,7 +6817,6 @@ nvptx_generate_vector_shuffle (location_t loc,
     }
   
   tree call = nvptx_builtin_decl (fn, true);
-  tree bits = build_int_cst (unsigned_type_node, shift);
   tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
   tree expr;
 
@@ -7039,6 +7106,131 @@ nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
     return nvptx_lockfull_update (loc, gsi, ptr, var, op, level);
 }
 
+/* Emit a vector-level reduction loop.  OLD_VAR is the incoming
+   variable to reduce (valid in each vector), OP is the reduction
+   operator.  Return the reduced value (an SSA var).
+
+   The code we generate looks like:
+      unsigned old_shift = DIM_SIZE(VECTOR);
+      do
+       {
+         shift = PHI (old_shift, new_shift);
+         var = PHI (old_var, new_var);
+         new_shift = shift >> 1;
+         other_var = VSHUFFLE (var, new_shift);
+         new_var = var OP other_var;
+         cond_var = builtin_cond_uni (new_shift);
+       }
+       while (cond_var > 1);
+
+  The builtin_cond_uni expands to a cond_uni instruction, which is
+  processed in nvptx_split_blocks to mark the loop's terminating
+  branch instruction.  */
+
+static tree
+nvptx_vector_reduction (location_t loc, gimple_stmt_iterator *gsi,
+                       tree old_var, tree_code op)
+{
+  tree var_type = TREE_TYPE (old_var);
+
+  /*  Emit old_shift = DIM_SIZE(VECTOR) */
+  tree old_shift = make_ssa_name (integer_type_node);
+  tree dim = build_int_cst (integer_type_node, GOMP_DIM_VECTOR);
+  gcall *call = gimple_build_call_internal (IFN_GOACC_DIM_SIZE, 1, dim);
+  gimple_set_lhs (call, old_shift);
+  gimple_set_location (call, loc);
+  gsi_insert_before (gsi, call, GSI_SAME_STMT);
+
+  /* Split the block just after the init stmts.  */
+  basic_block pre_bb = gsi_bb (*gsi);
+  edge pre_edge = split_block (pre_bb, call);
+  pre_edge->probability = profile_probability::even ();
+  basic_block loop_bb = pre_edge->dest;
+  pre_bb = pre_edge->src;
+  /* Reset the iterator.  */
+  *gsi = gsi_for_stmt (gsi_stmt (*gsi));
+
+  tree shift = make_ssa_name (integer_type_node);
+  // Avoid segfault in gimple_stmt_nonnegative_warnv_p during simplification
+  SSA_NAME_DEF_STMT (shift) = gimple_build_nop ();
+  tree new_shift = make_ssa_name (integer_type_node);
+  tree var = make_ssa_name (var_type);
+  tree other_var = make_ssa_name (var_type);
+  tree new_var = make_ssa_name (var_type);
+
+  /* Build and insert the loop body.  */
+  gimple_seq loop_seq = NULL;
+
+  /* new_shift = shift >> 1 */
+  tree shift_expr = fold_build2 (RSHIFT_EXPR, integer_type_node,
+                                shift, integer_one_node);
+  gimplify_assign (new_shift, shift_expr, &loop_seq);
+
+  /* other_var = shuffle (var, new_shift) */
+  nvptx_generate_vector_shuffle (loc, other_var, var, new_shift, &loop_seq);
+  /* new_var = var  OP other_var */
+  tree red_expr = fold_build2 (op, var_type, var, other_var);
+  gimplify_assign (new_var, red_expr, &loop_seq);
+
+  /* Mark the iterator variable as unified.  */
+  tree cond_var = make_ssa_name (integer_type_node);
+  tree uni_fn = nvptx_builtin_decl (NVPTX_BUILTIN_COND_UNI, true);
+  tree uni_expr = build_call_expr_loc (loc, uni_fn, 1, new_shift);
+  gimplify_assign (cond_var,  uni_expr, &loop_seq);
+
+  gcond *cond = gimple_build_cond (LE_EXPR, cond_var, integer_one_node,
+                                  NULL_TREE, NULL_TREE);
+  gimple_seq_add_stmt (&loop_seq, cond);
+
+  gsi_insert_seq_before (gsi, loop_seq, GSI_SAME_STMT);
+
+  /* Split the block just after the loop stmts.  */
+  edge post_edge = split_block (loop_bb, cond);
+  post_edge->probability = profile_probability::even ();
+  basic_block post_bb = post_edge->dest;
+  loop_bb = post_edge->src;
+  *gsi = gsi_for_stmt (gsi_stmt (*gsi));
+
+  /* Create the loop.  */
+  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
+  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
+  loop_edge->probability = profile_probability::even ();
+  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
+  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
+
+  gphi *shift_phi = create_phi_node (shift, loop_bb);
+  add_phi_arg (shift_phi, old_shift, pre_edge, loc);
+  add_phi_arg (shift_phi, new_shift, loop_edge, loc);
+
+  gphi *var_phi = create_phi_node (var, loop_bb);
+  add_phi_arg (var_phi, old_var, pre_edge, loc);
+  add_phi_arg (var_phi, new_var, loop_edge, loc);
+
+  loop *loop = alloc_loop ();
+  loop->header = loop_bb;
+  loop->latch = loop_bb;
+  add_loop (loop, loop_bb->loop_father);
+
+  return new_var;
+}
+
+/* Dummy reduction vars that have GOMP_MAP_FIRSTPRIVATE_POINTER data
+   mappings get retyped to (void *).  Adjust the type of VAR to TYPE
+   as appropriate.  */
+
+static tree
+nvptx_adjust_reduction_type (tree var, tree type, gimple_seq *seq)
+{
+  if (TREE_TYPE (TREE_TYPE (var)) == type)
+    return var;
+
+  tree ptype = build_pointer_type (type);
+  tree t = make_ssa_name (ptype);
+  tree expr = fold_build1 (NOP_EXPR, ptype, var);
+  gimple_seq_add_stmt (seq, gimple_build_assign (t, expr));
+  return t;
+}
+
 /* NVPTX implementation of GOACC_REDUCTION_SETUP.  */
 
 static void
@@ -7058,7 +7250,11 @@ nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
       tree ref_to_res = gimple_call_arg (call, 1);
 
       if (!integer_zerop (ref_to_res))
-       var = build_simple_mem_ref (ref_to_res);
+       {
+         ref_to_res = nvptx_adjust_reduction_type (ref_to_res, TREE_TYPE (var),
+                                                   &seq);
+         var = build_simple_mem_ref (ref_to_res);
+       }
     }
   
   if (level == GOMP_DIM_WORKER
@@ -7186,22 +7382,7 @@ nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
   push_gimplify_context (true);
 
   if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
-    {
-      /* Emit binary shuffle tree.  TODO. Emit this as an actual loop,
-        but that requires a method of emitting a unified jump at the
-        gimple level.  */
-      for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
-       {
-         tree other_var = make_ssa_name (TREE_TYPE (var));
-         nvptx_generate_vector_shuffle (gimple_location (call),
-                                        other_var, var, shfl, &seq);
-
-         r = make_ssa_name (TREE_TYPE (var));
-         gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
-                                          var, other_var), &seq);
-         var = r;
-       }
-    }
+    r = nvptx_vector_reduction (gimple_location (call), &gsi, var, op);
   else
     {
       tree accum = NULL_TREE;
@@ -7220,7 +7401,11 @@ nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
       else if (integer_zerop (ref_to_res))
        r = var;
       else
-       accum = ref_to_res;
+       {
+         ref_to_res = nvptx_adjust_reduction_type (ref_to_res, TREE_TYPE (var),
+                                                   &seq);
+         accum = ref_to_res;
+       }
 
       if (accum)
        {
@@ -7271,7 +7456,11 @@ nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
       tree ref_to_res = gimple_call_arg (call, 1);
 
       if (!integer_zerop (ref_to_res))
-       gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
+       {
+         ref_to_res = nvptx_adjust_reduction_type (ref_to_res, TREE_TYPE (var),
+                                                   &seq);
+         gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
+       }
     }
 
   if (lhs)
diff --git a/gcc/config/nvptx/nvptx.md b/gcc/config/nvptx/nvptx.md
index 96e6c911608..4118de52a9a 100644
--- a/gcc/config/nvptx/nvptx.md
+++ b/gcc/config/nvptx/nvptx.md
@@ -942,6 +942,13 @@
   "%J0\\tbra.uni\\t%l1;"
   [(set_attr "predicable" "no")])
 
+(define_insn "cond_uni"
+  [(set (match_operand:SI 0 "nvptx_register_operand" "=R")
+          (unspec:SI [(match_operand:SI 1 "nvptx_nonmemory_operand" "R")]
+                     UNSPEC_BR_UNIFIED))]
+  ""
+  "%.\\tmov%t0\\t%0, %1; // unified")
+
 (define_expand "cbranch<mode>4"
   [(set (pc)
        (if_then_else (match_operator 0 "nvptx_comparison_operator"
diff --git a/gcc/gimplify.cc b/gcc/gimplify.cc
index 12be6d0a6e8..07f835c17d9 100644
--- a/gcc/gimplify.cc
+++ b/gcc/gimplify.cc
@@ -14466,9 +14466,11 @@ gimplify_adjust_omp_clauses (gimple_seq *pre_p, gimple_seq body, tree *list_p,
        case OMP_CLAUSE_TASK_REDUCTION:
          decl = OMP_CLAUSE_DECL (c);
          /* OpenACC reductions need a present_or_copy data clause.
-            Add one if necessary.  Emit error when the reduction is private.  */
-         if (ctx->region_type == ORT_ACC_PARALLEL
-             || ctx->region_type == ORT_ACC_SERIAL)
+            Add one if necessary.  Emit error when the reduction is
+            private.  */
+         if (DECL_P (decl) &&
+             (ctx->region_type == ORT_ACC_PARALLEL
+              || ctx->region_type == ORT_ACC_SERIAL))
            {
              n = splay_tree_lookup (ctx->variables, (splay_tree_key) decl);
              if (n->value & (GOVD_PRIVATE | GOVD_FIRSTPRIVATE))
diff --git a/gcc/omp-general.h b/gcc/omp-general.h
index b7d85a768ce..15e092f1286 100644
--- a/gcc/omp-general.h
+++ b/gcc/omp-general.h
@@ -35,7 +35,7 @@ enum oacc_loop_flags {
   OLF_GANG_STATIC = 1u << 3,   /* Gang partitioning is static (has op). */
   OLF_TILE     = 1u << 4,      /* Tiled loop. */
   OLF_REDUCTION = 1u << 5,     /* Reduction loop.  */
-  
+
   /* Explicitly specified loop axes.  */
   OLF_DIM_BASE = 6,
   OLF_DIM_GANG   = 1u << (OLF_DIM_BASE + GOMP_DIM_GANG),
diff --git a/gcc/omp-low.cc b/gcc/omp-low.cc
index 778c75dc59d..e64e784940f 100644
--- a/gcc/omp-low.cc
+++ b/gcc/omp-low.cc
@@ -7610,6 +7610,7 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
        tree ref_to_res = NULL_TREE;
        tree incoming, outgoing, v1, v2, v3;
        bool is_private = false;
+       bool is_fpp = false;
 
        enum tree_code rcode = OMP_CLAUSE_REDUCTION_CODE (c);
        if (rcode == MINUS_EXPR)
@@ -7671,19 +7672,37 @@ lower_oacc_reductions (location_t loc, tree clauses, tree level, bool inner,
                      is_private = true;
                      goto do_lookup;
                    }
+                 else if (OMP_CLAUSE_CODE (cls) == OMP_CLAUSE_MAP
+                          && (OMP_CLAUSE_MAP_KIND (cls)
+                              == GOMP_MAP_FIRSTPRIVATE_POINTER)
+                          && orig == OMP_CLAUSE_DECL (cls))
+                   {
+                     is_fpp = true;
+                     goto do_lookup;
+                   }
              }
 
          do_lookup:
            /* This is the outermost construct with this reduction,
               see if there's a mapping for it.  */
            if (gimple_code (outer->stmt) == GIMPLE_OMP_TARGET
-               && maybe_lookup_field (orig, outer) && !is_private)
+               && (maybe_lookup_field (orig, outer) || is_fpp) && !is_private)
              {
-               ref_to_res = build_receiver_ref (orig, false, outer);
-               if (omp_privatize_by_reference (orig))
-                 ref_to_res = build_simple_mem_ref (ref_to_res);
-
                tree type = TREE_TYPE (var);
+
+               if (is_fpp)
+                 {
+                   tree x = create_tmp_var (type);
+                   gimplify_assign (x, lookup_decl (orig, outer), fork_seq);
+                   ref_to_res = x;
+                 }
+               else
+                 {
+                   ref_to_res = build_receiver_ref (orig, false, outer);
+                   if (omp_privatize_by_reference (orig))
+                     ref_to_res = build_simple_mem_ref (ref_to_res);
+                 }
+
                if (POINTER_TYPE_P (type))
                  type = TREE_TYPE (type);
 
diff --git a/gcc/omp-offload.cc b/gcc/omp-offload.cc
index 35313c2ecf3..738b64f8c16 100644
--- a/gcc/omp-offload.cc
+++ b/gcc/omp-offload.cc
@@ -1804,6 +1804,17 @@ default_goacc_reduction (gcall *call)
 
       if (!integer_zerop (ref_to_res))
        {
+         /* Dummy reduction vars that have GOMP_MAP_FIRSTPRIVATE_POINTER data
+            mappings get retyped to (void *).  Adjust the type of ref_to_res
+            as appropriate.  */
+         if (TREE_TYPE (TREE_TYPE (ref_to_res)) != TREE_TYPE (var))
+           {
+             tree ptype = build_pointer_type (TREE_TYPE (var));
+             tree t = make_ssa_name (ptype);
+             tree expr = fold_build1 (NOP_EXPR, ptype, ref_to_res);
+             gimple_seq_add_stmt (&seq, gimple_build_assign (t, expr));
+             ref_to_res = t;
+           }
          tree dst = build_simple_mem_ref (ref_to_res);
          tree src = var;
