Hi Tom!

On Mon, 13 Jul 2015 12:32:20 +0200, Tom de Vries <tom_devr...@mentor.com> wrote:
> On 13/07/15 10:31, Thomas Schwinge wrote:
> > On Mon, 13 Jul 2015 09:20:16 +0200, Tom de Vries <tom_devr...@mentor.com> wrote:
> >> On 12/07/15 11:39, Thomas Schwinge wrote:
> >> I've looked at the merge commit, gcc/tree-parloops.c was not modified.

> > (Well, it was, but not "substantially".)

> Hmm, the reason why I said tree-parloops.c was not modified, was that 
> the git show of your merge commit (which invokes git diff-tree --cc) 
> does not show any differences for tree-parloops.c:
> ...
> $ git show 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 | grep tree-parloops.c
> $
> ...
> 
> OTOH, if I use --numstat as diff-tree argument, I see:
> ...
> $ git diff-tree --numstat --cc 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 | grep tree-parloops.c
> 7     34      gcc/tree-parloops.c
> ...
> 
> I'm not sure if this is expected behaviour.

Yes, I think so, because »--cc [...] compresses the patch output by
omitting uninteresting hunks whose contents in the parents have only
two variants and the merge result picks one of them without
modification«, and, as I said, for »merge conflicts, I just retained the
code that was present on gomp-4_0-branch already«.

In contrast, see the -c and -m options (which get passed from git show to
git diff-tree):

    $ git show -c 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 -- gcc/tree-parloops.c
    commit 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1
    Merge: f9d00ca cacef50
    Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4>
    Date:   Sun Jul 12 09:30:39 2015 +0000
    
        svn merge -r 222860:225562 svn+ssh://gcc.gnu.org/svn/gcc/trunk
        
        
        git-svn-id: 
svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@225719 
138bc75d-0d04-0410-961f-82ee72b054a4
    
    diff --combined gcc/tree-parloops.c
    index 04708c0,846077a..80a215d
    --- gcc/tree-parloops.c
    +++ gcc/tree-parloops.c
    @@@ -22,43 -22,22 +22,22 @@@ along with GCC; see the file COPYING3
      #include "config.h"
      #include "system.h"
      #include "coretypes.h"
    - #include "hash-set.h"
    - #include "machmode.h"
    - #include "vec.h"
    - #include "double-int.h"
    - #include "input.h"
      #include "alias.h"
    - #include "symtab.h"
    - #include "options.h"
    - #include "wide-int.h"
    - #include "inchash.h"
    + #include "backend.h"
      #include "tree.h"
    - #include "fold-const.h"
    - #include "predict.h"
    - #include "tm.h"
    + #include "gimple.h"
      #include "hard-reg-set.h"
    - #include "input.h"
    - #include "function.h"
    - #include "dominance.h"
    - #include "cfg.h"
    - #include "basic-block.h"
    - #include "tree-ssa-alias.h"
    + #include "ssa.h"
    + #include "options.h"
    + #include "fold-const.h"
      #include "internal-fn.h"
    - #include "gimple-expr.h"
    - #include "is-a.h"
    - #include "gimple.h"
      #include "gimplify.h"
      #include "gimple-iterator.h"
      #include "gimplify-me.h"
      #include "gimple-walk.h"
      #include "stor-layout.h"
      #include "tree-nested.h"
    - #include "gimple-ssa.h"
      #include "tree-cfg.h"
    - #include "tree-phinodes.h"
    - #include "ssa-iterators.h"
    - #include "stringpool.h"
    - #include "tree-ssanames.h"
      #include "tree-ssa-loop-ivopts.h"
      #include "tree-ssa-loop-manip.h"
      #include "tree-ssa-loop-niter.h"
    @@@ -75,8 -54,6 +54,6 @@@
      #include "tree-parloops.h"
      #include "omp-low.h"
      #include "tree-nested.h"
    - #include "plugin-api.h"
    - #include "ipa-ref.h"
      #include "cgraph.h"
      #include "tree-ssa.h"
      
    @@@ -218,8 -195,6 +195,8 @@@ struct reduction_inf
                                   of the reduction variable when existing the 
loop. */
        tree initial_value;             /* The initial value of the reduction 
var before entering the loop.  */
        tree field;                     /*  the name of the field in the 
parloop data structure intended for reduction.  */
     +  tree reduc_addr;                /* The address of the reduction 
variable for
     +                             openacc reductions.  */
        tree init;                      /* reduction initialization value.  */
        gphi *new_phi;          /* (helper field) Newly created phi node whose 
result
                                   will be passed to the atomic operation.  
Represents
    @@@ -229,10 -204,8 +206,8 @@@
      
      /* Reduction info hashtable helpers.  */
      
    - struct reduction_hasher : typed_free_remove <reduction_info>
    + struct reduction_hasher : free_ptr_hash <reduction_info>
      {
    -   typedef reduction_info *value_type;
    -   typedef reduction_info *compare_type;
        static inline hashval_t hash (const reduction_info *);
        static inline bool equal (const reduction_info *, const reduction_info 
*);
      };
    @@@ -281,10 -254,8 +256,8 @@@ struct name_to_copy_el
      
      /* Name copies hashtable helpers.  */
      
    - struct name_to_copy_hasher : typed_free_remove <name_to_copy_elt>
    + struct name_to_copy_hasher : free_ptr_hash <name_to_copy_elt>
      {
    -   typedef name_to_copy_elt *value_type;
    -   typedef name_to_copy_elt *compare_type;
        static inline hashval_t hash (const name_to_copy_elt *);
        static inline bool equal (const name_to_copy_elt *, const 
name_to_copy_elt *);
      };
    @@@ -1109,30 -1080,10 +1082,30 @@@ create_call_for_reduction_1 (reduction_
        tree tmp_load, name;
        gimple load;
      
     -  load_struct = build_simple_mem_ref (clsn_data->load);
     -  t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);
     +  if (reduc->reduc_addr == NULL_TREE)
     +    {
     +      load_struct = build_simple_mem_ref (clsn_data->load);
     +      t = build3 (COMPONENT_REF, type, load_struct, reduc->field, 
NULL_TREE);
     +
     +      addr = build_addr (t, current_function_decl);
     +    }
     +  else
     +    {
     +      /* Set the address for the atomic store.  */
     +      addr = reduc->reduc_addr;
     +
     +      /* Remove the non-atomic store '*addr = sum'.  */
     +      tree res = PHI_RESULT (reduc->keep_res);
     +      use_operand_p use_p;
     +      gimple stmt;
     +      bool single_use_p = single_imm_use (res, &use_p, &stmt);
     +      gcc_assert (single_use_p);
     +      replace_uses_by (gimple_vdef (stmt),
     +                 gimple_vuse (stmt));
     +      gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
     +      gsi_remove (&gsi, true);
     +    }
      
     -  addr = build_addr (t, current_function_decl);
      
        /* Create phi node.  */
        bb = clsn_data->load_bb;
    @@@ -1734,15 -1685,10 +1707,15 @@@ transform_to_exit_first_loop_alt (struc
        /* Set the latch arguments of the new phis to ivtmp/sum_b.  */
        flush_pending_stmts (post_inc_edge);
      
     -  /* Create a new empty exit block, inbetween the new loop header and the 
old
     -     exit block.  The function separate_decls_in_region needs this block 
to
     -     insert code that is active on loop exit, but not any other path.  */
     -  basic_block new_exit_block = split_edge (exit);
     +
     +  basic_block new_exit_block = NULL;
     +  if (!single_pred_p (exit->dest))
     +    {
     +      /* Create a new empty exit block, inbetween the new loop header and 
the
     +   old exit block.  The function separate_decls_in_region needs this block
     +   to insert code that is active on loop exit, but not any other path.  */
     +      new_exit_block = split_edge (exit);
     +    }
      
        /* Insert and register the reduction exit phis.  */
        for (gphi_iterator gsi = gsi_start_phis (exit_block);
    @@@ -1750,24 -1696,17 +1723,24 @@@
             gsi_next (&gsi))
          {
            gphi *phi = gsi.phi ();
     +      gphi *nphi = NULL;
            tree res_z = PHI_RESULT (phi);
     +      tree res_c;
      
     -      /* Now that we have a new exit block, duplicate the phi of the old 
exit
     -   block in the new exit block to preserve loop-closed ssa.  */
     -      edge succ_new_exit_block = single_succ_edge (new_exit_block);
     -      edge pred_new_exit_block = single_pred_edge (new_exit_block);
     -      tree res_y = copy_ssa_name (res_z, phi);
     -      gphi *nphi = create_phi_node (res_y, new_exit_block);
     -      tree res_c = PHI_ARG_DEF_FROM_EDGE (phi, succ_new_exit_block);
     -      add_phi_arg (nphi, res_c, pred_new_exit_block, UNKNOWN_LOCATION);
     -      add_phi_arg (phi, res_y, succ_new_exit_block, UNKNOWN_LOCATION);
     +      if (new_exit_block != NULL)
     +  {
     +    /* Now that we have a new exit block, duplicate the phi of the old
     +       exit block in the new exit block to preserve loop-closed ssa.  */
     +    edge succ_new_exit_block = single_succ_edge (new_exit_block);
     +    edge pred_new_exit_block = single_pred_edge (new_exit_block);
     +    tree res_y = copy_ssa_name (res_z, phi);
     +    nphi = create_phi_node (res_y, new_exit_block);
     +    res_c = PHI_ARG_DEF_FROM_EDGE (phi, succ_new_exit_block);
     +    add_phi_arg (nphi, res_c, pred_new_exit_block, UNKNOWN_LOCATION);
     +    add_phi_arg (phi, res_y, succ_new_exit_block, UNKNOWN_LOCATION);
     +  }
     +      else
     +  res_c = PHI_ARG_DEF_FROM_EDGE (phi, exit);
      
            if (virtual_operand_p (res_z))
        continue;
    @@@ -1775,9 -1714,7 +1748,9 @@@
            gimple reduc_phi = SSA_NAME_DEF_STMT (res_c);
            struct reduction_info *red = reduction_phi (reduction_list, 
reduc_phi);
            if (red != NULL)
     -  red->keep_res = nphi;
     +  red->keep_res = (nphi != NULL
     +                   ? nphi
     +                   : phi);
          }
      
        /* We're going to cancel the loop at the end of gen_parallel_loop, but 
until
    @@@ -1891,24 -1828,8 +1864,24 @@@ try_transform_to_exit_first_loop_alt (s
        alt_bound = op1;
          }
      
     +  /* If not found, insert nit + 1.  */
        if (alt_bound == NULL_TREE)
     -    return false;
     +    {
     +      alt_bound = fold_build2 (PLUS_EXPR, nit_type, nit,
     +                         build_int_cst_type (nit_type, 1));
     +
     +      gimple_seq pre = NULL, post = NULL;
     +      push_gimplify_context (true);
     +      gimplify_expr (&alt_bound, &pre, &post, is_gimple_reg,
     +               fb_rvalue);
     +      pop_gimplify_context (NULL);
     +
     +      gimple_seq_add_seq (&pre, post);
     +
     +      gimple_stmt_iterator gsi
     +  = gsi_last_bb (loop_preheader_edge (loop)->src);
     +      gsi_insert_seq_after (&gsi, pre, GSI_CONTINUE_LINKING);
     +    }
      
        transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
        return true;
    @@@ -2032,10 -1953,9 +2005,10 @@@ transform_to_exit_first_loop (struct lo
         of LOOP_FN.  N_THREADS is the requested number of threads.  Returns the
         basic block containing GIMPLE_OMP_PARALLEL tree.  */
      
     -static basic_block
     +static void
      create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
     -                tree new_data, unsigned n_threads, location_t loc)
     +                tree new_data, unsigned n_threads, location_t loc,
     +                basic_block region_entry, bool oacc_kernels_p)
      {
        gimple_stmt_iterator gsi;
        basic_block bb, paral_bb, for_bb, ex_bb;
    @@@ -2048,79 -1968,19 +2021,79 @@@
        gomp_continue *omp_cont_stmt;
        tree cvar, cvar_init, initvar, cvar_next, cvar_base, type;
        edge exit, nexit, guard, end, e;
     +  tree for_clauses = NULL_TREE;
      
        /* Prepare the GIMPLE_OMP_PARALLEL statement.  */
        bb = loop_preheader_edge (loop)->src;
        paral_bb = single_pred (bb);
     -  gsi = gsi_last_bb (paral_bb);
     +  if (!oacc_kernels_p)
     +    gsi = gsi_last_bb (paral_bb);
     +  else
     +    /* Make sure the oacc parallel is inserted on top of the oacc kernels
     +       region.  */
     +    gsi = gsi_last_bb (region_entry);
     +
     +  if (!oacc_kernels_p)
     +    {
     +      t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
     +      OMP_CLAUSE_NUM_THREADS_EXPR (t)
     +  = build_int_cst (integer_type_node, n_threads);
     +      omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
     +      gimple_set_location (omp_par_stmt, loc);
     +
     +      gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
     +    }
     +  else
     +    {
     +      /* Create oacc parallel pragma based on oacc kernels pragma.  */
     +      gomp_target *kernels = as_a <gomp_target *> (gsi_stmt (gsi));
     +
     +      gsi_prev (&gsi);
     +      gcall *goacc_kernels = as_a <gcall *> (gsi_stmt (gsi));
      
     -  t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
     -  OMP_CLAUSE_NUM_THREADS_EXPR (t)
     -    = build_int_cst (integer_type_node, n_threads);
     -  omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
     -  gimple_set_location (omp_par_stmt, loc);
     +      tree clauses = gimple_omp_target_clauses (kernels);
     +      /* FIXME: We need a more intelligent mapping onto vector, gangs,
     +   workers.  */
     +      if (1)
     +  {
     +    tree clause = build_omp_clause (gimple_location (kernels),
     +                                    OMP_CLAUSE_NUM_GANGS);
     +    OMP_CLAUSE_NUM_GANGS_EXPR (clause) 
     +      = build_int_cst (integer_type_node, n_threads);
     +    OMP_CLAUSE_CHAIN (clause) = clauses;
     +    clauses = clause;
     +  }
     +      gomp_target *stmt
     +  = gimple_build_omp_target (NULL, GF_OMP_TARGET_KIND_OACC_PARALLEL,
     +                             clauses);
     +      tree child_fn = gimple_omp_target_child_fn (kernels);
     +      gimple_omp_target_set_child_fn (stmt, child_fn);
     +      tree data_arg = gimple_omp_target_data_arg (kernels);
     +      gimple_omp_target_set_data_arg (stmt, data_arg);
     +      tree ganglocal_size = gimple_call_arg (goacc_kernels, /* TODO */ 9);
     +      gimple_omp_target_set_ganglocal_size (stmt, ganglocal_size);
     +
     +      gimple_set_location (stmt, loc);
     +
     +      /* Insert oacc parallel pragma after the oacc kernels pragma.  */
     +      {
     +  gimple_stmt_iterator gsi2;
     +  gsi = gsi_last_bb (region_entry);
     +  gsi2 = gsi;
     +  gsi_prev (&gsi2);
     +
     +  /* Insert pragma acc parallel.  */
     +  gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
      
     -  gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
     +  /* Remove GOACC_kernels.  */
     +  replace_uses_by (gimple_vdef (gsi_stmt (gsi2)),
     +                   gimple_vuse (gsi_stmt (gsi2)));
     +  gsi_remove (&gsi2, true);
     +
     +  /* Remove pragma acc kernels.  */
     +  gsi_remove (&gsi2, true);
     +      }
     +    }
      
        /* Initialize NEW_DATA.  */
        if (data)
    @@@ -2138,18 -1998,12 +2111,18 @@@
            gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
          }
      
     -  /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL.  */
     -  bb = split_loop_exit_edge (single_dom_exit (loop));
     -  gsi = gsi_last_bb (bb);
     -  omp_return_stmt1 = gimple_build_omp_return (false);
     -  gimple_set_location (omp_return_stmt1, loc);
     -  gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT);
     +  /* Skip insertion of OMP_RETURN for oacc_kernels_p.  We've already 
generated
     +     one when lowering the oacc kernels directive in
     +     pass_lower_omp/lower_omp (). */
     +  if (!oacc_kernels_p)
     +    {
     +      /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL.  */
     +      bb = split_loop_exit_edge (single_dom_exit (loop));
     +      gsi = gsi_last_bb (bb);
     +      omp_return_stmt1 = gimple_build_omp_return (false);
     +      gimple_set_location (omp_return_stmt1, loc);
     +      gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT);
     +    }
      
        /* Extract data for GIMPLE_OMP_FOR.  */
        gcc_assert (loop->header == single_dom_exit (loop)->src);
    @@@ -2206,17 -2060,7 +2179,17 @@@
        t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE);
        OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
      
     -  for_stmt = gimple_build_omp_for (NULL, GF_OMP_FOR_KIND_FOR, t, 1, NULL);
     +  if (1)
     +    {
     +      /* In combination with the NUM_GANGS on the parallel.  */
     +      for_clauses = build_omp_clause (loc, OMP_CLAUSE_GANG);
     +    }
     +
     +  for_stmt = gimple_build_omp_for (NULL,
     +                             (oacc_kernels_p
     +                              ? GF_OMP_FOR_KIND_OACC_LOOP
     +                              : GF_OMP_FOR_KIND_FOR),
     +                             for_clauses, 1, NULL);
        gimple_set_location (for_stmt, loc);
        gimple_omp_for_set_index (for_stmt, 0, initvar);
        gimple_omp_for_set_initial (for_stmt, 0, cvar_init);
    @@@ -2246,6 -2090,8 +2219,6 @@@
        /* After the above dom info is hosed.  Re-compute it.  */
        free_dominance_info (CDI_DOMINATORS);
        calculate_dominance_info (CDI_DOMINATORS);
     -
     -  return paral_bb;
      }
      
      /* Generates code to execute the iterations of LOOP in N_THREADS
    @@@ -2257,8 -2103,7 +2230,8 @@@
      static void
      gen_parallel_loop (struct loop *loop,
                   reduction_info_table_type *reduction_list,
     -             unsigned n_threads, struct tree_niter_desc *niter)
     +             unsigned n_threads, struct tree_niter_desc *niter,
     +             basic_block region_entry, bool oacc_kernels_p)
      {
        tree many_iterations_cond, type, nit;
        tree arg_struct, new_arg_struct;
    @@@ -2339,43 -2184,40 +2312,43 @@@
        if (stmts)
          gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
      
     -  if (loop->inner)
     -    m_p_thread=2;
     -  else
     -    m_p_thread=MIN_PER_THREAD;
     -
     -   many_iterations_cond =
     -     fold_build2 (GE_EXPR, boolean_type_node,
     -                nit, build_int_cst (type, m_p_thread * n_threads));
     -
     -  many_iterations_cond
     -    = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
     -             invert_truthvalue (unshare_expr (niter->may_be_zero)),
     -             many_iterations_cond);
     -  many_iterations_cond
     -    = force_gimple_operand (many_iterations_cond, &stmts, false, 
NULL_TREE);
     -  if (stmts)
     -    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
     -  if (!is_gimple_condexpr (many_iterations_cond))
     +  if (!oacc_kernels_p)
          {
     +      if (loop->inner)
     +  m_p_thread=2;
     +      else
     +  m_p_thread=MIN_PER_THREAD;
     +
     +      many_iterations_cond =
     +  fold_build2 (GE_EXPR, boolean_type_node,
     +               nit, build_int_cst (type, m_p_thread * n_threads));
     +
     +      many_iterations_cond
     +  = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
     +                 invert_truthvalue (unshare_expr (niter->may_be_zero)),
     +                 many_iterations_cond);
            many_iterations_cond
     -  = force_gimple_operand (many_iterations_cond, &stmts,
     -                          true, NULL_TREE);
     +  = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
            if (stmts)
        gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
     -    }
     +      if (!is_gimple_condexpr (many_iterations_cond))
     +  {
     +    many_iterations_cond
     +      = force_gimple_operand (many_iterations_cond, &stmts,
     +                              true, NULL_TREE);
     +    if (stmts)
     +      gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), 
stmts);
     +  }
      
     -  initialize_original_copy_tables ();
     +      initialize_original_copy_tables ();
      
     -  /* We assume that the loop usually iterates a lot.  */
     -  prob = 4 * REG_BR_PROB_BASE / 5;
     -  loop_version (loop, many_iterations_cond, NULL,
     -          prob, prob, REG_BR_PROB_BASE - prob, true);
     -  update_ssa (TODO_update_ssa);
     -  free_original_copy_tables ();
     +      /* We assume that the loop usually iterates a lot.  */
     +      prob = 4 * REG_BR_PROB_BASE / 5;
     +      loop_version (loop, many_iterations_cond, NULL,
     +              prob, prob, REG_BR_PROB_BASE - prob, true);
     +      update_ssa (TODO_update_ssa);
     +      free_original_copy_tables ();
     +    }
      
        /* Base all the induction variables in LOOP on a single control one.  */
        canonicalize_loop_ivs (loop, &nit, true);
    @@@ -2387,9 -2229,6 +2360,9 @@@
           iterations of the loop by one.  */
        if (!try_transform_to_exit_first_loop_alt (loop, reduction_list, nit))
          {
     +      if (oacc_kernels_p)
     +  n_threads = 1;
     +
            /* Fall back on the method that handles more cases, but duplicates 
the
         loop body: move the exit condition of LOOP to the beginning of its
         header, and duplicate the part of the last iteration that gets disabled
    @@@ -2406,34 -2245,19 +2379,34 @@@
        entry = loop_preheader_edge (loop);
        exit = single_dom_exit (loop);
      
     -  eliminate_local_variables (entry, exit);
     -  /* In the old loop, move all variables non-local to the loop to a 
structure
     -     and back, and create separate decls for the variables used in loop.  
*/
     -  separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
     -                      &new_arg_struct, &clsn_data);
     +  /* This rewrites the body in terms of new variables.  This has already
     +     been done for oacc_kernels_p in pass_lower_omp/lower_omp ().  */
     +  if (!oacc_kernels_p)
     +    {
     +      eliminate_local_variables (entry, exit);
     +      /* In the old loop, move all variables non-local to the loop to a
     +   structure and back, and create separate decls for the variables used in
     +   loop.  */
     +      separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
     +                          &new_arg_struct, &clsn_data);
     +    }
     +  else
     +    {
     +      arg_struct = NULL_TREE;
     +      new_arg_struct = NULL_TREE;
     +      clsn_data.load = NULL_TREE;
     +      clsn_data.load_bb = exit->dest;
     +      clsn_data.store = NULL_TREE;
     +      clsn_data.store_bb = NULL;
     +    }
      
        /* Create the parallel constructs.  */
        loc = UNKNOWN_LOCATION;
        cond_stmt = last_stmt (loop->header);
        if (cond_stmt)
          loc = gimple_location (cond_stmt);
     -  create_parallel_loop (loop, create_loop_fn (loc), arg_struct,
     -                  new_arg_struct, n_threads, loc);
     +  create_parallel_loop (loop, create_loop_fn (loc), arg_struct, 
new_arg_struct,
     +                  n_threads, loc, region_entry, oacc_kernels_p);
        if (reduction_list->elements () > 0)
          create_call_for_reduction (loop, reduction_list, &clsn_data);
      
    @@@ -2575,8 -2399,7 +2548,8 @@@ try_get_loop_niter (loop_p loop, struc
      
      static bool
      try_create_reduction_list (loop_p loop,
     -                     reduction_info_table_type *reduction_list)
     +                     reduction_info_table_type *reduction_list,
     +                     bool oacc_kernels_p)
      {
        edge exit = single_dom_exit (loop);
        gphi_iterator gsi;
    @@@ -2666,61 -2489,6 +2639,61 @@@
          }
      
      
     +  if (oacc_kernels_p)
     +    {
     +      edge e = loop_preheader_edge (loop);
     +
     +      for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi);
     +     gsi_next (&gsi))
     +  {
     +    gphi *phi = gsi.phi ();
     +    tree def = PHI_RESULT (phi);
     +    affine_iv iv;
     +
     +    if (!virtual_operand_p (def) && !simple_iv (loop, loop, def, &iv, 
true))
     +      {
     +        struct reduction_info *red;
     +        red = reduction_phi (reduction_list, phi);
     +
     +        /* Look for pattern:
     +
     +           <bb preheader>
     +             .omp_data_i = &.omp_data_arr;
     +             addr = .omp_data_i->sum;
     +             sum_a = *addr;
     +
     +           <bb header>:
     +             sum_b = PHI <sum_a (preheader), sum_c (latch)>
     +
     +           and assign addr to reduc->reduc_addr.  */
     +
     +        tree arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
     +        gimple stmt = SSA_NAME_DEF_STMT (arg);
     +        if (!gimple_assign_single_p (stmt))
     +          return false;
     +        tree memref = gimple_assign_rhs1 (stmt);
     +        if (TREE_CODE (memref) != MEM_REF)
     +          return false;
     +        tree addr = TREE_OPERAND (memref, 0);
     +
     +        gimple stmt2 = SSA_NAME_DEF_STMT (addr);
     +        if (!gimple_assign_single_p (stmt2))
     +          return false;
     +        tree compref = gimple_assign_rhs1 (stmt2);
     +        if (TREE_CODE (compref) != COMPONENT_REF)
     +          return false;
     +        tree addr2 = TREE_OPERAND (compref, 0);
     +        if (TREE_CODE (addr2) != MEM_REF)
     +          return false;
     +        addr2 = TREE_OPERAND (addr2, 0);
     +        if (TREE_CODE (addr2) != SSA_NAME
     +            || !gimple_stmt_omp_data_i_init_p (SSA_NAME_DEF_STMT (addr2)))
     +          return false;
     +        red->reduc_addr = addr;
     +      }
     +  }
     +    }
     +
        return true;
      }
      
    @@@ -2729,7 -2497,7 +2702,7 @@@
         otherwise.  */
      
      static bool
     -parallelize_loops (void)
     +parallelize_loops (bool oacc_kernels_p)
      {
        unsigned n_threads = flag_tree_parallelize_loops;
        bool changed = false;
    @@@ -2738,7 -2506,6 +2711,7 @@@
        struct obstack parloop_obstack;
        HOST_WIDE_INT estimated;
        source_location loop_loc;
     +  basic_block region_entry = NULL;
      
        /* Do not parallelize loops in the functions created by 
parallelization.  */
        if (parallelized_function_p (cfun->decl))
    @@@ -2750,29 -2517,9 +2723,29 @@@
        reduction_info_table_type reduction_list (10);
        init_stmt_vec_info_vec ();
      
     +  calculate_dominance_info (CDI_DOMINATORS);
     +
        FOR_EACH_LOOP (loop, 0)
          {
            reduction_list.empty ();
     +
     +      if (oacc_kernels_p)
     +  {
     +    if (!loop->in_oacc_kernels_region)
     +      continue;
     +
     +    /* TODO: Allow nested loops.  */
     +    if (loop->inner)
     +      continue;
     +
     +    if (dump_file && (dump_flags & TDF_DETAILS))
     +      fprintf (dump_file,
     +               "Trying loop %d with header bb %d in oacc kernels 
region\n",
     +               loop->num, loop->header->index);
     +
     +    region_entry = loop_get_oacc_kernels_region_entry (loop);
     +  }
     +
            if (dump_file && (dump_flags & TDF_DETAILS))
            {
              fprintf (dump_file, "Trying loop %d as candidate\n",loop->num);
    @@@ -2814,7 -2561,6 +2787,7 @@@
            /* FIXME: Bypass this check as graphite doesn't update the
         count and frequency correctly now.  */
            if (!flag_loop_parallelize_all
     +    && !oacc_kernels_p
          && ((estimated != -1
               && estimated <= (HOST_WIDE_INT) n_threads * MIN_PER_THREAD)
              /* Do not bother with loops in cold areas.  */
    @@@ -2824,7 -2570,7 +2797,7 @@@
            if (!try_get_loop_niter (loop, &niter_desc))
        continue;
      
     -      if (!try_create_reduction_list (loop, &reduction_list))
     +      if (!try_create_reduction_list (loop, &reduction_list, 
oacc_kernels_p))
        continue;
      
            if (!flag_loop_parallelize_all
    @@@ -2843,9 -2589,8 +2816,9 @@@
          fprintf (dump_file, "\nloop at %s:%d: ",
                   LOCATION_FILE (loop_loc), LOCATION_LINE (loop_loc));
            }
     +
            gen_parallel_loop (loop, &reduction_list,
     -                   n_threads, &niter_desc);
     +                   n_threads, &niter_desc, region_entry, oacc_kernels_p);
          }
      
        free_stmt_vec_info_vec ();
    @@@ -2896,7 -2641,7 +2869,7 @@@ pass_parallelize_loops::execute (functi
        if (number_of_loops (fun) <= 1)
          return 0;
      
     -  if (parallelize_loops ())
     +  if (parallelize_loops (false))
          {
            fun->curr_properties &= ~(PROP_gimple_eomp);
            return TODO_update_ssa;
    @@@ -2912,51 -2657,3 +2885,51 @@@ make_pass_parallelize_loops (gcc::conte
      {
        return new pass_parallelize_loops (ctxt);
      }
     +
     +namespace {
     +
     +const pass_data pass_data_parallelize_loops_oacc_kernels =
     +{
     +  GIMPLE_PASS, /* type */
     +  "parloops_oacc_kernels", /* name */
     +  OPTGROUP_LOOP, /* optinfo_flags */
     +  TV_TREE_PARALLELIZE_LOOPS, /* tv_id */
     +  ( PROP_cfg | PROP_ssa ), /* properties_required */
     +  0, /* properties_provided */
     +  0, /* properties_destroyed */
     +  0, /* todo_flags_start */
     +  0, /* todo_flags_finish */
     +};
     +
     +class pass_parallelize_loops_oacc_kernels : public gimple_opt_pass
     +{
     +public:
     +  pass_parallelize_loops_oacc_kernels (gcc::context *ctxt)
     +    : gimple_opt_pass (pass_data_parallelize_loops_oacc_kernels, ctxt)
     +  {}
     +
     +  /* opt_pass methods: */
     +  virtual bool gate (function *) { return flag_tree_parallelize_loops > 
1; }
     +  virtual unsigned int execute (function *);
     +
     +}; // class pass_parallelize_loops_oacc_kernels
     +
     +unsigned
     +pass_parallelize_loops_oacc_kernels::execute (function *fun)
     +{
     +  if (number_of_loops (fun) <= 1)
     +    return 0;
     +
     +  if (parallelize_loops (true))
     +    return TODO_update_ssa;
     +
     +  return 0;
     +}
     +
     +} // anon namespace
     +
     +gimple_opt_pass *
     +make_pass_parallelize_loops_oacc_kernels (gcc::context *ctxt)
     +{
     +  return new pass_parallelize_loops_oacc_kernels (ctxt);
     +}

..., and:

    $ git show -m 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 -- gcc/tree-parloops.c
    commit 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 (from 
f9d00ca614a8dc28f21ab4a16d7cdbbe16668ca3)
    Merge: f9d00ca cacef50
    Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4>
    Date:   Sun Jul 12 09:30:39 2015 +0000
    
        svn merge -r 222860:225562 svn+ssh://gcc.gnu.org/svn/gcc/trunk
        
        
        git-svn-id: 
svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@225719 
138bc75d-0d04-0410-961f-82ee72b054a4
    
    diff --git gcc/tree-parloops.c gcc/tree-parloops.c
    index 04708c0..80a215d 100644
    --- gcc/tree-parloops.c
    +++ gcc/tree-parloops.c
    @@ -22,43 +22,22 @@ along with GCC; see the file COPYING3.  If not see
     #include "config.h"
     #include "system.h"
     #include "coretypes.h"
    -#include "hash-set.h"
    -#include "machmode.h"
    -#include "vec.h"
    -#include "double-int.h"
    -#include "input.h"
     #include "alias.h"
    -#include "symtab.h"
    -#include "options.h"
    -#include "wide-int.h"
    -#include "inchash.h"
    +#include "backend.h"
     #include "tree.h"
    -#include "fold-const.h"
    -#include "predict.h"
    -#include "tm.h"
    +#include "gimple.h"
     #include "hard-reg-set.h"
    -#include "input.h"
    -#include "function.h"
    -#include "dominance.h"
    -#include "cfg.h"
    -#include "basic-block.h"
    -#include "tree-ssa-alias.h"
    +#include "ssa.h"
    +#include "options.h"
    +#include "fold-const.h"
     #include "internal-fn.h"
    -#include "gimple-expr.h"
    -#include "is-a.h"
    -#include "gimple.h"
     #include "gimplify.h"
     #include "gimple-iterator.h"
     #include "gimplify-me.h"
     #include "gimple-walk.h"
     #include "stor-layout.h"
     #include "tree-nested.h"
    -#include "gimple-ssa.h"
     #include "tree-cfg.h"
    -#include "tree-phinodes.h"
    -#include "ssa-iterators.h"
    -#include "stringpool.h"
    -#include "tree-ssanames.h"
     #include "tree-ssa-loop-ivopts.h"
     #include "tree-ssa-loop-manip.h"
     #include "tree-ssa-loop-niter.h"
    @@ -75,8 +54,6 @@ along with GCC; see the file COPYING3.  If not see
     #include "tree-parloops.h"
     #include "omp-low.h"
     #include "tree-nested.h"
    -#include "plugin-api.h"
    -#include "ipa-ref.h"
     #include "cgraph.h"
     #include "tree-ssa.h"
     
    @@ -229,10 +206,8 @@ struct reduction_info
     
     /* Reduction info hashtable helpers.  */
     
    -struct reduction_hasher : typed_free_remove <reduction_info>
    +struct reduction_hasher : free_ptr_hash <reduction_info>
     {
    -  typedef reduction_info *value_type;
    -  typedef reduction_info *compare_type;
       static inline hashval_t hash (const reduction_info *);
       static inline bool equal (const reduction_info *, const reduction_info 
*);
     };
    @@ -281,10 +256,8 @@ struct name_to_copy_elt
     
     /* Name copies hashtable helpers.  */
     
    -struct name_to_copy_hasher : typed_free_remove <name_to_copy_elt>
    +struct name_to_copy_hasher : free_ptr_hash <name_to_copy_elt>
     {
    -  typedef name_to_copy_elt *value_type;
    -  typedef name_to_copy_elt *compare_type;
       static inline hashval_t hash (const name_to_copy_elt *);
       static inline bool equal (const name_to_copy_elt *, const 
name_to_copy_elt *);
     };
    
    commit 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 (from 
cacef506e4205bac13a0dd1de238d1a8cc78af28)
    Merge: f9d00ca cacef50
    Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4>
    Date:   Sun Jul 12 09:30:39 2015 +0000
    
        svn merge -r 222860:225562 svn+ssh://gcc.gnu.org/svn/gcc/trunk
        
        
        git-svn-id: 
svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@225719 
138bc75d-0d04-0410-961f-82ee72b054a4
    
    diff --git gcc/tree-parloops.c gcc/tree-parloops.c
    index 846077a..80a215d 100644
    --- gcc/tree-parloops.c
    +++ gcc/tree-parloops.c
    @@ -195,6 +195,8 @@ struct reduction_info
                                   of the reduction variable when existing the 
loop. */
       tree initial_value;              /* The initial value of the reduction 
var before entering the loop.  */
       tree field;                      /*  the name of the field in the 
parloop data structure intended for reduction.  */
    +  tree reduc_addr;         /* The address of the reduction variable for
    +                              openacc reductions.  */
       tree init;                       /* reduction initialization value.  */
       gphi *new_phi;           /* (helper field) Newly created phi node whose 
result
                                   will be passed to the atomic operation.  
Represents
    @@ -1080,10 +1082,30 @@ create_call_for_reduction_1 (reduction_info **slot, struct clsn_data *clsn_data)
       tree tmp_load, name;
       gimple load;
     
    -  load_struct = build_simple_mem_ref (clsn_data->load);
    -  t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);
    +  if (reduc->reduc_addr == NULL_TREE)
    +    {
    +      load_struct = build_simple_mem_ref (clsn_data->load);
    +      t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);
    +
    +      addr = build_addr (t, current_function_decl);
    +    }
    +  else
    +    {
    +      /* Set the address for the atomic store.  */
    +      addr = reduc->reduc_addr;
    +
    +      /* Remove the non-atomic store '*addr = sum'.  */
    +      tree res = PHI_RESULT (reduc->keep_res);
    +      use_operand_p use_p;
    +      gimple stmt;
    +      bool single_use_p = single_imm_use (res, &use_p, &stmt);
    +      gcc_assert (single_use_p);
    +      replace_uses_by (gimple_vdef (stmt),
    +                  gimple_vuse (stmt));
    +      gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
    +      gsi_remove (&gsi, true);
    +    }
     
    -  addr = build_addr (t, current_function_decl);
     
       /* Create phi node.  */
       bb = clsn_data->load_bb;
    @@ -1685,10 +1707,15 @@ transform_to_exit_first_loop_alt (struct loop *loop,
       /* Set the latch arguments of the new phis to ivtmp/sum_b.  */
       flush_pending_stmts (post_inc_edge);
     
    -  /* Create a new empty exit block, inbetween the new loop header and the 
old
    -     exit block.  The function separate_decls_in_region needs this block to
    -     insert code that is active on loop exit, but not any other path.  */
    -  basic_block new_exit_block = split_edge (exit);
    +
    +  basic_block new_exit_block = NULL;
    +  if (!single_pred_p (exit->dest))
    +    {
    +      /* Create a new empty exit block, inbetween the new loop header and 
the
    +    old exit block.  The function separate_decls_in_region needs this block
    +    to insert code that is active on loop exit, but not any other path.  */
    +      new_exit_block = split_edge (exit);
    +    }
     
       /* Insert and register the reduction exit phis.  */
       for (gphi_iterator gsi = gsi_start_phis (exit_block);
    @@ -1696,17 +1723,24 @@ transform_to_exit_first_loop_alt (struct loop *loop,
            gsi_next (&gsi))
         {
           gphi *phi = gsi.phi ();
    +      gphi *nphi = NULL;
           tree res_z = PHI_RESULT (phi);
    +      tree res_c;
     
    -      /* Now that we have a new exit block, duplicate the phi of the old 
exit
    -    block in the new exit block to preserve loop-closed ssa.  */
    -      edge succ_new_exit_block = single_succ_edge (new_exit_block);
    -      edge pred_new_exit_block = single_pred_edge (new_exit_block);
    -      tree res_y = copy_ssa_name (res_z, phi);
    -      gphi *nphi = create_phi_node (res_y, new_exit_block);
    -      tree res_c = PHI_ARG_DEF_FROM_EDGE (phi, succ_new_exit_block);
    -      add_phi_arg (nphi, res_c, pred_new_exit_block, UNKNOWN_LOCATION);
    -      add_phi_arg (phi, res_y, succ_new_exit_block, UNKNOWN_LOCATION);
    +      if (new_exit_block != NULL)
    +   {
    +     /* Now that we have a new exit block, duplicate the phi of the old
    +        exit block in the new exit block to preserve loop-closed ssa.  */
    +     edge succ_new_exit_block = single_succ_edge (new_exit_block);
    +     edge pred_new_exit_block = single_pred_edge (new_exit_block);
    +     tree res_y = copy_ssa_name (res_z, phi);
    +     nphi = create_phi_node (res_y, new_exit_block);
    +     res_c = PHI_ARG_DEF_FROM_EDGE (phi, succ_new_exit_block);
    +     add_phi_arg (nphi, res_c, pred_new_exit_block, UNKNOWN_LOCATION);
    +     add_phi_arg (phi, res_y, succ_new_exit_block, UNKNOWN_LOCATION);
    +   }
    +      else
    +   res_c = PHI_ARG_DEF_FROM_EDGE (phi, exit);
     
           if (virtual_operand_p (res_z))
        continue;
    @@ -1714,7 +1748,9 @@ transform_to_exit_first_loop_alt (struct loop *loop,
           gimple reduc_phi = SSA_NAME_DEF_STMT (res_c);
           struct reduction_info *red = reduction_phi (reduction_list, 
reduc_phi);
           if (red != NULL)
    -   red->keep_res = nphi;
    +   red->keep_res = (nphi != NULL
    +                    ? nphi
    +                    : phi);
         }
     
       /* We're going to cancel the loop at the end of gen_parallel_loop, but 
until
    @@ -1828,8 +1864,24 @@ try_transform_to_exit_first_loop_alt (struct loop 
*loop,
        alt_bound = op1;
         }
     
    +  /* If not found, insert nit + 1.  */
       if (alt_bound == NULL_TREE)
    -    return false;
    +    {
    +      alt_bound = fold_build2 (PLUS_EXPR, nit_type, nit,
    +                          build_int_cst_type (nit_type, 1));
    +
    +      gimple_seq pre = NULL, post = NULL;
    +      push_gimplify_context (true);
    +      gimplify_expr (&alt_bound, &pre, &post, is_gimple_reg,
    +                fb_rvalue);
    +      pop_gimplify_context (NULL);
    +
    +      gimple_seq_add_seq (&pre, post);
    +
    +      gimple_stmt_iterator gsi
    +   = gsi_last_bb (loop_preheader_edge (loop)->src);
    +      gsi_insert_seq_after (&gsi, pre, GSI_CONTINUE_LINKING);
    +    }
     
       transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
       return true;
    @@ -1953,9 +2005,10 @@ transform_to_exit_first_loop (struct loop *loop,
        of LOOP_FN.  N_THREADS is the requested number of threads.  Returns the
        basic block containing GIMPLE_OMP_PARALLEL tree.  */
     
    -static basic_block
    +static void
     create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
    -                 tree new_data, unsigned n_threads, location_t loc)
    +                 tree new_data, unsigned n_threads, location_t loc,
    +                 basic_block region_entry, bool oacc_kernels_p)
     {
       gimple_stmt_iterator gsi;
       basic_block bb, paral_bb, for_bb, ex_bb;
    @@ -1968,19 +2021,79 @@ create_parallel_loop (struct loop *loop, tree 
loop_fn, tree data,
       gomp_continue *omp_cont_stmt;
       tree cvar, cvar_init, initvar, cvar_next, cvar_base, type;
       edge exit, nexit, guard, end, e;
    +  tree for_clauses = NULL_TREE;
     
       /* Prepare the GIMPLE_OMP_PARALLEL statement.  */
       bb = loop_preheader_edge (loop)->src;
       paral_bb = single_pred (bb);
    -  gsi = gsi_last_bb (paral_bb);
    +  if (!oacc_kernels_p)
    +    gsi = gsi_last_bb (paral_bb);
    +  else
    +    /* Make sure the oacc parallel is inserted on top of the oacc kernels
    +       region.  */
    +    gsi = gsi_last_bb (region_entry);
    +
    +  if (!oacc_kernels_p)
    +    {
    +      t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
    +      OMP_CLAUSE_NUM_THREADS_EXPR (t)
    +   = build_int_cst (integer_type_node, n_threads);
    +      omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
    +      gimple_set_location (omp_par_stmt, loc);
    +
    +      gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
    +    }
    +  else
    +    {
    +      /* Create oacc parallel pragma based on oacc kernels pragma.  */
    +      gomp_target *kernels = as_a <gomp_target *> (gsi_stmt (gsi));
    +
    +      gsi_prev (&gsi);
    +      gcall *goacc_kernels = as_a <gcall *> (gsi_stmt (gsi));
     
    -  t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
    -  OMP_CLAUSE_NUM_THREADS_EXPR (t)
    -    = build_int_cst (integer_type_node, n_threads);
    -  omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
    -  gimple_set_location (omp_par_stmt, loc);
    +      tree clauses = gimple_omp_target_clauses (kernels);
    +      /* FIXME: We need a more intelligent mapping onto vector, gangs,
    +    workers.  */
    +      if (1)
    +   {
    +     tree clause = build_omp_clause (gimple_location (kernels),
    +                                     OMP_CLAUSE_NUM_GANGS);
    +     OMP_CLAUSE_NUM_GANGS_EXPR (clause) 
    +       = build_int_cst (integer_type_node, n_threads);
    +     OMP_CLAUSE_CHAIN (clause) = clauses;
    +     clauses = clause;
    +   }
    +      gomp_target *stmt
    +   = gimple_build_omp_target (NULL, GF_OMP_TARGET_KIND_OACC_PARALLEL,
    +                              clauses);
    +      tree child_fn = gimple_omp_target_child_fn (kernels);
    +      gimple_omp_target_set_child_fn (stmt, child_fn);
    +      tree data_arg = gimple_omp_target_data_arg (kernels);
    +      gimple_omp_target_set_data_arg (stmt, data_arg);
    +      tree ganglocal_size = gimple_call_arg (goacc_kernels, /* TODO */ 9);
    +      gimple_omp_target_set_ganglocal_size (stmt, ganglocal_size);
    +
    +      gimple_set_location (stmt, loc);
    +
    +      /* Insert oacc parallel pragma after the oacc kernels pragma.  */
    +      {
    +   gimple_stmt_iterator gsi2;
    +   gsi = gsi_last_bb (region_entry);
    +   gsi2 = gsi;
    +   gsi_prev (&gsi2);
    +
    +   /* Insert pragma acc parallel.  */
    +   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
     
    -  gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
    +   /* Remove GOACC_kernels.  */
    +   replace_uses_by (gimple_vdef (gsi_stmt (gsi2)),
    +                    gimple_vuse (gsi_stmt (gsi2)));
    +   gsi_remove (&gsi2, true);
    +
    +   /* Remove pragma acc kernels.  */
    +   gsi_remove (&gsi2, true);
    +      }
    +    }
     
       /* Initialize NEW_DATA.  */
       if (data)
    @@ -1998,12 +2111,18 @@ create_parallel_loop (struct loop *loop, tree 
loop_fn, tree data,
           gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
         }
     
    -  /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL.  */
    -  bb = split_loop_exit_edge (single_dom_exit (loop));
    -  gsi = gsi_last_bb (bb);
    -  omp_return_stmt1 = gimple_build_omp_return (false);
    -  gimple_set_location (omp_return_stmt1, loc);
    -  gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT);
    +  /* Skip insertion of OMP_RETURN for oacc_kernels_p.  We've already 
generated
    +     one when lowering the oacc kernels directive in
    +     pass_lower_omp/lower_omp (). */
    +  if (!oacc_kernels_p)
    +    {
    +      /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL.  */
    +      bb = split_loop_exit_edge (single_dom_exit (loop));
    +      gsi = gsi_last_bb (bb);
    +      omp_return_stmt1 = gimple_build_omp_return (false);
    +      gimple_set_location (omp_return_stmt1, loc);
    +      gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT);
    +    }
     
       /* Extract data for GIMPLE_OMP_FOR.  */
       gcc_assert (loop->header == single_dom_exit (loop)->src);
    @@ -2060,7 +2179,17 @@ create_parallel_loop (struct loop *loop, tree 
loop_fn, tree data,
       t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE);
       OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
     
    -  for_stmt = gimple_build_omp_for (NULL, GF_OMP_FOR_KIND_FOR, t, 1, NULL);
    +  if (1)
    +    {
    +      /* In combination with the NUM_GANGS on the parallel.  */
    +      for_clauses = build_omp_clause (loc, OMP_CLAUSE_GANG);
    +    }
    +
    +  for_stmt = gimple_build_omp_for (NULL,
    +                              (oacc_kernels_p
    +                               ? GF_OMP_FOR_KIND_OACC_LOOP
    +                               : GF_OMP_FOR_KIND_FOR),
    +                              for_clauses, 1, NULL);
       gimple_set_location (for_stmt, loc);
       gimple_omp_for_set_index (for_stmt, 0, initvar);
       gimple_omp_for_set_initial (for_stmt, 0, cvar_init);
    @@ -2090,8 +2219,6 @@ create_parallel_loop (struct loop *loop, tree 
loop_fn, tree data,
       /* After the above dom info is hosed.  Re-compute it.  */
       free_dominance_info (CDI_DOMINATORS);
       calculate_dominance_info (CDI_DOMINATORS);
    -
    -  return paral_bb;
     }
     
     /* Generates code to execute the iterations of LOOP in N_THREADS
    @@ -2103,7 +2230,8 @@ create_parallel_loop (struct loop *loop, tree 
loop_fn, tree data,
     static void
     gen_parallel_loop (struct loop *loop,
                   reduction_info_table_type *reduction_list,
    -              unsigned n_threads, struct tree_niter_desc *niter)
    +              unsigned n_threads, struct tree_niter_desc *niter,
    +              basic_block region_entry, bool oacc_kernels_p)
     {
       tree many_iterations_cond, type, nit;
       tree arg_struct, new_arg_struct;
    @@ -2184,40 +2312,43 @@ gen_parallel_loop (struct loop *loop,
       if (stmts)
         gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
     
    -  if (loop->inner)
    -    m_p_thread=2;
    -  else
    -    m_p_thread=MIN_PER_THREAD;
    -
    -   many_iterations_cond =
    -     fold_build2 (GE_EXPR, boolean_type_node,
    -                nit, build_int_cst (type, m_p_thread * n_threads));
    -
    -  many_iterations_cond
    -    = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
    -              invert_truthvalue (unshare_expr (niter->may_be_zero)),
    -              many_iterations_cond);
    -  many_iterations_cond
    -    = force_gimple_operand (many_iterations_cond, &stmts, false, 
NULL_TREE);
    -  if (stmts)
    -    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
    -  if (!is_gimple_condexpr (many_iterations_cond))
    +  if (!oacc_kernels_p)
         {
    +      if (loop->inner)
    +   m_p_thread=2;
    +      else
    +   m_p_thread=MIN_PER_THREAD;
    +
    +      many_iterations_cond =
    +   fold_build2 (GE_EXPR, boolean_type_node,
    +                nit, build_int_cst (type, m_p_thread * n_threads));
    +
    +      many_iterations_cond
    +   = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
    +                  invert_truthvalue (unshare_expr (niter->may_be_zero)),
    +                  many_iterations_cond);
           many_iterations_cond
    -   = force_gimple_operand (many_iterations_cond, &stmts,
    -                           true, NULL_TREE);
    +   = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
           if (stmts)
        gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
    -    }
    +      if (!is_gimple_condexpr (many_iterations_cond))
    +   {
    +     many_iterations_cond
    +       = force_gimple_operand (many_iterations_cond, &stmts,
    +                               true, NULL_TREE);
    +     if (stmts)
    +       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), 
stmts);
    +   }
     
    -  initialize_original_copy_tables ();
    +      initialize_original_copy_tables ();
     
    -  /* We assume that the loop usually iterates a lot.  */
    -  prob = 4 * REG_BR_PROB_BASE / 5;
    -  loop_version (loop, many_iterations_cond, NULL,
    -           prob, prob, REG_BR_PROB_BASE - prob, true);
    -  update_ssa (TODO_update_ssa);
    -  free_original_copy_tables ();
    +      /* We assume that the loop usually iterates a lot.  */
    +      prob = 4 * REG_BR_PROB_BASE / 5;
    +      loop_version (loop, many_iterations_cond, NULL,
    +               prob, prob, REG_BR_PROB_BASE - prob, true);
    +      update_ssa (TODO_update_ssa);
    +      free_original_copy_tables ();
    +    }
     
       /* Base all the induction variables in LOOP on a single control one.  */
       canonicalize_loop_ivs (loop, &nit, true);
    @@ -2229,6 +2360,9 @@ gen_parallel_loop (struct loop *loop,
          iterations of the loop by one.  */
       if (!try_transform_to_exit_first_loop_alt (loop, reduction_list, nit))
         {
    +      if (oacc_kernels_p)
    +   n_threads = 1;
    +
           /* Fall back on the method that handles more cases, but duplicates 
the
         loop body: move the exit condition of LOOP to the beginning of its
         header, and duplicate the part of the last iteration that gets disabled
    @@ -2245,19 +2379,34 @@ gen_parallel_loop (struct loop *loop,
       entry = loop_preheader_edge (loop);
       exit = single_dom_exit (loop);
     
    -  eliminate_local_variables (entry, exit);
    -  /* In the old loop, move all variables non-local to the loop to a 
structure
    -     and back, and create separate decls for the variables used in loop.  
*/
    -  separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
    -                       &new_arg_struct, &clsn_data);
    +  /* This rewrites the body in terms of new variables.  This has already
    +     been done for oacc_kernels_p in pass_lower_omp/lower_omp ().  */
    +  if (!oacc_kernels_p)
    +    {
    +      eliminate_local_variables (entry, exit);
    +      /* In the old loop, move all variables non-local to the loop to a
    +    structure and back, and create separate decls for the variables used in
    +    loop.  */
    +      separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
    +                           &new_arg_struct, &clsn_data);
    +    }
    +  else
    +    {
    +      arg_struct = NULL_TREE;
    +      new_arg_struct = NULL_TREE;
    +      clsn_data.load = NULL_TREE;
    +      clsn_data.load_bb = exit->dest;
    +      clsn_data.store = NULL_TREE;
    +      clsn_data.store_bb = NULL;
    +    }
     
       /* Create the parallel constructs.  */
       loc = UNKNOWN_LOCATION;
       cond_stmt = last_stmt (loop->header);
       if (cond_stmt)
         loc = gimple_location (cond_stmt);
    -  create_parallel_loop (loop, create_loop_fn (loc), arg_struct,
    -                   new_arg_struct, n_threads, loc);
    +  create_parallel_loop (loop, create_loop_fn (loc), arg_struct, 
new_arg_struct,
    +                   n_threads, loc, region_entry, oacc_kernels_p);
       if (reduction_list->elements () > 0)
         create_call_for_reduction (loop, reduction_list, &clsn_data);
     
    @@ -2399,7 +2548,8 @@ try_get_loop_niter (loop_p loop, struct 
tree_niter_desc *niter)
     
     static bool
     try_create_reduction_list (loop_p loop,
    -                      reduction_info_table_type *reduction_list)
    +                      reduction_info_table_type *reduction_list,
    +                      bool oacc_kernels_p)
     {
       edge exit = single_dom_exit (loop);
       gphi_iterator gsi;
    @@ -2489,6 +2639,61 @@ try_create_reduction_list (loop_p loop,
         }
     
     
    +  if (oacc_kernels_p)
    +    {
    +      edge e = loop_preheader_edge (loop);
    +
    +      for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi);
    +      gsi_next (&gsi))
    +   {
    +     gphi *phi = gsi.phi ();
    +     tree def = PHI_RESULT (phi);
    +     affine_iv iv;
    +
    +     if (!virtual_operand_p (def) && !simple_iv (loop, loop, def, &iv, true))
    +       {
    +         struct reduction_info *red;
    +         red = reduction_phi (reduction_list, phi);
    +
    +         /* Look for pattern:
    +
    +            <bb preheader>
    +              .omp_data_i = &.omp_data_arr;
    +              addr = .omp_data_i->sum;
    +              sum_a = *addr;
    +
    +            <bb header>:
    +              sum_b = PHI <sum_a (preheader), sum_c (latch)>
    +
    +            and assign addr to reduc->reduc_addr.  */
    +
    +         tree arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
    +         gimple stmt = SSA_NAME_DEF_STMT (arg);
    +         if (!gimple_assign_single_p (stmt))
    +           return false;
    +         tree memref = gimple_assign_rhs1 (stmt);
    +         if (TREE_CODE (memref) != MEM_REF)
    +           return false;
    +         tree addr = TREE_OPERAND (memref, 0);
    +
    +         gimple stmt2 = SSA_NAME_DEF_STMT (addr);
    +         if (!gimple_assign_single_p (stmt2))
    +           return false;
    +         tree compref = gimple_assign_rhs1 (stmt2);
    +         if (TREE_CODE (compref) != COMPONENT_REF)
    +           return false;
    +         tree addr2 = TREE_OPERAND (compref, 0);
    +         if (TREE_CODE (addr2) != MEM_REF)
    +           return false;
    +         addr2 = TREE_OPERAND (addr2, 0);
    +         if (TREE_CODE (addr2) != SSA_NAME
    +             || !gimple_stmt_omp_data_i_init_p (SSA_NAME_DEF_STMT (addr2)))
    +           return false;
    +         red->reduc_addr = addr;
    +       }
    +   }
    +    }
    +
       return true;
     }
     
    @@ -2497,7 +2702,7 @@ try_create_reduction_list (loop_p loop,
        otherwise.  */
     
     static bool
    -parallelize_loops (void)
    +parallelize_loops (bool oacc_kernels_p)
     {
       unsigned n_threads = flag_tree_parallelize_loops;
       bool changed = false;
    @@ -2506,6 +2711,7 @@ parallelize_loops (void)
       struct obstack parloop_obstack;
       HOST_WIDE_INT estimated;
       source_location loop_loc;
    +  basic_block region_entry = NULL;
     
       /* Do not parallelize loops in the functions created by parallelization. 
 */
       if (parallelized_function_p (cfun->decl))
    @@ -2517,9 +2723,29 @@ parallelize_loops (void)
       reduction_info_table_type reduction_list (10);
       init_stmt_vec_info_vec ();
     
    +  calculate_dominance_info (CDI_DOMINATORS);
    +
       FOR_EACH_LOOP (loop, 0)
         {
           reduction_list.empty ();
    +
    +      if (oacc_kernels_p)
    +   {
    +     if (!loop->in_oacc_kernels_region)
    +       continue;
    +
    +     /* TODO: Allow nested loops.  */
    +     if (loop->inner)
    +       continue;
    +
    +     if (dump_file && (dump_flags & TDF_DETAILS))
    +       fprintf (dump_file,
    +                "Trying loop %d with header bb %d in oacc kernels region\n",
    +                loop->num, loop->header->index);
    +
    +     region_entry = loop_get_oacc_kernels_region_entry (loop);
    +   }
    +
           if (dump_file && (dump_flags & TDF_DETAILS))
           {
             fprintf (dump_file, "Trying loop %d as candidate\n",loop->num);
    @@ -2561,6 +2787,7 @@ parallelize_loops (void)
           /* FIXME: Bypass this check as graphite doesn't update the
         count and frequency correctly now.  */
           if (!flag_loop_parallelize_all
    +     && !oacc_kernels_p
          && ((estimated != -1
               && estimated <= (HOST_WIDE_INT) n_threads * MIN_PER_THREAD)
              /* Do not bother with loops in cold areas.  */
    @@ -2570,7 +2797,7 @@ parallelize_loops (void)
           if (!try_get_loop_niter (loop, &niter_desc))
        continue;
     
    -      if (!try_create_reduction_list (loop, &reduction_list))
    +      if (!try_create_reduction_list (loop, &reduction_list, oacc_kernels_p))
        continue;
     
           if (!flag_loop_parallelize_all
    @@ -2589,8 +2816,9 @@ parallelize_loops (void)
          fprintf (dump_file, "\nloop at %s:%d: ",
                   LOCATION_FILE (loop_loc), LOCATION_LINE (loop_loc));
           }
    +
           gen_parallel_loop (loop, &reduction_list,
    -                    n_threads, &niter_desc);
    +                    n_threads, &niter_desc, region_entry, oacc_kernels_p);
         }
     
       free_stmt_vec_info_vec ();
    @@ -2641,7 +2869,7 @@ pass_parallelize_loops::execute (function *fun)
       if (number_of_loops (fun) <= 1)
         return 0;
     
    -  if (parallelize_loops ())
    +  if (parallelize_loops (false))
         {
           fun->curr_properties &= ~(PROP_gimple_eomp);
           return TODO_update_ssa;
    @@ -2657,3 +2885,51 @@ make_pass_parallelize_loops (gcc::context *ctxt)
     {
       return new pass_parallelize_loops (ctxt);
     }
    +
    +namespace {
    +
    +const pass_data pass_data_parallelize_loops_oacc_kernels =
    +{
    +  GIMPLE_PASS, /* type */
    +  "parloops_oacc_kernels", /* name */
    +  OPTGROUP_LOOP, /* optinfo_flags */
    +  TV_TREE_PARALLELIZE_LOOPS, /* tv_id */
    +  ( PROP_cfg | PROP_ssa ), /* properties_required */
    +  0, /* properties_provided */
    +  0, /* properties_destroyed */
    +  0, /* todo_flags_start */
    +  0, /* todo_flags_finish */
    +};
    +
    +class pass_parallelize_loops_oacc_kernels : public gimple_opt_pass
    +{
    +public:
    +  pass_parallelize_loops_oacc_kernels (gcc::context *ctxt)
    +    : gimple_opt_pass (pass_data_parallelize_loops_oacc_kernels, ctxt)
    +  {}
    +
    +  /* opt_pass methods: */
    +  virtual bool gate (function *) { return flag_tree_parallelize_loops > 1; }
    +  virtual unsigned int execute (function *);
    +
    +}; // class pass_parallelize_loops_oacc_kernels
    +
    +unsigned
    +pass_parallelize_loops_oacc_kernels::execute (function *fun)
    +{
    +  if (number_of_loops (fun) <= 1)
    +    return 0;
    +
    +  if (parallelize_loops (true))
    +    return TODO_update_ssa;
    +
    +  return 0;
    +}
    +
    +} // anon namespace
    +
    +gimple_opt_pass *
    +make_pass_parallelize_loops_oacc_kernels (gcc::context *ctxt)
    +{
    +  return new pass_parallelize_loops_oacc_kernels (ctxt);
    +}


Grüße,
 Thomas

Attachment: signature.asc
Description: PGP signature

Reply via email to