Hi Tom!

On Mon, 13 Jul 2015 12:32:20 +0200, Tom de Vries <[email protected]> wrote:
> On 13/07/15 10:31, Thomas Schwinge wrote:
> > On Mon, 13 Jul 2015 09:20:16 +0200, Tom de Vries <[email protected]> wrote:
> > > On 12/07/15 11:39, Thomas Schwinge wrote:
> > > I've looked at the merge commit, gcc/tree-parloops.c was not modified.
> > (Well, it was, but not "substantially".)
> Hmm, the reason why I said tree-parloops.c was not modified, was that
> the git show of your merge commit (which invokes git diff-tree --cc)
> does not show any differences for tree-parloops.c:
> ...
> $ git show 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 | grep tree-parloops.c
> $
> ...
>
> OTOH, if I use --numstat as diff-tree argument, I see:
> ...
> $ git diff-tree --numstat --cc 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 | grep tree-parloops.c
> 7 34 gcc/tree-parloops.c
> ...
>
> I'm not sure if this is expected behaviour.
Yes, I think so, because »--cc [...] compresses the patch output by
omitting uninteresting hunks whose the contents in the parents have only
two variants and the merge result picks one of them without
modification«, and, as I said, for »merge conflicts, I just retained the
code that was present on gomp-4_0-branch already«.
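To summarize (just a rough sketch; MERGE stands in for a merge commit such as the one above):

$ git show MERGE
  # for a merge this implies "git diff-tree --cc"; if all of a file's hunks
  # are "uninteresting" in the above sense, the file does not show up in the
  # patch output at all
$ git diff-tree --cc --numstat MERGE
  # still reports per-file added/removed line counts for such a file

That is why your grep of the default output comes up empty while --numstat does list gcc/tree-parloops.c.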
In contrast, see the -c and -m options (which get passed from git show to
git diff-tree):
$ git show -c 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 -- gcc/tree-parloops.c
commit 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1
Merge: f9d00ca cacef50
Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4>
Date: Sun Jul 12 09:30:39 2015 +0000
svn merge -r 222860:225562 svn+ssh://gcc.gnu.org/svn/gcc/trunk
git-svn-id:
svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@225719
138bc75d-0d04-0410-961f-82ee72b054a4
diff --combined gcc/tree-parloops.c
index 04708c0,846077a..80a215d
--- gcc/tree-parloops.c
+++ gcc/tree-parloops.c
@@@ -22,43 -22,22 +22,22 @@@ along with GCC; see the file COPYING3
#include "config.h"
#include "system.h"
#include "coretypes.h"
- #include "hash-set.h"
- #include "machmode.h"
- #include "vec.h"
- #include "double-int.h"
- #include "input.h"
#include "alias.h"
- #include "symtab.h"
- #include "options.h"
- #include "wide-int.h"
- #include "inchash.h"
+ #include "backend.h"
#include "tree.h"
- #include "fold-const.h"
- #include "predict.h"
- #include "tm.h"
+ #include "gimple.h"
#include "hard-reg-set.h"
- #include "input.h"
- #include "function.h"
- #include "dominance.h"
- #include "cfg.h"
- #include "basic-block.h"
- #include "tree-ssa-alias.h"
+ #include "ssa.h"
+ #include "options.h"
+ #include "fold-const.h"
#include "internal-fn.h"
- #include "gimple-expr.h"
- #include "is-a.h"
- #include "gimple.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "gimple-walk.h"
#include "stor-layout.h"
#include "tree-nested.h"
- #include "gimple-ssa.h"
#include "tree-cfg.h"
- #include "tree-phinodes.h"
- #include "ssa-iterators.h"
- #include "stringpool.h"
- #include "tree-ssanames.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
@@@ -75,8 -54,6 +54,6 @@@
#include "tree-parloops.h"
#include "omp-low.h"
#include "tree-nested.h"
- #include "plugin-api.h"
- #include "ipa-ref.h"
#include "cgraph.h"
#include "tree-ssa.h"
@@@ -218,8 -195,6 +195,8 @@@ struct reduction_inf
of the reduction variable when existing the
loop. */
tree initial_value; /* The initial value of the reduction
var before entering the loop. */
tree field; /* the name of the field in the
parloop data structure intended for reduction. */
+ tree reduc_addr; /* The address of the reduction
variable for
+ openacc reductions. */
tree init; /* reduction initialization value. */
gphi *new_phi; /* (helper field) Newly created phi node whose
result
will be passed to the atomic operation.
Represents
@@@ -229,10 -204,8 +206,8 @@@
/* Reduction info hashtable helpers. */
- struct reduction_hasher : typed_free_remove <reduction_info>
+ struct reduction_hasher : free_ptr_hash <reduction_info>
{
- typedef reduction_info *value_type;
- typedef reduction_info *compare_type;
static inline hashval_t hash (const reduction_info *);
static inline bool equal (const reduction_info *, const reduction_info
*);
};
@@@ -281,10 -254,8 +256,8 @@@ struct name_to_copy_el
/* Name copies hashtable helpers. */
- struct name_to_copy_hasher : typed_free_remove <name_to_copy_elt>
+ struct name_to_copy_hasher : free_ptr_hash <name_to_copy_elt>
{
- typedef name_to_copy_elt *value_type;
- typedef name_to_copy_elt *compare_type;
static inline hashval_t hash (const name_to_copy_elt *);
static inline bool equal (const name_to_copy_elt *, const
name_to_copy_elt *);
};
@@@ -1109,30 -1080,10 +1082,30 @@@ create_call_for_reduction_1 (reduction_
tree tmp_load, name;
gimple load;
- load_struct = build_simple_mem_ref (clsn_data->load);
- t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);
+ if (reduc->reduc_addr == NULL_TREE)
+ {
+ load_struct = build_simple_mem_ref (clsn_data->load);
+ t = build3 (COMPONENT_REF, type, load_struct, reduc->field,
NULL_TREE);
+
+ addr = build_addr (t, current_function_decl);
+ }
+ else
+ {
+ /* Set the address for the atomic store. */
+ addr = reduc->reduc_addr;
+
+ /* Remove the non-atomic store '*addr = sum'. */
+ tree res = PHI_RESULT (reduc->keep_res);
+ use_operand_p use_p;
+ gimple stmt;
+ bool single_use_p = single_imm_use (res, &use_p, &stmt);
+ gcc_assert (single_use_p);
+ replace_uses_by (gimple_vdef (stmt),
+ gimple_vuse (stmt));
+ gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
+ gsi_remove (&gsi, true);
+ }
- addr = build_addr (t, current_function_decl);
/* Create phi node. */
bb = clsn_data->load_bb;
@@@ -1734,15 -1685,10 +1707,15 @@@ transform_to_exit_first_loop_alt (struc
/* Set the latch arguments of the new phis to ivtmp/sum_b. */
flush_pending_stmts (post_inc_edge);
- /* Create a new empty exit block, inbetween the new loop header and the
old
- exit block. The function separate_decls_in_region needs this block
to
- insert code that is active on loop exit, but not any other path. */
- basic_block new_exit_block = split_edge (exit);
+
+ basic_block new_exit_block = NULL;
+ if (!single_pred_p (exit->dest))
+ {
+ /* Create a new empty exit block, inbetween the new loop header and
the
+ old exit block. The function separate_decls_in_region needs this block
+ to insert code that is active on loop exit, but not any other path. */
+ new_exit_block = split_edge (exit);
+ }
/* Insert and register the reduction exit phis. */
for (gphi_iterator gsi = gsi_start_phis (exit_block);
@@@ -1750,24 -1696,17 +1723,24 @@@
gsi_next (&gsi))
{
gphi *phi = gsi.phi ();
+ gphi *nphi = NULL;
tree res_z = PHI_RESULT (phi);
+ tree res_c;
- /* Now that we have a new exit block, duplicate the phi of the old
exit
- block in the new exit block to preserve loop-closed ssa. */
- edge succ_new_exit_block = single_succ_edge (new_exit_block);
- edge pred_new_exit_block = single_pred_edge (new_exit_block);
- tree res_y = copy_ssa_name (res_z, phi);
- gphi *nphi = create_phi_node (res_y, new_exit_block);
- tree res_c = PHI_ARG_DEF_FROM_EDGE (phi, succ_new_exit_block);
- add_phi_arg (nphi, res_c, pred_new_exit_block, UNKNOWN_LOCATION);
- add_phi_arg (phi, res_y, succ_new_exit_block, UNKNOWN_LOCATION);
+ if (new_exit_block != NULL)
+ {
+ /* Now that we have a new exit block, duplicate the phi of the old
+ exit block in the new exit block to preserve loop-closed ssa. */
+ edge succ_new_exit_block = single_succ_edge (new_exit_block);
+ edge pred_new_exit_block = single_pred_edge (new_exit_block);
+ tree res_y = copy_ssa_name (res_z, phi);
+ nphi = create_phi_node (res_y, new_exit_block);
+ res_c = PHI_ARG_DEF_FROM_EDGE (phi, succ_new_exit_block);
+ add_phi_arg (nphi, res_c, pred_new_exit_block, UNKNOWN_LOCATION);
+ add_phi_arg (phi, res_y, succ_new_exit_block, UNKNOWN_LOCATION);
+ }
+ else
+ res_c = PHI_ARG_DEF_FROM_EDGE (phi, exit);
if (virtual_operand_p (res_z))
continue;
@@@ -1775,9 -1714,7 +1748,9 @@@
gimple reduc_phi = SSA_NAME_DEF_STMT (res_c);
struct reduction_info *red = reduction_phi (reduction_list,
reduc_phi);
if (red != NULL)
- red->keep_res = nphi;
+ red->keep_res = (nphi != NULL
+ ? nphi
+ : phi);
}
/* We're going to cancel the loop at the end of gen_parallel_loop, but
until
@@@ -1891,24 -1828,8 +1864,24 @@@ try_transform_to_exit_first_loop_alt (s
alt_bound = op1;
}
+ /* If not found, insert nit + 1. */
if (alt_bound == NULL_TREE)
- return false;
+ {
+ alt_bound = fold_build2 (PLUS_EXPR, nit_type, nit,
+ build_int_cst_type (nit_type, 1));
+
+ gimple_seq pre = NULL, post = NULL;
+ push_gimplify_context (true);
+ gimplify_expr (&alt_bound, &pre, &post, is_gimple_reg,
+ fb_rvalue);
+ pop_gimplify_context (NULL);
+
+ gimple_seq_add_seq (&pre, post);
+
+ gimple_stmt_iterator gsi
+ = gsi_last_bb (loop_preheader_edge (loop)->src);
+ gsi_insert_seq_after (&gsi, pre, GSI_CONTINUE_LINKING);
+ }
transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
return true;
@@@ -2032,10 -1953,9 +2005,10 @@@ transform_to_exit_first_loop (struct lo
of LOOP_FN. N_THREADS is the requested number of threads. Returns the
basic block containing GIMPLE_OMP_PARALLEL tree. */
-static basic_block
+static void
create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
- tree new_data, unsigned n_threads, location_t loc)
+ tree new_data, unsigned n_threads, location_t loc,
+ basic_block region_entry, bool oacc_kernels_p)
{
gimple_stmt_iterator gsi;
basic_block bb, paral_bb, for_bb, ex_bb;
@@@ -2048,79 -1968,19 +2021,79 @@@
gomp_continue *omp_cont_stmt;
tree cvar, cvar_init, initvar, cvar_next, cvar_base, type;
edge exit, nexit, guard, end, e;
+ tree for_clauses = NULL_TREE;
/* Prepare the GIMPLE_OMP_PARALLEL statement. */
bb = loop_preheader_edge (loop)->src;
paral_bb = single_pred (bb);
- gsi = gsi_last_bb (paral_bb);
+ if (!oacc_kernels_p)
+ gsi = gsi_last_bb (paral_bb);
+ else
+ /* Make sure the oacc parallel is inserted on top of the oacc kernels
+ region. */
+ gsi = gsi_last_bb (region_entry);
+
+ if (!oacc_kernels_p)
+ {
+ t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
+ OMP_CLAUSE_NUM_THREADS_EXPR (t)
+ = build_int_cst (integer_type_node, n_threads);
+ omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
+ gimple_set_location (omp_par_stmt, loc);
+
+ gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
+ }
+ else
+ {
+ /* Create oacc parallel pragma based on oacc kernels pragma. */
+ gomp_target *kernels = as_a <gomp_target *> (gsi_stmt (gsi));
+
+ gsi_prev (&gsi);
+ gcall *goacc_kernels = as_a <gcall *> (gsi_stmt (gsi));
- t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
- OMP_CLAUSE_NUM_THREADS_EXPR (t)
- = build_int_cst (integer_type_node, n_threads);
- omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
- gimple_set_location (omp_par_stmt, loc);
+ tree clauses = gimple_omp_target_clauses (kernels);
+ /* FIXME: We need a more intelligent mapping onto vector, gangs,
+ workers. */
+ if (1)
+ {
+ tree clause = build_omp_clause (gimple_location (kernels),
+ OMP_CLAUSE_NUM_GANGS);
+ OMP_CLAUSE_NUM_GANGS_EXPR (clause)
+ = build_int_cst (integer_type_node, n_threads);
+ OMP_CLAUSE_CHAIN (clause) = clauses;
+ clauses = clause;
+ }
+ gomp_target *stmt
+ = gimple_build_omp_target (NULL, GF_OMP_TARGET_KIND_OACC_PARALLEL,
+ clauses);
+ tree child_fn = gimple_omp_target_child_fn (kernels);
+ gimple_omp_target_set_child_fn (stmt, child_fn);
+ tree data_arg = gimple_omp_target_data_arg (kernels);
+ gimple_omp_target_set_data_arg (stmt, data_arg);
+ tree ganglocal_size = gimple_call_arg (goacc_kernels, /* TODO */ 9);
+ gimple_omp_target_set_ganglocal_size (stmt, ganglocal_size);
+
+ gimple_set_location (stmt, loc);
+
+ /* Insert oacc parallel pragma after the oacc kernels pragma. */
+ {
+ gimple_stmt_iterator gsi2;
+ gsi = gsi_last_bb (region_entry);
+ gsi2 = gsi;
+ gsi_prev (&gsi2);
+
+ /* Insert pragma acc parallel. */
+ gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
- gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
+ /* Remove GOACC_kernels. */
+ replace_uses_by (gimple_vdef (gsi_stmt (gsi2)),
+ gimple_vuse (gsi_stmt (gsi2)));
+ gsi_remove (&gsi2, true);
+
+ /* Remove pragma acc kernels. */
+ gsi_remove (&gsi2, true);
+ }
+ }
/* Initialize NEW_DATA. */
if (data)
@@@ -2138,18 -1998,12 +2111,18 @@@
gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
}
- /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL. */
- bb = split_loop_exit_edge (single_dom_exit (loop));
- gsi = gsi_last_bb (bb);
- omp_return_stmt1 = gimple_build_omp_return (false);
- gimple_set_location (omp_return_stmt1, loc);
- gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT);
+ /* Skip insertion of OMP_RETURN for oacc_kernels_p. We've already
generated
+ one when lowering the oacc kernels directive in
+ pass_lower_omp/lower_omp (). */
+ if (!oacc_kernels_p)
+ {
+ /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL. */
+ bb = split_loop_exit_edge (single_dom_exit (loop));
+ gsi = gsi_last_bb (bb);
+ omp_return_stmt1 = gimple_build_omp_return (false);
+ gimple_set_location (omp_return_stmt1, loc);
+ gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT);
+ }
/* Extract data for GIMPLE_OMP_FOR. */
gcc_assert (loop->header == single_dom_exit (loop)->src);
@@@ -2206,17 -2060,7 +2179,17 @@@
t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE);
OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
- for_stmt = gimple_build_omp_for (NULL, GF_OMP_FOR_KIND_FOR, t, 1, NULL);
+ if (1)
+ {
+ /* In combination with the NUM_GANGS on the parallel. */
+ for_clauses = build_omp_clause (loc, OMP_CLAUSE_GANG);
+ }
+
+ for_stmt = gimple_build_omp_for (NULL,
+ (oacc_kernels_p
+ ? GF_OMP_FOR_KIND_OACC_LOOP
+ : GF_OMP_FOR_KIND_FOR),
+ for_clauses, 1, NULL);
gimple_set_location (for_stmt, loc);
gimple_omp_for_set_index (for_stmt, 0, initvar);
gimple_omp_for_set_initial (for_stmt, 0, cvar_init);
@@@ -2246,6 -2090,8 +2219,6 @@@
/* After the above dom info is hosed. Re-compute it. */
free_dominance_info (CDI_DOMINATORS);
calculate_dominance_info (CDI_DOMINATORS);
-
- return paral_bb;
}
/* Generates code to execute the iterations of LOOP in N_THREADS
@@@ -2257,8 -2103,7 +2230,8 @@@
static void
gen_parallel_loop (struct loop *loop,
reduction_info_table_type *reduction_list,
- unsigned n_threads, struct tree_niter_desc *niter)
+ unsigned n_threads, struct tree_niter_desc *niter,
+ basic_block region_entry, bool oacc_kernels_p)
{
tree many_iterations_cond, type, nit;
tree arg_struct, new_arg_struct;
@@@ -2339,43 -2184,40 +2312,43 @@@
if (stmts)
gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
- if (loop->inner)
- m_p_thread=2;
- else
- m_p_thread=MIN_PER_THREAD;
-
- many_iterations_cond =
- fold_build2 (GE_EXPR, boolean_type_node,
- nit, build_int_cst (type, m_p_thread * n_threads));
-
- many_iterations_cond
- = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
- invert_truthvalue (unshare_expr (niter->may_be_zero)),
- many_iterations_cond);
- many_iterations_cond
- = force_gimple_operand (many_iterations_cond, &stmts, false,
NULL_TREE);
- if (stmts)
- gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
- if (!is_gimple_condexpr (many_iterations_cond))
+ if (!oacc_kernels_p)
{
+ if (loop->inner)
+ m_p_thread=2;
+ else
+ m_p_thread=MIN_PER_THREAD;
+
+ many_iterations_cond =
+ fold_build2 (GE_EXPR, boolean_type_node,
+ nit, build_int_cst (type, m_p_thread * n_threads));
+
+ many_iterations_cond
+ = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
+ invert_truthvalue (unshare_expr (niter->may_be_zero)),
+ many_iterations_cond);
many_iterations_cond
- = force_gimple_operand (many_iterations_cond, &stmts,
- true, NULL_TREE);
+ = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
if (stmts)
gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
- }
+ if (!is_gimple_condexpr (many_iterations_cond))
+ {
+ many_iterations_cond
+ = force_gimple_operand (many_iterations_cond, &stmts,
+ true, NULL_TREE);
+ if (stmts)
+ gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
stmts);
+ }
- initialize_original_copy_tables ();
+ initialize_original_copy_tables ();
- /* We assume that the loop usually iterates a lot. */
- prob = 4 * REG_BR_PROB_BASE / 5;
- loop_version (loop, many_iterations_cond, NULL,
- prob, prob, REG_BR_PROB_BASE - prob, true);
- update_ssa (TODO_update_ssa);
- free_original_copy_tables ();
+ /* We assume that the loop usually iterates a lot. */
+ prob = 4 * REG_BR_PROB_BASE / 5;
+ loop_version (loop, many_iterations_cond, NULL,
+ prob, prob, REG_BR_PROB_BASE - prob, true);
+ update_ssa (TODO_update_ssa);
+ free_original_copy_tables ();
+ }
/* Base all the induction variables in LOOP on a single control one. */
canonicalize_loop_ivs (loop, &nit, true);
@@@ -2387,9 -2229,6 +2360,9 @@@
iterations of the loop by one. */
if (!try_transform_to_exit_first_loop_alt (loop, reduction_list, nit))
{
+ if (oacc_kernels_p)
+ n_threads = 1;
+
/* Fall back on the method that handles more cases, but duplicates
the
loop body: move the exit condition of LOOP to the beginning of its
header, and duplicate the part of the last iteration that gets disabled
@@@ -2406,34 -2245,19 +2379,34 @@@
entry = loop_preheader_edge (loop);
exit = single_dom_exit (loop);
- eliminate_local_variables (entry, exit);
- /* In the old loop, move all variables non-local to the loop to a
structure
- and back, and create separate decls for the variables used in loop.
*/
- separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
- &new_arg_struct, &clsn_data);
+ /* This rewrites the body in terms of new variables. This has already
+ been done for oacc_kernels_p in pass_lower_omp/lower_omp (). */
+ if (!oacc_kernels_p)
+ {
+ eliminate_local_variables (entry, exit);
+ /* In the old loop, move all variables non-local to the loop to a
+ structure and back, and create separate decls for the variables used in
+ loop. */
+ separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
+ &new_arg_struct, &clsn_data);
+ }
+ else
+ {
+ arg_struct = NULL_TREE;
+ new_arg_struct = NULL_TREE;
+ clsn_data.load = NULL_TREE;
+ clsn_data.load_bb = exit->dest;
+ clsn_data.store = NULL_TREE;
+ clsn_data.store_bb = NULL;
+ }
/* Create the parallel constructs. */
loc = UNKNOWN_LOCATION;
cond_stmt = last_stmt (loop->header);
if (cond_stmt)
loc = gimple_location (cond_stmt);
- create_parallel_loop (loop, create_loop_fn (loc), arg_struct,
- new_arg_struct, n_threads, loc);
+ create_parallel_loop (loop, create_loop_fn (loc), arg_struct,
new_arg_struct,
+ n_threads, loc, region_entry, oacc_kernels_p);
if (reduction_list->elements () > 0)
create_call_for_reduction (loop, reduction_list, &clsn_data);
@@@ -2575,8 -2399,7 +2548,8 @@@ try_get_loop_niter (loop_p loop, struc
static bool
try_create_reduction_list (loop_p loop,
- reduction_info_table_type *reduction_list)
+ reduction_info_table_type *reduction_list,
+ bool oacc_kernels_p)
{
edge exit = single_dom_exit (loop);
gphi_iterator gsi;
@@@ -2666,61 -2489,6 +2639,61 @@@
}
+ if (oacc_kernels_p)
+ {
+ edge e = loop_preheader_edge (loop);
+
+ for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi);
+ gsi_next (&gsi))
+ {
+ gphi *phi = gsi.phi ();
+ tree def = PHI_RESULT (phi);
+ affine_iv iv;
+
+ if (!virtual_operand_p (def) && !simple_iv (loop, loop, def, &iv,
true))
+ {
+ struct reduction_info *red;
+ red = reduction_phi (reduction_list, phi);
+
+ /* Look for pattern:
+
+ <bb preheader>
+ .omp_data_i = &.omp_data_arr;
+ addr = .omp_data_i->sum;
+ sum_a = *addr;
+
+ <bb header>:
+ sum_b = PHI <sum_a (preheader), sum_c (latch)>
+
+ and assign addr to reduc->reduc_addr. */
+
+ tree arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
+ gimple stmt = SSA_NAME_DEF_STMT (arg);
+ if (!gimple_assign_single_p (stmt))
+ return false;
+ tree memref = gimple_assign_rhs1 (stmt);
+ if (TREE_CODE (memref) != MEM_REF)
+ return false;
+ tree addr = TREE_OPERAND (memref, 0);
+
+ gimple stmt2 = SSA_NAME_DEF_STMT (addr);
+ if (!gimple_assign_single_p (stmt2))
+ return false;
+ tree compref = gimple_assign_rhs1 (stmt2);
+ if (TREE_CODE (compref) != COMPONENT_REF)
+ return false;
+ tree addr2 = TREE_OPERAND (compref, 0);
+ if (TREE_CODE (addr2) != MEM_REF)
+ return false;
+ addr2 = TREE_OPERAND (addr2, 0);
+ if (TREE_CODE (addr2) != SSA_NAME
+ || !gimple_stmt_omp_data_i_init_p (SSA_NAME_DEF_STMT (addr2)))
+ return false;
+ red->reduc_addr = addr;
+ }
+ }
+ }
+
return true;
}
@@@ -2729,7 -2497,7 +2702,7 @@@
otherwise. */
static bool
-parallelize_loops (void)
+parallelize_loops (bool oacc_kernels_p)
{
unsigned n_threads = flag_tree_parallelize_loops;
bool changed = false;
@@@ -2738,7 -2506,6 +2711,7 @@@
struct obstack parloop_obstack;
HOST_WIDE_INT estimated;
source_location loop_loc;
+ basic_block region_entry = NULL;
/* Do not parallelize loops in the functions created by
parallelization. */
if (parallelized_function_p (cfun->decl))
@@@ -2750,29 -2517,9 +2723,29 @@@
reduction_info_table_type reduction_list (10);
init_stmt_vec_info_vec ();
+ calculate_dominance_info (CDI_DOMINATORS);
+
FOR_EACH_LOOP (loop, 0)
{
reduction_list.empty ();
+
+ if (oacc_kernels_p)
+ {
+ if (!loop->in_oacc_kernels_region)
+ continue;
+
+ /* TODO: Allow nested loops. */
+ if (loop->inner)
+ continue;
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file,
+ "Trying loop %d with header bb %d in oacc kernels
region\n",
+ loop->num, loop->header->index);
+
+ region_entry = loop_get_oacc_kernels_region_entry (loop);
+ }
+
if (dump_file && (dump_flags & TDF_DETAILS))
{
fprintf (dump_file, "Trying loop %d as candidate\n",loop->num);
@@@ -2814,7 -2561,6 +2787,7 @@@
/* FIXME: Bypass this check as graphite doesn't update the
count and frequency correctly now. */
if (!flag_loop_parallelize_all
+ && !oacc_kernels_p
&& ((estimated != -1
&& estimated <= (HOST_WIDE_INT) n_threads * MIN_PER_THREAD)
/* Do not bother with loops in cold areas. */
@@@ -2824,7 -2570,7 +2797,7 @@@
if (!try_get_loop_niter (loop, &niter_desc))
continue;
- if (!try_create_reduction_list (loop, &reduction_list))
+ if (!try_create_reduction_list (loop, &reduction_list,
oacc_kernels_p))
continue;
if (!flag_loop_parallelize_all
@@@ -2843,9 -2589,8 +2816,9 @@@
fprintf (dump_file, "\nloop at %s:%d: ",
LOCATION_FILE (loop_loc), LOCATION_LINE (loop_loc));
}
+
gen_parallel_loop (loop, &reduction_list,
- n_threads, &niter_desc);
+ n_threads, &niter_desc, region_entry, oacc_kernels_p);
}
free_stmt_vec_info_vec ();
@@@ -2896,7 -2641,7 +2869,7 @@@ pass_parallelize_loops::execute (functi
if (number_of_loops (fun) <= 1)
return 0;
- if (parallelize_loops ())
+ if (parallelize_loops (false))
{
fun->curr_properties &= ~(PROP_gimple_eomp);
return TODO_update_ssa;
@@@ -2912,51 -2657,3 +2885,51 @@@ make_pass_parallelize_loops (gcc::conte
{
return new pass_parallelize_loops (ctxt);
}
+
+namespace {
+
+const pass_data pass_data_parallelize_loops_oacc_kernels =
+{
+ GIMPLE_PASS, /* type */
+ "parloops_oacc_kernels", /* name */
+ OPTGROUP_LOOP, /* optinfo_flags */
+ TV_TREE_PARALLELIZE_LOOPS, /* tv_id */
+ ( PROP_cfg | PROP_ssa ), /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ 0, /* todo_flags_finish */
+};
+
+class pass_parallelize_loops_oacc_kernels : public gimple_opt_pass
+{
+public:
+ pass_parallelize_loops_oacc_kernels (gcc::context *ctxt)
+ : gimple_opt_pass (pass_data_parallelize_loops_oacc_kernels, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ virtual bool gate (function *) { return flag_tree_parallelize_loops >
1; }
+ virtual unsigned int execute (function *);
+
+}; // class pass_parallelize_loops_oacc_kernels
+
+unsigned
+pass_parallelize_loops_oacc_kernels::execute (function *fun)
+{
+ if (number_of_loops (fun) <= 1)
+ return 0;
+
+ if (parallelize_loops (true))
+ return TODO_update_ssa;
+
+ return 0;
+}
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_parallelize_loops_oacc_kernels (gcc::context *ctxt)
+{
+ return new pass_parallelize_loops_oacc_kernels (ctxt);
+}
..., and:
$ git show -m 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 -- gcc/tree-parloops.c
commit 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 (from f9d00ca614a8dc28f21ab4a16d7cdbbe16668ca3)
Merge: f9d00ca cacef50
Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4>
Date: Sun Jul 12 09:30:39 2015 +0000
svn merge -r 222860:225562 svn+ssh://gcc.gnu.org/svn/gcc/trunk
git-svn-id:
svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@225719
138bc75d-0d04-0410-961f-82ee72b054a4
diff --git gcc/tree-parloops.c gcc/tree-parloops.c
index 04708c0..80a215d 100644
--- gcc/tree-parloops.c
+++ gcc/tree-parloops.c
@@ -22,43 +22,22 @@ along with GCC; see the file COPYING3. If not see
#include "config.h"
#include "system.h"
#include "coretypes.h"
-#include "hash-set.h"
-#include "machmode.h"
-#include "vec.h"
-#include "double-int.h"
-#include "input.h"
#include "alias.h"
-#include "symtab.h"
-#include "options.h"
-#include "wide-int.h"
-#include "inchash.h"
+#include "backend.h"
#include "tree.h"
-#include "fold-const.h"
-#include "predict.h"
-#include "tm.h"
+#include "gimple.h"
#include "hard-reg-set.h"
-#include "input.h"
-#include "function.h"
-#include "dominance.h"
-#include "cfg.h"
-#include "basic-block.h"
-#include "tree-ssa-alias.h"
+#include "ssa.h"
+#include "options.h"
+#include "fold-const.h"
#include "internal-fn.h"
-#include "gimple-expr.h"
-#include "is-a.h"
-#include "gimple.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "gimple-walk.h"
#include "stor-layout.h"
#include "tree-nested.h"
-#include "gimple-ssa.h"
#include "tree-cfg.h"
-#include "tree-phinodes.h"
-#include "ssa-iterators.h"
-#include "stringpool.h"
-#include "tree-ssanames.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
@@ -75,8 +54,6 @@ along with GCC; see the file COPYING3. If not see
#include "tree-parloops.h"
#include "omp-low.h"
#include "tree-nested.h"
-#include "plugin-api.h"
-#include "ipa-ref.h"
#include "cgraph.h"
#include "tree-ssa.h"
@@ -229,10 +206,8 @@ struct reduction_info
/* Reduction info hashtable helpers. */
-struct reduction_hasher : typed_free_remove <reduction_info>
+struct reduction_hasher : free_ptr_hash <reduction_info>
{
- typedef reduction_info *value_type;
- typedef reduction_info *compare_type;
static inline hashval_t hash (const reduction_info *);
static inline bool equal (const reduction_info *, const reduction_info
*);
};
@@ -281,10 +256,8 @@ struct name_to_copy_elt
/* Name copies hashtable helpers. */
-struct name_to_copy_hasher : typed_free_remove <name_to_copy_elt>
+struct name_to_copy_hasher : free_ptr_hash <name_to_copy_elt>
{
- typedef name_to_copy_elt *value_type;
- typedef name_to_copy_elt *compare_type;
static inline hashval_t hash (const name_to_copy_elt *);
static inline bool equal (const name_to_copy_elt *, const
name_to_copy_elt *);
};
commit 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 (from cacef506e4205bac13a0dd1de238d1a8cc78af28)
Merge: f9d00ca cacef50
Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4>
Date: Sun Jul 12 09:30:39 2015 +0000
svn merge -r 222860:225562 svn+ssh://gcc.gnu.org/svn/gcc/trunk
git-svn-id:
svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@225719
138bc75d-0d04-0410-961f-82ee72b054a4
diff --git gcc/tree-parloops.c gcc/tree-parloops.c
index 846077a..80a215d 100644
--- gcc/tree-parloops.c
+++ gcc/tree-parloops.c
@@ -195,6 +195,8 @@ struct reduction_info
of the reduction variable when existing the
loop. */
tree initial_value; /* The initial value of the reduction
var before entering the loop. */
tree field; /* the name of the field in the
parloop data structure intended for reduction. */
+ tree reduc_addr; /* The address of the reduction variable for
+ openacc reductions. */
tree init; /* reduction initialization value. */
gphi *new_phi; /* (helper field) Newly created phi node whose
result
will be passed to the atomic operation.
Represents
@@ -1080,10 +1082,30 @@ create_call_for_reduction_1 (reduction_info **slot,
struct clsn_data *clsn_data)
tree tmp_load, name;
gimple load;
- load_struct = build_simple_mem_ref (clsn_data->load);
- t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);
+ if (reduc->reduc_addr == NULL_TREE)
+ {
+ load_struct = build_simple_mem_ref (clsn_data->load);
+ t = build3 (COMPONENT_REF, type, load_struct, reduc->field,
NULL_TREE);
+
+ addr = build_addr (t, current_function_decl);
+ }
+ else
+ {
+ /* Set the address for the atomic store. */
+ addr = reduc->reduc_addr;
+
+ /* Remove the non-atomic store '*addr = sum'. */
+ tree res = PHI_RESULT (reduc->keep_res);
+ use_operand_p use_p;
+ gimple stmt;
+ bool single_use_p = single_imm_use (res, &use_p, &stmt);
+ gcc_assert (single_use_p);
+ replace_uses_by (gimple_vdef (stmt),
+ gimple_vuse (stmt));
+ gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
+ gsi_remove (&gsi, true);
+ }
- addr = build_addr (t, current_function_decl);
/* Create phi node. */
bb = clsn_data->load_bb;
@@ -1685,10 +1707,15 @@ transform_to_exit_first_loop_alt (struct loop *loop,
/* Set the latch arguments of the new phis to ivtmp/sum_b. */
flush_pending_stmts (post_inc_edge);
- /* Create a new empty exit block, inbetween the new loop header and the
old
- exit block. The function separate_decls_in_region needs this block to
- insert code that is active on loop exit, but not any other path. */
- basic_block new_exit_block = split_edge (exit);
+
+ basic_block new_exit_block = NULL;
+ if (!single_pred_p (exit->dest))
+ {
+ /* Create a new empty exit block, inbetween the new loop header and
the
+ old exit block. The function separate_decls_in_region needs this block
+ to insert code that is active on loop exit, but not any other path. */
+ new_exit_block = split_edge (exit);
+ }
/* Insert and register the reduction exit phis. */
for (gphi_iterator gsi = gsi_start_phis (exit_block);
@@ -1696,17 +1723,24 @@ transform_to_exit_first_loop_alt (struct loop *loop,
gsi_next (&gsi))
{
gphi *phi = gsi.phi ();
+ gphi *nphi = NULL;
tree res_z = PHI_RESULT (phi);
+ tree res_c;
- /* Now that we have a new exit block, duplicate the phi of the old
exit
- block in the new exit block to preserve loop-closed ssa. */
- edge succ_new_exit_block = single_succ_edge (new_exit_block);
- edge pred_new_exit_block = single_pred_edge (new_exit_block);
- tree res_y = copy_ssa_name (res_z, phi);
- gphi *nphi = create_phi_node (res_y, new_exit_block);
- tree res_c = PHI_ARG_DEF_FROM_EDGE (phi, succ_new_exit_block);
- add_phi_arg (nphi, res_c, pred_new_exit_block, UNKNOWN_LOCATION);
- add_phi_arg (phi, res_y, succ_new_exit_block, UNKNOWN_LOCATION);
+ if (new_exit_block != NULL)
+ {
+ /* Now that we have a new exit block, duplicate the phi of the old
+ exit block in the new exit block to preserve loop-closed ssa. */
+ edge succ_new_exit_block = single_succ_edge (new_exit_block);
+ edge pred_new_exit_block = single_pred_edge (new_exit_block);
+ tree res_y = copy_ssa_name (res_z, phi);
+ nphi = create_phi_node (res_y, new_exit_block);
+ res_c = PHI_ARG_DEF_FROM_EDGE (phi, succ_new_exit_block);
+ add_phi_arg (nphi, res_c, pred_new_exit_block, UNKNOWN_LOCATION);
+ add_phi_arg (phi, res_y, succ_new_exit_block, UNKNOWN_LOCATION);
+ }
+ else
+ res_c = PHI_ARG_DEF_FROM_EDGE (phi, exit);
if (virtual_operand_p (res_z))
continue;
@@ -1714,7 +1748,9 @@ transform_to_exit_first_loop_alt (struct loop *loop,
gimple reduc_phi = SSA_NAME_DEF_STMT (res_c);
struct reduction_info *red = reduction_phi (reduction_list,
reduc_phi);
if (red != NULL)
- red->keep_res = nphi;
+ red->keep_res = (nphi != NULL
+ ? nphi
+ : phi);
}
/* We're going to cancel the loop at the end of gen_parallel_loop, but
until
@@ -1828,8 +1864,24 @@ try_transform_to_exit_first_loop_alt (struct loop
*loop,
alt_bound = op1;
}
+ /* If not found, insert nit + 1. */
if (alt_bound == NULL_TREE)
- return false;
+ {
+ alt_bound = fold_build2 (PLUS_EXPR, nit_type, nit,
+ build_int_cst_type (nit_type, 1));
+
+ gimple_seq pre = NULL, post = NULL;
+ push_gimplify_context (true);
+ gimplify_expr (&alt_bound, &pre, &post, is_gimple_reg,
+ fb_rvalue);
+ pop_gimplify_context (NULL);
+
+ gimple_seq_add_seq (&pre, post);
+
+ gimple_stmt_iterator gsi
+ = gsi_last_bb (loop_preheader_edge (loop)->src);
+ gsi_insert_seq_after (&gsi, pre, GSI_CONTINUE_LINKING);
+ }
transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
return true;
@@ -1953,9 +2005,10 @@ transform_to_exit_first_loop (struct loop *loop,
of LOOP_FN. N_THREADS is the requested number of threads. Returns the
basic block containing GIMPLE_OMP_PARALLEL tree. */
-static basic_block
+static void
create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
- tree new_data, unsigned n_threads, location_t loc)
+ tree new_data, unsigned n_threads, location_t loc,
+ basic_block region_entry, bool oacc_kernels_p)
{
gimple_stmt_iterator gsi;
basic_block bb, paral_bb, for_bb, ex_bb;
@@ -1968,19 +2021,79 @@ create_parallel_loop (struct loop *loop, tree
loop_fn, tree data,
gomp_continue *omp_cont_stmt;
tree cvar, cvar_init, initvar, cvar_next, cvar_base, type;
edge exit, nexit, guard, end, e;
+ tree for_clauses = NULL_TREE;
/* Prepare the GIMPLE_OMP_PARALLEL statement. */
bb = loop_preheader_edge (loop)->src;
paral_bb = single_pred (bb);
- gsi = gsi_last_bb (paral_bb);
+ if (!oacc_kernels_p)
+ gsi = gsi_last_bb (paral_bb);
+ else
+ /* Make sure the oacc parallel is inserted on top of the oacc kernels
+ region. */
+ gsi = gsi_last_bb (region_entry);
+
+ if (!oacc_kernels_p)
+ {
+ t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
+ OMP_CLAUSE_NUM_THREADS_EXPR (t)
+ = build_int_cst (integer_type_node, n_threads);
+ omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
+ gimple_set_location (omp_par_stmt, loc);
+
+ gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
+ }
+ else
+ {
+ /* Create oacc parallel pragma based on oacc kernels pragma. */
+ gomp_target *kernels = as_a <gomp_target *> (gsi_stmt (gsi));
+
+ gsi_prev (&gsi);
+ gcall *goacc_kernels = as_a <gcall *> (gsi_stmt (gsi));
- t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
- OMP_CLAUSE_NUM_THREADS_EXPR (t)
- = build_int_cst (integer_type_node, n_threads);
- omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
- gimple_set_location (omp_par_stmt, loc);
+ tree clauses = gimple_omp_target_clauses (kernels);
+ /* FIXME: We need a more intelligent mapping onto vector, gangs,
+ workers. */
+ if (1)
+ {
+ tree clause = build_omp_clause (gimple_location (kernels),
+ OMP_CLAUSE_NUM_GANGS);
+ OMP_CLAUSE_NUM_GANGS_EXPR (clause)
+ = build_int_cst (integer_type_node, n_threads);
+ OMP_CLAUSE_CHAIN (clause) = clauses;
+ clauses = clause;
+ }
+ gomp_target *stmt
+ = gimple_build_omp_target (NULL, GF_OMP_TARGET_KIND_OACC_PARALLEL,
+ clauses);
+ tree child_fn = gimple_omp_target_child_fn (kernels);
+ gimple_omp_target_set_child_fn (stmt, child_fn);
+ tree data_arg = gimple_omp_target_data_arg (kernels);
+ gimple_omp_target_set_data_arg (stmt, data_arg);
+ tree ganglocal_size = gimple_call_arg (goacc_kernels, /* TODO */ 9);
+ gimple_omp_target_set_ganglocal_size (stmt, ganglocal_size);
+
+ gimple_set_location (stmt, loc);
+
+ /* Insert oacc parallel pragma after the oacc kernels pragma. */
+ {
+ gimple_stmt_iterator gsi2;
+ gsi = gsi_last_bb (region_entry);
+ gsi2 = gsi;
+ gsi_prev (&gsi2);
+
+ /* Insert pragma acc parallel. */
+ gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
- gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
+ /* Remove GOACC_kernels. */
+ replace_uses_by (gimple_vdef (gsi_stmt (gsi2)),
+ gimple_vuse (gsi_stmt (gsi2)));
+ gsi_remove (&gsi2, true);
+
+ /* Remove pragma acc kernels. */
+ gsi_remove (&gsi2, true);
+ }
+ }
/* Initialize NEW_DATA. */
if (data)
@@ -1998,12 +2111,18 @@ create_parallel_loop (struct loop *loop, tree
loop_fn, tree data,
gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
}
- /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL. */
- bb = split_loop_exit_edge (single_dom_exit (loop));
- gsi = gsi_last_bb (bb);
- omp_return_stmt1 = gimple_build_omp_return (false);
- gimple_set_location (omp_return_stmt1, loc);
- gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT);
+ /* Skip insertion of OMP_RETURN for oacc_kernels_p. We've already
generated
+ one when lowering the oacc kernels directive in
+ pass_lower_omp/lower_omp (). */
+ if (!oacc_kernels_p)
+ {
+ /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL. */
+ bb = split_loop_exit_edge (single_dom_exit (loop));
+ gsi = gsi_last_bb (bb);
+ omp_return_stmt1 = gimple_build_omp_return (false);
+ gimple_set_location (omp_return_stmt1, loc);
+ gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT);
+ }
/* Extract data for GIMPLE_OMP_FOR. */
gcc_assert (loop->header == single_dom_exit (loop)->src);
@@ -2060,7 +2179,17 @@ create_parallel_loop (struct loop *loop, tree
loop_fn, tree data,
t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE);
OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
- for_stmt = gimple_build_omp_for (NULL, GF_OMP_FOR_KIND_FOR, t, 1, NULL);
+ if (1)
+ {
+ /* In combination with the NUM_GANGS on the parallel. */
+ for_clauses = build_omp_clause (loc, OMP_CLAUSE_GANG);
+ }
+
+ for_stmt = gimple_build_omp_for (NULL,
+ (oacc_kernels_p
+ ? GF_OMP_FOR_KIND_OACC_LOOP
+ : GF_OMP_FOR_KIND_FOR),
+ for_clauses, 1, NULL);
gimple_set_location (for_stmt, loc);
gimple_omp_for_set_index (for_stmt, 0, initvar);
gimple_omp_for_set_initial (for_stmt, 0, cvar_init);
@@ -2090,8 +2219,6 @@ create_parallel_loop (struct loop *loop, tree
loop_fn, tree data,
/* After the above dom info is hosed. Re-compute it. */
free_dominance_info (CDI_DOMINATORS);
calculate_dominance_info (CDI_DOMINATORS);
-
- return paral_bb;
}
/* Generates code to execute the iterations of LOOP in N_THREADS
@@ -2103,7 +2230,8 @@ create_parallel_loop (struct loop *loop, tree
loop_fn, tree data,
static void
gen_parallel_loop (struct loop *loop,
reduction_info_table_type *reduction_list,
- unsigned n_threads, struct tree_niter_desc *niter)
+ unsigned n_threads, struct tree_niter_desc *niter,
+ basic_block region_entry, bool oacc_kernels_p)
{
tree many_iterations_cond, type, nit;
tree arg_struct, new_arg_struct;
@@ -2184,40 +2312,43 @@ gen_parallel_loop (struct loop *loop,
if (stmts)
gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
- if (loop->inner)
- m_p_thread=2;
- else
- m_p_thread=MIN_PER_THREAD;
-
- many_iterations_cond =
- fold_build2 (GE_EXPR, boolean_type_node,
- nit, build_int_cst (type, m_p_thread * n_threads));
-
- many_iterations_cond
- = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
- invert_truthvalue (unshare_expr (niter->may_be_zero)),
- many_iterations_cond);
- many_iterations_cond
- = force_gimple_operand (many_iterations_cond, &stmts, false,
NULL_TREE);
- if (stmts)
- gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
- if (!is_gimple_condexpr (many_iterations_cond))
+ if (!oacc_kernels_p)
{
+ if (loop->inner)
+ m_p_thread=2;
+ else
+ m_p_thread=MIN_PER_THREAD;
+
+ many_iterations_cond =
+ fold_build2 (GE_EXPR, boolean_type_node,
+ nit, build_int_cst (type, m_p_thread * n_threads));
+
+ many_iterations_cond
+ = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
+ invert_truthvalue (unshare_expr (niter->may_be_zero)),
+ many_iterations_cond);
many_iterations_cond
- = force_gimple_operand (many_iterations_cond, &stmts,
- true, NULL_TREE);
+ = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
if (stmts)
gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
- }
+ if (!is_gimple_condexpr (many_iterations_cond))
+ {
+ many_iterations_cond
+ = force_gimple_operand (many_iterations_cond, &stmts,
+ true, NULL_TREE);
+ if (stmts)
+ gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
stmts);
+ }
- initialize_original_copy_tables ();
+ initialize_original_copy_tables ();
- /* We assume that the loop usually iterates a lot. */
- prob = 4 * REG_BR_PROB_BASE / 5;
- loop_version (loop, many_iterations_cond, NULL,
- prob, prob, REG_BR_PROB_BASE - prob, true);
- update_ssa (TODO_update_ssa);
- free_original_copy_tables ();
+ /* We assume that the loop usually iterates a lot. */
+ prob = 4 * REG_BR_PROB_BASE / 5;
+ loop_version (loop, many_iterations_cond, NULL,
+ prob, prob, REG_BR_PROB_BASE - prob, true);
+ update_ssa (TODO_update_ssa);
+ free_original_copy_tables ();
+ }
/* Base all the induction variables in LOOP on a single control one. */
canonicalize_loop_ivs (loop, &nit, true);
@@ -2229,6 +2360,9 @@ gen_parallel_loop (struct loop *loop,
iterations of the loop by one. */
if (!try_transform_to_exit_first_loop_alt (loop, reduction_list, nit))
{
+ if (oacc_kernels_p)
+ n_threads = 1;
+
/* Fall back on the method that handles more cases, but duplicates
the
loop body: move the exit condition of LOOP to the beginning of its
header, and duplicate the part of the last iteration that gets disabled
@@ -2245,19 +2379,34 @@ gen_parallel_loop (struct loop *loop,
entry = loop_preheader_edge (loop);
exit = single_dom_exit (loop);
- eliminate_local_variables (entry, exit);
- /* In the old loop, move all variables non-local to the loop to a
structure
- and back, and create separate decls for the variables used in loop.
*/
- separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
- &new_arg_struct, &clsn_data);
+ /* This rewrites the body in terms of new variables. This has already
+ been done for oacc_kernels_p in pass_lower_omp/lower_omp (). */
+ if (!oacc_kernels_p)
+ {
+ eliminate_local_variables (entry, exit);
+ /* In the old loop, move all variables non-local to the loop to a
+ structure and back, and create separate decls for the variables used in
+ loop. */
+ separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
+ &new_arg_struct, &clsn_data);
+ }
+ else
+ {
+ arg_struct = NULL_TREE;
+ new_arg_struct = NULL_TREE;
+ clsn_data.load = NULL_TREE;
+ clsn_data.load_bb = exit->dest;
+ clsn_data.store = NULL_TREE;
+ clsn_data.store_bb = NULL;
+ }
/* Create the parallel constructs. */
loc = UNKNOWN_LOCATION;
cond_stmt = last_stmt (loop->header);
if (cond_stmt)
loc = gimple_location (cond_stmt);
- create_parallel_loop (loop, create_loop_fn (loc), arg_struct,
- new_arg_struct, n_threads, loc);
+ create_parallel_loop (loop, create_loop_fn (loc), arg_struct,
new_arg_struct,
+ n_threads, loc, region_entry, oacc_kernels_p);
if (reduction_list->elements () > 0)
create_call_for_reduction (loop, reduction_list, &clsn_data);
@@ -2399,7 +2548,8 @@ try_get_loop_niter (loop_p loop, struct
tree_niter_desc *niter)
static bool
try_create_reduction_list (loop_p loop,
- reduction_info_table_type *reduction_list)
+ reduction_info_table_type *reduction_list,
+ bool oacc_kernels_p)
{
edge exit = single_dom_exit (loop);
gphi_iterator gsi;
@@ -2489,6 +2639,61 @@ try_create_reduction_list (loop_p loop,
}
+ if (oacc_kernels_p)
+ {
+ edge e = loop_preheader_edge (loop);
+
+ for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi);
+ gsi_next (&gsi))
+ {
+ gphi *phi = gsi.phi ();
+ tree def = PHI_RESULT (phi);
+ affine_iv iv;
+
+ if (!virtual_operand_p (def) && !simple_iv (loop, loop, def, &iv,
true))
+ {
+ struct reduction_info *red;
+ red = reduction_phi (reduction_list, phi);
+
+ /* Look for pattern:
+
+ <bb preheader>
+ .omp_data_i = &.omp_data_arr;
+ addr = .omp_data_i->sum;
+ sum_a = *addr;
+
+ <bb header>:
+ sum_b = PHI <sum_a (preheader), sum_c (latch)>
+
+ and assign addr to reduc->reduc_addr. */
+
+ tree arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
+ gimple stmt = SSA_NAME_DEF_STMT (arg);
+ if (!gimple_assign_single_p (stmt))
+ return false;
+ tree memref = gimple_assign_rhs1 (stmt);
+ if (TREE_CODE (memref) != MEM_REF)
+ return false;
+ tree addr = TREE_OPERAND (memref, 0);
+
+ gimple stmt2 = SSA_NAME_DEF_STMT (addr);
+ if (!gimple_assign_single_p (stmt2))
+ return false;
+ tree compref = gimple_assign_rhs1 (stmt2);
+ if (TREE_CODE (compref) != COMPONENT_REF)
+ return false;
+ tree addr2 = TREE_OPERAND (compref, 0);
+ if (TREE_CODE (addr2) != MEM_REF)
+ return false;
+ addr2 = TREE_OPERAND (addr2, 0);
+ if (TREE_CODE (addr2) != SSA_NAME
+ || !gimple_stmt_omp_data_i_init_p (SSA_NAME_DEF_STMT (addr2)))
+ return false;
+ red->reduc_addr = addr;
+ }
+ }
+ }
+
return true;
}
@@ -2497,7 +2702,7 @@ try_create_reduction_list (loop_p loop,
otherwise. */
static bool
-parallelize_loops (void)
+parallelize_loops (bool oacc_kernels_p)
{
unsigned n_threads = flag_tree_parallelize_loops;
bool changed = false;
@@ -2506,6 +2711,7 @@ parallelize_loops (void)
struct obstack parloop_obstack;
HOST_WIDE_INT estimated;
source_location loop_loc;
+ basic_block region_entry = NULL;
/* Do not parallelize loops in the functions created by parallelization.
*/
if (parallelized_function_p (cfun->decl))
@@ -2517,9 +2723,29 @@ parallelize_loops (void)
reduction_info_table_type reduction_list (10);
init_stmt_vec_info_vec ();
+ calculate_dominance_info (CDI_DOMINATORS);
+
FOR_EACH_LOOP (loop, 0)
{
reduction_list.empty ();
+
+ if (oacc_kernels_p)
+ {
+ if (!loop->in_oacc_kernels_region)
+ continue;
+
+ /* TODO: Allow nested loops. */
+ if (loop->inner)
+ continue;
+
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file,
+ "Trying loop %d with header bb %d in oacc kernels
region\n",
+ loop->num, loop->header->index);
+
+ region_entry = loop_get_oacc_kernels_region_entry (loop);
+ }
+
if (dump_file && (dump_flags & TDF_DETAILS))
{
fprintf (dump_file, "Trying loop %d as candidate\n",loop->num);
@@ -2561,6 +2787,7 @@ parallelize_loops (void)
/* FIXME: Bypass this check as graphite doesn't update the
count and frequency correctly now. */
if (!flag_loop_parallelize_all
+ && !oacc_kernels_p
&& ((estimated != -1
&& estimated <= (HOST_WIDE_INT) n_threads * MIN_PER_THREAD)
/* Do not bother with loops in cold areas. */
@@ -2570,7 +2797,7 @@ parallelize_loops (void)
if (!try_get_loop_niter (loop, &niter_desc))
continue;
- if (!try_create_reduction_list (loop, &reduction_list))
+ if (!try_create_reduction_list (loop, &reduction_list,
oacc_kernels_p))
continue;
if (!flag_loop_parallelize_all
@@ -2589,8 +2816,9 @@ parallelize_loops (void)
fprintf (dump_file, "\nloop at %s:%d: ",
LOCATION_FILE (loop_loc), LOCATION_LINE (loop_loc));
}
+
gen_parallel_loop (loop, &reduction_list,
- n_threads, &niter_desc);
+ n_threads, &niter_desc, region_entry, oacc_kernels_p);
}
free_stmt_vec_info_vec ();
@@ -2641,7 +2869,7 @@ pass_parallelize_loops::execute (function *fun)
if (number_of_loops (fun) <= 1)
return 0;
- if (parallelize_loops ())
+ if (parallelize_loops (false))
{
fun->curr_properties &= ~(PROP_gimple_eomp);
return TODO_update_ssa;
@@ -2657,3 +2885,51 @@ make_pass_parallelize_loops (gcc::context *ctxt)
{
return new pass_parallelize_loops (ctxt);
}
+
+namespace {
+
+const pass_data pass_data_parallelize_loops_oacc_kernels =
+{
+ GIMPLE_PASS, /* type */
+ "parloops_oacc_kernels", /* name */
+ OPTGROUP_LOOP, /* optinfo_flags */
+ TV_TREE_PARALLELIZE_LOOPS, /* tv_id */
+ ( PROP_cfg | PROP_ssa ), /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ 0, /* todo_flags_finish */
+};
+
+class pass_parallelize_loops_oacc_kernels : public gimple_opt_pass
+{
+public:
+ pass_parallelize_loops_oacc_kernels (gcc::context *ctxt)
+ : gimple_opt_pass (pass_data_parallelize_loops_oacc_kernels, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ virtual bool gate (function *) { return flag_tree_parallelize_loops > 1;
}
+ virtual unsigned int execute (function *);
+
+}; // class pass_parallelize_loops_oacc_kernels
+
+unsigned
+pass_parallelize_loops_oacc_kernels::execute (function *fun)
+{
+ if (number_of_loops (fun) <= 1)
+ return 0;
+
+ if (parallelize_loops (true))
+ return TODO_update_ssa;
+
+ return 0;
+}
+
+} // anon namespace
+
+gimple_opt_pass *
+make_pass_parallelize_loops_oacc_kernels (gcc::context *ctxt)
+{
+ return new pass_parallelize_loops_oacc_kernels (ctxt);
+}
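By the way, to double-check for a given file which parent's version the merge result corresponds to, one can also diff the merge commit against each parent directly (a sketch, using the abbreviated parent SHAs from the Merge: line above):

$ git diff f9d00ca 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 -- gcc/tree-parloops.c
$ git diff cacef50 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 -- gcc/tree-parloops.c
  # empty output for one of these would mean the merge kept that parent's
  # version of the file unmodified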
Regards,
Thomas
