Hi Tom!

On Mon, 13 Jul 2015 12:32:20 +0200, Tom de Vries <tom_devr...@mentor.com> wrote:
> On 13/07/15 10:31, Thomas Schwinge wrote:
> > On Mon, 13 Jul 2015 09:20:16 +0200, Tom de Vries <tom_devr...@mentor.com>
> > wrote:
> >> > On 12/07/15 11:39, Thomas Schwinge wrote:
> >> > I've looked at the merge commit, gcc/tree-parloops.c was not modified.
> > (Well, it was, but not "substantially".) > Hmm, the reason why I said tree-parloops.c was not modified, was that > the git show of your merge commit (which invokes git diff-tree --cc) > does not show any differences for tree-parloops.c: > ... > $ git show 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 | grep tree-parloops.c > $ > ... > > OTOH, if I use --numstat as diff-tree argument, I see: > ... > $ git diff-tree --numstat --cc 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 > | grep tree-parloops.c > 7 34 gcc/tree-parloops.c > ... > > I'm not sure if this is expected behaviour. Yes, I think so, because »--cc [...] compresses the patch output by omitting uninteresting hunks whose the contents in the parents have only two variants and the merge result picks one of them without modification«, and, as I said, for »merge conflicts, I just retained the code that was present on gomp-4_0-branch already«. In contrast, see the -c and -m options (which get passed from git show to git diff-tree): $ git show -c 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 -- gcc/tree-parloops.c commit 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 Merge: f9d00ca cacef50 Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4> Date: Sun Jul 12 09:30:39 2015 +0000 svn merge -r 222860:225562 svn+ssh://gcc.gnu.org/svn/gcc/trunk git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@225719 138bc75d-0d04-0410-961f-82ee72b054a4 diff --combined gcc/tree-parloops.c index 04708c0,846077a..80a215d --- gcc/tree-parloops.c +++ gcc/tree-parloops.c @@@ -22,43 -22,22 +22,22 @@@ along with GCC; see the file COPYING3 #include "config.h" #include "system.h" #include "coretypes.h" - #include "hash-set.h" - #include "machmode.h" - #include "vec.h" - #include "double-int.h" - #include "input.h" #include "alias.h" - #include "symtab.h" - #include "options.h" - #include "wide-int.h" - #include "inchash.h" + #include "backend.h" #include "tree.h" - #include "fold-const.h" - #include "predict.h" - #include "tm.h" + #include "gimple.h" #include "hard-reg-set.h" - #include "input.h" - #include "function.h" - #include "dominance.h" - #include "cfg.h" - #include "basic-block.h" - #include "tree-ssa-alias.h" + #include "ssa.h" + #include "options.h" + #include "fold-const.h" #include "internal-fn.h" - #include "gimple-expr.h" - #include "is-a.h" - #include "gimple.h" #include "gimplify.h" #include "gimple-iterator.h" #include "gimplify-me.h" #include "gimple-walk.h" #include "stor-layout.h" #include "tree-nested.h" - #include "gimple-ssa.h" #include "tree-cfg.h" - #include "tree-phinodes.h" - #include "ssa-iterators.h" - #include "stringpool.h" - #include "tree-ssanames.h" #include "tree-ssa-loop-ivopts.h" #include "tree-ssa-loop-manip.h" #include "tree-ssa-loop-niter.h" @@@ -75,8 -54,6 +54,6 @@@ #include "tree-parloops.h" #include "omp-low.h" #include "tree-nested.h" - #include "plugin-api.h" - #include "ipa-ref.h" #include "cgraph.h" #include "tree-ssa.h" @@@ -218,8 -195,6 +195,8 @@@ struct reduction_inf of the reduction variable when existing the loop. */ tree initial_value; /* The initial value of the reduction var before entering the loop. */ tree field; /* the name of the field in the parloop data structure intended for reduction. */ + tree reduc_addr; /* The address of the reduction variable for + openacc reductions. */ tree init; /* reduction initialization value. */ gphi *new_phi; /* (helper field) Newly created phi node whose result will be passed to the atomic operation. 
Represents @@@ -229,10 -204,8 +206,8 @@@ /* Reduction info hashtable helpers. */ - struct reduction_hasher : typed_free_remove <reduction_info> + struct reduction_hasher : free_ptr_hash <reduction_info> { - typedef reduction_info *value_type; - typedef reduction_info *compare_type; static inline hashval_t hash (const reduction_info *); static inline bool equal (const reduction_info *, const reduction_info *); }; @@@ -281,10 -254,8 +256,8 @@@ struct name_to_copy_el /* Name copies hashtable helpers. */ - struct name_to_copy_hasher : typed_free_remove <name_to_copy_elt> + struct name_to_copy_hasher : free_ptr_hash <name_to_copy_elt> { - typedef name_to_copy_elt *value_type; - typedef name_to_copy_elt *compare_type; static inline hashval_t hash (const name_to_copy_elt *); static inline bool equal (const name_to_copy_elt *, const name_to_copy_elt *); }; @@@ -1109,30 -1080,10 +1082,30 @@@ create_call_for_reduction_1 (reduction_ tree tmp_load, name; gimple load; - load_struct = build_simple_mem_ref (clsn_data->load); - t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE); + if (reduc->reduc_addr == NULL_TREE) + { + load_struct = build_simple_mem_ref (clsn_data->load); + t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE); + + addr = build_addr (t, current_function_decl); + } + else + { + /* Set the address for the atomic store. */ + addr = reduc->reduc_addr; + + /* Remove the non-atomic store '*addr = sum'. */ + tree res = PHI_RESULT (reduc->keep_res); + use_operand_p use_p; + gimple stmt; + bool single_use_p = single_imm_use (res, &use_p, &stmt); + gcc_assert (single_use_p); + replace_uses_by (gimple_vdef (stmt), + gimple_vuse (stmt)); + gimple_stmt_iterator gsi = gsi_for_stmt (stmt); + gsi_remove (&gsi, true); + } - addr = build_addr (t, current_function_decl); /* Create phi node. */ bb = clsn_data->load_bb; @@@ -1734,15 -1685,10 +1707,15 @@@ transform_to_exit_first_loop_alt (struc /* Set the latch arguments of the new phis to ivtmp/sum_b. */ flush_pending_stmts (post_inc_edge); - /* Create a new empty exit block, inbetween the new loop header and the old - exit block. The function separate_decls_in_region needs this block to - insert code that is active on loop exit, but not any other path. */ - basic_block new_exit_block = split_edge (exit); + + basic_block new_exit_block = NULL; + if (!single_pred_p (exit->dest)) + { + /* Create a new empty exit block, inbetween the new loop header and the + old exit block. The function separate_decls_in_region needs this block + to insert code that is active on loop exit, but not any other path. */ + new_exit_block = split_edge (exit); + } /* Insert and register the reduction exit phis. */ for (gphi_iterator gsi = gsi_start_phis (exit_block); @@@ -1750,24 -1696,17 +1723,24 @@@ gsi_next (&gsi)) { gphi *phi = gsi.phi (); + gphi *nphi = NULL; tree res_z = PHI_RESULT (phi); + tree res_c; - /* Now that we have a new exit block, duplicate the phi of the old exit - block in the new exit block to preserve loop-closed ssa. 
*/ - edge succ_new_exit_block = single_succ_edge (new_exit_block); - edge pred_new_exit_block = single_pred_edge (new_exit_block); - tree res_y = copy_ssa_name (res_z, phi); - gphi *nphi = create_phi_node (res_y, new_exit_block); - tree res_c = PHI_ARG_DEF_FROM_EDGE (phi, succ_new_exit_block); - add_phi_arg (nphi, res_c, pred_new_exit_block, UNKNOWN_LOCATION); - add_phi_arg (phi, res_y, succ_new_exit_block, UNKNOWN_LOCATION); + if (new_exit_block != NULL) + { + /* Now that we have a new exit block, duplicate the phi of the old + exit block in the new exit block to preserve loop-closed ssa. */ + edge succ_new_exit_block = single_succ_edge (new_exit_block); + edge pred_new_exit_block = single_pred_edge (new_exit_block); + tree res_y = copy_ssa_name (res_z, phi); + nphi = create_phi_node (res_y, new_exit_block); + res_c = PHI_ARG_DEF_FROM_EDGE (phi, succ_new_exit_block); + add_phi_arg (nphi, res_c, pred_new_exit_block, UNKNOWN_LOCATION); + add_phi_arg (phi, res_y, succ_new_exit_block, UNKNOWN_LOCATION); + } + else + res_c = PHI_ARG_DEF_FROM_EDGE (phi, exit); if (virtual_operand_p (res_z)) continue; @@@ -1775,9 -1714,7 +1748,9 @@@ gimple reduc_phi = SSA_NAME_DEF_STMT (res_c); struct reduction_info *red = reduction_phi (reduction_list, reduc_phi); if (red != NULL) - red->keep_res = nphi; + red->keep_res = (nphi != NULL + ? nphi + : phi); } /* We're going to cancel the loop at the end of gen_parallel_loop, but until @@@ -1891,24 -1828,8 +1864,24 @@@ try_transform_to_exit_first_loop_alt (s alt_bound = op1; } + /* If not found, insert nit + 1. */ if (alt_bound == NULL_TREE) - return false; + { + alt_bound = fold_build2 (PLUS_EXPR, nit_type, nit, + build_int_cst_type (nit_type, 1)); + + gimple_seq pre = NULL, post = NULL; + push_gimplify_context (true); + gimplify_expr (&alt_bound, &pre, &post, is_gimple_reg, + fb_rvalue); + pop_gimplify_context (NULL); + + gimple_seq_add_seq (&pre, post); + + gimple_stmt_iterator gsi + = gsi_last_bb (loop_preheader_edge (loop)->src); + gsi_insert_seq_after (&gsi, pre, GSI_CONTINUE_LINKING); + } transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound); return true; @@@ -2032,10 -1953,9 +2005,10 @@@ transform_to_exit_first_loop (struct lo of LOOP_FN. N_THREADS is the requested number of threads. Returns the basic block containing GIMPLE_OMP_PARALLEL tree. */ -static basic_block +static void create_parallel_loop (struct loop *loop, tree loop_fn, tree data, - tree new_data, unsigned n_threads, location_t loc) + tree new_data, unsigned n_threads, location_t loc, + basic_block region_entry, bool oacc_kernels_p) { gimple_stmt_iterator gsi; basic_block bb, paral_bb, for_bb, ex_bb; @@@ -2048,79 -1968,19 +2021,79 @@@ gomp_continue *omp_cont_stmt; tree cvar, cvar_init, initvar, cvar_next, cvar_base, type; edge exit, nexit, guard, end, e; + tree for_clauses = NULL_TREE; /* Prepare the GIMPLE_OMP_PARALLEL statement. */ bb = loop_preheader_edge (loop)->src; paral_bb = single_pred (bb); - gsi = gsi_last_bb (paral_bb); + if (!oacc_kernels_p) + gsi = gsi_last_bb (paral_bb); + else + /* Make sure the oacc parallel is inserted on top of the oacc kernels + region. 
*/ + gsi = gsi_last_bb (region_entry); + + if (!oacc_kernels_p) + { + t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS); + OMP_CLAUSE_NUM_THREADS_EXPR (t) + = build_int_cst (integer_type_node, n_threads); + omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data); + gimple_set_location (omp_par_stmt, loc); + + gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT); + } + else + { + /* Create oacc parallel pragma based on oacc kernels pragma. */ + gomp_target *kernels = as_a <gomp_target *> (gsi_stmt (gsi)); + + gsi_prev (&gsi); + gcall *goacc_kernels = as_a <gcall *> (gsi_stmt (gsi)); - t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS); - OMP_CLAUSE_NUM_THREADS_EXPR (t) - = build_int_cst (integer_type_node, n_threads); - omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data); - gimple_set_location (omp_par_stmt, loc); + tree clauses = gimple_omp_target_clauses (kernels); + /* FIXME: We need a more intelligent mapping onto vector, gangs, + workers. */ + if (1) + { + tree clause = build_omp_clause (gimple_location (kernels), + OMP_CLAUSE_NUM_GANGS); + OMP_CLAUSE_NUM_GANGS_EXPR (clause) + = build_int_cst (integer_type_node, n_threads); + OMP_CLAUSE_CHAIN (clause) = clauses; + clauses = clause; + } + gomp_target *stmt + = gimple_build_omp_target (NULL, GF_OMP_TARGET_KIND_OACC_PARALLEL, + clauses); + tree child_fn = gimple_omp_target_child_fn (kernels); + gimple_omp_target_set_child_fn (stmt, child_fn); + tree data_arg = gimple_omp_target_data_arg (kernels); + gimple_omp_target_set_data_arg (stmt, data_arg); + tree ganglocal_size = gimple_call_arg (goacc_kernels, /* TODO */ 9); + gimple_omp_target_set_ganglocal_size (stmt, ganglocal_size); + + gimple_set_location (stmt, loc); + + /* Insert oacc parallel pragma after the oacc kernels pragma. */ + { + gimple_stmt_iterator gsi2; + gsi = gsi_last_bb (region_entry); + gsi2 = gsi; + gsi_prev (&gsi2); + + /* Insert pragma acc parallel. */ + gsi_insert_after (&gsi, stmt, GSI_NEW_STMT); - gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT); + /* Remove GOACC_kernels. */ + replace_uses_by (gimple_vdef (gsi_stmt (gsi2)), + gimple_vuse (gsi_stmt (gsi2))); + gsi_remove (&gsi2, true); + + /* Remove pragma acc kernels. */ + gsi_remove (&gsi2, true); + } + } /* Initialize NEW_DATA. */ if (data) @@@ -2138,18 -1998,12 +2111,18 @@@ gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT); } - /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL. */ - bb = split_loop_exit_edge (single_dom_exit (loop)); - gsi = gsi_last_bb (bb); - omp_return_stmt1 = gimple_build_omp_return (false); - gimple_set_location (omp_return_stmt1, loc); - gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT); + /* Skip insertion of OMP_RETURN for oacc_kernels_p. We've already generated + one when lowering the oacc kernels directive in + pass_lower_omp/lower_omp (). */ + if (!oacc_kernels_p) + { + /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL. */ + bb = split_loop_exit_edge (single_dom_exit (loop)); + gsi = gsi_last_bb (bb); + omp_return_stmt1 = gimple_build_omp_return (false); + gimple_set_location (omp_return_stmt1, loc); + gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT); + } /* Extract data for GIMPLE_OMP_FOR. */ gcc_assert (loop->header == single_dom_exit (loop)->src); @@@ -2206,17 -2060,7 +2179,17 @@@ t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE); OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC; - for_stmt = gimple_build_omp_for (NULL, GF_OMP_FOR_KIND_FOR, t, 1, NULL); + if (1) + { + /* In combination with the NUM_GANGS on the parallel. 
*/ + for_clauses = build_omp_clause (loc, OMP_CLAUSE_GANG); + } + + for_stmt = gimple_build_omp_for (NULL, + (oacc_kernels_p + ? GF_OMP_FOR_KIND_OACC_LOOP + : GF_OMP_FOR_KIND_FOR), + for_clauses, 1, NULL); gimple_set_location (for_stmt, loc); gimple_omp_for_set_index (for_stmt, 0, initvar); gimple_omp_for_set_initial (for_stmt, 0, cvar_init); @@@ -2246,6 -2090,8 +2219,6 @@@ /* After the above dom info is hosed. Re-compute it. */ free_dominance_info (CDI_DOMINATORS); calculate_dominance_info (CDI_DOMINATORS); - - return paral_bb; } /* Generates code to execute the iterations of LOOP in N_THREADS @@@ -2257,8 -2103,7 +2230,8 @@@ static void gen_parallel_loop (struct loop *loop, reduction_info_table_type *reduction_list, - unsigned n_threads, struct tree_niter_desc *niter) + unsigned n_threads, struct tree_niter_desc *niter, + basic_block region_entry, bool oacc_kernels_p) { tree many_iterations_cond, type, nit; tree arg_struct, new_arg_struct; @@@ -2339,43 -2184,40 +2312,43 @@@ if (stmts) gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); - if (loop->inner) - m_p_thread=2; - else - m_p_thread=MIN_PER_THREAD; - - many_iterations_cond = - fold_build2 (GE_EXPR, boolean_type_node, - nit, build_int_cst (type, m_p_thread * n_threads)); - - many_iterations_cond - = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, - invert_truthvalue (unshare_expr (niter->may_be_zero)), - many_iterations_cond); - many_iterations_cond - = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE); - if (stmts) - gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); - if (!is_gimple_condexpr (many_iterations_cond)) + if (!oacc_kernels_p) { + if (loop->inner) + m_p_thread=2; + else + m_p_thread=MIN_PER_THREAD; + + many_iterations_cond = + fold_build2 (GE_EXPR, boolean_type_node, + nit, build_int_cst (type, m_p_thread * n_threads)); + + many_iterations_cond + = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, + invert_truthvalue (unshare_expr (niter->may_be_zero)), + many_iterations_cond); many_iterations_cond - = force_gimple_operand (many_iterations_cond, &stmts, - true, NULL_TREE); + = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE); if (stmts) gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); - } + if (!is_gimple_condexpr (many_iterations_cond)) + { + many_iterations_cond + = force_gimple_operand (many_iterations_cond, &stmts, + true, NULL_TREE); + if (stmts) + gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); + } - initialize_original_copy_tables (); + initialize_original_copy_tables (); - /* We assume that the loop usually iterates a lot. */ - prob = 4 * REG_BR_PROB_BASE / 5; - loop_version (loop, many_iterations_cond, NULL, - prob, prob, REG_BR_PROB_BASE - prob, true); - update_ssa (TODO_update_ssa); - free_original_copy_tables (); + /* We assume that the loop usually iterates a lot. */ + prob = 4 * REG_BR_PROB_BASE / 5; + loop_version (loop, many_iterations_cond, NULL, + prob, prob, REG_BR_PROB_BASE - prob, true); + update_ssa (TODO_update_ssa); + free_original_copy_tables (); + } /* Base all the induction variables in LOOP on a single control one. */ canonicalize_loop_ivs (loop, &nit, true); @@@ -2387,9 -2229,6 +2360,9 @@@ iterations of the loop by one. 
*/ if (!try_transform_to_exit_first_loop_alt (loop, reduction_list, nit)) { + if (oacc_kernels_p) + n_threads = 1; + /* Fall back on the method that handles more cases, but duplicates the loop body: move the exit condition of LOOP to the beginning of its header, and duplicate the part of the last iteration that gets disabled @@@ -2406,34 -2245,19 +2379,34 @@@ entry = loop_preheader_edge (loop); exit = single_dom_exit (loop); - eliminate_local_variables (entry, exit); - /* In the old loop, move all variables non-local to the loop to a structure - and back, and create separate decls for the variables used in loop. */ - separate_decls_in_region (entry, exit, reduction_list, &arg_struct, - &new_arg_struct, &clsn_data); + /* This rewrites the body in terms of new variables. This has already + been done for oacc_kernels_p in pass_lower_omp/lower_omp (). */ + if (!oacc_kernels_p) + { + eliminate_local_variables (entry, exit); + /* In the old loop, move all variables non-local to the loop to a + structure and back, and create separate decls for the variables used in + loop. */ + separate_decls_in_region (entry, exit, reduction_list, &arg_struct, + &new_arg_struct, &clsn_data); + } + else + { + arg_struct = NULL_TREE; + new_arg_struct = NULL_TREE; + clsn_data.load = NULL_TREE; + clsn_data.load_bb = exit->dest; + clsn_data.store = NULL_TREE; + clsn_data.store_bb = NULL; + } /* Create the parallel constructs. */ loc = UNKNOWN_LOCATION; cond_stmt = last_stmt (loop->header); if (cond_stmt) loc = gimple_location (cond_stmt); - create_parallel_loop (loop, create_loop_fn (loc), arg_struct, - new_arg_struct, n_threads, loc); + create_parallel_loop (loop, create_loop_fn (loc), arg_struct, new_arg_struct, + n_threads, loc, region_entry, oacc_kernels_p); if (reduction_list->elements () > 0) create_call_for_reduction (loop, reduction_list, &clsn_data); @@@ -2575,8 -2399,7 +2548,8 @@@ try_get_loop_niter (loop_p loop, struc static bool try_create_reduction_list (loop_p loop, - reduction_info_table_type *reduction_list) + reduction_info_table_type *reduction_list, + bool oacc_kernels_p) { edge exit = single_dom_exit (loop); gphi_iterator gsi; @@@ -2666,61 -2489,6 +2639,61 @@@ } + if (oacc_kernels_p) + { + edge e = loop_preheader_edge (loop); + + for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gphi *phi = gsi.phi (); + tree def = PHI_RESULT (phi); + affine_iv iv; + + if (!virtual_operand_p (def) && !simple_iv (loop, loop, def, &iv, true)) + { + struct reduction_info *red; + red = reduction_phi (reduction_list, phi); + + /* Look for pattern: + + <bb preheader> + .omp_data_i = &.omp_data_arr; + addr = .omp_data_i->sum; + sum_a = *addr; + + <bb header>: + sum_b = PHI <sum_a (preheader), sum_c (latch)> + + and assign addr to reduc->reduc_addr. 
*/ + + tree arg = PHI_ARG_DEF_FROM_EDGE (phi, e); + gimple stmt = SSA_NAME_DEF_STMT (arg); + if (!gimple_assign_single_p (stmt)) + return false; + tree memref = gimple_assign_rhs1 (stmt); + if (TREE_CODE (memref) != MEM_REF) + return false; + tree addr = TREE_OPERAND (memref, 0); + + gimple stmt2 = SSA_NAME_DEF_STMT (addr); + if (!gimple_assign_single_p (stmt2)) + return false; + tree compref = gimple_assign_rhs1 (stmt2); + if (TREE_CODE (compref) != COMPONENT_REF) + return false; + tree addr2 = TREE_OPERAND (compref, 0); + if (TREE_CODE (addr2) != MEM_REF) + return false; + addr2 = TREE_OPERAND (addr2, 0); + if (TREE_CODE (addr2) != SSA_NAME + || !gimple_stmt_omp_data_i_init_p (SSA_NAME_DEF_STMT (addr2))) + return false; + red->reduc_addr = addr; + } + } + } + return true; } @@@ -2729,7 -2497,7 +2702,7 @@@ otherwise. */ static bool -parallelize_loops (void) +parallelize_loops (bool oacc_kernels_p) { unsigned n_threads = flag_tree_parallelize_loops; bool changed = false; @@@ -2738,7 -2506,6 +2711,7 @@@ struct obstack parloop_obstack; HOST_WIDE_INT estimated; source_location loop_loc; + basic_block region_entry = NULL; /* Do not parallelize loops in the functions created by parallelization. */ if (parallelized_function_p (cfun->decl)) @@@ -2750,29 -2517,9 +2723,29 @@@ reduction_info_table_type reduction_list (10); init_stmt_vec_info_vec (); + calculate_dominance_info (CDI_DOMINATORS); + FOR_EACH_LOOP (loop, 0) { reduction_list.empty (); + + if (oacc_kernels_p) + { + if (!loop->in_oacc_kernels_region) + continue; + + /* TODO: Allow nested loops. */ + if (loop->inner) + continue; + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, + "Trying loop %d with header bb %d in oacc kernels region\n", + loop->num, loop->header->index); + + region_entry = loop_get_oacc_kernels_region_entry (loop); + } + if (dump_file && (dump_flags & TDF_DETAILS)) { fprintf (dump_file, "Trying loop %d as candidate\n",loop->num); @@@ -2814,7 -2561,6 +2787,7 @@@ /* FIXME: Bypass this check as graphite doesn't update the count and frequency correctly now. */ if (!flag_loop_parallelize_all + && !oacc_kernels_p && ((estimated != -1 && estimated <= (HOST_WIDE_INT) n_threads * MIN_PER_THREAD) /* Do not bother with loops in cold areas. 
*/ @@@ -2824,7 -2570,7 +2797,7 @@@ if (!try_get_loop_niter (loop, &niter_desc)) continue; - if (!try_create_reduction_list (loop, &reduction_list)) + if (!try_create_reduction_list (loop, &reduction_list, oacc_kernels_p)) continue; if (!flag_loop_parallelize_all @@@ -2843,9 -2589,8 +2816,9 @@@ fprintf (dump_file, "\nloop at %s:%d: ", LOCATION_FILE (loop_loc), LOCATION_LINE (loop_loc)); } + gen_parallel_loop (loop, &reduction_list, - n_threads, &niter_desc); + n_threads, &niter_desc, region_entry, oacc_kernels_p); } free_stmt_vec_info_vec (); @@@ -2896,7 -2641,7 +2869,7 @@@ pass_parallelize_loops::execute (functi if (number_of_loops (fun) <= 1) return 0; - if (parallelize_loops ()) + if (parallelize_loops (false)) { fun->curr_properties &= ~(PROP_gimple_eomp); return TODO_update_ssa; @@@ -2912,51 -2657,3 +2885,51 @@@ make_pass_parallelize_loops (gcc::conte { return new pass_parallelize_loops (ctxt); } + +namespace { + +const pass_data pass_data_parallelize_loops_oacc_kernels = +{ + GIMPLE_PASS, /* type */ + "parloops_oacc_kernels", /* name */ + OPTGROUP_LOOP, /* optinfo_flags */ + TV_TREE_PARALLELIZE_LOOPS, /* tv_id */ + ( PROP_cfg | PROP_ssa ), /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + 0, /* todo_flags_finish */ +}; + +class pass_parallelize_loops_oacc_kernels : public gimple_opt_pass +{ +public: + pass_parallelize_loops_oacc_kernels (gcc::context *ctxt) + : gimple_opt_pass (pass_data_parallelize_loops_oacc_kernels, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) { return flag_tree_parallelize_loops > 1; } + virtual unsigned int execute (function *); + +}; // class pass_parallelize_loops_oacc_kernels + +unsigned +pass_parallelize_loops_oacc_kernels::execute (function *fun) +{ + if (number_of_loops (fun) <= 1) + return 0; + + if (parallelize_loops (true)) + return TODO_update_ssa; + + return 0; +} + +} // anon namespace + +gimple_opt_pass * +make_pass_parallelize_loops_oacc_kernels (gcc::context *ctxt) +{ + return new pass_parallelize_loops_oacc_kernels (ctxt); +} ..., and: $ git show -m 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 -- gcc/tree-parloops.c commit 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 (from f9d00ca614a8dc28f21ab4a16d7cdbbe16668ca3) Merge: f9d00ca cacef50 Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4> Date: Sun Jul 12 09:30:39 2015 +0000 svn merge -r 222860:225562 svn+ssh://gcc.gnu.org/svn/gcc/trunk git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@225719 138bc75d-0d04-0410-961f-82ee72b054a4 diff --git gcc/tree-parloops.c gcc/tree-parloops.c index 04708c0..80a215d 100644 --- gcc/tree-parloops.c +++ gcc/tree-parloops.c @@ -22,43 +22,22 @@ along with GCC; see the file COPYING3. 
If not see #include "config.h" #include "system.h" #include "coretypes.h" -#include "hash-set.h" -#include "machmode.h" -#include "vec.h" -#include "double-int.h" -#include "input.h" #include "alias.h" -#include "symtab.h" -#include "options.h" -#include "wide-int.h" -#include "inchash.h" +#include "backend.h" #include "tree.h" -#include "fold-const.h" -#include "predict.h" -#include "tm.h" +#include "gimple.h" #include "hard-reg-set.h" -#include "input.h" -#include "function.h" -#include "dominance.h" -#include "cfg.h" -#include "basic-block.h" -#include "tree-ssa-alias.h" +#include "ssa.h" +#include "options.h" +#include "fold-const.h" #include "internal-fn.h" -#include "gimple-expr.h" -#include "is-a.h" -#include "gimple.h" #include "gimplify.h" #include "gimple-iterator.h" #include "gimplify-me.h" #include "gimple-walk.h" #include "stor-layout.h" #include "tree-nested.h" -#include "gimple-ssa.h" #include "tree-cfg.h" -#include "tree-phinodes.h" -#include "ssa-iterators.h" -#include "stringpool.h" -#include "tree-ssanames.h" #include "tree-ssa-loop-ivopts.h" #include "tree-ssa-loop-manip.h" #include "tree-ssa-loop-niter.h" @@ -75,8 +54,6 @@ along with GCC; see the file COPYING3. If not see #include "tree-parloops.h" #include "omp-low.h" #include "tree-nested.h" -#include "plugin-api.h" -#include "ipa-ref.h" #include "cgraph.h" #include "tree-ssa.h" @@ -229,10 +206,8 @@ struct reduction_info /* Reduction info hashtable helpers. */ -struct reduction_hasher : typed_free_remove <reduction_info> +struct reduction_hasher : free_ptr_hash <reduction_info> { - typedef reduction_info *value_type; - typedef reduction_info *compare_type; static inline hashval_t hash (const reduction_info *); static inline bool equal (const reduction_info *, const reduction_info *); }; @@ -281,10 +256,8 @@ struct name_to_copy_elt /* Name copies hashtable helpers. */ -struct name_to_copy_hasher : typed_free_remove <name_to_copy_elt> +struct name_to_copy_hasher : free_ptr_hash <name_to_copy_elt> { - typedef name_to_copy_elt *value_type; - typedef name_to_copy_elt *compare_type; static inline hashval_t hash (const name_to_copy_elt *); static inline bool equal (const name_to_copy_elt *, const name_to_copy_elt *); }; commit 2c88b76c6bb1e01ab541ea7b02a2f097c888dcb1 (from cacef506e4205bac13a0dd1de238d1a8cc78af28) Merge: f9d00ca cacef50 Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4> Date: Sun Jul 12 09:30:39 2015 +0000 svn merge -r 222860:225562 svn+ssh://gcc.gnu.org/svn/gcc/trunk git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@225719 138bc75d-0d04-0410-961f-82ee72b054a4 diff --git gcc/tree-parloops.c gcc/tree-parloops.c index 846077a..80a215d 100644 --- gcc/tree-parloops.c +++ gcc/tree-parloops.c @@ -195,6 +195,8 @@ struct reduction_info of the reduction variable when existing the loop. */ tree initial_value; /* The initial value of the reduction var before entering the loop. */ tree field; /* the name of the field in the parloop data structure intended for reduction. */ + tree reduc_addr; /* The address of the reduction variable for + openacc reductions. */ tree init; /* reduction initialization value. */ gphi *new_phi; /* (helper field) Newly created phi node whose result will be passed to the atomic operation. 
Represents @@ -1080,10 +1082,30 @@ create_call_for_reduction_1 (reduction_info **slot, struct clsn_data *clsn_data) tree tmp_load, name; gimple load; - load_struct = build_simple_mem_ref (clsn_data->load); - t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE); + if (reduc->reduc_addr == NULL_TREE) + { + load_struct = build_simple_mem_ref (clsn_data->load); + t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE); + + addr = build_addr (t, current_function_decl); + } + else + { + /* Set the address for the atomic store. */ + addr = reduc->reduc_addr; + + /* Remove the non-atomic store '*addr = sum'. */ + tree res = PHI_RESULT (reduc->keep_res); + use_operand_p use_p; + gimple stmt; + bool single_use_p = single_imm_use (res, &use_p, &stmt); + gcc_assert (single_use_p); + replace_uses_by (gimple_vdef (stmt), + gimple_vuse (stmt)); + gimple_stmt_iterator gsi = gsi_for_stmt (stmt); + gsi_remove (&gsi, true); + } - addr = build_addr (t, current_function_decl); /* Create phi node. */ bb = clsn_data->load_bb; @@ -1685,10 +1707,15 @@ transform_to_exit_first_loop_alt (struct loop *loop, /* Set the latch arguments of the new phis to ivtmp/sum_b. */ flush_pending_stmts (post_inc_edge); - /* Create a new empty exit block, inbetween the new loop header and the old - exit block. The function separate_decls_in_region needs this block to - insert code that is active on loop exit, but not any other path. */ - basic_block new_exit_block = split_edge (exit); + + basic_block new_exit_block = NULL; + if (!single_pred_p (exit->dest)) + { + /* Create a new empty exit block, inbetween the new loop header and the + old exit block. The function separate_decls_in_region needs this block + to insert code that is active on loop exit, but not any other path. */ + new_exit_block = split_edge (exit); + } /* Insert and register the reduction exit phis. */ for (gphi_iterator gsi = gsi_start_phis (exit_block); @@ -1696,17 +1723,24 @@ transform_to_exit_first_loop_alt (struct loop *loop, gsi_next (&gsi)) { gphi *phi = gsi.phi (); + gphi *nphi = NULL; tree res_z = PHI_RESULT (phi); + tree res_c; - /* Now that we have a new exit block, duplicate the phi of the old exit - block in the new exit block to preserve loop-closed ssa. */ - edge succ_new_exit_block = single_succ_edge (new_exit_block); - edge pred_new_exit_block = single_pred_edge (new_exit_block); - tree res_y = copy_ssa_name (res_z, phi); - gphi *nphi = create_phi_node (res_y, new_exit_block); - tree res_c = PHI_ARG_DEF_FROM_EDGE (phi, succ_new_exit_block); - add_phi_arg (nphi, res_c, pred_new_exit_block, UNKNOWN_LOCATION); - add_phi_arg (phi, res_y, succ_new_exit_block, UNKNOWN_LOCATION); + if (new_exit_block != NULL) + { + /* Now that we have a new exit block, duplicate the phi of the old + exit block in the new exit block to preserve loop-closed ssa. 
*/ + edge succ_new_exit_block = single_succ_edge (new_exit_block); + edge pred_new_exit_block = single_pred_edge (new_exit_block); + tree res_y = copy_ssa_name (res_z, phi); + nphi = create_phi_node (res_y, new_exit_block); + res_c = PHI_ARG_DEF_FROM_EDGE (phi, succ_new_exit_block); + add_phi_arg (nphi, res_c, pred_new_exit_block, UNKNOWN_LOCATION); + add_phi_arg (phi, res_y, succ_new_exit_block, UNKNOWN_LOCATION); + } + else + res_c = PHI_ARG_DEF_FROM_EDGE (phi, exit); if (virtual_operand_p (res_z)) continue; @@ -1714,7 +1748,9 @@ transform_to_exit_first_loop_alt (struct loop *loop, gimple reduc_phi = SSA_NAME_DEF_STMT (res_c); struct reduction_info *red = reduction_phi (reduction_list, reduc_phi); if (red != NULL) - red->keep_res = nphi; + red->keep_res = (nphi != NULL + ? nphi + : phi); } /* We're going to cancel the loop at the end of gen_parallel_loop, but until @@ -1828,8 +1864,24 @@ try_transform_to_exit_first_loop_alt (struct loop *loop, alt_bound = op1; } + /* If not found, insert nit + 1. */ if (alt_bound == NULL_TREE) - return false; + { + alt_bound = fold_build2 (PLUS_EXPR, nit_type, nit, + build_int_cst_type (nit_type, 1)); + + gimple_seq pre = NULL, post = NULL; + push_gimplify_context (true); + gimplify_expr (&alt_bound, &pre, &post, is_gimple_reg, + fb_rvalue); + pop_gimplify_context (NULL); + + gimple_seq_add_seq (&pre, post); + + gimple_stmt_iterator gsi + = gsi_last_bb (loop_preheader_edge (loop)->src); + gsi_insert_seq_after (&gsi, pre, GSI_CONTINUE_LINKING); + } transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound); return true; @@ -1953,9 +2005,10 @@ transform_to_exit_first_loop (struct loop *loop, of LOOP_FN. N_THREADS is the requested number of threads. Returns the basic block containing GIMPLE_OMP_PARALLEL tree. */ -static basic_block +static void create_parallel_loop (struct loop *loop, tree loop_fn, tree data, - tree new_data, unsigned n_threads, location_t loc) + tree new_data, unsigned n_threads, location_t loc, + basic_block region_entry, bool oacc_kernels_p) { gimple_stmt_iterator gsi; basic_block bb, paral_bb, for_bb, ex_bb; @@ -1968,19 +2021,79 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data, gomp_continue *omp_cont_stmt; tree cvar, cvar_init, initvar, cvar_next, cvar_base, type; edge exit, nexit, guard, end, e; + tree for_clauses = NULL_TREE; /* Prepare the GIMPLE_OMP_PARALLEL statement. */ bb = loop_preheader_edge (loop)->src; paral_bb = single_pred (bb); - gsi = gsi_last_bb (paral_bb); + if (!oacc_kernels_p) + gsi = gsi_last_bb (paral_bb); + else + /* Make sure the oacc parallel is inserted on top of the oacc kernels + region. */ + gsi = gsi_last_bb (region_entry); + + if (!oacc_kernels_p) + { + t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS); + OMP_CLAUSE_NUM_THREADS_EXPR (t) + = build_int_cst (integer_type_node, n_threads); + omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data); + gimple_set_location (omp_par_stmt, loc); + + gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT); + } + else + { + /* Create oacc parallel pragma based on oacc kernels pragma. 
*/ + gomp_target *kernels = as_a <gomp_target *> (gsi_stmt (gsi)); + + gsi_prev (&gsi); + gcall *goacc_kernels = as_a <gcall *> (gsi_stmt (gsi)); - t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS); - OMP_CLAUSE_NUM_THREADS_EXPR (t) - = build_int_cst (integer_type_node, n_threads); - omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data); - gimple_set_location (omp_par_stmt, loc); + tree clauses = gimple_omp_target_clauses (kernels); + /* FIXME: We need a more intelligent mapping onto vector, gangs, + workers. */ + if (1) + { + tree clause = build_omp_clause (gimple_location (kernels), + OMP_CLAUSE_NUM_GANGS); + OMP_CLAUSE_NUM_GANGS_EXPR (clause) + = build_int_cst (integer_type_node, n_threads); + OMP_CLAUSE_CHAIN (clause) = clauses; + clauses = clause; + } + gomp_target *stmt + = gimple_build_omp_target (NULL, GF_OMP_TARGET_KIND_OACC_PARALLEL, + clauses); + tree child_fn = gimple_omp_target_child_fn (kernels); + gimple_omp_target_set_child_fn (stmt, child_fn); + tree data_arg = gimple_omp_target_data_arg (kernels); + gimple_omp_target_set_data_arg (stmt, data_arg); + tree ganglocal_size = gimple_call_arg (goacc_kernels, /* TODO */ 9); + gimple_omp_target_set_ganglocal_size (stmt, ganglocal_size); + + gimple_set_location (stmt, loc); + + /* Insert oacc parallel pragma after the oacc kernels pragma. */ + { + gimple_stmt_iterator gsi2; + gsi = gsi_last_bb (region_entry); + gsi2 = gsi; + gsi_prev (&gsi2); + + /* Insert pragma acc parallel. */ + gsi_insert_after (&gsi, stmt, GSI_NEW_STMT); - gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT); + /* Remove GOACC_kernels. */ + replace_uses_by (gimple_vdef (gsi_stmt (gsi2)), + gimple_vuse (gsi_stmt (gsi2))); + gsi_remove (&gsi2, true); + + /* Remove pragma acc kernels. */ + gsi_remove (&gsi2, true); + } + } /* Initialize NEW_DATA. */ if (data) @@ -1998,12 +2111,18 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data, gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT); } - /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL. */ - bb = split_loop_exit_edge (single_dom_exit (loop)); - gsi = gsi_last_bb (bb); - omp_return_stmt1 = gimple_build_omp_return (false); - gimple_set_location (omp_return_stmt1, loc); - gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT); + /* Skip insertion of OMP_RETURN for oacc_kernels_p. We've already generated + one when lowering the oacc kernels directive in + pass_lower_omp/lower_omp (). */ + if (!oacc_kernels_p) + { + /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL. */ + bb = split_loop_exit_edge (single_dom_exit (loop)); + gsi = gsi_last_bb (bb); + omp_return_stmt1 = gimple_build_omp_return (false); + gimple_set_location (omp_return_stmt1, loc); + gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT); + } /* Extract data for GIMPLE_OMP_FOR. */ gcc_assert (loop->header == single_dom_exit (loop)->src); @@ -2060,7 +2179,17 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data, t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE); OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC; - for_stmt = gimple_build_omp_for (NULL, GF_OMP_FOR_KIND_FOR, t, 1, NULL); + if (1) + { + /* In combination with the NUM_GANGS on the parallel. */ + for_clauses = build_omp_clause (loc, OMP_CLAUSE_GANG); + } + + for_stmt = gimple_build_omp_for (NULL, + (oacc_kernels_p + ? 
GF_OMP_FOR_KIND_OACC_LOOP + : GF_OMP_FOR_KIND_FOR), + for_clauses, 1, NULL); gimple_set_location (for_stmt, loc); gimple_omp_for_set_index (for_stmt, 0, initvar); gimple_omp_for_set_initial (for_stmt, 0, cvar_init); @@ -2090,8 +2219,6 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data, /* After the above dom info is hosed. Re-compute it. */ free_dominance_info (CDI_DOMINATORS); calculate_dominance_info (CDI_DOMINATORS); - - return paral_bb; } /* Generates code to execute the iterations of LOOP in N_THREADS @@ -2103,7 +2230,8 @@ create_parallel_loop (struct loop *loop, tree loop_fn, tree data, static void gen_parallel_loop (struct loop *loop, reduction_info_table_type *reduction_list, - unsigned n_threads, struct tree_niter_desc *niter) + unsigned n_threads, struct tree_niter_desc *niter, + basic_block region_entry, bool oacc_kernels_p) { tree many_iterations_cond, type, nit; tree arg_struct, new_arg_struct; @@ -2184,40 +2312,43 @@ gen_parallel_loop (struct loop *loop, if (stmts) gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); - if (loop->inner) - m_p_thread=2; - else - m_p_thread=MIN_PER_THREAD; - - many_iterations_cond = - fold_build2 (GE_EXPR, boolean_type_node, - nit, build_int_cst (type, m_p_thread * n_threads)); - - many_iterations_cond - = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, - invert_truthvalue (unshare_expr (niter->may_be_zero)), - many_iterations_cond); - many_iterations_cond - = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE); - if (stmts) - gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); - if (!is_gimple_condexpr (many_iterations_cond)) + if (!oacc_kernels_p) { + if (loop->inner) + m_p_thread=2; + else + m_p_thread=MIN_PER_THREAD; + + many_iterations_cond = + fold_build2 (GE_EXPR, boolean_type_node, + nit, build_int_cst (type, m_p_thread * n_threads)); + + many_iterations_cond + = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, + invert_truthvalue (unshare_expr (niter->may_be_zero)), + many_iterations_cond); many_iterations_cond - = force_gimple_operand (many_iterations_cond, &stmts, - true, NULL_TREE); + = force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE); if (stmts) gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); - } + if (!is_gimple_condexpr (many_iterations_cond)) + { + many_iterations_cond + = force_gimple_operand (many_iterations_cond, &stmts, + true, NULL_TREE); + if (stmts) + gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts); + } - initialize_original_copy_tables (); + initialize_original_copy_tables (); - /* We assume that the loop usually iterates a lot. */ - prob = 4 * REG_BR_PROB_BASE / 5; - loop_version (loop, many_iterations_cond, NULL, - prob, prob, REG_BR_PROB_BASE - prob, true); - update_ssa (TODO_update_ssa); - free_original_copy_tables (); + /* We assume that the loop usually iterates a lot. */ + prob = 4 * REG_BR_PROB_BASE / 5; + loop_version (loop, many_iterations_cond, NULL, + prob, prob, REG_BR_PROB_BASE - prob, true); + update_ssa (TODO_update_ssa); + free_original_copy_tables (); + } /* Base all the induction variables in LOOP on a single control one. */ canonicalize_loop_ivs (loop, &nit, true); @@ -2229,6 +2360,9 @@ gen_parallel_loop (struct loop *loop, iterations of the loop by one. 
*/ if (!try_transform_to_exit_first_loop_alt (loop, reduction_list, nit)) { + if (oacc_kernels_p) + n_threads = 1; + /* Fall back on the method that handles more cases, but duplicates the loop body: move the exit condition of LOOP to the beginning of its header, and duplicate the part of the last iteration that gets disabled @@ -2245,19 +2379,34 @@ gen_parallel_loop (struct loop *loop, entry = loop_preheader_edge (loop); exit = single_dom_exit (loop); - eliminate_local_variables (entry, exit); - /* In the old loop, move all variables non-local to the loop to a structure - and back, and create separate decls for the variables used in loop. */ - separate_decls_in_region (entry, exit, reduction_list, &arg_struct, - &new_arg_struct, &clsn_data); + /* This rewrites the body in terms of new variables. This has already + been done for oacc_kernels_p in pass_lower_omp/lower_omp (). */ + if (!oacc_kernels_p) + { + eliminate_local_variables (entry, exit); + /* In the old loop, move all variables non-local to the loop to a + structure and back, and create separate decls for the variables used in + loop. */ + separate_decls_in_region (entry, exit, reduction_list, &arg_struct, + &new_arg_struct, &clsn_data); + } + else + { + arg_struct = NULL_TREE; + new_arg_struct = NULL_TREE; + clsn_data.load = NULL_TREE; + clsn_data.load_bb = exit->dest; + clsn_data.store = NULL_TREE; + clsn_data.store_bb = NULL; + } /* Create the parallel constructs. */ loc = UNKNOWN_LOCATION; cond_stmt = last_stmt (loop->header); if (cond_stmt) loc = gimple_location (cond_stmt); - create_parallel_loop (loop, create_loop_fn (loc), arg_struct, - new_arg_struct, n_threads, loc); + create_parallel_loop (loop, create_loop_fn (loc), arg_struct, new_arg_struct, + n_threads, loc, region_entry, oacc_kernels_p); if (reduction_list->elements () > 0) create_call_for_reduction (loop, reduction_list, &clsn_data); @@ -2399,7 +2548,8 @@ try_get_loop_niter (loop_p loop, struct tree_niter_desc *niter) static bool try_create_reduction_list (loop_p loop, - reduction_info_table_type *reduction_list) + reduction_info_table_type *reduction_list, + bool oacc_kernels_p) { edge exit = single_dom_exit (loop); gphi_iterator gsi; @@ -2489,6 +2639,61 @@ try_create_reduction_list (loop_p loop, } + if (oacc_kernels_p) + { + edge e = loop_preheader_edge (loop); + + for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); + gsi_next (&gsi)) + { + gphi *phi = gsi.phi (); + tree def = PHI_RESULT (phi); + affine_iv iv; + + if (!virtual_operand_p (def) && !simple_iv (loop, loop, def, &iv, true)) + { + struct reduction_info *red; + red = reduction_phi (reduction_list, phi); + + /* Look for pattern: + + <bb preheader> + .omp_data_i = &.omp_data_arr; + addr = .omp_data_i->sum; + sum_a = *addr; + + <bb header>: + sum_b = PHI <sum_a (preheader), sum_c (latch)> + + and assign addr to reduc->reduc_addr. 
*/ + + tree arg = PHI_ARG_DEF_FROM_EDGE (phi, e); + gimple stmt = SSA_NAME_DEF_STMT (arg); + if (!gimple_assign_single_p (stmt)) + return false; + tree memref = gimple_assign_rhs1 (stmt); + if (TREE_CODE (memref) != MEM_REF) + return false; + tree addr = TREE_OPERAND (memref, 0); + + gimple stmt2 = SSA_NAME_DEF_STMT (addr); + if (!gimple_assign_single_p (stmt2)) + return false; + tree compref = gimple_assign_rhs1 (stmt2); + if (TREE_CODE (compref) != COMPONENT_REF) + return false; + tree addr2 = TREE_OPERAND (compref, 0); + if (TREE_CODE (addr2) != MEM_REF) + return false; + addr2 = TREE_OPERAND (addr2, 0); + if (TREE_CODE (addr2) != SSA_NAME + || !gimple_stmt_omp_data_i_init_p (SSA_NAME_DEF_STMT (addr2))) + return false; + red->reduc_addr = addr; + } + } + } + return true; } @@ -2497,7 +2702,7 @@ try_create_reduction_list (loop_p loop, otherwise. */ static bool -parallelize_loops (void) +parallelize_loops (bool oacc_kernels_p) { unsigned n_threads = flag_tree_parallelize_loops; bool changed = false; @@ -2506,6 +2711,7 @@ parallelize_loops (void) struct obstack parloop_obstack; HOST_WIDE_INT estimated; source_location loop_loc; + basic_block region_entry = NULL; /* Do not parallelize loops in the functions created by parallelization. */ if (parallelized_function_p (cfun->decl)) @@ -2517,9 +2723,29 @@ parallelize_loops (void) reduction_info_table_type reduction_list (10); init_stmt_vec_info_vec (); + calculate_dominance_info (CDI_DOMINATORS); + FOR_EACH_LOOP (loop, 0) { reduction_list.empty (); + + if (oacc_kernels_p) + { + if (!loop->in_oacc_kernels_region) + continue; + + /* TODO: Allow nested loops. */ + if (loop->inner) + continue; + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, + "Trying loop %d with header bb %d in oacc kernels region\n", + loop->num, loop->header->index); + + region_entry = loop_get_oacc_kernels_region_entry (loop); + } + if (dump_file && (dump_flags & TDF_DETAILS)) { fprintf (dump_file, "Trying loop %d as candidate\n",loop->num); @@ -2561,6 +2787,7 @@ parallelize_loops (void) /* FIXME: Bypass this check as graphite doesn't update the count and frequency correctly now. */ if (!flag_loop_parallelize_all + && !oacc_kernels_p && ((estimated != -1 && estimated <= (HOST_WIDE_INT) n_threads * MIN_PER_THREAD) /* Do not bother with loops in cold areas. 
*/ @@ -2570,7 +2797,7 @@ parallelize_loops (void) if (!try_get_loop_niter (loop, &niter_desc)) continue; - if (!try_create_reduction_list (loop, &reduction_list)) + if (!try_create_reduction_list (loop, &reduction_list, oacc_kernels_p)) continue; if (!flag_loop_parallelize_all @@ -2589,8 +2816,9 @@ parallelize_loops (void) fprintf (dump_file, "\nloop at %s:%d: ", LOCATION_FILE (loop_loc), LOCATION_LINE (loop_loc)); } + gen_parallel_loop (loop, &reduction_list, - n_threads, &niter_desc); + n_threads, &niter_desc, region_entry, oacc_kernels_p); } free_stmt_vec_info_vec (); @@ -2641,7 +2869,7 @@ pass_parallelize_loops::execute (function *fun) if (number_of_loops (fun) <= 1) return 0; - if (parallelize_loops ()) + if (parallelize_loops (false)) { fun->curr_properties &= ~(PROP_gimple_eomp); return TODO_update_ssa; @@ -2657,3 +2885,51 @@ make_pass_parallelize_loops (gcc::context *ctxt) { return new pass_parallelize_loops (ctxt); } + +namespace { + +const pass_data pass_data_parallelize_loops_oacc_kernels = +{ + GIMPLE_PASS, /* type */ + "parloops_oacc_kernels", /* name */ + OPTGROUP_LOOP, /* optinfo_flags */ + TV_TREE_PARALLELIZE_LOOPS, /* tv_id */ + ( PROP_cfg | PROP_ssa ), /* properties_required */ + 0, /* properties_provided */ + 0, /* properties_destroyed */ + 0, /* todo_flags_start */ + 0, /* todo_flags_finish */ +}; + +class pass_parallelize_loops_oacc_kernels : public gimple_opt_pass +{ +public: + pass_parallelize_loops_oacc_kernels (gcc::context *ctxt) + : gimple_opt_pass (pass_data_parallelize_loops_oacc_kernels, ctxt) + {} + + /* opt_pass methods: */ + virtual bool gate (function *) { return flag_tree_parallelize_loops > 1; } + virtual unsigned int execute (function *); + +}; // class pass_parallelize_loops_oacc_kernels + +unsigned +pass_parallelize_loops_oacc_kernels::execute (function *fun) +{ + if (number_of_loops (fun) <= 1) + return 0; + + if (parallelize_loops (true)) + return TODO_update_ssa; + + return 0; +} + +} // anon namespace + +gimple_opt_pass * +make_pass_parallelize_loops_oacc_kernels (gcc::context *ctxt) +{ + return new pass_parallelize_loops_oacc_kernels (ctxt); +} Grüße, Thomas