Hi,

the patch below, which I have committed to the branch, adds a special gimple statement code in which GPU kernel statements can survive between OMP lowering and expansion.  It makes sure that even statements which pertain to the kernel loop, but which lowering places in front of the loop, are picked up by expansion and put into the separate kernel function.
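To illustrate (this example is mine, not part of the patch, and the exact set of constructs that take the kernel path on the branch may differ), the kind of offloaded loop this lowering is concerned with is a combined target loop such as:

  /* Example only; not part of the patch.  Names and clauses are
     illustrative.  */
  void
  vector_add (int n, float *a, float *b, float *c)
  {
  #pragma omp target teams distribute parallel for \
      map(to: a[0:n], b[0:n]) map(from: c[0:n])
    for (int i = 0; i < n; i++)
      c[i] = a[i] + b[i];
  }

With the patch, the lowered body of such a kernel loop is wrapped in the new GIMPLE_OMP_GPUKERNEL statement (dumped as "#pragma omp gpukernel"), so that omp expansion can later find the whole region, including the statements lowering emits in front of the loop itself, and outline it into the separate kernel function.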
Thanks,

Martin


2015-08-28  Martin Jambor  <mjam...@suse.cz>

	* omp-low.c (expand_omp_for_kernel): Do not insert return statement.
	(expand_target_kernel_body): Handle kernels encapsulated in
	GIMPLE_OMP_GPUKERNEL statements.
	(lower_omp_target): Lower kernel code into a new GIMPLE_OMP_GPUKERNEL
	statement.
	* gimple.def (GIMPLE_OMP_GPUKERNEL): New code.
	* gimple.c (gimple_build_omp_gpukernel): New function.
	(gimple_copy): Handle GIMPLE_OMP_GPUKERNEL case.
	* gimple-low.c (lower_stmt): Likewise.
	* gimple-pretty-print.c (dump_gimple_omp_block): Likewise.
	(pp_gimple_stmt_1): Likewise.
	* gimple.h (gimple_build_omp_gpukernel): Declare.
	(gimple_has_substatements): Handle GIMPLE_OMP_GPUKERNEL case.
	(CASE_GIMPLE_OMP): Likewise.

Index: gcc/gimple.def
===================================================================
--- gcc/gimple.def	(revision 227279)
+++ gcc/gimple.def	(working copy)
@@ -375,6 +375,10 @@ DEFGSCODE(GIMPLE_OMP_TARGET, "gimple_omp
    CLAUSES is an OMP_CLAUSE chain holding the associated clauses.  */
 DEFGSCODE(GIMPLE_OMP_TEAMS, "gimple_omp_teams", GSS_OMP_SINGLE_LAYOUT)
 
+/* GIMPLE_OMP_GPUKERNEL <BODY> represents a parallel loop lowered for execution
+   on a GPU.  It is an artificial statement created by omp lowering.  */
+DEFGSCODE(GIMPLE_OMP_GPUKERNEL, "gimple_omp_gpukernel", GSS_OMP)
+
 /* GIMPLE_PREDICT <PREDICT, OUTCOME> specifies a hint for branch prediction.
 
    PREDICT is one of the predictors from predict.def.
Index: gcc/gimple.c
===================================================================
--- gcc/gimple.c	(revision 227279)
+++ gcc/gimple.c	(working copy)
@@ -959,6 +959,19 @@ gimple_build_omp_master (gimple_seq body
   return p;
 }
 
+/* Build a GIMPLE_OMP_GPUKERNEL statement.
+
+   BODY is the sequence of statements to be executed by the kernel.  */
+
+gimple
+gimple_build_omp_gpukernel (gimple_seq body)
+{
+  gimple p = gimple_alloc (GIMPLE_OMP_GPUKERNEL, 0);
+  if (body)
+    gimple_omp_set_body (p, body);
+
+  return p;
+}
 
 /* Build a GIMPLE_OMP_TASKGROUP statement.
@@ -1798,6 +1811,7 @@ gimple_copy (gimple stmt)
     case GIMPLE_OMP_MASTER:
     case GIMPLE_OMP_TASKGROUP:
     case GIMPLE_OMP_ORDERED:
+    case GIMPLE_OMP_GPUKERNEL:
     copy_omp_body:
       new_seq = gimple_seq_copy (gimple_omp_body (stmt));
       gimple_omp_set_body (copy, new_seq);
Index: gcc/gimple.h
===================================================================
--- gcc/gimple.h	(revision 227279)
+++ gcc/gimple.h	(working copy)
@@ -1435,6 +1435,7 @@ gomp_task *gimple_build_omp_task (gimple
 				  tree, tree);
 gimple gimple_build_omp_section (gimple_seq);
 gimple gimple_build_omp_master (gimple_seq);
+gimple gimple_build_omp_gpukernel (gimple_seq);
 gimple gimple_build_omp_taskgroup (gimple_seq);
 gomp_continue *gimple_build_omp_continue (tree, tree);
 gimple gimple_build_omp_ordered (gimple_seq);
@@ -1691,6 +1692,7 @@ gimple_has_substatements (gimple g)
     case GIMPLE_OMP_TARGET:
     case GIMPLE_OMP_TEAMS:
     case GIMPLE_OMP_CRITICAL:
+    case GIMPLE_OMP_GPUKERNEL:
     case GIMPLE_WITH_CLEANUP_EXPR:
     case GIMPLE_TRANSACTION:
       return true;
@@ -5879,7 +5881,8 @@ gimple_return_set_retbnd (gimple gs, tre
     case GIMPLE_OMP_RETURN:			\
     case GIMPLE_OMP_ATOMIC_LOAD:		\
     case GIMPLE_OMP_ATOMIC_STORE:		\
-    case GIMPLE_OMP_CONTINUE
+    case GIMPLE_OMP_CONTINUE:			\
+    case GIMPLE_OMP_GPUKERNEL
 
 static inline bool
 is_gimple_omp (const_gimple stmt)
Index: gcc/gimple-pretty-print.c
===================================================================
--- gcc/gimple-pretty-print.c	(revision 227279)
+++ gcc/gimple-pretty-print.c	(working copy)
@@ -1486,6 +1486,9 @@ dump_gimple_omp_block (pretty_printer *b
     case GIMPLE_OMP_SECTION:
       pp_string (buffer, "#pragma omp section");
       break;
+    case GIMPLE_OMP_GPUKERNEL:
+      pp_string (buffer, "#pragma omp gpukernel");
+      break;
     default:
       gcc_unreachable ();
     }
@@ -2240,6 +2243,7 @@ pp_gimple_stmt_1 (pretty_printer *buffer
     case GIMPLE_OMP_TASKGROUP:
     case GIMPLE_OMP_ORDERED:
     case GIMPLE_OMP_SECTION:
+    case GIMPLE_OMP_GPUKERNEL:
       dump_gimple_omp_block (buffer, gs, spc, flags);
       break;
Index: gcc/gimple-low.c
===================================================================
--- gcc/gimple-low.c	(revision 227279)
+++ gcc/gimple-low.c	(working copy)
@@ -366,6 +366,7 @@ lower_stmt (gimple_stmt_iterator *gsi, s
     case GIMPLE_OMP_TASK:
     case GIMPLE_OMP_TARGET:
     case GIMPLE_OMP_TEAMS:
+    case GIMPLE_OMP_GPUKERNEL:
       data->cannot_fallthru = false;
       lower_omp_directive (gsi, data);
       data->cannot_fallthru = false;
Index: gcc/omp-low.c
===================================================================
--- gcc/omp-low.c	(revision 227279)
+++ gcc/omp-low.c	(working copy)
@@ -9905,8 +9905,6 @@ expand_omp_for_kernel (struct omp_region
   gsi = gsi_last_bb (kfor->exit);
   gcc_assert (!gsi_end_p (gsi)
	      && gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_RETURN);
-  gimple ret_stmt = gimple_build_return (NULL);
-  gsi_insert_after (&gsi, ret_stmt, GSI_SAME_STMT);
   gsi_remove (&gsi, true);
 
   /* Fixup the much simpler CFG.  */
@@ -9957,14 +9955,13 @@ expand_target_kernel_body (struct omp_re
   struct omp_region **pp;
 
   for (pp = &target->inner; *pp; pp = &(*pp)->next)
-    if ((*pp)->type == GIMPLE_OMP_FOR
-	&& (gimple_omp_for_kind (last_stmt ((*pp)->entry))
-	    == GF_OMP_FOR_KIND_KERNEL_BODY))
+    if ((*pp)->type == GIMPLE_OMP_GPUKERNEL)
       break;
 
+  struct omp_region *gpukernel = *pp;
+
   tree orig_child_fndecl = gimple_omp_target_child_fn (tgt_stmt);
-  struct omp_region *kfor = *pp;
-  if (!kfor)
+  if (!gpukernel)
     {
       gcc_assert (!tgt_stmt->kernel_iter);
       cgraph_node *n = cgraph_node::get (orig_child_fndecl);
@@ -9978,9 +9975,18 @@ expand_target_kernel_body (struct omp_re
     }
 
   gcc_assert (tgt_stmt->kernel_iter);
+  *pp = gpukernel->next;
+
+  for (pp = &gpukernel->inner; *pp; pp = &(*pp)->next)
+    if ((*pp)->type == GIMPLE_OMP_FOR
+	&& (gimple_omp_for_kind (last_stmt ((*pp)->entry))
+	    == GF_OMP_FOR_KIND_KERNEL_BODY))
+      break;
+
+  struct omp_region *kfor = *pp;
+  gcc_assert (kfor);
   if (kfor->inner)
     expand_omp (kfor->inner);
-  *pp = kfor->next;
 
   tree kern_fndecl = copy_node (orig_child_fndecl);
@@ -10007,8 +10013,20 @@ expand_target_kernel_body (struct omp_re
 
   expand_omp_for_kernel (kfor);
 
-  move_sese_region_to_fn (kern_cfun, single_succ (kfor->entry),
-			  kfor->exit, block);
+  /* Remove the omp for statement */
+  gimple_stmt_iterator gsi = gsi_last_bb (gpukernel->entry);
+  gsi_remove (&gsi, true);
+  /* Replace the GIMPLE_OMP_RETURN at the end of the kernel region with a real
+     return.  */
+  gsi = gsi_last_bb (gpukernel->exit);
+  gcc_assert (!gsi_end_p (gsi)
+	      && gimple_code (gsi_stmt (gsi)) == GIMPLE_OMP_RETURN);
+  gimple ret_stmt = gimple_build_return (NULL);
+  gsi_insert_after (&gsi, ret_stmt, GSI_SAME_STMT);
+  gsi_remove (&gsi, true);
+
+  move_sese_region_to_fn (kern_cfun, single_succ (gpukernel->entry),
+			  gpukernel->exit, block);
 
   cgraph_node *kcn = cgraph_node::get_create (kern_fndecl);
   kcn->mark_force_output ();
@@ -10034,7 +10052,6 @@ expand_target_kernel_body (struct omp_re
   basic_block bb;
   FOR_EACH_BB_FN (bb, kern_cfun)
     {
-      gimple_stmt_iterator gsi;
       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple stmt = gsi_stmt (gsi);
@@ -12117,10 +12134,12 @@ lower_omp_target (gimple_stmt_iterator *
       lower_omp (&tgt_body, ctx);
       if (ctx->kernel_inner_loop)
	{
-	  /* FIXME: Try to invent an encapsulating block which would survive
-	     until omp expansion.  */
	  gimple_seq_add_stmt (&kernel_seq, ctx->kernel_inner_loop);
	  lower_omp (&kernel_seq, ctx);
+	  gimple_seq_add_stmt (&kernel_seq, gimple_build_omp_return (false));
+	  gimple gpukernel = gimple_build_omp_gpukernel (kernel_seq);
+	  kernel_seq = NULL;
+	  gimple_seq_add_stmt (&kernel_seq, gpukernel);
	}
       target_nesting_level--;
     }
@@ -13047,6 +13066,7 @@ make_gimple_omp_edges (basic_block bb, s
     case GIMPLE_OMP_ORDERED:
     case GIMPLE_OMP_CRITICAL:
     case GIMPLE_OMP_SECTION:
+    case GIMPLE_OMP_GPUKERNEL:
       cur_region = new_omp_region (bb, code, cur_region);
       fallthru = true;
       break;
Index: libgomp/plugin/plugin-hsa.c
===================================================================
--- libgomp/plugin/plugin-hsa.c	(revision 227279)
+++ libgomp/plugin/plugin-hsa.c	(working copy)
@@ -855,9 +855,6 @@ GOMP_OFFLOAD_run (int n, void *fn_ptr, v
 {
   struct kernel_info *kernel = (struct kernel_info *) fn_ptr;
   struct agent_info *agent = kernel->agent;
-  if (pthread_rwlock_rdlock (&agent->modules_rwlock))
-    GOMP_PLUGIN_fatal ("Unable to read-lock an HSA agent rwlock");
-
   struct kernel_launch_attributes def;
   const struct kernel_launch_attributes *kla;
   if (!parse_launch_attributes (kern_launch, &def, &kla))
@@ -867,6 +864,8 @@ GOMP_OFFLOAD_run (int n, void *fn_ptr, v
			 "zero\n");
       return;
     }
+  if (pthread_rwlock_rdlock (&agent->modules_rwlock))
+    GOMP_PLUGIN_fatal ("Unable to read-lock an HSA agent rwlock");
   create_and_finalize_hsa_program (agent);
   init_kernel (kernel);