Hi Cesar! (It's me, again!) ;-) On Fri, 27 Jan 2017 09:13:06 -0800, Cesar Philippidis <ce...@codesourcery.com> wrote: > This patch partially enables GOMP_MAP_FIRSTPRIVATE_POINTER in gfortran. > gfortran still falls back to GOMP_MAP_POINTER for arrays with > descriptors and derived types. The limitation on derived types is there > because we don't have much test coverage for it, and this patch series > was more exploratory for performance enhancements.
Now that you still freshly remember it, please file an issue so that we'll take care of that later. > With that in mind, > there are a couple of shortcomings with this patch. > > 1) Dummy reduction variables fallback to GOMP_MAP_POINTER because of a > pointer dereferencing bug. Please also file an issue for that. > The state of debugging such problems on > PTX targets leaves something to be desired, especially since print > isn't working on nvptx targets currently. If the following is what you mean, then that's working for me: $ cat < ../printf.c int main(int argc, char *argv[]) { #pragma acc parallel copyin(argv[0][0:__builtin_strlen(argv[0]) + 1]) { __builtin_printf("Offloaded from %s.\n", argv[0]); } return 0; } $ build-gcc/gcc/xgcc [...] -Wall -Wextra -g ../printf.c -fopenacc -O2 $ GOMP_DEBUG=1 ./a.out [...] nvptx_exec: kernel main$_omp_fn$0: launch gangs=1, workers=1, vectors=32 Offloaded from ./a.out. nvptx_exec: kernel main$_omp_fn$0: finished GOMP_offload_unregister_ver (1, 0x400c20, 5, 0x401560) GOMP_offload_unregister_ver (0, 0x400c20, 6, 0x602050) Again, please file an issue as appropriate. ;-) > 2) Apparently, firstprivate pointers negatively affects the alias > analysis used by ACC KERNELS and parloops, so a couple of more > execution tests fail to generate offloaded code. > > I plan to resolve issue 1) in a follow up patch later on (but maybe not > in the immediate future). Regarding 2), ACC KERNELS are eventually going > to need a significant rework, but that's not going to happen in the near > future either. I've been pushing to get the performance of ACC PARALLEL > regions on par to other OpenACC compilers first, and hopefully that > won't be too far way. Hmm, hmm. > With this patch, I'm observing an approximate 0.6s reduction in > CloverLeaf's original 0.9s execution time (it takes approximate 0.9s > after the GOMP_MAP_FIRSTPRIVATE_INT and GOMP_MAP_TO_PSET patches), to > yield a final execution time somewhere in the neighborhood of 0.3s. > That's about a one second savings from the unpatched version of GCC. Yay! \o/ > This patch has been committed to gomp-4_0-branch. (Not reviewed in detail.) > --- a/gcc/fortran/trans-openmp.c > +++ b/gcc/fortran/trans-openmp.c > @@ -2005,9 +2005,12 @@ gfc_trans_omp_clauses_1 (stmtblock_t *block, > gfc_omp_clauses *clauses, > (TREE_TYPE (TREE_TYPE (decl))))) > { > tree orig_decl = decl; > + enum gomp_map_kind gmk = GOMP_MAP_FIRSTPRIVATE_POINTER; > + if (n->u.map_op == OMP_MAP_FORCE_DEVICEPTR) > + gmk = GOMP_MAP_POINTER; Curious, why is "deviceptr" different? > node4 = build_omp_clause (input_location, > OMP_CLAUSE_MAP); > - OMP_CLAUSE_SET_MAP_KIND (node4, GOMP_MAP_POINTER); > + OMP_CLAUSE_SET_MAP_KIND (node4, gmk); > OMP_CLAUSE_DECL (node4) = decl; > OMP_CLAUSE_SIZE (node4) = size_int (0); > decl = build_fold_indirect_ref (decl); > --- a/gcc/gimplify.c > +++ b/gcc/gimplify.c > @@ -6605,11 +6636,10 @@ gimplify_scan_omp_clauses (tree *list_p, gimple_seq > *pre_p, > ctx = new_omp_context (region_type); > ctx->clauses = *list_p; > outer_ctx = ctx->outer_context; > - if (code == OMP_TARGET && !lang_GNU_Fortran ()) > + if (code == OMP_TARGET && !(lang_GNU_Fortran () && !(region_type & > ORT_ACC))) > { > - ctx->target_map_pointers_as_0len_arrays = true; > - /* FIXME: For Fortran we want to set this too, when > - the Fortran FE is updated to OpenMP 4.5. */ > + if (!lang_GNU_Fortran () || region_type & ORT_ACC) > + ctx->target_map_pointers_as_0len_arrays = true; > ctx->target_map_scalars_firstprivate = true; > } I guess the Fortran OpenMP comment should stay? And, isn't that logic a bit complicated; could simplify this as follows, unless I'm confused? --- gcc/gimplify.c +++ gcc/gimplify.c @@ -6636,10 +6636,11 @@ gimplify_scan_omp_clauses (tree *list_p, gimple_seq *pre_p, ctx = new_omp_context (region_type); ctx->clauses = *list_p; outer_ctx = ctx->outer_context; - if (code == OMP_TARGET && !(lang_GNU_Fortran () && !(region_type & ORT_ACC))) + /* FIXME: For Fortran OpenMP we want to set this too, when + the Fortran FE is updated to OpenMP 4.5. */ + if (code == OMP_TARGET && (!lang_GNU_Fortran () || (region_type & ORT_ACC))) { - if (!lang_GNU_Fortran () || region_type & ORT_ACC) - ctx->target_map_pointers_as_0len_arrays = true; + ctx->target_map_pointers_as_0len_arrays = true; ctx->target_map_scalars_firstprivate = true; } if (!lang_GNU_Fortran ()) > --- a/gcc/testsuite/gfortran.dg/goacc/kernels-loop-n.f95 > +++ b/gcc/testsuite/gfortran.dg/goacc/kernels-loop-n.f95 > @@ -37,4 +37,6 @@ end module test > ! Check that the loop has been split off into a function. > ! { dg-final { scan-tree-dump-times "(?n);; Function > __test_MOD_foo._omp_fn.0 " 1 "optimized" } } > > -! { dg-final { scan-tree-dump-times "(?n)oacc function \\(0," 1 "parloops1" > } } > +! This failure was introduced with the GOMP_MAP_POINTER -> > +! GOMP_MAP_FIRSTPRIVATE_POINTER conversion. > +! { dg-final { scan-tree-dump-times "(?n)oacc function \\(0," 1 "parloops1" > { xfail *-*-* } } } Hmm, hmm. > --- a/libgomp/testsuite/libgomp.oacc-fortran/deviceptr-1.f90 > +++ b/libgomp/testsuite/libgomp.oacc-fortran/deviceptr-1.f90 > @@ -3,6 +3,7 @@ > ! the deviceptr variable is implied. > > ! { dg-do run } > +! { dg-additional-options "-foffload-force" } > > subroutine subr1 (a, b) > implicit none This is also an OpenACC kernels issue. > --- a/libgomp/testsuite/libgomp.oacc-fortran/non-scalar-data.f90 > +++ b/libgomp/testsuite/libgomp.oacc-fortran/non-scalar-data.f90 > @@ -2,6 +2,7 @@ > ! offloaded regions are properly mapped using present_or_copy. > > ! { dg-do run } > +! { dg-additional-options "-foffload-force" } > > program main > implicit none Likweise. I do agree that our OpenACC kernels implementation leaves a lot to be desired, but that we're now also regressing such very simple cases, is a bit unfortunate. Have you already made an attempt at figuring out what's going wrong? Another OpenMP regression: PASS: libgomp.fortran/target2.f90 -O0 (test for excess errors) [-PASS:-]{+FAIL:+} libgomp.fortran/target2.f90 -O0 execution test PASS: libgomp.fortran/target2.f90 -O1 (test for excess errors) [-PASS:-]{+FAIL:+} libgomp.fortran/target2.f90 -O1 execution test PASS: libgomp.fortran/target2.f90 -O2 (test for excess errors) [-PASS:-]{+FAIL:+} libgomp.fortran/target2.f90 -O2 execution test PASS: libgomp.fortran/target2.f90 -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions (test for excess errors) [-PASS:-]{+FAIL:+} libgomp.fortran/target2.f90 -O3 -fomit-frame-pointer -funroll-loops -fpeel-loops -ftracer -finline-functions execution test PASS: libgomp.fortran/target2.f90 -O3 -g (test for excess errors) [-PASS:-]{+FAIL:+} libgomp.fortran/target2.f90 -O3 -g execution test PASS: libgomp.fortran/target2.f90 -Os (test for excess errors) [-PASS:-]{+FAIL:+} libgomp.fortran/target2.f90 -Os execution test That is: offload error: process on the device 0 unexpectedly exited with code 0 ..., which, as far as I remember, basically means "SIGSEGV" in the Intel MIC (emulated) offloaded code. Porting this gomp-4_0-branch r244987 "Partially enable GOMP_MAP_FIRSTPRIVATE_POINTER in gfortran." to trunk (see attached, if you want to experiment with that), I can reproduce some (maybe even the same?) issue with OpenMP nvptx offloading: "libgomp: cuCtxSynchronize error: an illegal memory access was encountered". Do you have an idea which of your changes might cause that? Grüße Thomas
>From 35dfd63154e01e2d9f299daaa876adcc6f94f013 Mon Sep 17 00:00:00 2001 From: Thomas Schwinge <tho...@codesourcery.com> Date: Mon, 30 Jan 2017 14:48:40 +0100 Subject: [PATCH] Partially enable GOMP_MAP_FIRSTPRIVATE_POINTER in gfortran. gcc/fortran/ * trans-openmp.c (gfc_omp_finish_clause): Use GOMP_MAP_POINTER for POINTER_TYPE decls. (gfc_trans_omp_clauses_1): Likewise. gcc/ * gimplify.c (demote_firstprivate_pointer): New function. (gimplify_scan_omp_clauses): Enable target_map_pointers_as_0len_arrays and target_map_scalars_firstprivate in OpenACC and gfortran. (gimplify_adjust_omp_clauses): Demote FIRSTPRIVATE_POINTERS for OpenACC retuction variables. * omp-low.c (lower_omp_target): Adjust receiver reference of decls for fortran dummy arguments. gcc/testsuite/ * gfortran.dg/goacc/kernels-loop-n.f95: Xfail test. libgomp/ * testsuite/libgomp.oacc-fortran/deviceptr-1.f90: Add -foffload-force. * testsuite/libgomp.oacc-fortran/non-scalar-data.f90: Likewise. (cherry picked from commit 771fd834ccc7b5b06dc763240636f0b9a883a8fc) --- gcc/fortran/trans-openmp.c | 7 ++- gcc/gimplify.c | 52 +++++++++++++++++++--- gcc/omp-low.c | 3 +- .../gfortran.dg/goacc/kernels-alias-3.f95 | 3 +- 4 files changed, 55 insertions(+), 10 deletions(-) diff --git a/gcc/fortran/trans-openmp.c b/gcc/fortran/trans-openmp.c index 4f525fe..0afe8a0 100644 --- a/gcc/fortran/trans-openmp.c +++ b/gcc/fortran/trans-openmp.c @@ -1070,7 +1070,7 @@ gfc_omp_finish_clause (tree c, gimple_seq *pre_p) return; tree orig_decl = decl; c4 = build_omp_clause (OMP_CLAUSE_LOCATION (c), OMP_CLAUSE_MAP); - OMP_CLAUSE_SET_MAP_KIND (c4, GOMP_MAP_POINTER); + OMP_CLAUSE_SET_MAP_KIND (c4, GOMP_MAP_FIRSTPRIVATE_POINTER); OMP_CLAUSE_DECL (c4) = decl; OMP_CLAUSE_SIZE (c4) = size_int (0); decl = build_fold_indirect_ref (decl); @@ -2095,9 +2095,12 @@ gfc_trans_omp_clauses (stmtblock_t *block, gfc_omp_clauses *clauses, (TREE_TYPE (TREE_TYPE (decl))))) { tree orig_decl = decl; + enum gomp_map_kind gmk = GOMP_MAP_FIRSTPRIVATE_POINTER; + if (n->u.map_op == OMP_MAP_FORCE_DEVICEPTR) + gmk = GOMP_MAP_POINTER; node4 = build_omp_clause (input_location, OMP_CLAUSE_MAP); - OMP_CLAUSE_SET_MAP_KIND (node4, GOMP_MAP_POINTER); + OMP_CLAUSE_SET_MAP_KIND (node4, gmk); OMP_CLAUSE_DECL (node4) = decl; OMP_CLAUSE_SIZE (node4) = size_int (0); decl = build_fold_indirect_ref (decl); diff --git a/gcc/gimplify.c b/gcc/gimplify.c index feb5fa0..cd6c2aa 100644 --- a/gcc/gimplify.c +++ b/gcc/gimplify.c @@ -178,6 +178,7 @@ struct gimplify_omp_ctx /* Iteration variables in an OMP_FOR. */ vec<tree> loop_iter_var; location_t location; + tree clauses; enum omp_clause_default_kind default_kind; enum omp_region_type region_type; bool combined_loop; @@ -402,6 +403,7 @@ new_omp_context (enum omp_region_type region_type) c->variables = splay_tree_new (splay_tree_compare_decl_uid, 0, 0); c->privatized_types = new hash_set<tree>; c->location = input_location; + c->clauses = NULL_TREE; c->region_type = region_type; if ((region_type & ORT_TASK) == 0) c->default_kind = OMP_CLAUSE_DEFAULT_SHARED; @@ -7318,6 +7320,37 @@ find_decl_expr (tree *tp, int *walk_subtrees, void *data) return NULL_TREE; } +static void +demote_firstprivate_pointer (tree decl, gimplify_omp_ctx *ctx) +{ + if (!lang_GNU_Fortran ()) + return; + + while (ctx) + { + if (ctx->region_type == ORT_ACC_PARALLEL + || ctx->region_type == ORT_ACC_KERNELS) + break; + ctx = ctx->outer_context; + } + + if (ctx == NULL) + return; + + tree clauses = ctx->clauses; + + for (tree c = clauses; c; c = OMP_CLAUSE_CHAIN (c)) + { + if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_MAP + && OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_FIRSTPRIVATE_POINTER + && OMP_CLAUSE_DECL (c) == decl) + { + OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_POINTER); + return; + } + } +} + /* Scan the OMP clauses in *LIST_P, installing mappings into a new and previous omp contexts. */ @@ -7333,9 +7366,10 @@ gimplify_scan_omp_clauses (tree *list_p, gimple_seq *pre_p, ctx = new_omp_context (region_type); outer_ctx = ctx->outer_context; + ctx->clauses = *list_p; if (code == OMP_TARGET) { - if (!lang_GNU_Fortran ()) + if (!lang_GNU_Fortran () || region_type & ORT_ACC) ctx->target_map_pointers_as_0len_arrays = true; ctx->target_map_scalars_firstprivate = true; } @@ -7459,6 +7493,7 @@ gimplify_scan_omp_clauses (tree *list_p, gimple_seq *pre_p, if (!(region_type & ORT_ACC)) check_non_private = "reduction"; decl = OMP_CLAUSE_DECL (c); + demote_firstprivate_pointer (decl, ctx->outer_context); if (TREE_CODE (decl) == MEM_REF) { tree type = TREE_TYPE (decl); @@ -8910,11 +8945,16 @@ gimplify_adjust_omp_clauses (gimple_seq *pre_p, gimple_seq body, tree *list_p, && kind != GOMP_MAP_FORCE_PRESENT && kind != GOMP_MAP_POINTER) { - warning_at (OMP_CLAUSE_LOCATION (c), 0, - "incompatible data clause with reduction " - "on %qE; promoting to present_or_copy", - DECL_NAME (t)); - OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_TOFROM); + if (lang_hooks.decls.omp_privatize_by_reference (decl)) + OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_POINTER); + else + { + warning_at (OMP_CLAUSE_LOCATION (c), 0, + "incompatible data clause with reduction " + "on %qE; promoting to present_or_copy", + DECL_NAME (t)); + OMP_CLAUSE_SET_MAP_KIND (c, GOMP_MAP_TOFROM); + } } } } diff --git a/gcc/omp-low.c b/gcc/omp-low.c index ff0f447..18aa394 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -8328,7 +8328,8 @@ lower_omp_target (gimple_stmt_iterator *gsi_p, omp_context *ctx) } else is_ref = omp_is_reference (var); - if (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_FIRSTPRIVATE_REFERENCE) + if (OMP_CLAUSE_MAP_KIND (c) == GOMP_MAP_FIRSTPRIVATE_REFERENCE + || (lang_GNU_Fortran () && TREE_CODE (var) == PARM_DECL)) is_ref = false; bool ref_to_array = false; if (is_ref) diff --git a/gcc/testsuite/gfortran.dg/goacc/kernels-alias-3.f95 b/gcc/testsuite/gfortran.dg/goacc/kernels-alias-3.f95 index 07dc8d6..8ca47a0 100644 --- a/gcc/testsuite/gfortran.dg/goacc/kernels-alias-3.f95 +++ b/gcc/testsuite/gfortran.dg/goacc/kernels-alias-3.f95 @@ -16,4 +16,5 @@ end program main ! Only the omp_data_i related loads should be annotated with cliques. ! { dg-final { scan-tree-dump-times "clique 1 base 1" 2 "ealias" } } -! { dg-final { scan-tree-dump-times "(?n)clique 1 base 0" 3 "ealias" } } +! TODO +! { dg-final { scan-tree-dump-times "(?n)clique 1 base 0" 3 "ealias" { xfail *-*-* } } } -- 2.9.3