As Chung-Lin noted here <https://gcc.gnu.org/ml/gcc-patches/2015-06/msg01079.html>:
This patch adjusts omp-low.c:expand_omp_for_generic() to expand to a "sequential" loop form (without the OMP runtime calls), used for loop directives inside OpenACC kernels constructs. Tom mentions that this allows the kernels parallelization to work when '#pragma acc loop' makes the front-ends create OMP_FOR, which the loop analysis phases don't understand. I bootstrapped and regtested it on x86_64 Linux with nvptx offloading. Is this patch OK for trunk? Thanks, Cesar
[OpenACC] Generate sequential loop for OpenACC loop directive inside kernels 2018-XX-YY Chung-Lin Tang <clt...@codesourcery.com> Cesar Philippidis <ce...@codesourcery.com> gcc/ * omp-expand.c (struct omp_region): Add inside_kernels_p field. (expand_omp_for_generic): Adjust to generate a 'sequential' loop when GOMP builtin arguments are BUILT_IN_NONE. (expand_omp_for): Use expand_omp_for_generic to generate a non-parallelized loop for OMP_FORs inside OpenACC kernels regions. (expand_omp): Mark inside_kernels_p field true for regions nested inside OpenACC kernels constructs. gcc/testsuite/ * c-c++-common/goacc/kernels-loop-acc-loop.c: New test. * c-c++-common/goacc/kernels-loop-2-acc-loop.c: New test. * c-c++-common/goacc/kernels-loop-3-acc-loop.c: New test. * c-c++-common/goacc/kernels-loop-n-acc-loop.c: New test. * c-c++-common/goacc/kernels-acc-loop-reduction.c: New test. * c-c++-common/goacc/kernels-acc-loop-smaller-equal.c: New test. (cherry picked from gomp-4_0-branch r224505, r224837, r228232, r228233, r231461, and r247958) --- gcc/omp-expand.c | 136 ++++++++++++------ .../goacc/kernels-acc-loop-reduction.c | 23 +++ .../goacc/kernels-acc-loop-smaller-equal.c | 23 +++ .../goacc/kernels-loop-2-acc-loop.c | 18 +++ .../goacc/kernels-loop-3-acc-loop.c | 15 ++ .../goacc/kernels-loop-acc-loop.c | 15 ++ .../goacc/kernels-loop-n-acc-loop.c | 15 ++ 7 files changed, 204 insertions(+), 41 deletions(-) create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c index d2a77c067c6..9b03f62e065 100644 --- a/gcc/omp-expand.c +++ b/gcc/omp-expand.c @@ -104,6 +104,9 @@ struct omp_region /* The ordered stmt if type is GIMPLE_OMP_ORDERED and it has a depend clause. */ gomp_ordered *ord_stmt; + + /* True if this is nested inside an OpenACC kernels construct. */ + bool inside_kernels_p; }; static struct omp_region *root_omp_region; @@ -2509,6 +2512,7 @@ expand_omp_for_generic (struct omp_region *region, gassign *assign_stmt; bool in_combined_parallel = is_combined_parallel (region); bool broken_loop = region->cont == NULL; + bool seq_loop = (start_fn == BUILT_IN_NONE || next_fn == BUILT_IN_NONE); edge e, ne; tree *counts = NULL; int i; @@ -2606,8 +2610,12 @@ expand_omp_for_generic (struct omp_region *region, type = TREE_TYPE (fd->loop.v); istart0 = create_tmp_var (fd->iter_type, ".istart0"); iend0 = create_tmp_var (fd->iter_type, ".iend0"); - TREE_ADDRESSABLE (istart0) = 1; - TREE_ADDRESSABLE (iend0) = 1; + + if (!seq_loop) + { + TREE_ADDRESSABLE (istart0) = 1; + TREE_ADDRESSABLE (iend0) = 1; + } /* See if we need to bias by LLONG_MIN. */ if (fd->iter_type == long_long_unsigned_type_node @@ -2637,7 +2645,25 @@ expand_omp_for_generic (struct omp_region *region, gsi_prev (&gsif); tree arr = NULL_TREE; - if (in_combined_parallel) + if (seq_loop) + { + tree n1 = fold_convert (fd->iter_type, fd->loop.n1); + tree n2 = fold_convert (fd->iter_type, fd->loop.n2); + + n1 = force_gimple_operand_gsi_1 (&gsi, n1, is_gimple_reg, NULL_TREE, true, + GSI_SAME_STMT); + n2 = force_gimple_operand_gsi_1 (&gsi, n2, is_gimple_reg, NULL_TREE, true, + GSI_SAME_STMT); + + assign_stmt = gimple_build_assign (istart0, n1); + gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT); + + assign_stmt = gimple_build_assign (iend0, n2); + gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT); + + t = fold_build2 (NE_EXPR, boolean_type_node, istart0, iend0); + } + else if (in_combined_parallel) { gcc_assert (fd->ordered == 0); /* In a combined parallel loop, emit a call to @@ -3059,39 +3085,45 @@ expand_omp_for_generic (struct omp_region *region, collapse_bb = extract_omp_for_update_vars (fd, cont_bb, l1_bb); /* Emit code to get the next parallel iteration in L2_BB. */ - gsi = gsi_start_bb (l2_bb); + if (!seq_loop) + { + gsi = gsi_start_bb (l2_bb); - t = build_call_expr (builtin_decl_explicit (next_fn), 2, - build_fold_addr_expr (istart0), - build_fold_addr_expr (iend0)); - t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, - false, GSI_CONTINUE_LINKING); - if (TREE_TYPE (t) != boolean_type_node) - t = fold_build2 (NE_EXPR, boolean_type_node, - t, build_int_cst (TREE_TYPE (t), 0)); - gcond *cond_stmt = gimple_build_cond_empty (t); - gsi_insert_after (&gsi, cond_stmt, GSI_CONTINUE_LINKING); + t = build_call_expr (builtin_decl_explicit (next_fn), 2, + build_fold_addr_expr (istart0), + build_fold_addr_expr (iend0)); + t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE, + false, GSI_CONTINUE_LINKING); + if (TREE_TYPE (t) != boolean_type_node) + t = fold_build2 (NE_EXPR, boolean_type_node, + t, build_int_cst (TREE_TYPE (t), 0)); + gcond *cond_stmt = gimple_build_cond_empty (t); + gsi_insert_after (&gsi, cond_stmt, GSI_CONTINUE_LINKING); + } } /* Add the loop cleanup function. */ gsi = gsi_last_nondebug_bb (exit_bb); - if (gimple_omp_return_nowait_p (gsi_stmt (gsi))) - t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_NOWAIT); - else if (gimple_omp_return_lhs (gsi_stmt (gsi))) - t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_CANCEL); - else - t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END); - gcall *call_stmt = gimple_build_call (t, 0); - if (gimple_omp_return_lhs (gsi_stmt (gsi))) - gimple_call_set_lhs (call_stmt, gimple_omp_return_lhs (gsi_stmt (gsi))); - gsi_insert_after (&gsi, call_stmt, GSI_SAME_STMT); - if (fd->ordered) + if (!seq_loop) { - tree arr = counts[fd->ordered]; - tree clobber = build_constructor (TREE_TYPE (arr), NULL); - TREE_THIS_VOLATILE (clobber) = 1; - gsi_insert_after (&gsi, gimple_build_assign (arr, clobber), - GSI_SAME_STMT); + if (gimple_omp_return_nowait_p (gsi_stmt (gsi))) + t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_NOWAIT); + else if (gimple_omp_return_lhs (gsi_stmt (gsi))) + t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_CANCEL); + else + t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END); + gcall *call_stmt = gimple_build_call (t, 0); + if (gimple_omp_return_lhs (gsi_stmt (gsi))) + gimple_call_set_lhs (call_stmt, gimple_omp_return_lhs (gsi_stmt (gsi))); + gsi_insert_after (&gsi, call_stmt, GSI_SAME_STMT); + if (fd->ordered) + { + tree arr = counts[fd->ordered]; + tree clobber = build_constructor (TREE_TYPE (arr), NULL); + TREE_THIS_VOLATILE (clobber) = 1; + gsi_insert_after (&gsi, gimple_build_assign (arr, clobber), + GSI_SAME_STMT); + } } gsi_remove (&gsi, true); @@ -3104,7 +3136,8 @@ expand_omp_for_generic (struct omp_region *region, gimple_seq phis; e = find_edge (cont_bb, l3_bb); - ne = make_edge (l2_bb, l3_bb, EDGE_FALSE_VALUE); + ne = make_edge (l2_bb, l3_bb, + seq_loop ? EDGE_FALLTHRU : EDGE_FALSE_VALUE); phis = phi_nodes (l3_bb); for (gsi = gsi_start (phis); !gsi_end_p (gsi); gsi_next (&gsi)) @@ -3144,7 +3177,8 @@ expand_omp_for_generic (struct omp_region *region, e = find_edge (cont_bb, l2_bb); e->flags = EDGE_FALLTHRU; } - make_edge (l2_bb, l0_bb, EDGE_TRUE_VALUE); + if (!seq_loop) + make_edge (l2_bb, l0_bb, EDGE_TRUE_VALUE); if (gimple_in_ssa_p (cfun)) { @@ -3203,12 +3237,16 @@ expand_omp_for_generic (struct omp_region *region, add_bb_to_loop (l2_bb, outer_loop); - /* We've added a new loop around the original loop. Allocate the - corresponding loop struct. */ - struct loop *new_loop = alloc_loop (); - new_loop->header = l0_bb; - new_loop->latch = l2_bb; - add_loop (new_loop, outer_loop); + struct loop *new_loop = NULL; + if (!seq_loop) + { + /* We've added a new loop around the original loop. Allocate the + corresponding loop struct. */ + new_loop = alloc_loop (); + new_loop->header = l0_bb; + new_loop->latch = l2_bb; + add_loop (new_loop, outer_loop); + } /* Allocate a loop structure for the original loop unless we already had one. */ @@ -3218,7 +3256,8 @@ expand_omp_for_generic (struct omp_region *region, struct loop *orig_loop = alloc_loop (); orig_loop->header = l1_bb; /* The loop may have multiple latches. */ - add_loop (orig_loop, new_loop); + add_loop (orig_loop, + new_loop != NULL ? new_loop : outer_loop); } } } @@ -5665,7 +5704,10 @@ expand_omp_for (struct omp_region *region, gimple *inner_stmt) original loops from being detected. Fix that up. */ loops_state_set (LOOPS_NEED_FIXUP); - if (gimple_omp_for_kind (fd.for_stmt) & GF_OMP_FOR_SIMD) + if (region->inside_kernels_p) + expand_omp_for_generic (region, &fd, BUILT_IN_NONE, BUILT_IN_NONE, + inner_stmt); + else if (gimple_omp_for_kind (fd.for_stmt) & GF_OMP_FOR_SIMD) expand_omp_simd (region, &fd); else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP) { @@ -7750,7 +7792,19 @@ expand_omp (struct omp_region *region) if (region->type == GIMPLE_OMP_PARALLEL) determine_parallel_type (region); else if (region->type == GIMPLE_OMP_TARGET) - grid_expand_target_grid_body (region); + { + grid_expand_target_grid_body (region); + + if (region->inner) + { + gomp_target *entry + = as_a <gomp_target *> (last_stmt (region->entry)); + if (region->inside_kernels_p + || (gimple_omp_target_kind (entry) + == GF_OMP_TARGET_KIND_OACC_KERNELS)) + region->inner->inside_kernels_p = true; + } + } if (region->type == GIMPLE_OMP_FOR && gimple_omp_for_combined_p (last_stmt (region->entry))) diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c new file mode 100644 index 00000000000..4824e530925 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c @@ -0,0 +1,23 @@ +/* { dg-additional-options "-O2" } */ +/* { dg-additional-options "-fdump-tree-parloops1-all" } */ +/* { dg-additional-options "-fdump-tree-optimized" } */ + +unsigned int +foo (int n, unsigned int *a) +{ + unsigned int sum = 0; + +#pragma acc kernels loop gang reduction(+:sum) + for (int i = 0; i < n; i++) + sum += a[i]; + + return sum; +} + +/* Check that only one loop is analyzed, and that it can be parallelized. */ +/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */ +/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */ +/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */ + +/* Check that the loop has been split off into a function. */ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*\\._omp_fn\\.0" 1 "optimized" } } */ diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c new file mode 100644 index 00000000000..d70afb0e662 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c @@ -0,0 +1,23 @@ +/* { dg-additional-options "-O2" } */ +/* { dg-additional-options "-fdump-tree-parloops1-all" } */ +/* { dg-additional-options "-fdump-tree-optimized" } */ + +unsigned int +foo (int n) +{ + unsigned int sum = 1; + + #pragma acc kernels loop + for (int i = 1; i <= n; i++) + sum += i; + + return sum; +} + +/* Check that only one loop is analyzed, and that it can be parallelized. */ +/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */ +/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */ +/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */ + +/* Check that the loop has been split off into a function. */ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*\\._omp_fn\\.0" 1 "optimized" } } */ diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c new file mode 100644 index 00000000000..7b127cb6fd9 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c @@ -0,0 +1,18 @@ +/* { dg-additional-options "-O2" } */ +/* { dg-additional-options "-fdump-tree-parloops1-all" } */ +/* { dg-additional-options "-fdump-tree-optimized" } */ + +/* Check that loops with '#pragma acc loop' tagged gets properly parallelized. */ +#define ACC_LOOP +#include "kernels-loop-2.c" + +/* Check that only three loops are analyzed, and that all can be + parallelized. */ +/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops1" } } */ +/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 3 "parloops1" } } */ +/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */ + +/* Check that the loop has been split off into a function. */ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */ diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c new file mode 100644 index 00000000000..a040e096fc1 --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c @@ -0,0 +1,15 @@ +/* { dg-additional-options "-O2" } */ +/* { dg-additional-options "-fdump-tree-parloops1-all" } */ +/* { dg-additional-options "-fdump-tree-optimized" } */ + +/* Check that loops with '#pragma acc loop' tagged gets properly parallelized. */ +#define ACC_LOOP +#include "kernels-loop-3.c" + +/* Check that only one loop is analyzed, and that it can be parallelized. */ +/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */ +/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */ +/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */ + +/* Check that the loop has been split off into a function. */ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */ diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c new file mode 100644 index 00000000000..070a5b5bf3d --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c @@ -0,0 +1,15 @@ +/* { dg-additional-options "-O2" } */ +/* { dg-additional-options "-fdump-tree-parloops1-all" } */ +/* { dg-additional-options "-fdump-tree-optimized" } */ + +/* Check that loops with '#pragma acc loop' tagged gets properly parallelized. */ +#define ACC_LOOP +#include "kernels-loop.c" + +/* Check that only one loop is analyzed, and that it can be parallelized. */ +/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */ +/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */ +/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */ + +/* Check that the loop has been split off into a function. */ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */ diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c new file mode 100644 index 00000000000..1f25e63fbbb --- /dev/null +++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c @@ -0,0 +1,15 @@ +/* { dg-additional-options "-O2" } */ +/* { dg-additional-options "-fdump-tree-parloops1-all" } */ +/* { dg-additional-options "-fdump-tree-optimized" } */ + +/* Check that loops with '#pragma acc loop' tagged gets properly parallelized. */ +#define ACC_LOOP +#include "kernels-loop-n.c" + +/* Check that only one loop is analyzed, and that it can be parallelized. */ +/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */ +/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */ +/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */ + +/* Check that the loop has been split off into a function. */ +/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*._omp_fn.0" 1 "optimized" } } */ -- 2.17.1