Hi,
I.
This patch allows parallelization of an outer loop in an openacc kernels
region.
The testcase is based on autopar/outer-1.c.
II.
We rely on pass_lim to move the *.omp_data_i loads out of the loop nest.
For the test-case, pass_lim was managing to move the load out of the
inner loop, but not the outer loop, because the load was classified as
'MOVE_PRESERVE_EXECUTION'. By marking the *.omp_data_i load
non-trapping, it's now classified as 'MOVE_POSSIBLE', and moved out of
the loop nest.
III.
The 'loops_state_set (LOOPS_NEED_FIXUP)' is a somewhat blunt and
temporary fix for the oacc kernels variant of PR66846 - parloops does
not always mark loops for fixup if needed.
The original PR needs an added verify_loop_structure to trigger the
problem. Normally the problem is hidden by the fact that the first pass
that runs on the new function is pass_fixup_cfg, which happens to fixup
the loops (The loops are fixed up because TODO_cleanup_cfg is set during
pass_fixup_cfg, because the function contains an ECF_CONST function:
__builtin_omp_get_num_threads).
For the oacc kernels variant, the problem triggers without adding
verify_loop_structure. During pass_ipa_inline, we call
loop_optimizer_init, which (given that LOOPS_NEED_FIXUP is not set)
verifies the loop structure, which fails. Pass_fixup_cfg is not run
inbetween the discovery of the new function and pass_ipa_inline.
IV.
I've committed this patch to gomp-4_0-branch.
Bootstrapped and reg-tested on x86_64. Build and reg-tested on setup
with nvidia accelerator.
Thanks,
- Tom
Handle nested loops in kernels regions
2015-07-12 Tom de Vries <t...@codesourcery.com>
* omp-low.c (build_receiver_ref): Mark *.omp_data_i as non-trapping.
* tree-parloops.c (gen_parallel_loop): Add LOOPS_NEED_FIXUP to loop
state.
(parallelize_loops): Allow nested loops.
* testsuite/libgomp.oacc-c-c++-common/kernels-loop-nest.c: New test.
* c-c++-common/goacc/kernels-loop-nest.c: New test.
---
gcc/omp-low.c | 1 +
.../c-c++-common/goacc/kernels-loop-nest.c | 42 ++++++++++++++++++++++
gcc/tree-parloops.c | 5 +--
.../libgomp.oacc-c-c++-common/kernels-loop-nest.c | 26 ++++++++++++++
4 files changed, 70 insertions(+), 4 deletions(-)
create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-nest.c
create mode 100644 libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-nest.c
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 11ac909..a938ce0 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -1147,6 +1147,7 @@ build_receiver_ref (tree var, bool by_ref, omp_context *ctx)
field = x;
x = build_simple_mem_ref (ctx->receiver_decl);
+ TREE_THIS_NOTRAP (x) = 1;
x = omp_build_component_ref (x, field);
if (by_ref)
x = build_simple_mem_ref (x);
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-nest.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-nest.c
new file mode 100644
index 0000000..3e06c9f
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-nest.c
@@ -0,0 +1,42 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+/* { dg-additional-options "-fdump-tree-parloops_oacc_kernels-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+/* Based on autopar/outer-1.c. */
+
+#include <stdlib.h>
+
+#define N 1000
+
+int
+main (void)
+{
+ int x[N][N];
+
+#pragma acc kernels copyout (x)
+ {
+ for (int ii = 0; ii < N; ii++)
+ for (int jj = 0; jj < N; jj++)
+ x[ii][jj] = ii + jj + 3;
+ }
+
+ for (int i = 0; i < N; i++)
+ for (int j = 0; j < N; j++)
+ if (x[i][j] != i + j + 3)
+ abort ();
+
+ return 0;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized. */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops_oacc_kernels" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops_oacc_kernels" } } */
+
+/* Check that the loop has been split off into a function. */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+
+/* { dg-final { scan-tree-dump-times "(?n)pragma omp target oacc_parallel.*num_gangs\\(32\\)" 1 "parloops_oacc_kernels" } } */
+
+/* { dg-final { cleanup-tree-dump "parloops_oacc_kernels" } } */
+/* { dg-final { cleanup-tree-dump "optimized" } } */
diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c
index 04708c0..492ffcb 100644
--- a/gcc/tree-parloops.c
+++ b/gcc/tree-parloops.c
@@ -2442,6 +2442,7 @@ gen_parallel_loop (struct loop *loop,
/* Cancel the loop (it is simpler to do it here rather than to teach the
expander to do it). */
cancel_loop_tree (loop);
+ loops_state_set (LOOPS_NEED_FIXUP);
/* Free loop bound estimations that could contain references to
removed statements. */
@@ -2761,10 +2762,6 @@ parallelize_loops (bool oacc_kernels_p)
if (!loop->in_oacc_kernels_region)
continue;
- /* TODO: Allow nested loops. */
- if (loop->inner)
- continue;
-
if (dump_file && (dump_flags & TDF_DETAILS))
fprintf (dump_file,
"Trying loop %d with header bb %d in oacc kernels region\n",
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-nest.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-nest.c
new file mode 100644
index 0000000..21d2599
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-nest.c
@@ -0,0 +1,26 @@
+/* { dg-do run } */
+/* { dg-additional-options "-ftree-parallelize-loops=32" } */
+
+#include <stdlib.h>
+
+#define N 1000
+
+int
+main (void)
+{
+ int x[N][N];
+
+#pragma acc kernels copyout (x)
+ {
+ for (int ii = 0; ii < N; ii++)
+ for (int jj = 0; jj < N; jj++)
+ x[ii][jj] = ii + jj + 3;
+ }
+
+ for (int i = 0; i < N; i++)
+ for (int j = 0; j < N; j++)
+ if (x[i][j] != i + j + 3)
+ abort ();
+
+ return 0;
+}
--
1.9.1