Jakub,
I've looked into how to implement the openacc kernels directive in gcc.
In order to map the loopnests marked by the kernels directive efficiently on
accelerator hardware, we need parallelization and vectorization.
Focusing on parallelization for the moment, a possibility for parallelization
is to use the parloops pass. The parloops pass identifies loops that can be
parallelized with a factor n, splits off the n-reduced loop into a function and
issues the function in n parallel threads.
A problem with using parloops for the kernels directive is that the parloops
pass is placed after lto's gimple-stream read/write point, so the parloops pass
is executed during the accelerator-specific compilation. So while the resulting
function with the reduced loop is compiled for the accelerator as required, also
the code issuing the function in parallel threads is generated for the
accelerator. While, for instance, newer CUDA with dynamic parallelism supports launching
accelerator kernels from within accelerator kernels, I guess that that might not
hold in general.
I've investigated moving the parloops pass up in the pass list, using attached
example kernels.c. It contains 4 loops; 2 loops that set arrays, one loop that
does a vector addition, and one loop that does a reduction.
First, I compile the example using upstream trunk:
...
$ gcc -ftree-parallelize-loops=32 -fdump-tree-all-all -O2 kernels.c -std=c99
-Wl,-rpath,$(pwd -P)/lean-c/install/lib64
$ ./a.out ; echo $?
sum: 4293394432
0
...
All 4 loops are recognized as parallel by parloops:
...
$ egrep 'SUCCES|FAIL' kernels.c.*parloops
SUCCESS: may be parallelized
SUCCESS: may be parallelized
SUCCESS: may be parallelized
SUCCESS: may be parallelized
...
Using attached patch, I manage the same with parloops placed after
pass_build_ealias, with some additional passes in between:
...
NEXT_PASS (pass_build_ealias);
NEXT_PASS (pass_ch);
NEXT_PASS (pass_ccp);
NEXT_PASS (pass_lim_aux);
NEXT_PASS (pass_parallelize_loops);
...
The pass_lim_aux in front is needed because otherwise the loads of pointers a, b
and c stay in the loop and prevent parallelization.
The pass_ccp is to get rid of:
...
phi is i_5 = PHI <0(3)>
arg of phi to exit: value 0 used outside loop
checking if it a part of reduction pattern:
FAILED: it is not a part of reduction.
...
The pass_ch is to get rid of:
...
phi is sum_3 = PHI <sum_1(4)>
arg of phi to exit: value sum_1 used outside loop
checking if it a part of reduction pattern:
FAILED: it is not a part of reduction.
...
The place after build_ealias is early enough to be before the lto-stream
write/read. I don't see how we can do this earlier. Before ealias, there's no
alias info, and one of the loops fails to be recognized as parallel.
Furthermore, pass_ch, pass_ccp, pass_lim_aux and pass_parallelize_loops are written to
work on cfg/ssa code, which we don't have at omp_low/omp_exp time.
We could insert a pass-group here that only deals with functions that have the
kernels directive, and do the auto-par thing in a pass_oacc_kernels (which
should share the majority of the infrastructure with the parloops pass):
...
NEXT_PASS (pass_build_ealias);
INSERT_PASSES_AFTER/WITHIN (passes_oacc_kernels)
NEXT_PASS (pass_ch);
NEXT_PASS (pass_ccp);
NEXT_PASS (pass_lim_aux);
NEXT_PASS (pass_oacc_par);
POP_INSERT_PASSES ()
...
Any comments, ideas or suggestions?
Thanks,
- Tom
#include <stdlib.h>
#include <stdio.h>
#define N (1024 * 512)
#define N_REF 4293394432
unsigned int *__restrict a;
unsigned int *__restrict b;
unsigned int *__restrict c;
/* Populate the two input arrays, one loop per array: element k of a
   receives 2 * k and element k of b receives 4 * k, for k in [0, N).  */
void
init_input (void)
{
  for (unsigned int k = 0; k < N; k++)
    {
      a[k] = 2 * k;
    }

  for (unsigned int m = 0; m < N; m++)
    {
      b[m] = 4 * m;
    }
}
/* Accumulate the N elements of c into an unsigned int (so the total
   wraps on overflow), print it, and abort unless it equals N_REF.  */
void
check_output (void)
{
  unsigned int total = 0;

  for (unsigned int k = 0; k < N; k++)
    {
      total = total + c[k];
    }

  printf ("sum: %u\n", total);

  /* Matching checksum: nothing more to do.  */
  if (total == N_REF)
    return;

  abort ();
}
/* Driver: allocate the three N-element arrays, fill the inputs with
   init_input (), compute c[k] = a[k] + b[k] for every k in [0, N), and
   hand the result to check_output ().

   Returns 0 after the arrays have been released, or EXIT_FAILURE when
   any of the allocations fails.  */
int
main (void)
{
  a = malloc (N * sizeof (unsigned int));
  b = malloc (N * sizeof (unsigned int));
  c = malloc (N * sizeof (unsigned int));

  /* The loops below dereference all three pointers unconditionally, so
     bail out early if any allocation failed.  free (NULL) is a no-op,
     which lets us release whichever allocations did succeed.  */
  if (a == NULL || b == NULL || c == NULL)
    {
      free (a);
      free (b);
      free (c);
      return EXIT_FAILURE;
    }

  init_input ();

  /* Vector addition of the two inputs.  */
  for (int ii = 0; ii < N; ii++)
    c[ii] = a[ii] + b[ii];

  check_output ();

  free (a);
  free (b);
  free (c);

  return 0;
}
diff --git a/gcc/passes.def b/gcc/passes.def
index f13df6c..b501d2f 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -72,6 +72,10 @@ along with GCC; see the file COPYING3. If not see
/* pass_build_ealias is a dummy pass that ensures that we
execute TODO_rebuild_alias at this point. */
NEXT_PASS (pass_build_ealias);
+ NEXT_PASS (pass_ch);
+ NEXT_PASS (pass_ccp);
+ NEXT_PASS (pass_lim_aux);
+ NEXT_PASS (pass_parallelize_loops);
NEXT_PASS (pass_fre);
NEXT_PASS (pass_merge_phi);
NEXT_PASS (pass_cd_dce);
@@ -159,7 +163,6 @@ along with GCC; see the file COPYING3. If not see
NEXT_PASS (pass_tree_ifcombine);
NEXT_PASS (pass_phiopt);
NEXT_PASS (pass_tail_recursion);
- NEXT_PASS (pass_ch);
NEXT_PASS (pass_stdarg);
NEXT_PASS (pass_lower_complex);
NEXT_PASS (pass_sra);
@@ -221,7 +224,6 @@ along with GCC; see the file COPYING3. If not see
NEXT_PASS (pass_dce);
POP_INSERT_PASSES ()
NEXT_PASS (pass_iv_canon);
- NEXT_PASS (pass_parallelize_loops);
NEXT_PASS (pass_if_conversion);
/* pass_vectorize must immediately follow pass_if_conversion.
Please do not add any other passes in between. */
diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c
index 112c295..87a77f8 100644
--- a/gcc/tree-parloops.c
+++ b/gcc/tree-parloops.c
@@ -2269,10 +2269,21 @@ public:
unsigned
pass_parallelize_loops::execute (function *fun)
{
- if (number_of_loops (fun) <= 1)
- return 0;
+ unsigned res = 0;
- if (parallelize_loops ())
+ loop_optimizer_init (LOOPS_NORMAL
+ | LOOPS_HAVE_RECORDED_EXITS);
+ rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
+ scev_initialize ();
+
+ if (number_of_loops (fun) > 1)
+ res = parallelize_loops ();
+
+ free_numbers_of_iterations_estimates ();
+ scev_finalize ();
+ loop_optimizer_finalize ();
+
+ if (res)
return TODO_cleanup_cfg | TODO_rebuild_alias;
return 0;
}
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 1477d1f..299d24e 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -353,6 +353,7 @@ extern gimple_opt_pass *make_pass_tree_loop (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_tree_no_loop (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_tree_loop_init (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_lim (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_lim_aux (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_tree_unswitch (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_predcom (gcc::context *ctxt);
extern gimple_opt_pass *make_pass_iv_canon (gcc::context *ctxt);
diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c
index 0cbb3ae..fe0eeb7 100644
--- a/gcc/tree-ssa-loop-im.c
+++ b/gcc/tree-ssa-loop-im.c
@@ -2509,10 +2509,18 @@ tree_ssa_lim_finalize (void)
i.e. those that are likely to be win regardless of the register pressure. */
unsigned int
-tree_ssa_lim (void)
+tree_ssa_lim (bool aux_p)
{
unsigned int todo;
+ if (aux_p)
+ {
+ loop_optimizer_init (LOOPS_NORMAL
+ | LOOPS_HAVE_RECORDED_EXITS);
+
+ calculate_dominance_info (CDI_DOMINATORS);
+ }
+
tree_ssa_lim_initialize ();
/* Gathers information about memory accesses in the loops. */
@@ -2535,6 +2543,11 @@ tree_ssa_lim (void)
tree_ssa_lim_finalize ();
+ if (aux_p)
+ {
+ loop_optimizer_finalize ();
+ }
+
return todo;
}
@@ -2555,6 +2568,19 @@ const pass_data pass_data_lim =
0, /* todo_flags_finish */
};
+const pass_data pass_data_lim_aux =
+{
+ GIMPLE_PASS, /* type */
+ "limaux", /* name */
+ OPTGROUP_LOOP, /* optinfo_flags */
+ TV_LIM, /* tv_id */
+ PROP_cfg, /* properties_required */
+ 0, /* properties_provided */
+ 0, /* properties_destroyed */
+ 0, /* todo_flags_start */
+ 0, /* todo_flags_finish */
+};
+
class pass_lim : public gimple_opt_pass
{
public:
@@ -2569,18 +2595,47 @@ public:
}; // class pass_lim
+class pass_lim_aux : public gimple_opt_pass
+{
+public:
+ pass_lim_aux (gcc::context *ctxt)
+ : gimple_opt_pass (pass_data_lim_aux, ctxt)
+ {}
+
+ /* opt_pass methods: */
+ opt_pass * clone () { return new pass_lim_aux (m_ctxt); }
+ virtual bool gate (function *) { return flag_tree_loop_im != 0; }
+ virtual unsigned int execute (function *);
+
+}; // class pass_lim_aux
+
+unsigned int
+pass_lim_aux::execute (function *fun)
+{
+ if (number_of_loops (fun) <= 1)
+ return 0;
+
+ return tree_ssa_lim (1);
+}
+
unsigned int
pass_lim::execute (function *fun)
{
if (number_of_loops (fun) <= 1)
return 0;
- return tree_ssa_lim ();
+ return tree_ssa_lim (0);
}
} // anon namespace
gimple_opt_pass *
+make_pass_lim_aux (gcc::context *ctxt)
+{
+ return new pass_lim_aux (ctxt);
+}
+
+gimple_opt_pass *
make_pass_lim (gcc::context *ctxt)
{
return new pass_lim (ctxt);