Jakub,

I've looked into how to implement the openacc kernels directive in gcc.

In order to map the loopnests marked by the kernels directive efficiently on accelerator hardware, we need parallelization and vectorization.

Focusing on parallelization for the moment, one possibility is to use the parloops pass. The parloops pass identifies loops that can be parallelized with a factor n, splits off the n-reduced loop into a function and issues the function in n parallel threads.

A problem with using parloops for the kernels directive is that the parloops pass is placed after lto's gimple-stream read/write point, so the parloops pass is executed during the accelerator-specific compilation. So while the resulting function with the reduced loop is compiled for the accelerator as required, the code issuing the function in parallel threads is also generated for the accelerator. While, for instance, newer cuda with dynamic parallelism supports launching accelerator kernels from within accelerator kernels, I guess that this might not hold in general.

I've investigated moving the parloops pass up in the pass list, using the attached example kernels.c. It contains 4 loops: 2 loops that set arrays, one loop that does a vector addition, and one loop that does a reduction.

First, I compile the example using upstream trunk:
...
$ gcc -ftree-parallelize-loops=32 -fdump-tree-all-all -O2 kernels.c -std=c99 -Wl,-rpath,$(pwd -P)/lean-c/install/lib64
$ ./a.out ; echo $?
sum: 4293394432
0
...

All 4 loops are recognized as parallel by parloops:
...
$ egrep 'SUCCES|FAIL' kernels.c.*parloops
   SUCCESS: may be parallelized
   SUCCESS: may be parallelized
   SUCCESS: may be parallelized
   SUCCESS: may be parallelized
...

Using the attached patch, I get the same result with parloops placed after pass_build_ealias, with some additional passes in between:
...
          NEXT_PASS (pass_build_ealias);
          NEXT_PASS (pass_ch);
          NEXT_PASS (pass_ccp);
          NEXT_PASS (pass_lim_aux);
          NEXT_PASS (pass_parallelize_loops);
 ...

The pass_lim_aux in front is needed because otherwise the loads of pointers a, b and c stay in the loop and prevent parallelization.

The pass_ccp is to get rid of:
...
phi is i_5 = PHI <0(3)>
arg of phi to exit:   value 0 used outside loop
  checking if it a part of reduction pattern:
  FAILED: it is not a part of reduction.
...

The pass_ch is to get rid of:
...
phi is sum_3 = PHI <sum_1(4)>
arg of phi to exit:   value sum_1 used outside loop
  checking if it a part of reduction pattern:
  FAILED: it is not a part of reduction.
...

The place after build_ealias is early enough to be before the lto-stream write/read. I don't see how we can do this earlier. Before ealias, there's no alias info, and one of the loops fails to be recognized as parallel. Furthermore, pass_ch, pass_ccp, pass_lim_aux and pass_parallelize_loops are written to work on cfg/ssa code, which we don't have at omp_low/omp_exp time.

We could insert a pass-group here that only deals with functions that have the kernels directive, and do the auto-par thing in a pass_oacc_kernels (which should share the majority of the infrastructure with the parloops pass):
...
          NEXT_PASS (pass_build_ealias);
          INSERT_PASSES_AFTER/WITHIN (passes_oacc_kernels)
             NEXT_PASS (pass_ch);
             NEXT_PASS (pass_ccp);
             NEXT_PASS (pass_lim_aux);
             NEXT_PASS (pass_oacc_par);
          POP_INSERT_PASSES ()
...

Any comments, ideas or suggestions?

Thanks,
- Tom

#include <stdlib.h>
#include <stdio.h>

#define N (1024 * 512)
#define N_REF 4293394432

unsigned int *__restrict a;
unsigned int *__restrict b;
unsigned int *__restrict c;

/* Initialize the two input vectors: for every index i in [0, N),
   store 2 * i in a[i] and 4 * i in b[i].  The global pointers a and b
   are dereferenced here without any null check.  */
void
init_input (void)
{
  /* First array-setting loop: a[i] = 2 * i.  */
  for (unsigned int i = 0; i < N; i++)
    a[i] = i * 2;

  /* Second array-setting loop: b[i] = 4 * i.  */
  for (unsigned int i = 0; i < N; i++)
    b[i] = i * 4;
}

/* Add up the N elements of c, print the total and abort if it differs
   from N_REF.  The accumulator is an unsigned int, so the addition
   wraps around on overflow rather than being undefined.  */
void
check_output (void)
{
  unsigned int sum = 0;

  /* Reduction loop over c.  */
  for (unsigned int i = 0; i < N; i++)
    sum += c[i];

  printf ("sum: %u\n", sum);

  /* A mismatch with the reference value terminates the program.  */
  if (sum != N_REF)
    abort ();
}

/* Allocate the three N-element vectors, fill the inputs, compute the
   element-wise sum c = a + b, verify the result and release the memory.
   Returns 0 on success and 1 if an allocation fails; a wrong result
   makes check_output abort.  */
int
main (void)
{
  a = malloc (N * sizeof (unsigned int));
  b = malloc (N * sizeof (unsigned int));
  c = malloc (N * sizeof (unsigned int));

  /* Bail out instead of dereferencing a null pointer further down.
     free (NULL) is a no-op, so releasing all three is safe.  */
  if (a == NULL || b == NULL || c == NULL)
    {
      fprintf (stderr, "kernels: out of memory\n");
      free (a);
      free (b);
      free (c);
      return 1;
    }

  init_input ();

  /* Vector addition loop.  */
  for (int ii = 0; ii < N; ii++)
    c[ii] = a[ii] + b[ii];

  check_output ();

  free (a);
  free (b);
  free (c);

  return 0;
}

diff --git a/gcc/passes.def b/gcc/passes.def
index f13df6c..b501d2f 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -72,6 +72,10 @@ along with GCC; see the file COPYING3.  If not see
 	  /* pass_build_ealias is a dummy pass that ensures that we
 	     execute TODO_rebuild_alias at this point.  */
 	  NEXT_PASS (pass_build_ealias);
+	  NEXT_PASS (pass_ch);
+	  NEXT_PASS (pass_ccp);
+	  NEXT_PASS (pass_lim_aux);
+	  NEXT_PASS (pass_parallelize_loops);
 	  NEXT_PASS (pass_fre);
 	  NEXT_PASS (pass_merge_phi);
 	  NEXT_PASS (pass_cd_dce);
@@ -159,7 +163,6 @@ along with GCC; see the file COPYING3.  If not see
       NEXT_PASS (pass_tree_ifcombine);
       NEXT_PASS (pass_phiopt);
       NEXT_PASS (pass_tail_recursion);
-      NEXT_PASS (pass_ch);
       NEXT_PASS (pass_stdarg);
       NEXT_PASS (pass_lower_complex);
       NEXT_PASS (pass_sra);
@@ -221,7 +224,6 @@ along with GCC; see the file COPYING3.  If not see
 	      NEXT_PASS (pass_dce);
 	  POP_INSERT_PASSES ()
 	  NEXT_PASS (pass_iv_canon);
-	  NEXT_PASS (pass_parallelize_loops);
 	  NEXT_PASS (pass_if_conversion);
 	  /* pass_vectorize must immediately follow pass_if_conversion.
 	     Please do not add any other passes in between.  */
diff --git a/gcc/tree-parloops.c b/gcc/tree-parloops.c
index 112c295..87a77f8 100644
--- a/gcc/tree-parloops.c
+++ b/gcc/tree-parloops.c
@@ -2269,10 +2269,21 @@ public:
 unsigned
 pass_parallelize_loops::execute (function *fun)
 {
-  if (number_of_loops (fun) <= 1)
-    return 0;
+  unsigned res = 0;
 
-  if (parallelize_loops ())
+  loop_optimizer_init (LOOPS_NORMAL
+		       | LOOPS_HAVE_RECORDED_EXITS);
+  rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
+  scev_initialize ();
+
+  if (number_of_loops (fun) > 1)
+    res = parallelize_loops ();
+
+  free_numbers_of_iterations_estimates ();
+  scev_finalize ();
+  loop_optimizer_finalize ();
+
+  if (res)
     return TODO_cleanup_cfg | TODO_rebuild_alias;
   return 0;
 }
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index 1477d1f..299d24e 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -353,6 +353,7 @@ extern gimple_opt_pass *make_pass_tree_loop (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_tree_no_loop (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_tree_loop_init (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_lim (gcc::context *ctxt);
+extern gimple_opt_pass *make_pass_lim_aux (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_tree_unswitch (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_predcom (gcc::context *ctxt);
 extern gimple_opt_pass *make_pass_iv_canon (gcc::context *ctxt);
diff --git a/gcc/tree-ssa-loop-im.c b/gcc/tree-ssa-loop-im.c
index 0cbb3ae..fe0eeb7 100644
--- a/gcc/tree-ssa-loop-im.c
+++ b/gcc/tree-ssa-loop-im.c
@@ -2509,10 +2509,18 @@ tree_ssa_lim_finalize (void)
    i.e. those that are likely to be win regardless of the register pressure.  */
 
 unsigned int
-tree_ssa_lim (void)
+tree_ssa_lim (bool aux_p)
 {
   unsigned int todo;
 
+  if (aux_p)
+    {
+      loop_optimizer_init (LOOPS_NORMAL
+			   | LOOPS_HAVE_RECORDED_EXITS);
+
+      calculate_dominance_info (CDI_DOMINATORS);
+    }
+
   tree_ssa_lim_initialize ();
 
   /* Gathers information about memory accesses in the loops.  */
@@ -2535,6 +2543,11 @@ tree_ssa_lim (void)
 
   tree_ssa_lim_finalize ();
 
+  if (aux_p)
+    {
+      loop_optimizer_finalize ();
+    }
+
   return todo;
 }
 
@@ -2555,6 +2568,19 @@ const pass_data pass_data_lim =
   0, /* todo_flags_finish */
 };
 
+const pass_data pass_data_lim_aux =
+{
+  GIMPLE_PASS, /* type */
+  "limaux", /* name */
+  OPTGROUP_LOOP, /* optinfo_flags */
+  TV_LIM, /* tv_id */
+  PROP_cfg, /* properties_required */
+  0, /* properties_provided */
+  0, /* properties_destroyed */
+  0, /* todo_flags_start */
+  0, /* todo_flags_finish */
+};
+
 class pass_lim : public gimple_opt_pass
 {
 public:
@@ -2569,18 +2595,47 @@ public:
 
 }; // class pass_lim
 
+class pass_lim_aux : public gimple_opt_pass
+{
+public:
+  pass_lim_aux (gcc::context *ctxt)
+    : gimple_opt_pass (pass_data_lim_aux, ctxt)
+  {}
+
+  /* opt_pass methods: */
+  opt_pass * clone () { return new pass_lim_aux (m_ctxt); }
+  virtual bool gate (function *) { return flag_tree_loop_im != 0; }
+  virtual unsigned int execute (function *);
+
+}; // class pass_lim_aux
+
+unsigned int
+pass_lim_aux::execute (function *fun)
+{
+  if (number_of_loops (fun) <= 1)
+    return 0;
+
+  return tree_ssa_lim (1);
+}
+
 unsigned int
 pass_lim::execute (function *fun)
 {
   if (number_of_loops (fun) <= 1)
     return 0;
 
-  return tree_ssa_lim ();
+  return tree_ssa_lim (0);
 }
 
 } // anon namespace
 
 gimple_opt_pass *
+make_pass_lim_aux (gcc::context *ctxt)
+{
+  return new pass_lim_aux (ctxt);
+}
+
+gimple_opt_pass *
 make_pass_lim (gcc::context *ctxt)
 {
   return new pass_lim (ctxt);

Reply via email to