[patch,openacc] Generate sequential loop for OpenACC loop directive inside kernels

Cesar Philippidis Thu, 20 Sep 2018 10:15:00 -0700

As Chung-Lin noted here
<https://gcc.gnu.org/ml/gcc-patches/2015-06/msg01079.html>:


  This patch adjusts omp-low.c:expand_omp_for_generic() to expand to a
  "sequential" loop form (without the OMP runtime calls), used for loop
  directives inside OpenACC kernels constructs. Tom mentions that this
  allows the kernels parallelization to work when '#pragma acc loop'
  makes the front-ends create OMP_FOR, which the loop analysis phases
  don't understand.

I bootstrapped and regtested it on x86_64 Linux with nvptx offloading.
Is this patch OK for trunk?

Thanks,
Cesar

[OpenACC] Generate sequential loop for OpenACC loop directive inside kernels

2018-XX-YY  Chung-Lin Tang <clt...@codesourcery.com>
	    Cesar Philippidis  <ce...@codesourcery.com>

	gcc/
	* omp-expand.c (struct omp_region): Add inside_kernels_p field.
	(expand_omp_for_generic): Adjust to generate a 'sequential' loop
	when GOMP builtin arguments are BUILT_IN_NONE.
	(expand_omp_for): Use expand_omp_for_generic to generate a
	non-parallelized loop for OMP_FORs inside OpenACC kernels regions.
	(expand_omp): Mark inside_kernels_p field true for regions
	nested inside OpenACC kernels constructs.
	gcc/testsuite/
	* c-c++-common/goacc/kernels-loop-acc-loop.c: New test.
	* c-c++-common/goacc/kernels-loop-2-acc-loop.c: New test.
	* c-c++-common/goacc/kernels-loop-3-acc-loop.c: New test.
	* c-c++-common/goacc/kernels-loop-n-acc-loop.c: New test.
	* c-c++-common/goacc/kernels-acc-loop-reduction.c: New test.
	* c-c++-common/goacc/kernels-acc-loop-smaller-equal.c: New test.

(cherry picked from gomp-4_0-branch r224505, r224837, r228232, r228233,
r231461, and r247958)
---
 gcc/omp-expand.c                              | 136 ++++++++++++------
 .../goacc/kernels-acc-loop-reduction.c        |  23 +++
 .../goacc/kernels-acc-loop-smaller-equal.c    |  23 +++
 .../goacc/kernels-loop-2-acc-loop.c           |  18 +++
 .../goacc/kernels-loop-3-acc-loop.c           |  15 ++
 .../goacc/kernels-loop-acc-loop.c             |  15 ++
 .../goacc/kernels-loop-n-acc-loop.c           |  15 ++
 7 files changed, 204 insertions(+), 41 deletions(-)
 create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c
 create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c
 create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c
 create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c
 create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c
 create mode 100644 gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c

diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c
index d2a77c067c6..9b03f62e065 100644
--- a/gcc/omp-expand.c
+++ b/gcc/omp-expand.c
@@ -104,6 +104,9 @@ struct omp_region
   /* The ordered stmt if type is GIMPLE_OMP_ORDERED and it has
      a depend clause.  */
   gomp_ordered *ord_stmt;
+
+  /* True if this is nested inside an OpenACC kernels construct.  */
+  bool inside_kernels_p;
 };
 
 static struct omp_region *root_omp_region;
@@ -2509,6 +2512,7 @@ expand_omp_for_generic (struct omp_region *region,
   gassign *assign_stmt;
   bool in_combined_parallel = is_combined_parallel (region);
   bool broken_loop = region->cont == NULL;
+  bool seq_loop = (start_fn == BUILT_IN_NONE || next_fn == BUILT_IN_NONE);
   edge e, ne;
   tree *counts = NULL;
   int i;
@@ -2606,8 +2610,12 @@ expand_omp_for_generic (struct omp_region *region,
   type = TREE_TYPE (fd->loop.v);
   istart0 = create_tmp_var (fd->iter_type, ".istart0");
   iend0 = create_tmp_var (fd->iter_type, ".iend0");
-  TREE_ADDRESSABLE (istart0) = 1;
-  TREE_ADDRESSABLE (iend0) = 1;
+
+  if (!seq_loop)
+    {
+      TREE_ADDRESSABLE (istart0) = 1;
+      TREE_ADDRESSABLE (iend0) = 1;
+    }
 
   /* See if we need to bias by LLONG_MIN.  */
   if (fd->iter_type == long_long_unsigned_type_node
@@ -2637,7 +2645,25 @@ expand_omp_for_generic (struct omp_region *region,
   gsi_prev (&gsif);
 
   tree arr = NULL_TREE;
-  if (in_combined_parallel)
+  if (seq_loop)
+    {
+      tree n1 = fold_convert (fd->iter_type, fd->loop.n1);
+      tree n2 = fold_convert (fd->iter_type, fd->loop.n2);
+
+      n1 = force_gimple_operand_gsi_1 (&gsi, n1, is_gimple_reg, NULL_TREE, true,
+				       GSI_SAME_STMT);
+      n2 = force_gimple_operand_gsi_1 (&gsi, n2, is_gimple_reg, NULL_TREE, true,
+				       GSI_SAME_STMT);
+
+      assign_stmt = gimple_build_assign (istart0, n1);
+      gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
+
+      assign_stmt = gimple_build_assign (iend0, n2);
+      gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
+
+      t = fold_build2 (NE_EXPR, boolean_type_node, istart0, iend0);
+    }
+  else if (in_combined_parallel)
     {
       gcc_assert (fd->ordered == 0);
       /* In a combined parallel loop, emit a call to
@@ -3059,39 +3085,45 @@ expand_omp_for_generic (struct omp_region *region,
 	collapse_bb = extract_omp_for_update_vars (fd, cont_bb, l1_bb);
 
       /* Emit code to get the next parallel iteration in L2_BB.  */
-      gsi = gsi_start_bb (l2_bb);
+      if (!seq_loop)
+	{
+	  gsi = gsi_start_bb (l2_bb);
 
-      t = build_call_expr (builtin_decl_explicit (next_fn), 2,
-			   build_fold_addr_expr (istart0),
-			   build_fold_addr_expr (iend0));
-      t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
-				    false, GSI_CONTINUE_LINKING);
-      if (TREE_TYPE (t) != boolean_type_node)
-	t = fold_build2 (NE_EXPR, boolean_type_node,
-			 t, build_int_cst (TREE_TYPE (t), 0));
-      gcond *cond_stmt = gimple_build_cond_empty (t);
-      gsi_insert_after (&gsi, cond_stmt, GSI_CONTINUE_LINKING);
+	  t = build_call_expr (builtin_decl_explicit (next_fn), 2,
+			       build_fold_addr_expr (istart0),
+			       build_fold_addr_expr (iend0));
+	  t = force_gimple_operand_gsi (&gsi, t, true, NULL_TREE,
+					false, GSI_CONTINUE_LINKING);
+	  if (TREE_TYPE (t) != boolean_type_node)
+	    t = fold_build2 (NE_EXPR, boolean_type_node,
+			     t, build_int_cst (TREE_TYPE (t), 0));
+	  gcond *cond_stmt = gimple_build_cond_empty (t);
+	  gsi_insert_after (&gsi, cond_stmt, GSI_CONTINUE_LINKING);
+	}
     }
 
   /* Add the loop cleanup function.  */
   gsi = gsi_last_nondebug_bb (exit_bb);
-  if (gimple_omp_return_nowait_p (gsi_stmt (gsi)))
-    t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_NOWAIT);
-  else if (gimple_omp_return_lhs (gsi_stmt (gsi)))
-    t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_CANCEL);
-  else
-    t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END);
-  gcall *call_stmt = gimple_build_call (t, 0);
-  if (gimple_omp_return_lhs (gsi_stmt (gsi)))
-    gimple_call_set_lhs (call_stmt, gimple_omp_return_lhs (gsi_stmt (gsi)));
-  gsi_insert_after (&gsi, call_stmt, GSI_SAME_STMT);
-  if (fd->ordered)
+  if (!seq_loop)
     {
-      tree arr = counts[fd->ordered];
-      tree clobber = build_constructor (TREE_TYPE (arr), NULL);
-      TREE_THIS_VOLATILE (clobber) = 1;
-      gsi_insert_after (&gsi, gimple_build_assign (arr, clobber),
-			GSI_SAME_STMT);
+      if (gimple_omp_return_nowait_p (gsi_stmt (gsi)))
+	t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_NOWAIT);
+      else if (gimple_omp_return_lhs (gsi_stmt (gsi)))
+	t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END_CANCEL);
+      else
+	t = builtin_decl_explicit (BUILT_IN_GOMP_LOOP_END);
+      gcall *call_stmt = gimple_build_call (t, 0);
+      if (gimple_omp_return_lhs (gsi_stmt (gsi)))
+	gimple_call_set_lhs (call_stmt, gimple_omp_return_lhs (gsi_stmt (gsi)));
+      gsi_insert_after (&gsi, call_stmt, GSI_SAME_STMT);
+      if (fd->ordered)
+	{
+	  tree arr = counts[fd->ordered];
+	  tree clobber = build_constructor (TREE_TYPE (arr), NULL);
+	  TREE_THIS_VOLATILE (clobber) = 1;
+	  gsi_insert_after (&gsi, gimple_build_assign (arr, clobber),
+			    GSI_SAME_STMT);
+	}
     }
   gsi_remove (&gsi, true);
 
@@ -3104,7 +3136,8 @@ expand_omp_for_generic (struct omp_region *region,
       gimple_seq phis;
 
       e = find_edge (cont_bb, l3_bb);
-      ne = make_edge (l2_bb, l3_bb, EDGE_FALSE_VALUE);
+      ne = make_edge (l2_bb, l3_bb,
+		      seq_loop ? EDGE_FALLTHRU : EDGE_FALSE_VALUE);
 
       phis = phi_nodes (l3_bb);
       for (gsi = gsi_start (phis); !gsi_end_p (gsi); gsi_next (&gsi))
@@ -3144,7 +3177,8 @@ expand_omp_for_generic (struct omp_region *region,
 	  e = find_edge (cont_bb, l2_bb);
 	  e->flags = EDGE_FALLTHRU;
 	}
-      make_edge (l2_bb, l0_bb, EDGE_TRUE_VALUE);
+      if (!seq_loop)
+	make_edge (l2_bb, l0_bb, EDGE_TRUE_VALUE);
 
       if (gimple_in_ssa_p (cfun))
 	{
@@ -3203,12 +3237,16 @@ expand_omp_for_generic (struct omp_region *region,
 
       add_bb_to_loop (l2_bb, outer_loop);
 
-      /* We've added a new loop around the original loop.  Allocate the
-	 corresponding loop struct.  */
-      struct loop *new_loop = alloc_loop ();
-      new_loop->header = l0_bb;
-      new_loop->latch = l2_bb;
-      add_loop (new_loop, outer_loop);
+      struct loop *new_loop = NULL;
+      if (!seq_loop)
+	{
+	  /* We've added a new loop around the original loop.  Allocate the
+	     corresponding loop struct.  */
+	  new_loop = alloc_loop ();
+	  new_loop->header = l0_bb;
+	  new_loop->latch = l2_bb;
+	  add_loop (new_loop, outer_loop);
+	}
 
       /* Allocate a loop structure for the original loop unless we already
 	 had one.  */
@@ -3218,7 +3256,8 @@ expand_omp_for_generic (struct omp_region *region,
 	  struct loop *orig_loop = alloc_loop ();
 	  orig_loop->header = l1_bb;
 	  /* The loop may have multiple latches.  */
-	  add_loop (orig_loop, new_loop);
+	  add_loop (orig_loop,
+		    new_loop != NULL ? new_loop : outer_loop);
 	}
     }
 }
@@ -5665,7 +5704,10 @@ expand_omp_for (struct omp_region *region, gimple *inner_stmt)
        original loops from being detected.  Fix that up.  */
     loops_state_set (LOOPS_NEED_FIXUP);
 
-  if (gimple_omp_for_kind (fd.for_stmt) & GF_OMP_FOR_SIMD)
+  if (region->inside_kernels_p)
+    expand_omp_for_generic (region, &fd, BUILT_IN_NONE, BUILT_IN_NONE,
+			    inner_stmt);
+  else if (gimple_omp_for_kind (fd.for_stmt) & GF_OMP_FOR_SIMD)
     expand_omp_simd (region, &fd);
   else if (gimple_omp_for_kind (fd.for_stmt) == GF_OMP_FOR_KIND_OACC_LOOP)
     {
@@ -7750,7 +7792,19 @@ expand_omp (struct omp_region *region)
       if (region->type == GIMPLE_OMP_PARALLEL)
 	determine_parallel_type (region);
       else if (region->type == GIMPLE_OMP_TARGET)
-	grid_expand_target_grid_body (region);
+	{
+	  grid_expand_target_grid_body (region);
+
+	  if (region->inner)
+	    {
+	      gomp_target *entry
+		= as_a <gomp_target *> (last_stmt (region->entry));
+	      if (region->inside_kernels_p
+		  || (gimple_omp_target_kind (entry)
+		      == GF_OMP_TARGET_KIND_OACC_KERNELS))
+		region->inner->inside_kernels_p = true;
+	    }
+	}
 
       if (region->type == GIMPLE_OMP_FOR
 	  && gimple_omp_for_combined_p (last_stmt (region->entry)))
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c
new file mode 100644
index 00000000000..4824e530925
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-reduction.c
@@ -0,0 +1,23 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+unsigned int
+foo (int n, unsigned int *a)
+{
+  unsigned int sum = 0;
+
+#pragma acc kernels loop gang reduction(+:sum)
+  for (int i = 0; i < n; i++)
+    sum += a[i];
+
+  return sum;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*\\._omp_fn\\.0" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c
new file mode 100644
index 00000000000..d70afb0e662
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-acc-loop-smaller-equal.c
@@ -0,0 +1,23 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+unsigned int
+foo (int n)
+{
+  unsigned int sum = 1;
+
+  #pragma acc kernels loop
+  for (int i = 1; i <= n; i++)
+    sum += i;
+
+  return sum;
+}
+
+/* Check that only one loop is analyzed, and that it can be parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*\\._omp_fn\\.0" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c
new file mode 100644
index 00000000000..7b127cb6fd9
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-2-acc-loop.c
@@ -0,0 +1,18 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+/* Check that loops with '#pragma acc loop' tagged gets properly parallelized.  */
+#define ACC_LOOP
+#include "kernels-loop-2.c"
+
+/* Check that only three loops are analyzed, and that all can be
+   parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 3 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 3 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.1" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.2" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c
new file mode 100644
index 00000000000..a040e096fc1
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-3-acc-loop.c
@@ -0,0 +1,15 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+/* Check that loops with '#pragma acc loop' tagged gets properly parallelized.  */
+#define ACC_LOOP
+#include "kernels-loop-3.c"
+
+/* Check that only one loop is analyzed, and that it can be parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c
new file mode 100644
index 00000000000..070a5b5bf3d
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-acc-loop.c
@@ -0,0 +1,15 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+/* Check that loops with '#pragma acc loop' tagged gets properly parallelized.  */
+#define ACC_LOOP
+#include "kernels-loop.c"
+
+/* Check that only one loop is analyzed, and that it can be parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*main._omp_fn.0" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c b/gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c
new file mode 100644
index 00000000000..1f25e63fbbb
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-loop-n-acc-loop.c
@@ -0,0 +1,15 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fdump-tree-parloops1-all" } */
+/* { dg-additional-options "-fdump-tree-optimized" } */
+
+/* Check that loops with '#pragma acc loop' tagged gets properly parallelized.  */
+#define ACC_LOOP
+#include "kernels-loop-n.c"
+
+/* Check that only one loop is analyzed, and that it can be parallelized.  */
+/* { dg-final { scan-tree-dump-times "SUCCESS: may be parallelized" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-times "(?n)__attribute__\\(\\(oacc kernels parallelized, oacc function \\(, , \\), oacc kernels, omp target entrypoint\\)\\)" 1 "parloops1" } } */
+/* { dg-final { scan-tree-dump-not "FAILED:" "parloops1" } } */
+
+/* Check that the loop has been split off into a function.  */
+/* { dg-final { scan-tree-dump-times "(?n);; Function .*foo.*._omp_fn.0" 1 "optimized" } } */
-- 
2.17.1

[patch,openacc] Generate sequential loop for OpenACC loop directive inside kernels

Reply via email to