On 2016/5/31 05:51 PM, Chung-Lin Tang wrote:
> On 2016/5/31 3:28 PM, Thomas Schwinge wrote:
>> > Hi!
>> >
>> > On Mon, 30 May 2016 18:53:41 +0200, Jakub Jelinek <[email protected]> wrote:
>>> >> On Mon, May 30, 2016 at 10:38:59PM +0800, Chung-Lin Tang wrote:
>>>> >>> Hi, a previous patch of Cesar's has made the middle-end omp-lowering
>>>> >>> automatically create and insert a tofrom (i.e. present_or_copy) map for
>>>> >>> parallel reductions. This allowed the user to not need explicit
>>>> >>> clauses to copy out the reduction result, but because reduction
>>>> >>> arguments
>>>> >>> are not marked addressable, async does not work as expected,
>>>> >>> i.e. the asynchronous copy-out results are not used in the compiler
>>>> >>> generated code.
>>> >>
>>> >> If you need it only for async parallel/kernels? regions, can't you do
>>> >> that
>>> >> only for those and not for others?
> That is achievable, but not in line with how we currently treat all other
> data clause OMP_CLAUSE_MAPs, which are all marked addressable. Is this special
> case handling really better here?
>
Hi Jakub, here's a version of the patch with the addressable marking restricted
to when there's an async clause. Tests re-ran to ensure no regressions. Please
inform
if this way seems better. Also attached are some new testcases.
Thanks,
Chung-Lin
2016-06-01 Chung-Lin Tang <[email protected]>
c/
* c-typeck.c (c_finish_omp_clauses): Mark OpenACC reduction
arguments as addressable when async clause exists.
cp/
* semantics.c (finish_omp_clauses): Mark OpenACC reduction
arguments as addressable when async clause exists.
fortran/
* trans-openmp.c (gfc_trans_oacc_construct): Mark OpenACC reduction
arguments as addressable. when async clause exists.
(gfc_trans_oacc_combined_directive): Likewise.
libgomp/testsuite/
* libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-8.c: New test.
* libgomp/testsuite/libgomp.oacc-fortran/reduction-8.f90: New test.
Index: c/c-typeck.c
===================================================================
--- c/c-typeck.c (revision 236845)
+++ c/c-typeck.c (working copy)
@@ -12529,6 +12529,7 @@ c_finish_omp_clauses (tree clauses, enum c_omp_reg
tree *nowait_clause = NULL;
bool ordered_seen = false;
tree schedule_clause = NULL_TREE;
+ bool oacc_async = false;
bitmap_obstack_initialize (NULL);
bitmap_initialize (&generic_head, &bitmap_default_obstack);
@@ -12539,6 +12540,14 @@ c_finish_omp_clauses (tree clauses, enum c_omp_reg
bitmap_initialize (&map_field_head, &bitmap_default_obstack);
bitmap_initialize (&oacc_reduction_head, &bitmap_default_obstack);
+ if (ort & C_ORT_ACC)
+ for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+ if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_ASYNC)
+ {
+ oacc_async = true;
+ break;
+ }
+
for (pc = &clauses, c = clauses; c ; c = *pc)
{
bool remove = false;
@@ -12575,6 +12584,8 @@ c_finish_omp_clauses (tree clauses, enum c_omp_reg
remove = true;
break;
}
+ if (oacc_async)
+ c_mark_addressable (t);
type = TREE_TYPE (t);
if (TREE_CODE (t) == MEM_REF)
type = TREE_TYPE (type);
Index: cp/semantics.c
===================================================================
--- cp/semantics.c (revision 236845)
+++ cp/semantics.c (working copy)
@@ -5774,6 +5774,7 @@ finish_omp_clauses (tree clauses, enum c_omp_regio
bool branch_seen = false;
bool copyprivate_seen = false;
bool ordered_seen = false;
+ bool oacc_async = false;
bitmap_obstack_initialize (NULL);
bitmap_initialize (&generic_head, &bitmap_default_obstack);
@@ -5784,6 +5785,14 @@ finish_omp_clauses (tree clauses, enum c_omp_regio
bitmap_initialize (&map_field_head, &bitmap_default_obstack);
bitmap_initialize (&oacc_reduction_head, &bitmap_default_obstack);
+ if (ort & C_ORT_ACC)
+ for (c = clauses; c; c = OMP_CLAUSE_CHAIN (c))
+ if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_ASYNC)
+ {
+ oacc_async = true;
+ break;
+ }
+
for (pc = &clauses, c = clauses; c ; c = *pc)
{
bool remove = false;
@@ -5827,6 +5836,8 @@ finish_omp_clauses (tree clauses, enum c_omp_regio
t = n;
goto check_dup_generic_t;
}
+ if (oacc_async)
+ cxx_mark_addressable (t);
goto check_dup_generic;
case OMP_CLAUSE_COPYPRIVATE:
copyprivate_seen = true;
Index: fortran/trans-openmp.c
===================================================================
--- fortran/trans-openmp.c (revision 236845)
+++ fortran/trans-openmp.c (working copy)
@@ -2704,6 +2704,15 @@ gfc_trans_oacc_construct (gfc_code *code)
gfc_start_block (&block);
oacc_clauses = gfc_trans_omp_clauses (&block, code->ext.omp_clauses,
code->loc);
+ for (tree c = oacc_clauses; c; c = OMP_CLAUSE_CHAIN (c))
+ if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_ASYNC)
+ {
+ for (c = oacc_clauses; c; c = OMP_CLAUSE_CHAIN (c))
+ if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION
+ && DECL_P (OMP_CLAUSE_DECL (c)))
+ TREE_ADDRESSABLE (OMP_CLAUSE_DECL (c)) = 1;
+ break;
+ }
stmt = gfc_trans_omp_code (code->block->next, true);
stmt = build2_loc (input_location, construct_code, void_type_node, stmt,
oacc_clauses);
@@ -3501,6 +3510,15 @@ gfc_trans_oacc_combined_directive (gfc_code *code)
construct_clauses.lists[OMP_LIST_REDUCTION] = NULL;
oacc_clauses = gfc_trans_omp_clauses (&block, &construct_clauses,
code->loc);
+ for (tree c = oacc_clauses; c; c = OMP_CLAUSE_CHAIN (c))
+ if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_ASYNC)
+ {
+ for (c = oacc_clauses; c; c = OMP_CLAUSE_CHAIN (c))
+ if (OMP_CLAUSE_CODE (c) == OMP_CLAUSE_REDUCTION
+ && DECL_P (OMP_CLAUSE_DECL (c)))
+ TREE_ADDRESSABLE (OMP_CLAUSE_DECL (c)) = 1;
+ break;
+ }
}
if (!loop_clauses.seq)
pblock = █
Index: libgomp/testsuite/libgomp.oacc-fortran/reduction-8.f90
===================================================================
--- libgomp/testsuite/libgomp.oacc-fortran/reduction-8.f90 (revision 0)
+++ libgomp/testsuite/libgomp.oacc-fortran/reduction-8.f90 (revision 0)
@@ -0,0 +1,41 @@
+! { dg-do run }
+
+program reduction
+ implicit none
+ integer, parameter :: n = 100
+ integer :: i, h1, h2, s1, s2, a1, a2
+
+ h1 = 0
+ h2 = 0
+ do i = 1, n
+ h1 = h1 + 1
+ h2 = h2 + 2
+ end do
+
+ s1 = 0
+ s2 = 0
+ !$acc parallel loop reduction(+:s1, s2)
+ do i = 1, n
+ s1 = s1 + 1
+ s2 = s2 + 2
+ end do
+ !$acc end parallel loop
+
+ a1 = 0
+ a2 = 0
+ !$acc parallel loop reduction(+:a1, a2) async(1)
+ do i = 1, n
+ a1 = a1 + 1
+ a2 = a2 + 2
+ end do
+ !$acc end parallel loop
+
+ if (h1 .ne. s1) call abort ()
+ if (h2 .ne. s2) call abort ()
+
+ !$acc wait(1)
+
+ if (h1 .ne. a1) call abort ()
+ if (h2 .ne. a2) call abort ()
+
+end program reduction
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-8.c
===================================================================
--- libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-8.c (revision 0)
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/reduction-8.c (revision 0)
@@ -0,0 +1,30 @@
+const int n = 100;
+
+// Check async over parallel construct with reduction
+
+int
+async_sum (int c)
+{
+ int s = 0;
+
+#pragma acc parallel loop num_gangs (10) gang reduction (+:s) async
+ for (int i = 0; i < n; i++)
+ s += i+c;
+
+#pragma acc wait
+ return s;
+}
+
+int
+main()
+{
+ int result = 0;
+
+ for (int i = 0; i < n; i++)
+ result += i+1;
+
+ if (async_sum (1) != result)
+ __builtin_abort ();
+
+ return 0;
+}