Hi Thomas, These are the testsuite changes. After the last review round, the amount of additional changes have been greatly reduced.
Thanks, Chung-Lin * testsuite/libgomp.oacc-c-c++-common/asyncwait-2.c: New testcase. * testsuite/libgomp.oacc-c-c++-common/lib-81.c: Adjust testcase. * testsuite/libgomp.oacc-fortran/lib-12.f90: Likewise.
Index: libgomp/testsuite/libgomp.oacc-c-c++-common/asyncwait-2.c =================================================================== --- libgomp/testsuite/libgomp.oacc-c-c++-common/asyncwait-2.c (nonexistent) +++ libgomp/testsuite/libgomp.oacc-c-c++-common/asyncwait-2.c (working copy) @@ -0,0 +1,904 @@ +/* { dg-do run { target openacc_nvidia_accel_selected } } */ +/* { dg-additional-options "-lcuda" } */ + +#include <openacc.h> +#include <stdlib.h> +#include <cuda.h> + +#include <stdio.h> +#include <time.h> +#include <sys/time.h> + +int +main (int argc, char **argv) +{ + CUresult r; + CUstream stream1; + int N = 128; //1024 * 1024; + float *a, *b, *c, *d, *e; + int i; + int nbytes; + + srand (time (NULL)); + int s = rand () % 100; + + acc_init (acc_device_nvidia); + + nbytes = N * sizeof (float); + + a = (float *) malloc (nbytes); + b = (float *) malloc (nbytes); + c = (float *) malloc (nbytes); + d = (float *) malloc (nbytes); + e = (float *) malloc (nbytes); + + for (i = 0; i < N; i++) + { + a[i] = 3.0; + b[i] = 0.0; + } + + acc_set_default_async (s); + +#pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N) + { + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = a[ii]; + } + +#pragma acc wait + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 3.0) + abort (); + + if (b[i] != 3.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 2.0; + b[i] = 0.0; + } + +#pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N) + { + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = a[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 2.0) + abort (); + + if (b[i] != 2.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 3.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + } + +#pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N) + { + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 3.0) + abort (); + + if (b[i] != 9.0) + abort (); + + if (c[i] != 4.0) + abort (); + + if (d[i] != 1.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 2.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + e[i] = 0.0; + } + +#pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N) + { + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii]; + } + +#pragma acc parallel wait (s) async (s) + { + int ii; + + for (ii = 0; ii < N; ii++) + e[ii] = a[ii] + b[ii] + c[ii] + d[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 2.0) + abort (); + + if (b[i] != 4.0) + abort (); + + if (c[i] != 4.0) + abort (); + + if (d[i] != 1.0) + abort (); + + if (e[i] != 11.0) + abort (); + } + + + r = cuStreamCreate (&stream1, CU_STREAM_NON_BLOCKING); + if (r != CUDA_SUCCESS) + { + fprintf (stderr, "cuStreamCreate failed: %d\n", r); + abort (); + } + + acc_set_cuda_stream (1, stream1); + + for (i = 0; i < N; i++) + { + a[i] = 5.0; + b[i] = 0.0; + } + +#pragma acc data copy (a[0:N], b[0:N]) copyin (N) + { + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = a[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 5.0) + abort (); + + if (b[i] != 5.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 7.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + } + +#pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N) + { + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 7.0) + abort (); + + if (b[i] != 49.0) + abort (); + + if (c[i] != 4.0) + abort (); + + if (d[i] != 1.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 3.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + e[i] = 0.0; + } + +#pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N) + { + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii]; + } + +#pragma acc parallel wait (s) async (s) + { + int ii; + + for (ii = 0; ii < N; ii++) + e[ii] = a[ii] + b[ii] + c[ii] + d[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 3.0) + abort (); + + if (b[i] != 9.0) + abort (); + + if (c[i] != 4.0) + abort (); + + if (d[i] != 1.0) + abort (); + + if (e[i] != 17.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 4.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + e[i] = 0.0; + } + +#pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N) + { + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + +#pragma acc update host (a[0:N], b[0:N], c[0:N]) wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 4.0) + abort (); + + if (b[i] != 16.0) + abort (); + + if (c[i] != 4.0) + abort (); + } + + + for (i = 0; i < N; i++) + { + a[i] = 5.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + e[i] = 0.0; + } + +#pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N) + { + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc parallel async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + +#pragma acc update host (a[0:N], b[0:N], c[0:N]) async + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 5.0) + abort (); + + if (b[i] != 25.0) + abort (); + + if (c[i] != 4.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 3.0; + b[i] = 0.0; + } + +#pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N) + { + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = a[ii]; + } + +#pragma acc wait + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 3.0) + abort (); + + if (b[i] != 3.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 2.0; + b[i] = 0.0; + } + +#pragma acc data copy (a[0:N]) copy (b[0:N]) copyin (N) + { + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = a[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 2.0) + abort (); + + if (b[i] != 2.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 3.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + } + +#pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N) + { + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 3.0) + abort (); + + if (b[i] != 9.0) + abort (); + + if (c[i] != 4.0) + abort (); + + if (d[i] != 1.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 2.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + e[i] = 0.0; + } + +#pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N) + { + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii]; + } + +#pragma acc kernels wait (s) async (s) + { + int ii; + + for (ii = 0; ii < N; ii++) + e[ii] = a[ii] + b[ii] + c[ii] + d[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 2.0) + abort (); + + if (b[i] != 4.0) + abort (); + + if (c[i] != 4.0) + abort (); + + if (d[i] != 1.0) + abort (); + + if (e[i] != 11.0) + abort (); + } + + + r = cuStreamCreate (&stream1, CU_STREAM_NON_BLOCKING); + if (r != CUDA_SUCCESS) + { + fprintf (stderr, "cuStreamCreate failed: %d\n", r); + abort (); + } + + acc_set_cuda_stream (1, stream1); + + for (i = 0; i < N; i++) + { + a[i] = 5.0; + b[i] = 0.0; + } + +#pragma acc data copy (a[0:N], b[0:N]) copyin (N) + { + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = a[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 5.0) + abort (); + + if (b[i] != 5.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 7.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + } + +#pragma acc data copy (a[0:N]) copy (b[0:N]) copy (c[0:N]) copy (d[0:N]) copyin (N) + { + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 7.0) + abort (); + + if (b[i] != 49.0) + abort (); + + if (c[i] != 4.0) + abort (); + + if (d[i] != 1.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 3.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + e[i] = 0.0; + } + +#pragma acc data copy (a[0:N], b[0:N], c[0:N], d[0:N], e[0:N]) copyin (N) + { + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + d[ii] = ((a[ii] * a[ii] + a[ii]) / a[ii]) - a[ii]; + } + +#pragma acc kernels wait (s) async (s) + { + int ii; + + for (ii = 0; ii < N; ii++) + e[ii] = a[ii] + b[ii] + c[ii] + d[ii]; + } + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 3.0) + abort (); + + if (b[i] != 9.0) + abort (); + + if (c[i] != 4.0) + abort (); + + if (d[i] != 1.0) + abort (); + + if (e[i] != 17.0) + abort (); + } + + for (i = 0; i < N; i++) + { + a[i] = 4.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + e[i] = 0.0; + } + +#pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N) + { + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + +#pragma acc update host (a[0:N], b[0:N], c[0:N]) wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 4.0) + abort (); + + if (b[i] != 16.0) + abort (); + + if (c[i] != 4.0) + abort (); + } + + + for (i = 0; i < N; i++) + { + a[i] = 5.0; + b[i] = 0.0; + c[i] = 0.0; + d[i] = 0.0; + e[i] = 0.0; + } + +#pragma acc data copyin (a[0:N], b[0:N], c[0:N]) copyin (N) + { + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + b[ii] = (a[ii] * a[ii] * a[ii]) / a[ii]; + } + +#pragma acc kernels async + { + int ii; + + for (ii = 0; ii < N; ii++) + c[ii] = (a[ii] + a[ii] + a[ii] + a[ii]) / a[ii]; + } + +#pragma acc update host (a[0:N], b[0:N], c[0:N]) async + +#pragma acc wait (s) + + } + + for (i = 0; i < N; i++) + { + if (a[i] != 5.0) + abort (); + + if (b[i] != 25.0) + abort (); + + if (c[i] != 4.0) + abort (); + } + + acc_shutdown (acc_device_nvidia); + + return 0; +} Index: libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c =================================================================== --- libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c (revision 268142) +++ libgomp/testsuite/libgomp.oacc-c-c++-common/lib-81.c (working copy) @@ -133,7 +133,7 @@ main (int argc, char **argv) for (i = 0; i <= N; i++) { - if (acc_async_test (i) != 0) + if (acc_async_test (i) == 0) abort (); } Index: libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90 =================================================================== --- libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90 (revision 268142) +++ libgomp/testsuite/libgomp.oacc-fortran/lib-12.f90 (working copy) @@ -1,4 +1,5 @@ ! { dg-do run } +! { dg-xfail-run-if "n/a" { openacc_host_selected } } program main use openacc