On 08/11/2016 03:38 PM, Cesar Philippidis wrote: > This patch ports libgomp.oacc-c-c++-common/host_data-1.c to fortran. > Fortunately, the existing fortran host_data infrastructure was already > in place, so I had to do was port over the calls to Nvidia's CUDA BLAS > library. > > There are a couple of details that one needs to consider when using CUDA > BLAS in gfortran. First, if you want to use Nvidia's wrapper functions > written in C to set up the appropriate cuda device contexts, then use > the thunking variants of the functions described here > <http://docs.nvidia.com/cuda/cublas/#appendix-b-cublas-fortran-bindings>. > Otherwise, it's much easier to let gfortran's OpenACC runtime manage the > data mappings and use the host_data clause to pass those data pointers > to the CUDA BLAS library calls. > > In terms of calling the actual CUDA BLAS functions, there's already good > documentation for that here > <https://gcc.gnu.org/onlinedocs/gfortran/Interoperability-with-C.html>. > Basically, those library calls need a function interface with a special > C binding. The function I tested in host_data-2.f90 is cublasSaxpy. > Other function interfaces will need to be created as necessary. > > I've applied this patch to gomp-4_0-branch.
I've added some additional test coverage in this patch. Specifically, I've included both a module and fixed-mode test. I also corrected some problems when host_data-2.f90 and host_data-1.c are built with -Wall -Wextra. This patch has been applied to gomp-4_0-branch. Cesar
2016-08-12 Cesar Philippidis <ce...@codesourcery.com> libgomp/ * testsuite/libgomp.oacc-c-c++-common/host_data-1.c: Increase test coverage. Build with -Wall -Wextra. * testsuite/libgomp.oacc-fortran/host_data-2.f90: Likewise. * testsuite/libgomp.oacc-fortran/cublas-fixed.h: New test. * testsuite/libgomp.oacc-fortran/host_data-3.f: New test. * testsuite/libgomp.oacc-fortran/host_data-4.f90: New test. diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-1.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-1.c index d19aa20..fe843ec 100644 --- a/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-1.c +++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/host_data-1.c @@ -1,14 +1,16 @@ /* { dg-do run { target openacc_nvidia_accel_selected } } */ -/* { dg-additional-options "-lcuda -lcublas -lcudart" } */ +/* { dg-additional-options "-lm -lcuda -lcublas -lcudart -Wall -Wextra" } */ #include <stdlib.h> +#include <math.h> #include <openacc.h> #include <cuda.h> #include <cuda_runtime_api.h> #include <cublas_v2.h> +#pragma acc routine void -saxpy_host (int n, float a, float *x, float *y) +saxpy (int n, float a, float *x, float *y) { int i; @@ -16,18 +18,18 @@ saxpy_host (int n, float a, float *x, float *y) y[i] = y[i] + a * x[i]; } -#pragma acc routine void -saxpy_target (int n, float a, float *x, float *y) +validate_results (int n, float *a, float *b) { int i; for (i = 0; i < n; i++) - y[i] = y[i] + a * x[i]; + if (fabs (a[i] - b[i]) > .00001) + abort (); } int -main(int argc, char **argv) +main() { /* We're using cuBLAS, so excpect to be using a Nvidia GPU. */ acc_init (acc_device_nvidia); @@ -45,7 +47,7 @@ main(int argc, char **argv) y[i] = y_ref[i] = 3.0; } - saxpy_host (N, a, x_ref, y_ref); + saxpy (N, a, x_ref, y_ref); cublasCreate (&h); @@ -57,11 +59,7 @@ main(int argc, char **argv) } } - for (i = 0; i < N; i++) - { - if (y[i] != y_ref[i]) - abort (); - } + validate_results (N, y, y_ref); #pragma acc data create (x[0:N]) copyout (y[0:N]) { @@ -77,11 +75,7 @@ main(int argc, char **argv) cublasDestroy (h); - for (i = 0; i < N; i++) - { - if (y[i] != y_ref[i]) - abort (); - } + validate_results (N, y, y_ref); for (i = 0; i < N; i++) y[i] = 3.0; @@ -90,14 +84,24 @@ main(int argc, char **argv) #pragma acc data copyin (x[0:N]) copyin (a) copy (y[0:N]) { #pragma acc parallel present (x[0:N]) pcopy (y[0:N]) present (a) - saxpy_target (N, a, x, y); + saxpy (N, a, x, y); } + validate_results (N, y, y_ref); + + /* Exercise host_data with data transferred with acc enter data. */ + for (i = 0; i < N; i++) - { - if (y[i] != y_ref[i]) - abort (); - } + y[i] = 3.0; + +#pragma acc enter data copyin (x, a, y) +#pragma acc parallel present (x[0:N]) pcopy (y[0:N]) present (a) + { + saxpy (N, a, x, y); + } +#pragma acc exit data delete (x, a) copyout (y) + + validate_results (N, y, y_ref); return 0; } diff --git a/libgomp/testsuite/libgomp.oacc-fortran/cublas-fixed.h b/libgomp/testsuite/libgomp.oacc-fortran/cublas-fixed.h new file mode 100644 index 0000000..4a5f61a --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-fortran/cublas-fixed.h @@ -0,0 +1,16 @@ +! CUDA BLAS interface binding for SAXPY. + + use iso_c_binding + interface + subroutine cublassaxpy(N, alpha, x, incx, y, incy) + 1 bind(c, name="cublasSaxpy") + use iso_c_binding + integer(kind=c_int), value :: N + real(kind=c_float), value :: alpha + type(*), dimension(*) :: x + integer(kind=c_int), value :: incx + type(*), dimension(*) :: y + integer(kind=c_int), value :: incy + end subroutine cublassaxpy + end interface + diff --git a/libgomp/testsuite/libgomp.oacc-fortran/host_data-2.f90 b/libgomp/testsuite/libgomp.oacc-fortran/host_data-2.f90 index 68e14e3..ff09218 100644 --- a/libgomp/testsuite/libgomp.oacc-fortran/host_data-2.f90 +++ b/libgomp/testsuite/libgomp.oacc-fortran/host_data-2.f90 @@ -1,7 +1,8 @@ -! Test host_data interoperability with CUDA blas. +! Test host_data interoperability with CUDA blas. This test was +! derived from libgomp.oacc-c-c++-common/host_data-1.c. ! { dg-do run { target openacc_nvidia_accel_selected } } -! { dg-additional-options "-lcublas" } +! { dg-additional-options "-lcublas -Wall -Wextra" } program test implicit none @@ -14,7 +15,7 @@ program test subroutine cublassaxpy(N, alpha, x, incx, y, incy) bind(c, name="cublasSaxpy") use iso_c_binding integer(kind=c_int), value :: N - real*4, value :: alpha + real(kind=c_float), value :: alpha type(*), dimension(*) :: x integer(kind=c_int), value :: incx type(*), dimension(*) :: y @@ -32,16 +33,14 @@ program test end do call saxpy (N, a, x_ref, y_ref) - + !$acc data copyin (x) copy (y) !$acc host_data use_device (x, y) call cublassaxpy(N, a, x, 1, y, 1) !$acc end host_data !$acc end data - - do i = 1, N - if (y(i) .ne. y_ref(i)) call abort - end do + + call validate_results (N, y, y_ref) !$acc data create (x) copyout (y) !$acc parallel loop @@ -55,31 +54,45 @@ program test !$acc end host_data !$acc end data - do i = 1, N - if (y(i) .ne. y_ref(i)) call abort - end do + call validate_results (N, y, y_ref) y(:) = 3.0 - + !$acc data copyin (x) copyin (a) copy (y) !$acc parallel present (x) pcopy (y) present (a) call saxpy (N, a, x, y) !$acc end parallel !$acc end data - do i = 1, N - if (y(i) .ne. y_ref(i)) call abort - end do + call validate_results (N, y, y_ref) + + y(:) = 3.0 + + !$acc enter data copyin (x, a, y) + !$acc parallel present (x) pcopy (y) present (a) + call saxpy (N, a, x, y) + !$acc end parallel + !$acc exit data delete (x, a) copyout (y) + + call validate_results (N, y, y_ref) end program test subroutine saxpy (nn, aa, xx, yy) integer :: nn real*4 :: aa, xx(nn), yy(nn) integer i - real*4 :: t !$acc routine do i = 1, nn yy(i) = yy(i) + aa * xx(i) end do end subroutine saxpy + +subroutine validate_results (n, a, b) + integer :: n + real*4 :: a(n), b(n) + + do i = 1, N + if (abs(a(i) - b(i)) > 0.0001) call abort + end do +end subroutine validate_results diff --git a/libgomp/testsuite/libgomp.oacc-fortran/host_data-3.f b/libgomp/testsuite/libgomp.oacc-fortran/host_data-3.f new file mode 100644 index 0000000..05ed949 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-fortran/host_data-3.f @@ -0,0 +1,85 @@ +! Fixed-mode host_data interaction with CUDA BLAS. + +! { dg-do run { target openacc_nvidia_accel_selected } } +! { dg-additional-options "-lcublas -Wall -Wextra" } + + include "cublas-fixed.h" + + integer, parameter :: N = 10 + integer :: i + real*4 :: x_ref(N), y_ref(N), x(N), y(N), a + + a = 2.0 + + do i = 1, N + x(i) = 4.0 * i + y(i) = 3.0 + x_ref(i) = x(i) + y_ref(i) = y(i) + end do + + call saxpy (N, a, x_ref, y_ref) + +!$acc data copyin (x) copy (y) +!$acc host_data use_device (x, y) + call cublassaxpy(N, a, x, 1, y, 1) +!$acc end host_data +!$acc end data + + call validate_results (N, y, y_ref) + +!$acc data create (x) copyout (y) +!$acc parallel loop + do i = 1, N + y(i) = 3.0 + end do +!$acc end parallel loop + +!$acc host_data use_device (x, y) + call cublassaxpy(N, a, x, 1, y, 1) +!$acc end host_data +!$acc end data + + call validate_results (N, y, y_ref) + + y(:) = 3.0 + +!$acc data copyin (x) copyin (a) copy (y) +!$acc parallel present (x) pcopy (y) present (a) + call saxpy (N, a, x, y) +!$acc end parallel +!$acc end data + + call validate_results (N, y, y_ref) + + y(:) = 3.0 + +!$acc enter data copyin (x, a, y) +!$acc parallel present (x) pcopy (y) present (a) + call saxpy (N, a, x, y) +!$acc end parallel +!$acc exit data delete (x, a) copyout (y) + + call validate_results (N, y, y_ref) + end + + subroutine saxpy (nn, aa, xx, yy) + integer :: nn + real*4 :: aa, xx(nn), yy(nn) + integer i +!$acc routine + + do i = 1, nn + yy(i) = yy(i) + aa * xx(i) + end do + end subroutine saxpy + + subroutine validate_results (n, a, b) + integer :: n + real*4 :: a(n), b(n) + + do i = 1, N + if (abs(a(i) - b(i)) > 0.0001) call abort + end do + end subroutine validate_results + diff --git a/libgomp/testsuite/libgomp.oacc-fortran/host_data-4.f90 b/libgomp/testsuite/libgomp.oacc-fortran/host_data-4.f90 new file mode 100644 index 0000000..6e379b5 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-fortran/host_data-4.f90 @@ -0,0 +1,101 @@ +! Test host_data interoperability with CUDA blas using modules. + +! { dg-do run { target openacc_nvidia_accel_selected } } +! { dg-additional-options "-lcublas -Wall -Wextra" } + +module cublas + interface + subroutine cublassaxpy(N, alpha, x, incx, y, incy) bind(c, name="cublasSaxpy") + use iso_c_binding + integer(kind=c_int), value :: N + real(kind=c_float), value :: alpha + type(*), dimension(*) :: x + integer(kind=c_int), value :: incx + type(*), dimension(*) :: y + integer(kind=c_int), value :: incy + end subroutine cublassaxpy + end interface + +contains + subroutine saxpy (nn, aa, xx, yy) + integer :: nn + real*4 :: aa, xx(nn), yy(nn) + integer i + !$acc routine + + do i = 1, nn + yy(i) = yy(i) + aa * xx(i) + end do + end subroutine saxpy + + subroutine validate_results (n, a, b) + integer :: n + real*4 :: a(n), b(n) + + do i = 1, N + if (abs(a(i) - b(i)) > 0.0001) call abort + end do + end subroutine validate_results +end module cublas + +program test + use cublas + implicit none + + integer, parameter :: N = 10 + integer :: i + real*4 :: x_ref(N), y_ref(N), x(N), y(N), a + + a = 2.0 + + do i = 1, N + x(i) = 4.0 * i + y(i) = 3.0 + x_ref(i) = x(i) + y_ref(i) = y(i) + end do + + call saxpy (N, a, x_ref, y_ref) + + !$acc data copyin (x) copy (y) + !$acc host_data use_device (x, y) + call cublassaxpy(N, a, x, 1, y, 1) + !$acc end host_data + !$acc end data + + call validate_results (N, y, y_ref) + + !$acc data create (x) copyout (y) + !$acc parallel loop + do i = 1, N + y(i) = 3.0 + end do + !$acc end parallel loop + + !$acc host_data use_device (x, y) + call cublassaxpy(N, a, x, 1, y, 1) + !$acc end host_data + !$acc end data + + call validate_results (N, y, y_ref) + + y(:) = 3.0 + + !$acc data copyin (x) copyin (a) copy (y) + !$acc parallel present (x) pcopy (y) present (a) + call saxpy (N, a, x, y) + !$acc end parallel + !$acc end data + + call validate_results (N, y, y_ref) + + y(:) = 3.0 + + !$acc enter data copyin (x, a, y) + !$acc parallel present (x) pcopy (y) present (a) + call saxpy (N, a, x, y) + !$acc end parallel + !$acc exit data delete (x, a) copyout (y) + + call validate_results (N, y, y_ref) +end program test