Hi!

On Thu, 11 May 2017 14:26:51 +0200, I wrote:
> Building on the other pending patches (I'll soon commit the approved
> ones), we can then support the num_gangs, num_workers, vector_length
> clauses for the OpenACC 2.5 kernels construct.  OK for trunk?

>     OpenACC 2.5 kernels construct: num_gangs, num_workers, vector_length 
> clauses

For now, committed to gomp-4_0-branch in r248031:

commit cc2a61ba48e84268e37c53874cb3eef27f5ede1d
Author: tschwinge <tschwinge@138bc75d-0d04-0410-961f-82ee72b054a4>
Date:   Sun May 14 10:26:07 2017 +0000

    OpenACC 2.5 kernels construct: num_gangs, num_workers, vector_length clauses
    
            gcc/c/
            * c-parser.c (OACC_KERNELS_CLAUSE_MASK)
            (OACC_KERNELS_CLAUSE_DEVICE_TYPE_MASK): Add
            "PRAGMA_OACC_CLAUSE_NUM_GANGS", "PRAGMA_OACC_CLAUSE_NUM_WORKERS",
            "VECTOR_LENGTH".
            gcc/cp/
            * parser.c (OACC_KERNELS_CLAUSE_MASK)
            (OACC_KERNELS_CLAUSE_DEVICE_TYPE_MASK): Add
            "PRAGMA_OACC_CLAUSE_NUM_GANGS", "PRAGMA_OACC_CLAUSE_NUM_WORKERS",
            "VECTOR_LENGTH".
            gcc/fortran/
            * openmp.c (OACC_KERNELS_CLAUSES)
            (OACC_KERNELS_CLAUSE_DEVICE_TYPE_MASK): Add
            "OMP_CLAUSE_NUM_GANGS", "OMP_CLAUSE_NUM_WORKERS",
            "OMP_CLAUSE_VECTOR_LENGTH".
            gcc/
            * omp-low.c (execute_oacc_device_lower): Remove the parallelism
            dimensions function attributes for unparallelized OpenACC kernels
            constructs.
            gcc/testsuite/
            * c-c++-common/goacc/parallel-dims-1.c: Update.
            * c-c++-common/goacc/parallel-dims-2.c: Likewise.
            * c-c++-common/goacc/routine-1.c: Likewise.
            * c-c++-common/goacc/uninit-dim-clause.c: Likewise.
            * g++.dg/goacc/template.C: Likewise.
            * gfortran.dg/goacc/kernels-tree.f95: Likewise.
            * gfortran.dg/goacc/routine-3.f90: Likewise.
            * gfortran.dg/goacc/sie.f95: Likewise.
            * gfortran.dg/goacc/uninit-dim-clause.f95: Likewise.
            libgomp/
            * testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c: New
            file.
            * testsuite/libgomp.oacc-c-c++-common/kernels-loop-2.c: Update.
            * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Likewise.
            * testsuite/libgomp.oacc-fortran/kernels-loop-2.f95: Likewise.
    
    git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/branches/gomp-4_0-branch@248031 
138bc75d-0d04-0410-961f-82ee72b054a4
---
 gcc/ChangeLog.gomp                                 |   6 +
 gcc/c/ChangeLog.gomp                               |   7 +
 gcc/c/c-parser.c                                   |   6 +
 gcc/cp/ChangeLog.gomp                              |   7 +
 gcc/cp/parser.c                                    |   6 +
 gcc/fortran/ChangeLog.gomp                         |   7 +
 gcc/fortran/openmp.c                               |   6 +-
 gcc/omp-low.c                                      |   9 +
 gcc/testsuite/ChangeLog.gomp                       |  12 +
 gcc/testsuite/c-c++-common/goacc/parallel-dims-1.c |   4 +
 gcc/testsuite/c-c++-common/goacc/parallel-dims-2.c | 152 +++++++++++--
 gcc/testsuite/c-c++-common/goacc/routine-1.c       |  13 ++
 .../c-c++-common/goacc/uninit-dim-clause.c         |  17 +-
 gcc/testsuite/g++.dg/goacc/template.C              |   4 +
 gcc/testsuite/gfortran.dg/goacc/kernels-tree.f95   |   6 +-
 gcc/testsuite/gfortran.dg/goacc/routine-3.f90      |   6 +
 gcc/testsuite/gfortran.dg/goacc/sie.f95            |  84 +++++++
 .../gfortran.dg/goacc/uninit-dim-clause.f95        |  18 +-
 libgomp/ChangeLog.gomp                             |   6 +
 .../libgomp.oacc-c-c++-common/acc_prof-kernels-1.c | 244 +++++++++++++++++++++
 .../libgomp.oacc-c-c++-common/kernels-loop-2.c     |  21 +-
 .../libgomp.oacc-c-c++-common/parallel-dims.c      |  35 +++
 .../libgomp.oacc-fortran/kernels-loop-2.f95        |  13 +-
 23 files changed, 661 insertions(+), 28 deletions(-)

diff --git gcc/ChangeLog.gomp gcc/ChangeLog.gomp
index a754647..a4720c3 100644
--- gcc/ChangeLog.gomp
+++ gcc/ChangeLog.gomp
@@ -1,3 +1,9 @@
+2017-05-14  Thomas Schwinge  <tho...@codesourcery.com>
+
+       * omp-low.c (execute_oacc_device_lower): Remove the parallelism
+       dimensions function attributes for unparallelized OpenACC kernels
+       constructs.
+
 2017-05-12  Cesar Philippidis  <ce...@codesourcery.com>
 
        * config/nvptx/nvptx.c (nvptx_goacc_reduction_init): Don't update
diff --git gcc/c/ChangeLog.gomp gcc/c/ChangeLog.gomp
index 3efcc8b..baedcf8 100644
--- gcc/c/ChangeLog.gomp
+++ gcc/c/ChangeLog.gomp
@@ -1,3 +1,10 @@
+2017-05-14  Thomas Schwinge  <tho...@codesourcery.com>
+
+       * c-parser.c (OACC_KERNELS_CLAUSE_MASK)
+       (OACC_KERNELS_CLAUSE_DEVICE_TYPE_MASK): Add
+       "PRAGMA_OACC_CLAUSE_NUM_GANGS", "PRAGMA_OACC_CLAUSE_NUM_WORKERS",
+       "VECTOR_LENGTH".
+
 2017-05-12  Thomas Schwinge  <tho...@codesourcery.com>
 
        * c-parser.c (c_parser_omp_clause_num_gangs)
diff --git gcc/c/c-parser.c gcc/c/c-parser.c
index ef61c5f..afc467d 100644
--- gcc/c/c-parser.c
+++ gcc/c/c-parser.c
@@ -13966,11 +13966,17 @@ c_parser_oacc_loop (location_t loc, c_parser *parser, 
char *p_name,
        | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DEVICE_TYPE)         \
        | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DEVICEPTR)           \
        | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF)                  \
+       | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NUM_GANGS)           \
+       | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NUM_WORKERS)         \
        | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_PRESENT)             \
+       | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_VECTOR_LENGTH)       \
        | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_WAIT) )
 
 #define OACC_KERNELS_CLAUSE_DEVICE_TYPE_MASK                           \
        ( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ASYNC)               \
+       | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NUM_GANGS)           \
+       | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NUM_WORKERS)         \
+       | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_VECTOR_LENGTH)       \
        | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_WAIT) )
 
 #define OACC_PARALLEL_CLAUSE_MASK                                      \
diff --git gcc/cp/ChangeLog.gomp gcc/cp/ChangeLog.gomp
index 470f4e7..d59e856 100644
--- gcc/cp/ChangeLog.gomp
+++ gcc/cp/ChangeLog.gomp
@@ -1,3 +1,10 @@
+2017-05-14  Thomas Schwinge  <tho...@codesourcery.com>
+
+       * parser.c (OACC_KERNELS_CLAUSE_MASK)
+       (OACC_KERNELS_CLAUSE_DEVICE_TYPE_MASK): Add
+       "PRAGMA_OACC_CLAUSE_NUM_GANGS", "PRAGMA_OACC_CLAUSE_NUM_WORKERS",
+       "VECTOR_LENGTH".
+
 2017-05-04  Cesar Philippidis  <ce...@codesourcery.com>
 
        * parser.c (cp_parser_omp_clause_name): Add support for if_present.
diff --git gcc/cp/parser.c gcc/cp/parser.c
index b9c9747..de42cdd 100644
--- gcc/cp/parser.c
+++ gcc/cp/parser.c
@@ -35704,11 +35704,17 @@ cp_parser_oacc_loop (cp_parser *parser, cp_token 
*pragma_tok, char *p_name,
        | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DEVICE_TYPE)         \
        | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_DEVICEPTR)           \
        | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_IF)                  \
+       | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NUM_GANGS)           \
+       | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NUM_WORKERS)         \
        | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_PRESENT)             \
+       | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_VECTOR_LENGTH)       \
        | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_WAIT) )
 
 #define OACC_KERNELS_CLAUSE_DEVICE_TYPE_MASK                           \
        ( (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_ASYNC)               \
+       | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NUM_GANGS)           \
+       | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_NUM_WORKERS)         \
+       | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_VECTOR_LENGTH)       \
        | (OMP_CLAUSE_MASK_1 << PRAGMA_OACC_CLAUSE_WAIT))
 
 #define OACC_PARALLEL_CLAUSE_MASK                                      \
diff --git gcc/fortran/ChangeLog.gomp gcc/fortran/ChangeLog.gomp
index e9d8928..8a6ae6a 100644
--- gcc/fortran/ChangeLog.gomp
+++ gcc/fortran/ChangeLog.gomp
@@ -1,3 +1,10 @@
+2017-05-14  Thomas Schwinge  <tho...@codesourcery.com>
+
+       * openmp.c (OACC_KERNELS_CLAUSES)
+       (OACC_KERNELS_CLAUSE_DEVICE_TYPE_MASK): Add
+       "OMP_CLAUSE_NUM_GANGS", "OMP_CLAUSE_NUM_WORKERS",
+       "OMP_CLAUSE_VECTOR_LENGTH".
+
 2017-05-04  Cesar Philippidis  <ce...@codesourcery.com>
 
        * gfortran.h (gfc_omp_clauses): Add if_present member.
diff --git gcc/fortran/openmp.c gcc/fortran/openmp.c
index 12b2430..c7e78bb 100644
--- gcc/fortran/openmp.c
+++ gcc/fortran/openmp.c
@@ -2049,7 +2049,8 @@ gfc_match_omp_clauses (gfc_omp_clauses **cp, omp_mask 
mask,
    | OMP_CLAUSE_PRIVATE | OMP_CLAUSE_FIRSTPRIVATE | OMP_CLAUSE_DEFAULT        \
    | OMP_CLAUSE_WAIT | OMP_CLAUSE_DEVICE_TYPE)
 #define OACC_KERNELS_CLAUSES \
-  (omp_mask (OMP_CLAUSE_IF) | OMP_CLAUSE_ASYNC | OMP_CLAUSE_DEVICEPTR        \
+  (omp_mask (OMP_CLAUSE_IF) | OMP_CLAUSE_ASYNC | OMP_CLAUSE_NUM_GANGS        \
+   | OMP_CLAUSE_NUM_WORKERS | OMP_CLAUSE_VECTOR_LENGTH | OMP_CLAUSE_DEVICEPTR \
    | OMP_CLAUSE_COPY | OMP_CLAUSE_COPYIN | OMP_CLAUSE_COPYOUT                 \
    | OMP_CLAUSE_CREATE | OMP_CLAUSE_PRESENT | OMP_CLAUSE_DEFAULT              \
    | OMP_CLAUSE_WAIT | OMP_CLAUSE_DEVICE_TYPE)
@@ -2093,7 +2094,8 @@ gfc_match_omp_clauses (gfc_omp_clauses **cp, omp_mask 
mask,
    | OMP_CLAUSE_VECTOR | OMP_CLAUSE_AUTO | OMP_CLAUSE_SEQ | OMP_CLAUSE_TILE   \
    | OMP_CLAUSE_DEVICE_TYPE)
 #define OACC_KERNELS_CLAUSE_DEVICE_TYPE_MASK \
-  (omp_mask (OMP_CLAUSE_ASYNC) | OMP_CLAUSE_WAIT | OMP_CLAUSE_DEVICE_TYPE)
+  (omp_mask (OMP_CLAUSE_ASYNC) | OMP_CLAUSE_NUM_GANGS | OMP_CLAUSE_NUM_WORKERS 
\
+   | OMP_CLAUSE_VECTOR_LENGTH | OMP_CLAUSE_WAIT | OMP_CLAUSE_DEVICE_TYPE)
 #define OACC_PARALLEL_CLAUSE_DEVICE_TYPE_MASK                                \
   (omp_mask (OMP_CLAUSE_ASYNC) | OMP_CLAUSE_NUM_GANGS | OMP_CLAUSE_NUM_WORKERS 
\
    | OMP_CLAUSE_VECTOR_LENGTH | OMP_CLAUSE_WAIT | OMP_CLAUSE_DEVICE_TYPE)
diff --git gcc/omp-low.c gcc/omp-low.c
index 0fbc3ff..ae8b6d9 100644
--- gcc/omp-low.c
+++ gcc/omp-low.c
@@ -21178,6 +21178,15 @@ execute_oacc_device_lower ()
     = (lookup_attribute ("oacc kernels parallelized",
                         DECL_ATTRIBUTES (current_function_decl)) != NULL);
 
+  /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
+     kernels, so remove the parallelism dimensions function attributes
+     potentially set earlier on.  */
+  if (is_oacc_kernels && !is_oacc_kernels_parallelized)
+    {
+      set_oacc_fn_attrib (current_function_decl, NULL, NULL);
+      attrs = get_oacc_fn_attrib (current_function_decl);
+    }
+
   /* Discover, partition and process the loops.  */
   oacc_loop *loops = oacc_loop_discovery ();
   int fn_level = oacc_fn_attrib_level (attrs);
diff --git gcc/testsuite/ChangeLog.gomp gcc/testsuite/ChangeLog.gomp
index c24820d..67f01e8 100644
--- gcc/testsuite/ChangeLog.gomp
+++ gcc/testsuite/ChangeLog.gomp
@@ -1,3 +1,15 @@
+2017-05-14  Thomas Schwinge  <tho...@codesourcery.com>
+
+       * c-c++-common/goacc/parallel-dims-1.c: Update.
+       * c-c++-common/goacc/parallel-dims-2.c: Likewise.
+       * c-c++-common/goacc/routine-1.c: Likewise.
+       * c-c++-common/goacc/uninit-dim-clause.c: Likewise.
+       * g++.dg/goacc/template.C: Likewise.
+       * gfortran.dg/goacc/kernels-tree.f95: Likewise.
+       * gfortran.dg/goacc/routine-3.f90: Likewise.
+       * gfortran.dg/goacc/sie.f95: Likewise.
+       * gfortran.dg/goacc/uninit-dim-clause.f95: Likewise.
+
 2017-05-12  Thomas Schwinge  <tho...@codesourcery.com>
 
        * c-c++-common/goacc/parallel-dims-1.c: New file.
diff --git gcc/testsuite/c-c++-common/goacc/parallel-dims-1.c 
gcc/testsuite/c-c++-common/goacc/parallel-dims-1.c
index 9e4cfaa..6cdbebe 100644
--- gcc/testsuite/c-c++-common/goacc/parallel-dims-1.c
+++ gcc/testsuite/c-c++-common/goacc/parallel-dims-1.c
@@ -3,6 +3,10 @@
 
 void f(int i)
 {
+#pragma acc kernels \
+  num_gangs(i) num_workers(i) vector_length(i)
+  ;
+
 #pragma acc parallel /* { dg-bogus "region is (gang|worker|vector) 
partitioned" "" { xfail *-*-* } } */ \
   num_gangs(i) num_workers(i) vector_length(i)
   ;
diff --git gcc/testsuite/c-c++-common/goacc/parallel-dims-2.c 
gcc/testsuite/c-c++-common/goacc/parallel-dims-2.c
index 30a3d17..acfbe7f 100644
--- gcc/testsuite/c-c++-common/goacc/parallel-dims-2.c
+++ gcc/testsuite/c-c++-common/goacc/parallel-dims-2.c
@@ -1,18 +1,15 @@
 /* Invalid use of OpenACC parallelism dimensions clauses: num_gangs,
    num_workers, vector_length.  */
 
-void acc_kernels(int i)
+void f(int i, float f)
 {
-#pragma acc kernels num_gangs(i) /* { dg-error "'num_gangs' is not valid for 
'#pragma acc kernels'" } */
+#pragma acc kernels num_gangs /* { dg-error "expected '\\(' before end of 
line" } */
   ;
-#pragma acc kernels num_workers(i) /* { dg-error "'num_workers' is not valid 
for '#pragma acc kernels'" } */
+#pragma acc kernels num_workers /* { dg-error "expected '\\(' before end of 
line" } */
   ;
-#pragma acc kernels vector_length(i) /* { dg-error "'vector_length' is not 
valid for '#pragma acc kernels'" } */
+#pragma acc kernels vector_length /* { dg-error "expected '\\(' before end of 
line" } */
   ;
-}
 
-void acc_parallel(int i, float f)
-{
 #pragma acc parallel num_gangs /* { dg-error "expected '\\(' before end of 
line" } */
   ;
 #pragma acc parallel num_workers /* { dg-error "expected '\\(' before end of 
line" } */
@@ -20,6 +17,14 @@ void acc_parallel(int i, float f)
 #pragma acc parallel vector_length /* { dg-error "expected '\\(' before end of 
line" } */
   ;
 
+
+#pragma acc kernels num_gangs( /* { dg-error "expected (primary-|)expression 
before end of line" } */
+  ;
+#pragma acc kernels num_workers( /* { dg-error "expected (primary-|)expression 
before end of line" } */
+  ;
+#pragma acc kernels vector_length( /* { dg-error "expected 
(primary-|)expression before end of line" } */
+  ;
+
 #pragma acc parallel num_gangs( /* { dg-error "expected (primary-|)expression 
before end of line" } */
   ;
 #pragma acc parallel num_workers( /* { dg-error "expected 
(primary-|)expression before end of line" } */
@@ -27,6 +32,14 @@ void acc_parallel(int i, float f)
 #pragma acc parallel vector_length( /* { dg-error "expected 
(primary-|)expression before end of line" } */
   ;
 
+
+#pragma acc kernels num_gangs() /* { dg-error "expected (primary-|)expression 
before '\\)' token" } */
+  ;
+#pragma acc kernels num_workers() /* { dg-error "expected 
(primary-|)expression before '\\)' token" } */
+  ;
+#pragma acc kernels vector_length() /* { dg-error "expected 
(primary-|)expression before '\\)' token" } */
+  ;
+
 #pragma acc parallel num_gangs() /* { dg-error "expected (primary-|)expression 
before '\\)' token" } */
   ;
 #pragma acc parallel num_workers() /* { dg-error "expected 
(primary-|)expression before '\\)' token" } */
@@ -34,6 +47,14 @@ void acc_parallel(int i, float f)
 #pragma acc parallel vector_length() /* { dg-error "expected 
(primary-|)expression before '\\)' token" } */
   ;
 
+
+#pragma acc kernels num_gangs(1 /* { dg-error "expected '\\)' before end of 
line" } */
+  ;
+#pragma acc kernels num_workers(1 /* { dg-error "expected '\\)' before end of 
line" } */
+  ;
+#pragma acc kernels vector_length(1 /* { dg-error "expected '\\)' before end 
of line" } */
+  ;
+
 #pragma acc parallel num_gangs(1 /* { dg-error "expected '\\)' before end of 
line" } */
   ;
 #pragma acc parallel num_workers(1 /* { dg-error "expected '\\)' before end of 
line" } */
@@ -41,6 +62,14 @@ void acc_parallel(int i, float f)
 #pragma acc parallel vector_length(1 /* { dg-error "expected '\\)' before end 
of line" } */
   ;
 
+
+#pragma acc kernels num_gangs(i /* { dg-error "expected '\\)' before end of 
line" } */
+  ;
+#pragma acc kernels num_workers(i /* { dg-error "expected '\\)' before end of 
line" } */
+  ;
+#pragma acc kernels vector_length(i /* { dg-error "expected '\\)' before end 
of line" } */
+  ;
+
 #pragma acc parallel num_gangs(i /* { dg-error "expected '\\)' before end of 
line" } */
   ;
 #pragma acc parallel num_workers(i /* { dg-error "expected '\\)' before end of 
line" } */
@@ -48,6 +77,14 @@ void acc_parallel(int i, float f)
 #pragma acc parallel vector_length(i /* { dg-error "expected '\\)' before end 
of line" } */
   ;
 
+
+#pragma acc kernels num_gangs(1 i /* { dg-error "expected '\\)' before 'i'" } 
*/
+  ;
+#pragma acc kernels num_workers(1 i /* { dg-error "expected '\\)' before 'i'" 
} */
+  ;
+#pragma acc kernels vector_length(1 i /* { dg-error "expected '\\)' before 
'i'" } */
+  ;
+
 #pragma acc parallel num_gangs(1 i /* { dg-error "expected '\\)' before 'i'" } 
*/
   ;
 #pragma acc parallel num_workers(1 i /* { dg-error "expected '\\)' before 'i'" 
} */
@@ -55,6 +92,14 @@ void acc_parallel(int i, float f)
 #pragma acc parallel vector_length(1 i /* { dg-error "expected '\\)' before 
'i'" } */
   ;
 
+
+#pragma acc kernels num_gangs(1 i) /* { dg-error "expected '\\)' before 'i'" } 
*/
+  ;
+#pragma acc kernels num_workers(1 i) /* { dg-error "expected '\\)' before 'i'" 
} */
+  ;
+#pragma acc kernels vector_length(1 i) /* { dg-error "expected '\\)' before 
'i'" } */
+  ;
+
 #pragma acc parallel num_gangs(1 i) /* { dg-error "expected '\\)' before 'i'" 
} */
   ;
 #pragma acc parallel num_workers(1 i) /* { dg-error "expected '\\)' before 
'i'" } */
@@ -62,6 +107,17 @@ void acc_parallel(int i, float f)
 #pragma acc parallel vector_length(1 i) /* { dg-error "expected '\\)' before 
'i'" } */
   ;
 
+
+#pragma acc kernels num_gangs(1, i /* { dg-error "expected '\\)' before ',' 
token" "TODO" { xfail c } } */
+  /* { dg-bogus "expected '\\)' before end of line" "TODO" { xfail c } .-1 } */
+  ;
+#pragma acc kernels num_workers(1, i /* { dg-error "expected '\\)' before ',' 
token" "TODO" { xfail c } } */
+  /* { dg-bogus "expected '\\)' before end of line" "TODO" { xfail c } .-1 } */
+  ;
+#pragma acc kernels vector_length(1, i /* { dg-error "expected '\\)' before 
',' token" "TODO" { xfail c } } */
+  /* { dg-bogus "expected '\\)' before end of line" "TODO" { xfail c } .-1 } */
+  ;
+
 #pragma acc parallel num_gangs(1, i /* { dg-error "expected '\\)' before ',' 
token" "TODO" { xfail c } } */
   /* { dg-bogus "expected '\\)' before end of line" "TODO" { xfail c } .-1 } */
   ;
@@ -72,6 +128,14 @@ void acc_parallel(int i, float f)
   /* { dg-bogus "expected '\\)' before end of line" "TODO" { xfail c } .-1 } */
   ;
 
+
+#pragma acc kernels num_gangs(1, i) /* { dg-error "expected '\\)' before ',' 
token" "TODO" { xfail c } } */
+  ;
+#pragma acc kernels num_workers(1, i) /* { dg-error "expected '\\)' before ',' 
token" "TODO" { xfail c } } */
+  ;
+#pragma acc kernels vector_length(1, i) /* { dg-error "expected '\\)' before 
',' token" "TODO" { xfail c } } */
+  ;
+
 #pragma acc parallel num_gangs(1, i) /* { dg-error "expected '\\)' before ',' 
token" "TODO" { xfail c } } */
   ;
 #pragma acc parallel num_workers(1, i) /* { dg-error "expected '\\)' before 
',' token" "TODO" { xfail c } } */
@@ -79,11 +143,27 @@ void acc_parallel(int i, float f)
 #pragma acc parallel vector_length(1, i) /* { dg-error "expected '\\)' before 
',' token" "TODO" { xfail c } } */
   ;
 
-#pragma acc parallel num_gangs(num_gangs) /* { dg-error "'num_gangs' (un|was 
not )declared" } */
+
+#pragma acc kernels num_gangs(num_gangs_k) /* { dg-error "'num_gangs_k' 
(un|was not )declared" } */
   ;
-#pragma acc parallel num_workers(num_workers) /* { dg-error "'num_workers' 
(un|was not )declared" } */
+#pragma acc kernels num_workers(num_workers_k) /* { dg-error "'num_workers_k' 
(un|was not )declared" } */
   ;
-#pragma acc parallel vector_length(vector_length) /* { dg-error 
"'vector_length' (un|was not )declared" } */
+#pragma acc kernels vector_length(vector_length_k) /* { dg-error 
"'vector_length_k' (un|was not )declared" } */
+  ;
+
+#pragma acc parallel num_gangs(num_gangs_p) /* { dg-error "'num_gangs_p' 
(un|was not )declared" } */
+  ;
+#pragma acc parallel num_workers(num_workers_p) /* { dg-error "'num_workers_p' 
(un|was not )declared" } */
+  ;
+#pragma acc parallel vector_length(vector_length_p) /* { dg-error 
"'vector_length_p' (un|was not )declared" } */
+  ;
+
+
+#pragma acc kernels num_gangs(f) /* { dg-error "'num_gangs' expression must be 
integral" } */
+  ;
+#pragma acc kernels num_workers(f) /* { dg-error "'num_workers' expression 
must be integral" } */
+  ;
+#pragma acc kernels vector_length(f) /* { dg-error "'vector_length' expression 
must be integral" } */
   ;
 
 #pragma acc parallel num_gangs(f) /* { dg-error "'num_gangs' expression must 
be integral" } */
@@ -93,6 +173,14 @@ void acc_parallel(int i, float f)
 #pragma acc parallel vector_length(f) /* { dg-error "'vector_length' 
expression must be integral" } */
   ;
 
+
+#pragma acc kernels num_gangs((float) 1) /* { dg-error "'num_gangs' expression 
must be integral" } */
+  ;
+#pragma acc kernels num_workers((float) 1) /* { dg-error "'num_workers' 
expression must be integral" } */
+  ;
+#pragma acc kernels vector_length((float) 1) /* { dg-error "'vector_length' 
expression must be integral" } */
+  ;
+
 #pragma acc parallel num_gangs((float) 1) /* { dg-error "'num_gangs' 
expression must be integral" } */
   ;
 #pragma acc parallel num_workers((float) 1) /* { dg-error "'num_workers' 
expression must be integral" } */
@@ -100,6 +188,14 @@ void acc_parallel(int i, float f)
 #pragma acc parallel vector_length((float) 1) /* { dg-error "'vector_length' 
expression must be integral" } */
   ;
 
+
+#pragma acc kernels num_gangs(0) /* { dg-warning "'num_gangs' value must be 
positive" } */
+  ;
+#pragma acc kernels num_workers(0) /* { dg-warning "'num_workers' value must 
be positive" } */
+  ;
+#pragma acc kernels vector_length(0) /* { dg-warning "'vector_length' value 
must be positive" } */
+  ;
+
 #pragma acc parallel num_gangs(0) /* { dg-warning "'num_gangs' value must be 
positive" } */
   ;
 #pragma acc parallel num_workers(0) /* { dg-warning "'num_workers' value must 
be positive" } */
@@ -107,6 +203,14 @@ void acc_parallel(int i, float f)
 #pragma acc parallel vector_length(0) /* { dg-warning "'vector_length' value 
must be positive" } */
   ;
 
+
+#pragma acc kernels num_gangs((int) -1.2) /* { dg-warning "'num_gangs' value 
must be positive" } */
+  ;
+#pragma acc kernels num_workers((int) -1.2) /* { dg-warning "'num_workers' 
value must be positive" } */
+  ;
+#pragma acc kernels vector_length((int) -1.2) /* { dg-warning "'vector_length' 
value must be positive" } */
+  ;
+
 #pragma acc parallel num_gangs((int) -1.2) /* { dg-warning "'num_gangs' value 
must be positive" } */
   ;
 #pragma acc parallel num_workers((int) -1.2) /* { dg-warning "'num_workers' 
value must be positive" } */
@@ -114,7 +218,8 @@ void acc_parallel(int i, float f)
 #pragma acc parallel vector_length((int) -1.2) /* { dg-warning 
"'vector_length' value must be positive" } */
   ;
 
-#pragma acc parallel \
+
+#pragma acc kernels \
   num_gangs(1) /* { dg-error "too many 'num_gangs' clauses" "" { target c } } 
*/ \
   num_workers(1) /* { dg-error "too many 'num_workers' clauses" "" { target c 
} } */ \
   vector_length(1) /* { dg-error "too many 'vector_length' clauses" "" { 
target c } } */ \
@@ -123,12 +228,31 @@ void acc_parallel(int i, float f)
   num_gangs(1) /* { dg-error "too many 'num_gangs' clauses" "" { target c++ } 
} */
   ;
 
-#pragma acc parallel \
+#pragma acc parallel                                                   \
+  num_gangs(1) /* { dg-error "too many 'num_gangs' clauses" "" { target c } } 
*/ \
+  num_workers(1) /* { dg-error "too many 'num_workers' clauses" "" { target c 
} } */ \
+  vector_length(1) /* { dg-error "too many 'vector_length' clauses" "" { 
target c } } */ \
+  num_workers(1) /* { dg-error "too many 'num_workers' clauses" "" { target 
c++ } } */ \
+  vector_length(1) /* { dg-error "too many 'vector_length' clauses" "" { 
target c++ } } */ \
+  num_gangs(1) /* { dg-error "too many 'num_gangs' clauses" "" { target c++ } 
} */
+  ;
+
+
+#pragma acc kernels \
+  num_gangs(-1) /* { dg-warning "'num_gangs' value must be positive" } */ \
+  num_workers() /* { dg-error "expected (primary-|)expression before '\\)' 
token" } */ \
+  vector_length(abc_k) /* { dg-error "'abc_k' (un|was not )declared" } */ \
+  num_workers(0.5) /* { dg-error "'num_workers' expression must be integral" } 
*/ \
+  vector_length(&f) /* { dg-error "'vector_length' expression must be 
integral" } */ \
+  num_gangs( /* { dg-error "expected (primary-|)expression before end of line" 
"TODO" { xfail c } } */
+  ;
+
+#pragma acc parallel                                                   \
   num_gangs(-1) /* { dg-warning "'num_gangs' value must be positive" } */ \
   num_workers() /* { dg-error "expected (primary-|)expression before '\\)' 
token" } */ \
-  vector_length(abc) /* { dg-error "'abc' (un|was not )declared" } */ \
+  vector_length(abc_p) /* { dg-error "'abc_p' (un|was not )declared" } */ \
   num_workers(0.5) /* { dg-error "'num_workers' expression must be integral" } 
*/ \
-  vector_length(&acc_parallel) /* { dg-error "'vector_length' expression must 
be integral" } */ \
+  vector_length(&f) /* { dg-error "'vector_length' expression must be 
integral" } */ \
   num_gangs( /* { dg-error "expected (primary-|)expression before end of line" 
"TODO" { xfail c } } */
   ;
 }
diff --git gcc/testsuite/c-c++-common/goacc/routine-1.c 
gcc/testsuite/c-c++-common/goacc/routine-1.c
index a4ecfd3..7389575 100644
--- gcc/testsuite/c-c++-common/goacc/routine-1.c
+++ gcc/testsuite/c-c++-common/goacc/routine-1.c
@@ -91,6 +91,19 @@ extern void nohost (void);
 
 int main ()
 {
+#pragma acc kernels num_gangs (32) num_workers (32) vector_length (32)
+  {
+    gang ();
+    worker ();
+    vector ();
+    seq ();
+    bind_f_1 ();
+    bind_f_1_1 ();
+    bind_f_2 ();
+    bind_f_2_1 ();
+    nohost ();
+  }
+
 #pragma acc parallel num_gangs (32) num_workers (32) vector_length (32)
   {
     gang ();
diff --git gcc/testsuite/c-c++-common/goacc/uninit-dim-clause.c 
gcc/testsuite/c-c++-common/goacc/uninit-dim-clause.c
index b6e4c3d..72aacd7 100644
--- gcc/testsuite/c-c++-common/goacc/uninit-dim-clause.c
+++ gcc/testsuite/c-c++-common/goacc/uninit-dim-clause.c
@@ -1,7 +1,6 @@
 /* { dg-additional-options "-Wuninitialized" } */
 
-int
-main (void)
+void acc_parallel()
 {
   int i, j, k;
 
@@ -16,6 +15,18 @@ main (void)
   #pragma acc parallel loop vector vector_length(k) /* { dg-warning "is used 
uninitialized in this function" } */
   for (k = 0; k < 1; k++)
     ;
+}
+
+void acc_kernels()
+{
+  int i, j, k;
+
+  #pragma acc kernels num_gangs(i) /* { dg-warning "is used uninitialized in 
this function" } */
+  ;
+
+  #pragma acc kernels num_workers(j) /* { dg-warning "is used uninitialized in 
this function" } */
+  ;
 
-  return 0;
+  #pragma acc kernels vector_length(k) /* { dg-warning "is used uninitialized 
in this function" } */
+  ;
 }
diff --git gcc/testsuite/g++.dg/goacc/template.C 
gcc/testsuite/g++.dg/goacc/template.C
index 4bc2596..f4d255c 100644
--- gcc/testsuite/g++.dg/goacc/template.C
+++ gcc/testsuite/g++.dg/goacc/template.C
@@ -100,6 +100,10 @@ oacc_kernels_copy (T a)
   float y = 3;
   double z = 4;
 
+#pragma acc kernels num_gangs (a) num_workers (a) vector_length (a) default 
(none) copyout (b) copyin (a)
+  for (int i = 0; i < 1; i++)
+    b = a;
+
 #pragma acc kernels copy (w, x, y, z)
   {
     w = accDouble<char>(w);
diff --git gcc/testsuite/gfortran.dg/goacc/kernels-tree.f95 
gcc/testsuite/gfortran.dg/goacc/kernels-tree.f95
index 2c237b7..a70f1e7 100644
--- gcc/testsuite/gfortran.dg/goacc/kernels-tree.f95
+++ gcc/testsuite/gfortran.dg/goacc/kernels-tree.f95
@@ -6,7 +6,8 @@ program test
   integer :: q, i, j, k, m, n, o, p, r, s, t, u, v, w
   logical :: l = .true.
 
-  !$acc kernels if(l) async copy(i), copyin(j), copyout(k), create(m) &
+  !$acc kernels if(l) async num_gangs(i) num_workers(i) vector_length(i) &
+  !$acc copy(i), copyin(j), copyout(k), create(m) &
   !$acc present(o), pcopy(p), pcopyin(r), pcopyout(s), pcreate(t) &
   !$acc deviceptr(u)
   !$acc end kernels
@@ -16,6 +17,9 @@ end program test
 
 ! { dg-final { scan-tree-dump-times "if" 1 "original" } }
 ! { dg-final { scan-tree-dump-times "async" 1 "original" } } 
+! { dg-final { scan-tree-dump-times "num_gangs" 1 "original" } } 
+! { dg-final { scan-tree-dump-times "num_workers" 1 "original" } } 
+! { dg-final { scan-tree-dump-times "vector_length" 1 "original" } } 
 
 ! { dg-final { scan-tree-dump-times "map\\(tofrom:i\\)" 1 "original" } } 
 ! { dg-final { scan-tree-dump-times "map\\(to:j\\)" 1 "original" } } 
diff --git gcc/testsuite/gfortran.dg/goacc/routine-3.f90 
gcc/testsuite/gfortran.dg/goacc/routine-3.f90
index ca9b928..6773f62 100644
--- gcc/testsuite/gfortran.dg/goacc/routine-3.f90
+++ gcc/testsuite/gfortran.dg/goacc/routine-3.f90
@@ -4,6 +4,12 @@ CONTAINS
     INTEGER  :: i
     REAL(KIND=8), ALLOCATABLE :: un(:),  ua(:)
 
+    !$acc kernels num_gangs(2) num_workers(4) vector_length(32)
+    DO jj = 1, 100
+       un(i) = ua(i)
+    END DO
+    !$acc end kernels
+
     !$acc parallel num_gangs(2) num_workers(4) vector_length(32)
     DO jj = 1, 100
        un(i) = ua(i)
diff --git gcc/testsuite/gfortran.dg/goacc/sie.f95 
gcc/testsuite/gfortran.dg/goacc/sie.f95
index b4dd9ed..3abf2c8 100644
--- gcc/testsuite/gfortran.dg/goacc/sie.f95
+++ gcc/testsuite/gfortran.dg/goacc/sie.f95
@@ -95,6 +95,34 @@ program test
   !$acc parallel num_gangs("1") ! { dg-error "scalar INTEGER expression" }
   !$acc end parallel
 
+  !$acc kernels num_gangs ! { dg-error "Unclassifiable OpenACC directive" }
+
+  !$acc kernels num_gangs(3)
+  !$acc end kernels
+
+  !$acc kernels num_gangs(i)
+  !$acc end kernels
+
+  !$acc kernels num_gangs(i+1)
+  !$acc end kernels
+
+  !$acc kernels num_gangs(-1) ! { dg-error "must be positive" }
+  !$acc end kernels
+
+  !$acc kernels num_gangs(0) ! { dg-error "must be positive" }
+  !$acc end kernels
+
+  !$acc kernels num_gangs() ! { dg-error "Invalid character in name" }
+
+  !$acc kernels num_gangs(1.5) ! { dg-error "scalar INTEGER expression" }
+  !$acc end kernels
+
+  !$acc kernels num_gangs(.true.) ! { dg-error "scalar INTEGER expression" }
+  !$acc end kernels
+
+  !$acc kernels num_gangs("1") ! { dg-error "scalar INTEGER expression" }
+  !$acc end kernels
+
 
   !$acc parallel num_workers ! { dg-error "Unclassifiable OpenACC directive" }
 
@@ -124,6 +152,34 @@ program test
   !$acc parallel num_workers("1") ! { dg-error "scalar INTEGER expression" }
   !$acc end parallel
 
+  !$acc kernels num_workers ! { dg-error "Unclassifiable OpenACC directive" }
+
+  !$acc kernels num_workers(3)
+  !$acc end kernels
+
+  !$acc kernels num_workers(i)
+  !$acc end kernels
+
+  !$acc kernels num_workers(i+1)
+  !$acc end kernels
+
+  !$acc kernels num_workers(-1) ! { dg-error "must be positive" }
+  !$acc end kernels
+
+  !$acc kernels num_workers(0) ! { dg-error "must be positive" }
+  !$acc end kernels
+
+  !$acc kernels num_workers() ! { dg-error "Invalid character in name" }
+
+  !$acc kernels num_workers(1.5) ! { dg-error "scalar INTEGER expression" }
+  !$acc end kernels
+
+  !$acc kernels num_workers(.true.) ! { dg-error "scalar INTEGER expression" }
+  !$acc end kernels
+
+  !$acc kernels num_workers("1") ! { dg-error "scalar INTEGER expression" }
+  !$acc end kernels
+
 
   !$acc parallel vector_length ! { dg-error "Unclassifiable OpenACC directive" 
}
 
@@ -153,6 +209,34 @@ program test
   !$acc parallel vector_length("1") ! { dg-error "scalar INTEGER expression" }
   !$acc end parallel
 
+  !$acc kernels vector_length ! { dg-error "Unclassifiable OpenACC directive" }
+
+  !$acc kernels vector_length(3)
+  !$acc end kernels
+
+  !$acc kernels vector_length(i)
+  !$acc end kernels
+
+  !$acc kernels vector_length(i+1)
+  !$acc end kernels
+
+  !$acc kernels vector_length(-1) ! { dg-error "must be positive" }
+  !$acc end kernels
+
+  !$acc kernels vector_length(0) ! { dg-error "must be positive" }
+  !$acc end kernels
+
+  !$acc kernels vector_length() ! { dg-error "Invalid character in name" }
+
+  !$acc kernels vector_length(1.5) ! { dg-error "scalar INTEGER expression" }
+  !$acc end kernels
+
+  !$acc kernels vector_length(.true.) ! { dg-error "scalar INTEGER expression" 
}
+  !$acc end kernels
+
+  !$acc kernels vector_length("1") ! { dg-error "scalar INTEGER expression" }
+  !$acc end kernels
+
 
   !$acc loop gang
   do i = 1,10
diff --git gcc/testsuite/gfortran.dg/goacc/uninit-dim-clause.f95 
gcc/testsuite/gfortran.dg/goacc/uninit-dim-clause.f95
index 9db692a..8551140 100644
--- gcc/testsuite/gfortran.dg/goacc/uninit-dim-clause.f95
+++ gcc/testsuite/gfortran.dg/goacc/uninit-dim-clause.f95
@@ -1,6 +1,6 @@
 ! { dg-additional-options "-Wuninitialized" }
 
-program test
+subroutine acc_parallel
   implicit none
   integer :: i, j, k
 
@@ -18,4 +18,18 @@ program test
   do k = 0, 1
   end do
   !$acc end parallel loop
-end program test
+end subroutine acc_parallel
+
+subroutine acc_kernels
+  implicit none
+  integer :: i, j, k
+
+  !$acc kernels num_gangs(i) ! { dg-warning "is used uninitialized in this 
function" }
+  !$acc end kernels
+
+  !$acc kernels num_workers(j) ! { dg-warning "is used uninitialized in this 
function" }
+  !$acc end kernels
+
+  !$acc kernels vector_length(k) ! { dg-warning "is used uninitialized in this 
function" }
+  !$acc end kernels
+end subroutine acc_kernels
diff --git libgomp/ChangeLog.gomp libgomp/ChangeLog.gomp
index a1627a8..5dc0889 100644
--- libgomp/ChangeLog.gomp
+++ libgomp/ChangeLog.gomp
@@ -1,5 +1,11 @@
 2017-05-14  Thomas Schwinge  <tho...@codesourcery.com>
 
+       * testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c: New
+       file.
+       * testsuite/libgomp.oacc-c-c++-common/kernels-loop-2.c: Update.
+       * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Likewise.
+       * testsuite/libgomp.oacc-fortran/kernels-loop-2.f95: Likewise.
+
        * testsuite/libgomp.oacc-c-c++-common/parallel-dims.c: Rewrite.
        * testsuite/libgomp.oacc-c++/c++.exp (check_effective_target_c)
        (check_effective_target_c++): New procs.
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c 
libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
new file mode 100644
index 0000000..24b5718
--- /dev/null
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/acc_prof-kernels-1.c
@@ -0,0 +1,244 @@
+/* Test dispatch of events to callbacks.  */
+
+#undef NDEBUG
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <acc_prof.h>
+
+/* Use explicit copyin clauses, to work around firstprivate optimizations,
+   which will cause the value at the point of call to be used (*before* any
+   potential modifications done in callbacks), as opposed to its address being
+   taken, which then later gets dereferenced (*after* any modifications done in
+   callbacks).  */
+#define COPYIN(...) copyin(__VA_ARGS__)
+
+/* See the "DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT" reference in
+   libgomp.texi.  */
+#define DEVICE_INIT_INSIDE_COMPUTE_CONSTRUCT 0
+
+#define DEBUG_printf(...) //__builtin_printf (__VA_ARGS__)
+
+static int state = -1;
+#define STATE_OP(state, op)\
+  do \
+    { \
+      typeof (state) state_o = (state); \
+      (void) state_o; \
+      (state)op; \
+      DEBUG_printf("state: %d -> %d\n", state_o, (state)); \
+    } \
+  while (0)
+
+static acc_device_t acc_device_type;
+static int acc_device_num;
+static int num_gangs, num_workers, vector_length;
+
+void cb_enqueue_launch_start (acc_prof_info *prof_info, acc_event_info 
*event_info, acc_api_info *api_info)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  assert (acc_device_type != acc_device_host);
+
+  assert (state == 0);
+  STATE_OP (state, = 1);
+
+  assert (prof_info->event_type == acc_ev_enqueue_launch_start);
+  assert (prof_info->valid_bytes == _ACC_PROF_INFO_VALID_BYTES);
+  assert (prof_info->version == _ACC_PROF_INFO_VERSION);
+  assert (prof_info->device_type == acc_device_type);
+  assert (prof_info->device_number == acc_device_num);
+  assert (prof_info->thread_id == -1);
+  assert (prof_info->async == acc_async_sync);
+  assert (prof_info->async_queue == prof_info->async);
+  assert (prof_info->src_file == NULL);
+  assert (prof_info->func_name == NULL);
+  assert (prof_info->line_no == -1);
+  assert (prof_info->end_line_no == -1);
+  assert (prof_info->func_line_no == -1);
+  assert (prof_info->func_end_line_no == -1);
+
+  assert (event_info->launch_event.event_type == prof_info->event_type);
+  assert (event_info->launch_event.valid_bytes == 
_ACC_LAUNCH_EVENT_INFO_VALID_BYTES);
+  assert (event_info->launch_event.parent_construct == acc_construct_parallel);
+  assert (event_info->launch_event.implicit == 1);
+  assert (event_info->launch_event.tool_info == NULL);
+  assert (event_info->launch_event.kernel_name != NULL);
+  {
+    char *s = strstr (event_info->launch_event.kernel_name, "main");
+    assert (s != NULL);
+    s = strstr (s, "omp_fn");
+    assert (s != NULL);
+  }
+  if (num_gangs < 1)
+    assert (event_info->launch_event.num_gangs >= 1);
+  else
+    {
+#ifdef __OPTIMIZE__
+      assert (event_info->launch_event.num_gangs == num_gangs);
+#else
+      /* No parallelized OpenACC kernels constructs, and unparallelized OpenACC
+        kernels constructs must get launched as 1 x 1 x 1 kernels.  */
+      assert (event_info->launch_event.num_gangs == 1);
+#endif
+    }
+  if (num_workers < 1)
+    assert (event_info->launch_event.num_workers >= 1);
+  else
+    {
+#ifdef __OPTIMIZE__
+      assert (event_info->launch_event.num_workers == num_workers);
+#else
+      /* See num_gangs above.  */
+      assert (event_info->launch_event.num_workers == 1);
+#endif
+    }
+  if (vector_length < 1)
+    assert (event_info->launch_event.vector_length >= 1);
+  else if (acc_device_type == acc_device_nvidia) /* ... is special.  */
+    assert (event_info->launch_event.vector_length == 32);
+  else
+    {
+#ifdef __OPTIMIZE__
+      assert (event_info->launch_event.vector_length == vector_length);
+#else
+      /* See num_gangs above.  */
+      assert (event_info->launch_event.vector_length == 1);
+#endif
+    }
+
+  if (acc_device_type == acc_device_host)
+    assert (api_info->device_api == acc_device_api_none);
+  else
+    assert (api_info->device_api == acc_device_api_cuda);
+  assert (api_info->valid_bytes == _ACC_API_INFO_VALID_BYTES);
+  assert (api_info->device_type == prof_info->device_type);
+  assert (api_info->vendor == -1);
+  assert (api_info->device_handle == NULL);
+  assert (api_info->context_handle == NULL);
+  assert (api_info->async_handle == NULL);
+}
+
+static acc_prof_reg reg;
+static acc_prof_reg unreg;
+static acc_prof_lookup_func lookup;
+void acc_register_library (acc_prof_reg reg_, acc_prof_reg unreg_, 
acc_prof_lookup_func lookup_)
+{
+  DEBUG_printf ("%s\n", __FUNCTION__);
+
+  reg = reg_;
+  unreg = unreg_;
+  lookup = lookup_;
+}
+
+int main()
+{
+  STATE_OP (state, = 0);
+  reg (acc_ev_enqueue_launch_start, cb_enqueue_launch_start, acc_reg);
+  assert (state == 0);
+
+  acc_device_type = acc_get_device_type ();
+  acc_device_num = acc_get_device_num (acc_device_type);
+  assert (state == 0);
+
+  /* Parallelism dimensions: compiler/runtime decides.  */
+  STATE_OP (state, = 0);
+  num_gangs = num_workers = vector_length = 0;
+  {
+#define N 100
+    int x[N];
+#pragma acc kernels
+    {
+      for (int i = 0; i < N; ++i)
+       x[i] = i * i;
+    }
+#ifdef __OPTIMIZE__
+    /* TODO.  With -O2 optimizations enabled, the compiler believes that here
+       "state == 0" still holds.  It's not yet clear what's going on.
+       Mis-optimization across the GOMP function call boundary?  Per its
+       gcc/omp-builtins.def definition, BUILT_IN_GOACC_PARALLEL
+       "GOACC_parallel_keyed" doesn't have a "leaf" attribute, so the compiler
+       must expect calls back into this compilation unit?  */
+    asm volatile ("" : : : "memory");
+#endif
+    if (acc_device_type == acc_device_host)
+      assert (state == 0); /* No acc_ev_enqueue_launch_start.  */
+    else
+      assert (state == 1);
+    for (int i = 0; i < N; ++i)
+      if (x[i] != i * i)
+       __builtin_abort ();
+#undef N
+  }
+
+  /* Parallelism dimensions: literal.  */
+  STATE_OP (state, = 0);
+  num_gangs = 30;
+  num_workers = 3;
+  vector_length = 5;
+  {
+#define N 100
+    int x[N];
+#pragma acc kernels \
+  num_gangs (30) num_workers (3) vector_length (5)
+    /* { dg-prune-output "using vector_length \\(32\\), ignoring 5" } */
+    {
+      for (int i = 0; i < N; ++i)
+       x[i] = i * i;
+    }
+#ifdef __OPTIMIZE__
+    /* TODO.  With -O2 optimizations enabled, the compiler believes that here
+       "state == 0" still holds.  It's not yet clear what's going on.
+       Mis-optimization across the GOMP function call boundary?  Per its
+       gcc/omp-builtins.def definition, BUILT_IN_GOACC_PARALLEL
+       "GOACC_parallel_keyed" doesn't have a "leaf" attribute, so the compiler
+       must expect calls back into this compilation unit?  */
+    asm volatile ("" : : : "memory");
+#endif
+    if (acc_device_type == acc_device_host)
+      assert (state == 0); /* No acc_ev_enqueue_launch_start.  */
+    else
+      assert (state == 1);
+    for (int i = 0; i < N; ++i)
+      if (x[i] != i * i)
+       __builtin_abort ();
+#undef N
+  }
+
+  /* Parallelism dimensions: variable.  */
+  STATE_OP (state, = 0);
+  num_gangs = 22;
+  num_workers = 5;
+  vector_length = 7;
+  {
+#define N 100
+    int x[N];
+#pragma acc kernels \
+  num_gangs (num_gangs) num_workers (num_workers) vector_length (vector_length)
+    /* { dg-prune-output "using vector_length \\(32\\), ignoring runtime 
setting" } */
+    {
+      for (int i = 0; i < N; ++i)
+       x[i] = i * i;
+    }
+#ifdef __OPTIMIZE__
+    /* TODO.  With -O2 optimizations enabled, the compiler believes that here
+       "state == 0" still holds.  It's not yet clear what's going on.
+       Mis-optimization across the GOMP function call boundary?  Per its
+       gcc/omp-builtins.def definition, BUILT_IN_GOACC_PARALLEL
+       "GOACC_parallel_keyed" doesn't have a "leaf" attribute, so the compiler
+       must expect calls back into this compilation unit?  */
+    asm volatile ("" : : : "memory");
+#endif
+    if (acc_device_type == acc_device_host)
+      assert (state == 0); /* No acc_ev_enqueue_launch_start.  */
+    else
+      assert (state == 1);
+    for (int i = 0; i < N; ++i)
+      if (x[i] != i * i)
+       __builtin_abort ();
+#undef N
+  }
+
+  return 0;
+}
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-2.c 
libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-2.c
index c7592d6..b840888 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-2.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-loop-2.c
@@ -14,27 +14,40 @@ main (void)
   b = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
   c = (unsigned int *__restrict)malloc (N * sizeof (unsigned int));
 
+  /* Parallelism dimensions: compiler/runtime decides.  */
 #pragma acc kernels copyout (a[0:N])
   {
     for (COUNTERTYPE i = 0; i < N; i++)
       a[i] = i * 2;
   }
 
-#pragma acc kernels copyout (b[0:N])
+  /* Parallelism dimensions: variable.  */
+#pragma acc kernels copyout (b[0:N]) \
+  num_gangs (3 + a[3]) num_workers (5 + a[5]) vector_length (7 + a[7])
+  /* { dg-prune-output "using vector_length \\(32\\), ignoring runtime 
setting" } */
   {
     for (COUNTERTYPE i = 0; i < N; i++)
       b[i] = i * 4;
   }
 
-#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N])
+  /* Parallelism dimensions: literal.  */
+#pragma acc kernels copyin (a[0:N], b[0:N]) copyout (c[0:N]) \
+  num_gangs (3) num_workers (5) vector_length (7)
+  /* { dg-prune-output "using vector_length \\(32\\), ignoring 7" } */
   {
     for (COUNTERTYPE ii = 0; ii < N; ii++)
       c[ii] = a[ii] + b[ii];
   }
 
   for (COUNTERTYPE i = 0; i < N; i++)
-    if (c[i] != a[i] + b[i])
-      abort ();
+    {
+      if (a[i] != i * 2)
+       abort ();
+      if (b[i] != i * 4)
+       abort ();
+      if (c[i] != a[i] + b[i])
+       abort ();
+    }
 
   free (a);
   free (b);
diff --git libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c 
libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
index 3458757..1dd6353 100644
--- libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
+++ libgomp/testsuite/libgomp.oacc-c-c++-common/parallel-dims.c
@@ -523,5 +523,40 @@ int main ()
   }
 
 
+  /* Unparallelized OpenACC kernels constructs must get launched as 1 x 1 x 1
+     kernels even when there are explicit num_gangs, num_workers, or
+     vector_length clauses.  */
+  {
+    int gangs = 5;
+#define WORKERS 5
+#define VECTORS 13
+    int gangs_min, gangs_max, workers_min, workers_max, vectors_min, 
vectors_max;
+    gangs_min = workers_min = vectors_min = INT_MAX;
+    gangs_max = workers_max = vectors_max = INT_MIN;
+#pragma acc kernels \
+  num_gangs (gangs) \
+  num_workers (WORKERS) \
+  vector_length (VECTORS)
+    {
+      /* This is to make the OpenACC kernels construct unparallelizable.  */
+      asm volatile ("" : : : "memory");
+
+#pragma acc loop reduction (min: gangs_min, workers_min, vectors_min) 
reduction (max: gangs_max, workers_max, vectors_max)
+      for (int i = 100; i > -100; --i)
+       {
+         gangs_min = gangs_max = acc_gang ();
+         workers_min = workers_max = acc_worker ();
+         vectors_min = vectors_max = acc_vector ();
+       }
+    }
+    if (gangs_min != 0 || gangs_max != 1 - 1
+       || workers_min != 0 || workers_max != 1 - 1
+       || vectors_min != 0 || vectors_max != 1 - 1)
+      __builtin_abort ();
+#undef VECTORS
+#undef WORKERS
+  }
+
+
   return 0;
 }
diff --git libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-2.f95 
libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-2.f95
index 163e8d5..b88ca67 100644
--- libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-2.f95
+++ libgomp/testsuite/libgomp.oacc-fortran/kernels-loop-2.f95
@@ -6,25 +6,34 @@ program main
   integer, dimension (0:n-1) :: a, b, c
   integer                    :: i, ii
 
+  ! Parallelism dimensions: compiler/runtime decides.
   !$acc kernels copyout (a(0:n-1))
   do i = 0, n - 1
      a(i) = i * 2
   end do
   !$acc end kernels
 
-  !$acc kernels copyout (b(0:n-1))
+  ! Parallelism dimensions: variable.
+  !$acc kernels copyout (b(0:n-1)) &
+  !$acc num_gangs (3 + a(3)) num_workers (5 + a(5)) vector_length (7 + a(7))
+  ! { dg-prune-output "using vector_length \\(32\\), ignoring runtime setting" 
}
   do i = 0, n -1
      b(i) = i * 4
   end do
   !$acc end kernels
 
-  !$acc kernels copyin (a(0:n-1), b(0:n-1)) copyout (c(0:n-1))
+  ! Parallelism dimensions: literal.
+  !$acc kernels copyin (a(0:n-1), b(0:n-1)) copyout (c(0:n-1)) &
+  !$acc num_gangs (3) num_workers (5) vector_length (7)
+  ! { dg-prune-output "using vector_length \\(32\\), ignoring 7" }
   do ii = 0, n - 1
      c(ii) = a(ii) + b(ii)
   end do
   !$acc end kernels
 
   do i = 0, n - 1
+     if (a(i) .ne. i * 2) call abort
+     if (b(i) .ne. i * 4) call abort
      if (c(i) .ne. a(i) + b(i)) call abort
   end do
 


Grüße
 Thomas

Reply via email to