Hi, The OpenACC execution model states that implementing a critical section across workers using atomic operations and a busy-wait loop may never succeed, since the scheduler may suspend the worker that owns the lock, in which case the worker waiting on the lock can never complete.
Add a test-case that implements the next best thing: a spinlock using a busy-wait loop that gives up after a certain number of tries. This ensures termination, and makes the test-case a valid one, while still excercising atomic exchange and atomic store. OK for trunk? Thanks, - Tom [libgomp, openacc] Add terminating spinlock test-cases libgomp/ChangeLog: 2022-02-02 Tom de Vries <tdevr...@suse.de> * testsuite/libgomp.oacc-c/spin-lock-global.c: New test. * testsuite/libgomp.oacc-c/spin-lock-global.h: New test. * testsuite/libgomp.oacc-c/spin-lock-shared.c: New test. * testsuite/libgomp.oacc-c/spin-lock-shared.h: New test. --- .../testsuite/libgomp.oacc-c/spin-lock-global.c | 43 ++++++ .../testsuite/libgomp.oacc-c/spin-lock-global.h | 169 +++++++++++++++++++++ .../testsuite/libgomp.oacc-c/spin-lock-shared.c | 35 +++++ .../testsuite/libgomp.oacc-c/spin-lock-shared.h | 135 ++++++++++++++++ 4 files changed, 382 insertions(+) diff --git a/libgomp/testsuite/libgomp.oacc-c/spin-lock-global.c b/libgomp/testsuite/libgomp.oacc-c/spin-lock-global.c new file mode 100644 index 00000000000..0c1da9e842f --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c/spin-lock-global.c @@ -0,0 +1,43 @@ +#include <stdio.h> +#include <openacc.h> +#include <gomp-constants.h> +#include <assert.h> + +enum memmodel + { + MEMMODEL_RELAXED = 0, + MEMMODEL_ACQUIRE = 2, + MEMMODEL_RELEASE = 3, + MEMMODEL_SEQ_CST = 5, + }; + +#define TYPE unsigned int +#define LOCKVAR1 lock_32_1 +#define LOCKVAR2 lock_32_2 +#define TESTS tests_32 +#include "spin-lock-global.h" +#undef TYPE +#undef LOCKVAR1 +#undef LOCKVAR2 +#undef TESTS + +#define TYPE unsigned long long int +#define LOCKVAR1 lock_64_1 +#define LOCKVAR2 lock_64_2 +#define TESTS tests_64 +#include "spin-lock-global.h" +#undef TYPE +#undef LOCKVAR1 +#undef LOCKVAR2 +#undef TESTS + +#define N (7 * 1000) + +int +main (void) +{ + tests_32 (N); + tests_64 (N); + + return 0; +} diff --git a/libgomp/testsuite/libgomp.oacc-c/spin-lock-global.h b/libgomp/testsuite/libgomp.oacc-c/spin-lock-global.h new file mode 100644 index 00000000000..ea63fafccb9 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c/spin-lock-global.h @@ -0,0 +1,169 @@ +#define XSTR(S) STR (S) +#define STR(S) #S + +#define PRINTF(...) \ + { \ + printf (__VA_ARGS__); \ + fflush (NULL); \ + } + +#define DO_PRAGMA(x) _Pragma (#x) + +#ifndef SPIN_CNT_MAX +/* Define to have limited-spin spinlock. + Ensures that the program will terminate. */ +#define SPIN_CNT_MAX 0x8000U +#endif + +#define TEST_1(N, LOCKVAR, VERIFY, N_GANGS, N_WORKERS) \ + assert (N % N_GANGS == 0); \ + \ + DO_PRAGMA (acc parallel \ + num_gangs(N_GANGS) \ + num_workers(N_WORKERS) \ + copy (lock_cnt) \ + copy (spin_cnt_max_hit) \ + present (LOCKVAR)) \ + { \ + TYPE unlocked = (TYPE)0; \ + TYPE locked = ~unlocked; \ + \ + LOCKVAR = unlocked; \ + \ + unsigned int n_gangs \ + = __builtin_goacc_parlevel_size (GOMP_DIM_GANG); \ + \ + DO_PRAGMA (acc loop worker) \ + for (unsigned int i = 0; i < N / n_gangs; i++) \ + { \ + TYPE res; \ + \ + unsigned int spin_cnt = 0; \ + while (1) \ + { \ + res = __atomic_exchange_n (&LOCKVAR, locked, \ + MEMMODEL_ACQUIRE); \ + if (res == locked) \ + { \ + if (SPIN_CNT_MAX > 0) \ + { \ + spin_cnt++; \ + if (spin_cnt == SPIN_CNT_MAX) \ + { \ + if (VERIFY) \ + __atomic_fetch_add (&spin_cnt_max_hit, 1, \ + MEMMODEL_RELAXED); \ + break; \ + } \ + } \ + continue; \ + \ + } \ + else \ + { \ + if (res != unlocked) \ + __builtin_abort (); \ + \ + if (VERIFY) \ + __atomic_fetch_add (&lock_cnt, 1, \ + MEMMODEL_RELAXED); \ + \ + __atomic_store_n (&LOCKVAR, unlocked, \ + MEMMODEL_RELEASE); \ + break; \ + } \ + } \ + } \ + } + +#define TEST(N, LOCKVAR, VERIFY, N_GANGS, N_WORKERS) \ + { \ + spin_cnt_max_hit = 0; \ + \ + if (VERIFY) \ + lock_cnt = 0; \ + \ + PRINTF ("%s - verify=%u - lock=%s - gangs=%u - workers=%u ... ", \ + XSTR (TYPE), VERIFY, STR(LOCKVAR), N_GANGS, N_WORKERS); \ + TEST_1 (N, LOCKVAR, VERIFY, N_GANGS, N_WORKERS); \ + PRINTF ("done\n"); \ + \ + if (VERIFY && SPIN_CNT_MAX) \ + PRINTF ("spin_cnt_max_hit: %llu\n", spin_cnt_max_hit); \ + \ + if (VERIFY && (lock_cnt + spin_cnt_max_hit != N)) \ + { \ + PRINTF ("lock_cnt: %llu\n", lock_cnt); \ + PRINTF ("lock_cnt + spin_cnt_max_hit: %llu\n", \ + lock_cnt + spin_cnt_max_hit); \ + PRINTF ("N: %u\n", N); \ + __builtin_abort (); \ + } \ + } + +/* Uses .global addressing on nvptx. */ +TYPE LOCKVAR1; +#pragma acc declare create (LOCKVAR1) + +void +TESTS (unsigned int n) +{ + unsigned long long int lock_cnt; + unsigned long long int spin_cnt_max_hit; + + /* Uses generic addressing on nvptx. */ + TYPE LOCKVAR2; +#pragma acc declare create (LOCKVAR2) + +#define N_GANGS 1 +#define N_WORKERS 8 +#define VERIFY 0 + TEST (n, LOCKVAR1, VERIFY, N_GANGS, N_WORKERS); + TEST (n, LOCKVAR2, VERIFY, N_GANGS, N_WORKERS); +#undef VERIFY +#define VERIFY 1 + TEST (n, LOCKVAR1, VERIFY, N_GANGS, N_WORKERS); + TEST (n, LOCKVAR2, VERIFY, N_GANGS, N_WORKERS); +#undef VERIFY +#undef N_WORKERS +#undef N_GANGS + +#define N_GANGS 2 +#define N_WORKERS 4 +#define VERIFY 0 + TEST (n, LOCKVAR1, VERIFY, N_GANGS, N_WORKERS); + TEST (n, LOCKVAR2, VERIFY, N_GANGS, N_WORKERS); +#undef VERIFY +#define VERIFY 1 + TEST (n, LOCKVAR1, VERIFY, N_GANGS, N_WORKERS); + TEST (n, LOCKVAR2, VERIFY, N_GANGS, N_WORKERS); +#undef VERIFY +#undef N_WORKERS +#undef N_GANGS + +#define N_GANGS 4 +#define N_WORKERS 2 +#define VERIFY 0 + TEST (n, LOCKVAR1, VERIFY, N_GANGS, N_WORKERS); + TEST (n, LOCKVAR2, VERIFY, N_GANGS, N_WORKERS); +#undef VERIFY +#define VERIFY 1 + TEST (n, LOCKVAR1, VERIFY, N_GANGS, N_WORKERS); + TEST (n, LOCKVAR2, VERIFY, N_GANGS, N_WORKERS); +#undef VERIFY +#undef N_WORKERS +#undef N_GANGS + +#define N_GANGS 8 +#define N_WORKERS 1 +#define VERIFY 0 + TEST (n, LOCKVAR1, VERIFY, N_GANGS, N_WORKERS); + TEST (n, LOCKVAR2, VERIFY, N_GANGS, N_WORKERS); +#undef VERIFY +#define VERIFY 1 + TEST (n, LOCKVAR1, VERIFY, N_GANGS, N_WORKERS); + TEST (n, LOCKVAR2, VERIFY, N_GANGS, N_WORKERS); +#undef VERIFY +#undef N_WORKERS +#undef N_GANGS +} diff --git a/libgomp/testsuite/libgomp.oacc-c/spin-lock-shared.c b/libgomp/testsuite/libgomp.oacc-c/spin-lock-shared.c new file mode 100644 index 00000000000..81d18fcc798 --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c/spin-lock-shared.c @@ -0,0 +1,35 @@ +#include <stdio.h> +#include <openacc.h> +#include <gomp-constants.h> +#include <assert.h> + +enum memmodel + { + MEMMODEL_RELAXED = 0, + MEMMODEL_ACQUIRE = 2, + MEMMODEL_RELEASE = 3, + MEMMODEL_SEQ_CST = 5, + }; + +#define TYPE unsigned int +#define TESTS tests_32 +#include "spin-lock-shared.h" +#undef TYPE +#undef TESTS + +#define TYPE unsigned long long int +#define TESTS tests_64 +#include "spin-lock-shared.h" +#undef TYPE +#undef TESTS + +#define N (50 * 1000) + +int +main (void) +{ + tests_32 (N); + tests_64 (N); + + return 0; +} diff --git a/libgomp/testsuite/libgomp.oacc-c/spin-lock-shared.h b/libgomp/testsuite/libgomp.oacc-c/spin-lock-shared.h new file mode 100644 index 00000000000..923f38c60fe --- /dev/null +++ b/libgomp/testsuite/libgomp.oacc-c/spin-lock-shared.h @@ -0,0 +1,135 @@ +#define XSTR(S) STR (S) +#define STR(S) #S + +#define PRINTF(...) \ + { \ + printf (__VA_ARGS__); \ + fflush (NULL); \ + } + +#define DO_PRAGMA(x) _Pragma (#x) + +#ifndef SPIN_CNT_MAX +/* Define to have limited-spin spinlock. + Ensures that the program will terminate. */ +#define SPIN_CNT_MAX 0x20000U +#endif + +#define TEST_1(N, LOCKREF) \ + DO_PRAGMA (acc parallel \ + num_gangs(1) \ + num_workers(N_WORKERS) \ + copy (lock_cnt) \ + copy (spin_cnt_max_hit)) \ + { \ + TYPE unlocked = (TYPE)0; \ + TYPE locked = ~unlocked; \ + TYPE lock; \ + TYPE *volatile lock_ptr = &lock; \ + unsigned long long int lock_cnt_1; \ + unsigned long long int spin_cnt_max_hit_1; \ + \ + if (VERIFY) \ + { \ + lock_cnt_1 = 0; \ + \ + if (SPIN_CNT_MAX) \ + spin_cnt_max_hit_1 = 0; \ + } \ + \ + *(LOCKREF) = unlocked; \ + \ + DO_PRAGMA (acc loop worker) \ + for (unsigned int i = 0; i < N; i++) \ + { \ + TYPE res; \ + \ + unsigned int spin_cnt = 0; \ + while (1) \ + { \ + res = __atomic_exchange_n (LOCKREF, locked, \ + MEMMODEL_ACQUIRE); \ + if (res == locked) \ + { \ + if (SPIN_CNT_MAX > 0) \ + { \ + spin_cnt++; \ + if (spin_cnt == SPIN_CNT_MAX) \ + { \ + if (VERIFY) \ + __atomic_fetch_add (&spin_cnt_max_hit_1, 1, \ + MEMMODEL_RELAXED); \ + break; \ + } \ + } \ + continue; \ + } \ + else \ + { \ + if (res != unlocked) \ + __builtin_abort (); \ + \ + if (VERIFY) \ + __atomic_fetch_add (&lock_cnt_1, 1, \ + MEMMODEL_RELAXED); \ + \ + __atomic_store_n (LOCKREF, unlocked, \ + MEMMODEL_RELEASE); \ + \ + break; \ + } \ + } \ + } \ + \ + if (VERIFY) \ + { \ + lock_cnt += lock_cnt_1; \ + \ + if (SPIN_CNT_MAX) \ + spin_cnt_max_hit += spin_cnt_max_hit_1; \ + } \ + } + +#define TEST(N, LOCKREF) \ + { \ + spin_cnt_max_hit = 0; \ + \ + if (VERIFY) \ + lock_cnt = 0; \ + \ + PRINTF ("%s - verify=%u - LOCKREF=%s ... ", \ + XSTR (TYPE), VERIFY, #LOCKREF); \ + TEST_1 (N, LOCKREF); \ + PRINTF ("done\n"); \ + \ + if (VERIFY && SPIN_CNT_MAX) \ + PRINTF ("spin_cnt_max_hit: %llu\n", spin_cnt_max_hit); \ + \ + if (VERIFY && (lock_cnt + spin_cnt_max_hit != N)) \ + { \ + PRINTF ("lock_cnt: %llu\n", lock_cnt); \ + PRINTF ("lock_cnt + spin_cnt_max_hit: %llu\n", \ + lock_cnt + spin_cnt_max_hit); \ + PRINTF ("N: %u\n", N); \ + __builtin_abort (); \ + } \ + } + +void +TESTS (unsigned int n) +{ + unsigned long long int lock_cnt; + unsigned long long int spin_cnt_max_hit; + +#define N_WORKERS 8 + +#define VERIFY 0 + TEST (n, &lock); + TEST (n, lock_ptr); +#undef VERIFY + +#define VERIFY 1 + TEST (n, &lock); + TEST (n, lock_ptr); +#undef VERIFY +}