On 18/09/2020 12:25, Andrew Stubbs wrote:
This patch fixes a problem in which nested OpenMP parallel regions cause errors if the number of inner teams is not balanced (i.e. the number of loop iterations is not divisible by the number of physical threads). A testcase is included.
This updated version removes an editing mistake that should have been spotted sooner.
Sorry for the inconvenience.

Andrew
libgomp: disable barriers in nested teams Both GCN and NVPTX allow nested parallel regions, but the barrier implementation did not allow the nested teams to run independently of each other (due to hardware limitations). This patch fixes that, under the assumption that each thread will create a new subteam of one thread, by simply not using barriers when there's no other thread to synchronise. libgomp/ChangeLog: * config/gcn/bar.c (gomp_barrier_wait_end): Skip the barrier if the total number of threads is one. (gomp_team_barrier_wake): Likewise. (gomp_team_barrier_wait_end): Likewise. (gomp_team_barrier_wait_cancel_end): Likewise. * config/nvptx/bar.c (gomp_barrier_wait_end): Likewise. (gomp_team_barrier_wake): Likewise. (gomp_team_barrier_wait_end): Likewise. (gomp_team_barrier_wait_cancel_end): Likewise. * testsuite/libgomp.c-c++-common/nested-parallel-unbalanced.c: New test. diff --git a/libgomp/config/gcn/bar.c b/libgomp/config/gcn/bar.c index 02fd19710d4..a21529a624b 100644 --- a/libgomp/config/gcn/bar.c +++ b/libgomp/config/gcn/bar.c @@ -43,7 +43,8 @@ gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state) __atomic_store_n (&bar->generation, bar->generation + BAR_INCR, MEMMODEL_RELAXED); } - asm ("s_barrier" ::: "memory"); + if (bar->total > 1) + asm ("s_barrier" ::: "memory"); } void @@ -71,7 +72,8 @@ gomp_barrier_wait_last (gomp_barrier_t *bar) void gomp_team_barrier_wake (gomp_barrier_t *bar, int count) { - asm ("s_barrier" ::: "memory"); + if (bar->total > 1) + asm ("s_barrier" ::: "memory"); } void @@ -97,7 +99,8 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state) state &= ~BAR_CANCELLED; state += BAR_INCR - BAR_WAS_LAST; __atomic_store_n (&bar->generation, state, MEMMODEL_RELAXED); - asm ("s_barrier" ::: "memory"); + if (bar->total > 1) + asm ("s_barrier" ::: "memory"); return; } } @@ -172,7 +175,8 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar, { state += BAR_INCR - BAR_WAS_LAST; __atomic_store_n 
(&bar->generation, state, MEMMODEL_RELAXED); - asm ("s_barrier" ::: "memory"); + if (bar->total > 1) + asm ("s_barrier" ::: "memory"); return false; } } @@ -195,7 +199,8 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar, abort(); } - asm ("s_barrier" ::: "memory"); + if (bar->total > 1) + asm ("s_barrier" ::: "memory"); gen = __atomic_load_n (&bar->generation, MEMMODEL_RELAXED); if (__builtin_expect (gen & BAR_CANCELLED, 0)) return true; diff --git a/libgomp/config/nvptx/bar.c b/libgomp/config/nvptx/bar.c index 125ca3e49ec..1116561d931 100644 --- a/libgomp/config/nvptx/bar.c +++ b/libgomp/config/nvptx/bar.c @@ -41,7 +41,8 @@ gomp_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state) __atomic_store_n (&bar->generation, bar->generation + BAR_INCR, MEMMODEL_RELEASE); } - asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); + if (bar->total > 1) + asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); } void @@ -69,7 +70,8 @@ gomp_barrier_wait_last (gomp_barrier_t *bar) void gomp_team_barrier_wake (gomp_barrier_t *bar, int count) { - asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); + if (bar->total > 1) + asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); } void @@ -95,7 +97,8 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state) state &= ~BAR_CANCELLED; state += BAR_INCR - BAR_WAS_LAST; __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE); - asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); + if (bar->total > 1) + asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); return; } } @@ -104,7 +107,8 @@ gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state) state &= ~BAR_CANCELLED; do { - asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); + if (bar->total > 1) + asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); if (__builtin_expect (gen & BAR_TASK_PENDING, 0)) { @@ -158,7 +162,8 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar, { 
state += BAR_INCR - BAR_WAS_LAST; __atomic_store_n (&bar->generation, state, MEMMODEL_RELEASE); - asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); + if (bar->total > 1) + asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); return false; } } @@ -169,7 +174,8 @@ gomp_team_barrier_wait_cancel_end (gomp_barrier_t *bar, generation = state; do { - asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); + if (bar->total > 1) + asm ("bar.sync 1, %0;" : : "r" (32 * bar->total)); gen = __atomic_load_n (&bar->generation, MEMMODEL_ACQUIRE); if (__builtin_expect (gen & BAR_CANCELLED, 0)) return true; diff --git a/libgomp/testsuite/libgomp.c-c++-common/nested-parallel-unbalanced.c b/libgomp/testsuite/libgomp.c-c++-common/nested-parallel-unbalanced.c new file mode 100644 index 00000000000..e777271dde1 --- /dev/null +++ b/libgomp/testsuite/libgomp.c-c++-common/nested-parallel-unbalanced.c @@ -0,0 +1,31 @@ +/* Ensure that nested parallel regions work even when the number of loop + iterations is not divisible by the number of threads. */ + +#include <stdlib.h> + +int main() { + int A[30][40], B[30][40]; + size_t n = 30; + + for (size_t i = 0; i < 30; ++i) + for (size_t j = 0; j < 40; ++j) + A[i][j] = 42; + +#pragma omp target map(A[0:30][0:40], B[0:30][0:40]) + { +#pragma omp parallel for num_threads(8) + for (size_t i = 0; i < n; ++i) + { +#pragma omp parallel for + for (size_t j = 0; j < n; ++j) + { + B[i][j] = A[i][j]; + } + } + } + +for (size_t i = 0; i < n; ++i) + for (size_t j = 0; j < n; ++j) + if (B[i][j] != 42) + abort (); +}