On Mon, Nov 28, 2011 at 08:09:01PM +1030, Alan Modra wrote: > On Mon, Nov 28, 2011 at 09:13:27AM +0100, Jakub Jelinek wrote: > > On Mon, Nov 28, 2011 at 10:12:09AM +1030, Alan Modra wrote: > > > --- libgomp/config/linux/bar.c (revision 181718) > > > +++ libgomp/config/linux/bar.c (working copy) > > > @@ -36,18 +36,15 @@ gomp_barrier_wait_end (gomp_barrier_t *b > > > if (__builtin_expect ((state & 1) != 0, 0)) > > > { > > > /* Next time we'll be awaiting TOTAL threads again. */ > > > - bar->awaited = bar->total; > > > - atomic_write_barrier (); > > > - bar->generation += 4; > > > + __atomic_store_4 (&bar->awaited, bar->total, MEMMODEL_RELEASE); > > > + __atomic_add_fetch (&bar->generation, 4, MEMMODEL_ACQ_REL); > > > futex_wake ((int *) &bar->generation, INT_MAX); > > > } > > > > The above is unfortunate, bar->generation should be only modified from a > > single thread at this point, but the __atomic_add_fetch will enforce there > > an atomic instruction on it rather than just some kind of barrier. > > I will try without the atomic add and see what happens.
Seems to be fine. I took out the extra write barriers too, so we now just have two MEMMODEL_RELEASE atomic store barriers replacing the two old atomic_write_barriers, and a number of new acquire barriers. PR libgomp/51298 * config/linux/bar.h: Use atomic rather than sync builtins. * config/linux/bar.c: Likewise. Add missing acquire synchronisation on generation field. * task.c (gomp_barrier_handle_tasks): Regain lock so as to not double unlock. Index: libgomp/task.c =================================================================== --- libgomp/task.c (revision 181718) +++ libgomp/task.c (working copy) @@ -273,6 +273,7 @@ gomp_barrier_handle_tasks (gomp_barrier_ gomp_team_barrier_done (&team->barrier, state); gomp_mutex_unlock (&team->task_lock); gomp_team_barrier_wake (&team->barrier, 0); + gomp_mutex_lock (&team->task_lock); } } } Index: libgomp/config/linux/bar.h =================================================================== --- libgomp/config/linux/bar.h (revision 181770) +++ libgomp/config/linux/bar.h (working copy) @@ -50,7 +50,7 @@ static inline void gomp_barrier_init (go static inline void gomp_barrier_reinit (gomp_barrier_t *bar, unsigned count) { - __sync_fetch_and_add (&bar->awaited, count - bar->total); + __atomic_add_fetch (&bar->awaited, count - bar->total, MEMMODEL_ACQ_REL); bar->total = count; } @@ -69,10 +69,8 @@ extern void gomp_team_barrier_wake (gomp static inline gomp_barrier_state_t gomp_barrier_wait_start (gomp_barrier_t *bar) { - unsigned int ret = bar->generation & ~3; - /* Do we need any barrier here or is __sync_add_and_fetch acting - as the needed LoadLoad barrier already? */ - ret += __sync_add_and_fetch (&bar->awaited, -1) == 0; + unsigned int ret = __atomic_load_4 (&bar->generation, MEMMODEL_ACQUIRE) & ~3; + ret += __atomic_add_fetch (&bar->awaited, -1, MEMMODEL_ACQ_REL) == 0; return ret; } Index: libgomp/config/linux/bar.c =================================================================== --- libgomp/config/linux/bar.c (revision 181770) +++ libgomp/config/linux/bar.c (working copy) @@ -37,17 +37,15 @@ gomp_barrier_wait_end (gomp_barrier_t *b { /* Next time we'll be awaiting TOTAL threads again. */ bar->awaited = bar->total; - atomic_write_barrier (); - bar->generation += 4; + __atomic_store_4 (&bar->generation, bar->generation + 4, + MEMMODEL_RELEASE); futex_wake ((int *) &bar->generation, INT_MAX); } else { - unsigned int generation = state; - do - do_wait ((int *) &bar->generation, generation); - while (bar->generation == generation); + do_wait ((int *) &bar->generation, state); + while (__atomic_load_4 (&bar->generation, MEMMODEL_ACQUIRE) == state); } } @@ -81,15 +79,15 @@ gomp_team_barrier_wake (gomp_barrier_t * void gomp_team_barrier_wait_end (gomp_barrier_t *bar, gomp_barrier_state_t state) { - unsigned int generation; + unsigned int generation, gen; if (__builtin_expect ((state & 1) != 0, 0)) { /* Next time we'll be awaiting TOTAL threads again. */ struct gomp_thread *thr = gomp_thread (); struct gomp_team *team = thr->ts.team; + bar->awaited = bar->total; - atomic_write_barrier (); if (__builtin_expect (team->task_count, 0)) { gomp_barrier_handle_tasks (state); @@ -97,7 +95,7 @@ gomp_team_barrier_wait_end (gomp_barrier } else { - bar->generation = state + 3; + __atomic_store_4 (&bar->generation, state + 3, MEMMODEL_RELEASE); futex_wake ((int *) &bar->generation, INT_MAX); return; } @@ -107,12 +105,16 @@ gomp_team_barrier_wait_end (gomp_barrier do { do_wait ((int *) &bar->generation, generation); - if (__builtin_expect (bar->generation & 1, 0)) - gomp_barrier_handle_tasks (state); - if ((bar->generation & 2)) + gen = __atomic_load_4 (&bar->generation, MEMMODEL_ACQUIRE); + if (__builtin_expect (gen & 1, 0)) + { + gomp_barrier_handle_tasks (state); + gen = __atomic_load_4 (&bar->generation, MEMMODEL_ACQUIRE); + } + if ((gen & 2) != 0) generation |= 2; } - while (bar->generation != state + 4); + while (gen != state + 4); } void -- Alan Modra Australia Development Lab, IBM