Hi! If lock contention is high, but all locks are held for relatively short time and no threads actually goes into futex_wait, we still completely unnecessarily do lots of futex_wake syscalls.
On Linux with futex, our mutexes have 3 states: 0 - unlocked 1 - locked, uncontended 2 - locked, contended When the initial atomic 0 -> 1 change fails, gomp_mutex_slow is called and from that point onwards the lock is changed into state 2 and handled as contended, until it is unlocked. State 2 in particular means that gomp_mutex_unlock will do a futex_wake on it. The following patch changes it, so that when 0 -> 1 change fails, because the mutex is already in state 1, it will do the userland spinning. If the spinning times out with no change, it will enter state 2, futex_wait and continue as before, while if during the spinning the lock became 0, it will try to change it again atomically from 0 -> 1. If that succeeds, it returns, if it fails because another thread made it into 1, it will do the spinning again, and any time the mutex becomes 2, it will fall thru into the old loop. The advantage of this is that if no thread needed to go to sleep, no futex_wakes will be done. Additionally I've improved the old loop, from do { int oldval = __sync_val_compare_and_swap (mutex, 1, 2); if (oldval != 0) do_wait (mutex, 2); } while (!__sync_bool_compare_and_swap (mutex, 0, 2)); into: while ((oldval = __sync_lock_test_and_set (mutex, 2))) do_wait (mutex, 2); which should have the advantage that there is just one atomic insn instead of two. While __sync_lock_test_and_set isn't a full barrier on all targets, I hope it doesn't matter, because the inline caller has already done one __sync_val_compare_and_swap which is a full barrier. Tested with libgomp testsuite and looked at performance numbers of Sho's omp_fib.c and libgomp.c/sort-1.c, unfortunately the differences looked to be in the noise. But, e.g. on omp_fib.c strace -f shows that the number of futex FUTEX_WAKE syscalls went down a lot (from ~ 75000 for omp_fib 32 down to ~ 50 with the default wait policy, of course for OMP_WAIT_POLICY=passive it remained roughly the same). Any comments? Can anyone see meassurable differences on some benchmark? 2011-07-15 Jakub Jelinek <ja...@redhat.com> * config/linux/wait.h (do_spin): New inline, largely copied from do_wait, just don't do futex_wait here, instead return true if it should be done. (do_wait): Implement using do_spin. * config/linux/mutex.h (gomp_mutex_lock_slow): Add an int argument to prototype. (gomp_mutex_lock): Use __sync_val_compare_and_swap instead of __sync_bool_compare_and_swap, pass the oldval to gomp_mutex_lock_slow. * config/linux/mutex.c (gomp_mutex_lock_slow): Add oldval argument. If all mutex contenders are just spinning and not sleeping, don't change state to 2 unnecessarily. Optimize the loop when state has already become 2 to use just one atomic operation per loop instead of two. * config/linux/ia64/mutex.h (gomp_mutex_lock_slow): Add an int argument to prototype. (gomp_mutex_lock): Use __sync_val_compare_and_swap instead of __sync_bool_compare_and_swap, pass the oldval to gomp_mutex_lock_slow. --- libgomp/config/linux/wait.h.jj 2011-02-15 15:26:05.000000000 +0100 +++ libgomp/config/linux/wait.h 2011-07-15 09:56:14.000000000 +0200 @@ -44,7 +44,7 @@ extern long int gomp_futex_wait, gomp_fu #include <futex.h> -static inline void do_wait (int *addr, int val) +static inline int do_spin (int *addr, int val) { unsigned long long i, count = gomp_spin_count_var; @@ -52,10 +52,16 @@ static inline void do_wait (int *addr, i count = gomp_throttled_spin_count_var; for (i = 0; i < count; i++) if (__builtin_expect (*addr != val, 0)) - return; + return 0; else cpu_relax (); - futex_wait (addr, val); + return 1; +} + +static inline void do_wait (int *addr, int val) +{ + if (do_spin (addr, val)) + futex_wait (addr, val); } #ifdef HAVE_ATTRIBUTE_VISIBILITY --- libgomp/config/linux/mutex.h.jj 2009-08-05 11:47:08.000000000 +0200 +++ libgomp/config/linux/mutex.h 2011-07-15 09:29:04.000000000 +0200 @@ -1,4 +1,4 @@ -/* Copyright (C) 2005, 2009 Free Software Foundation, Inc. +/* Copyright (C) 2005, 2009, 2011 Free Software Foundation, Inc. Contributed by Richard Henderson <r...@redhat.com>. This file is part of the GNU OpenMP Library (libgomp). @@ -38,11 +38,12 @@ static inline void gomp_mutex_init (gomp *mutex = 0; } -extern void gomp_mutex_lock_slow (gomp_mutex_t *mutex); +extern void gomp_mutex_lock_slow (gomp_mutex_t *mutex, int); static inline void gomp_mutex_lock (gomp_mutex_t *mutex) { - if (!__sync_bool_compare_and_swap (mutex, 0, 1)) - gomp_mutex_lock_slow (mutex); + int oldval = __sync_val_compare_and_swap (mutex, 0, 1); + if (__builtin_expect (oldval, 0)) + gomp_mutex_lock_slow (mutex, oldval); } extern void gomp_mutex_unlock_slow (gomp_mutex_t *mutex); --- libgomp/config/linux/mutex.c.jj 2009-04-14 16:33:07.000000000 +0200 +++ libgomp/config/linux/mutex.c 2011-07-15 10:18:21.000000000 +0200 @@ -1,4 +1,4 @@ -/* Copyright (C) 2005, 2008, 2009 Free Software Foundation, Inc. +/* Copyright (C) 2005, 2008, 2009, 2011 Free Software Foundation, Inc. Contributed by Richard Henderson <r...@redhat.com>. This file is part of the GNU OpenMP Library (libgomp). @@ -32,15 +32,27 @@ long int gomp_futex_wake = FUTEX_WAKE | long int gomp_futex_wait = FUTEX_WAIT | FUTEX_PRIVATE_FLAG; void -gomp_mutex_lock_slow (gomp_mutex_t *mutex) +gomp_mutex_lock_slow (gomp_mutex_t *mutex, int oldval) { - do + while (oldval == 1) { - int oldval = __sync_val_compare_and_swap (mutex, 1, 2); - if (oldval != 0) - do_wait (mutex, 2); + if (do_spin (mutex, 1)) + { + oldval = __sync_lock_test_and_set (mutex, 2); + if (oldval == 0) + return; + futex_wait (mutex, 2); + break; + } + else + { + oldval = __sync_val_compare_and_swap (mutex, 0, 1); + if (oldval == 0) + return; + } } - while (!__sync_bool_compare_and_swap (mutex, 0, 2)); + while ((oldval = __sync_lock_test_and_set (mutex, 2))) + do_wait (mutex, 2); } void --- libgomp/config/linux/ia64/mutex.h.jj 2009-04-14 16:33:07.000000000 +0200 +++ libgomp/config/linux/ia64/mutex.h 2011-07-15 10:39:25.000000000 +0200 @@ -1,4 +1,4 @@ -/* Copyright (C) 2005, 2008, 2009 Free Software Foundation, Inc. +/* Copyright (C) 2005, 2008, 2009, 2011 Free Software Foundation, Inc. Contributed by Richard Henderson <r...@redhat.com>. This file is part of the GNU OpenMP Library (libgomp). @@ -38,11 +38,12 @@ static inline void gomp_mutex_init (gomp *mutex = 0; } -extern void gomp_mutex_lock_slow (gomp_mutex_t *mutex); +extern void gomp_mutex_lock_slow (gomp_mutex_t *mutex, int); static inline void gomp_mutex_lock (gomp_mutex_t *mutex) { - if (!__sync_bool_compare_and_swap (mutex, 0, 1)) - gomp_mutex_lock_slow (mutex); + int oldval = __sync_val_compare_and_swap (mutex, 0, 1); + if (__builtin_expect (oldval, 0)) + gomp_mutex_lock_slow (mutex, oldval); } extern void gomp_mutex_unlock_slow (gomp_mutex_t *mutex); Jakub