Paolo Bonzini <pbonz...@redhat.com> writes:

> We're already using them in several places, but __sync builtins are just
> too ugly to type, and do not provide seqcst load/store operations.
>
> Signed-off-by: Paolo Bonzini <pbonz...@redhat.com>
> ---
>  docs/atomics.txt         | 345 +++++++++++++++++++++++++++++++++++++++++++++++
>  hw/display/qxl.c         |   3 +-
>  hw/virtio/vhost.c        |   9 +-
>  include/qemu/atomic.h    | 190 +++++++++++++++++++++-----
>  migration.c              |   3 +-
>  tests/test-thread-pool.c |   8 +-
>  6 files changed, 514 insertions(+), 44 deletions(-)
>  create mode 100644 docs/atomics.txt
>
> diff --git a/docs/atomics.txt b/docs/atomics.txt
> new file mode 100644
> index 0000000..e2ce04b
> --- /dev/null
> +++ b/docs/atomics.txt
Really nice write-up!

> diff --git a/hw/display/qxl.c b/hw/display/qxl.c
> index 3862d7a..f24cb4e 100644
> --- a/hw/display/qxl.c
> +++ b/hw/display/qxl.c
> @@ -23,6 +23,7 @@
>  #include "qemu-common.h"
>  #include "qemu/timer.h"
>  #include "qemu/queue.h"
> +#include "qemu/atomic.h"
>  #include "monitor/monitor.h"
>  #include "sysemu/sysemu.h"
>  #include "trace.h"
> @@ -1726,7 +1727,7 @@ static void qxl_send_events(PCIQXLDevice *d, uint32_t events)
>          trace_qxl_send_events_vm_stopped(d->id, events);
>          return;
>      }
> -    old_pending = __sync_fetch_and_or(&d->ram->int_pending, le_events);
> +    old_pending = atomic_or(&d->ram->int_pending, le_events);
>      if ((old_pending & le_events) == le_events) {
>          return;
>      }
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index 96ab625..8f6ab13 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -16,6 +16,7 @@
>  #include <sys/ioctl.h>
>  #include "hw/virtio/vhost.h"
>  #include "hw/hw.h"
> +#include "qemu/atomic.h"
>  #include "qemu/range.h"
>  #include <linux/vhost.h>
>  #include "exec/address-spaces.h"
> @@ -47,11 +48,9 @@ static void vhost_dev_sync_region(struct vhost_dev *dev,
>              addr += VHOST_LOG_CHUNK;
>              continue;
>          }
> -        /* Data must be read atomically. We don't really
> -         * need the barrier semantics of __sync
> -         * builtins, but it's easier to use them than
> -         * roll our own. */
> -        log = __sync_fetch_and_and(from, 0);
> +        /* Data must be read atomically. We don't really need barrier semantics
> +         * but it's easier to use atomic_* than roll our own. */
> +        log = atomic_xchg(from, 0);
>          while ((bit = sizeof(log) > sizeof(int) ?
>                  ffsll(log) : ffs(log))) {
>              hwaddr page_addr;
> diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h
> index 10becb6..04d64d0 100644
> --- a/include/qemu/atomic.h
> +++ b/include/qemu/atomic.h
> @@ -1,68 +1,194 @@
> -#ifndef __QEMU_BARRIER_H
> -#define __QEMU_BARRIER_H 1
> +/*
> + * Simple interface for atomic operations.
> + *
> + * Copyright (C) 2013 Red Hat, Inc.
> + *
> + * Author: Paolo Bonzini <pbonz...@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
>
> -/* Compiler barrier */
> -#define barrier() asm volatile("" ::: "memory")
> +#ifndef __QEMU_ATOMIC_H
> +#define __QEMU_ATOMIC_H 1
>
> -#if defined(__i386__)
> +#include "qemu/compiler.h"
>
> -#include "qemu/compiler.h" /* QEMU_GNUC_PREREQ */
> +/* For C11 atomic ops */
>
> -/*
> - * Because of the strongly ordered x86 storage model, wmb() and rmb() are nops
> - * on x86(well, a compiler barrier only). Well, at least as long as
> - * qemu doesn't do accesses to write-combining memory or non-temporal
> - * load/stores from C code.
> - */
> -#define smp_wmb() barrier()
> -#define smp_rmb() barrier()
> +/* Compiler barrier */
> +#define barrier() ({ asm volatile("" ::: "memory"); (void)0; })
> +
> +#ifndef __ATOMIC_RELAXED
>
>  /*
> - * We use GCC builtin if it's available, as that can use
> - * mfence on 32 bit as well, e.g. if built with -march=pentium-m.
> - * However, on i386, there seem to be known bugs as recently as 4.3.
> - * */
> -#if QEMU_GNUC_PREREQ(4, 4)
> -#define smp_mb() __sync_synchronize()
> + * We use GCC builtin if it's available, as that can use mfence on
> + * 32-bit as well, e.g. if built with -march=pentium-m. However, on
> + * i386 the spec is buggy, and the implementation followed it until 4.3 (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36793).
> + */
> +#if defined(__i386__) || defined(__x86_64__)
> +#if !QEMU_GNUC_PREREQ(4, 4)
> +#if defined __x86_64__
> +#define smp_mb() ({ asm volatile("mfence" ::: "memory"); (void)0; })
>  #else
> -#define smp_mb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
> +#define smp_mb() ({ asm volatile("lock; addl $0,0(%%esp) " ::: "memory"); (void)0; })
> +#endif
> +#endif
>  #endif
>
> -#elif defined(__x86_64__)
>
> +#ifdef __alpha__
> +#define smp_read_barrier_depends() asm volatile("mb":::"memory")
> +#endif
> +
> +#if defined(__i386__) || defined(__x86_64__) || defined(__s390x__)
> +
> +/*
> + * Because of the strongly ordered storage model, wmb() and rmb() are nops
> + * here (a compiler barrier only). QEMU doesn't do accesses to write-combining
> + * qemu memory or non-temporal load/stores from C code.

Tiny copy/paste error here: s/qemu memory/memory/

One thing I've been thinking about while reviewing this code: what should we
be doing in virtio.c?  We have barriers, but we're relying on st[u][wlb]_phys
having atomic semantics.  I think it's okay in practice, but if we're taking a
more diligent approach here, should we introduce atomic variants that work on
guest phys addresses?

Otherwise:

Reviewed-by: Anthony Liguori <aligu...@us.ibm.com>

Regards,

Anthony Liguori

> + */
>  #define smp_wmb() barrier()
>  #define smp_rmb() barrier()
> -#define smp_mb() asm volatile("mfence" ::: "memory")
> +
> +/*
> + * __sync_lock_test_and_set() is documented to be an acquire barrier only,
> + * but it is a full barrier at the hardware level. Add a compiler barrier
> + * to make it a full barrier also at the compiler level.
> + */
> +#define atomic_xchg(ptr, i) (barrier(), __sync_lock_test_and_set(ptr, i))
> +
> +/*
> + * Load/store with Java volatile semantics.
> + */
> +#define atomic_mb_set(ptr, i) ((void)atomic_xchg(ptr, i))
>
>  #elif defined(_ARCH_PPC)
>
>  /*
>   * We use an eieio() for wmb() on powerpc. This assumes we don't
>   * need to order cacheable and non-cacheable stores with respect to
> - * each other
> + * each other.
> + *
> + * smp_mb has the same problem as on x86 for not-very-new GCC
> + * (http://patchwork.ozlabs.org/patch/126184/, Nov 2011).
>   */
> -#define smp_wmb() asm volatile("eieio" ::: "memory")
> -
> +#define smp_wmb() ({ asm volatile("eieio" ::: "memory"); (void)0; })
>  #if defined(__powerpc64__)
> -#define smp_rmb() asm volatile("lwsync" ::: "memory")
> +#define smp_rmb() ({ asm volatile("lwsync" ::: "memory"); (void)0; })
>  #else
> -#define smp_rmb() asm volatile("sync" ::: "memory")
> +#define smp_rmb() ({ asm volatile("sync" ::: "memory"); (void)0; })
>  #endif
> +#define smp_mb() ({ asm volatile("sync" ::: "memory"); (void)0; })
>
> -#define smp_mb() asm volatile("sync" ::: "memory")
> +#endif /* _ARCH_PPC */
>
> -#else
> +#endif /* C11 atomics */
>
>  /*
>   * For (host) platforms we don't have explicit barrier definitions
>   * for, we use the gcc __sync_synchronize() primitive to generate a
>   * full barrier. This should be safe on all platforms, though it may
> - * be overkill for wmb() and rmb().
> + * be overkill for smp_wmb() and smp_rmb().
> */ > +#ifndef smp_mb > +#define smp_mb() __sync_synchronize() > +#endif > + > +#ifndef smp_wmb > +#ifdef __ATOMIC_RELEASE > +#define smp_wmb() __atomic_thread_fence(__ATOMIC_RELEASE) > +#else > #define smp_wmb() __sync_synchronize() > -#define smp_mb() __sync_synchronize() > +#endif > +#endif > + > +#ifndef smp_rmb > +#ifdef __ATOMIC_ACQUIRE > +#define smp_rmb() __atomic_thread_fence(__ATOMIC_ACQUIRE) > +#else > #define smp_rmb() __sync_synchronize() > +#endif > +#endif > + > +#ifndef smp_read_barrier_depends > +#ifdef __ATOMIC_CONSUME > +#define smp_read_barrier_depends() __atomic_thread_fence(__ATOMIC_CONSUME) > +#else > +#define smp_read_barrier_depends() barrier() > +#endif > +#endif > > +#ifndef atomic_read > +#define atomic_read(ptr) (*(__typeof__(*ptr) *volatile) (ptr)) > #endif > > +#ifndef atomic_set > +#define atomic_set(ptr, i) ((*(__typeof__(*ptr) *volatile) (ptr)) = (i)) > +#endif > + > +/* These have the same semantics as Java volatile variables. > + * See http://gee.cs.oswego.edu/dl/jmm/cookbook.html: > + * "1. Issue a StoreStore barrier (wmb) before each volatile store." > + * 2. Issue a StoreLoad barrier after each volatile store. > + * Note that you could instead issue one before each volatile load, but > + * this would be slower for typical programs using volatiles in which > + * reads greatly outnumber writes. Alternatively, if available, you > + * can implement volatile store as an atomic instruction (for example > + * XCHG on x86) and omit the barrier. This may be more efficient if > + * atomic instructions are cheaper than StoreLoad barriers. > + * 3. Issue LoadLoad and LoadStore barriers after each volatile load." > + * > + * If you prefer to think in terms of "pairing" of memory barriers, > + * an atomic_mb_read pairs with an atomic_mb_set. > + * > + * And for the few ia64 lovers that exist, an atomic_mb_read is a ld.acq, > + * while an atomic_mb_set is a st.rel followed by a memory barrier. > + * > + * These are a bit weaker than __atomic_load/store with __ATOMIC_SEQ_CST > + * (see docs/atomics.txt), and I'm not sure that __ATOMIC_ACQ_REL is enough. > + * Just always use the barriers manually by the rules above. > + */ > +#ifndef atomic_mb_read > +#define atomic_mb_read(ptr) ({ \ > + typeof(*ptr) _val = atomic_read(ptr); \ > + smp_rmb(); \ > + _val; \ > +}) > +#endif > + > +#ifndef atomic_mb_set > +#define atomic_mb_set(ptr, i) do { \ > + smp_wmb(); \ > + atomic_set(ptr, i); \ > + smp_mb(); \ > +} while (0) > +#endif > + > +#ifndef atomic_xchg > +#ifdef __ATOMIC_SEQ_CST > +#define atomic_xchg(ptr, i) ({ \ > + typeof(*ptr) _new = (i), _old; \ > + __atomic_exchange(ptr, &_new, &_old, __ATOMIC_SEQ_CST); \ > + _old; \ > +}) > +#elif defined __clang__ > +#define atomic_xchg(ptr, i) __sync_exchange(ptr, i) > +#else > +/* __sync_lock_test_and_set() is documented to be an acquire barrier only. > */ > +#define atomic_xchg(ptr, i) (smp_mb(), __sync_lock_test_and_set(ptr, i)) > +#endif > +#endif > + > +/* Provide shorter names for GCC atomic builtins. 
> +#define atomic_inc(ptr) __sync_fetch_and_add(ptr, 1)
> +#define atomic_dec(ptr) __sync_fetch_and_add(ptr, -1)
> +#define atomic_add __sync_fetch_and_add
> +#define atomic_sub __sync_fetch_and_sub
> +#define atomic_and __sync_fetch_and_and
> +#define atomic_or __sync_fetch_and_or
> +#define atomic_cmpxchg __sync_val_compare_and_swap
> +
>  #endif
> diff --git a/migration.c b/migration.c
> index 058f9e6..83f5691 100644
> --- a/migration.c
> +++ b/migration.c
> @@ -290,8 +290,7 @@ static void migrate_fd_cleanup(void *opaque)
>
>  static void migrate_finish_set_state(MigrationState *s, int new_state)
>  {
> -    if (__sync_val_compare_and_swap(&s->state, MIG_STATE_ACTIVE,
> -                                    new_state) == new_state) {
> +    if (atomic_cmpxchg(&s->state, MIG_STATE_ACTIVE, new_state) == new_state) {
>          trace_migrate_set_state(new_state);
>      }
>  }
> diff --git a/tests/test-thread-pool.c b/tests/test-thread-pool.c
> index 22915aa..dbf2fc8 100644
> --- a/tests/test-thread-pool.c
> +++ b/tests/test-thread-pool.c
> @@ -17,15 +17,15 @@ typedef struct {
>  static int worker_cb(void *opaque)
>  {
>      WorkerTestData *data = opaque;
> -    return __sync_fetch_and_add(&data->n, 1);
> +    return atomic_inc(&data->n);
>  }
>
>  static int long_cb(void *opaque)
>  {
>      WorkerTestData *data = opaque;
> -    __sync_fetch_and_add(&data->n, 1);
> +    atomic_inc(&data->n);
>      g_usleep(2000000);
> -    __sync_fetch_and_add(&data->n, 1);
> +    atomic_inc(&data->n);
>      return 0;
>  }
>
> @@ -169,7 +169,7 @@ static void test_cancel(void)
>      /* Cancel the jobs that haven't been started yet. */
>      num_canceled = 0;
>      for (i = 0; i < 100; i++) {
> -        if (__sync_val_compare_and_swap(&data[i].n, 0, 3) == 0) {
> +        if (atomic_cmpxchg(&data[i].n, 0, 3) == 0) {
>              data[i].ret = -ECANCELED;
>              bdrv_aio_cancel(data[i].aiocb);
>              active--;
> --
> 1.8.1.4
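
P.S. To make sure I'm reading the "pairing" rule in the new atomic.h comment
the way it's intended, here is a minimal sketch against the macros as posted
in this series.  The data/ready variables and the two helpers below are made
up purely for illustration, not something in the tree:

    #include "qemu/atomic.h"

    static int data;
    static int ready;

    static void producer(void)
    {
        data = 42;                 /* plain store                          */
        atomic_mb_set(&ready, 1);  /* wmb before the flag store, full mb
                                    * (or xchg on x86) after it            */
    }

    static int consumer(void)
    {
        if (atomic_mb_read(&ready)) {  /* flag load followed by rmb        */
            return data;               /* if the flag is seen, so is 42    */
        }
        return -1;
    }

In other words, if the consumer observes the flag set by atomic_mb_set(), it
is also guaranteed to observe the plain store that preceded it, which is the
StoreStore-before / StoreLoad-after recipe from the Java volatile cookbook
quoted in the header.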