Paolo Bonzini <pbonz...@redhat.com> writes:

> We're already using them in several places, but __sync builtins are just
> too ugly to type, and do not provide seqcst load/store operations.
>
> Signed-off-by: Paolo Bonzini <pbonz...@redhat.com>
> ---
>  docs/atomics.txt         | 345 +++++++++++++++++++++++++++++++++++++++++++++++
>  hw/display/qxl.c         |   3 +-
>  hw/virtio/vhost.c        |   9 +-
>  include/qemu/atomic.h    | 190 +++++++++++++++++++++-----
>  migration.c              |   3 +-
>  tests/test-thread-pool.c |   8 +-
>  6 files changed, 514 insertions(+), 44 deletions(-)
>  create mode 100644 docs/atomics.txt
>
> diff --git a/docs/atomics.txt b/docs/atomics.txt
> new file mode 100644
> index 0000000..e2ce04b
> --- /dev/null
> +++ b/docs/atomics.txt
Really nice write-up!

> diff --git a/hw/display/qxl.c b/hw/display/qxl.c
> index 3862d7a..f24cb4e 100644
> --- a/hw/display/qxl.c
> +++ b/hw/display/qxl.c
> @@ -23,6 +23,7 @@
>  #include "qemu-common.h"
>  #include "qemu/timer.h"
>  #include "qemu/queue.h"
> +#include "qemu/atomic.h"
>  #include "monitor/monitor.h"
>  #include "sysemu/sysemu.h"
>  #include "trace.h"
> @@ -1726,7 +1727,7 @@ static void qxl_send_events(PCIQXLDevice *d, uint32_t events)
>          trace_qxl_send_events_vm_stopped(d->id, events);
>          return;
>      }
> -    old_pending = __sync_fetch_and_or(&d->ram->int_pending, le_events);
> +    old_pending = atomic_or(&d->ram->int_pending, le_events);
>      if ((old_pending & le_events) == le_events) {
>          return;
>      }
> diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
> index 96ab625..8f6ab13 100644
> --- a/hw/virtio/vhost.c
> +++ b/hw/virtio/vhost.c
> @@ -16,6 +16,7 @@
>  #include <sys/ioctl.h>
>  #include "hw/virtio/vhost.h"
>  #include "hw/hw.h"
> +#include "qemu/atomic.h"
>  #include "qemu/range.h"
>  #include <linux/vhost.h>
>  #include "exec/address-spaces.h"
> @@ -47,11 +48,9 @@ static void vhost_dev_sync_region(struct vhost_dev *dev,
>              addr += VHOST_LOG_CHUNK;
>              continue;
>          }
> -        /* Data must be read atomically. We don't really
> -         * need the barrier semantics of __sync
> -         * builtins, but it's easier to use them than
> -         * roll our own. */
> -        log = __sync_fetch_and_and(from, 0);
> +        /* Data must be read atomically. We don't really need barrier semantics
> +         * but it's easier to use atomic_* than roll our own. */
> +        log = atomic_xchg(from, 0);
>          while ((bit = sizeof(log) > sizeof(int) ?
>                  ffsll(log) : ffs(log))) {
>              hwaddr page_addr;
> diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h
> index 10becb6..04d64d0 100644
> --- a/include/qemu/atomic.h
> +++ b/include/qemu/atomic.h
> @@ -1,68 +1,194 @@
> -#ifndef __QEMU_BARRIER_H
> -#define __QEMU_BARRIER_H 1
> +/*
> + * Simple interface for atomic operations.
> + *
> + * Copyright (C) 2013 Red Hat, Inc.
> + *
> + * Author: Paolo Bonzini <pbonz...@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
>
> -/* Compiler barrier */
> -#define barrier() asm volatile("" ::: "memory")
> +#ifndef __QEMU_ATOMIC_H
> +#define __QEMU_ATOMIC_H 1
>
> -#if defined(__i386__)
> +#include "qemu/compiler.h"
>
> -#include "qemu/compiler.h" /* QEMU_GNUC_PREREQ */
> +/* For C11 atomic ops */
>
> -/*
> - * Because of the strongly ordered x86 storage model, wmb() and rmb() are nops
> - * on x86(well, a compiler barrier only). Well, at least as long as
> - * qemu doesn't do accesses to write-combining memory or non-temporal
> - * load/stores from C code.
> - */
> -#define smp_wmb() barrier()
> -#define smp_rmb() barrier()
> +/* Compiler barrier */
> +#define barrier() ({ asm volatile("" ::: "memory"); (void)0; })
> +
> +#ifndef __ATOMIC_RELAXED
>
>  /*
> - * We use GCC builtin if it's available, as that can use
> - * mfence on 32 bit as well, e.g. if built with -march=pentium-m.
> - * However, on i386, there seem to be known bugs as recently as 4.3.
> - * */
> -#if QEMU_GNUC_PREREQ(4, 4)
> -#define smp_mb() __sync_synchronize()
> + * We use GCC builtin if it's available, as that can use mfence on
> + * 32-bit as well, e.g. if built with -march=pentium-m. However, on
> + * i386 the spec is buggy, and the implementation followed it until 4.3 (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36793).
> + */
> +#if defined(__i386__) || defined(__x86_64__)
> +#if !QEMU_GNUC_PREREQ(4, 4)
> +#if defined __x86_64__
> +#define smp_mb() ({ asm volatile("mfence" ::: "memory"); (void)0; })
>  #else
> -#define smp_mb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
> +#define smp_mb() ({ asm volatile("lock; addl $0,0(%%esp) " ::: "memory"); (void)0; })
> +#endif
> +#endif
>  #endif
>
> -#elif defined(__x86_64__)
>
> +#ifdef __alpha__
> +#define smp_read_barrier_depends() asm volatile("mb":::"memory")
> +#endif
> +
> +#if defined(__i386__) || defined(__x86_64__) || defined(__s390x__)
> +
> +/*
> + * Because of the strongly ordered storage model, wmb() and rmb() are nops
> + * here (a compiler barrier only). QEMU doesn't do accesses to write-combining
> + * qemu memory or non-temporal load/stores from C code.

Tiny copy/paste error here: s/qemu memory/memory/

One thing I've been thinking about while reviewing this code: what should we
be doing in virtio.c?  We have barriers, but we're relying on st[u][wlb]_phys
having atomic semantics.  I think it's okay in practice, but if we're taking a
more diligent approach here, should we introduce atomic variants that work on
guest phys addresses?

Otherwise:

Reviewed-by: Anthony Liguori <aligu...@us.ibm.com>

Regards,

Anthony Liguori

> + */
>  #define smp_wmb() barrier()
>  #define smp_rmb() barrier()
> -#define smp_mb() asm volatile("mfence" ::: "memory")
> +
> +/*
> + * __sync_lock_test_and_set() is documented to be an acquire barrier only,
> + * but it is a full barrier at the hardware level. Add a compiler barrier
> + * to make it a full barrier also at the compiler level.
> + */
> +#define atomic_xchg(ptr, i) (barrier(), __sync_lock_test_and_set(ptr, i))
> +
> +/*
> + * Load/store with Java volatile semantics.
> + */
> +#define atomic_mb_set(ptr, i) ((void)atomic_xchg(ptr, i))
>
>  #elif defined(_ARCH_PPC)
>
>  /*
>   * We use an eieio() for wmb() on powerpc. This assumes we don't
>   * need to order cacheable and non-cacheable stores with respect to
> - * each other
> + * each other.
> + *
> + * smp_mb has the same problem as on x86 for not-very-new GCC
> + * (http://patchwork.ozlabs.org/patch/126184/, Nov 2011).
>   */
> -#define smp_wmb() asm volatile("eieio" ::: "memory")
> -
> +#define smp_wmb() ({ asm volatile("eieio" ::: "memory"); (void)0; })
>  #if defined(__powerpc64__)
> -#define smp_rmb() asm volatile("lwsync" ::: "memory")
> +#define smp_rmb() ({ asm volatile("lwsync" ::: "memory"); (void)0; })
>  #else
> -#define smp_rmb() asm volatile("sync" ::: "memory")
> +#define smp_rmb() ({ asm volatile("sync" ::: "memory"); (void)0; })
>  #endif
> +#define smp_mb() ({ asm volatile("sync" ::: "memory"); (void)0; })
>
> -#define smp_mb() asm volatile("sync" ::: "memory")
> +#endif /* _ARCH_PPC */
>
> -#else
> +#endif /* C11 atomics */
>
>  /*
>   * For (host) platforms we don't have explicit barrier definitions
>   * for, we use the gcc __sync_synchronize() primitive to generate a
>   * full barrier. This should be safe on all platforms, though it may
> - * be overkill for wmb() and rmb().
> + * be overkill for smp_wmb() and smp_rmb().
> */ > +#ifndef smp_mb > +#define smp_mb() __sync_synchronize() > +#endif > + > +#ifndef smp_wmb > +#ifdef __ATOMIC_RELEASE > +#define smp_wmb() __atomic_thread_fence(__ATOMIC_RELEASE) > +#else > #define smp_wmb() __sync_synchronize() > -#define smp_mb() __sync_synchronize() > +#endif > +#endif > + > +#ifndef smp_rmb > +#ifdef __ATOMIC_ACQUIRE > +#define smp_rmb() __atomic_thread_fence(__ATOMIC_ACQUIRE) > +#else > #define smp_rmb() __sync_synchronize() > +#endif > +#endif > + > +#ifndef smp_read_barrier_depends > +#ifdef __ATOMIC_CONSUME > +#define smp_read_barrier_depends() __atomic_thread_fence(__ATOMIC_CONSUME) > +#else > +#define smp_read_barrier_depends() barrier() > +#endif > +#endif > > +#ifndef atomic_read > +#define atomic_read(ptr) (*(__typeof__(*ptr) *volatile) (ptr)) > #endif > > +#ifndef atomic_set > +#define atomic_set(ptr, i) ((*(__typeof__(*ptr) *volatile) (ptr)) = (i)) > +#endif > + > +/* These have the same semantics as Java volatile variables. > + * See http://gee.cs.oswego.edu/dl/jmm/cookbook.html: > + * "1. Issue a StoreStore barrier (wmb) before each volatile store." > + * 2. Issue a StoreLoad barrier after each volatile store. > + * Note that you could instead issue one before each volatile load, but > + * this would be slower for typical programs using volatiles in which > + * reads greatly outnumber writes. Alternatively, if available, you > + * can implement volatile store as an atomic instruction (for example > + * XCHG on x86) and omit the barrier. This may be more efficient if > + * atomic instructions are cheaper than StoreLoad barriers. > + * 3. Issue LoadLoad and LoadStore barriers after each volatile load." > + * > + * If you prefer to think in terms of "pairing" of memory barriers, > + * an atomic_mb_read pairs with an atomic_mb_set. > + * > + * And for the few ia64 lovers that exist, an atomic_mb_read is a ld.acq, > + * while an atomic_mb_set is a st.rel followed by a memory barrier. > + * > + * These are a bit weaker than __atomic_load/store with __ATOMIC_SEQ_CST > + * (see docs/atomics.txt), and I'm not sure that __ATOMIC_ACQ_REL is enough. > + * Just always use the barriers manually by the rules above. > + */ > +#ifndef atomic_mb_read > +#define atomic_mb_read(ptr) ({ \ > + typeof(*ptr) _val = atomic_read(ptr); \ > + smp_rmb(); \ > + _val; \ > +}) > +#endif > + > +#ifndef atomic_mb_set > +#define atomic_mb_set(ptr, i) do { \ > + smp_wmb(); \ > + atomic_set(ptr, i); \ > + smp_mb(); \ > +} while (0) > +#endif > + > +#ifndef atomic_xchg > +#ifdef __ATOMIC_SEQ_CST > +#define atomic_xchg(ptr, i) ({ \ > + typeof(*ptr) _new = (i), _old; \ > + __atomic_exchange(ptr, &_new, &_old, __ATOMIC_SEQ_CST); \ > + _old; \ > +}) > +#elif defined __clang__ > +#define atomic_xchg(ptr, i) __sync_exchange(ptr, i) > +#else > +/* __sync_lock_test_and_set() is documented to be an acquire barrier only. > */ > +#define atomic_xchg(ptr, i) (smp_mb(), __sync_lock_test_and_set(ptr, i)) > +#endif > +#endif > + > +/* Provide shorter names for GCC atomic builtins. 
> +#define atomic_inc(ptr) __sync_fetch_and_add(ptr, 1)
> +#define atomic_dec(ptr) __sync_fetch_and_add(ptr, -1)
> +#define atomic_add __sync_fetch_and_add
> +#define atomic_sub __sync_fetch_and_sub
> +#define atomic_and __sync_fetch_and_and
> +#define atomic_or __sync_fetch_and_or
> +#define atomic_cmpxchg __sync_val_compare_and_swap
> +
>  #endif
> diff --git a/migration.c b/migration.c
> index 058f9e6..83f5691 100644
> --- a/migration.c
> +++ b/migration.c
> @@ -290,8 +290,7 @@ static void migrate_fd_cleanup(void *opaque)
>
>  static void migrate_finish_set_state(MigrationState *s, int new_state)
>  {
> -    if (__sync_val_compare_and_swap(&s->state, MIG_STATE_ACTIVE,
> -                                    new_state) == new_state) {
> +    if (atomic_cmpxchg(&s->state, MIG_STATE_ACTIVE, new_state) == new_state) {
>          trace_migrate_set_state(new_state);
>      }
>  }
> diff --git a/tests/test-thread-pool.c b/tests/test-thread-pool.c
> index 22915aa..dbf2fc8 100644
> --- a/tests/test-thread-pool.c
> +++ b/tests/test-thread-pool.c
> @@ -17,15 +17,15 @@ typedef struct {
>  static int worker_cb(void *opaque)
>  {
>      WorkerTestData *data = opaque;
> -    return __sync_fetch_and_add(&data->n, 1);
> +    return atomic_inc(&data->n);
>  }
>
>  static int long_cb(void *opaque)
>  {
>      WorkerTestData *data = opaque;
> -    __sync_fetch_and_add(&data->n, 1);
> +    atomic_inc(&data->n);
>      g_usleep(2000000);
> -    __sync_fetch_and_add(&data->n, 1);
> +    atomic_inc(&data->n);
>      return 0;
>  }
>
> @@ -169,7 +169,7 @@ static void test_cancel(void)
>      /* Cancel the jobs that haven't been started yet. */
>      num_canceled = 0;
>      for (i = 0; i < 100; i++) {
> -        if (__sync_val_compare_and_swap(&data[i].n, 0, 3) == 0) {
> +        if (atomic_cmpxchg(&data[i].n, 0, 3) == 0) {
>              data[i].ret = -ECANCELED;
>              bdrv_aio_cancel(data[i].aiocb);
>              active--;
> --
> 1.8.1.4
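
P.S. To make sure I'm reading the "pairing" rule in the new atomic.h comment
the way it's intended, here is a minimal sketch against the macros as posted
in this series.  The data/ready variables and the two helpers below are made
up purely for illustration, not something in the tree:

    #include "qemu/atomic.h"

    static int data;
    static int ready;

    static void producer(void)
    {
        data = 42;                 /* plain store                          */
        atomic_mb_set(&ready, 1);  /* wmb before the flag store, full mb
                                    * (or xchg on x86) after it            */
    }

    static int consumer(void)
    {
        if (atomic_mb_read(&ready)) {  /* flag load followed by rmb        */
            return data;               /* if the flag is seen, so is 42    */
        }
        return -1;
    }

In other words, if the consumer observes the flag set by atomic_mb_set(), it
is also guaranteed to observe the plain store that preceded it, which is the
StoreStore-before / StoreLoad-after recipe from the Java volatile cookbook
quoted in the header.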