
/*
 * The basic principle of a queue-based spinlock can best be understood
 * by studying a classic queue-based spinlock implementation called the
 * MCS lock. The paper below provides a good description for this kind
 * of lock.
 *
 * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
 *
 * This queue spinlock implementation is based on the MCS lock with twists
 * to make it fit the following constraints:
 * 1. A max spinlock size of 4 bytes
 * 2. Good fastpath performance
 * 3. No change in the locking APIs
 *
 * The queue spinlock fastpath is as simple as it can get, all the heavy
 * lifting is done in the lock slowpath. The main idea behind this queue
 * spinlock implementation is to keep the spinlock size at 4 bytes while
 * at the same time implement a queue structure to queue up the waiting
 * lock spinners.
 *
 * Since preemption is disabled before getting the lock, a given CPU will
 * only need to use one queue node structure in a non-interrupt context.
 * A percpu queue node structure will be allocated for this purpose and the
 * cpu number will be put into the queue spinlock structure to indicate the
 * tail of the queue.
 */

#include "qspinlock.h"
#include "linux.h"

/*
 * Per-CPU queue node used while waiting in the lock slowpath.
 */
struct mcs_spinlock {
	/* Successor in the wait queue; NULL until a later waiter links itself. */
	struct mcs_spinlock *next;
	/* Cleared when the node is queued; set to 1 by the predecessor on hand-off. */
	int locked;
	/*
	 * Number of this CPU's nodes currently in use. Only the count of
	 * mcs_nodes[0] is read or modified by the slowpath.
	 */
	int count;
};

#ifndef arch_mcs_spin_lock_contended
/*
 * Default wait primitive; an architecture may provide its own definition.
 *
 * Poll the word at @l until it reads non-zero. Every poll is an
 * smp_load_acquire(), which provides the memory barrier that keeps
 * subsequent operations after the point where the lock was observed
 * as handed over. arch_mutex_cpu_relax() runs between polls.
 */
#define arch_mcs_spin_lock_contended(l)					\
do {									\
	for (;;) {							\
		if (smp_load_acquire(l))				\
			break;						\
		arch_mutex_cpu_relax();					\
	}								\
} while (0)
#endif

#ifndef arch_mcs_spin_unlock_contended
/*
 * Default hand-off primitive; an architecture may provide its own
 * definition.
 *
 * Stores 1 to the word at @l. smp_store_release() provides a memory
 * barrier to ensure all operations in the critical section have
 * completed before the store becomes visible.
 */
#define arch_mcs_spin_unlock_contended(l)				\
	smp_store_release((l), 1)
#endif

/*
 * Four queue nodes per CPU, selected by the idx field of the tail code.
 *
 * Exactly fills one cacheline on 64bit: each node is one pointer plus
 * two ints (16 bytes), so four nodes take 64 bytes.
 * NOTE(review): this assumes a 64-byte cacheline - confirm per arch.
 */
static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[4]);

static inline u32 encode_tail(int cpu, int idx)
{
	u32 code;

        code  = (cpu + 1) << _Q_TAIL_CPU_OFFSET;
	code |= idx << _Q_TAIL_IDX_OFFSET; /* assume < 4 */

	return code;
}

/**
 * decode_tail - map a tail code word back to its per-CPU queue node
 * @code: tail code as produced by encode_tail()
 *
 * The cpu field holds cpu + 1, so a zero cpu field would decode to
 * cpu -1; callers must only pass codes that carry a tail.
 *
 * NOTE(review): the index is shifted down before _Q_TAIL_IDX_MASK is
 * applied, so the mask is expected to be a low-bit (unshifted) mask -
 * confirm against qspinlock.h.
 *
 * Return: pointer to mcs_nodes[idx] of the encoded CPU.
 */
static inline struct mcs_spinlock *decode_tail(u32 code)
{
	int node_idx;
	int owner_cpu;

	node_idx = (code >> _Q_TAIL_IDX_OFFSET) & _Q_TAIL_IDX_MASK;
	owner_cpu = (code >> _Q_TAIL_CPU_OFFSET) - 1;

	return per_cpu_ptr(&mcs_nodes[node_idx], owner_cpu);
}

/* The pending bit, expressed as a mask within the lock word. */
#define _QSPINLOCK_PENDING	(1U << _Q_PENDING_OFFSET)
/*
 * Locked and pending bits together. The slowpath treats any bit outside
 * this mask as part of the queue tail code.
 */
#define _QSPINLOCK_MASK		(_QSPINLOCK_LOCKED | _QSPINLOCK_PENDING)

// Build-time switches:
//
// PENDING - enables the pending bit logic
// OPT     - removes one atomic op at the cost of making pending a byte
// OPT2    - replaces some cmpxchg loops with unconditional atomic ops
//
// Observations so far:
//
// PENDING looks to be a win on Intel, even with 2 atomic ops, but a loss on AMD.
// OPT is a full win.
// OPT2 somehow doesn't seem to make much difference!?
//

/**
 * queue_spin_lock_slowpath - acquire the queue spinlock
 * @lock: Pointer to queue spinlock structure
 * @val: Caller's view of lock->val on entry; used as the expected value
 *       of the first cmpxchg and refreshed whenever a cmpxchg fails
 *
 * Returns only once the lock is held by the caller.
 *
 * States below are written as (tail, pending, locked) triples.
 *
 *              fast      :    slow                                  :    unlock
 *                        :                                          :
 * uncontended  (0,0,0) --:--> (0,0,1) ------------------------------:--> (*,*,0)
 *                        :       | ^--------.------.             /  :
 *                        :       v           \      \            |  :
 * pending                :    (0,1,1) +--> (0,1,0)   \           |  :
 *                        :       | ^--'              |           |  :
 *                        :       v                   |           |  :
 * uncontended            :    (n,x,y) +--> (n,0,0) --'           |  :
 *   queue                :       | ^--'                          |  :
 *                        :       v                               |  :
 * contended              :    (*,x,y) +--> (*,0,0) ---> (*,0,1) -'  :
 *   queue                :         ^--'                             :
 *
 * The PENDING, OPT and OPT2 preprocessor switches select between
 * variants of the individual steps; without PENDING the function goes
 * straight to queueing.
 */
void queue_spin_lock_slowpath(struct qspinlock *lock, u32 val)
{
	struct mcs_spinlock *prev, *next, *node;
	u32 new, old, code;
	int idx;

#if PENDING
	/*
	 * trylock || pending
	 *
	 * 0,0,0 -> 0,0,1 ; trylock
	 * 0,0,1 -> 0,1,1 ; pending
	 */
#if !OPT2
	for (;;) {
		/*
		 * If we observe any contention; queue.
		 */
		if (val & ~_Q_LOCKED_MASK)
			goto queue;

		/*
		 * Lock free: try to take it. Lock held with nothing else
		 * set: try to become the pending waiter instead.
		 */
		new = _QSPINLOCK_LOCKED;
		if (val == new)
			new |= _QSPINLOCK_PENDING;

		old = atomic_cmpxchg(&lock->val, val, new);
		if (old == val)
			break;

		/* Lost the race; retry against the value actually found. */
		val = old;
	}

	/*
	 * we won the trylock
	 */
	if (new == _QSPINLOCK_LOCKED)
		return;

#else
	/*
	 * we can ignore the (unlikely) trylock case and have a fall-through on
	 * the wait below.
	 */
	if (val & ~_Q_LOCKED_MASK)
		goto queue;

	/*
	 * Unconditionally set the pending byte; if it was already set,
	 * somebody else is pending and we queue. The check above uses the
	 * caller-supplied val, which is not re-read before this xchg.
	 * NOTE(review): addressing pending as byte 1 of the lock word
	 * presumably relies on a little-endian layout - confirm.
	 */
	if (xchg(&(((u8 *)lock)[1]), 1))
		goto queue;

// could not observe a significant difference
// between the on (xchg) and the other (bts) unconditional
// LOCKed op
//
//	if (atomic_test_and_set_bit(_Q_PENDING_OFFSET, &lock->val))
//		goto queue;
#endif

	/*
	 * we're pending, wait for the owner to go away.
	 *
	 * *,1,1 -> *,1,0
	 */
	while ((val = atomic_read(&lock->val)) & _QSPINLOCK_LOCKED)
		cpu_relax();

	/*
	 * take ownership and clear the pending bit.
	 *
	 * *,1,0 -> *,0,1
	 */
#if !OPT
	for (;;) {
		new = (val & ~_QSPINLOCK_PENDING) | _QSPINLOCK_LOCKED;

		old = atomic_cmpxchg(&lock->val, val, new);
		if (old == val)
			break;

		val = old;
	}
#else
	/*
	 * Two plain byte stores instead of an atomic op: set locked first,
	 * then clear pending, with smp_wmb() ordering the two stores.
	 * NOTE(review): same byte-layout assumption as the pending xchg
	 * (byte 0 = locked, byte 1 = pending) - confirm.
	 */
	((u8 *)lock)[0] = 1; /* locked */
	smp_wmb();
	((u8 *)lock)[1] = 0; /* pending */

// there is a big difference between an atomic and
// no atomic op.
//
//	smp_mb__before_atomic_inc();
//	atomic_clear_bit(_Q_PENDING_OFFSET, &lock->val);
#endif

	return;

queue:
#endif
	/*
	 * Pick the next unused node of this CPU: the count kept in
	 * mcs_nodes[0] is both the index of that node and, after the
	 * increment, the number of nodes in use.
	 * NOTE(review): nothing here checks that idx stays below 4.
	 */
	node = this_cpu_ptr(&mcs_nodes[0]);
	idx = node->count++;
	code = encode_tail(smp_processor_id(), idx);

	node += idx;
	node->locked = 0;
	node->next = NULL;

	/*
	 * we already touched the queueing cacheline; don't bother with pending
	 * stuff.
	 *
	 * trylock || xchg(lock, node)
	 *
	 * 0,0,0 -> 0,0,1 ; trylock
	 * p,y,x -> n,y,x ; prev = xchg(lock, node)
	 */
#if !OPT2
	for (;;) {
		/*
		 * Lock word entirely zero: plain trylock. Otherwise install
		 * our code as the new tail, keeping locked/pending as found.
		 */
		new = _QSPINLOCK_LOCKED;
		if (val)
			new = code | (val & _QSPINLOCK_MASK);

		old = atomic_cmpxchg(&lock->val, val, new);
		if (old == val)
			break;

		val = old;
	}

	/*
	 * we won the trylock; forget about queueing.
	 */
	if (new == _QSPINLOCK_LOCKED)
		goto release;
#else
	/*
	 * Like with the pending case; we can ignore the unlikely trylock case
	 * and have a fall-through on the wait.
	 *
	 * Swap only the upper 16 bits of the lock word and shift the old
	 * halfword back into place, so old holds just the previous tail.
	 * NOTE(review): presumably assumes the tail code occupies the upper
	 * halfword and a little-endian layout - confirm.
	 */
	old = xchg(&((u16 *)lock)[1], code >> 16) << 16;
#endif

	/*
	 * if there was a previous node; link it and wait.
	 */
	if (old & ~_QSPINLOCK_MASK) {
		prev = decode_tail(old);
		ACCESS_ONCE(prev->next) = node;

		/* Spin on our own node until the predecessor hands over. */
		arch_mcs_spin_lock_contended(&node->locked);
	}

	/*
	 * we're at the head of the waitqueue, wait for the owner & pending to
	 * go away.
	 *
	 * *,x,y -> *,0,0
	 */
	while ((val = atomic_read(&lock->val)) & _QSPINLOCK_MASK)
		cpu_relax();

	/*
	 * claim the lock:
	 *
	 * n,0,0 -> 0,0,1 : lock, uncontended
	 * *,0,0 -> *,0,1 : lock, contended
	 */
	for (;;) {
		/*
		 * If the lock word is exactly our own code we are the last
		 * waiter and the tail is dropped; otherwise the bits found
		 * in val are kept.
		 */
		new = _QSPINLOCK_LOCKED;
		if (val != code)
			new |= val;

		old = atomic_cmpxchg(&lock->val, val, new);
		if (old == val)
			break;

		val = old;
	}

	/*
	 * contended path; wait for next, release.
	 */
	if (new != _QSPINLOCK_LOCKED) {
		/*
		 * A successor has published itself as tail but may not have
		 * written our ->next yet; wait for the link to appear.
		 */
		while (!(next = ACCESS_ONCE(node->next)))
			arch_mutex_cpu_relax();

		arch_mcs_spin_unlock_contended(&next->locked);
	}

	/*
	 * NOTE(review): with OPT2 set no goto targets this label, so it is
	 * unused in that configuration.
	 */
release:
	/*
	 * release the node
	 */
	this_cpu_ptr(&mcs_nodes[0])->count--;
//	this_cpu_dec(mcs_nodes[0].count);
}
EXPORT_SYMBOL(queue_spin_lock_slowpath);
