/*
 * Queue spinlock
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
 *
 * Authors: Waiman Long <waiman.long@hp.com>
 */

#include "qspinlock.h"
#include "linux.h"


/*
 * _QCODE_OFFSET is the bit position of the queue code inside the 32-bit
 * lock word (it is used below as the shift count when the queue code is
 * extracted from or inserted into the lock value).
 *
 * The CONFIG_NR_CPUS based selection is commented out, so this file never
 * defines _Q_MANY_CPUS itself and the offset is fixed at 16.
 */
// #if CONFIG_NR_CPUS >= (1 << 14)
// # define _Q_MANY_CPUS
// # define _QCODE_OFFSET 8
// #else
# define _QCODE_OFFSET 16
// #endif

/*
 * The basic principle of a queue-based spinlock can best be understood
 * by studying a classic queue-based spinlock implementation called the
 * MCS lock. The paper below provides a good description for this kind
 * of lock.
 *
 * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
 *
 * This queue spinlock implementation is based on the MCS lock with twists
 * to make it fit the following constraints:
 * 1. A max spinlock size of 4 bytes
 * 2. Good fastpath performance
 * 3. No change in the locking APIs
 *
 * The queue spinlock fastpath is as simple as it can get, all the heavy
 * lifting is done in the lock slowpath. The main idea behind this queue
 * spinlock implementation is to keep the spinlock size at 4 bytes while
 * at the same time implement a queue structure to queue up the waiting
 * lock spinners.
 *
 * Since preemption is disabled before getting the lock, a given CPU will
 * only need to use one queue node structure in a non-interrupt context.
 * A percpu queue node structure will be allocated for this purpose and the
 * cpu number will be put into the queue spinlock structure to indicate the
 * tail of the queue.
 *
 * To handle spinlock acquisition at interrupt context (softirq or hardirq),
 * the queue node structure is actually an array for supporting nested spin
 * locking operations in interrupt handlers. Using up all the entries in the
 * array shouldn't happen in normal circumstances; if it does, the slowpath
 * treats it as a fatal error (BUG_ON) instead of falling back to busy
 * spinning.
 */

/*
 * smp_u8_store_release - byte-sized store preceded by barrier()
 * @p: pointer to the byte to be written
 * @v: value to store
 *
 * barrier() is issued first and the value is then written through
 * ACCESS_ONCE(). The pointer argument is parenthesized before it is
 * dereferenced so that an expression such as "base + 1" can be passed
 * without "*" binding to only part of it.
 */
#define smp_u8_store_release(p, v)	\
do {					\
	barrier();			\
	ACCESS_ONCE(*(p)) = (v);	\
} while (0)

/*
 * As the qcode will be accessed as a 16-bit word, no offset is needed:
 * the queue code value handled by the encode/translate helpers starts at
 * bit 0.
 *
 * Because this definition comes first, the "#ifndef _QCODE_VAL_OFFSET"
 * fallback to _QCODE_OFFSET further down in this file does not take effect.
 */
#define _QCODE_VAL_OFFSET	0

/*
 * x86-64 specific queue spinlock union structure
 * Besides the slock and lock fields, the other fields are only
 * valid with less than 16K CPUs.
 *
 * The union provides byte and 16-bit views of the storage occupied by the
 * qspinlock structure: the anonymous struct names the first byte (lock),
 * the second byte (wait) and the following 16-bit word (qcode), while
 * lock_wait aliases the first two bytes as a single u16.
 *
 * NOTE(review): the code in this file treats "lock" as the low-order byte
 * and "wait" as the high-order byte of lock_wait (the waiting bit is
 * 0x100), i.e. it appears to rely on a little-endian layout - confirm
 * before using this on any other target.
 */
union arch_qspinlock {
	struct qspinlock slock;	/* The lock structure being overlaid */
	struct {
		u8  lock;	/* Lock bit	*/
		u8  wait;	/* Waiting bit	*/
		u16 qcode;	/* Queue code	*/
	};
	u16 lock_wait;		/* Lock and wait bits */
};

/*
 * The 24-bit queue node code is divided into the following 2 fields:
 * Bits 0-1 : queue node index (4 nodes)
 * Bits 2-23: CPU number + 1   (4M - 1 CPUs)
 *
 * The 16-bit queue node code is divided into the following 2 fields:
 * Bits 0-1 : queue node index (4 nodes)
 * Bits 2-15: CPU number + 1   (16K - 1 CPUs)
 *
 * A queue node code of 0 indicates that no one is waiting for the lock.
 * CPU 0 with node index 0 would otherwise encode to 0 as well, so 1 is
 * added to the CPU number before it is put into the queue code.
 */
#define MAX_QNODES		4
/*
 * Fallback only: used when _QCODE_VAL_OFFSET has not been defined earlier.
 */
#ifndef _QCODE_VAL_OFFSET
#define _QCODE_VAL_OFFSET	_QCODE_OFFSET
#endif

/*
 * The queue node structure
 *
 * This structure is essentially the same as the mcs_spinlock structure
 * in mcs_spinlock.h file. This structure is retained for future extension
 * where new fields may be added.
 *
 * In the slowpath a waiter sets @wait to 1 and @next to NULL before
 * queuing, spins until @wait becomes 0, and its predecessor clears the
 * successor's @wait after finding it through its own @next pointer.
 */
struct qnode {
	u32		 wait;		/* Waiting flag		*/
	struct qnode	*next;		/* Next queue node addr */
};

/*
 * A set of MAX_QNODES queue nodes plus the index of the next unused one.
 * get_qnode() hands out nodes[node_idx] and increments the index;
 * put_qnode() decrements it again, so nodes are used in stack order.
 */
struct qnode_set {
	struct qnode	nodes[MAX_QNODES];
	int		node_idx;	/* Current node to use */
};

/*
 * Per-CPU queue node structures
 *
 * Every node starts out zeroed and node_idx starts at 0, i.e. no node of
 * the set is in use initially.
 */
static DEFINE_PER_CPU_ALIGNED(struct qnode_set, qnset) = { {{0}}, 0 };

/*
 ************************************************************************
 * The following optimized codes are for architectures that support:	*
 *  1) Atomic byte and short data write					*
 *  2) Byte and short data exchange and compare-exchange instructions	*
 *									*
 * For those architectures, their asm/qspinlock.h header file should	*
 * define the following in order to use the optimized code.		*
 *									*
 *  1) A smp_u8_store_release() macro for byte size store operation	*
 *  2) A "union arch_qspinlock" structure that includes the individual	*
 *     fields of the qspinlock structure, including:			*
 *      o slock     - the qspinlock structure				*
 *      o lock      - the lock byte					*
 *      o wait      - the waiting byte					*
 *      o qcode     - the queue node code				*
 *      o lock_wait - the combined lock and waiting bytes		*
 *									*
 ************************************************************************
 */


/**
 * queue_spin_setlock - grab the lock by setting the lock byte
 * @lock: Pointer to queue spinlock structure
 * Return: 1 if this call changed the lock byte from 0 to _Q_LOCKED_VAL,
 *	   0 if the lock byte was already set
 *
 * Only the lock byte view of the lock word is read and written here.
 */
static __always_inline int queue_spin_setlock(struct qspinlock *lock)
{
	union arch_qspinlock *ql = (union arch_qspinlock *)lock;

	/* Plain read first: a visibly held lock skips the cmpxchg entirely */
	if (ACCESS_ONCE(ql->lock))
		return 0;

	return cmpxchg(&ql->lock, 0, _Q_LOCKED_VAL) == 0;
}

#ifndef _Q_MANY_CPUS
/*
 * With less than 16K CPUs, the following optimizations are possible with
 * the x86 architecture:
 *  1) The 2nd byte of the 32-bit lock word can be used as a pending bit
 *     for waiting lock acquirer so that it won't need to go through the
 *     MCS style locking queuing which has a higher overhead.
 *  2) The 16-bit queue code can be accessed or modified directly as a
 *     16-bit short value without disturbing the first 2 bytes.
 */
/* Waiting bit: bit 0 of the 2nd byte of the lock word (value 0x100) */
#define	_QSPINLOCK_WAITING	(1U << 8)
/* Mask covering both the lock byte and the waiting byte */
#define	_QSPINLOCK_LWMASK	0xffff

/*
 * queue_encode_qcode - build a queue code from a CPU number and node index
 * @cpu: CPU number (stored as cpu + 1 so that a code of 0 means "no waiter")
 * @idx: queue node index, placed in bits 0-1
 */
#define queue_encode_qcode(cpu, idx)	((((cpu) + 1) << 2) | (idx))

#define queue_spin_trylock_quick queue_spin_trylock_quick
/**
 * queue_spin_trylock_quick - fast spinning on the queue spinlock
 * @lock : Pointer to queue spinlock structure
 * @qsval: Old queue spinlock value
 * Return: 1 if lock acquired, 0 if failed
 *
 * Optimized contention path for 2 contending tasks; it is only attempted
 * when @qsval shows nothing but the lock bit or nothing but the waiting
 * bit, i.e. when no queue code is present. This path is not as fair as
 * the ticket spinlock, but it offers slightly better performance. The
 * regular MCS locking path for 3 or more contending tasks, however, is
 * fair.
 *
 * Which branch is taken depends on what the lock/wait bytes held at the
 * moment they are exchanged, so the contention performance varies with
 * the exact timing. On average, it is probably on par with the ticket
 * spinlock.
 */
static inline int queue_spin_trylock_quick(struct qspinlock *lock, int qsval)
{
	union arch_qspinlock *ql = (union arch_qspinlock *)lock;
	u16 prev;

	/*
	 * Anything other than "locked only" or "waiting only" means the
	 * quick path does not apply.
	 */
	if (unlikely((qsval != _Q_LOCKED_VAL) &&
		     (qsval != _QSPINLOCK_WAITING)))
		return 0;

	/* Set both the lock and waiting bits, and see what was there before */
	prev = xchg(&ql->lock_wait, _QSPINLOCK_WAITING|_Q_LOCKED_VAL);

	if (prev == 0) {
		/*
		 * The lock was free and nobody was waiting: the lock is
		 * ours, so the waiting bit just set can be dropped again.
		 */
		smp_u8_store_release(&ql->wait, 0);
		return 1;
	}

	if (prev == _Q_LOCKED_VAL) {
		/*
		 * The lock was held without a waiter, so this task is now
		 * the waiter. Keep retrying until the waiting bit has been
		 * traded for the lock bit.
		 */
		for (;;) {
			/* Wait until the lock byte is cleared */
			do {
				cpu_relax();
			} while (ACCESS_ONCE(ql->lock));

			/* Set the lock bit & clear the waiting bit */
			if (cmpxchg(&ql->lock_wait, _QSPINLOCK_WAITING,
				    _Q_LOCKED_VAL) == _QSPINLOCK_WAITING)
				return 1;
			/* Someone has stolen the lock, so wait again */
		}
	}

	/*
	 * prev == _QSPINLOCK_WAITING: the lock was free while another task
	 * was already waiting, and the exchange above took the lock. A bit
	 * of unfairness here won't change the big picture, so keep it.
	 *
	 * prev == (_QSPINLOCK_WAITING | _Q_LOCKED_VAL): the exchange did not
	 * change anything, so there is nothing to undo; report failure.
	 */
	return prev == _QSPINLOCK_WAITING;
}

#define queue_code_xchg queue_code_xchg
/**
 * queue_code_xchg - swap in a new queue code
 * @lock : Pointer to queue spinlock structure
 * @qcode: New queue code to be stored (truncated to 16 bits)
 * Return: The queue code that was stored in the lock before the exchange
 *
 * Only the 16-bit qcode view of the lock word is exchanged.
 */
static inline u32 queue_code_xchg(struct qspinlock *lock, u32 qcode)
{
	u16 *tail = &((union arch_qspinlock *)lock)->qcode;

	return (u32)xchg(tail, (u16)qcode);
}

#define queue_spin_trylock_and_clr_qcode queue_spin_trylock_and_clr_qcode
/**
 * queue_spin_trylock_and_clr_qcode - Try to lock & clear qcode simultaneously
 * @lock : Pointer to queue spinlock structure
 * @qcode: The supposedly current qcode value
 * Return: true if successful, false otherwise
 *
 * Succeeds only if the whole lock word holds exactly @qcode in the queue
 * code position with the lock and waiting bytes clear; the word is then
 * replaced by _Q_LOCKED_VAL in a single compare-and-exchange.
 */
static inline int
queue_spin_trylock_and_clr_qcode(struct qspinlock *lock, u32 qcode)
{
	u32 expected = qcode << _QCODE_OFFSET;

	return atomic_cmpxchg(&lock->val, expected, _Q_LOCKED_VAL) == expected;
}

#define queue_get_lock_qcode queue_get_lock_qcode
/**
 * queue_get_lock_qcode - get the lock & qcode values
 * @lock  : Pointer to queue spinlock structure
 * @qcode : Pointer to the returned qcode value
 * @mycode: My qcode value
 * Return : > 0 if lock is not available
 *	   = 0 if lock is free
 *	   < 0 if lock is taken & can return after cleanup
 *
 * It is considered locked when either the lock bit or the wait bit is set.
 * @qcode is not written when a negative value is returned.
 */
static inline int
queue_get_lock_qcode(struct qspinlock *lock, u32 *qcode, u32 mycode)
{
	/* Lock word value meaning "held, and I am the only queued waiter" */
	u32 only_me = _Q_LOCKED_VAL | (mycode << _QCODE_OFFSET);
	u32 seen = (u32)atomic_read(&lock->val);

	if (seen == only_me) {
		/*
		 * Try to go back to the quick spinning scheme: drop the
		 * queue code and become the waiting-bit holder instead.
		 */
		seen = atomic_cmpxchg(&lock->val, only_me,
				      _Q_LOCKED_VAL|_QSPINLOCK_WAITING);
		if (seen == only_me) {
			union arch_qspinlock *ql =
				(union arch_qspinlock *)lock;

			for (;;) {
				/* Wait until the lock byte is cleared */
				do {
					cpu_relax();
				} while (ACCESS_ONCE(ql->lock));

				/* Set the lock bit & clear the waiting bit */
				if (cmpxchg(&ql->lock_wait, _QSPINLOCK_WAITING,
					    _Q_LOCKED_VAL) == _QSPINLOCK_WAITING)
					return -1;	/* Got the lock */
			}
		}
	}

	/* Report whatever value was observed last */
	*qcode = seen >> _QCODE_OFFSET;
	return seen & _QSPINLOCK_LWMASK;
}
#endif /* _Q_MANY_CPUS */


/*
 ************************************************************************
 * Inline functions used by the queue_spin_lock_slowpath() function	*
 * that may get superseded by a more optimized version.			*
 ************************************************************************
 */
#ifndef queue_spin_trylock_quick
/*
 * Default used when no optimized quick path has been defined: it never
 * acquires the lock, so the slowpath always continues to the queuing code.
 */
static inline int queue_spin_trylock_quick(struct qspinlock *lock, int qsval)
{
	return 0;
}
#endif

#ifndef queue_get_lock_qcode
/**
 * queue_get_lock_qcode - get the lock & qcode values
 * @lock  : Pointer to queue spinlock structure
 * @qcode : Pointer to the returned qcode value (receives the whole lock
 *	    word as read)
 * @mycode: My qcode value (not used)
 * Return : > 0 if lock is not available, = 0 if lock is free
 */
static inline int
queue_get_lock_qcode(struct qspinlock *lock, u32 *qcode, u32 mycode)
{
	int snapshot;

	snapshot = atomic_read(&lock->val);
	*qcode   = snapshot;

	return snapshot & _Q_LOCKED_VAL;
}
#endif /* queue_get_lock_qcode */

#ifndef queue_spin_trylock_and_clr_qcode
/**
 * queue_spin_trylock_and_clr_qcode - Try to lock & clear qcode simultaneously
 * @lock : Pointer to queue spinlock structure
 * @qcode: The supposedly current qcode value
 * Return: true if successful, false otherwise
 *
 * The lock word is replaced by _Q_LOCKED_VAL only if it still equals
 * @qcode.
 */
static inline int
queue_spin_trylock_and_clr_qcode(struct qspinlock *lock, u32 qcode)
{
	if (atomic_cmpxchg(&lock->val, qcode, _Q_LOCKED_VAL) != qcode)
		return 0;
	return 1;
}
#endif /* queue_spin_trylock_and_clr_qcode */

#ifndef queue_encode_qcode
/**
 * queue_encode_qcode - Encode the CPU number & node index into a qnode code
 * @cpu_nr: CPU number
 * @qn_idx: Queue node index
 * Return : A qnode code that can be saved into the qspinlock structure
 *
 * The CPU number is stored as @cpu_nr + 1 above the 2-bit node index.
 * The lock bit is also set in the encoded 32-bit value, as the need to
 * encode a qnode means that the lock should have been taken.
 */
static u32 queue_encode_qcode(u32 cpu_nr, u8 qn_idx)
{
	u32 code;

	code  = (cpu_nr + 1) << (_QCODE_VAL_OFFSET + 2);
	code |= qn_idx << _QCODE_VAL_OFFSET;
	code |= _Q_LOCKED_VAL;

	return code;
}
#endif /* queue_encode_qcode */

/*
 ************************************************************************
 * Other inline functions needed by the queue_spin_lock_slowpath()	*
 * function.								*
 ************************************************************************
 */

/**
 * xlate_qcode - translate the queue code into the queue node address
 * @qcode: Queue code to be translated
 * Return: The corresponding queue node address
 *
 * Reverses the encoding: the two bits at _QCODE_VAL_OFFSET select the
 * node within a set, the bits above them hold the CPU number plus one.
 */
static inline struct qnode *xlate_qcode(u32 qcode)
{
	u32 cpu_nr = (qcode >> (_QCODE_VAL_OFFSET + 2)) - 1;
	u8  slot   = (qcode >> _QCODE_VAL_OFFSET) & 3;
	struct qnode_set *owner_set = per_cpu_ptr(&qnset, cpu_nr);

	return &owner_set->nodes[slot];
}

/**
 * get_qnode - Get a queue node address
 * @qn_idx: Pointer to queue node index [out]
 * Return : queue node address & queue node index in qn_idx, or NULL if
 *	    no free queue node available.
 */
static struct qnode *get_qnode(unsigned int *qn_idx)
{
	struct qnode_set *qset = this_cpu_ptr(&qnset);
	int i;

	if (unlikely(qset->node_idx >= MAX_QNODES))
		return NULL;
	i = qset->node_idx++;
	*qn_idx = i;
	return &qset->nodes[i];
}

/**
 * put_qnode - Return a queue node to the pool
 */
static void put_qnode(void)
{
	struct qnode_set *qset = this_cpu_ptr(&qnset);

	qset->node_idx--;
}

/**
 * queue_spin_lock_slowpath - acquire the queue spinlock
 * @lock : Pointer to queue spinlock structure
 * @qsval: Current value of the queue spinlock 32-bit word
 *
 * The function returns only once the lock has been acquired. In order, it:
 *  1) tries the quick spinning path and returns if that got the lock;
 *  2) takes a per-CPU queue node (BUG_ON() if none is left) and
 *     initializes it;
 *  3) retries a plain trylock if @qsval showed no queue code;
 *  4) publishes its own queue code as the new queue tail and, if there
 *     was a previous tail, links behind it and spins on its own node's
 *     wait flag;
 *  5) as queue head, polls the lock word until it can take the lock,
 *     clearing the queue code too when it is still the only queued task;
 *  6) if other tasks are queued behind it, waits for the successor's
 *     link and clears that node's wait flag;
 *  7) returns its queue node to the per-CPU pool.
 */
void queue_spin_lock_slowpath(struct qspinlock *lock, u32 qsval)
{
	unsigned int cpu_nr, qn_idx;
	struct qnode *node, *next;
	u32 prev_qcode, my_qcode;

	/*
	 * Try the quick spinning code path
	 */
	if (queue_spin_trylock_quick(lock, qsval))
		return;
	/*
	 * Get the queue node
	 */
	cpu_nr = smp_processor_id();
	node   = get_qnode(&qn_idx);

	/*
	 * It should never happen that all the queue nodes are being used.
	 */
	BUG_ON(!node);

	/*
	 * Set up the new cpu code to be exchanged
	 */
	my_qcode = queue_encode_qcode(cpu_nr, qn_idx);

	/*
	 * Initialize the queue node
	 */
	node->wait = 1;
	node->next = NULL;

	/*
	 * The lock may be available at this point, try again if no task was
	 * waiting in the queue.
	 */
	if (!(qsval >> _QCODE_OFFSET) && queue_spin_trylock(lock)) {
		put_qnode();
		return;
	}

#ifdef queue_code_xchg
	/*
	 * Only the queue code part of the lock word is exchanged, so the
	 * lock bit cannot be taken by accident here.
	 */
	prev_qcode = queue_code_xchg(lock, my_qcode);
#else
	/*
	 * Exchange current copy of the queue node code
	 */
	prev_qcode = atomic_xchg(&lock->val, my_qcode);
	/*
	 * It is possible that we may accidentally steal the lock. If this is
	 * the case, we need to either release it if not the head of the queue
	 * or get the lock and be done with it.
	 */
	if (unlikely(!(prev_qcode & _Q_LOCKED_VAL))) {
		if (prev_qcode == 0) {
			/*
			 * Got the lock since it is at the head of the queue
			 * Now try to atomically clear the queue code.
			 */
			if (atomic_cmpxchg(&lock->val, my_qcode,
					  _Q_LOCKED_VAL) == my_qcode)
				goto release_node;
			/*
			 * The cmpxchg fails only if one or more tasks
			 * are added to the queue. In this case, we need to
			 * notify the next one to be the head of the queue.
			 */
			goto notify_next;
		}
		/*
		 * Accidentally steal the lock, release the lock and
		 * let the queue head get it.
		 */
		queue_spin_unlock(lock);
	} else
		prev_qcode &= ~_Q_LOCKED_VAL;	/* Clear the lock bit */
	my_qcode &= ~_Q_LOCKED_VAL;
#endif /* queue_code_xchg */

	if (prev_qcode) {
		/*
		 * Not at the queue head, get the address of the previous node
		 * and set up the "next" field of that node.
		 */
		struct qnode *prev = xlate_qcode(prev_qcode);

		ACCESS_ONCE(prev->next) = node;
		/*
		 * Wait until the waiting flag is off
		 */
		while (smp_load_acquire(&node->wait))
			arch_mutex_cpu_relax();
	}

	/*
	 * At the head of the wait queue now
	 */
	while (1) {
		u32 qcode;
		int retval;

		retval = queue_get_lock_qcode(lock, &qcode, my_qcode);
		if (retval > 0)
			;	/* Lock not available yet */
		else if (retval < 0)
			/* Lock taken, can release the node & return */
			goto release_node;
		else if (qcode != my_qcode) {
			/*
			 * Just get the lock with other spinners waiting
			 * in the queue.
			 */
			if (queue_spin_setlock(lock))
				goto notify_next;
		} else {
			/*
			 * Get the lock & clear the queue code simultaneously
			 */
			if (queue_spin_trylock_and_clr_qcode(lock, qcode))
				/* No need to notify the next one */
				goto release_node;
		}
		arch_mutex_cpu_relax();
	}

notify_next:
	/*
	 * Wait, if needed, until the next one in queue set up the next field
	 */
	while (!(next = ACCESS_ONCE(node->next)))
		arch_mutex_cpu_relax();
	/*
	 * The next one in queue is now at the head
	 */
	smp_store_release(&next->wait, 0);

release_node:
	/* Give the queue node back to this CPU's pool */
	put_qnode();
}
EXPORT_SYMBOL(queue_spin_lock_slowpath);
