On Tue, Mar 28, 2023 at 10:17 AM Mike Christie
<[email protected]> wrote:
>
> For vhost-scsi with 3 vqs and a workload like that tries to use those vqs
> like:
>
> fio --filename=/dev/sdb --direct=1 --rw=randrw --bs=4k \
> --ioengine=libaio --iodepth=128 --numjobs=3
>
> the single vhost worker thread will become a bottleneck and we are stuck
> at around 500K IOPs no matter how many jobs, virtqueues, and CPUs are
> used.
>
> To better utilize virtqueues and available CPUs, this patch allows
> userspace to create workers and bind them to vqs. You can have N workers
> per dev and also share N workers with M vqs.
>
> With the patches and doing a worker per vq, we can scale to at least
> 16 vCPUs/vqs (that's my system limit) with the same fio command
> above with numjobs=16:
>
> fio --filename=/dev/sdb --direct=1 --rw=randrw --bs=4k \
> --ioengine=libaio --iodepth=64 --numjobs=16
>
> which gives around 2326K IOPs.
>
> Note that for testing I dropped depth to 64 above because the vhost/virt
> layer supports only 1024 total commands per device. And the only tuning I
> did was set LIO's emulate_pr to 0 to avoid LIO's PR lock in the main IO
> path which becomes an issue at around 12 jobs/virtqueues.
>
> Signed-off-by: Mike Christie <[email protected]>
> ---
> drivers/vhost/vhost.c | 177 ++++++++++++++++++++++++++++---
> drivers/vhost/vhost.h | 4 +-
> include/uapi/linux/vhost.h | 22 ++++
> include/uapi/linux/vhost_types.h | 15 +++
> 4 files changed, 204 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index 1fa5e9a49092..e40699e83c6d 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -271,7 +271,11 @@ EXPORT_SYMBOL_GPL(vhost_vq_work_queue);
>
> void vhost_dev_flush(struct vhost_dev *dev)
> {
> - vhost_work_flush_on(dev->worker);
> + struct vhost_worker *worker;
> + unsigned long i;
> +
> + xa_for_each(&dev->worker_xa, i, worker)
> + vhost_work_flush_on(worker);
> }
> EXPORT_SYMBOL_GPL(vhost_dev_flush);
>
> @@ -489,7 +493,6 @@ void vhost_dev_init(struct vhost_dev *dev,
> dev->umem = NULL;
> dev->iotlb = NULL;
> dev->mm = NULL;
> - dev->worker = NULL;
> dev->iov_limit = iov_limit;
> dev->weight = weight;
> dev->byte_weight = byte_weight;
> @@ -499,7 +502,7 @@ void vhost_dev_init(struct vhost_dev *dev,
> INIT_LIST_HEAD(&dev->read_list);
> INIT_LIST_HEAD(&dev->pending_list);
> spin_lock_init(&dev->iotlb_lock);
> -
> + xa_init_flags(&dev->worker_xa, XA_FLAGS_ALLOC);
>
> for (i = 0; i < dev->nvqs; ++i) {
> vq = dev->vqs[i];
> @@ -562,32 +565,67 @@ static void vhost_detach_mm(struct vhost_dev *dev)
> dev->mm = NULL;
> }
>
> -static void vhost_worker_free(struct vhost_dev *dev)
> +static void vhost_worker_put(struct vhost_dev *dev, struct vhost_worker
> *worker)
> {
> - struct vhost_worker *worker = dev->worker;
> -
> if (!worker)
> return;
>
> - dev->worker = NULL;
> + if (!refcount_dec_and_test(&worker->refcount))
> + return;
> +
> WARN_ON(!llist_empty(&worker->work_list));
> vhost_task_stop(worker->vtsk);
> kfree(worker);
> }
>
> +static void vhost_vq_detach_worker(struct vhost_virtqueue *vq)
> +{
> + if (vq->worker)
What happens to the pending work that is queued for the old worker?
> + vhost_worker_put(vq->dev, vq->worker);
> + vq->worker = NULL;
> +}
> +
> +static void vhost_workers_free(struct vhost_dev *dev)
> +{
> + struct vhost_worker *worker;
> + unsigned long i;
> +
> + if (!dev->use_worker)
> + return;
> +
> + for (i = 0; i < dev->nvqs; i++)
> + vhost_vq_detach_worker(dev->vqs[i]);
> + /*
> + * Drop the refcount taken during allocation, and handle the default
> + * worker and the cases where userspace might have crashed or was lazy
> + * and did a VHOST_NEW_WORKER but not a VHOST_FREE_WORKER.
> + */
> + xa_for_each(&dev->worker_xa, i, worker) {
> + xa_erase(&dev->worker_xa, worker->id);
> + vhost_worker_put(dev, worker);
> + }
> + xa_destroy(&dev->worker_xa);
> +}
> +
> static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev)
> {
> struct vhost_worker *worker;
> struct vhost_task *vtsk;
> char name[TASK_COMM_LEN];
> + int ret;
> + u32 id;
>
> worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
> if (!worker)
> return NULL;
>
> - dev->worker = worker;
> worker->kcov_handle = kcov_common_handle();
> init_llist_head(&worker->work_list);
> + /*
> + * We increase the refcount for the initial creation and then
> + * later each time it's attached to a virtqueue.
> + */
> + refcount_set(&worker->refcount, 1);
> snprintf(name, sizeof(name), "vhost-%d", current->pid);
>
> vtsk = vhost_task_create(vhost_worker, worker, name);
> @@ -596,14 +634,104 @@ static struct vhost_worker *vhost_worker_create(struct
> vhost_dev *dev)
>
> worker->vtsk = vtsk;
> vhost_task_start(vtsk);
> +
> + ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b,
> GFP_KERNEL);
> + if (ret < 0)
> + goto stop_worker;
> + worker->id = id;
> +
> return worker;
>
> +stop_worker:
> + vhost_task_stop(vtsk);
> free_worker:
> kfree(worker);
> - dev->worker = NULL;
> return NULL;
> }
>
> +/* Caller must have device and virtqueue mutex */
> +static void __vhost_vq_attach_worker(struct vhost_virtqueue *vq,
> + struct vhost_worker *worker)
> +{
> + refcount_inc(&worker->refcount);
> + vhost_vq_detach_worker(vq);
> + vq->worker = worker;
What happens if there's a pending flush in a specific device (e.g in
vhost_scsi_tmf_resp_work())?
Does this mean we need to hold vq mutex when doing the flush? (which
seems not the case of vhost_scsi_tmf_resp_work()).
> +}
> +
> +/* Caller must have device and virtqueue mutex */
> +static int vhost_vq_attach_worker(struct vhost_virtqueue *vq,
> + struct vhost_vring_worker *info)
> +{
> + unsigned long index = info->worker_id;
> + struct vhost_dev *dev = vq->dev;
> + struct vhost_worker *worker;
> +
> + if (!dev->use_worker)
> + return -EINVAL;
> +
> + /*
> + * We don't support setting a worker on an active vq to make flushing
> + * and removal simple.
> + */
> + if (vhost_vq_get_backend(vq))
> + return -EBUSY;
This assumes the worker won't be started until the backend is set
which is not true.
> +
> + worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
> + if (!worker || worker->id != info->worker_id)
> + return -ENODEV;
> +
> + __vhost_vq_attach_worker(vq, worker);
> + return 0;
> +}
> +
> +/* Caller must have device mutex */
> +static int vhost_new_worker(struct vhost_dev *dev,
> + struct vhost_worker_state *info)
> +{
> + struct vhost_worker *worker;
> +
> + if (!dev->use_worker)
> + return -EINVAL;
> +
> + worker = vhost_worker_create(dev);
> + if (!worker)
> + return -ENOMEM;
> +
> + info->worker_id = worker->id;
> + return 0;
> +}
> +
> +/* Caller must have device mutex */
> +static int vhost_free_worker(struct vhost_dev *dev,
> + struct vhost_worker_state *info)
> +{
> + unsigned long index = info->worker_id;
> + struct vhost_worker *worker;
> +
> + if (!dev->use_worker)
> + return -EINVAL;
> +
> + worker = xa_find(&dev->worker_xa, &index, UINT_MAX, XA_PRESENT);
So we use int for worker_id which conflicts with UINT_MAX here?
struct vhost_worker_state {
/*
* For VHOST_NEW_WORKER the kernel will return the new vhost_worker id.
* For VHOST_FREE_WORKER this must be set to the id of the vhost_worker
* to free.
*/
int worker_id;
};
A side effect of using xa_index directly is that userspace can guess
the xa_index of the default worker and free it here, is this intended?
Should we hide the default worker from xa?
> + if (!worker || worker->id != info->worker_id)
> + return -ENODEV;
> +
> + /*
> + * We can free the worker if it's not attached to any virtqueues.
> + */
> + if (refcount_read(&worker->refcount) != 1)
> + return -EBUSY;
> +
> + xa_erase(&dev->worker_xa, worker->id);
> + /*
> + * Make sure if there was a flush that saw the worker in the XA that
> + * it has completed.
> + */
> + vhost_work_flush_on(worker);
> +
> + vhost_worker_put(dev, worker);
> + return 0;
> +}
> +
> /* Caller should have device mutex */
> long vhost_dev_set_owner(struct vhost_dev *dev)
> {
> @@ -624,7 +752,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
> goto err_worker;
>
> for (i = 0; i < dev->nvqs; i++)
> - dev->vqs[i]->worker = worker;
> + __vhost_vq_attach_worker(dev->vqs[i], worker);
> }
>
> err = vhost_dev_alloc_iovecs(dev);
> @@ -633,7 +761,7 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
>
> return 0;
> err_iovecs:
> - vhost_worker_free(dev);
> + vhost_workers_free(dev);
> err_worker:
> vhost_detach_mm(dev);
> err_mm:
> @@ -726,7 +854,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
> dev->iotlb = NULL;
> vhost_clear_msg(dev);
> wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
> - vhost_worker_free(dev);
> + vhost_workers_free(dev);
> vhost_detach_mm(dev);
> }
> EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
> @@ -1616,6 +1744,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned
> int ioctl, void __user *arg
> struct eventfd_ctx *ctx = NULL;
> u32 __user *idxp = argp;
> struct vhost_virtqueue *vq;
> + struct vhost_vring_worker w;
> struct vhost_vring_state s;
> struct vhost_vring_file f;
> u32 idx;
> @@ -1723,7 +1852,16 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned
> int ioctl, void __user *arg
> if (copy_to_user(argp, &s, sizeof(s)))
> r = -EFAULT;
> break;
> - default:
> + case VHOST_ATTACH_VRING_WORKER:
> + if (copy_from_user(&w, argp, sizeof(w))) {
> + r = -EFAULT;
> + break;
> + }
> + r = vhost_vq_attach_worker(vq, &w);
> + if (!r && copy_to_user(argp, &w, sizeof(w)))
> + r = -EFAULT;
> + break;
It's a little odd that we have new and attach but only a free.
Thanks
> + default:
> r = -ENOIOCTLCMD;
> }
>
> @@ -1776,6 +1914,7 @@ EXPORT_SYMBOL_GPL(vhost_init_device_iotlb);
> /* Caller must have device mutex */
> long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user
> *argp)
> {
> + struct vhost_worker_state w;
> struct eventfd_ctx *ctx;
> u64 p;
> long r;
> @@ -1836,6 +1975,18 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int
> ioctl, void __user *argp)
> if (ctx)
> eventfd_ctx_put(ctx);
> break;
> + case VHOST_NEW_WORKER:
> + r = vhost_new_worker(d, &w);
> + if (!r && copy_to_user(argp, &w, sizeof(w)))
> + r = -EFAULT;
> + break;
> + case VHOST_FREE_WORKER:
> + if (copy_from_user(&w, argp, sizeof(w))) {
> + r = -EFAULT;
> + break;
> + }
> + r = vhost_free_worker(d, &w);
> + break;
> default:
> r = -ENOIOCTLCMD;
> break;
> diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
> index 395707c680e5..a67ae8293c38 100644
> --- a/drivers/vhost/vhost.h
> +++ b/drivers/vhost/vhost.h
> @@ -30,6 +30,8 @@ struct vhost_worker {
> struct vhost_task *vtsk;
> struct llist_head work_list;
> u64 kcov_handle;
> + refcount_t refcount;
> + u32 id;
> };
>
> /* Poll a file (eventfd or socket) */
> @@ -156,7 +158,6 @@ struct vhost_dev {
> struct vhost_virtqueue **vqs;
> int nvqs;
> struct eventfd_ctx *log_ctx;
> - struct vhost_worker *worker;
> struct vhost_iotlb *umem;
> struct vhost_iotlb *iotlb;
> spinlock_t iotlb_lock;
> @@ -166,6 +167,7 @@ struct vhost_dev {
> int iov_limit;
> int weight;
> int byte_weight;
> + struct xarray worker_xa;
> bool use_worker;
> int (*msg_handler)(struct vhost_dev *dev, u32 asid,
> struct vhost_iotlb_msg *msg);
> diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
> index 92e1b700b51c..7329e7f349dd 100644
> --- a/include/uapi/linux/vhost.h
> +++ b/include/uapi/linux/vhost.h
> @@ -45,6 +45,23 @@
> #define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
> /* Specify an eventfd file descriptor to signal on log write. */
> #define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
> +/* By default, a device gets one vhost_worker that its virtqueues share. This
> + * command allows the owner of the device to create an additional
> vhost_worker
> + * for the device. It can later be bound to 1 or more of its virtqueues using
> + * the VHOST_ATTACH_VRING_WORKER command.
> + *
> + * This must be called after VHOST_SET_OWNER and the caller must be the owner
> + * of the device. The new thread will inherit caller's cgroups and
> namespaces,
> + * and will share the caller's memory space. The new thread will also be
> + * counted against the caller's RLIMIT_NPROC value.
> + */
> +#define VHOST_NEW_WORKER _IOW(VHOST_VIRTIO, 0x8, struct vhost_worker_state)
> +/* Free a worker created with VHOST_NEW_WORKER if it's not attached to any
> + * virtqueue. If userspace is not able to call this for workers it created,
> + * the kernel will free all the device's workers when the device is closed
> and
> + * the last reference to the device has been released.
> + */
> +#define VHOST_FREE_WORKER _IOR(VHOST_VIRTIO, 0x9, struct vhost_worker_state)
>
> /* Ring setup. */
> /* Set number of descriptors in ring. This parameter can not
> @@ -70,6 +87,11 @@
> #define VHOST_VRING_BIG_ENDIAN 1
> #define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct
> vhost_vring_state)
> #define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct
> vhost_vring_state)
> +/* Attach a vhost_worker created with VHOST_NEW_WORKER to one of the device's
> + * virtqueues. This must be done before the virtqueue is active.
> + */
> +#define VHOST_ATTACH_VRING_WORKER _IOR(VHOST_VIRTIO, 0x15, \
> + struct vhost_vring_worker)
>
> /* The following ioctls use eventfd file descriptors to signal and poll
> * for events. */
> diff --git a/include/uapi/linux/vhost_types.h
> b/include/uapi/linux/vhost_types.h
> index c5690a8992d8..ad0fe2e721be 100644
> --- a/include/uapi/linux/vhost_types.h
> +++ b/include/uapi/linux/vhost_types.h
> @@ -47,6 +47,21 @@ struct vhost_vring_addr {
> __u64 log_guest_addr;
> };
>
> +struct vhost_worker_state {
> + /*
> + * For VHOST_NEW_WORKER the kernel will return the new vhost_worker
> id.
> + * For VHOST_FREE_WORKER this must be set to the id of the
> vhost_worker
> + * to free.
> + */
> + int worker_id;
> +};
> +
> +struct vhost_vring_worker {
> + unsigned int index;
> + /* The id of the vhost_worker returned from VHOST_NEW_WORKER */
> + int worker_id;
> +};
> +
> /* no alignment requirement */
> struct vhost_iotlb_msg {
> __u64 iova;
> --
> 2.25.1
>
_______________________________________________
Virtualization mailing list
[email protected]
https://lists.linuxfoundation.org/mailman/listinfo/virtualization