On Mon, Feb 7, 2022 at 10:01 PM Stefan Hajnoczi <[email protected]> wrote: > > On Tue, Jan 25, 2022 at 09:17:57PM +0800, Xie Yongji wrote: > > VDUSE [1] is a linux framework that makes it possible to implement > > software-emulated vDPA devices in userspace. This adds a library > > as a subproject to help implementing VDUSE backends in QEMU. > > > > [1] https://www.kernel.org/doc/html/latest/userspace-api/vduse.html > > This library assumes that the program is allowed to access the control > device (/dev/vduse/control). Is that always the case or should the > library also support access to /dev/vduse/<name> only (maybe even with > file descriptor passing) so a privileged process can create/destroy > VDUSE devices? >
Makes sense. I will add two new APIs to support these two cases. > I didn't review the vring code in detail. > > > > > Signed-off-by: Xie Yongji <[email protected]> > > --- > > meson.build | 15 + > > meson_options.txt | 2 + > > scripts/meson-buildoptions.sh | 3 + > > subprojects/libvduse/include/atomic.h | 1 + > > subprojects/libvduse/libvduse.c | 1025 +++++++++++++++++++ > > subprojects/libvduse/libvduse.h | 193 ++++ > > subprojects/libvduse/meson.build | 10 + > > subprojects/libvduse/standard-headers/linux | 1 + > > 8 files changed, 1250 insertions(+) > > create mode 120000 subprojects/libvduse/include/atomic.h > > create mode 100644 subprojects/libvduse/libvduse.c > > create mode 100644 subprojects/libvduse/libvduse.h > > create mode 100644 subprojects/libvduse/meson.build > > create mode 120000 subprojects/libvduse/standard-headers/linux > > > > diff --git a/meson.build b/meson.build > > index 333c61deba..864fb50ade 100644 > > --- a/meson.build > > +++ b/meson.build > > @@ -1305,6 +1305,21 @@ if not get_option('fuse_lseek').disabled() > > endif > > endif > > > > +have_libvduse = (targetos == 'linux') > > +if get_option('libvduse').enabled() > > + if targetos != 'linux' > > + error('libvduse requires linux') > > + endif > > +elif get_option('libvduse').disabled() > > + have_libvduse = false > > +endif > > + > > +libvduse = not_found > > +if have_libvduse > > + libvduse_proj = subproject('libvduse') > > + libvduse = libvduse_proj.get_variable('libvduse_dep') > > +endif > > + > > # libbpf > > libbpf = dependency('libbpf', required: get_option('bpf'), method: > > 'pkg-config') > > if libbpf.found() and not cc.links(''' > > diff --git a/meson_options.txt b/meson_options.txt > > index 921967eddb..16790d1814 100644 > > --- a/meson_options.txt > > +++ b/meson_options.txt > > @@ -195,6 +195,8 @@ option('virtfs', type: 'feature', value: 'auto', > > description: 'virtio-9p support') > > option('virtiofsd', type: 'feature', value: 'auto', > > description: 'build virtiofs 
daemon (virtiofsd)') > > +option('libvduse', type: 'feature', value: 'auto', > > + description: 'build VDUSE Library') > > > > option('capstone', type: 'combo', value: 'auto', > > choices: ['disabled', 'enabled', 'auto', 'system', 'internal'], > > diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh > > index a4af02c527..af5c75d758 100644 > > --- a/scripts/meson-buildoptions.sh > > +++ b/scripts/meson-buildoptions.sh > > @@ -58,6 +58,7 @@ meson_options_help() { > > printf "%s\n" ' libssh ssh block device support' > > printf "%s\n" ' libudev Use libudev to enumerate host devices' > > printf "%s\n" ' libusb libusb support for USB passthrough' > > + printf "%s\n" ' libvduse build VDUSE Library' > > printf "%s\n" ' libxml2 libxml2 support for Parallels image > > format' > > printf "%s\n" ' linux-aio Linux AIO support' > > printf "%s\n" ' linux-io-uring Linux io_uring support' > > @@ -188,6 +189,8 @@ _meson_option_parse() { > > --disable-libudev) printf "%s" -Dlibudev=disabled ;; > > --enable-libusb) printf "%s" -Dlibusb=enabled ;; > > --disable-libusb) printf "%s" -Dlibusb=disabled ;; > > + --enable-libvduse) printf "%s" -Dlibvduse=enabled ;; > > + --disable-libvduse) printf "%s" -Dlibvduse=disabled ;; > > --enable-libxml2) printf "%s" -Dlibxml2=enabled ;; > > --disable-libxml2) printf "%s" -Dlibxml2=disabled ;; > > --enable-linux-aio) printf "%s" -Dlinux_aio=enabled ;; > > diff --git a/subprojects/libvduse/include/atomic.h > > b/subprojects/libvduse/include/atomic.h > > new file mode 120000 > > index 0000000000..8c2be64f7b > > --- /dev/null > > +++ b/subprojects/libvduse/include/atomic.h > > @@ -0,0 +1 @@ > > +../../../include/qemu/atomic.h > > \ No newline at end of file > > diff --git a/subprojects/libvduse/libvduse.c > > b/subprojects/libvduse/libvduse.c > > new file mode 100644 > > index 0000000000..7671864bca > > --- /dev/null > > +++ b/subprojects/libvduse/libvduse.c > > @@ -0,0 +1,1025 @@ > > +/* > > + * VDUSE (vDPA Device in Userspace) 
library > > + * > > + * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights > > reserved. > > + * Portions of codes and concepts borrowed from libvhost-user.c, so: > > + * Copyright IBM, Corp. 2007 > > + * Copyright (c) 2016 Red Hat, Inc. > > + * > > + * Author: > > + * Xie Yongji <[email protected]> > > + * Anthony Liguori <[email protected]> > > + * Marc-André Lureau <[email protected]> > > + * Victor Kaplansky <[email protected]> > > + * > > + * This work is licensed under the terms of the GNU GPL, version 2 or > > + * later. See the COPYING file in the top-level directory. > > + */ > > + > > +#include <stdlib.h> > > +#include <stdio.h> > > +#include <stdbool.h> > > +#include <stddef.h> > > +#include <errno.h> > > +#include <string.h> > > +#include <assert.h> > > +#include <endian.h> > > +#include <unistd.h> > > +#include <limits.h> > > +#include <fcntl.h> > > + > > +#include <sys/ioctl.h> > > +#include <sys/eventfd.h> > > +#include <sys/mman.h> > > + > > +#include "include/atomic.h" > > +#include "standard-headers/linux/vhost_types.h" > > +#include "standard-headers/linux/vduse.h" > > +#include "libvduse.h" > > + > > +#define VIRTQUEUE_MAX_SIZE 1024 > > +#define VDUSE_VQ_ALIGN 4096 > > +#define MAX_IOVA_REGIONS 256 > > + > > +/* Round number down to multiple */ > > +#define ALIGN_DOWN(n, m) ((n) / (m) * (m)) > > + > > +/* Round number up to multiple */ > > +#define ALIGN_UP(n, m) ALIGN_DOWN((n) + (m) - 1, (m)) > > + > > +#ifndef unlikely > > +#define unlikely(x) __builtin_expect(!!(x), 0) > > +#endif > > + > > +typedef struct VduseRing { > > + unsigned int num; > > + uint64_t desc_addr; > > + uint64_t avail_addr; > > + uint64_t used_addr; > > + struct vring_desc *desc; > > + struct vring_avail *avail; > > + struct vring_used *used; > > +} VduseRing; > > + > > +struct VduseVirtq { > > + VduseRing vring; > > + uint16_t last_avail_idx; > > + uint16_t shadow_avail_idx; > > + uint16_t used_idx; > > + uint16_t signalled_used; > > + bool 
signalled_used_valid; > > + int index; > > + int inuse; > > + bool ready; > > + int fd; > > + VduseDev *dev; > > +}; > > + > > +typedef struct VduseIovaRegion { > > + uint64_t iova; > > + uint64_t size; > > + uint64_t mmap_offset; > > + uint64_t mmap_addr; > > +} VduseIovaRegion; > > + > > +struct VduseDev { > > + VduseVirtq *vqs; > > + VduseIovaRegion regions[MAX_IOVA_REGIONS]; > > + int num_regions; > > + char *name; > > + uint32_t device_id; > > + uint32_t vendor_id; > > + uint16_t num_queues; > > + uint16_t queue_size; > > + uint64_t features; > > + const VduseOps *ops; > > + int fd; > > + int ctrl_fd; > > + void *priv; > > +}; > > + > > +static inline bool has_feature(uint64_t features, unsigned int fbit) > > +{ > > + assert(fbit < 64); > > + return !!(features & (1ULL << fbit)); > > +} > > + > > +static inline bool vduse_dev_has_feature(VduseDev *dev, unsigned int fbit) > > +{ > > + return has_feature(dev->features, fbit); > > +} > > + > > +VduseDev *vduse_queue_get_dev(VduseVirtq *vq) > > +{ > > + return vq->dev; > > +} > > + > > +int vduse_queue_get_fd(VduseVirtq *vq) > > +{ > > + return vq->fd; > > +} > > + > > +void *vduse_dev_get_priv(VduseDev *dev) > > +{ > > + return dev->priv; > > +} > > + > > +VduseVirtq *vduse_dev_get_queue(VduseDev *dev, int index) > > +{ > > + return &dev->vqs[index]; > > +} > > + > > +int vduse_dev_get_fd(VduseDev *dev) > > +{ > > + return dev->fd; > > +} > > + > > +static int vduse_inject_irq(VduseDev *dev, int index) > > +{ > > + return ioctl(dev->fd, VDUSE_VQ_INJECT_IRQ, &index); > > +} > > + > > +static void vduse_iova_remove_region(VduseDev *dev, uint64_t start, > > + uint64_t last) > > +{ > > + int i; > > + > > + if (last == start) { > > + return; > > + } > > + > > + for (i = 0; i < MAX_IOVA_REGIONS; i++) { > > + if (!dev->regions[i].mmap_addr) { > > + continue; > > + } > > + > > + if (start <= dev->regions[i].iova && > > + last >= (dev->regions[i].iova + dev->regions[i].size - 1)) { > > + munmap((void 
*)dev->regions[i].mmap_addr, > > + dev->regions[i].mmap_offset + dev->regions[i].size); > > + dev->regions[i].mmap_addr = 0; > > + dev->num_regions--; > > + } > > + } > > +} > > + > > +static int vduse_iova_add_region(VduseDev *dev, int fd, > > + uint64_t offset, uint64_t start, > > + uint64_t last, int prot) > > +{ > > + int i; > > + uint64_t size = last - start + 1; > > + void *mmap_addr = mmap(0, size + offset, prot, MAP_SHARED, fd, 0); > > + > > + if (mmap_addr == MAP_FAILED) { > > + return -EINVAL; > > Missing close(fd). This function takes ownership of fd. Will fix it. > > + } > > + > > + for (i = 0; i < MAX_IOVA_REGIONS; i++) { > > + if (!dev->regions[i].mmap_addr) { > > + dev->regions[i].mmap_addr = (uint64_t)(uintptr_t)mmap_addr; > > + dev->regions[i].mmap_offset = offset; > > + dev->regions[i].iova = start; > > + dev->regions[i].size = size; > > + dev->num_regions++; > > + break; > > + } > > + } > > + close(fd); > > assert(i < MAX_IOVA_REGIONS)? If we can really reach the end of the for > loop then we must remember to call munmap(2). > Sure. 
> > + > > + return 0; > > +} > > + > > +static int perm_to_prot(uint8_t perm) > > +{ > > + int prot = 0; > > + > > + switch (perm) { > > + case VDUSE_ACCESS_WO: > > + prot |= PROT_WRITE; > > + break; > > + case VDUSE_ACCESS_RO: > > + prot |= PROT_READ; > > + break; > > + case VDUSE_ACCESS_RW: > > + prot |= PROT_READ | PROT_WRITE; > > + break; > > + default: > > + break; > > + } > > + > > + return prot; > > +} > > + > > +static inline void *iova_to_va(VduseDev *dev, uint64_t *plen, uint64_t > > iova) > > +{ > > + int i, ret; > > + struct vduse_iotlb_entry entry; > > + > > + for (i = 0; i < MAX_IOVA_REGIONS; i++) { > > + VduseIovaRegion *r = &dev->regions[i]; > > + > > + if (!r->mmap_addr) { > > + continue; > > + } > > + > > + if ((iova >= r->iova) && (iova < (r->iova + r->size))) { > > + if ((iova + *plen) > (r->iova + r->size)) { > > + *plen = r->iova + r->size - iova; > > + } > > + return (void *)(uintptr_t)(iova - r->iova + > > + r->mmap_addr + r->mmap_offset); > > + } > > + } > > + > > + entry.start = iova; > > + entry.last = iova + 1; > > + ret = ioctl(dev->fd, VDUSE_IOTLB_GET_FD, &entry); > > + if (ret < 0) { > > + return NULL; > > + } > > + > > + if (!vduse_iova_add_region(dev, ret, entry.offset, entry.start, > > + entry.last, perm_to_prot(entry.perm))) { > > + return iova_to_va(dev, plen, iova); > > + } > > + > > + return NULL; > > +} > > + > > +static inline uint16_t vring_avail_flags(VduseVirtq *vq) > > +{ > > + return le16toh(vq->vring.avail->flags); > > I remember we discussed whether VDUSE should support Transitional > devices. VIRTIO 1.0+ uses little-endian but legacy VIRTIO uses > guest-endian, so le16toh() will not work for legacy VIRTIO vrings in a > cross-endian configuration (e.g. big-endian guest on little-endian > host). > You're right. Cross-endian isn't supported now. > If cross-endian isn't supported please add an error during > initialization so users get a clear error message. > Will check VIRTIO_F_VERSION_1 during initialization. 
> > +} > > + > > +static inline uint16_t vring_avail_idx(VduseVirtq *vq) > > +{ > > + vq->shadow_avail_idx = le16toh(vq->vring.avail->idx); > > + > > + return vq->shadow_avail_idx; > > +} > > + > > +static inline uint16_t vring_avail_ring(VduseVirtq *vq, int i) > > +{ > > + return le16toh(vq->vring.avail->ring[i]); > > +} > > + > > +static inline uint16_t vring_get_used_event(VduseVirtq *vq) > > +{ > > + return vring_avail_ring(vq, vq->vring.num); > > +} > > + > > +static bool vduse_queue_get_head(VduseVirtq *vq, unsigned int idx, > > + unsigned int *head) > > +{ > > + /* > > + * Grab the next descriptor number they're advertising, and increment > > + * the index we've seen. > > + */ > > + *head = vring_avail_ring(vq, idx % vq->vring.num); > > + > > + /* If their number is silly, that's a fatal mistake. */ > > + if (*head >= vq->vring.num) { > > + fprintf(stderr, "Guest says index %u is available\n", *head); > > + return false; > > + } > > + > > + return true; > > +} > > + > > +static int > > +vduse_queue_read_indirect_desc(VduseDev *dev, struct vring_desc *desc, > > + uint64_t addr, size_t len) > > +{ > > + struct vring_desc *ori_desc; > > + uint64_t read_len; > > + > > + if (len > (VIRTQUEUE_MAX_SIZE * sizeof(struct vring_desc))) { > > + return -1; > > + } > > + > > + if (len == 0) { > > + return -1; > > + } > > + > > + while (len) { > > + read_len = len; > > + ori_desc = iova_to_va(dev, &read_len, addr); > > + if (!ori_desc) { > > + return -1; > > + } > > + > > + memcpy(desc, ori_desc, read_len); > > + len -= read_len; > > + addr += read_len; > > + desc += read_len; > > + } > > + > > + return 0; > > +} > > + > > +enum { > > + VIRTQUEUE_READ_DESC_ERROR = -1, > > + VIRTQUEUE_READ_DESC_DONE = 0, /* end of chain */ > > + VIRTQUEUE_READ_DESC_MORE = 1, /* more buffers in chain */ > > +}; > > + > > +static int vduse_queue_read_next_desc(struct vring_desc *desc, int i, > > + unsigned int max, unsigned int *next) > > +{ > > + /* If this descriptor says it doesn't chain, 
we're done. */ > > + if (!(le16toh(desc[i].flags) & VRING_DESC_F_NEXT)) { > > + return VIRTQUEUE_READ_DESC_DONE; > > + } > > + > > + /* Check they're not leading us off end of descriptors. */ > > + *next = desc[i].next; > > + /* Make sure compiler knows to grab that: we don't want it changing! */ > > + smp_wmb(); > > + > > + if (*next >= max) { > > + fprintf(stderr, "Desc next is %u\n", *next); > > + return VIRTQUEUE_READ_DESC_ERROR; > > + } > > + > > + return VIRTQUEUE_READ_DESC_MORE; > > +} > > + > > +/* > > + * Fetch avail_idx from VQ memory only when we really need to know if > > + * guest has added some buffers. > > + */ > > +static bool vduse_queue_empty(VduseVirtq *vq) > > +{ > > + if (unlikely(!vq->vring.avail)) { > > + return true; > > + } > > + > > + if (vq->shadow_avail_idx != vq->last_avail_idx) { > > + return false; > > + } > > + > > + return vring_avail_idx(vq) == vq->last_avail_idx; > > +} > > + > > +static bool vduse_queue_should_notify(VduseVirtq *vq) > > +{ > > + VduseDev *dev = vq->dev; > > + uint16_t old, new; > > + bool v; > > + > > + /* We need to expose used array entries before checking used event. 
*/ > > + smp_mb(); > > + > > + /* Always notify when queue is empty (when feature acknowledge) */ > > + if (vduse_dev_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) && > > + !vq->inuse && vduse_queue_empty(vq)) { > > + return true; > > + } > > + > > + if (!vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { > > + return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT); > > + } > > + > > + v = vq->signalled_used_valid; > > + vq->signalled_used_valid = true; > > + old = vq->signalled_used; > > + new = vq->signalled_used = vq->used_idx; > > + return !v || vring_need_event(vring_get_used_event(vq), new, old); > > +} > > + > > +void vduse_queue_notify(VduseVirtq *vq) > > +{ > > + VduseDev *dev = vq->dev; > > + > > + if (unlikely(!vq->vring.avail)) { > > + return; > > + } > > + > > + if (!vduse_queue_should_notify(vq)) { > > + return; > > + } > > + > > + if (vduse_inject_irq(dev, vq->index) < 0) { > > + fprintf(stderr, "Error inject irq for vq %d: %s\n", > > + vq->index, strerror(errno)); > > + } > > +} > > + > > +static inline void vring_used_flags_set_bit(VduseVirtq *vq, int mask) > > +{ > > + uint16_t *flags; > > + > > + flags = (uint16_t *)((char*)vq->vring.used + > > + offsetof(struct vring_used, flags)); > > + *flags = htole16(le16toh(*flags) | mask); > > +} > > + > > +static inline void vring_used_flags_unset_bit(VduseVirtq *vq, int mask) > > +{ > > + uint16_t *flags; > > + > > + flags = (uint16_t *)((char*)vq->vring.used + > > + offsetof(struct vring_used, flags)); > > + *flags = htole16(le16toh(*flags) & ~mask); > > +} > > + > > +static inline void vring_set_avail_event(VduseVirtq *vq, uint16_t val) > > +{ > > + *((uint16_t *)&vq->vring.used->ring[vq->vring.num]) = htole16(val); > > +} > > + > > +static bool vduse_queue_map_single_desc(VduseVirtq *vq, unsigned int > > *p_num_sg, > > + struct iovec *iov, unsigned int > > max_num_sg, > > + bool is_write, uint64_t pa, size_t sz) > > +{ > > + unsigned num_sg = *p_num_sg; > > + VduseDev *dev = vq->dev; > > + > > 
+ assert(num_sg <= max_num_sg); > > + > > + if (!sz) { > > + fprintf(stderr, "virtio: zero sized buffers are not allowed\n"); > > + return false; > > + } > > + > > + while (sz) { > > + uint64_t len = sz; > > + > > + if (num_sg == max_num_sg) { > > + fprintf(stderr, > > + "virtio: too many descriptors in indirect table\n"); > > + return false; > > + } > > + > > + iov[num_sg].iov_base = iova_to_va(dev, &len, pa); > > + if (iov[num_sg].iov_base == NULL) { > > + fprintf(stderr, "virtio: invalid address for buffers\n"); > > + return false; > > + } > > + iov[num_sg++].iov_len = len; > > + sz -= len; > > + pa += len; > > + } > > + > > + *p_num_sg = num_sg; > > + return true; > > +} > > + > > +static void *vduse_queue_alloc_element(size_t sz, unsigned out_num, > > + unsigned in_num) > > +{ > > + VduseVirtqElement *elem; > > + size_t in_sg_ofs = ALIGN_UP(sz, __alignof__(elem->in_sg[0])); > > + size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]); > > + size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]); > > + > > + assert(sz >= sizeof(VduseVirtqElement)); > > + elem = malloc(out_sg_end); > > Missing malloc() NULL return value check. > Will fix it. 
> > + elem->out_num = out_num; > > + elem->in_num = in_num; > > + elem->in_sg = (void *)elem + in_sg_ofs; > > + elem->out_sg = (void *)elem + out_sg_ofs; > > + return elem; > > +} > > + > > +static void *vduse_queue_map_desc(VduseVirtq *vq, unsigned int idx, size_t > > sz) > > +{ > > + struct vring_desc *desc = vq->vring.desc; > > + VduseDev *dev = vq->dev; > > + uint64_t desc_addr, read_len; > > + unsigned int desc_len; > > + unsigned int max = vq->vring.num; > > + unsigned int i = idx; > > + VduseVirtqElement *elem; > > + struct iovec iov[VIRTQUEUE_MAX_SIZE]; > > + struct vring_desc desc_buf[VIRTQUEUE_MAX_SIZE]; > > + unsigned int out_num = 0, in_num = 0; > > + int rc; > > + > > + if (le16toh(desc[i].flags) & VRING_DESC_F_INDIRECT) { > > + if (le32toh(desc[i].len) % sizeof(struct vring_desc)) { > > + fprintf(stderr, "Invalid size for indirect buffer table\n"); > > + return NULL; > > + } > > + > > + /* loop over the indirect descriptor table */ > > + desc_addr = le64toh(desc[i].addr); > > + desc_len = le32toh(desc[i].len); > > + max = desc_len / sizeof(struct vring_desc); > > + read_len = desc_len; > > + desc = iova_to_va(dev, &read_len, desc_addr); > > + if (unlikely(desc && read_len != desc_len)) { > > + /* Failed to use zero copy */ > > + desc = NULL; > > + if (!vduse_queue_read_indirect_desc(dev, desc_buf, > > + desc_addr, > > + desc_len)) { > > + desc = desc_buf; > > + } > > + } > > + if (!desc) { > > + fprintf(stderr, "Invalid indirect buffer table\n"); > > + return NULL; > > + } > > + i = 0; > > + } > > + > > + /* Collect all the descriptors */ > > + do { > > + if (le16toh(desc[i].flags) & VRING_DESC_F_WRITE) { > > + if (!vduse_queue_map_single_desc(vq, &in_num, iov + out_num, > > + VIRTQUEUE_MAX_SIZE - out_num, > > + true, le64toh(desc[i].addr), > > + le32toh(desc[i].len))) { > > + return NULL; > > + } > > + } else { > > + if (in_num) { > > + fprintf(stderr, "Incorrect order for descriptors\n"); > > + return NULL; > > + } > > + if 
(!vduse_queue_map_single_desc(vq, &out_num, iov, > > + VIRTQUEUE_MAX_SIZE, false, > > + le64toh(desc[i].addr), > > + le32toh(desc[i].len))) { > > + return NULL; > > + } > > + } > > + > > + /* If we've got too many, that implies a descriptor loop. */ > > + if ((in_num + out_num) > max) { > > + fprintf(stderr, "Looped descriptor\n"); > > + return NULL; > > + } > > + rc = vduse_queue_read_next_desc(desc, i, max, &i); > > + } while (rc == VIRTQUEUE_READ_DESC_MORE); > > + > > + if (rc == VIRTQUEUE_READ_DESC_ERROR) { > > + fprintf(stderr, "read descriptor error\n"); > > + return NULL; > > + } > > + > > + /* Now copy what we have collected and mapped */ > > + elem = vduse_queue_alloc_element(sz, out_num, in_num); > > + elem->index = idx; > > + for (i = 0; i < out_num; i++) { > > + elem->out_sg[i] = iov[i]; > > + } > > + for (i = 0; i < in_num; i++) { > > + elem->in_sg[i] = iov[out_num + i]; > > + } > > + > > + return elem; > > +} > > + > > +void *vduse_queue_pop(VduseVirtq *vq, size_t sz) > > +{ > > + unsigned int head; > > + VduseVirtqElement *elem; > > + VduseDev *dev = vq->dev; > > + > > + if (unlikely(!vq->vring.avail)) { > > + return NULL; > > + } > > + > > + if (vduse_queue_empty(vq)) { > > + return NULL; > > + } > > + /* Needed after virtio_queue_empty() */ > > + smp_rmb(); > > + > > + if (vq->inuse >= vq->vring.num) { > > + fprintf(stderr, "Virtqueue size exceeded: %d\n", vq->inuse); > > + return NULL; > > + } > > + > > + if (!vduse_queue_get_head(vq, vq->last_avail_idx++, &head)) { > > + return NULL; > > + } > > + > > + if (vduse_dev_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { > > + vring_set_avail_event(vq, vq->last_avail_idx); > > + } > > + > > + elem = vduse_queue_map_desc(vq, head, sz); > > + > > + if (!elem) { > > + return NULL; > > + } > > + > > + vq->inuse++; > > + > > + return elem; > > +} > > + > > +static inline void vring_used_write(VduseVirtq *vq, > > + struct vring_used_elem *uelem, int i) > > +{ > > + struct vring_used *used = vq->vring.used; > > + 
> > + used->ring[i] = *uelem; > > +} > > + > > +static void vduse_queue_fill(VduseVirtq *vq, const VduseVirtqElement *elem, > > + unsigned int len, unsigned int idx) > > +{ > > + struct vring_used_elem uelem; > > + > > + if (unlikely(!vq->vring.used)) { > > + return; > > + } > > + > > + idx = (idx + vq->used_idx) % vq->vring.num; > > + > > + uelem.id = htole32(elem->index); > > + uelem.len = htole32(len); > > + vring_used_write(vq, &uelem, idx); > > +} > > + > > +static inline void vring_used_idx_set(VduseVirtq *vq, uint16_t val) > > +{ > > + vq->vring.used->idx = htole16(val); > > + vq->used_idx = val; > > +} > > + > > +static void vduse_queue_flush(VduseVirtq *vq, unsigned int count) > > +{ > > + uint16_t old, new; > > + > > + if (unlikely(!vq->vring.used)) { > > + return; > > + } > > + > > + /* Make sure buffer is written before we update index. */ > > + smp_wmb(); > > + > > + old = vq->used_idx; > > + new = old + count; > > + vring_used_idx_set(vq, new); > > + vq->inuse -= count; > > + if (unlikely((int16_t)(new - vq->signalled_used) < (uint16_t)(new - > > old))) { > > + vq->signalled_used_valid = false; > > + } > > +} > > + > > +void vduse_queue_push(VduseVirtq *vq, const VduseVirtqElement *elem, > > + unsigned int len) > > +{ > > + vduse_queue_fill(vq, elem, len, 0); > > + vduse_queue_flush(vq, 1); > > +} > > + > > +static int vduse_queue_update_vring(VduseVirtq *vq, uint64_t desc_addr, > > + uint64_t avail_addr, uint64_t > > used_addr) > > +{ > > + struct VduseDev *dev = vq->dev; > > + uint64_t len; > > + > > + len = sizeof(struct vring_desc); > > + vq->vring.desc = iova_to_va(dev, &len, desc_addr); > > + assert(len == sizeof(struct vring_desc)); > > + > > + len = sizeof(struct vring_avail); > > + vq->vring.avail = iova_to_va(dev, &len, avail_addr); > > + assert(len == sizeof(struct vring_avail)); > > + > > + len = sizeof(struct vring_used); > > + vq->vring.used = iova_to_va(dev, &len, used_addr); > > + assert(len == sizeof(struct vring_used)); > > + > > + 
if (!vq->vring.desc || !vq->vring.avail || !vq->vring.used) { > > + fprintf(stderr, "Failed to get vq[%d] iova mapping\n", vq->index); > > + return -EINVAL; > > + } > > + > > + return 0; > > +} > > + > > +static void vduse_queue_enable(VduseVirtq *vq) > > +{ > > + struct VduseDev *dev = vq->dev; > > + struct vduse_vq_info vq_info; > > + struct vduse_vq_eventfd vq_eventfd; > > + int fd; > > + > > + vq_info.index = vq->index; > > + if (ioctl(dev->fd, VDUSE_VQ_GET_INFO, &vq_info)) { > > + fprintf(stderr, "Failed to get vq[%d] info: %s\n", > > + vq->index, strerror(errno)); > > + return; > > + } > > + > > + if (!vq_info.ready) { > > + return; > > + } > > + > > + vq->vring.num = vq_info.num; > > + vq->vring.desc_addr = vq_info.desc_addr; > > + vq->vring.avail_addr = vq_info.driver_addr; > > + vq->vring.used_addr = vq_info.device_addr; > > + > > + if (vduse_queue_update_vring(vq, vq_info.desc_addr, > > + vq_info.driver_addr, > > vq_info.device_addr)) { > > + fprintf(stderr, "Failed to update vring for vq[%d]\n", vq->index); > > + return; > > + } > > + > > + fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); > > + if (fd < 0) { > > + fprintf(stderr, "Failed to init eventfd for vq[%d]\n", vq->index); > > + return; > > + } > > + > > + vq_eventfd.index = vq->index; > > + vq_eventfd.fd = fd; > > + if (ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &vq_eventfd)) { > > + fprintf(stderr, "Failed to setup kick fd for vq[%d]\n", vq->index); > > + close(fd); > > + return; > > + } > > + > > + vq->fd = fd; > > + vq->shadow_avail_idx = vq->last_avail_idx = vq_info.split.avail_index; > > + vq->inuse = 0; > > + vq->used_idx = 0; > > + vq->signalled_used_valid = false; > > + vq->ready = true; > > + > > + dev->ops->enable_queue(dev, vq); > > +} > > + > > +static void vduse_queue_disable(VduseVirtq *vq) > > +{ > > + struct VduseDev *dev = vq->dev; > > + struct vduse_vq_eventfd eventfd; > > + > > + if (!vq->ready) { > > + return; > > + } > > + > > + dev->ops->disable_queue(dev, vq); > > + > > + 
eventfd.index = vq->index; > > + eventfd.fd = VDUSE_EVENTFD_DEASSIGN; > > + ioctl(dev->fd, VDUSE_VQ_SETUP_KICKFD, &eventfd); > > + close(vq->fd); > > + > > + assert(vq->inuse == 0); > > + > > + vq->vring.num = 0; > > + vq->vring.desc_addr = 0; > > + vq->vring.avail_addr = 0; > > + vq->vring.used_addr = 0; > > + vq->vring.desc = 0; > > + vq->vring.avail = 0; > > + vq->vring.used = 0; > > + vq->ready = false; > > + vq->fd = -1; > > +} > > + > > +static void vduse_dev_start_dataplane(VduseDev *dev) > > +{ > > + int i; > > + > > + if (ioctl(dev->fd, VDUSE_DEV_GET_FEATURES, &dev->features)) { > > + fprintf(stderr, "Failed to get features: %s\n", strerror(errno)); > > + return; > > + } > > + > > + for (i = 0; i < dev->num_queues; i++) { > > + vduse_queue_enable(&dev->vqs[i]); > > + } > > +} > > + > > +static void vduse_dev_stop_dataplane(VduseDev *dev) > > +{ > > + int i; > > + > > + for (i = 0; i < dev->num_queues; i++) { > > + vduse_queue_disable(&dev->vqs[i]); > > + } > > + dev->features = 0; > > + vduse_iova_remove_region(dev, 0, ULONG_MAX); > > +} > > + > > +int vduse_dev_handler(VduseDev *dev) > > +{ > > + struct vduse_dev_request req; > > + struct vduse_dev_response resp = { 0 }; > > + VduseVirtq *vq; > > + int i, ret; > > + > > + ret = read(dev->fd, &req, sizeof(req)); > > This file descriptor is blocking? I guess the assumption is that the > kernel VDUSE code always enqueues at least one struct vduse_dev_request, > so userspace will not block when the file descriptor becomes readable? > Yes, that's true. We can always get one entire request if the file descriptor becomes readable. 
> > + if (ret != sizeof(req)) { > > + fprintf(stderr, "Read request error [%d]: %s\n", > > + ret, strerror(errno)); > > + return -errno; > > + } > > + resp.request_id = req.request_id; > > + > > + switch (req.type) { > > + case VDUSE_GET_VQ_STATE: > > + vq = &dev->vqs[req.vq_state.index]; > > + resp.vq_state.split.avail_index = vq->last_avail_idx; > > + resp.result = VDUSE_REQ_RESULT_OK; > > + break; > > + case VDUSE_SET_STATUS: > > + if (req.s.status & VIRTIO_CONFIG_S_DRIVER_OK) { > > + vduse_dev_start_dataplane(dev); > > + } else if (req.s.status == 0) { > > + vduse_dev_stop_dataplane(dev); > > + } > > + resp.result = VDUSE_REQ_RESULT_OK; > > + break; > > + case VDUSE_UPDATE_IOTLB: > > + /* The iova will be updated by iova_to_va() later, so just remove > > it */ > > + vduse_iova_remove_region(dev, req.iova.start, req.iova.last); > > + for (i = 0; i < dev->num_queues; i++) { > > + VduseVirtq *vq = &dev->vqs[i]; > > + if (vq->ready) { > > + if (vduse_queue_update_vring(vq, vq->vring.desc_addr, > > + vq->vring.avail_addr, > > + vq->vring.used_addr)) { > > + fprintf(stderr, "Failed to update vring for vq[%d]\n", > > + vq->index); > > + } > > + } > > + } > > + resp.result = VDUSE_REQ_RESULT_OK; > > + break; > > + default: > > + resp.result = VDUSE_REQ_RESULT_FAILED; > > + break; > > + } > > + > > + ret = write(dev->fd, &resp, sizeof(resp)); > > The kernel never blocks here? > Yes. 
> > + if (ret != sizeof(resp)) { > > + fprintf(stderr, "Write request %d error [%d]: %s\n", > > + req.type, ret, strerror(errno)); > > + return -errno; > > + } > > + return 0; > > +} > > + > > +int vduse_dev_update_config(VduseDev *dev, uint32_t size, > > + uint32_t offset, char *buffer) > > +{ > > + int ret; > > + struct vduse_config_data *data; > > + > > + data = malloc(offsetof(struct vduse_config_data, buffer) + size); > > + if (!data) { > > + return -ENOMEM; > > + } > > + > > + data->offset = offset; > > + data->length = size; > > + memcpy(data->buffer, buffer, size); > > + > > + ret = ioctl(dev->fd, VDUSE_DEV_SET_CONFIG, data); > > + free(data); > > + > > + if (ret) { > > + return -errno; > > + } > > + > > + if (ioctl(dev->fd, VDUSE_DEV_INJECT_CONFIG_IRQ)) { > > + return -errno; > > + } > > + > > + return 0; > > +} > > + > > +int vduse_dev_setup_queue(VduseDev *dev, int index, int max_size) > > +{ > > + VduseVirtq *vq = &dev->vqs[index]; > > + struct vduse_vq_config vq_config = { 0 }; > > + > > + vq_config.index = vq->index; > > + vq_config.max_size = max_size; > > + > > + if (ioctl(dev->fd, VDUSE_VQ_SETUP, &vq_config)) { > > + return -errno; > > + } > > + > > + return 0; > > +} > > + > > +VduseDev *vduse_dev_create(const char *name, uint32_t device_id, > > + uint32_t vendor_id, uint64_t features, > > + uint16_t num_queues, uint32_t config_size, > > + char *config, const VduseOps *ops, void *priv) > > +{ > > + VduseDev *dev; > > + int i, ret, ctrl_fd, fd = -1; > > + uint64_t version; > > + char dev_path[VDUSE_NAME_MAX + 16]; > > Why 16? It has to be at least strlen("/dev/vduse/"), but why more? I > suggest including strlen("/dev/vduse/") instead of hardcoding a magic > constant. > LGTM, will use strlen("/dev/vduse/") instead. 
> > + VduseVirtq *vqs = NULL; > > + struct vduse_dev_config *dev_config = NULL; > > + size_t size = offsetof(struct vduse_dev_config, config); > > + > > + if (!name || strlen(name) > VDUSE_NAME_MAX || !config || > > The NUL terminator needs to be taken into account: > > strlen(name) + 1 > VDUSE_NAME_MAX > Will fix it. > > + !config_size || !ops || !ops->enable_queue || !ops->disable_queue) > > { > > + fprintf(stderr, "Invalid parameter for vduse\n"); > > + return NULL; > > + } > > + > > + dev = malloc(sizeof(VduseDev)); > > + if (!dev) { > > + fprintf(stderr, "Failed to allocate vduse device\n"); > > + return NULL; > > + } > > + memset(dev, 0, sizeof(VduseDev)); > > + > > + ctrl_fd = open("/dev/vduse/control", O_RDWR); > > + if (ctrl_fd < 0) { > > + fprintf(stderr, "Failed to open /dev/vduse/control: %s\n", > > + strerror(errno)); > > + goto err_ctrl; > > + } > > + > > + version = VDUSE_API_VERSION; > > + if (ioctl(ctrl_fd, VDUSE_SET_API_VERSION, &version)) { > > + fprintf(stderr, "Failed to set api version %lu: %s\n", > > + version, strerror(errno)); > > + goto err_dev; > > + } > > + > > + dev_config = malloc(size + config_size); > > + if (!dev_config) { > > + fprintf(stderr, "Failed to allocate config space\n"); > > + goto err_dev; > > + } > > + memset(dev_config, 0, size + config_size); > > + > > + strcpy(dev_config->name, name); > > + dev_config->device_id = device_id; > > + dev_config->vendor_id = vendor_id; > > + dev_config->features = features; > > + dev_config->vq_num = num_queues; > > + dev_config->vq_align = VDUSE_VQ_ALIGN; > > + dev_config->config_size = config_size; > > + memcpy(dev_config->config, config, config_size); > > + > > + ret = ioctl(ctrl_fd, VDUSE_CREATE_DEV, dev_config); > > + free(dev_config); > > + if (ret < 0) { > > + fprintf(stderr, "Failed to create vduse dev %s: %s\n", > > + name, strerror(errno)); > > + goto err_dev; > > + } > > + > > + sprintf(dev_path, "/dev/vduse/%s", name); > > + fd = open(dev_path, O_RDWR); > > Does the caller 
reject names with ".." path components? Maybe input > validation should be performed before we call open(2)? > Make sense. > > + if (fd < 0) { > > + fprintf(stderr, "Failed to open vduse dev %s: %s\n", > > + name, strerror(errno)); > > + goto err; > > + } > > + > > + vqs = calloc(sizeof(VduseVirtq), num_queues); > > calloc() could be used instead of malloc + memset above as well. > OK. > > + if (!vqs) { > > + fprintf(stderr, "Failed to allocate virtqueues\n"); > > + goto err; > > + } > > + > > + for (i = 0; i < num_queues; i++) { > > + vqs[i].index = i; > > + vqs[i].dev = dev; > > + vqs[i].fd = -1; > > + } > > + > > + dev->vqs = vqs; > > + dev->name = strdup(name); > > malloc(3) return values are checked elsewhere, strdup(3) should also be > checked. > OK. > > + dev->num_queues = num_queues; > > + dev->ops = ops; > > + dev->ctrl_fd = ctrl_fd; > > + dev->fd = fd; > > + dev->priv = priv; > > + > > + return dev; > > +err: > > + if (fd > 0) { > > + close(fd); > > + } > > + ioctl(ctrl_fd, VDUSE_DESTROY_DEV, name); > > +err_dev: > > + close(ctrl_fd); > > +err_ctrl: > > + free(dev); > > + > > + return NULL; > > +} > > + > > +void vduse_dev_destroy(VduseDev *dev) > > +{ > > + free(dev->vqs); > > + close(dev->fd); > > + dev->fd = -1; > > + ioctl(dev->ctrl_fd, VDUSE_DESTROY_DEV, dev->name); > > + free(dev->name); > > + close(dev->ctrl_fd); > > + dev->ctrl_fd = -1; > > + free(dev); > > +} > > diff --git a/subprojects/libvduse/libvduse.h > > b/subprojects/libvduse/libvduse.h > > new file mode 100644 > > index 0000000000..f6bcb51b5a > > --- /dev/null > > +++ b/subprojects/libvduse/libvduse.h > > @@ -0,0 +1,193 @@ > > +/* > > + * VDUSE (vDPA Device in Userspace) library > > + * > > + * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights > > reserved. > > + * > > + * Author: > > + * Xie Yongji <[email protected]> > > + * > > + * This work is licensed under the terms of the GNU GPL, version 2 or > > + * later. See the COPYING file in the top-level directory. 
> > + */ > > + > > +#ifndef LIBVDUSE_H > > +#define LIBVDUSE_H > > + > > +#include <stdint.h> > > +#include <sys/uio.h> > > + > > +/* VDUSE device structure */ > > +typedef struct VduseDev VduseDev; > > + > > +/* Virtqueue structure */ > > +typedef struct VduseVirtq VduseVirtq; > > + > > +/* Some operation of VDUSE backend */ > > +typedef struct VduseOps { > > + /* Called when virtqueue can be processed */ > > + void (*enable_queue)(VduseDev *dev, VduseVirtq *vq); > > + /* Called when virtqueue processing should be stopped */ > > + void (*disable_queue)(VduseDev *dev, VduseVirtq *vq); > > +} VduseOps; > > + > > +/* Describing elements of the I/O buffer */ > > +typedef struct VduseVirtqElement { > > + /* Virtqueue index */ > > + unsigned int index; > > Is this the descriptor table index or the virtqueue number? > It's the descriptor table index. Will fix it in v2. Thanks, Yongji
