On Sun, Jan 30, 2022 at 6:58 AM Jason Wang <[email protected]> wrote:
>
>
> On 2022/1/22 at 4:27 AM, Eugenio Pérez wrote:
> > Use translations added in VhostIOVATree in SVQ.
> >
> > Only introduce usage here, not allocation and deallocation. As with
> > previous patches, we use the dead code paths of shadow_vqs_enabled to
> > avoid committing too many changes at once. These paths are impossible
> > to take at the moment.
> >
> > Signed-off-by: Eugenio Pérez <[email protected]>
> > ---
> > hw/virtio/vhost-shadow-virtqueue.h | 3 +-
> > include/hw/virtio/vhost-vdpa.h | 3 +
> > hw/virtio/vhost-shadow-virtqueue.c | 111 ++++++++++++++++----
> > hw/virtio/vhost-vdpa.c | 161 +++++++++++++++++++++++++----
> > 4 files changed, 238 insertions(+), 40 deletions(-)
> >
> > diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h
> > index 19c934af49..c6f67d6f76 100644
> > --- a/hw/virtio/vhost-shadow-virtqueue.h
> > +++ b/hw/virtio/vhost-shadow-virtqueue.h
> > @@ -12,6 +12,7 @@
> >
> > #include "hw/virtio/vhost.h"
> > #include "qemu/event_notifier.h"
> > +#include "hw/virtio/vhost-iova-tree.h"
> >
> > typedef struct VhostShadowVirtqueue VhostShadowVirtqueue;
> >
> > @@ -37,7 +38,7 @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev,
> > VirtQueue *vq);
> > void vhost_svq_stop(VhostShadowVirtqueue *svq);
> >
> > -VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize);
> > +VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize, VhostIOVATree *iova_map);
> >
> > void vhost_svq_free(VhostShadowVirtqueue *vq);
> >
> > diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h
> > index 009a9f3b6b..cd2388b3be 100644
> > --- a/include/hw/virtio/vhost-vdpa.h
> > +++ b/include/hw/virtio/vhost-vdpa.h
> > @@ -14,6 +14,7 @@
> >
> > #include <gmodule.h>
> >
> > +#include "hw/virtio/vhost-iova-tree.h"
> > #include "hw/virtio/virtio.h"
> > #include "standard-headers/linux/vhost_types.h"
> >
> > @@ -30,6 +31,8 @@ typedef struct vhost_vdpa {
> > MemoryListener listener;
> > struct vhost_vdpa_iova_range iova_range;
> > bool shadow_vqs_enabled;
> > + /* IOVA mapping used by Shadow Virtqueue */
> > + VhostIOVATree *iova_tree;
> > GPtrArray *shadow_vqs;
> > struct vhost_dev *dev;
> > VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX];
> > diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c
> > index a1a404f68f..c7888eb8cf 100644
> > --- a/hw/virtio/vhost-shadow-virtqueue.c
> > +++ b/hw/virtio/vhost-shadow-virtqueue.c
> > @@ -11,6 +11,7 @@
> > #include "hw/virtio/vhost-shadow-virtqueue.h"
> > #include "hw/virtio/vhost.h"
> > #include "hw/virtio/virtio-access.h"
> > +#include "hw/virtio/vhost-iova-tree.h"
> > #include "standard-headers/linux/vhost_types.h"
> >
> > #include "qemu/error-report.h"
> > @@ -45,6 +46,9 @@ typedef struct VhostShadowVirtqueue {
> > /* Virtio device */
> > VirtIODevice *vdev;
> >
> > + /* IOVA mapping */
> > + VhostIOVATree *iova_tree;
> > +
> > /* Map for returning guest's descriptors */
> > VirtQueueElement **ring_id_maps;
> >
> > @@ -97,13 +101,7 @@ bool vhost_svq_valid_device_features(uint64_t *dev_features)
> > continue;
> >
> > case VIRTIO_F_ACCESS_PLATFORM:
> > - /* SVQ does not know how to translate addresses */
> > - if (*dev_features & BIT_ULL(b)) {
> > - clear_bit(b, dev_features);
> > - r = false;
> > - }
> > - break;
> > -
> > + /* SVQ trusts the host's IOMMU to translate addresses */
> > case VIRTIO_F_VERSION_1:
> > /* SVQ trust that guest vring is little endian */
> > if (!(*dev_features & BIT_ULL(b))) {
> > @@ -205,7 +203,55 @@ static void vhost_svq_set_notification(VhostShadowVirtqueue *svq, bool enable)
> > }
> > }
> >
> > +/**
> > + * Translate addresses between qemu's virtual address and SVQ IOVA
> > + *
> > + * @svq Shadow VirtQueue
> > + * @addrs Translated SVQ IOVA addresses (output)
> > + * @iovec Source qemu's VA addresses
> > + * @num Length of iovec and minimum length of addrs
> > + */
> > +static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq,
> > + void **addrs, const struct iovec *iovec,
> > + size_t num)
> > +{
> > + size_t i;
> > +
> > + if (num == 0) {
> > + return true;
> > + }
> > +
> > + for (i = 0; i < num; ++i) {
> > + DMAMap needle = {
> > + .translated_addr = (hwaddr)iovec[i].iov_base,
> > + .size = iovec[i].iov_len,
> > + };
> > + size_t off;
> > +
> > + const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle);
> > + /*
> > + * Map cannot be NULL since iova map contains all guest space and
> > + * qemu already has a physical address mapped
> > + */
> > + if (unlikely(!map)) {
> > + error_report("Invalid address 0x%"HWADDR_PRIx" given by guest",
> > + needle.translated_addr);
>
>
> This can be triggered by the guest, so we need to use _once() or log_guest_error() etc.
>
Ok, I see the issue; I will change it.
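Something along these lines for the next revision (untested sketch; I'm
assuming qemu_log_mask(LOG_GUEST_ERROR, ...) is the right tool for a
guest-triggerable condition here, error_report_once() would also work):

    if (unlikely(!map)) {
        /*
         * A buggy or malicious guest can reach this path, so rate-limit
         * the message instead of using plain error_report().
         */
        qemu_log_mask(LOG_GUEST_ERROR,
                      "Invalid address 0x%"HWADDR_PRIx" given by guest\n",
                      needle.translated_addr);
        return false;
    }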
>
> > + return false;
> > + }
> > +
> > + /*
> > + * Map->iova chunk size is ignored. What to do if descriptor
> > + * (addr, size) does not fit is delegated to the device.
> > + */
>
>
> I think we need to at least check the size and fail if it doesn't match
> here. Or is it possible that we have a buffer that crosses two memory
> regions?
>
It should be impossible, since both the iova_tree and the VirtQueue
should be in sync regarding memory region updates. If a VirtQueue buffer
crosses multiple memory regions, the iovec simply has more entries.
I can add a return false, but I have not been able to trigger that
situation even with a malformed driver.
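Just so we are talking about the same thing, the check I could add would
be roughly this (untested sketch, assuming DMAMap.size keeps counting
the last byte included in the range, as in the rest of the series):

    size_t off = needle.translated_addr - map->translated_addr;
    /* Reject the descriptor if it does not fit in the returned mapping */
    if (unlikely(iovec[i].iov_len > map->size + 1 - off)) {
        return false;
    }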
>
> > + off = needle.translated_addr - map->translated_addr;
> > + addrs[i] = (void *)(map->iova + off);
> > + }
> > +
> > + return true;
> > +}
> > +
> > static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> > + void * const *vaddr_sg,
> > const struct iovec *iovec,
> > size_t num, bool more_descs, bool write)
> > {
> > @@ -224,7 +270,7 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> > } else {
> > descs[i].flags = flags;
> > }
> > - descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base);
> > + descs[i].addr = cpu_to_le64((hwaddr)vaddr_sg[n]);
> > descs[i].len = cpu_to_le32(iovec[n].iov_len);
> >
> > last = i;
> > @@ -234,42 +280,60 @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq,
> > svq->free_head = le16_to_cpu(descs[last].next);
> > }
> >
> > -static unsigned vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > - VirtQueueElement *elem)
> > +static bool vhost_svq_add_split(VhostShadowVirtqueue *svq,
> > + VirtQueueElement *elem,
> > + unsigned *head)
>
>
> I'd suggest making it return bool starting from the patch that introduces
> this function.
>
Ok, I will do it from the start.
>
> > {
> > - int head;
> > unsigned avail_idx;
> > vring_avail_t *avail = svq->vring.avail;
> > + bool ok;
> > + g_autofree void **sgs = g_new(void *, MAX(elem->out_num, elem->in_num));
> >
> > - head = svq->free_head;
> > + *head = svq->free_head;
> >
> > /* We need some descriptors here */
> > assert(elem->out_num || elem->in_num);
> >
> > - vhost_vring_write_descs(svq, elem->out_sg, elem->out_num,
> > + ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num);
> > + if (unlikely(!ok)) {
> > + return false;
> > + }
> > + vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num,
> > elem->in_num > 0, false);
> > - vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true);
> > +
> > +
> > + ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num);
> > + if (unlikely(!ok)) {
> > + return false;
> > + }
> > +
> > + vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true);
> >
> > /*
> > * Put entry in available array (but don't update avail->idx until they
> > * do sync).
> > */
> > avail_idx = svq->avail_idx_shadow & (svq->vring.num - 1);
> > - avail->ring[avail_idx] = cpu_to_le16(head);
> > + avail->ring[avail_idx] = cpu_to_le16(*head);
> > svq->avail_idx_shadow++;
> >
> > /* Update avail index after the descriptor is wrote */
> > smp_wmb();
> > avail->idx = cpu_to_le16(svq->avail_idx_shadow);
> >
> > - return head;
> > + return true;
> > }
> >
> > -static void vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
> > +static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem)
> > {
> > - unsigned qemu_head = vhost_svq_add_split(svq, elem);
> > + unsigned qemu_head;
> > + bool ok = vhost_svq_add_split(svq, elem, &qemu_head);
> > + if (unlikely(!ok)) {
> > + return false;
> > + }
> >
> > svq->ring_id_maps[qemu_head] = elem;
> > + return true;
> > }
> >
> > static void vhost_svq_kick(VhostShadowVirtqueue *svq)
> > @@ -309,6 +373,7 @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
> >
> > while (true) {
> > VirtQueueElement *elem;
> > + bool ok;
> >
> > if (svq->next_guest_avail_elem) {
> > elem = g_steal_pointer(&svq->next_guest_avail_elem);
> > @@ -337,7 +402,11 @@ static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq)
> > return;
> > }
> >
> > - vhost_svq_add(svq, elem);
> > + ok = vhost_svq_add(svq, elem);
> > + if (unlikely(!ok)) {
> > + /* VQ is broken, just return and ignore any other kicks */
> > + return;
> > + }
> > vhost_svq_kick(svq);
> > }
> >
> > @@ -619,12 +688,13 @@ void vhost_svq_stop(VhostShadowVirtqueue *svq)
> > * methods and file descriptors.
> > *
> > * @qsize Shadow VirtQueue size
> > + * @iova_tree Tree to perform descriptors translations
> > *
> > * Returns the new virtqueue or NULL.
> > *
> > * In case of error, reason is reported through error_report.
> > */
> > -VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize)
> > +VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize, VhostIOVATree *iova_tree)
> > {
> > size_t desc_size = sizeof(vring_desc_t) * qsize;
> > size_t device_size, driver_size;
> > @@ -656,6 +726,7 @@ VhostShadowVirtqueue *vhost_svq_new(uint16_t qsize)
> > memset(svq->vring.desc, 0, driver_size);
> > svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size);
> > memset(svq->vring.used, 0, device_size);
> > + svq->iova_tree = iova_tree;
> > svq->ring_id_maps = g_new0(VirtQueueElement *, qsize);
> > event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call);
> > return g_steal_pointer(&svq);
> > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> > index 0e5c00ed7e..276a559649 100644
> > --- a/hw/virtio/vhost-vdpa.c
> > +++ b/hw/virtio/vhost-vdpa.c
> > @@ -209,6 +209,18 @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener,
> > vaddr, section->readonly);
> >
> > llsize = int128_sub(llend, int128_make64(iova));
> > + if (v->shadow_vqs_enabled) {
> > + DMAMap mem_region = {
> > + .translated_addr = (hwaddr)vaddr,
> > + .size = int128_get64(llsize) - 1,
> > + .perm = IOMMU_ACCESS_FLAG(true, section->readonly),
> > + };
> > +
> > + int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
> > + assert(r == IOVA_OK);
>
>
> It's better to fail or warn here.
>
Sure, a warning is possible.
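Something like this (untested sketch; I still need to decide whether it
is enough to skip this mapping or if the whole listener update should be
aborted):

    int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region);
    if (unlikely(r != IOVA_OK)) {
        error_report("Can't allocate an IOVA range for the SVQ mapping");
        return;
    }

    iova = mem_region.iova;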
>
> > +
> > + iova = mem_region.iova;
> > + }
> >
> > vhost_vdpa_iotlb_batch_begin_once(v);
> > ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize),
> > @@ -261,6 +273,20 @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener,
> >
> > llsize = int128_sub(llend, int128_make64(iova));
> >
> > + if (v->shadow_vqs_enabled) {
> > + const DMAMap *result;
> > + const void *vaddr = memory_region_get_ram_ptr(section->mr) +
> > + section->offset_within_region +
> > + (iova - section->offset_within_address_space);
> > + DMAMap mem_region = {
> > + .translated_addr = (hwaddr)vaddr,
> > + .size = int128_get64(llsize) - 1,
> > + };
> > +
> > + result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region);
> > + iova = result->iova;
> > + vhost_iova_tree_remove(v->iova_tree, &mem_region);
> > + }
> > vhost_vdpa_iotlb_batch_begin_once(v);
> > ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize));
> > if (ret) {
> > @@ -783,33 +809,70 @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev,
> > /**
> > * Unmap SVQ area in the device
> > */
> > -static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova,
> > - hwaddr size)
> > +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v,
> > + const DMAMap *needle)
> > {
> > + const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
> > + hwaddr size;
> > int r;
> >
> > - size = ROUND_UP(size, qemu_real_host_page_size);
> > - r = vhost_vdpa_dma_unmap(v, iova, size);
> > + if (unlikely(!result)) {
> > + error_report("Unable to find SVQ address to unmap");
> > + return false;
> > + }
> > +
> > + size = ROUND_UP(result->size, qemu_real_host_page_size);
> > + r = vhost_vdpa_dma_unmap(v, result->iova, size);
> > return r == 0;
> > }
> >
> > static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
> > const VhostShadowVirtqueue *svq)
> > {
> > + DMAMap needle;
> > struct vhost_vdpa *v = dev->opaque;
> > struct vhost_vring_addr svq_addr;
> > - size_t device_size = vhost_svq_device_area_size(svq);
> > - size_t driver_size = vhost_svq_driver_area_size(svq);
> > bool ok;
> >
> > vhost_svq_get_vring_addr(svq, &svq_addr);
> >
> > - ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size);
> > + needle = (DMAMap) {
> > + .translated_addr = svq_addr.desc_user_addr,
> > + };
> > + ok = vhost_vdpa_svq_unmap_ring(v, &needle);
> > if (unlikely(!ok)) {
> > return false;
> > }
> >
> > - return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size);
> > + needle = (DMAMap) {
> > + .translated_addr = svq_addr.used_user_addr,
> > + };
> > + return vhost_vdpa_svq_unmap_ring(v, &needle);
> > +}
> > +
> > +/**
> > + * Map SVQ area in the device
> > + *
> > + * @v Vhost-vdpa device
> > + * @needle The area to search iova
> > + * @readonly Permissions of the area
> > + */
> > +static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, const DMAMap *needle,
> > + bool readonly)
> > +{
> > + hwaddr off;
> > + const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle);
> > + int r;
> > +
> > + if (unlikely(!result)) {
> > + error_report("Can't locate SVQ ring");
> > + return false;
> > + }
> > +
> > + off = needle->translated_addr - result->translated_addr;
> > + r = vhost_vdpa_dma_map(v, result->iova + off, needle->size,
> > + (void *)needle->translated_addr, readonly);
> > + return r == 0;
> > }
> >
> > /**
> > @@ -821,23 +884,29 @@ static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev,
> > static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev,
> > const VhostShadowVirtqueue *svq)
> > {
> > + DMAMap needle;
> > struct vhost_vdpa *v = dev->opaque;
> > struct vhost_vring_addr svq_addr;
> > size_t device_size = vhost_svq_device_area_size(svq);
> > size_t driver_size = vhost_svq_driver_area_size(svq);
> > - int r;
> > + bool ok;
> >
> > vhost_svq_get_vring_addr(svq, &svq_addr);
> >
> > - r = vhost_vdpa_dma_map(v, svq_addr.desc_user_addr, driver_size,
> > - (void *)svq_addr.desc_user_addr, true);
> > - if (unlikely(r != 0)) {
> > + needle = (DMAMap) {
> > + .translated_addr = svq_addr.desc_user_addr,
> > + .size = driver_size,
> > + };
> > + ok = vhost_vdpa_svq_map_ring(v, &needle, true);
> > + if (unlikely(!ok)) {
> > return false;
> > }
> >
> > - r = vhost_vdpa_dma_map(v, svq_addr.used_user_addr, device_size,
> > - (void *)svq_addr.used_user_addr, false);
> > - return r == 0;
> > + needle = (DMAMap) {
> > + .translated_addr = svq_addr.used_user_addr,
> > + .size = device_size,
> > + };
> > + return vhost_vdpa_svq_map_ring(v, &needle, false);
> > }
> >
> > static bool vhost_vdpa_svq_setup(struct vhost_dev *dev,
> > @@ -1006,6 +1075,23 @@ static int vhost_vdpa_set_owner(struct vhost_dev *dev)
> > return vhost_vdpa_call(dev, VHOST_SET_OWNER, NULL);
> > }
> >
> > +static bool vhost_vdpa_svq_get_vq_region(struct vhost_vdpa *v,
> > + unsigned long long addr,
> > + uint64_t *iova_addr)
> > +{
> > + const DMAMap needle = {
> > + .translated_addr = addr,
> > + };
> > + const DMAMap *translation = vhost_iova_tree_find_iova(v->iova_tree,
> > + &needle);
> > + if (!translation) {
> > + return false;
> > + }
> > +
> > + *iova_addr = translation->iova + (addr - translation->translated_addr);
> > + return true;
> > +}
> > +
> > static void vhost_vdpa_vq_get_guest_addr(struct vhost_vring_addr *addr,
> > struct vhost_virtqueue *vq)
> > {
> > @@ -1023,10 +1109,23 @@ static int vhost_vdpa_vq_get_addr(struct vhost_dev *dev,
> > assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_VDPA);
> >
> > if (v->shadow_vqs_enabled) {
> > + struct vhost_vring_addr svq_addr;
> > int idx = vhost_vdpa_get_vq_index(dev, addr->index);
> > VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, idx);
> >
> > - vhost_svq_get_vring_addr(svq, addr);
> > + vhost_svq_get_vring_addr(svq, &svq_addr);
> > + if (!vhost_vdpa_svq_get_vq_region(v, svq_addr.desc_user_addr,
> > + &addr->desc_user_addr)) {
> > + return -1;
> > + }
> > + if (!vhost_vdpa_svq_get_vq_region(v, svq_addr.avail_user_addr,
> > + &addr->avail_user_addr)) {
> > + return -1;
> > + }
> > + if (!vhost_vdpa_svq_get_vq_region(v, svq_addr.used_user_addr,
> > + &addr->used_user_addr)) {
> > + return -1;
> > + }
> > } else {
> > vhost_vdpa_vq_get_guest_addr(addr, vq);
> > }
> > @@ -1095,13 +1194,37 @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v,
> >
> > shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_psvq_free);
> > for (unsigned n = 0; n < hdev->nvqs; ++n) {
> > - VhostShadowVirtqueue *svq = vhost_svq_new(qsize);
> > -
> > + DMAMap device_region, driver_region;
> > + struct vhost_vring_addr addr;
> > + VhostShadowVirtqueue *svq = vhost_svq_new(qsize, v->iova_tree);
> > if (unlikely(!svq)) {
> > error_setg(errp, "Cannot create svq %u", n);
> > return -1;
> > }
> > - g_ptr_array_add(v->shadow_vqs, svq);
> > +
> > + vhost_svq_get_vring_addr(svq, &addr);
> > + driver_region = (DMAMap) {
> > + .translated_addr = (hwaddr)addr.desc_user_addr,
> > +
> > + /*
> > + * DMAMap.size includes the last byte included in the range, while
> > + * sizeof marks one past it. Subtract one byte to make them match.
> > + */
> > + .size = vhost_svq_driver_area_size(svq) - 1,
> > + .perm = VHOST_ACCESS_RO,
> > + };
> > + device_region = (DMAMap) {
> > + .translated_addr = (hwaddr)addr.used_user_addr,
> > + .size = vhost_svq_device_area_size(svq) - 1,
> > + .perm = VHOST_ACCESS_RW,
> > + };
> > +
> > + r = vhost_iova_tree_map_alloc(v->iova_tree, &driver_region);
> > + assert(r == IOVA_OK);
>
>
> Let's fail instead of assert here.
>
Sure, I see that we must not assert here either.
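For the next revision I have something like this in mind (rough sketch;
the exact cleanup depends on how the final error path of
vhost_vdpa_init_svq ends up, so take the err label as a placeholder):

    r = vhost_iova_tree_map_alloc(v->iova_tree, &driver_region);
    if (unlikely(r != IOVA_OK)) {
        error_setg(errp, "Cannot allocate IOVA for SVQ %u driver area", n);
        goto err;
    }

    r = vhost_iova_tree_map_alloc(v->iova_tree, &device_region);
    if (unlikely(r != IOVA_OK)) {
        error_setg(errp, "Cannot allocate IOVA for SVQ %u device area", n);
        goto err;
    }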
Thanks!
> Thanks
>
>
> > + r = vhost_iova_tree_map_alloc(v->iova_tree, &device_region);
> > + assert(r == IOVA_OK);
> > +
> > + g_ptr_array_add(shadow_vqs, svq);
> > }
> >
> > out:
>