There's generally two modes of operation for IOMMUFD:

* The simple user API which intends to perform relatively simple things
with IOMMUs e.g. DPDK. It generally creates an IOAS and attach to VFIO
and mainly performs IOAS_MAP and UNMAP.

* The native IOMMUFD API where you have fine grained control of the
IOMMU domain and model it accordingly. This is where most new feature
are being steered to.

For dirty tracking 2) is required, as it needs to ensure that
the stage-2/parent IOMMU domain will only attach devices
that support dirty tracking (so far it is all homogeneous in x86, likely
not the case for smmuv3). Such invariant on dirty tracking provides a
useful guarantee to VMMs that will refuse incompatible device
attachments for IOMMU domains.

For dirty tracking such property is enabled/enforced via HWPT_ALLOC,
which is responsible for creating an IOMMU domain. This is contrast to
the 'simple API' where the IOMMU domain is created by IOMMUFD
automatically when it attaches to VFIO (usually referred as autodomains)

To support dirty tracking with the advanced IOMMUFD API, it needs
similar logic, where IOMMU domains are created and devices attached to
compatible domains. Essentially mimmicing kernel
iommufd_device_auto_get_domain(). If this fails (i.e. mdevs) it falls back
to IOAS attach.

Signed-off-by: Joao Martins <[email protected]>
---
Right now the only alternative to a userspace autodomains implementation
is to mimmicing all the flags being added to HWPT_ALLOC but into VFIO
IOAS attach. So opted for autodomains userspace approach to avoid the
duplication of hwpt-alloc flags vs attach-ioas flags. I lack mdev real
drivers atm, so testing with those is still TBD.

Opinions, comments, welcome!
---
 backends/iommufd.c            | 29 +++++++++++++
 backends/trace-events         |  1 +
 hw/vfio/iommufd.c             | 78 +++++++++++++++++++++++++++++++++++
 include/hw/vfio/vfio-common.h |  9 ++++
 include/sysemu/iommufd.h      |  4 ++
 5 files changed, 121 insertions(+)

diff --git a/backends/iommufd.c b/backends/iommufd.c
index 8486894f1b3f..2970135af4b9 100644
--- a/backends/iommufd.c
+++ b/backends/iommufd.c
@@ -211,6 +211,35 @@ int iommufd_backend_unmap_dma(IOMMUFDBackend *be, uint32_t 
ioas_id,
     return ret;
 }
 
+int iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id,
+                               uint32_t pt_id, uint32_t flags,
+                               uint32_t data_type, uint32_t data_len,
+                               void *data_ptr, uint32_t *out_hwpt)
+{
+    int ret;
+    struct iommu_hwpt_alloc alloc_hwpt = {
+        .size = sizeof(struct iommu_hwpt_alloc),
+        .flags = flags,
+        .dev_id = dev_id,
+        .pt_id = pt_id,
+        .data_type = data_type,
+        .data_len = data_len,
+        .data_uptr = (uint64_t)data_ptr,
+        .__reserved = 0,
+    };
+
+    ret = ioctl(iommufd, IOMMU_HWPT_ALLOC, &alloc_hwpt);
+    trace_iommufd_backend_alloc_hwpt(iommufd, dev_id, pt_id, flags, data_type,
+                                     data_len, (uint64_t)data_ptr,
+                                     alloc_hwpt.out_hwpt_id, ret);
+    if (ret) {
+        error_report("IOMMU_HWPT_ALLOC failed: %m");
+    } else {
+        *out_hwpt = alloc_hwpt.out_hwpt_id;
+    }
+    return !ret ? 0 : -errno;
+}
+
 static const TypeInfo iommufd_backend_info = {
     .name = TYPE_IOMMUFD_BACKEND,
     .parent = TYPE_OBJECT,
diff --git a/backends/trace-events b/backends/trace-events
index d45c6e31a67e..f83a276a4253 100644
--- a/backends/trace-events
+++ b/backends/trace-events
@@ -13,5 +13,6 @@ iommu_backend_set_fd(int fd) "pre-opened /dev/iommu fd=%d"
 iommufd_backend_map_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t 
size, void *vaddr, bool readonly, int ret) " iommufd=%d ioas=%d 
iova=0x%"PRIx64" size=0x%"PRIx64" addr=%p readonly=%d (%d)"
 iommufd_backend_unmap_dma_non_exist(int iommufd, uint32_t ioas, uint64_t iova, 
uint64_t size, int ret) " Unmap nonexistent mapping: iommufd=%d ioas=%d 
iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
 iommufd_backend_unmap_dma(int iommufd, uint32_t ioas, uint64_t iova, uint64_t 
size, int ret) " iommufd=%d ioas=%d iova=0x%"PRIx64" size=0x%"PRIx64" (%d)"
+iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id, uint32_t pt_id, 
uint32_t flags, uint32_t hwpt_type, uint32_t len, uint64_t data_ptr, uint32_t 
out_hwpt_id, int ret) " iommufd=%d dev_id=%u pt_id=%u flags=0x%x hwpt_type=%u 
len=%u data_ptr=0x%"PRIx64" out_hwpt=%u (%d)"
 iommufd_backend_alloc_ioas(int iommufd, uint32_t ioas, int ret) " iommufd=%d 
ioas=%d (%d)"
 iommufd_backend_free_id(int iommufd, uint32_t id, int ret) " iommufd=%d id=%d 
(%d)"
diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index 7d39d7a5fa51..ca7ec45e725c 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -219,10 +219,82 @@ static int iommufd_cdev_detach_ioas_hwpt(VFIODevice 
*vbasedev, Error **errp)
     return ret;
 }
 
+static int iommufd_cdev_autodomains_get(VFIODevice *vbasedev,
+                                        VFIOIOMMUFDContainer *container,
+                                        Error **errp)
+{
+    int iommufd = vbasedev->iommufd_dev.iommufd->fd;
+    VFIOIOASHwpt *hwpt;
+    Error *err = NULL;
+    int ret = -EINVAL;
+    uint32_t hwpt_id;
+
+    /* Try to find a domain */
+    QLIST_FOREACH(hwpt, &container->hwpt_list, next) {
+        ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, &err);
+        if (ret) {
+            /* -EINVAL means the domain is incompatible with the device. */
+            if (ret == -EINVAL) {
+                continue;
+            }
+            return ret;
+        } else {
+            vbasedev->hwpt = hwpt;
+            QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
+            return 0;
+        }
+    }
+
+    ret = iommufd_backend_alloc_hwpt(iommufd,
+                                     vbasedev->iommufd_dev.devid,
+                                     container->ioas_id, 0, 0, 0,
+                                     NULL, &hwpt_id);
+    if (ret) {
+        error_append_hint(&err,
+                   "Failed to allocate HWPT for device %s. Fallback to IOAS 
attach\n",
+                   vbasedev->name);
+        warn_report_err(err);
+        return ret;
+    }
+
+    hwpt = g_malloc0(sizeof(*hwpt));
+    hwpt->hwpt_id = hwpt_id;
+    QLIST_INIT(&hwpt->device_list);
+
+    ret = iommufd_cdev_attach_ioas_hwpt(vbasedev, hwpt->hwpt_id, &err);
+    if (ret) {
+        iommufd_backend_free_id(container->be, hwpt->hwpt_id);
+        g_free(hwpt);
+        return ret;
+    }
+
+    vbasedev->hwpt = hwpt;
+    QLIST_INSERT_HEAD(&hwpt->device_list, vbasedev, hwpt_next);
+    QLIST_INSERT_HEAD(&container->hwpt_list, hwpt, next);
+    return 0;
+}
+
+static void iommufd_cdev_autodomains_put(VFIODevice *vbasedev,
+                                         VFIOIOMMUFDContainer *container)
+{
+    VFIOIOASHwpt *hwpt = vbasedev->hwpt;
+
+    QLIST_REMOVE(vbasedev, hwpt_next);
+    QLIST_REMOVE(hwpt, next);
+    if (QLIST_EMPTY(&hwpt->device_list)) {
+        iommufd_backend_free_id(container->be, hwpt->hwpt_id);
+        g_free(hwpt);
+    }
+}
+
 static int iommufd_cdev_attach_container(VFIODevice *vbasedev,
                                          VFIOIOMMUFDContainer *container,
                                          Error **errp)
 {
+    if (!iommufd_cdev_autodomains_get(vbasedev, container, errp)) {
+        return 0;
+    }
+
     return iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp);
 }
 
@@ -231,6 +303,11 @@ static void iommufd_cdev_detach_container(VFIODevice 
*vbasedev,
 {
     Error *err = NULL;
 
+    if (vbasedev->hwpt) {
+        iommufd_cdev_autodomains_put(vbasedev, container);
+        return;
+    }
+
     if (iommufd_cdev_detach_ioas_hwpt(vbasedev, &err)) {
         error_report_err(err);
     }
@@ -370,6 +447,7 @@ static int iommufd_cdev_attach(const char *name, VFIODevice 
*vbasedev,
     container = g_malloc0(sizeof(*container));
     container->be = vbasedev->iommufd_dev.iommufd;
     container->ioas_id = ioas_id;
+    QLIST_INIT(&container->hwpt_list);
 
     bcontainer = &container->bcontainer;
     vfio_container_init(bcontainer, space, iommufd_vioc);
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 9c4b60c906d9..7f7d823221e2 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -93,10 +93,17 @@ typedef struct VFIOHostDMAWindow {
 
 typedef struct IOMMUFDBackend IOMMUFDBackend;
 
+typedef struct VFIOIOASHwpt {
+    uint32_t hwpt_id;
+    QLIST_HEAD(, VFIODevice) device_list;
+    QLIST_ENTRY(VFIOIOASHwpt) next;
+} VFIOIOASHwpt;
+
 typedef struct VFIOIOMMUFDContainer {
     VFIOContainerBase bcontainer;
     IOMMUFDBackend *be;
     uint32_t ioas_id;
+    QLIST_HEAD(, VFIOIOASHwpt) hwpt_list;
 } VFIOIOMMUFDContainer;
 
 /* Abstraction of host IOMMU legacy device */
@@ -136,6 +143,8 @@ typedef struct VFIODevice {
         IOMMULegacyDevice legacy_dev;
         IOMMUFDDevice iommufd_dev;
     };
+    VFIOIOASHwpt *hwpt;
+    QLIST_ENTRY(VFIODevice) hwpt_next;
 } VFIODevice;
 
 QEMU_BUILD_BUG_ON(offsetof(VFIODevice, legacy_dev.base) !=
diff --git a/include/sysemu/iommufd.h b/include/sysemu/iommufd.h
index 4afe97307dbe..1966b75caae2 100644
--- a/include/sysemu/iommufd.h
+++ b/include/sysemu/iommufd.h
@@ -49,4 +49,8 @@ typedef struct IOMMUFDDevice {
 void iommufd_device_init(IOMMUFDDevice *idev);
 int iommufd_device_get_hw_capabilities(IOMMUFDDevice *idev, uint64_t *caps,
                                        Error **errp);
+int iommufd_backend_alloc_hwpt(int iommufd, uint32_t dev_id,
+                               uint32_t pt_id, uint32_t flags,
+                               uint32_t data_type, uint32_t data_len,
+                               void *data_ptr, uint32_t *out_hwpt);
 #endif
-- 
2.39.3


Reply via email to