On 6/25/2025 7:40 AM, Duan, Zhenzhong wrote:
-----Original Message-----
From: Steve Sistare <steven.sist...@oracle.com>
Subject: [PATCH V5 33/38] vfio/iommufd: reconstruct device

Reconstruct userland device state after CPR.  During vfio_realize, skip all
ioctls that configure the device, as it was already configured in old QEMU.

Skip bind, and use the devid from CPR state.

Skip allocation of, and attachment to, ioas_id.  Recover ioas_id from CPR
state, and use it to find a matching container, if any, before creating a
new one.

This reconstruction is not complete.  hwpt_id is handled in a subsequent
patch.

Signed-off-by: Steve Sistare <steven.sist...@oracle.com>
---
hw/vfio/iommufd.c | 30 ++++++++++++++++++++++++++++--
1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
index f0d57ea..a650517 100644
--- a/hw/vfio/iommufd.c
+++ b/hw/vfio/iommufd.c
@@ -25,6 +25,7 @@
#include "system/reset.h"
#include "qemu/cutils.h"
#include "qemu/chardev_open.h"
+#include "migration/cpr.h"
#include "pci.h"
#include "vfio-iommufd.h"
#include "vfio-helpers.h"
@@ -121,6 +122,10 @@ static bool
iommufd_cdev_connect_and_bind(VFIODevice *vbasedev, Error **errp)
         goto err_kvm_device_add;
     }

+    if (cpr_is_incoming()) {
+        goto skip_bind;
+    }
+
     /* Bind device to iommufd */
     bind.iommufd = iommufd->fd;
     if (ioctl(vbasedev->fd, VFIO_DEVICE_BIND_IOMMUFD, &bind)) {
@@ -132,6 +137,8 @@ static bool
iommufd_cdev_connect_and_bind(VFIODevice *vbasedev, Error **errp)
     vbasedev->devid = bind.out_devid;
     trace_iommufd_cdev_connect_and_bind(bind.iommufd, vbasedev->name,
                                         vbasedev->fd, vbasedev->devid);
+
+skip_bind:

I'm not sure if we should take above trace for CPR..

My thinking is: on cpr, we do not connect or bind, so we should not log it.
iommufd_backend_connect() is called but it just reuses a cpr fd, and we
can observe the latter with cpr traces.

     return true;
err_bind:
     iommufd_cdev_kvm_device_del(vbasedev);
@@ -421,7 +428,9 @@ static bool iommufd_cdev_attach_container(VFIODevice
*vbasedev,
         return iommufd_cdev_autodomains_get(vbasedev, container, errp);
     }

-    return !iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp);
+    /* If CPR, we are already attached to ioas_id. */
+    return cpr_is_incoming() ||
+           !iommufd_cdev_attach_ioas_hwpt(vbasedev, container->ioas_id, errp);
}

static void iommufd_cdev_detach_container(VFIODevice *vbasedev,
@@ -510,6 +519,7 @@ static bool iommufd_cdev_attach(const char *name,
VFIODevice *vbasedev,
     VFIOAddressSpace *space;
     struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
     int ret, devfd;
+    bool res;
     uint32_t ioas_id;
     Error *err = NULL;
     const VFIOIOMMUClass *iommufd_vioc =
@@ -540,7 +550,16 @@ static bool iommufd_cdev_attach(const char *name,
VFIODevice *vbasedev,
             vbasedev->iommufd != container->be) {
             continue;
         }
-        if (!iommufd_cdev_attach_container(vbasedev, container, &err)) {
+
+        if (!cpr_is_incoming()) {
+            res = iommufd_cdev_attach_container(vbasedev, container, &err);
+        } else if (vbasedev->cpr.ioas_id == container->ioas_id) {
+            res = true;
+        } else {
+            continue;
+        }
+
+        if (!res) {
             const char *msg = error_get_pretty(err);

             trace_iommufd_cdev_fail_attach_existing_container(msg);
@@ -557,6 +576,11 @@ static bool iommufd_cdev_attach(const char *name,
VFIODevice *vbasedev,
         }
     }

+    if (cpr_is_incoming()) {
+        ioas_id = vbasedev->cpr.ioas_id;
+        goto skip_ioas_alloc;
+    }
+
     /* Need to allocate a new dedicated container */
     if (!iommufd_backend_alloc_ioas(vbasedev->iommufd, &ioas_id, errp)) {
         goto err_alloc_ioas;
@@ -564,10 +588,12 @@ static bool iommufd_cdev_attach(const char *name,
VFIODevice *vbasedev,

     trace_iommufd_cdev_alloc_ioas(vbasedev->iommufd->fd, ioas_id);

+skip_ioas_alloc:

Same here, others look good.

During cpr, we do not allocate a new ioas, we use the one from cpr state.
I think it would be confusing to print a trace that suggests we allocated
a new ioas.

Perhaps I should add a trace in vfio_cpr_find_device:

    trace_vfio_cpr_find_device(elem->ioas_id, elem->dev_id, elem->hwpt_id)

- Steve

     container =
VFIO_IOMMU_IOMMUFD(object_new(TYPE_VFIO_IOMMU_IOMMUFD));
     container->be = vbasedev->iommufd;
     container->ioas_id = ioas_id;
     QLIST_INIT(&container->hwpt_list);
+    vbasedev->cpr.ioas_id = ioas_id;

     bcontainer = &container->bcontainer;
     vfio_address_space_insert(space, bcontainer);
--
1.8.3.1



Reply via email to