On Mon, Oct 27, 2025 at 08:24:37AM +0100, Tian, Kevin wrote:
> > From: Winiarski, Michal <[email protected]>
> > Sent: Wednesday, October 22, 2025 6:42 AM
> > +
> > +/**
> > + * struct xe_vfio_pci_migration_file - file used for reading / writing migration data
> > + */
> 
> let's use the comment style in vfio, i.e. "/*" instead of "/**"

It's kernel-doc format (it's also used in vfio in some places).
I'll drop it though, because of the comments below.

> 
> > +struct xe_vfio_pci_migration_file {
> > +   /** @filp: pointer to underlying &struct file */
> > +   struct file *filp;
> > +   /** @lock: serializes accesses to migration data */
> > +   struct mutex lock;
> > +   /** @xe_vdev: backpointer to &struct xe_vfio_pci_core_device */
> > +   struct xe_vfio_pci_core_device *xe_vdev;
> 
> above comments are obvious...

Ok - will keep it simple and drop the obvious ones.

> 
> > +struct xe_vfio_pci_core_device {
> > +   /** @core_device: vendor-agnostic VFIO device */
> > +   struct vfio_pci_core_device core_device;
> > +
> > +   /** @mig_state: current device migration state */
> > +   enum vfio_device_mig_state mig_state;
> > +
> > +   /** @vfid: VF number used by PF, xe uses 1-based indexing for vfid */
> > +   unsigned int vfid;
> 
> is 1-based indexing a sw or hw requirement?

HW/FW components use 1-based indexing.
I'll update the comment to state that.
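Something along these lines (exact wording to be finalized):

        /* @vfid: VF number used by PF; 1-based, matching the indexing used by HW/FW components */
        unsigned int vfid;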

> 
> > +
> > +   /** @pf: pointer to driver_private of physical function */
> > +   struct pci_dev *pf;
> > +
> > +   /** @fd: &struct xe_vfio_pci_migration_file for userspace to read/write migration data */
> > +   struct xe_vfio_pci_migration_file *fd;
> 
> s/fd/migf/, as 'fd' is integer in all other places

Ok.

> 
> btw it's risky w/o a lock protecting the state transition. See the usage of
> state_mutex in other migration drivers.

It's a gap - I'll introduce a state_mutex.
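For the struct, something like:

        /* @state_mutex: protects migration state transitions */
        struct mutex state_mutex;

and the .set_device_state callback would then follow the usual pattern from the
other migration drivers - rough sketch only (the callback isn't quoted here, and
xe_vfio_pci_set_device_state is just my placeholder name for it):

static struct file *
xe_vfio_pci_set_device_state(struct vfio_device *core_vdev,
                             enum vfio_device_mig_state new_state)
{
        struct xe_vfio_pci_core_device *xe_vdev =
                container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
        struct file *res = NULL;
        int ret;

        mutex_lock(&xe_vdev->state_mutex);
        while (new_state != xe_vdev->mig_state) {
                enum vfio_device_mig_state next_state;

                ret = vfio_mig_get_next_state(core_vdev, xe_vdev->mig_state,
                                              new_state, &next_state);
                if (ret) {
                        res = ERR_PTR(ret);
                        break;
                }

                res = xe_vfio_set_state(xe_vdev, next_state);
                if (IS_ERR(res))
                        break;

                xe_vdev->mig_state = next_state;
                if (WARN_ON(res && new_state != xe_vdev->mig_state)) {
                        /* a multi-step transition can't return data mid-way */
                        fput(res);
                        res = ERR_PTR(-EINVAL);
                        break;
                }
        }
        mutex_unlock(&xe_vdev->state_mutex);

        return res;
}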

> 
> > +static void xe_vfio_pci_reset_done(struct pci_dev *pdev)
> > +{
> > +   struct xe_vfio_pci_core_device *xe_vdev = pci_get_drvdata(pdev);
> > +   int ret;
> > +
> > +   ret = xe_sriov_vfio_wait_flr_done(xe_vdev->pf, xe_vdev->vfid);
> > +   if (ret)
> > +           dev_err(&pdev->dev, "Failed to wait for FLR: %d\n", ret);
> 
> why is there a device specific wait for flr done? suppose it's already
> covered by pci core...

No, unfortunately some of the state is cleared by the PF driver after
PCI-level VF FLR is already done.
More info on that is available in patch 23:
"VF FLR requires additional processing done by PF driver.
The processing is done after FLR is already finished from PCIe
perspective.
In order to avoid a scenario where migration state transitions while
PF processing is still in progress, an additional synchronization
point is needed.
Add a helper that will be used as part of VF driver struct
pci_error_handlers .reset_done() callback."

I'll add a comment, so that it's available here as well.
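Something like:

        /*
         * VF FLR requires additional processing done by the PF driver, and that
         * processing happens after the FLR is already finished from PCIe
         * perspective. Wait for it, so that migration state doesn't transition
         * while PF processing is still in progress.
         */
        ret = xe_sriov_vfio_wait_flr_done(xe_vdev->pf, xe_vdev->vfid);
        if (ret)
                dev_err(&pdev->dev, "Failed to wait for FLR: %d\n", ret);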

> 
> > +
> > +   xe_vfio_pci_reset(xe_vdev);
> > +}
> > +
> > +static const struct pci_error_handlers xe_vfio_pci_err_handlers = {
> > +   .reset_done = xe_vfio_pci_reset_done,
> > +};
> 
> missing ".error_detected "

Ok. I'll use the generic one - vfio_pci_core_aer_err_detected().
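i.e.:

static const struct pci_error_handlers xe_vfio_pci_err_handlers = {
        .reset_done = xe_vfio_pci_reset_done,
        .error_detected = vfio_pci_core_aer_err_detected,
};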

> 
> > +static struct xe_vfio_pci_migration_file *
> > +xe_vfio_pci_alloc_file(struct xe_vfio_pci_core_device *xe_vdev,
> > +                  enum xe_vfio_pci_file_type type)
> > +{
> > +   struct xe_vfio_pci_migration_file *migf;
> > +   const struct file_operations *fops;
> > +   int flags;
> > +
> > +   migf = kzalloc(sizeof(*migf), GFP_KERNEL);
> > +   if (!migf)
> > +           return ERR_PTR(-ENOMEM);
> > +
> > +   fops = type == XE_VFIO_FILE_SAVE ? &xe_vfio_pci_save_fops : &xe_vfio_pci_resume_fops;
> > +   flags = type == XE_VFIO_FILE_SAVE ? O_RDONLY : O_WRONLY;
> > +   migf->filp = anon_inode_getfile("xe_vfio_mig", fops, migf, flags);
> > +   if (IS_ERR(migf->filp)) {
> > +           kfree(migf);
> > +           return ERR_CAST(migf->filp);
> > +   }
> > +
> > +   mutex_init(&migf->lock);
> > +   migf->xe_vdev = xe_vdev;
> > +   xe_vdev->fd = migf;
> > +
> > +   stream_open(migf->filp->f_inode, migf->filp);
> > +
> > +   return migf;
> 
> miss a get_file(). vfio core will do another fput() upon error.
> 
> see vfio_ioct_mig_return_fd()

Ok. I'll take a ref on both the STOP_COPY and RESUMING transitions.
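For the STOP -> STOP_COPY arc, roughly the following (and the same for
STOP -> RESUMING):

        migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_SAVE);
        if (IS_ERR(migf)) {
                ret = PTR_ERR(migf);
                goto err;
        }

        ret = xe_sriov_vfio_stop_copy_enter(xe_vdev->pf, xe_vdev->vfid);
        if (ret) {
                fput(migf->filp);
                goto err;
        }

        /*
         * Extra reference for the file handed back to the core;
         * vfio_ioct_mig_return_fd() drops it again on error, while the
         * driver keeps its own reference via xe_vdev->migf.
         */
        get_file(migf->filp);

        return migf->filp;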

> 
> > +}
> > +
> > +static struct file *
> > +xe_vfio_set_state(struct xe_vfio_pci_core_device *xe_vdev, u32 new)
> > +{
> > +   u32 cur = xe_vdev->mig_state;
> > +   int ret;
> > +
> > +   dev_dbg(xe_vdev_to_dev(xe_vdev),
> > +           "state: %s->%s\n", vfio_dev_state_str(cur),
> > vfio_dev_state_str(new));
> > +
> > +   /*
> > +    * "STOP" handling is reused for "RUNNING_P2P", as the device doesn't have
> > +    * the capability to selectively block p2p DMA transfers.
> > +    * The device is not processing new workload requests when the VF is stopped,
> > +    * and both memory and MMIO communication channels are transferred to
> > +    * destination (where processing will be resumed).
> > +    */
> > +   if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_STOP) ||
> 
> this is not required when P2P is supported. vfio_mig_get_next_state() will
> find the right arc from RUNNING to RUNNING_P2P to STOP.

I'll remove both arcs (RUNNING -> STOP, STOP -> RUNNING).

> 
> > +       (cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
> > +           ret = xe_sriov_vfio_stop(xe_vdev->pf, xe_vdev->vfid);
> > +           if (ret)
> > +                   goto err;
> > +
> > +           return NULL;
> > +   }
> 
> better to align with other drivers, s/stop/suspend/ and s/run/resume/

This would collide with resume_enter / resume_exit, which are used for the
actual migration data loading.
I'll use suspend_device / resume_device, and resume_data_enter /
resume_data_exit.
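The PF interface would then look along these lines (return types guessed from
how the current helpers are called, so treat it as a sketch):

int xe_sriov_vfio_suspend_device(struct pci_dev *pf, unsigned int vfid);
int xe_sriov_vfio_resume_device(struct pci_dev *pf, unsigned int vfid);
int xe_sriov_vfio_stop_copy_enter(struct pci_dev *pf, unsigned int vfid);
void xe_sriov_vfio_stop_copy_exit(struct pci_dev *pf, unsigned int vfid);
int xe_sriov_vfio_resume_data_enter(struct pci_dev *pf, unsigned int vfid);
void xe_sriov_vfio_resume_data_exit(struct pci_dev *pf, unsigned int vfid);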

> 
> > +
> > +   if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) ||
> > +       (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P))
> > +           return NULL;
> > +
> > +   if ((cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING) ||
> > +       (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING)) {
> > +           ret = xe_sriov_vfio_run(xe_vdev->pf, xe_vdev->vfid);
> > +           if (ret)
> > +                   goto err;
> > +
> > +           return NULL;
> > +   }
> > +
> > +   if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
> > +           struct xe_vfio_pci_migration_file *migf;
> > +
> > +           migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_SAVE);
> > +           if (IS_ERR(migf)) {
> > +                   ret = PTR_ERR(migf);
> > +                   goto err;
> > +           }
> > +
> > +           ret = xe_sriov_vfio_stop_copy_enter(xe_vdev->pf, xe_vdev->vfid);
> > +           if (ret) {
> > +                   fput(migf->filp);
> > +                   goto err;
> > +           }
> > +
> > +           return migf->filp;
> > +   }
> > +
> > +   if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP)) {
> > +           if (xe_vdev->fd)
> > +                   xe_vfio_pci_disable_file(xe_vdev->fd);
> > +
> > +           xe_sriov_vfio_stop_copy_exit(xe_vdev->pf, xe_vdev->vfid);
> > +
> > +           return NULL;
> > +   }
> > +
> > +   if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
> > +           struct xe_vfio_pci_migration_file *migf;
> > +
> > +           migf = xe_vfio_pci_alloc_file(xe_vdev, XE_VFIO_FILE_RESUME);
> > +           if (IS_ERR(migf)) {
> > +                   ret = PTR_ERR(migf);
> > +                   goto err;
> > +           }
> > +
> > +           ret = xe_sriov_vfio_resume_enter(xe_vdev->pf, xe_vdev->vfid);
> > +           if (ret) {
> > +                   fput(migf->filp);
> > +                   goto err;
> > +           }
> > +
> > +           return migf->filp;
> > +   }
> > +
> > +   if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
> > +           if (xe_vdev->fd)
> > +                   xe_vfio_pci_disable_file(xe_vdev->fd);
> > +
> > +           xe_sriov_vfio_resume_exit(xe_vdev->pf, xe_vdev->vfid);
> > +
> > +           return NULL;
> > +   }
> > +
> > +   if (new == VFIO_DEVICE_STATE_ERROR)
> > +           xe_sriov_vfio_error(xe_vdev->pf, xe_vdev->vfid);
> 
> the ERROR state is not passed to the variant driver. You'll get -EINVAL 
> from vfio_mig_get_next_state(). so this is dead code.
> 
> If the pf driver needs to be notified, you could check the ret value instead.

Ok. I'll do that instead.
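i.e. notify the PF driver on the error path instead - roughly (the err: label
body isn't quoted above, so just a sketch):

err:
        xe_sriov_vfio_error(xe_vdev->pf, xe_vdev->vfid);
        return ERR_PTR(ret);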

> 
> > +static void xe_vfio_pci_migration_init(struct vfio_device *core_vdev)
> > +{
> > +   struct xe_vfio_pci_core_device *xe_vdev =
> > +           container_of(core_vdev, struct xe_vfio_pci_core_device, core_device.vdev);
> > +   struct pci_dev *pdev = to_pci_dev(core_vdev->dev);
> > +
> > +   if (!xe_sriov_vfio_migration_supported(pdev->physfn))
> > +           return;
> > +
> > +   /* vfid starts from 1 for xe */
> > +   xe_vdev->vfid = pci_iov_vf_id(pdev) + 1;
> 
> pci_iov_vf_id() returns error if it's not vf. should be checked.

Ok.
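Roughly (with vfid as a new local int in xe_vfio_pci_migration_init()):

        vfid = pci_iov_vf_id(pdev);
        if (vfid < 0)
                return;

        /* vfid starts from 1 for xe */
        xe_vdev->vfid = vfid + 1;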

> 
> > +static int xe_vfio_pci_init_dev(struct vfio_device *core_vdev)
> > +{
> > +   struct pci_dev *pdev = to_pci_dev(core_vdev->dev);
> > +
> > +   if (pdev->is_virtfn && strcmp(pdev->physfn->dev.driver->name, "xe") == 0)
> > +           xe_vfio_pci_migration_init(core_vdev);
> 
> I didn't see the point of checking the driver name.

The driver name check will go away after transitioning to xe_pci_get_pf_xe_device().

> 
> > +
> > +MODULE_LICENSE("GPL");
> > +MODULE_AUTHOR("Intel Corporation");
> 
> please use the author name, as other drivers do

Ok.

Thanks,
-Michał
