On Tue, Nov 11, 2025 at 05:43:54PM +0100, Thomas Hellström wrote:
> If a device holds a reference on a foreign device's drm_pagemap,
> and a device unbind is executed on the foreign device,
> typically that foreign device would evict its device-private
> pages and then continue its device-managed cleanup eventually
> releasing its drm device and possibly allow for module unload.
> However, since we're still holding a reference on a drm_pagemap,
> when that reference is released and the provider module is
> unloaded we'd execute out of undefined memory.
> 
> Therefore keep a reference on the provider device and module until
> the last drm_pagemap reference is gone.
> 
> Note that in theory, the drm_gpusvm_helper module may be unloaded
> as soon as the final module_put() of the provider driver module is
> executed, so we need to add a module_exit() function that waits
> until the work item executing the module_put() has completed.
> 
> v2:
> - Better commit message (Matt Brost)
> 
> Signed-off-by: Thomas Hellström <[email protected]>

Reviewed-by: Matthew Brost <[email protected]>

> ---
>  drivers/gpu/drm/drm_pagemap.c | 101 ++++++++++++++++++++++++++++++++--
>  drivers/gpu/drm/xe/xe_svm.c   |  15 ++++-
>  include/drm/drm_pagemap.h     |  10 +++-
>  3 files changed, 117 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_pagemap.c b/drivers/gpu/drm/drm_pagemap.c
> index 173b3ecb07d5..fb18a80d6a1c 100644
> --- a/drivers/gpu/drm/drm_pagemap.c
> +++ b/drivers/gpu/drm/drm_pagemap.c
> @@ -8,6 +8,7 @@
>  #include <linux/pagemap.h>
>  #include <drm/drm_drv.h>
>  #include <drm/drm_pagemap.h>
> +#include <drm/drm_print.h>
>  
>  /**
>   * DOC: Overview
> @@ -544,16 +545,92 @@ static int drm_pagemap_migrate_populate_ram_pfn(struct 
> vm_area_struct *vas,
>       return -ENOMEM;
>  }
>  
> +static void drm_pagemap_dev_unhold_work(struct work_struct *work);
> +static LLIST_HEAD(drm_pagemap_unhold_list);
> +static DECLARE_WORK(drm_pagemap_work, drm_pagemap_dev_unhold_work);
> +
> +/**
> + * struct drm_pagemap_dev_hold - Struct to aid in drm_device release.
> + * @link: Link into drm_pagemap_unhold_list for deferred reference releases.
> + * @drm: drm device to put.
> + *
> + * When a struct drm_pagemap is released, we also need to release the
> + * reference it holds on the drm device. However, typically that needs
> + * to be done separately from a system-wide workqueue.
> + * Each time a struct drm_pagemap is initialized
> + * (or re-initialized if cached) therefore allocate a separate
> + * drm_pagemap_dev_hold item, from which we put the drm device and
> + * associated module.
> + */
> +struct drm_pagemap_dev_hold {
> +     struct llist_node link;
> +     struct drm_device *drm;
> +};
> +
>  static void drm_pagemap_release(struct kref *ref)
>  {
>       struct drm_pagemap *dpagemap = container_of(ref, typeof(*dpagemap), 
> ref);
> -
> +     struct drm_pagemap_dev_hold *dev_hold = dpagemap->dev_hold;
> +
> +     /*
> +      * We know the pagemap provider is alive at this point, since
> +      * the struct drm_pagemap_dev_hold holds a reference to the
> +      * pagemap provider drm_device and its module.
> +      */
> +     dpagemap->dev_hold = NULL;
>       kfree(dpagemap);
> +     llist_add(&dev_hold->link, &drm_pagemap_unhold_list);
> +     schedule_work(&drm_pagemap_work);
> +     /*
> +      * Here, either the provider device is still alive, since if called from
> +      * page_free(), the caller is holding a reference on the dev_pagemap,
> +      * or if called from drm_pagemap_put(), the direct caller is still 
> alive.
> +      * This ensures we can't race with THIS module unload.
> +      */
> +}
> +
> +static void drm_pagemap_dev_unhold_work(struct work_struct *work)
> +{
> +     struct llist_node *node = llist_del_all(&drm_pagemap_unhold_list);
> +     struct drm_pagemap_dev_hold *dev_hold, *next;
> +
> +     /*
> +      * Deferred release of drm_pagemap provider device and module.
> +      * THIS module is kept alive during the release by the
> +      * flush_work() in the drm_pagemap_exit() function.
> +      */
> +     llist_for_each_entry_safe(dev_hold, next, node, link) {
> +             struct drm_device *drm = dev_hold->drm;
> +             struct module *module = drm->driver->fops->owner;
> +
> +             drm_dbg(drm, "Releasing reference on provider device and 
> module.\n");
> +             drm_dev_put(drm);
> +             module_put(module);
> +             kfree(dev_hold);
> +     }
> +}
> +
> +static struct drm_pagemap_dev_hold *
> +drm_pagemap_dev_hold(struct drm_pagemap *dpagemap)
> +{
> +     struct drm_pagemap_dev_hold *dev_hold;
> +     struct drm_device *drm = dpagemap->drm;
> +
> +     dev_hold = kzalloc(sizeof(*dev_hold), GFP_KERNEL);
> +     if (!dev_hold)
> +             return ERR_PTR(-ENOMEM);
> +
> +     init_llist_node(&dev_hold->link);
> +     dev_hold->drm = drm;
> +     (void)try_module_get(drm->driver->fops->owner);
> +     drm_dev_get(drm);
> +
> +     return dev_hold;
>  }
>  
>  /**
>   * drm_pagemap_create() - Create a struct drm_pagemap.
> - * @dev: Pointer to a struct device providing the device-private memory.
> + * @drm: Pointer to a struct drm_device providing the device-private memory.
>   * @pagemap: Pointer to a pre-setup struct dev_pagemap providing the struct 
> pages.
>   * @ops: Pointer to the struct drm_pagemap_ops.
>   *
> @@ -563,20 +640,28 @@ static void drm_pagemap_release(struct kref *ref)
>   * Error pointer on error.
>   */
>  struct drm_pagemap *
> -drm_pagemap_create(struct device *dev,
> +drm_pagemap_create(struct drm_device *drm,
>                  struct dev_pagemap *pagemap,
>                  const struct drm_pagemap_ops *ops)
>  {
>       struct drm_pagemap *dpagemap = kzalloc(sizeof(*dpagemap), GFP_KERNEL);
> +     struct drm_pagemap_dev_hold *dev_hold;
>  
>       if (!dpagemap)
>               return ERR_PTR(-ENOMEM);
>  
>       kref_init(&dpagemap->ref);
> -     dpagemap->dev = dev;
> +     dpagemap->drm = drm;
>       dpagemap->ops = ops;
>       dpagemap->pagemap = pagemap;
>  
> +     dev_hold = drm_pagemap_dev_hold(dpagemap);
> +     if (IS_ERR(dev_hold)) {
> +             kfree(dpagemap);
> +             return ERR_CAST(dev_hold);
> +     }
> +     dpagemap->dev_hold = dev_hold;
> +
>       return dpagemap;
>  }
>  EXPORT_SYMBOL(drm_pagemap_create);
> @@ -937,3 +1022,11 @@ int drm_pagemap_populate_mm(struct drm_pagemap 
> *dpagemap,
>       return err;
>  }
>  EXPORT_SYMBOL(drm_pagemap_populate_mm);
> +
> +static void drm_pagemap_exit(void)
> +{
> +     flush_work(&drm_pagemap_work);
> +     if (WARN_ON(!llist_empty(&drm_pagemap_unhold_list)))
> +             disable_work_sync(&drm_pagemap_work);
> +}
> +module_exit(drm_pagemap_exit);
> diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
> index a3f97cf9c254..aab939fbcf80 100644
> --- a/drivers/gpu/drm/xe/xe_svm.c
> +++ b/drivers/gpu/drm/xe/xe_svm.c
> @@ -1436,7 +1436,7 @@ xe_drm_pagemap_device_map(struct drm_pagemap *dpagemap,
>                         unsigned int order,
>                         enum dma_data_direction dir)
>  {
> -     struct device *pgmap_dev = dpagemap->dev;
> +     struct device *pgmap_dev = dpagemap->drm->dev;
>       enum drm_interconnect_protocol prot;
>       dma_addr_t addr;
>  
> @@ -1456,6 +1456,14 @@ static const struct drm_pagemap_ops xe_drm_pagemap_ops 
> = {
>       .populate_mm = xe_drm_pagemap_populate_mm,
>  };
>  
> +static void xe_devm_release(void *data)
> +{
> +     struct xe_vram_region *vr = data;
> +
> +     drm_pagemap_put(vr->dpagemap);
> +     vr->dpagemap = NULL;
> +}
> +
>  /**
>   * xe_devm_add: Remap and provide memmap backing for device memory
>   * @tile: tile that the memory region belongs to
> @@ -1481,7 +1489,7 @@ int xe_devm_add(struct xe_tile *tile, struct 
> xe_vram_region *vr)
>               return ret;
>       }
>  
> -     vr->dpagemap = drm_pagemap_create(dev, &vr->pagemap,
> +     vr->dpagemap = drm_pagemap_create(&xe->drm, &vr->pagemap,
>                                         &xe_drm_pagemap_ops);
>       if (IS_ERR(vr->dpagemap)) {
>               drm_err(&xe->drm, "Failed to create drm_pagemap tile %d memory: 
> %pe\n",
> @@ -1489,6 +1497,9 @@ int xe_devm_add(struct xe_tile *tile, struct 
> xe_vram_region *vr)
>               ret = PTR_ERR(vr->dpagemap);
>               goto out_no_dpagemap;
>       }
> +     ret = devm_add_action_or_reset(dev, xe_devm_release, vr);
> +     if (ret)
> +             goto out_no_dpagemap;
>  
>       vr->pagemap.type = MEMORY_DEVICE_PRIVATE;
>       vr->pagemap.range.start = res->start;
> diff --git a/include/drm/drm_pagemap.h b/include/drm/drm_pagemap.h
> index 2c7de928865b..5cfe54331ba7 100644
> --- a/include/drm/drm_pagemap.h
> +++ b/include/drm/drm_pagemap.h
> @@ -9,6 +9,7 @@
>  #define NR_PAGES(order) (1U << (order))
>  
>  struct drm_pagemap;
> +struct drm_pagemap_dev_hold;
>  struct drm_pagemap_zdd;
>  struct device;
>  
> @@ -130,14 +131,17 @@ struct drm_pagemap_ops {
>   * used for device p2p handshaking.
>   * @ops: The struct drm_pagemap_ops.
>   * @ref: Reference count.
> - * @dev: The struct drevice owning the device-private memory.
> + * @drm: The struct drm device owning the device-private memory.
>   * @pagemap: Pointer to the underlying dev_pagemap.
> + * @dev_hold: Pointer to a struct drm_pagemap_dev_hold for
> + * device referencing.
>   */
>  struct drm_pagemap {
>       const struct drm_pagemap_ops *ops;
>       struct kref ref;
> -     struct device *dev;
> +     struct drm_device *drm;
>       struct dev_pagemap *pagemap;
> +     struct drm_pagemap_dev_hold *dev_hold;
>  };
>  
>  struct drm_pagemap_devmem;
> @@ -206,7 +210,7 @@ struct drm_pagemap_devmem_ops {
>                          unsigned long npages);
>  };
>  
> -struct drm_pagemap *drm_pagemap_create(struct device *dev,
> +struct drm_pagemap *drm_pagemap_create(struct drm_device *drm,
>                                      struct dev_pagemap *pagemap,
>                                      const struct drm_pagemap_ops *ops);
>  
> -- 
> 2.51.1
> 

Reply via email to