If a device holds a reference on a foreign device's drm_pagemap,
and a device unbind is executed on the foreign device,
then typically that foreign device would evict its device-private
pages and then continue its device-managed cleanup, eventually
releasing its drm device and possibly allowing for module unload.
However, since we're still holding a reference on a drm_pagemap,
when that reference is released and the provider module has already
been unloaded, we'd execute out of undefined memory.

Therefore keep a reference on the provider device and module until
the last drm_pagemap reference is gone.

Note that in theory, the drm_gpusvm_helper module may be unloaded
as soon as the final module_put() of the provider driver module is
executed, so we need to add a module_exit() function that waits
for the work item executing the module_put() to complete.

v2:
- Better commit message (Matt Brost)

Signed-off-by: Thomas Hellström <[email protected]>
---
 drivers/gpu/drm/drm_pagemap.c | 101 ++++++++++++++++++++++++++++++++--
 drivers/gpu/drm/xe/xe_svm.c   |  15 ++++-
 include/drm/drm_pagemap.h     |  10 +++-
 3 files changed, 117 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/drm_pagemap.c b/drivers/gpu/drm/drm_pagemap.c
index 173b3ecb07d5..fb18a80d6a1c 100644
--- a/drivers/gpu/drm/drm_pagemap.c
+++ b/drivers/gpu/drm/drm_pagemap.c
@@ -8,6 +8,7 @@
 #include <linux/pagemap.h>
 #include <drm/drm_drv.h>
 #include <drm/drm_pagemap.h>
+#include <drm/drm_print.h>
 
 /**
  * DOC: Overview
@@ -544,16 +545,92 @@ static int drm_pagemap_migrate_populate_ram_pfn(struct 
vm_area_struct *vas,
        return -ENOMEM;
 }
 
+static void drm_pagemap_dev_unhold_work(struct work_struct *work);
+static LLIST_HEAD(drm_pagemap_unhold_list);
+static DECLARE_WORK(drm_pagemap_work, drm_pagemap_dev_unhold_work);
+
+/**
+ * struct drm_pagemap_dev_hold - Struct to aid in drm_device release.
+ * @link: Link into drm_pagemap_unhold_list for deferred reference releases.
+ * @drm: drm device to put.
+ *
+ * When a struct drm_pagemap is released, we also need to release the
+ * reference it holds on the drm device. However, typically that needs
+ * to be done separately from a system-wide workqueue.
+ * Each time a struct drm_pagemap is initialized
+ * (or re-initialized if cached) therefore allocate a separate
+ * drm_pagemap_dev_hold item, from which we put the drm device and
+ * associated module.
+ */
+struct drm_pagemap_dev_hold {
+       struct llist_node link;
+       struct drm_device *drm;
+};
+
 static void drm_pagemap_release(struct kref *ref)
 {
        struct drm_pagemap *dpagemap = container_of(ref, typeof(*dpagemap), 
ref);
-
+       struct drm_pagemap_dev_hold *dev_hold = dpagemap->dev_hold;
+
+       /*
+        * We know the pagemap provider is alive at this point, since
+        * the struct drm_pagemap_dev_hold holds a reference to the
+        * pagemap provider drm_device and its module.
+        */
+       dpagemap->dev_hold = NULL;
        kfree(dpagemap);
+       llist_add(&dev_hold->link, &drm_pagemap_unhold_list);
+       schedule_work(&drm_pagemap_work);
+       /*
+        * Here, either the provider device is still alive, since if called from
+        * page_free(), the caller is holding a reference on the dev_pagemap,
+        * or if called from drm_pagemap_put(), the direct caller is still 
alive.
+        * This ensures we can't race with THIS module unload.
+        */
+}
+
+static void drm_pagemap_dev_unhold_work(struct work_struct *work)
+{
+       struct llist_node *node = llist_del_all(&drm_pagemap_unhold_list);
+       struct drm_pagemap_dev_hold *dev_hold, *next;
+
+       /*
+        * Deferred release of drm_pagemap provider device and module.
+        * THIS module is kept alive during the release by the
+        * flush_work() in the drm_pagemap_exit() function.
+        */
+       llist_for_each_entry_safe(dev_hold, next, node, link) {
+               struct drm_device *drm = dev_hold->drm;
+               struct module *module = drm->driver->fops->owner;
+
+               drm_dbg(drm, "Releasing reference on provider device and 
module.\n");
+               drm_dev_put(drm);
+               module_put(module);
+               kfree(dev_hold);
+       }
+}
+
+static struct drm_pagemap_dev_hold *
+drm_pagemap_dev_hold(struct drm_pagemap *dpagemap)
+{
+       struct drm_pagemap_dev_hold *dev_hold;
+       struct drm_device *drm = dpagemap->drm;
+
+       dev_hold = kzalloc(sizeof(*dev_hold), GFP_KERNEL);
+       if (!dev_hold)
+               return ERR_PTR(-ENOMEM);
+
+       init_llist_node(&dev_hold->link);
+       dev_hold->drm = drm;
+       (void)try_module_get(drm->driver->fops->owner);
+       drm_dev_get(drm);
+
+       return dev_hold;
 }
 
 /**
  * drm_pagemap_create() - Create a struct drm_pagemap.
- * @dev: Pointer to a struct device providing the device-private memory.
+ * @drm: Pointer to a struct drm_device providing the device-private memory.
  * @pagemap: Pointer to a pre-setup struct dev_pagemap providing the struct 
pages.
  * @ops: Pointer to the struct drm_pagemap_ops.
  *
@@ -563,20 +640,28 @@ static void drm_pagemap_release(struct kref *ref)
  * Error pointer on error.
  */
 struct drm_pagemap *
-drm_pagemap_create(struct device *dev,
+drm_pagemap_create(struct drm_device *drm,
                   struct dev_pagemap *pagemap,
                   const struct drm_pagemap_ops *ops)
 {
        struct drm_pagemap *dpagemap = kzalloc(sizeof(*dpagemap), GFP_KERNEL);
+       struct drm_pagemap_dev_hold *dev_hold;
 
        if (!dpagemap)
                return ERR_PTR(-ENOMEM);
 
        kref_init(&dpagemap->ref);
-       dpagemap->dev = dev;
+       dpagemap->drm = drm;
        dpagemap->ops = ops;
        dpagemap->pagemap = pagemap;
 
+       dev_hold = drm_pagemap_dev_hold(dpagemap);
+       if (IS_ERR(dev_hold)) {
+               kfree(dpagemap);
+               return ERR_CAST(dev_hold);
+       }
+       dpagemap->dev_hold = dev_hold;
+
        return dpagemap;
 }
 EXPORT_SYMBOL(drm_pagemap_create);
@@ -937,3 +1022,11 @@ int drm_pagemap_populate_mm(struct drm_pagemap *dpagemap,
        return err;
 }
 EXPORT_SYMBOL(drm_pagemap_populate_mm);
+
+static void drm_pagemap_exit(void)
+{
+       flush_work(&drm_pagemap_work);
+       if (WARN_ON(!llist_empty(&drm_pagemap_unhold_list)))
+               disable_work_sync(&drm_pagemap_work);
+}
+module_exit(drm_pagemap_exit);
diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
index a3f97cf9c254..aab939fbcf80 100644
--- a/drivers/gpu/drm/xe/xe_svm.c
+++ b/drivers/gpu/drm/xe/xe_svm.c
@@ -1436,7 +1436,7 @@ xe_drm_pagemap_device_map(struct drm_pagemap *dpagemap,
                          unsigned int order,
                          enum dma_data_direction dir)
 {
-       struct device *pgmap_dev = dpagemap->dev;
+       struct device *pgmap_dev = dpagemap->drm->dev;
        enum drm_interconnect_protocol prot;
        dma_addr_t addr;
 
@@ -1456,6 +1456,14 @@ static const struct drm_pagemap_ops xe_drm_pagemap_ops = 
{
        .populate_mm = xe_drm_pagemap_populate_mm,
 };
 
+static void xe_devm_release(void *data)
+{
+       struct xe_vram_region *vr = data;
+
+       drm_pagemap_put(vr->dpagemap);
+       vr->dpagemap = NULL;
+}
+
 /**
  * xe_devm_add: Remap and provide memmap backing for device memory
  * @tile: tile that the memory region belongs to
@@ -1481,7 +1489,7 @@ int xe_devm_add(struct xe_tile *tile, struct 
xe_vram_region *vr)
                return ret;
        }
 
-       vr->dpagemap = drm_pagemap_create(dev, &vr->pagemap,
+       vr->dpagemap = drm_pagemap_create(&xe->drm, &vr->pagemap,
                                          &xe_drm_pagemap_ops);
        if (IS_ERR(vr->dpagemap)) {
                drm_err(&xe->drm, "Failed to create drm_pagemap tile %d memory: 
%pe\n",
@@ -1489,6 +1497,9 @@ int xe_devm_add(struct xe_tile *tile, struct 
xe_vram_region *vr)
                ret = PTR_ERR(vr->dpagemap);
                goto out_no_dpagemap;
        }
+       ret = devm_add_action_or_reset(dev, xe_devm_release, vr);
+       if (ret)
+               goto out_no_dpagemap;
 
        vr->pagemap.type = MEMORY_DEVICE_PRIVATE;
        vr->pagemap.range.start = res->start;
diff --git a/include/drm/drm_pagemap.h b/include/drm/drm_pagemap.h
index 2c7de928865b..5cfe54331ba7 100644
--- a/include/drm/drm_pagemap.h
+++ b/include/drm/drm_pagemap.h
@@ -9,6 +9,7 @@
 #define NR_PAGES(order) (1U << (order))
 
 struct drm_pagemap;
+struct drm_pagemap_dev_hold;
 struct drm_pagemap_zdd;
 struct device;
 
@@ -130,14 +131,17 @@ struct drm_pagemap_ops {
  * used for device p2p handshaking.
  * @ops: The struct drm_pagemap_ops.
  * @ref: Reference count.
- * @dev: The struct drevice owning the device-private memory.
+ * @drm: The struct drm device owning the device-private memory.
  * @pagemap: Pointer to the underlying dev_pagemap.
+ * @dev_hold: Pointer to a struct drm_pagemap_dev_hold for
+ * device referencing.
  */
 struct drm_pagemap {
        const struct drm_pagemap_ops *ops;
        struct kref ref;
-       struct device *dev;
+       struct drm_device *drm;
        struct dev_pagemap *pagemap;
+       struct drm_pagemap_dev_hold *dev_hold;
 };
 
 struct drm_pagemap_devmem;
@@ -206,7 +210,7 @@ struct drm_pagemap_devmem_ops {
                           unsigned long npages);
 };
 
-struct drm_pagemap *drm_pagemap_create(struct device *dev,
+struct drm_pagemap *drm_pagemap_create(struct drm_device *drm,
                                       struct dev_pagemap *pagemap,
                                       const struct drm_pagemap_ops *ops);
 
-- 
2.51.1

Reply via email to