On Tue, Nov 11, 2025 at 05:44:06PM +0100, Thomas Hellström wrote:
> Support migration over interconnect when migrating from
> device-private pages with the same dev_pagemap owner.
>
> Since we now also collect device-private pages to migrate,
> also abort migration if the range to migrate is already
> fully populated with pages from the desired pagemap.
>
> Finally return -EBUSY from drm_pagemap_populate_mm()
> if the migration can't be completed without first migrating all
> pages in the range to system. It is expected that the caller
> will perform that before retrying the call to
> drm_pagemap_populate_mm().
>
> Assume for now that the drm_pagemap implementation is *not*
> capable of migrating data within the pagemap itself. This
> restriction will be configurable in upcoming patches.
>
> Signed-off-by: Thomas Hellström <[email protected]>
> ---
> drivers/gpu/drm/drm_pagemap.c | 177 +++++++++++++++++++++++++---------
> drivers/gpu/drm/xe/xe_svm.c | 20 ++--
> 2 files changed, 143 insertions(+), 54 deletions(-)
>
> diff --git a/drivers/gpu/drm/drm_pagemap.c b/drivers/gpu/drm/drm_pagemap.c
> index 1477a2057a15..e87676313ff9 100644
> --- a/drivers/gpu/drm/drm_pagemap.c
> +++ b/drivers/gpu/drm/drm_pagemap.c
> @@ -210,6 +210,7 @@ static void drm_pagemap_get_devmem_page(struct page *page,
> /**
> * drm_pagemap_migrate_map_pages() - Map migration pages for GPU SVM migration
> * @dev: The device for which the pages are being mapped
> + * @local_dpagemap: The drm_pagemap pointer of the local drm_pagemap.
> * @pagemap_addr: Array to store DMA information corresponding to mapped pages
> * @migrate_pfn: Array of migrate page frame numbers to map
> * @npages: Number of pages to map
> @@ -223,12 +224,14 @@ static void drm_pagemap_get_devmem_page(struct page *page,
> * Returns: 0 on success, -EFAULT if an error occurs during mapping.
> */
> static int drm_pagemap_migrate_map_pages(struct device *dev,
> + struct drm_pagemap *local_dpagemap,
> struct drm_pagemap_addr *pagemap_addr,
> unsigned long *migrate_pfn,
> unsigned long npages,
> enum dma_data_direction dir)
> {
> unsigned long i;
> + unsigned long num_peer_pages = 0;
>
> for (i = 0; i < npages;) {
> struct page *page = migrate_pfn_to_page(migrate_pfn[i]);
> @@ -239,31 +242,48 @@ static int drm_pagemap_migrate_map_pages(struct device *dev,
> if (!page)
> goto next;
>
> - if (WARN_ON_ONCE(is_zone_device_page(page)))
> - return -EFAULT;
> -
> folio = page_folio(page);
> order = folio_order(folio);
>
> - dma_addr = dma_map_page(dev, page, 0, page_size(page), dir);
> - if (dma_mapping_error(dev, dma_addr))
> - return -EFAULT;
> + if (is_zone_device_page(page)) {
> + struct drm_pagemap_zdd *zdd = page->zone_device_data;
> + struct drm_pagemap *dpagemap = zdd->dpagemap;
> + struct drm_pagemap_addr addr;
> +
> + if (dpagemap == local_dpagemap)
> + goto next;
>
> - pagemap_addr[i] =
> - drm_pagemap_addr_encode(dma_addr,
> - DRM_INTERCONNECT_SYSTEM,
> - order, dir);
> + num_peer_pages += NR_PAGES(order);
> + addr = dpagemap->ops->device_map(dpagemap, dev, page, order, dir);
> + if (dma_mapping_error(dev, addr.addr))
> + return -EFAULT;
> + } else {
> + dma_addr = dma_map_page(dev, page, 0, page_size(page), dir);
> + if (dma_mapping_error(dev, dma_addr))
> + return -EFAULT;
> +
> + pagemap_addr[i] =
> + drm_pagemap_addr_encode(dma_addr,
> + DRM_INTERCONNECT_SYSTEM,
> + order, dir);
> + }
>
> next:
> i += NR_PAGES(order);
> }
>
> + if (num_peer_pages)
> + drm_dbg(local_dpagemap->drm, "Migrating %lu peer pages over interconnect.\n",
> + num_peer_pages);
> +
> return 0;
> }
>
> /**
> * drm_pagemap_migrate_unmap_pages() - Unmap pages previously mapped for GPU SVM migration
> * @dev: The device for which the pages were mapped
> + * @migrate_pfn: Array of migrate pfns set up for the mapped pages. Used to
> + * determine the drm_pagemap of a peer device private page.
> * @pagemap_addr: Array of DMA information corresponding to mapped pages
> * @npages: Number of pages to unmap
> * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> @@ -274,16 +294,27 @@ static int drm_pagemap_migrate_map_pages(struct device *dev,
> */
> static void drm_pagemap_migrate_unmap_pages(struct device *dev,
> struct drm_pagemap_addr *pagemap_addr,
> + unsigned long *migrate_pfn,
> unsigned long npages,
> enum dma_data_direction dir)
> {
> unsigned long i;
>
> for (i = 0; i < npages;) {
> - if (!pagemap_addr[i].addr || dma_mapping_error(dev, pagemap_addr[i].addr))
> + struct page *page = migrate_pfn_to_page(migrate_pfn[i]);
> +
> + if (!page || !pagemap_addr[i].addr || dma_mapping_error(dev, pagemap_addr[i].addr))
> goto next;
>
> - dma_unmap_page(dev, pagemap_addr[i].addr, PAGE_SIZE << pagemap_addr[i].order, dir);
> + if (is_zone_device_page(page)) {
> + struct drm_pagemap_zdd *zdd = page->zone_device_data;
> + struct drm_pagemap *dpagemap = zdd->dpagemap;
> +
> + dpagemap->ops->device_unmap(dpagemap, dev, pagemap_addr[i]);
> + } else {
> + dma_unmap_page(dev, pagemap_addr[i].addr,
> + PAGE_SIZE << pagemap_addr[i].order, dir);
> + }
>
> next:
> i += NR_PAGES(pagemap_addr[i].order);
> @@ -308,6 +339,7 @@ npages_in_range(unsigned long start, unsigned long end)
> * @timeslice_ms: The time requested for the migrated pagemap pages to
> * be present in @mm before being allowed to be migrated back.
> * @pgmap_owner: Not used currently, since only system memory is considered.
> + * @mflags: Flags governing the migration.
> *
> * This function migrates the specified virtual address range to device memory.
> * It performs the necessary setup and invokes the driver-specific operations for
> @@ -333,13 +365,18 @@ int drm_pagemap_migrate_to_devmem(struct drm_pagemap_devmem *devmem_allocation,
> .start = start,
> .end = end,
> .pgmap_owner = pgmap_owner,
> - .flags = MIGRATE_VMA_SELECT_SYSTEM,
> + .flags = MIGRATE_VMA_SELECT_SYSTEM |
> + MIGRATE_VMA_SELECT_DEVICE_PRIVATE |
> + MIGRATE_VMA_SELECT_DEVICE_COHERENT,
> };
> unsigned long i, npages = npages_in_range(start, end);
> + unsigned long own_pages = 0, migrated_pages = 0;
> struct vm_area_struct *vas;
> struct drm_pagemap_zdd *zdd = NULL;
> struct page **pages;
> struct drm_pagemap_addr *pagemap_addr;
> + struct drm_pagemap *dpagemap = devmem_allocation->dpagemap;
> + struct dev_pagemap *pagemap = dpagemap->pagemap;
> void *buf;
> int err;
>
> @@ -374,11 +411,13 @@ int drm_pagemap_migrate_to_devmem(struct drm_pagemap_devmem *devmem_allocation,
> pagemap_addr = buf + (2 * sizeof(*migrate.src) * npages);
> pages = buf + (2 * sizeof(*migrate.src) + sizeof(*pagemap_addr)) * npages;
>
> - zdd = drm_pagemap_zdd_alloc(devmem_allocation->dpagemap, pgmap_owner);
> + zdd = drm_pagemap_zdd_alloc(dpagemap, pgmap_owner);
> if (!zdd) {
> err = -ENOMEM;
> - goto err_free;
> + kvfree(buf);
> + goto err_out;
> }
> + zdd->devmem_allocation = devmem_allocation; /* Owns ref */
>
> migrate.vma = vas;
> migrate.src = buf;
> @@ -389,54 +428,108 @@ int drm_pagemap_migrate_to_devmem(struct drm_pagemap_devmem *devmem_allocation,
> goto err_free;
>
> if (!migrate.cpages) {
> - err = -EFAULT;
> + /* No pages to migrate. Raced or unknown device pages. */
> + err = -EBUSY;
> goto err_free;
> }
>
> if (migrate.cpages != npages) {
> + /*
> + * Some pages to migrate. But we want to migrate all or
> + * nothing. Raced or unknown device pages.
> + */
> err = -EBUSY;
> - goto err_finalize;
> + goto err_aborted_migration;
> + }
> +
> + /* Count device-private pages to migrate */
> + for (i = 0; i < npages; ++i) {
> + struct page *src_page = migrate_pfn_to_page(migrate.src[i]);
> +
> + if (src_page && is_zone_device_page(src_page)) {
> + if (page_pgmap(src_page) == pagemap)
> + own_pages++;
> + }
> + }
I understand what this is doing: aborting the migration if the pages are
already in the correct location. Conceptually I believe this is correct, but
the implementation likely is not. The pages counted here were collected via
migrate_vma_setup(), which issues an MMU notifier invalidation and installs
migration PTEs, both of which are expensive operations. For example, if
another GPU already has the correct mappings and the pages are already in
the right place, migrate_vma_setup() will still invalidate the range and
tear down the GPU<->GPU mapping. Installing the migration PTEs is also
CPU-intensive.

I think the own_pages check should instead be built on top of
hmm_range_fault() without HMM_PFN_REQ_FAULT set, which is fast and does not
trigger an MMU notifier invalidation.
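
Something like the rough sketch below is what I have in mind. It is untested
and only illustrative: the helper name and parameters are made up, it assumes
it lives in drm_pagemap.c (so it can reuse npages_in_range()) with
linux/hmm.h available, and it assumes the caller supplies an
mmu_interval_notifier covering the range (e.g. the gpusvm notifier) and
handles the usual -EBUSY retry on notifier invalidation.

/*
 * Hypothetical helper (not part of this patch): count how many pages in
 * [start, end) are already backed by @pagemap, using a read-only
 * hmm_range_fault() snapshot. No HMM_PFN_REQ_FAULT is set, so nothing is
 * faulted in and no MMU notifier invalidation is triggered.
 */
static long count_own_pages(struct mm_struct *mm,
			    struct mmu_interval_notifier *notifier,
			    struct dev_pagemap *pagemap,
			    void *pgmap_owner,
			    unsigned long start, unsigned long end)
{
	unsigned long i, npages = npages_in_range(start, end);
	struct hmm_range range = {
		.notifier = notifier,
		.start = start,
		.end = end,
		.default_flags = 0,	/* Snapshot only, no faulting. */
		.dev_private_owner = pgmap_owner,
	};
	long own_pages = 0;
	int err;

	range.hmm_pfns = kvcalloc(npages, sizeof(*range.hmm_pfns), GFP_KERNEL);
	if (!range.hmm_pfns)
		return -ENOMEM;

	range.notifier_seq = mmu_interval_read_begin(notifier);
	mmap_read_lock(mm);
	err = hmm_range_fault(&range);
	mmap_read_unlock(mm);
	if (err) {
		/* -EBUSY here means the notifier fired; let the caller retry. */
		own_pages = err;
		goto out;
	}

	for (i = 0; i < npages; i++) {
		struct page *page;

		if (!(range.hmm_pfns[i] & HMM_PFN_VALID))
			continue;

		page = hmm_pfn_to_page(range.hmm_pfns[i]);
		if (is_zone_device_page(page) && page_pgmap(page) == pagemap)
			own_pages++;
	}
out:
	kvfree(range.hmm_pfns);
	return own_pages;
}

The result is only a hint, since the snapshot can race with concurrent
migrations, but that should be fine here: if own_pages == npages you can skip
migrate_vma_setup() entirely, and any race gets caught later anyway once the
migration PTEs are actually installed.
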
Matt
> +
> + drm_dbg(dpagemap->drm, "Total pages %lu; Own pages: %lu.\n",
> + npages, own_pages);
> + if (own_pages == npages) {
> + err = 0;
> + drm_dbg(dpagemap->drm, "Migration wasn't necessary.\n");
> + goto err_aborted_migration;
> + } else if (own_pages) {
> + err = -EBUSY;
> + drm_dbg(dpagemap->drm, "Migration aborted due to
> fragmentation.\n");
> + goto err_aborted_migration;
> }
>
> err = ops->populate_devmem_pfn(devmem_allocation, npages, migrate.dst);
> if (err)
> goto err_finalize;
>
> - err = drm_pagemap_migrate_map_pages(devmem_allocation->dev, pagemap_addr,
> + err = drm_pagemap_migrate_map_pages(devmem_allocation->dev,
> + devmem_allocation->dpagemap, pagemap_addr,
> migrate.src, npages, DMA_TO_DEVICE);
>
> - if (err)
> + if (err) {
> + drm_pagemap_migrate_unmap_pages(devmem_allocation->dev, pagemap_addr,
> + migrate.src, npages, DMA_TO_DEVICE);
> +
> goto err_finalize;
> + }
>
> + own_pages = 0;
> for (i = 0; i < npages; ++i) {
> struct page *page = pfn_to_page(migrate.dst[i]);
> + struct page *src_page = migrate_pfn_to_page(migrate.src[i]);
>
> + if (unlikely(src_page && is_zone_device_page(src_page) &&
> + page_pgmap(src_page) == pagemap)) {
> + migrate.dst[i] = 0;
> + pages[i] = NULL;
> + own_pages++;
> + continue;
> + }
> pages[i] = page;
> migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> drm_pagemap_get_devmem_page(page, zdd);
> }
> + drm_WARN_ON(dpagemap->drm, !!own_pages);
>
> err = ops->copy_to_devmem(pages, pagemap_addr, npages);
> + drm_pagemap_migrate_unmap_pages(devmem_allocation->dev, pagemap_addr,
> + migrate.src, npages, DMA_TO_DEVICE);
> if (err)
> goto err_finalize;
>
> /* Upon success bind devmem allocation to range and zdd */
> devmem_allocation->timeslice_expiration = get_jiffies_64() +
> msecs_to_jiffies(timeslice_ms);
> - zdd->devmem_allocation = devmem_allocation; /* Owns ref */
>
> err_finalize:
> if (err)
> drm_pagemap_migration_unlock_put_pages(npages, migrate.dst);
> +err_aborted_migration:
> migrate_vma_pages(&migrate);
> +
> + for (i = 0; i < npages; ++i)
> + if (migrate.src[i] & MIGRATE_PFN_MIGRATE)
> + migrated_pages++;
> +
> + if (!err && migrated_pages < npages - own_pages) {
> + drm_dbg(dpagemap->drm, "Raced while finalizing migration.\n");
> + err = -EBUSY;
> + }
> +
> migrate_vma_finalize(&migrate);
> - drm_pagemap_migrate_unmap_pages(devmem_allocation->dev, pagemap_addr, npages,
> - DMA_TO_DEVICE);
> err_free:
> - if (zdd)
> - drm_pagemap_zdd_put(zdd);
> + drm_pagemap_zdd_put(zdd);
> kvfree(buf);
> + return err;
> +
> err_out:
> + devmem_allocation->ops->devmem_release(devmem_allocation);
> return err;
> }
> EXPORT_SYMBOL_GPL(drm_pagemap_migrate_to_devmem);
> @@ -747,7 +840,8 @@ int drm_pagemap_evict_to_ram(struct drm_pagemap_devmem *devmem_allocation)
> if (err || !mpages)
> goto err_finalize;
>
> - err = drm_pagemap_migrate_map_pages(devmem_allocation->dev, pagemap_addr,
> + err = drm_pagemap_migrate_map_pages(devmem_allocation->dev,
> + devmem_allocation->dpagemap, pagemap_addr,
> dst, npages, DMA_FROM_DEVICE);
> if (err)
> goto err_finalize;
> @@ -764,7 +858,7 @@ int drm_pagemap_evict_to_ram(struct drm_pagemap_devmem *devmem_allocation)
> drm_pagemap_migration_unlock_put_pages(npages, dst);
> migrate_device_pages(src, dst, npages);
> migrate_device_finalize(src, dst, npages);
> - drm_pagemap_migrate_unmap_pages(devmem_allocation->dev, pagemap_addr, npages,
> + drm_pagemap_migrate_unmap_pages(devmem_allocation->dev, pagemap_addr, dst, npages,
> DMA_FROM_DEVICE);
> err_free:
> kvfree(buf);
> @@ -820,12 +914,10 @@ static int __drm_pagemap_migrate_to_ram(struct vm_area_struct *vas,
> void *buf;
> int i, err = 0;
>
> - if (page) {
> - zdd = page->zone_device_data;
> - if (time_before64(get_jiffies_64(),
> - zdd->devmem_allocation->timeslice_expiration))
> - return 0;
> - }
> + zdd = page->zone_device_data;
> + if (time_before64(get_jiffies_64(),
> + zdd->devmem_allocation->timeslice_expiration))
> + return 0;
>
> start = ALIGN_DOWN(fault_addr, size);
> end = ALIGN(fault_addr + 1, size);
> @@ -861,19 +953,6 @@ static int __drm_pagemap_migrate_to_ram(struct vm_area_struct *vas,
> if (!migrate.cpages)
> goto err_free;
>
> - if (!page) {
> - for (i = 0; i < npages; ++i) {
> - if (!(migrate.src[i] & MIGRATE_PFN_MIGRATE))
> - continue;
> -
> - page = migrate_pfn_to_page(migrate.src[i]);
> - break;
> - }
> -
> - if (!page)
> - goto err_finalize;
> - }
> - zdd = page->zone_device_data;
> ops = zdd->devmem_allocation->ops;
> dev = zdd->devmem_allocation->dev;
>
> @@ -883,7 +962,7 @@ static int __drm_pagemap_migrate_to_ram(struct vm_area_struct *vas,
> if (err)
> goto err_finalize;
>
> - err = drm_pagemap_migrate_map_pages(dev, pagemap_addr, migrate.dst, npages,
> + err = drm_pagemap_migrate_map_pages(dev, zdd->dpagemap, pagemap_addr, migrate.dst, npages,
> DMA_FROM_DEVICE);
> if (err)
> goto err_finalize;
> @@ -901,8 +980,8 @@ static int __drm_pagemap_migrate_to_ram(struct vm_area_struct *vas,
> migrate_vma_pages(&migrate);
> migrate_vma_finalize(&migrate);
> if (dev)
> - drm_pagemap_migrate_unmap_pages(dev, pagemap_addr, npages,
> - DMA_FROM_DEVICE);
> + drm_pagemap_migrate_unmap_pages(dev, pagemap_addr, migrate.dst,
> + npages, DMA_FROM_DEVICE);
> err_free:
> kvfree(buf);
> err_out:
> @@ -938,10 +1017,12 @@ static vm_fault_t drm_pagemap_migrate_to_ram(struct vm_fault *vmf)
> struct drm_pagemap_zdd *zdd = vmf->page->zone_device_data;
> int err;
>
> + drm_pagemap_zdd_get(zdd);
> err = __drm_pagemap_migrate_to_ram(vmf->vma,
> zdd->device_private_page_owner,
> vmf->page, vmf->address,
> zdd->devmem_allocation->size);
> + drm_pagemap_zdd_put(zdd);
>
> return err ? VM_FAULT_SIGBUS : 0;
> }
> diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
> index 0b39905c9312..56bb3896b89a 100644
> --- a/drivers/gpu/drm/xe/xe_svm.c
> +++ b/drivers/gpu/drm/xe/xe_svm.c
> @@ -1028,11 +1028,10 @@ static int xe_drm_pagemap_populate_mm(struct drm_pagemap *dpagemap,
>
> /* Ensure the device has a pm ref while there are device pages active. */
> xe_pm_runtime_get_noresume(xe);
> + /* Consumes the devmem allocation. */
> err = drm_pagemap_migrate_to_devmem(&bo->devmem_allocation, mm,
> start, end, timeslice_ms,
> xpagemap->pagemap.owner);
> - if (err)
> - xe_svm_devmem_release(&bo->devmem_allocation);
> xe_bo_unlock(bo);
> xe_bo_put(bo);
> }
> @@ -1546,6 +1545,7 @@ int xe_svm_alloc_vram(struct xe_svm_range *range, const struct drm_gpusvm_ctx *c
> struct drm_pagemap *dpagemap)
> {
> struct xe_device *xe = range_to_vm(&range->base)->xe;
> + int err, retries = 1;
>
> xe_assert(range_to_vm(&range->base)->xe,
> range->base.pages.flags.migrate_devmem);
> range_debug(range, "ALLOCATE VRAM");
> @@ -1554,10 +1554,18 @@ int xe_svm_alloc_vram(struct xe_svm_range *range, const struct drm_gpusvm_ctx *c
> drm_dbg(&xe->drm, "Request migration to device memory on
> \"%s\".\n",
> dpagemap->drm->unique);
>
> - return drm_pagemap_populate_mm(dpagemap, xe_svm_range_start(range),
> - xe_svm_range_end(range),
> - range->base.gpusvm->mm,
> - ctx->timeslice_ms);
> + do {
> + err = drm_pagemap_populate_mm(dpagemap, xe_svm_range_start(range),
> + xe_svm_range_end(range),
> + range->base.gpusvm->mm,
> + ctx->timeslice_ms);
> +
> + if (err == -EBUSY && retries)
> + drm_gpusvm_range_evict(range->base.gpusvm, &range->base);
> +
> + } while (err == -EBUSY && retries--);
> +
> + return err;
> }
>
> static struct drm_pagemap_addr
> --
> 2.51.1
>