On Wed, 2025-11-26 at 08:32 -0800, Matthew Brost wrote:
> On Tue, Nov 11, 2025 at 05:44:06PM +0100, Thomas Hellström wrote:
> > Support migration over interconnect when migrating from
> > device-private pages with the same dev_pagemap owner.
> >
> > Since we now also collect device-private pages to migrate,
> > also abort migration if the range to migrate is already
> > fully populated with pages from the desired pagemap.
> >
> > Finally return -EBUSY from drm_pagemap_populate_mm()
> > if the migration can't be completed without first migrating all
> > pages in the range to system. It is expected that the caller
> > will perform that before retrying the call to
> > drm_pagemap_populate_mm().
> >
> > Assume for now that the drm_pagemap implementation is *not*
> > capable of migrating data within the pagemap itself. This
> > restriction will be configurable in upcoming patches.
> >
> > Signed-off-by: Thomas Hellström <[email protected]>
> > ---
> > drivers/gpu/drm/drm_pagemap.c | 177 +++++++++++++++++++++++++-----
> > ----
> > drivers/gpu/drm/xe/xe_svm.c | 20 ++--
> > 2 files changed, 143 insertions(+), 54 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/drm_pagemap.c
> > b/drivers/gpu/drm/drm_pagemap.c
> > index 1477a2057a15..e87676313ff9 100644
> > --- a/drivers/gpu/drm/drm_pagemap.c
> > +++ b/drivers/gpu/drm/drm_pagemap.c
> > @@ -210,6 +210,7 @@ static void drm_pagemap_get_devmem_page(struct
> > page *page,
> > /**
> > * drm_pagemap_migrate_map_pages() - Map migration pages for GPU
> > SVM migration
> > * @dev: The device for which the pages are being mapped
> > + * @local_dpagemap: The drm_pagemap pointer of the local
> > drm_pagemap.
> > * @pagemap_addr: Array to store DMA information corresponding to
> > mapped pages
> > * @migrate_pfn: Array of migrate page frame numbers to map
> > * @npages: Number of pages to map
> > @@ -223,12 +224,14 @@ static void
> > drm_pagemap_get_devmem_page(struct page *page,
> > * Returns: 0 on success, -EFAULT if an error occurs during
> > mapping.
> > */
> > static int drm_pagemap_migrate_map_pages(struct device *dev,
> > + struct drm_pagemap
> > *local_dpagemap,
> > struct drm_pagemap_addr
> > *pagemap_addr,
> > unsigned long
> > *migrate_pfn,
> > unsigned long npages,
> > enum dma_data_direction
> > dir)
> > {
> > unsigned long i;
> > + unsigned long num_peer_pages = 0;
> >
> > for (i = 0; i < npages;) {
> > struct page *page =
> > migrate_pfn_to_page(migrate_pfn[i]);
> > @@ -239,31 +242,48 @@ static int
> > drm_pagemap_migrate_map_pages(struct device *dev,
> > if (!page)
> > goto next;
> >
> > - if (WARN_ON_ONCE(is_zone_device_page(page)))
> > - return -EFAULT;
> > -
> > folio = page_folio(page);
> > order = folio_order(folio);
> >
> > - dma_addr = dma_map_page(dev, page, 0,
> > page_size(page), dir);
> > - if (dma_mapping_error(dev, dma_addr))
> > - return -EFAULT;
> > + if (is_zone_device_page(page)) {
> > + struct drm_pagemap_zdd *zdd = page-
> > >zone_device_data;
> > + struct drm_pagemap *dpagemap = zdd-
> > >dpagemap;
> > + struct drm_pagemap_addr addr;
> > +
> > + if (dpagemap == local_dpagemap)
> > + goto next;
> >
> > - pagemap_addr[i] =
> > - drm_pagemap_addr_encode(dma_addr,
> > -
> > DRM_INTERCONNECT_SYSTEM,
> > - order, dir);
> > + num_peer_pages += NR_PAGES(order);
> > + addr = dpagemap->ops->device_map(dpagemap,
> > dev, page, order, dir);
> > + if (dma_mapping_error(dev, addr.addr))
> > + return -EFAULT;
> > + } else {
> > + dma_addr = dma_map_page(dev, page, 0,
> > page_size(page), dir);
> > + if (dma_mapping_error(dev, dma_addr))
> > + return -EFAULT;
> > +
> > + pagemap_addr[i] =
> > + drm_pagemap_addr_encode(dma_addr,
> > + DRM_INTERC
> > ONNECT_SYSTEM,
> > + order,
> > dir);
> > + }
> >
> > next:
> > i += NR_PAGES(order);
> > }
> >
> > + if (num_peer_pages)
> > + drm_dbg(local_dpagemap->drm, "Migrating %lu peer
> > pages over interconnect.\n",
> > + num_peer_pages);
> > +
> > return 0;
> > }
> >
> > /**
> > * drm_pagemap_migrate_unmap_pages() - Unmap pages previously
> > mapped for GPU SVM migration
> > * @dev: The device for which the pages were mapped
> > + * @migrate_pfn: Array of migrate pfns set up for the mapped
> > pages. Used to
> > + * determine the drm_pagemap of a peer device private page.
> > * @pagemap_addr: Array of DMA information corresponding to mapped
> > pages
> > * @npages: Number of pages to unmap
> > * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
> > @@ -274,16 +294,27 @@ static int
> > drm_pagemap_migrate_map_pages(struct device *dev,
> > */
> > static void drm_pagemap_migrate_unmap_pages(struct device *dev,
> > struct
> > drm_pagemap_addr *pagemap_addr,
> > + unsigned long
> > *migrate_pfn,
> > unsigned long npages,
> > enum
> > dma_data_direction dir)
> > {
> > unsigned long i;
> >
> > for (i = 0; i < npages;) {
> > - if (!pagemap_addr[i].addr ||
> > dma_mapping_error(dev, pagemap_addr[i].addr))
> > + struct page *page =
> > migrate_pfn_to_page(migrate_pfn[i]);
> > +
> > + if (!page || !pagemap_addr[i].addr ||
> > dma_mapping_error(dev, pagemap_addr[i].addr))
> > goto next;
> >
> > - dma_unmap_page(dev, pagemap_addr[i].addr,
> > PAGE_SIZE << pagemap_addr[i].order, dir);
> > + if (is_zone_device_page(page)) {
> > + struct drm_pagemap_zdd *zdd = page-
> > >zone_device_data;
> > + struct drm_pagemap *dpagemap = zdd-
> > >dpagemap;
> > +
> > + dpagemap->ops->device_unmap(dpagemap, dev,
> > pagemap_addr[i]);
> > + } else {
> > + dma_unmap_page(dev, pagemap_addr[i].addr,
> > + PAGE_SIZE <<
> > pagemap_addr[i].order, dir);
> > + }
> >
> > next:
> > i += NR_PAGES(pagemap_addr[i].order);
> > @@ -308,6 +339,7 @@ npages_in_range(unsigned long start, unsigned
> > long end)
> > * @timeslice_ms: The time requested for the migrated pagemap
> > pages to
> > * be present in @mm before being allowed to be migrated back.
> > * @pgmap_owner: Not used currently, since only system memory is
> > considered.
> > + * @mflags: Flags governing the migration.
> > *
> > * This function migrates the specified virtual address range to
> > device memory.
> > * It performs the necessary setup and invokes the driver-specific
> > operations for
> > @@ -333,13 +365,18 @@ int drm_pagemap_migrate_to_devmem(struct
> > drm_pagemap_devmem *devmem_allocation,
> > .start = start,
> > .end = end,
> > .pgmap_owner = pgmap_owner,
> > - .flags = MIGRATE_VMA_SELECT_SYSTEM,
> > + .flags = MIGRATE_VMA_SELECT_SYSTEM |
> > + MIGRATE_VMA_SELECT_DEVICE_PRIVATE |
> > + MIGRATE_VMA_SELECT_DEVICE_COHERENT,
> > };
> > unsigned long i, npages = npages_in_range(start, end);
> > + unsigned long own_pages = 0, migrated_pages = 0;
> > struct vm_area_struct *vas;
> > struct drm_pagemap_zdd *zdd = NULL;
> > struct page **pages;
> > struct drm_pagemap_addr *pagemap_addr;
> > + struct drm_pagemap *dpagemap = devmem_allocation-
> > >dpagemap;
> > + struct dev_pagemap *pagemap = dpagemap->pagemap;
> > void *buf;
> > int err;
> >
> > @@ -374,11 +411,13 @@ int drm_pagemap_migrate_to_devmem(struct
> > drm_pagemap_devmem *devmem_allocation,
> > pagemap_addr = buf + (2 * sizeof(*migrate.src) * npages);
> > pages = buf + (2 * sizeof(*migrate.src) +
> > sizeof(*pagemap_addr)) * npages;
> >
> > - zdd = drm_pagemap_zdd_alloc(devmem_allocation->dpagemap,
> > pgmap_owner);
> > + zdd = drm_pagemap_zdd_alloc(dpagemap, pgmap_owner);
> > if (!zdd) {
> > err = -ENOMEM;
> > - goto err_free;
> > + kvfree(buf);
> > + goto err_out;
> > }
> > + zdd->devmem_allocation = devmem_allocation; /* Owns
> > ref */
> >
> > migrate.vma = vas;
> > migrate.src = buf;
> > @@ -389,54 +428,108 @@ int drm_pagemap_migrate_to_devmem(struct
> > drm_pagemap_devmem *devmem_allocation,
> > goto err_free;
> >
> > if (!migrate.cpages) {
> > - err = -EFAULT;
> > + /* No pages to migrate. Raced or unknown device
> > pages. */
> > + err = -EBUSY;
> > goto err_free;
> > }
> >
> > if (migrate.cpages != npages) {
> > + /*
> > + * Some pages to migrate. But we want to migrate
> > all or
> > + * nothing. Raced or unknown device pages.
> > + */
> > err = -EBUSY;
> > - goto err_finalize;
> > + goto err_aborted_migration;
> > + }
> > +
> > + /* Count device-private pages to migrate */
> > + for (i = 0; i < npages; ++i) {
> > + struct page *src_page =
> > migrate_pfn_to_page(migrate.src[i]);
> > +
> > + if (src_page && is_zone_device_page(src_page)) {
> > + if (page_pgmap(src_page) == pagemap)
> > + own_pages++;
> > + }
> > + }
>
> I understand what this is doing—aborting the migration if the pages are
> already in the correct location. Conceptually, I believe this is correct,
> but implementation-wise it is likely not. The pages collected here are
> gathered via migrate_vma_setup(). This step issues an MMU notifier and
> installs migration PTEs, which are expensive operations. For example, if
> another GPU already has the correct mappings and the pages are in the
> correct location, migrate_vma_setup() will result in the range being
> invalidated and the GPU<->GPU mapping being removed. Installing
> migration PTEs is also CPU-intensive.
>
> I think the step to check own_pages should be built on top of
> hmm_range_fault without HMM_PFN_REQ_FAULT set, which is fast and will
> not issue an MMU notifier.
I fully agree, and what you say above is also what we outlined in a
previous discussion. I was, however, planning to do the hmm_range_fault()
check in drm_gpusvm as a follow-up, before calling into drm_pagemap to
migrate.
The reason is that the current drm_pagemap implementation doesn't
register any MMU notifier callbacks.
But I realize now that this check is pretty much required for any
efficiency with multiple GPUs, since otherwise merely checking that the
pages are in place would invalidate the other GPUs' mappings.
/Thomas
>
> Matt
>
> > +
> > + drm_dbg(dpagemap->drm, "Total pages %lu; Own pages:
> > %lu.\n",
> > + npages, own_pages);
> > + if (own_pages == npages) {
> > + err = 0;
> > + drm_dbg(dpagemap->drm, "Migration wasn't
> > necessary.\n");
> > + goto err_aborted_migration;
> > + } else if (own_pages) {
> > + err = -EBUSY;
> > + drm_dbg(dpagemap->drm, "Migration aborted due to
> > fragmentation.\n");
> > + goto err_aborted_migration;
> > }
> >
> > err = ops->populate_devmem_pfn(devmem_allocation, npages,
> > migrate.dst);
> > if (err)
> > goto err_finalize;
> >
> > - err = drm_pagemap_migrate_map_pages(devmem_allocation-
> > >dev, pagemap_addr,
> > + err = drm_pagemap_migrate_map_pages(devmem_allocation-
> > >dev,
> > + devmem_allocation-
> > >dpagemap, pagemap_addr,
> > migrate.src, npages,
> > DMA_TO_DEVICE);
> >
> > - if (err)
> > + if (err) {
> > + drm_pagemap_migrate_unmap_pages(devmem_allocation-
> > >dev, pagemap_addr,
> > + migrate.src,
> > npages, DMA_TO_DEVICE);
> > +
> > goto err_finalize;
> > + }
> >
> > + own_pages = 0;
> > for (i = 0; i < npages; ++i) {
> > struct page *page = pfn_to_page(migrate.dst[i]);
> > + struct page *src_page =
> > migrate_pfn_to_page(migrate.src[i]);
> >
> > + if (unlikely(src_page &&
> > is_zone_device_page(src_page) &&
> > + page_pgmap(src_page) == pagemap)) {
> > + migrate.dst[i] = 0;
> > + pages[i] = NULL;
> > + own_pages++;
> > + continue;
> > + }
> > pages[i] = page;
> > migrate.dst[i] = migrate_pfn(migrate.dst[i]);
> > drm_pagemap_get_devmem_page(page, zdd);
> > }
> > + drm_WARN_ON(dpagemap->drm, !!own_pages);
> >
> > err = ops->copy_to_devmem(pages, pagemap_addr, npages);
> > + drm_pagemap_migrate_unmap_pages(devmem_allocation->dev,
> > pagemap_addr,
> > + migrate.src, npages,
> > DMA_TO_DEVICE);
> > if (err)
> > goto err_finalize;
> >
> > /* Upon success bind devmem allocation to range and zdd */
> > devmem_allocation->timeslice_expiration = get_jiffies_64()
> > +
> > msecs_to_jiffies(timeslice_ms);
> > - zdd->devmem_allocation = devmem_allocation; /* Owns
> > ref */
> >
> > err_finalize:
> > if (err)
> > drm_pagemap_migration_unlock_put_pages(npages,
> > migrate.dst);
> > +err_aborted_migration:
> > migrate_vma_pages(&migrate);
> > +
> > + for (i = 0; i < npages; ++i)
> > + if (migrate.src[i] & MIGRATE_PFN_MIGRATE)
> > + migrated_pages++;
> > +
> > + if (!err && migrated_pages < npages - own_pages) {
> > + drm_dbg(dpagemap->drm, "Raced while finalizing
> > migration.\n");
> > + err = -EBUSY;
> > + }
> > +
> > migrate_vma_finalize(&migrate);
> > - drm_pagemap_migrate_unmap_pages(devmem_allocation->dev,
> > pagemap_addr, npages,
> > - DMA_TO_DEVICE);
> > err_free:
> > - if (zdd)
> > - drm_pagemap_zdd_put(zdd);
> > + drm_pagemap_zdd_put(zdd);
> > kvfree(buf);
> > + return err;
> > +
> > err_out:
> > + devmem_allocation->ops->devmem_release(devmem_allocation);
> > return err;
> > }
> > EXPORT_SYMBOL_GPL(drm_pagemap_migrate_to_devmem);
> > @@ -747,7 +840,8 @@ int drm_pagemap_evict_to_ram(struct
> > drm_pagemap_devmem *devmem_allocation)
> > if (err || !mpages)
> > goto err_finalize;
> >
> > - err = drm_pagemap_migrate_map_pages(devmem_allocation-
> > >dev, pagemap_addr,
> > + err = drm_pagemap_migrate_map_pages(devmem_allocation-
> > >dev,
> > + devmem_allocation-
> > >dpagemap, pagemap_addr,
> > dst, npages,
> > DMA_FROM_DEVICE);
> > if (err)
> > goto err_finalize;
> > @@ -764,7 +858,7 @@ int drm_pagemap_evict_to_ram(struct
> > drm_pagemap_devmem *devmem_allocation)
> > drm_pagemap_migration_unlock_put_pages(npages,
> > dst);
> > migrate_device_pages(src, dst, npages);
> > migrate_device_finalize(src, dst, npages);
> > - drm_pagemap_migrate_unmap_pages(devmem_allocation->dev,
> > pagemap_addr, npages,
> > + drm_pagemap_migrate_unmap_pages(devmem_allocation->dev,
> > pagemap_addr, dst, npages,
> > DMA_FROM_DEVICE);
> > err_free:
> > kvfree(buf);
> > @@ -820,12 +914,10 @@ static int
> > __drm_pagemap_migrate_to_ram(struct vm_area_struct *vas,
> > void *buf;
> > int i, err = 0;
> >
> > - if (page) {
> > - zdd = page->zone_device_data;
> > - if (time_before64(get_jiffies_64(),
> > - zdd->devmem_allocation-
> > >timeslice_expiration))
> > - return 0;
> > - }
> > + zdd = page->zone_device_data;
> > + if (time_before64(get_jiffies_64(),
> > + zdd->devmem_allocation-
> > >timeslice_expiration))
> > + return 0;
> >
> > start = ALIGN_DOWN(fault_addr, size);
> > end = ALIGN(fault_addr + 1, size);
> > @@ -861,19 +953,6 @@ static int __drm_pagemap_migrate_to_ram(struct
> > vm_area_struct *vas,
> > if (!migrate.cpages)
> > goto err_free;
> >
> > - if (!page) {
> > - for (i = 0; i < npages; ++i) {
> > - if (!(migrate.src[i] &
> > MIGRATE_PFN_MIGRATE))
> > - continue;
> > -
> > - page =
> > migrate_pfn_to_page(migrate.src[i]);
> > - break;
> > - }
> > -
> > - if (!page)
> > - goto err_finalize;
> > - }
> > - zdd = page->zone_device_data;
> > ops = zdd->devmem_allocation->ops;
> > dev = zdd->devmem_allocation->dev;
> >
> > @@ -883,7 +962,7 @@ static int __drm_pagemap_migrate_to_ram(struct
> > vm_area_struct *vas,
> > if (err)
> > goto err_finalize;
> >
> > - err = drm_pagemap_migrate_map_pages(dev, pagemap_addr,
> > migrate.dst, npages,
> > + err = drm_pagemap_migrate_map_pages(dev, zdd->dpagemap,
> > pagemap_addr, migrate.dst, npages,
> > DMA_FROM_DEVICE);
> > if (err)
> > goto err_finalize;
> > @@ -901,8 +980,8 @@ static int __drm_pagemap_migrate_to_ram(struct
> > vm_area_struct *vas,
> > migrate_vma_pages(&migrate);
> > migrate_vma_finalize(&migrate);
> > if (dev)
> > - drm_pagemap_migrate_unmap_pages(dev, pagemap_addr,
> > npages,
> > - DMA_FROM_DEVICE);
> > + drm_pagemap_migrate_unmap_pages(dev, pagemap_addr,
> > migrate.dst,
> > + npages,
> > DMA_FROM_DEVICE);
> > err_free:
> > kvfree(buf);
> > err_out:
> > @@ -938,10 +1017,12 @@ static vm_fault_t
> > drm_pagemap_migrate_to_ram(struct vm_fault *vmf)
> > struct drm_pagemap_zdd *zdd = vmf->page->zone_device_data;
> > int err;
> >
> > + drm_pagemap_zdd_get(zdd);
> > err = __drm_pagemap_migrate_to_ram(vmf->vma,
> > zdd-
> > >device_private_page_owner,
> > vmf->page, vmf-
> > >address,
> > zdd->devmem_allocation-
> > >size);
> > + drm_pagemap_zdd_put(zdd);
> >
> > return err ? VM_FAULT_SIGBUS : 0;
> > }
> > diff --git a/drivers/gpu/drm/xe/xe_svm.c
> > b/drivers/gpu/drm/xe/xe_svm.c
> > index 0b39905c9312..56bb3896b89a 100644
> > --- a/drivers/gpu/drm/xe/xe_svm.c
> > +++ b/drivers/gpu/drm/xe/xe_svm.c
> > @@ -1028,11 +1028,10 @@ static int
> > xe_drm_pagemap_populate_mm(struct drm_pagemap *dpagemap,
> >
> > /* Ensure the device has a pm ref while there are
> > device pages active. */
> > xe_pm_runtime_get_noresume(xe);
> > + /* Consumes the devmem allocation. */
> > err = drm_pagemap_migrate_to_devmem(&bo-
> > >devmem_allocation, mm,
> > start, end,
> > timeslice_ms,
> > xpagemap-
> > >pagemap.owner);
> > - if (err)
> > - xe_svm_devmem_release(&bo-
> > >devmem_allocation);
> > xe_bo_unlock(bo);
> > xe_bo_put(bo);
> > }
> > @@ -1546,6 +1545,7 @@ int xe_svm_alloc_vram(struct xe_svm_range
> > *range, const struct drm_gpusvm_ctx *c
> > struct drm_pagemap *dpagemap)
> > {
> > struct xe_device *xe = range_to_vm(&range->base)->xe;
> > + int err, retries = 1;
> >
> > xe_assert(range_to_vm(&range->base)->xe, range-
> > >base.pages.flags.migrate_devmem);
> > range_debug(range, "ALLOCATE VRAM");
> > @@ -1554,10 +1554,18 @@ int xe_svm_alloc_vram(struct xe_svm_range
> > *range, const struct drm_gpusvm_ctx *c
> > drm_dbg(&xe->drm, "Request migration to device
> > memory on \"%s\".\n",
> > dpagemap->drm->unique);
> >
> > - return drm_pagemap_populate_mm(dpagemap,
> > xe_svm_range_start(range),
> > - xe_svm_range_end(range),
> > - range->base.gpusvm->mm,
> > - ctx->timeslice_ms);
> > + do {
> > + err = drm_pagemap_populate_mm(dpagemap,
> > xe_svm_range_start(range),
> > +
> > xe_svm_range_end(range),
> > + range->base.gpusvm-
> > >mm,
> > + ctx->timeslice_ms);
> > +
> > + if (err == -EBUSY && retries)
> > + drm_gpusvm_range_evict(range->base.gpusvm,
> > &range->base);
> > +
> > + } while (err == -EBUSY && retries--);
> > +
> > + return err;
> > }
> >
> > static struct drm_pagemap_addr
> > --
> > 2.51.1
> >