On Mon, Mar 02, 2026 at 05:32:47PM +0100, Thomas Hellström wrote: > xe_vm_range_tilemask_tlb_inval() submits TLB invalidation requests to > all GTs in a tile mask and then immediately waits for them to complete > before returning. This is fine for the existing callers, but a > subsequent patch will need to defer the wait in order to overlap TLB > invalidations across multiple VMAs. > > Introduce xe_tlb_inval_range_tilemask_submit() and > xe_tlb_inval_batch_wait() in xe_tlb_inval.c as the submit and wait > halves respectively. The batch of fences is carried in the new > xe_tlb_inval_batch structure. Remove xe_vm_range_tilemask_tlb_inval() > and convert all three call sites to the new API. >
Mostly nits... > Assisted-by: GitHub Copilot:claude-sonnet-4.6 > Signed-off-by: Thomas Hellström <[email protected]> > --- > drivers/gpu/drm/xe/xe_svm.c | 6 +- > drivers/gpu/drm/xe/xe_tlb_inval.c | 82 +++++++++++++++++++++++++ > drivers/gpu/drm/xe/xe_tlb_inval.h | 6 ++ > drivers/gpu/drm/xe/xe_tlb_inval_types.h | 14 +++++ > drivers/gpu/drm/xe/xe_vm.c | 69 +++------------------ > drivers/gpu/drm/xe/xe_vm.h | 3 - > drivers/gpu/drm/xe/xe_vm_madvise.c | 9 ++- > drivers/gpu/drm/xe/xe_vm_types.h | 1 + > 8 files changed, 123 insertions(+), 67 deletions(-) > > diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c > index 002b6c22ad3f..6ea4972c2791 100644 > --- a/drivers/gpu/drm/xe/xe_svm.c > +++ b/drivers/gpu/drm/xe/xe_svm.c > @@ -19,6 +19,7 @@ > #include "xe_pt.h" > #include "xe_svm.h" > #include "xe_tile.h" > +#include "xe_tlb_inval.h" > #include "xe_ttm_vram_mgr.h" > #include "xe_vm.h" > #include "xe_vm_types.h" > @@ -225,6 +226,7 @@ static void xe_svm_invalidate(struct drm_gpusvm *gpusvm, > const struct mmu_notifier_range *mmu_range) > { > struct xe_vm *vm = gpusvm_to_vm(gpusvm); > + struct xe_tlb_inval_batch _batch; > struct xe_device *xe = vm->xe; > struct drm_gpusvm_range *r, *first; > struct xe_tile *tile; > @@ -276,7 +278,9 @@ static void xe_svm_invalidate(struct drm_gpusvm *gpusvm, > > xe_device_wmb(xe); > > - err = xe_vm_range_tilemask_tlb_inval(vm, adj_start, adj_end, tile_mask); > + err = xe_tlb_inval_range_tilemask_submit(xe, vm->usm.asid, adj_start, > adj_end, > + tile_mask, &_batch); > + xe_tlb_inval_batch_wait(&_batch); No need to call wait on an error but it is harmless. 
So you could write it like this: if (!WARN_ON_ONCE(err)) xe_tlb_inval_batch_wait(&_batch); > WARN_ON_ONCE(err); > > range_notifier_event_end: > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval.c > b/drivers/gpu/drm/xe/xe_tlb_inval.c > index 933f30fb617d..343e37cfe715 100644 > --- a/drivers/gpu/drm/xe/xe_tlb_inval.c > +++ b/drivers/gpu/drm/xe/xe_tlb_inval.c > @@ -486,3 +486,85 @@ bool xe_tlb_inval_idle(struct xe_tlb_inval *tlb_inval) > guard(spinlock_irq)(&tlb_inval->pending_lock); > return list_is_singular(&tlb_inval->pending_fences); > } > + > +/** > + * xe_tlb_inval_batch_wait() - Wait for all fences in a TLB invalidation > batch > + * @batch: Batch of TLB invalidation fences to wait on > + * > + * Waits for every fence in @batch to signal, then resets @batch so it can be > + * reused for a subsequent invalidation. > + */ > +void xe_tlb_inval_batch_wait(struct xe_tlb_inval_batch *batch) > +{ > + struct xe_tlb_inval_fence *fence = &batch->fence[0]; Would this be better: s/&batch->fence[0]/batch->fence Personal preference I guess. > + unsigned int i; > + > + for (i = 0; i < batch->num_fences; ++i) > + xe_tlb_inval_fence_wait(fence++); > + > + batch->num_fences = 0; > +} > + > +/** > + * xe_tlb_inval_range_tilemask_submit() - Submit TLB invalidations for an > + * address range on a tile mask > + * @xe: The xe device > + * @asid: Address space ID > + * @start: start address > + * @end: end address > + * @tile_mask: mask for which gt's issue tlb invalidation > + * @batch: Batch of tlb invalidate fences > + * > + * Issue a range based TLB invalidation for gt's in tilemask > + * Mention no need to wait on batch if this function returns an error? > + * Returns 0 for success, negative error code otherwise. 
> + */ > +int xe_tlb_inval_range_tilemask_submit(struct xe_device *xe, u32 asid, > + u64 start, u64 end, u8 tile_mask, > + struct xe_tlb_inval_batch *batch) > +{ > + struct xe_tlb_inval_fence *fence = &batch->fence[0]; > + struct xe_tile *tile; > + u32 fence_id = 0; > + u8 id; > + int err; > + > + batch->num_fences = 0; > + if (!tile_mask) > + return 0; > + > + for_each_tile(tile, xe, id) { > + if (!(tile_mask & BIT(id))) > + continue; > + > + xe_tlb_inval_fence_init(&tile->primary_gt->tlb_inval, > + &fence[fence_id], true); > + > + err = xe_tlb_inval_range(&tile->primary_gt->tlb_inval, > + &fence[fence_id], start, end, > + asid, NULL); > + if (err) > + goto wait; > + ++fence_id; > + > + if (!tile->media_gt) > + continue; > + > + xe_tlb_inval_fence_init(&tile->media_gt->tlb_inval, > + &fence[fence_id], true); > + > + err = xe_tlb_inval_range(&tile->media_gt->tlb_inval, > + &fence[fence_id], start, end, > + asid, NULL); > + if (err) > + goto wait; > + ++fence_id; > + } > + > +wait: > + batch->num_fences = fence_id; Should 'batch->num_fences' only get set on success? 
> + if (err) > + xe_tlb_inval_batch_wait(batch); > + > + return err; > +} > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval.h > b/drivers/gpu/drm/xe/xe_tlb_inval.h > index 62089254fa23..a76b7823a5f2 100644 > --- a/drivers/gpu/drm/xe/xe_tlb_inval.h > +++ b/drivers/gpu/drm/xe/xe_tlb_inval.h > @@ -45,4 +45,10 @@ void xe_tlb_inval_done_handler(struct xe_tlb_inval > *tlb_inval, int seqno); > > bool xe_tlb_inval_idle(struct xe_tlb_inval *tlb_inval); > > +int xe_tlb_inval_range_tilemask_submit(struct xe_device *xe, u32 asid, > + u64 start, u64 end, u8 tile_mask, > + struct xe_tlb_inval_batch *batch); > + > +void xe_tlb_inval_batch_wait(struct xe_tlb_inval_batch *batch); > + > #endif /* _XE_TLB_INVAL_ */ > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_types.h > b/drivers/gpu/drm/xe/xe_tlb_inval_types.h > index 3b089f90f002..3d1797d186fd 100644 > --- a/drivers/gpu/drm/xe/xe_tlb_inval_types.h > +++ b/drivers/gpu/drm/xe/xe_tlb_inval_types.h > @@ -9,6 +9,8 @@ > #include <linux/workqueue.h> > #include <linux/dma-fence.h> > > +#include "xe_device_types.h" > + > struct drm_suballoc; > struct xe_tlb_inval; > > @@ -132,4 +134,16 @@ struct xe_tlb_inval_fence { > ktime_t inval_time; > }; > > +/** > + * struct xe_tlb_inval_batch - Batch of TLB invalidation fences > + * > + * Holds one fence per GT covered by a TLB invalidation request. 
> + */ > +struct xe_tlb_inval_batch { > + /** @fence: per-GT TLB invalidation fences */ > + struct xe_tlb_inval_fence fence[XE_MAX_TILES_PER_DEVICE * > XE_MAX_GT_PER_TILE]; > + /** @num_fences: number of valid entries in @fence */ > + unsigned int num_fences; > +}; > + > #endif > diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c > index 548b0769b3ef..7f29d2b2972d 100644 > --- a/drivers/gpu/drm/xe/xe_vm.c > +++ b/drivers/gpu/drm/xe/xe_vm.c > @@ -3966,66 +3966,6 @@ void xe_vm_unlock(struct xe_vm *vm) > dma_resv_unlock(xe_vm_resv(vm)); > } > > -/** > - * xe_vm_range_tilemask_tlb_inval - Issue a TLB invalidation on this > tilemask for an > - * address range > - * @vm: The VM > - * @start: start address > - * @end: end address > - * @tile_mask: mask for which gt's issue tlb invalidation > - * > - * Issue a range based TLB invalidation for gt's in tilemask > - * > - * Returns 0 for success, negative error code otherwise. > - */ > -int xe_vm_range_tilemask_tlb_inval(struct xe_vm *vm, u64 start, > - u64 end, u8 tile_mask) > -{ > - struct xe_tlb_inval_fence > - fence[XE_MAX_TILES_PER_DEVICE * XE_MAX_GT_PER_TILE]; > - struct xe_tile *tile; > - u32 fence_id = 0; > - u8 id; > - int err; > - > - if (!tile_mask) > - return 0; > - > - for_each_tile(tile, vm->xe, id) { > - if (!(tile_mask & BIT(id))) > - continue; > - > - xe_tlb_inval_fence_init(&tile->primary_gt->tlb_inval, > - &fence[fence_id], true); > - > - err = xe_tlb_inval_range(&tile->primary_gt->tlb_inval, > - &fence[fence_id], start, end, > - vm->usm.asid, NULL); > - if (err) > - goto wait; > - ++fence_id; > - > - if (!tile->media_gt) > - continue; > - > - xe_tlb_inval_fence_init(&tile->media_gt->tlb_inval, > - &fence[fence_id], true); > - > - err = xe_tlb_inval_range(&tile->media_gt->tlb_inval, > - &fence[fence_id], start, end, > - vm->usm.asid, NULL); > - if (err) > - goto wait; > - ++fence_id; > - } > - > -wait: > - for (id = 0; id < fence_id; ++id) > - xe_tlb_inval_fence_wait(&fence[id]); > - > - 
return err; > -} > - > /** > * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock > * @vma: VMA to invalidate > @@ -4040,6 +3980,7 @@ int xe_vm_invalidate_vma(struct xe_vma *vma) > { > struct xe_device *xe = xe_vma_vm(vma)->xe; > struct xe_vm *vm = xe_vma_vm(vma); > + struct xe_tlb_inval_batch _batch; Why not just 'batch'? > struct xe_tile *tile; > u8 tile_mask = 0; > int ret = 0; > @@ -4080,12 +4021,16 @@ int xe_vm_invalidate_vma(struct xe_vma *vma) > > xe_device_wmb(xe); > > - ret = xe_vm_range_tilemask_tlb_inval(xe_vma_vm(vma), xe_vma_start(vma), > - xe_vma_end(vma), tile_mask); > + ret = xe_tlb_inval_range_tilemask_submit(xe, xe_vma_vm(vma)->usm.asid, > + xe_vma_start(vma), > xe_vma_end(vma), > + tile_mask, &_batch); > > /* WRITE_ONCE pairs with READ_ONCE in xe_vm_has_valid_gpu_mapping() */ > WRITE_ONCE(vma->tile_invalidated, vma->tile_mask); > > + if (!ret) > + xe_tlb_inval_batch_wait(&_batch); > + Here we skip the wait on error, hence my suggestion to skip waits in other code paths or at a minimum make the call semantics consistent. 
> return ret; > } > > diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h > index f849e369432b..62f4b6fec0bc 100644 > --- a/drivers/gpu/drm/xe/xe_vm.h > +++ b/drivers/gpu/drm/xe/xe_vm.h > @@ -240,9 +240,6 @@ struct dma_fence *xe_vm_range_rebind(struct xe_vm *vm, > struct dma_fence *xe_vm_range_unbind(struct xe_vm *vm, > struct xe_svm_range *range); > > -int xe_vm_range_tilemask_tlb_inval(struct xe_vm *vm, u64 start, > - u64 end, u8 tile_mask); > - > int xe_vm_invalidate_vma(struct xe_vma *vma); > > int xe_vm_validate_protected(struct xe_vm *vm); > diff --git a/drivers/gpu/drm/xe/xe_vm_madvise.c > b/drivers/gpu/drm/xe/xe_vm_madvise.c > index 95bf53cc29e3..39717026e84f 100644 > --- a/drivers/gpu/drm/xe/xe_vm_madvise.c > +++ b/drivers/gpu/drm/xe/xe_vm_madvise.c > @@ -12,6 +12,7 @@ > #include "xe_pat.h" > #include "xe_pt.h" > #include "xe_svm.h" > +#include "xe_tlb_inval.h" > > struct xe_vmas_in_madvise_range { > u64 addr; > @@ -235,13 +236,19 @@ static u8 xe_zap_ptes_in_madvise_range(struct xe_vm > *vm, u64 start, u64 end) > static int xe_vm_invalidate_madvise_range(struct xe_vm *vm, u64 start, u64 > end) > { > u8 tile_mask = xe_zap_ptes_in_madvise_range(vm, start, end); > + struct xe_tlb_inval_batch batch; > + int err; > > if (!tile_mask) > return 0; > > xe_device_wmb(vm->xe); > > - return xe_vm_range_tilemask_tlb_inval(vm, start, end, tile_mask); > + err = xe_tlb_inval_range_tilemask_submit(vm->xe, vm->usm.asid, start, > end, > + tile_mask, &batch); > + xe_tlb_inval_batch_wait(&batch); No need to wait on error. 
> + > + return err; > } > > static bool madvise_args_are_sane(struct xe_device *xe, const struct > drm_xe_madvise *args) > diff --git a/drivers/gpu/drm/xe/xe_vm_types.h > b/drivers/gpu/drm/xe/xe_vm_types.h > index 1f6f7e30e751..de6544165cfa 100644 > --- a/drivers/gpu/drm/xe/xe_vm_types.h > +++ b/drivers/gpu/drm/xe/xe_vm_types.h > @@ -18,6 +18,7 @@ > #include "xe_device_types.h" > #include "xe_pt_types.h" > #include "xe_range_fence.h" > +#include "xe_tlb_inval_types.h" > #include "xe_userptr.h" > > struct drm_pagemap; > -- > 2.53.0 >
