On Mon, Mar 02, 2026 at 05:32:47PM +0100, Thomas Hellström wrote: > xe_vm_range_tilemask_tlb_inval() submits TLB invalidation requests to > all GTs in a tile mask and then immediately waits for them to complete > before returning. This is fine for the existing callers, but a > subsequent patch will need to defer the wait in order to overlap TLB > invalidations across multiple VMAs. > > Introduce xe_tlb_inval_range_tilemask_submit() and > xe_tlb_inval_batch_wait() in xe_tlb_inval.c as the submit and wait > halves respectively. The batch of fences is carried in the new > xe_tlb_inval_batch structure. Remove xe_vm_range_tilemask_tlb_inval() > and convert all three call sites to the new API. >
Mostly nits... > Assisted-by: GitHub Copilot:claude-sonnet-4.6 > Signed-off-by: Thomas Hellström <[email protected]> > --- > drivers/gpu/drm/xe/xe_svm.c | 6 +- > drivers/gpu/drm/xe/xe_tlb_inval.c | 82 +++++++++++++++++++++++++ > drivers/gpu/drm/xe/xe_tlb_inval.h | 6 ++ > drivers/gpu/drm/xe/xe_tlb_inval_types.h | 14 +++++ > drivers/gpu/drm/xe/xe_vm.c | 69 +++------------------ > drivers/gpu/drm/xe/xe_vm.h | 3 - > drivers/gpu/drm/xe/xe_vm_madvise.c | 9 ++- > drivers/gpu/drm/xe/xe_vm_types.h | 1 + > 8 files changed, 123 insertions(+), 67 deletions(-) > > diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c > index 002b6c22ad3f..6ea4972c2791 100644 > --- a/drivers/gpu/drm/xe/xe_svm.c > +++ b/drivers/gpu/drm/xe/xe_svm.c > @@ -19,6 +19,7 @@ > #include "xe_pt.h" > #include "xe_svm.h" > #include "xe_tile.h" > +#include "xe_tlb_inval.h" > #include "xe_ttm_vram_mgr.h" > #include "xe_vm.h" > #include "xe_vm_types.h" > @@ -225,6 +226,7 @@ static void xe_svm_invalidate(struct drm_gpusvm *gpusvm, > const struct mmu_notifier_range *mmu_range) > { > struct xe_vm *vm = gpusvm_to_vm(gpusvm); > + struct xe_tlb_inval_batch _batch; > struct xe_device *xe = vm->xe; > struct drm_gpusvm_range *r, *first; > struct xe_tile *tile; > @@ -276,7 +278,9 @@ static void xe_svm_invalidate(struct drm_gpusvm *gpusvm, > > xe_device_wmb(xe); > > - err = xe_vm_range_tilemask_tlb_inval(vm, adj_start, adj_end, tile_mask); > + err = xe_tlb_inval_range_tilemask_submit(xe, vm->usm.asid, adj_start, > adj_end, > + tile_mask, &_batch); > + xe_tlb_inval_batch_wait(&_batch); No need to call wait on an error but it is harmless. 
So you could write it like this: if (!WARN_ON_ONCE(err)) xe_tlb_inval_batch_wait(&_batch); > WARN_ON_ONCE(err); > > range_notifier_event_end: > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval.c > b/drivers/gpu/drm/xe/xe_tlb_inval.c > index 933f30fb617d..343e37cfe715 100644 > --- a/drivers/gpu/drm/xe/xe_tlb_inval.c > +++ b/drivers/gpu/drm/xe/xe_tlb_inval.c > @@ -486,3 +486,85 @@ bool xe_tlb_inval_idle(struct xe_tlb_inval *tlb_inval) > guard(spinlock_irq)(&tlb_inval->pending_lock); > return list_is_singular(&tlb_inval->pending_fences); > } > + > +/** > + * xe_tlb_inval_batch_wait() - Wait for all fences in a TLB invalidation > batch > + * @batch: Batch of TLB invalidation fences to wait on > + * > + * Waits for every fence in @batch to signal, then resets @batch so it can be > + * reused for a subsequent invalidation. > + */ > +void xe_tlb_inval_batch_wait(struct xe_tlb_inval_batch *batch) > +{ > + struct xe_tlb_inval_fence *fence = &batch->fence[0]; Would this be better: s/&batch->fence[0]/batch->fence Personal preference I guess. > + unsigned int i; > + > + for (i = 0; i < batch->num_fences; ++i) > + xe_tlb_inval_fence_wait(fence++); > + > + batch->num_fences = 0; > +} > + > +/** > + * xe_tlb_inval_range_tilemask_submit() - Submit TLB invalidations for an > + * address range on a tile mask > + * @xe: The xe device > + * @asid: Address space ID > + * @start: start address > + * @end: end address > + * @tile_mask: mask for which gt's issue tlb invalidation > + * @batch: Batch of tlb invalidate fences > + * > + * Issue a range based TLB invalidation for gt's in tilemask > + * Mention no need to wait on batch if this function returns an error? > + * Returns 0 for success, negative error code otherwise. 
> + */ > +int xe_tlb_inval_range_tilemask_submit(struct xe_device *xe, u32 asid, > + u64 start, u64 end, u8 tile_mask, > + struct xe_tlb_inval_batch *batch) > +{ > + struct xe_tlb_inval_fence *fence = &batch->fence[0]; > + struct xe_tile *tile; > + u32 fence_id = 0; > + u8 id; > + int err; > + > + batch->num_fences = 0; > + if (!tile_mask) > + return 0; > + > + for_each_tile(tile, xe, id) { > + if (!(tile_mask & BIT(id))) > + continue; > + > + xe_tlb_inval_fence_init(&tile->primary_gt->tlb_inval, > + &fence[fence_id], true); > + > + err = xe_tlb_inval_range(&tile->primary_gt->tlb_inval, > + &fence[fence_id], start, end, > + asid, NULL); > + if (err) > + goto wait; > + ++fence_id; > + > + if (!tile->media_gt) > + continue; > + > + xe_tlb_inval_fence_init(&tile->media_gt->tlb_inval, > + &fence[fence_id], true); > + > + err = xe_tlb_inval_range(&tile->media_gt->tlb_inval, > + &fence[fence_id], start, end, > + asid, NULL); > + if (err) > + goto wait; > + ++fence_id; > + } > + > +wait: > + batch->num_fences = fence_id; Should 'batch->num_fences' only get set on success? 
> + if (err) > + xe_tlb_inval_batch_wait(batch); > + > + return err; > +} > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval.h > b/drivers/gpu/drm/xe/xe_tlb_inval.h > index 62089254fa23..a76b7823a5f2 100644 > --- a/drivers/gpu/drm/xe/xe_tlb_inval.h > +++ b/drivers/gpu/drm/xe/xe_tlb_inval.h > @@ -45,4 +45,10 @@ void xe_tlb_inval_done_handler(struct xe_tlb_inval > *tlb_inval, int seqno); > > bool xe_tlb_inval_idle(struct xe_tlb_inval *tlb_inval); > > +int xe_tlb_inval_range_tilemask_submit(struct xe_device *xe, u32 asid, > + u64 start, u64 end, u8 tile_mask, > + struct xe_tlb_inval_batch *batch); > + > +void xe_tlb_inval_batch_wait(struct xe_tlb_inval_batch *batch); > + > #endif /* _XE_TLB_INVAL_ */ > diff --git a/drivers/gpu/drm/xe/xe_tlb_inval_types.h > b/drivers/gpu/drm/xe/xe_tlb_inval_types.h > index 3b089f90f002..3d1797d186fd 100644 > --- a/drivers/gpu/drm/xe/xe_tlb_inval_types.h > +++ b/drivers/gpu/drm/xe/xe_tlb_inval_types.h > @@ -9,6 +9,8 @@ > #include <linux/workqueue.h> > #include <linux/dma-fence.h> > > +#include "xe_device_types.h" > + > struct drm_suballoc; > struct xe_tlb_inval; > > @@ -132,4 +134,16 @@ struct xe_tlb_inval_fence { > ktime_t inval_time; > }; > > +/** > + * struct xe_tlb_inval_batch - Batch of TLB invalidation fences > + * > + * Holds one fence per GT covered by a TLB invalidation request. 
> + */ > +struct xe_tlb_inval_batch { > + /** @fence: per-GT TLB invalidation fences */ > + struct xe_tlb_inval_fence fence[XE_MAX_TILES_PER_DEVICE * > XE_MAX_GT_PER_TILE]; > + /** @num_fences: number of valid entries in @fence */ > + unsigned int num_fences; > +}; > + > #endif > diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c > index 548b0769b3ef..7f29d2b2972d 100644 > --- a/drivers/gpu/drm/xe/xe_vm.c > +++ b/drivers/gpu/drm/xe/xe_vm.c > @@ -3966,66 +3966,6 @@ void xe_vm_unlock(struct xe_vm *vm) > dma_resv_unlock(xe_vm_resv(vm)); > } > > -/** > - * xe_vm_range_tilemask_tlb_inval - Issue a TLB invalidation on this > tilemask for an > - * address range > - * @vm: The VM > - * @start: start address > - * @end: end address > - * @tile_mask: mask for which gt's issue tlb invalidation > - * > - * Issue a range based TLB invalidation for gt's in tilemask > - * > - * Returns 0 for success, negative error code otherwise. > - */ > -int xe_vm_range_tilemask_tlb_inval(struct xe_vm *vm, u64 start, > - u64 end, u8 tile_mask) > -{ > - struct xe_tlb_inval_fence > - fence[XE_MAX_TILES_PER_DEVICE * XE_MAX_GT_PER_TILE]; > - struct xe_tile *tile; > - u32 fence_id = 0; > - u8 id; > - int err; > - > - if (!tile_mask) > - return 0; > - > - for_each_tile(tile, vm->xe, id) { > - if (!(tile_mask & BIT(id))) > - continue; > - > - xe_tlb_inval_fence_init(&tile->primary_gt->tlb_inval, > - &fence[fence_id], true); > - > - err = xe_tlb_inval_range(&tile->primary_gt->tlb_inval, > - &fence[fence_id], start, end, > - vm->usm.asid, NULL); > - if (err) > - goto wait; > - ++fence_id; > - > - if (!tile->media_gt) > - continue; > - > - xe_tlb_inval_fence_init(&tile->media_gt->tlb_inval, > - &fence[fence_id], true); > - > - err = xe_tlb_inval_range(&tile->media_gt->tlb_inval, > - &fence[fence_id], start, end, > - vm->usm.asid, NULL); > - if (err) > - goto wait; > - ++fence_id; > - } > - > -wait: > - for (id = 0; id < fence_id; ++id) > - xe_tlb_inval_fence_wait(&fence[id]); > - > - 
return err; > -} > - > /** > * xe_vm_invalidate_vma - invalidate GPU mappings for VMA without a lock > * @vma: VMA to invalidate > @@ -4040,6 +3980,7 @@ int xe_vm_invalidate_vma(struct xe_vma *vma) > { > struct xe_device *xe = xe_vma_vm(vma)->xe; > struct xe_vm *vm = xe_vma_vm(vma); > + struct xe_tlb_inval_batch _batch; Why not just 'batch'? > struct xe_tile *tile; > u8 tile_mask = 0; > int ret = 0; > @@ -4080,12 +4021,16 @@ int xe_vm_invalidate_vma(struct xe_vma *vma) > > xe_device_wmb(xe); > > - ret = xe_vm_range_tilemask_tlb_inval(xe_vma_vm(vma), xe_vma_start(vma), > - xe_vma_end(vma), tile_mask); > + ret = xe_tlb_inval_range_tilemask_submit(xe, xe_vma_vm(vma)->usm.asid, > + xe_vma_start(vma), > xe_vma_end(vma), > + tile_mask, &_batch); > > /* WRITE_ONCE pairs with READ_ONCE in xe_vm_has_valid_gpu_mapping() */ > WRITE_ONCE(vma->tile_invalidated, vma->tile_mask); > > + if (!ret) > + xe_tlb_inval_batch_wait(&_batch); > + Here we skip the wait on error, hence my suggestion to skip waits in other code paths or at a minimum make the call semantics consistent. 
> return ret; > } > > diff --git a/drivers/gpu/drm/xe/xe_vm.h b/drivers/gpu/drm/xe/xe_vm.h > index f849e369432b..62f4b6fec0bc 100644 > --- a/drivers/gpu/drm/xe/xe_vm.h > +++ b/drivers/gpu/drm/xe/xe_vm.h > @@ -240,9 +240,6 @@ struct dma_fence *xe_vm_range_rebind(struct xe_vm *vm, > struct dma_fence *xe_vm_range_unbind(struct xe_vm *vm, > struct xe_svm_range *range); > > -int xe_vm_range_tilemask_tlb_inval(struct xe_vm *vm, u64 start, > - u64 end, u8 tile_mask); > - > int xe_vm_invalidate_vma(struct xe_vma *vma); > > int xe_vm_validate_protected(struct xe_vm *vm); > diff --git a/drivers/gpu/drm/xe/xe_vm_madvise.c > b/drivers/gpu/drm/xe/xe_vm_madvise.c > index 95bf53cc29e3..39717026e84f 100644 > --- a/drivers/gpu/drm/xe/xe_vm_madvise.c > +++ b/drivers/gpu/drm/xe/xe_vm_madvise.c > @@ -12,6 +12,7 @@ > #include "xe_pat.h" > #include "xe_pt.h" > #include "xe_svm.h" > +#include "xe_tlb_inval.h" > > struct xe_vmas_in_madvise_range { > u64 addr; > @@ -235,13 +236,19 @@ static u8 xe_zap_ptes_in_madvise_range(struct xe_vm > *vm, u64 start, u64 end) > static int xe_vm_invalidate_madvise_range(struct xe_vm *vm, u64 start, u64 > end) > { > u8 tile_mask = xe_zap_ptes_in_madvise_range(vm, start, end); > + struct xe_tlb_inval_batch batch; > + int err; > > if (!tile_mask) > return 0; > > xe_device_wmb(vm->xe); > > - return xe_vm_range_tilemask_tlb_inval(vm, start, end, tile_mask); > + err = xe_tlb_inval_range_tilemask_submit(vm->xe, vm->usm.asid, start, > end, > + tile_mask, &batch); > + xe_tlb_inval_batch_wait(&batch); No need to wait on error. 
> + > + return err; > } > > static bool madvise_args_are_sane(struct xe_device *xe, const struct > drm_xe_madvise *args) > diff --git a/drivers/gpu/drm/xe/xe_vm_types.h > b/drivers/gpu/drm/xe/xe_vm_types.h > index 1f6f7e30e751..de6544165cfa 100644 > --- a/drivers/gpu/drm/xe/xe_vm_types.h > +++ b/drivers/gpu/drm/xe/xe_vm_types.h > @@ -18,6 +18,7 @@ > #include "xe_device_types.h" > #include "xe_pt_types.h" > #include "xe_range_fence.h" > +#include "xe_tlb_inval_types.h" > #include "xe_userptr.h" > > struct drm_pagemap; > -- > 2.53.0 >
