On Wed, Feb 11, 2026 at 5:37 AM Pierre-Eric Pelloux-Prayer <[email protected]> wrote:
>
> This is tricky to implement right and we're going to need
> it from the devcoredump.
>
> Signed-off-by: Pierre-Eric Pelloux-Prayer <[email protected]>
Acked-by: Alex Deucher <[email protected]> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 84 +++++++++++++++++--------- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 3 + > 2 files changed, 57 insertions(+), 30 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > index 807f8bcc7de5..6a5b3e148554 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > @@ -2930,6 +2930,50 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void > *data, struct drm_file *filp) > return 0; > } > > +/** > + * amdgpu_vm_lock_by_pasid - return an amdgpu_vm and its root bo from a > pasid, if possible. > + * @adev: amdgpu device pointer > + * @root: root BO of the VM > + * @pasid: PASID of the VM > + * The caller needs to unreserve and unref the root bo on success. > + */ > +struct amdgpu_vm *amdgpu_vm_lock_by_pasid(struct amdgpu_device *adev, > + struct amdgpu_bo **root, u32 pasid) > +{ > + unsigned long irqflags; > + struct amdgpu_vm *vm; > + int r; > + > + xa_lock_irqsave(&adev->vm_manager.pasids, irqflags); > + vm = xa_load(&adev->vm_manager.pasids, pasid); > + *root = vm ? amdgpu_bo_ref(vm->root.bo) : NULL; > + xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags); > + > + if (!*root) > + return NULL; > + > + r = amdgpu_bo_reserve(*root, true); > + if (r) > + goto error_unref; > + > + /* Double check that the VM still exists */ > + xa_lock_irqsave(&adev->vm_manager.pasids, irqflags); > + vm = xa_load(&adev->vm_manager.pasids, pasid); > + if (vm && vm->root.bo != *root) > + vm = NULL; > + xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags); > + if (!vm) > + goto error_unlock; > + > + return vm; > +error_unlock: > + amdgpu_bo_unreserve(*root); > + > +error_unref: > + amdgpu_bo_unref(root); > + return NULL; > +} > + > /** > * amdgpu_vm_handle_fault - graceful handling of VM faults. 
> * @adev: amdgpu device pointer > @@ -2945,50 +2989,31 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void > *data, struct drm_file *filp) > * shouldn't be reported any more. > */ > bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, > - u32 vmid, u32 node_id, uint64_t addr, uint64_t ts, > - bool write_fault) > + u32 vmid, u32 node_id, uint64_t addr, > + uint64_t ts, bool write_fault) > { > bool is_compute_context = false; > struct amdgpu_bo *root; > - unsigned long irqflags; > uint64_t value, flags; > struct amdgpu_vm *vm; > int r; > > - xa_lock_irqsave(&adev->vm_manager.pasids, irqflags); > - vm = xa_load(&adev->vm_manager.pasids, pasid); > - if (vm) { > - root = amdgpu_bo_ref(vm->root.bo); > - is_compute_context = vm->is_compute_context; > - } else { > - root = NULL; > - } > - xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags); > - > - if (!root) > + vm = amdgpu_vm_lock_by_pasid(adev, &root, pasid); > + if (!vm) > return false; > > + is_compute_context = vm->is_compute_context; > + > addr /= AMDGPU_GPU_PAGE_SIZE; > > - if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid, > - node_id, addr, ts, write_fault)) { > + if (is_compute_context && > + !svm_range_restore_pages(adev, pasid, vmid, node_id, addr, > + ts, write_fault)) { > + amdgpu_bo_unreserve(root); > amdgpu_bo_unref(&root); > return true; > } > > - r = amdgpu_bo_reserve(root, true); > - if (r) > - goto error_unref; > - > - /* Double check that the VM still exists */ > - xa_lock_irqsave(&adev->vm_manager.pasids, irqflags); > - vm = xa_load(&adev->vm_manager.pasids, pasid); > - if (vm && vm->root.bo != root) > - vm = NULL; > - xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags); > - if (!vm) > - goto error_unlock; > - > flags = AMDGPU_PTE_VALID | AMDGPU_PTE_SNOOPED | > AMDGPU_PTE_SYSTEM; > > @@ -3027,7 +3052,6 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, > u32 pasid, > if (r < 0) > dev_err(adev->dev, "Can't handle page fault (%d)\n", r); > > 
-error_unref: > amdgpu_bo_unref(&root); > > return false; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > index 139642eacdd0..2051eda55c99 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > @@ -589,6 +589,9 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, > u32 pasid, > u32 vmid, u32 node_id, uint64_t addr, uint64_t ts, > bool write_fault); > > +struct amdgpu_vm *amdgpu_vm_lock_by_pasid(struct amdgpu_device *adev, > + struct amdgpu_bo **root, u32 pasid); > + > void amdgpu_vm_set_task_info(struct amdgpu_vm *vm); > > void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev, > -- > 2.43.0 >
