On Wed, Feb 11, 2026 at 5:37 AM Pierre-Eric Pelloux-Prayer
<[email protected]> wrote:
>
> Looking up a VM by its PASID and locking its root BO is tricky to
> implement right, and we're going to need it from the devcoredump
> code, so factor it out into a helper.
>
> Signed-off-by: Pierre-Eric Pelloux-Prayer <[email protected]>

Acked-by: Alex Deucher <[email protected]>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 84 +++++++++++++++++---------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  3 +
>  2 files changed, 57 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 807f8bcc7de5..6a5b3e148554 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2930,6 +2930,50 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>         return 0;
>  }
>
> +/**
> + * amdgpu_vm_lock_by_pasid - return an amdgpu_vm and its root bo from a pasid, if possible.
> + * @adev: amdgpu device pointer
> + * @root: root BO of the VM
> + * @pasid: PASID of the VM
> + * The caller needs to unreserve and unref the root bo on success.
> + */
> +struct amdgpu_vm *amdgpu_vm_lock_by_pasid(struct amdgpu_device *adev,
> +                                         struct amdgpu_bo **root, u32 pasid)
> +{
> +       unsigned long irqflags;
> +       struct amdgpu_vm *vm;
> +       int r;
> +
> +       xa_lock_irqsave(&adev->vm_manager.pasids, irqflags);
> +       vm = xa_load(&adev->vm_manager.pasids, pasid);
> +       *root = vm ? amdgpu_bo_ref(vm->root.bo) : NULL;
> +       xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags);
> +
> +       if (!*root)
> +               return NULL;
> +
> +       r = amdgpu_bo_reserve(*root, true);
> +       if (r)
> +               goto error_unref;
> +
> +       /* Double check that the VM still exists */
> +       xa_lock_irqsave(&adev->vm_manager.pasids, irqflags);
> +       vm = xa_load(&adev->vm_manager.pasids, pasid);
> +       if (vm && vm->root.bo != *root)
> +               vm = NULL;
> +       xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags);
> +       if (!vm)
> +               goto error_unlock;
> +
> +       return vm;
> +error_unlock:
> +       amdgpu_bo_unreserve(*root);
> +
> +error_unref:
> +       amdgpu_bo_unref(root);
> +       return NULL;
> +}
> +
>  /**
>   * amdgpu_vm_handle_fault - graceful handling of VM faults.
>   * @adev: amdgpu device pointer
> @@ -2945,50 +2989,31 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>   * shouldn't be reported any more.
>   */
>  bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
> -                           u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
> -                           bool write_fault)
> +                           u32 vmid, u32 node_id, uint64_t addr,
> +                           uint64_t ts, bool write_fault)
>  {
>         bool is_compute_context = false;
>         struct amdgpu_bo *root;
> -       unsigned long irqflags;
>         uint64_t value, flags;
>         struct amdgpu_vm *vm;
>         int r;
>
> -       xa_lock_irqsave(&adev->vm_manager.pasids, irqflags);
> -       vm = xa_load(&adev->vm_manager.pasids, pasid);
> -       if (vm) {
> -               root = amdgpu_bo_ref(vm->root.bo);
> -               is_compute_context = vm->is_compute_context;
> -       } else {
> -               root = NULL;
> -       }
> -       xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags);
> -
> -       if (!root)
> +       vm = amdgpu_vm_lock_by_pasid(adev, &root, pasid);
> +       if (!vm)
>                 return false;
>
> +       is_compute_context = vm->is_compute_context;
> +
>         addr /= AMDGPU_GPU_PAGE_SIZE;
>
> -       if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid,
> -           node_id, addr, ts, write_fault)) {
> +       if (is_compute_context &&
> +               !svm_range_restore_pages(adev, pasid, vmid, node_id, addr,
> +                                       ts, write_fault)) {
> +               amdgpu_bo_unreserve(root);
>                 amdgpu_bo_unref(&root);
>                 return true;
>         }
>
> -       r = amdgpu_bo_reserve(root, true);
> -       if (r)
> -               goto error_unref;
> -
> -       /* Double check that the VM still exists */
> -       xa_lock_irqsave(&adev->vm_manager.pasids, irqflags);
> -       vm = xa_load(&adev->vm_manager.pasids, pasid);
> -       if (vm && vm->root.bo != root)
> -               vm = NULL;
> -       xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags);
> -       if (!vm)
> -               goto error_unlock;
> -
>         flags = AMDGPU_PTE_VALID | AMDGPU_PTE_SNOOPED |
>                 AMDGPU_PTE_SYSTEM;
>
> @@ -3027,7 +3052,6 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
>         if (r < 0)
>                 dev_err(adev->dev, "Can't handle page fault (%d)\n", r);
>
> -error_unref:
>         amdgpu_bo_unref(&root);
>
>         return false;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 139642eacdd0..2051eda55c99 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -589,6 +589,9 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
>                             u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
>                             bool write_fault);
>
> +struct amdgpu_vm *amdgpu_vm_lock_by_pasid(struct amdgpu_device *adev,
> +                                         struct amdgpu_bo **root, u32 pasid);
> +
>  void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
>
>  void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
> --
> 2.43.0
>

Reply via email to