amdkfd: check access permisson to restore retry fault

Felix Kuehling Thu, 19 Aug 2021 14:13:09 -0700

Am 2021-08-19 um 10:56 a.m. schrieb Philip Yang:
> Check range access permission to restore GPU retry fault, if GPU retry
> fault on address which belongs to VMA, and VMA has no read or write
> permission requested by GPU, failed to restore the address. The vm fault
> event will pass back to user space.
>
> Signed-off-by: Philip Yang <[email protected]>


Just some nit-picks. Other than that the patch looks good to me. See
inline ...


> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c |  5 +++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  2 +-
>  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  3 ++-
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  3 ++-
>  drivers/gpu/drm/amd/amdkfd/kfd_svm.c   | 30 +++++++++++++++++++++++++-
>  drivers/gpu/drm/amd/amdkfd/kfd_svm.h   |  5 +++--
>  6 files changed, 40 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 831f00644460..ff6de40b860c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -3347,12 +3347,13 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>   * @adev: amdgpu device pointer
>   * @pasid: PASID of the VM
>   * @addr: Address of the fault
> + * @rw_fault: 0 is read fault, 1 is write fault
>   *
>   * Try to gracefully handle a VM fault. Return true if the fault was handled 
> and
>   * shouldn't be reported any more.
>   */
>  bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
> -                         uint64_t addr)
> +                         uint64_t addr, uint32_t rw_fault)

Should rw_fault be a bool? And maybe call it write_fault to clarify what
"true" means.


>  {
>       bool is_compute_context = false;
>       struct amdgpu_bo *root;
> @@ -3377,7 +3378,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, 
> u32 pasid,
>       addr /= AMDGPU_GPU_PAGE_SIZE;
>  
>       if (is_compute_context &&
> -         !svm_range_restore_pages(adev, pasid, addr)) {
> +         !svm_range_restore_pages(adev, pasid, addr, rw_fault)) {
>               amdgpu_bo_unref(&root)
>               return true;
>       }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 80cc9ab2c1d0..1cc574ece180 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -448,7 +448,7 @@ void amdgpu_vm_check_compute_bug(struct amdgpu_device 
> *adev);
>  void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
>                            struct amdgpu_task_info *task_info);
>  bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
> -                         uint64_t addr);
> +                         uint64_t addr, uint32_t rw_fault);

bool write_fault


>  
>  void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 24b781e90bef..994983901006 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -93,6 +93,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device 
> *adev,
>                                      struct amdgpu_iv_entry *entry)
>  {
>       bool retry_fault = !!(entry->src_data[1] & 0x80);
> +     bool rw_fault = !!(entry->src_data[1] & 0x20);

write_fault


>       struct amdgpu_vmhub *hub = &adev->vmhub[entry->vmid_src];
>       struct amdgpu_task_info task_info;
>       uint32_t status = 0;
> @@ -121,7 +122,7 @@ static int gmc_v10_0_process_interrupt(struct 
> amdgpu_device *adev,
>               /* Try to handle the recoverable page faults by filling page
>                * tables
>                */
> -             if (amdgpu_vm_handle_fault(adev, entry->pasid, addr))
> +             if (amdgpu_vm_handle_fault(adev, entry->pasid, addr, rw_fault))
>                       return 1;
>       }
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 097230b5e946..9a37fd0527a9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -506,6 +506,7 @@ static int gmc_v9_0_process_interrupt(struct 
> amdgpu_device *adev,
>                                     struct amdgpu_iv_entry *entry)
>  {
>       bool retry_fault = !!(entry->src_data[1] & 0x80);
> +     bool rw_fault = !!(entry->src_data[1] & 0x20);

write_fault


>       uint32_t status = 0, cid = 0, rw = 0;
>       struct amdgpu_task_info task_info;
>       struct amdgpu_vmhub *hub;
> @@ -536,7 +537,7 @@ static int gmc_v9_0_process_interrupt(struct 
> amdgpu_device *adev,
>               /* Try to handle the recoverable page faults by filling page
>                * tables
>                */
> -             if (amdgpu_vm_handle_fault(adev, entry->pasid, addr))
> +             if (amdgpu_vm_handle_fault(adev, entry->pasid, addr, rw_fault))
>                       return 1;
>       }
>  
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> index d4a43c94bcf9..cf1009bb532a 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
> @@ -2400,9 +2400,29 @@ svm_range_count_fault(struct amdgpu_device *adev, 
> struct kfd_process *p,
>               WRITE_ONCE(pdd->faults, pdd->faults + 1);
>  }
>  
> +static bool
> +svm_range_allow_access(struct mm_struct *mm, uint64_t addr, uint32_t 
> rw_fault)

I think the function name "svm_range_..." is a bit misleading because it
doesn't do anything for an address range. It only checks one VMA at one
virtual address. I'd suggest "svm_fault_allowed".


> +{
> +     unsigned long requested = VM_READ;
> +     struct vm_area_struct *vma;
> +
> +     if (rw_fault)
> +             requested |= VM_WRITE;
> +
> +     vma = find_vma(mm, addr << PAGE_SHIFT);
> +     if (!vma || (addr << PAGE_SHIFT) < vma->vm_start) {
> +             pr_debug("address 0x%llx VMA is removed\n", addr);
> +             return true;
> +     }
> +
> +     pr_debug("requested 0x%lx, vma permission flags 0x%lx\n", requested,
> +             vma->vm_flags);
> +     return (requested & ~vma->vm_flags) == 0;

I think this is the same as "(vma->vm_flags & requested) == requested".


> +}
> +
>  int
>  svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
> -                     uint64_t addr)
> +                     uint64_t addr, uint32_t rw_fault)

bool write_fault

Regards,
  Felix


>  {
>       struct mm_struct *mm = NULL;
>       struct svm_range_list *svms;
> @@ -2440,6 +2460,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, 
> unsigned int pasid,
>       }
>  
>       mmap_read_lock(mm);
> +
>  retry_write_locked:
>       mutex_lock(&svms->lock);
>       prange = svm_range_from_addr(svms, addr, NULL);
> @@ -2484,6 +2505,13 @@ svm_range_restore_pages(struct amdgpu_device *adev, 
> unsigned int pasid,
>               goto out_unlock_range;
>       }
>  
> +     if (!svm_range_allow_access(mm, addr, rw_fault)) {
> +             pr_debug("fault addr 0x%llx no %s permission\n", addr,
> +                     rw_fault ? "write" : "read");
> +             r = -EPERM;
> +             goto out_unlock_range;
> +     }
> +
>       best_loc = svm_range_best_restore_location(prange, adev, &gpuidx);
>       if (best_loc == -1) {
>               pr_debug("svms %p failed get best restore loc [0x%lx 0x%lx]\n",
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h 
> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> index e7fc5e8998aa..e77d90de08a6 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
> @@ -175,7 +175,7 @@ int svm_range_split_by_granularity(struct kfd_process *p, 
> struct mm_struct *mm,
>                              unsigned long addr, struct svm_range *parent,
>                              struct svm_range *prange);
>  int svm_range_restore_pages(struct amdgpu_device *adev,
> -                         unsigned int pasid, uint64_t addr);
> +                         unsigned int pasid, uint64_t addr, uint32_t rw);
>  int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence);
>  void svm_range_add_list_work(struct svm_range_list *svms,
>                            struct svm_range *prange, struct mm_struct *mm,
> @@ -210,7 +210,8 @@ static inline void svm_range_list_fini(struct kfd_process 
> *p)
>  }
>  
>  static inline int svm_range_restore_pages(struct amdgpu_device *adev,
> -                                       unsigned int pasid, uint64_t addr)
> +                                       unsigned int pasid, uint64_t addr,
> +                                       uint32_t rw_fault)
>  {
>       return -EFAULT;
>  }

Re: [PATCH 1/2] drm/amdkfd: check access permisson to restore retry fault

Reply via email to