Use the process_info->notifier lock for validating and mapping SVM ranges.
Take advantage of the locking done inside amdgpu_vm_ptes_update to fix
a lock dependency issue with page table allocations.

TODO: double check that prange->lock it not still needed somewhere inside
the notifier lock because it protects a few other things.

[   83.979486] ======================================================
[   83.986583] WARNING: possible circular locking dependency detected
[   83.993643] 5.19.0-kfd-fkuehlin #75 Not tainted
[   83.999044] ------------------------------------------------------
[   84.006088] kfdtest/3453 is trying to acquire lock:
[   84.011820] ffff9a998561e210 (&prange->lock){+.+.}-{3:3}, at: 
svm_range_cpu_invalidate_pagetables+0x90/0x5e0 [amdgpu]
[   84.023911]
               but task is already holding lock:
[   84.031608] ffffffffbcd929c0 
(mmu_notifier_invalidate_range_start){+.+.}-{0:0}, at: unmap_vmas+0x5/0x170
[   84.041992]
               which lock already depends on the new lock.

[   84.052785]
               the existing dependency chain (in reverse order) is:
[   84.061993]
               -> #3 (mmu_notifier_invalidate_range_start){+.+.}-{0:0}:
[   84.071548]        fs_reclaim_acquire+0x6d/0xd0
[   84.076941]        kmem_cache_alloc_trace+0x34/0x760
[   84.082766]        alloc_workqueue_attrs+0x1b/0x50
[   84.088411]        workqueue_init+0x88/0x318
[   84.093533]        kernel_init_freeable+0x134/0x28f
[   84.099258]        kernel_init+0x16/0x130
[   84.104107]        ret_from_fork+0x1f/0x30
[   84.109038]
               -> #2 (fs_reclaim){+.+.}-{0:0}:
[   84.116348]        fs_reclaim_acquire+0xa1/0xd0
[   84.121697]        kmem_cache_alloc+0x2c/0x760
[   84.126948]        drm_block_alloc.isra.0+0x27/0x50 [drm_buddy]
[   84.133679]        split_block+0x4d/0x140 [drm_buddy]
[   84.139539]        drm_buddy_alloc_blocks+0x385/0x580 [drm_buddy]
[   84.146435]        amdgpu_vram_mgr_new+0x213/0x4f0 [amdgpu]
[   84.153399]        ttm_resource_alloc+0x31/0x80 [ttm]
[   84.159366]        ttm_bo_mem_space+0x8f/0x230 [ttm]
[   84.165169]        ttm_bo_validate+0xc5/0x170 [ttm]
[   84.170872]        ttm_bo_init_reserved+0x1a6/0x230 [ttm]
[   84.177075]        amdgpu_bo_create+0x1a0/0x510 [amdgpu]
[   84.183600]        amdgpu_bo_create_reserved+0x188/0x1e0 [amdgpu]
[   84.190803]        amdgpu_bo_create_kernel_at+0x64/0x200 [amdgpu]
[   84.197994]        amdgpu_ttm_init+0x420/0x4c0 [amdgpu]
[   84.204301]        gmc_v10_0_sw_init+0x33a/0x530 [amdgpu]
[   84.210813]        amdgpu_device_init.cold+0x10d4/0x17a1 [amdgpu]
[   84.218077]        amdgpu_driver_load_kms+0x15/0x110 [amdgpu]
[   84.224919]        amdgpu_pci_probe+0x142/0x350 [amdgpu]
[   84.231313]        local_pci_probe+0x40/0x80
[   84.236437]        work_for_cpu_fn+0x10/0x20
[   84.241500]        process_one_work+0x270/0x5a0
[   84.246805]        worker_thread+0x203/0x3c0
[   84.251828]        kthread+0xea/0x110
[   84.256229]        ret_from_fork+0x1f/0x30
[   84.261061]
               -> #1 (&mgr->lock){+.+.}-{3:3}:
[   84.268156]        __mutex_lock+0x9a/0xf30
[   84.272967]        amdgpu_vram_mgr_new+0x14a/0x4f0 [amdgpu]
[   84.279752]        ttm_resource_alloc+0x31/0x80 [ttm]
[   84.285602]        ttm_bo_mem_space+0x8f/0x230 [ttm]
[   84.291321]        ttm_bo_validate+0xc5/0x170 [ttm]
[   84.296939]        ttm_bo_init_reserved+0xe2/0x230 [ttm]
[   84.302969]        amdgpu_bo_create+0x1a0/0x510 [amdgpu]
[   84.309297]        amdgpu_bo_create_vm+0x2e/0x80 [amdgpu]
[   84.315656]        amdgpu_vm_pt_create+0xf5/0x270 [amdgpu]
[   84.322090]        amdgpu_vm_ptes_update+0x6c4/0x8f0 [amdgpu]
[   84.328793]        amdgpu_vm_update_range+0x29b/0x730 [amdgpu]
[   84.335537]        svm_range_validate_and_map+0xc78/0x1390 [amdgpu]
[   84.342734]        svm_range_set_attr+0xc74/0x1170 [amdgpu]
[   84.349222]        kfd_ioctl+0x189/0x5c0 [amdgpu]
[   84.354808]        __x64_sys_ioctl+0x80/0xb0
[   84.359738]        do_syscall_64+0x35/0x80
[   84.364481]        entry_SYSCALL_64_after_hwframe+0x63/0xcd
[   84.370687]
               -> #0 (&prange->lock){+.+.}-{3:3}:
[   84.377864]        __lock_acquire+0x12ed/0x27e0
[   84.383027]        lock_acquire+0xca/0x2e0
[   84.387759]        __mutex_lock+0x9a/0xf30
[   84.392491]        svm_range_cpu_invalidate_pagetables+0x90/0x5e0 [amdgpu]
[   84.400345]        __mmu_notifier_invalidate_range_start+0x1d3/0x230
[   84.407410]        unmap_vmas+0x162/0x170
[   84.412126]        unmap_region+0xa8/0x110
[   84.416905]        __do_munmap+0x209/0x4f0
[   84.421680]        __vm_munmap+0x78/0x120
[   84.426365]        __x64_sys_munmap+0x17/0x20
[   84.431392]        do_syscall_64+0x35/0x80
[   84.436164]        entry_SYSCALL_64_after_hwframe+0x63/0xcd
[   84.442405]
               other info that might help us debug this:

[   84.452431] Chain exists of:
                 &prange->lock --> fs_reclaim --> 
mmu_notifier_invalidate_range_start

[   84.466395]  Possible unsafe locking scenario:

[   84.473720]        CPU0                    CPU1
[   84.479020]        ----                    ----
[   84.484296]   lock(mmu_notifier_invalidate_range_start);
[   84.490333]                                lock(fs_reclaim);
[   84.496705]                                
lock(mmu_notifier_invalidate_range_start);
[   84.505246]   lock(&prange->lock);
[   84.509361]
                *** DEADLOCK ***

[   84.517360] 2 locks held by kfdtest/3453:
[   84.522060]  #0: ffff9a99821ec4a8 (&mm->mmap_lock#2){++++}-{3:3}, at: 
__do_munmap+0x417/0x4f0
[   84.531287]  #1: ffffffffbcd929c0 
(mmu_notifier_invalidate_range_start){+.+.}-{0:0}, at: unmap_vmas+0x5/0x170
[   84.541896]
               stack backtrace:
[   84.547630] CPU: 3 PID: 3453 Comm: kfdtest Not tainted 5.19.0-kfd-fkuehlin 
#75
[   84.555537] Hardware name: To Be Filled By O.E.M. To Be Filled By 
O.E.M./EPYCD8-2T, BIOS P2.60 04/10/2020
[   84.565788] Call Trace:
[   84.568925]  <TASK>
[   84.571702]  dump_stack_lvl+0x45/0x59
[   84.576034]  check_noncircular+0xfe/0x110
[   84.580715]  ? kernel_text_address+0x6d/0xe0
[   84.585652]  __lock_acquire+0x12ed/0x27e0
[   84.590340]  lock_acquire+0xca/0x2e0
[   84.594595]  ? svm_range_cpu_invalidate_pagetables+0x90/0x5e0 [amdgpu]
[   84.602338]  __mutex_lock+0x9a/0xf30
[   84.606714]  ? svm_range_cpu_invalidate_pagetables+0x90/0x5e0 [amdgpu]
[   84.614262]  ? svm_range_cpu_invalidate_pagetables+0x90/0x5e0 [amdgpu]
[   84.621806]  ? svm_range_cpu_invalidate_pagetables+0x90/0x5e0 [amdgpu]
[   84.629353]  svm_range_cpu_invalidate_pagetables+0x90/0x5e0 [amdgpu]
[   84.636742]  ? lock_release+0x139/0x2b0
[   84.641374]  __mmu_notifier_invalidate_range_start+0x1d3/0x230
[   84.647976]  unmap_vmas+0x162/0x170
[   84.652203]  unmap_region+0xa8/0x110
[   84.656503]  __do_munmap+0x209/0x4f0
[   84.660792]  __vm_munmap+0x78/0x120
[   84.664977]  __x64_sys_munmap+0x17/0x20
[   84.669499]  do_syscall_64+0x35/0x80
[   84.673755]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
[   84.679485] RIP: 0033:0x7f32872eb97b
[   84.683738] Code: 8b 15 19 35 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb 
89 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa b8 0b 00 00 00 0f 05 <48> 3d 01 
f0 ff ff 73 01 c3 48 8b 0d e5 34 0d 00 f7 d8 64 89 01 48
[   84.703915] RSP: 002b:00007fffb06c4508 EFLAGS: 00000246 ORIG_RAX: 
000000000000000b
[   84.712205] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f32872eb97b
[   84.720072] RDX: 0000000004000000 RSI: 0000000004000000 RDI: 00007f32831ae000
[   84.727944] RBP: 00007fffb06c4750 R08: 00007fffb06c4548 R09: 000055e7570ad230
[   84.735809] R10: 000055e757088010 R11: 0000000000000246 R12: 000055e75453cefa
[   84.743688] R13: 0000000000000000 R14: 0000000000000021 R15: 0000000000000000
[   84.751584]  </TASK>

Signed-off-by: Felix Kuehling <[email protected]>
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 13 ++---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c     | 60 ++++++++++++------------
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h     | 16 +------
 3 files changed, 37 insertions(+), 52 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 884f7d4ee695..b8430c27d775 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -882,6 +882,7 @@ svm_migrate_to_vram(struct svm_range *prange, uint32_t 
best_loc,
  */
 static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf)
 {
+       struct amdkfd_process_info *process_info;
        unsigned long addr = vmf->address;
        struct svm_range_bo *svm_bo;
        enum svm_work_list_ops op;
@@ -889,6 +890,7 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf)
        struct svm_range *prange;
        struct kfd_process *p;
        struct mm_struct *mm;
+       unsigned int flags;
        int r = 0;
 
        svm_bo = vmf->page->zone_device_data;
@@ -916,6 +918,7 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf)
                r = 0;
                goto out_unref_process;
        }
+       process_info = p->kgd_process_info;
 
        pr_debug("CPU page fault svms 0x%p address 0x%lx\n", &p->svms, addr);
        addr >>= PAGE_SHIFT;
@@ -936,13 +939,11 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf)
        if (!prange->actual_loc)
                goto out_unlock_prange;
 
-       svm_range_lock(parent);
-       if (prange != parent)
-               mutex_lock_nested(&prange->lock, 1);
+       mutex_lock(&process_info->notifier_lock);
+       flags = memalloc_noreclaim_save();
        r = svm_range_split_by_granularity(p, mm, addr, parent, prange);
-       if (prange != parent)
-               mutex_unlock(&prange->lock);
-       svm_range_unlock(parent);
+       memalloc_noreclaim_restore(flags);
+       mutex_unlock(&process_info->notifier_lock);
        if (r) {
                pr_debug("failed %d to split range by granularity\n", r);
                goto out_unlock_prange;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index cc46878901c1..7020861438fa 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1278,6 +1278,7 @@ svm_range_unmap_from_gpus(struct svm_range *prange, 
unsigned long start,
 
 static int
 svm_range_map_to_gpu(struct kfd_process_device *pdd, struct svm_range *prange,
+                    struct hmm_range *hmm_range,
                     unsigned long offset, unsigned long npages, bool readonly,
                     dma_addr_t *dma_addr, struct amdgpu_device *bo_adev,
                     struct dma_fence **fence, bool flush_tlb)
@@ -1323,7 +1324,8 @@ svm_range_map_to_gpu(struct kfd_process_device *pdd, 
struct svm_range *prange,
                                           pte_flags,
                                           (last_start - prange->start) << 
PAGE_SHIFT,
                                           bo_adev ? 
bo_adev->vm_manager.vram_base_offset : 0,
-                                          NULL, dma_addr, NULL, 
&vm->last_update);
+                                          NULL, dma_addr, hmm_range,
+                                          &vm->last_update);
 
                for (j = last_start - prange->start; j <= i; j++)
                        dma_addr[j] |= last_domain;
@@ -1350,9 +1352,10 @@ svm_range_map_to_gpu(struct kfd_process_device *pdd, 
struct svm_range *prange,
 }
 
 static int
-svm_range_map_to_gpus(struct svm_range *prange, unsigned long offset,
-                     unsigned long npages, bool readonly,
-                     unsigned long *bitmap, bool wait, bool flush_tlb)
+svm_range_map_to_gpus(struct svm_range *prange, struct hmm_range *hmm_range,
+                     unsigned long offset, unsigned long npages,
+                     bool readonly, unsigned long *bitmap, bool wait,
+                     bool flush_tlb)
 {
        struct kfd_process_device *pdd;
        struct amdgpu_device *bo_adev;
@@ -1385,7 +1388,8 @@ svm_range_map_to_gpus(struct svm_range *prange, unsigned 
long offset,
                        continue;
                }
 
-               r = svm_range_map_to_gpu(pdd, prange, offset, npages, readonly,
+               r = svm_range_map_to_gpu(pdd, prange, hmm_range, offset,
+                                        npages, readonly,
                                         prange->dma_addr[gpuidx],
                                         bo_adev, wait ? &fence : NULL,
                                         flush_tlb);
@@ -1613,23 +1617,15 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
                        goto unreserve_out;
                }
 
-               svm_range_lock(prange);
-               if (amdgpu_hmm_range_get_pages_done(hmm_range)) {
-                       pr_debug("hmm update the range, need validate again\n");
-                       r = -EAGAIN;
-                       goto unlock_out;
-               }
-               if (!list_empty(&prange->child_list)) {
-                       pr_debug("range split by unmap in parallel, validate 
again\n");
-                       r = -EAGAIN;
-                       goto unlock_out;
-               }
+               r = svm_range_map_to_gpus(prange, hmm_range, offset, npages,
+                                         readonly, ctx.bitmap, wait,
+                                         flush_tlb);
 
-               r = svm_range_map_to_gpus(prange, offset, npages, readonly,
-                                         ctx.bitmap, wait, flush_tlb);
-
-unlock_out:
-               svm_range_unlock(prange);
+               /* Ignoring return value because this just frees the hmm_range.
+                * Actual checking is done in amdgpu_vm_ptes_update under the
+                * notifier lock.
+                */
+               amdgpu_hmm_range_get_pages_done(hmm_range);
 
                addr = next;
        }
@@ -1806,13 +1802,11 @@ svm_range_evict(struct svm_range *prange, struct 
mm_struct *mm,
                        if (!pchild->mapped_to_gpu)
                                continue;
                        mapped = true;
-                       mutex_lock_nested(&pchild->lock, 1);
                        if (pchild->start <= last && pchild->last >= start) {
                                pr_debug("increment pchild invalid [0x%lx 
0x%lx]\n",
                                         pchild->start, pchild->last);
                                atomic_inc(&pchild->invalid);
                        }
-                       mutex_unlock(&pchild->lock);
                }
 
                if (!mapped)
@@ -1848,12 +1842,10 @@ svm_range_evict(struct svm_range *prange, struct 
mm_struct *mm,
                pr_debug("invalidate unmap svms 0x%p [0x%lx 0x%lx] from GPUs\n",
                         prange->svms, start, last);
                list_for_each_entry(pchild, &prange->child_list, child_list) {
-                       mutex_lock_nested(&pchild->lock, 1);
                        s = max(start, pchild->start);
                        l = min(last, pchild->last);
                        if (l >= s)
                                svm_range_unmap_from_gpus(pchild, s, l, 
trigger);
-                       mutex_unlock(&pchild->lock);
                }
                s = max(start, prange->start);
                l = min(last, prange->last);
@@ -2335,13 +2327,11 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct 
svm_range *prange,
        unmap_parent = start <= prange->start && last >= prange->last;
 
        list_for_each_entry(pchild, &prange->child_list, child_list) {
-               mutex_lock_nested(&pchild->lock, 1);
                s = max(start, pchild->start);
                l = min(last, pchild->last);
                if (l >= s)
                        svm_range_unmap_from_gpus(pchild, s, l, trigger);
                svm_range_unmap_split(mm, prange, pchild, start, last);
-               mutex_unlock(&pchild->lock);
        }
        s = max(start, prange->start);
        l = min(last, prange->last);
@@ -2384,9 +2374,12 @@ svm_range_cpu_invalidate_pagetables(struct 
mmu_interval_notifier *mni,
                                    const struct mmu_notifier_range *range,
                                    unsigned long cur_seq)
 {
+       struct amdkfd_process_info *process_info;
        struct svm_range *prange;
+       struct kfd_process *p;
        unsigned long start;
        unsigned long last;
+       unsigned int flags;
 
        if (range->event == MMU_NOTIFY_RELEASE)
                return true;
@@ -2404,8 +2397,11 @@ svm_range_cpu_invalidate_pagetables(struct 
mmu_interval_notifier *mni,
                 mni->interval_tree.last >> PAGE_SHIFT, range->event);
 
        prange = container_of(mni, struct svm_range, notifier);
+       p = container_of(prange->svms, struct kfd_process, svms);
+       process_info = p->kgd_process_info;
 
-       svm_range_lock(prange);
+       mutex_lock(&process_info->notifier_lock);
+       flags = memalloc_noreclaim_save();
        mmu_interval_set_seq(mni, cur_seq);
 
        switch (range->event) {
@@ -2417,7 +2413,8 @@ svm_range_cpu_invalidate_pagetables(struct 
mmu_interval_notifier *mni,
                break;
        }
 
-       svm_range_unlock(prange);
+       memalloc_noreclaim_restore(flags);
+       mutex_unlock(&process_info->notifier_lock);
        mmput(mni->mm);
 
        return true;
@@ -2959,6 +2956,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, 
unsigned int pasid,
 int
 svm_range_switch_xnack_reserve_mem(struct kfd_process *p, bool xnack_enabled)
 {
+       struct amdkfd_process_info *process_info = p->kgd_process_info;
        struct svm_range *prange, *pchild;
        uint64_t reserved_size = 0;
        uint64_t size;
@@ -2967,9 +2965,9 @@ svm_range_switch_xnack_reserve_mem(struct kfd_process *p, 
bool xnack_enabled)
        pr_debug("switching xnack from %d to %d\n", p->xnack_enabled, 
xnack_enabled);
 
        mutex_lock(&p->svms.lock);
+       mutex_lock(&process_info->notifier_lock);
 
        list_for_each_entry(prange, &p->svms.list, list) {
-               svm_range_lock(prange);
                list_for_each_entry(pchild, &prange->child_list, child_list) {
                        size = (pchild->last - pchild->start + 1) << PAGE_SHIFT;
                        if (xnack_enabled) {
@@ -2996,10 +2994,10 @@ svm_range_switch_xnack_reserve_mem(struct kfd_process 
*p, bool xnack_enabled)
                        reserved_size += size;
                }
 out_unlock:
-               svm_range_unlock(prange);
                if (r)
                        break;
        }
+       mutex_unlock(&process_info->notifier_lock);
 
        if (r)
                amdgpu_amdkfd_unreserve_mem_limit(NULL, reserved_size,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index 7a33b93f9df6..bb455dc7f549 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -82,8 +82,7 @@ struct svm_work_list_item {
  * @offset:     range start offset within mm_nodes
  * @svm_bo:     struct to manage splited amdgpu_bo
  * @svm_bo_list:link list node, to scan all ranges which share same svm_bo
- * @lock:       protect prange start, last, child_list, svm_bo_list
- * @saved_flags:save/restore current PF_MEMALLOC flags
+ * @lock:       protect prange start, last, svm_bo_list
  * @flags:      flags defined as KFD_IOCTL_SVM_FLAG_*
  * @perferred_loc: perferred location, 0 for CPU, or GPU id
  * @perfetch_loc: last prefetch location, 0 for CPU, or GPU id
@@ -117,7 +116,6 @@ struct svm_range {
        struct svm_range_bo             *svm_bo;
        struct list_head                svm_bo_list;
        struct mutex                    lock;
-       unsigned int                    saved_flags;
        uint32_t                        flags;
        uint32_t                        preferred_loc;
        uint32_t                        prefetch_loc;
@@ -135,18 +133,6 @@ struct svm_range {
        bool                            mapped_to_gpu;
 };
 
-static inline void svm_range_lock(struct svm_range *prange)
-{
-       mutex_lock(&prange->lock);
-       prange->saved_flags = memalloc_noreclaim_save();
-
-}
-static inline void svm_range_unlock(struct svm_range *prange)
-{
-       memalloc_noreclaim_restore(prange->saved_flags);
-       mutex_unlock(&prange->lock);
-}
-
 static inline struct svm_range_bo *svm_range_bo_ref(struct svm_range_bo 
*svm_bo)
 {
        if (svm_bo)
-- 
2.32.0

Reply via email to