This reverts commit c45c3bc930bf60e7658f87c519a40f77513b96aa.

The reverted commit causes a KFDSVMEvict test regression on vega10, with the
following kernel WARNING and BUG backtrace:

[  135.365083] amdgpu: Migration failed during eviction
[  135.365090] ------------[ cut here ]------------
[  135.365097] This was not the last reference
[  135.365122] WARNING: CPU: 5 PID: 1998 at drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_svm.c:3515 svm_range_evict_svm_bo_worker+0x21c/0x390 [amdgpu]
[  135.365836]  svm_range_evict_svm_bo_worker+0x21c/0x390 [amdgpu]
[  135.366249]  process_one_work+0x298/0x590
[  135.366256]  worker_thread+0x3d/0x3d0
......
[  135.721257] kernel BUG at include/linux/swapops.h:472!
[  135.721537] Call Trace:
[  135.721540]  <TASK>
[  135.721592]  hmm_vma_walk_pmd+0x5c8/0x780
[  135.721598]  walk_pgd_range+0x3bc/0x7c0
[  135.721604]  __walk_page_range+0x1ec/0x200
[  135.721609]  walk_page_range+0x119/0x1a0
[  135.721613]  hmm_range_fault+0x5d/0xb0
[  135.721617]  amdgpu_hmm_range_get_pages+0x159/0x240 [amdgpu]
[  135.721820]  svm_range_validate_and_map+0x57f/0x16c0 [amdgpu]
[  135.722411]  svm_range_restore_pages+0xcd8/0x1150 [amdgpu]
[  135.722613]  amdgpu_vm_handle_fault+0xc2/0x360 [amdgpu]
[  135.722777]  gmc_v9_0_process_interrupt+0x255/0x670 [amdgpu]

Signed-off-by: Philip Yang <[email protected]>
---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 35 +++++++++-------------------
 1 file changed, 11 insertions(+), 24 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index f2b33fb2afcf..4d000c63cde8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -1565,7 +1565,6 @@ static void *kfd_svm_page_owner(struct kfd_process *p, 
int32_t gpuidx)
  * 5. Release page table (and SVM BO) reservation
  */
 static int svm_range_validate_and_map(struct mm_struct *mm,
-                                     unsigned long map_start, unsigned long 
map_last,
                                      struct svm_range *prange, int32_t gpuidx,
                                      bool intr, bool wait, bool flush_tlb)
 {
@@ -1646,8 +1645,6 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
        end = (prange->last + 1) << PAGE_SHIFT;
        for (addr = start; !r && addr < end; ) {
                struct hmm_range *hmm_range;
-               unsigned long map_start_vma;
-               unsigned long map_last_vma;
                struct vm_area_struct *vma;
                uint64_t vram_pages_vma;
                unsigned long next = 0;
@@ -1696,16 +1693,9 @@ static int svm_range_validate_and_map(struct mm_struct 
*mm,
                        r = -EAGAIN;
                }
 
-               if (!r) {
-                       map_start_vma = max(map_start, prange->start + offset);
-                       map_last_vma = min(map_last, prange->start + offset + 
npages - 1);
-                       if (map_start_vma <= map_last_vma) {
-                               offset = map_start_vma - prange->start;
-                               npages = map_last_vma - map_start_vma + 1;
-                               r = svm_range_map_to_gpus(prange, offset, 
npages, readonly,
-                                                         ctx->bitmap, wait, 
flush_tlb);
-                       }
-               }
+               if (!r)
+                       r = svm_range_map_to_gpus(prange, offset, npages, 
readonly,
+                                                 ctx->bitmap, wait, flush_tlb);
 
                if (!r && next == end)
                        prange->mapped_to_gpu = true;
@@ -1811,8 +1801,8 @@ static void svm_range_restore_work(struct work_struct 
*work)
                 */
                mutex_lock(&prange->migrate_mutex);
 
-               r = svm_range_validate_and_map(mm, prange->start, prange->last, 
prange,
-                                              MAX_GPU_INSTANCE, false, true, 
false);
+               r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
+                                              false, true, false);
                if (r)
                        pr_debug("failed %d to map 0x%lx to gpus\n", r,
                                 prange->start);
@@ -3026,8 +3016,6 @@ svm_range_restore_pages(struct amdgpu_device *adev, 
unsigned int pasid,
        kfd_smi_event_page_fault_start(node, p->lead_thread->pid, addr,
                                       write_fault, timestamp);
 
-       start = prange->start;
-       last = prange->last;
        if (prange->actual_loc != 0 || best_loc != 0) {
                migration = true;
                /* Align migration range start and size to granularity size */
@@ -3061,11 +3049,10 @@ svm_range_restore_pages(struct amdgpu_device *adev, 
unsigned int pasid,
                }
        }
 
-       r = svm_range_validate_and_map(mm, start, last, prange, gpuidx, false,
-                                      false, false);
+       r = svm_range_validate_and_map(mm, prange, gpuidx, false, false, false);
        if (r)
                pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n",
-                        r, svms, start, last);
+                        r, svms, prange->start, prange->last);
 
        kfd_smi_event_page_fault_end(node, p->lead_thread->pid, addr,
                                     migration);
@@ -3611,8 +3598,8 @@ svm_range_set_attr(struct kfd_process *p, struct 
mm_struct *mm,
 
                flush_tlb = !migrated && update_mapping && 
prange->mapped_to_gpu;
 
-               r = svm_range_validate_and_map(mm, prange->start, prange->last, 
prange,
-                                              MAX_GPU_INSTANCE, true, true, 
flush_tlb);
+               r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
+                                              true, true, flush_tlb);
                if (r)
                        pr_debug("failed %d to map svm range\n", r);
 
@@ -3626,8 +3613,8 @@ svm_range_set_attr(struct kfd_process *p, struct 
mm_struct *mm,
                pr_debug("Remapping prange 0x%p [0x%lx 0x%lx]\n",
                         prange, prange->start, prange->last);
                mutex_lock(&prange->migrate_mutex);
-               r = svm_range_validate_and_map(mm,  prange->start, 
prange->last, prange,
-                                              MAX_GPU_INSTANCE, true, true, 
prange->mapped_to_gpu);
+               r = svm_range_validate_and_map(mm, prange, MAX_GPU_INSTANCE,
+                                              true, true, 
prange->mapped_to_gpu);
                if (r)
                        pr_debug("failed %d on remap svm range\n", r);
                mutex_unlock(&prange->migrate_mutex);
-- 
2.35.1

Reply via email to