On Wed, Feb 11, 2026 at 6:07 AM Pierre-Eric Pelloux-Prayer
<[email protected]> wrote:
>
> Now that we have a worker thread, we can try to access the
> IBs of the job. The process is:
> * get the VM from the PASID
> * get the BO from its VA and the VM
> * map the BO for CPU access
> * copy everything, then add it to the dump
> Each step can fail so we have to be cautious.
> These operations can be slow so when amdgpu_devcoredump_format
> is called only to determine the size of the buffer we skip all
> of them and assume they will succeed.
>
> Signed-off-by: Pierre-Eric Pelloux-Prayer <[email protected]>
> ---
>  .../gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c  | 84 ++++++++++++++++++-
>  1 file changed, 83 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
> index d0af8a294abf..d576518c212d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c
> @@ -200,12 +200,20 @@ static void amdgpu_devcoredump_fw_info(struct 
> amdgpu_device *adev,
>  static ssize_t
>  amdgpu_devcoredump_format(char *buffer, size_t count, struct 
> amdgpu_coredump_info *coredump)
>  {
> +       struct amdgpu_device *adev = coredump->adev;
>         struct drm_printer p;
>         struct drm_print_iterator iter;
>         struct amdgpu_vm_fault_info *fault_info;
> +       struct amdgpu_bo_va_mapping *mapping;
>         struct amdgpu_ip_block *ip_block;
> +       struct amdgpu_res_cursor cursor;
> +       struct amdgpu_bo *abo, *root;
> +       uint64_t va_start, offset;
>         struct amdgpu_ring *ring;
> -       int ver, i, j;
> +       struct amdgpu_vm *vm;
> +       u32 *ib_content;
> +       uint8_t *kptr;
> +       int ver, i, j, r;
>         u32 ring_idx, off;
>
>         iter.data = buffer;
> @@ -323,6 +331,80 @@ amdgpu_devcoredump_format(char *buffer, size_t count, 
> struct amdgpu_coredump_inf
>         else if (coredump->reset_vram_lost)
>                 drm_printf(&p, "VRAM is lost due to GPU reset!\n");
>
> +       if (coredump->num_ibs) {
> +               if (buffer)
> +                       vm = amdgpu_vm_lock_by_pasid(adev, &root, 
> coredump->pasid);
> +               else
> +                       vm = NULL;

Is there any point in doing the loop if the vm is NULL?

> +
> +               for (int i = 0; i < coredump->num_ibs && (!buffer || vm); 
> i++) {
> +                       ib_content = 
> kvmalloc_array(coredump->ibs[i].ib_size_dw, 4,
> +                                                   GFP_KERNEL);

Shouldn't this be GFP_NOWAIT?

Alex

> +                       if (!ib_content)
> +                               continue;
> +
> +                       if (!vm)
> +                               goto output_ib_content;
> +
> +                       va_start = coredump->ibs[i].gpu_addr & 
> AMDGPU_GMC_HOLE_MASK;
> +                       mapping = amdgpu_vm_bo_lookup_mapping(vm, va_start / 
> AMDGPU_GPU_PAGE_SIZE);
> +                       if (!mapping)
> +                               goto free_ib_content;
> +
> +                       offset = va_start - (mapping->start * 
> AMDGPU_GPU_PAGE_SIZE);
> +                       abo = amdgpu_bo_ref(mapping->bo_va->base.bo);
> +                       r = amdgpu_bo_reserve(abo, false);
> +                       if (r)
> +                               goto free_ib_content;
> +
> +                       if (abo->flags & AMDGPU_GEM_CREATE_NO_CPU_ACCESS) {
> +                               off = 0;
> +
> +                               if (abo->tbo.resource->mem_type != 
> TTM_PL_VRAM)
> +                                       goto unreserve_abo;
> +
> +                               amdgpu_res_first(abo->tbo.resource, offset,
> +                                                coredump->ibs[i].ib_size_dw 
> * 4,
> +                                                &cursor);
> +                               while (cursor.remaining) {
> +                                       amdgpu_device_mm_access(adev, 
> cursor.start / 4,
> +                                                               
> &ib_content[off], cursor.size / 4,
> +                                                               false);
> +                                       off += cursor.size;
> +                                       amdgpu_res_next(&cursor, cursor.size);
> +                               }
> +                       } else {
> +                               r = ttm_bo_kmap(&abo->tbo, 0,
> +                                               PFN_UP(abo->tbo.base.size),
> +                                               &abo->kmap);
> +                               if (r)
> +                                       goto unreserve_abo;
> +
> +                               kptr = amdgpu_bo_kptr(abo);
> +                               kptr += offset;
> +                               memcpy(ib_content, kptr,
> +                                      coredump->ibs[i].ib_size_dw * 4);
> +
> +                               amdgpu_bo_kunmap(abo);
> +                       }
> +
> +output_ib_content:
> +                       drm_printf(&p, "\nIB #%d 0x%llx %d dw\n",
> +                                  i, coredump->ibs[i].gpu_addr, 
> coredump->ibs[i].ib_size_dw);
> +                       for (int j = 0; j < coredump->ibs[i].ib_size_dw; j++)
> +                               drm_printf(&p, "0x%08x\n", ib_content[j]);
> +unreserve_abo:
> +                       if (vm)
> +                               amdgpu_bo_unreserve(abo);
> +free_ib_content:
> +                       kfree(ib_content);
> +               }
> +               if (vm) {
> +                       amdgpu_bo_unreserve(root);
> +                       amdgpu_bo_unref(&root);
> +               }
> +       }
> +
>         return count - iter.remain;
>  }
>
> --
> 2.43.0
>

Reply via email to