[Public] > -----Original Message----- > From: Francis, David <[email protected]> > Sent: Thursday, March 5, 2026 9:56 AM > To: [email protected] > Cc: Koenig, Christian <[email protected]>; Deucher, Alexander > <[email protected]>; Freehill, Chris <[email protected]>; > Francis, David <[email protected]> > Subject: [PATCH] drm/amdgpu: Add profiling counters in fdinfo > > Add five counters to the fdinfo for amdgpu device files. > > They are: > amd-vmfault-counter: %llu > amd-queue-eviction-counter: %llu > amd-svm-migrate-counter: %llu > amd-svm-page-fault-counter: %llu > amd-svm-unmap-counter: %llu > > These counters begin at 0 when a device file is opened. > They are for use by profiling applications. > > Signed-off-by: David Francis <[email protected]>
These looks reasonable to me. Acked-by: Alex Deucher <[email protected]> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c | 15 ++++++++++- > drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 5 ++++ > drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 3 +++ > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 31 > ++++++++++++++++++++-- > drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 26 ++++++++++++++++-- > drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 6 +++++ > drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 11 ++++++-- > 7 files changed, 90 insertions(+), 7 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c > index b349bb3676d5..96d6063ecaa8 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c > @@ -61,6 +61,7 @@ void amdgpu_show_fdinfo(struct drm_printer *p, struct > drm_file *file) > struct amdgpu_vm *vm = &fpriv->vm; > > struct amdgpu_mem_stats stats[__AMDGPU_PL_NUM]; > + struct amdgpu_process_stats process_stats; > ktime_t usage[AMDGPU_HW_IP_NUM]; > const char *pl_name[] = { > [TTM_PL_VRAM] = "vram", > @@ -74,7 +75,7 @@ void amdgpu_show_fdinfo(struct drm_printer *p, struct > drm_file *file) > }; > unsigned int hw_ip, i; > > - amdgpu_vm_get_memory(vm, stats); > + amdgpu_vm_get_memory(vm, stats, &process_stats); > amdgpu_ctx_mgr_usage(&fpriv->ctx_mgr, usage); > > /* > @@ -114,6 +115,18 @@ void amdgpu_show_fdinfo(struct drm_printer *p, > struct drm_file *file) > (stats[TTM_PL_TT].drm.shared + > stats[TTM_PL_TT].drm.private) / 1024UL); > > + /* Amdgpu specific counters: */ > + drm_printf(p, "amd-vmfault-counter:\t%llu\n", > + process_stats.vmfault_counter); > + drm_printf(p, "amd-queue-eviction-counter:\t%llu\n", > + process_stats.queue_eviction_counter); > + drm_printf(p, "amd-svm-migrate-counter:\t%llu\n", > + process_stats.svm_migrate_counter); > + drm_printf(p, "amd-svm-page-fault-counter:\t%llu\n", > + process_stats.svm_page_fault_counter); > + drm_printf(p, 
"amd-svm-unmap-counter:\t%llu\n", > + process_stats.svm_unmap_counter); > + > for (hw_ip = 0; hw_ip < AMDGPU_HW_IP_NUM; ++hw_ip) { > if (!usage[hw_ip]) > continue; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c > index 82bc6d657e5a..ad1042639dbe 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c > @@ -476,6 +476,7 @@ void amdgpu_irq_dispatch(struct amdgpu_device > *adev, > struct amdgpu_iv_entry entry; > unsigned int client_id, src_id; > struct amdgpu_irq_src *src; > + struct amdgpu_vm *vm; > bool handled = false; > int r; > > @@ -513,6 +514,10 @@ void amdgpu_irq_dispatch(struct amdgpu_device > *adev, > client_id, src_id); > > } else if ((src = adev->irq.client[client_id].sources[src_id])) { > + vm = amdgpu_vm_get_vm_from_pasid(adev, entry.pasid); > + if (vm) > + amdgpu_vm_increment_process_counter(vm, > AMDGPU_VM_VMFAULT_COUNTER); > + > r = src->funcs->process(adev, src, &entry); > if (r < 0) > dev_err(adev->dev, "error processing interrupt > (%d)\n", diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > index 001fcfcbde0f..9ba6f166cb5c 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c > @@ -1300,6 +1300,9 @@ amdgpu_userq_evict_all(struct > amdgpu_userq_mgr *uq_mgr) > queue = amdgpu_userq_get(uq_mgr, queue_id); > if (!queue) > continue; > + > + amdgpu_vm_increment_process_counter(queue- > >fw_obj.obj->vm_bo->vm, > +AMDGPU_VM_QUEUE_EVICTION_COUNTER); > + > r = amdgpu_userq_preempt_helper(queue); > if (r) > ret = r; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > index 01fef0e4f408..d7d82f23377f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c > @@ -1241,10 +1241,12 @@ int amdgpu_vm_update_range(struct > amdgpu_device *adev, struct amdgpu_vm *vm, } > > void 
amdgpu_vm_get_memory(struct amdgpu_vm *vm, > - struct amdgpu_mem_stats > stats[__AMDGPU_PL_NUM]) > + struct amdgpu_mem_stats > stats[__AMDGPU_PL_NUM], > + struct amdgpu_process_stats *process_stats) > { > spin_lock(&vm->status_lock); > memcpy(stats, vm->stats, sizeof(*stats) * __AMDGPU_PL_NUM); > + memcpy(process_stats, &vm->process_stats, sizeof(*process_stats)); > spin_unlock(&vm->status_lock); > } > > @@ -2472,7 +2474,7 @@ static void amdgpu_vm_destroy_task_info(struct > kref *kref) > kfree(ti); > } > > -static inline struct amdgpu_vm * > +inline struct amdgpu_vm * > amdgpu_vm_get_vm_from_pasid(struct amdgpu_device *adev, u32 pasid) { > struct amdgpu_vm *vm; > @@ -3234,3 +3236,28 @@ void amdgpu_sdma_set_vm_pte_scheds(struct > amdgpu_device *adev, > adev->vm_manager.vm_pte_num_scheds = adev- > >sdma.num_instances; > adev->vm_manager.vm_pte_funcs = vm_pte_funcs; } > + > +void amdgpu_vm_increment_process_counter(struct amdgpu_vm *vm, > enum > +amdgpu_process_stat_type stat_type) { > + spin_lock(&vm->status_lock); > + switch (stat_type) { > + case AMDGPU_VM_VMFAULT_COUNTER: > + vm->process_stats.vmfault_counter++; > + break; > + case AMDGPU_VM_QUEUE_EVICTION_COUNTER: > + vm->process_stats.queue_eviction_counter++; > + break; > + case AMDGPU_VM_SVM_MIGRATE_COUNTER: > + vm->process_stats.svm_migrate_counter++; > + break; > + case AMDGPU_VM_SVM_PAGE_FAULT_COUNTER: > + vm->process_stats.svm_page_fault_counter++; > + break; > + case AMDGPU_VM_SVM_UNMAP_COUNTER: > + vm->process_stats.svm_unmap_counter++; > + break; > + default: > + pr_debug("unknown process stat type 0x%x\n", stat_type); > + } > + spin_unlock(&vm->status_lock); > +} > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > index dc4b0ec672ec..4a63f0384c7d 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h > @@ -334,6 +334,14 @@ struct amdgpu_mem_stats { > uint64_t evicted; > }; > > +struct amdgpu_process_stats { > + 
uint64_t vmfault_counter; > + uint64_t queue_eviction_counter; > + uint64_t svm_migrate_counter; > + uint64_t svm_page_fault_counter; > + uint64_t svm_unmap_counter; > +}; > + > struct amdgpu_vm { > /* tree of virtual addresses mapped */ > struct rb_root_cached va; > @@ -348,8 +356,9 @@ struct amdgpu_vm { > /* Lock to protect vm_bo add/del/move on all lists of vm */ > spinlock_t status_lock; > > - /* Memory statistics for this vm, protected by status_lock */ > + /* Statistics for this vm, protected by status_lock */ > struct amdgpu_mem_stats stats[__AMDGPU_PL_NUM]; > + struct amdgpu_process_stats process_stats; > > /* > * The following lists contain amdgpu_vm_bo_base objects for either > @@ -586,6 +595,8 @@ amdgpu_vm_get_task_info_vm(struct amdgpu_vm > *vm); > > void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info); > > +struct amdgpu_vm *amdgpu_vm_get_vm_from_pasid(struct > amdgpu_device > +*adev, u32 pasid); > + > bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid, > u32 vmid, u32 node_id, uint64_t addr, uint64_t ts, > bool write_fault); > @@ -595,7 +606,8 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm > *vm); void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev, > struct amdgpu_vm *vm); > void amdgpu_vm_get_memory(struct amdgpu_vm *vm, > - struct amdgpu_mem_stats > stats[__AMDGPU_PL_NUM]); > + struct amdgpu_mem_stats > stats[__AMDGPU_PL_NUM], > + struct amdgpu_process_stats *process_stats); > > int amdgpu_vm_pt_clear(struct amdgpu_device *adev, struct amdgpu_vm > *vm, > struct amdgpu_bo_vm *vmbo, bool immediate); @@ - > 621,6 +633,16 @@ int amdgpu_vm_pt_map_tables(struct amdgpu_device > *adev, struct amdgpu_vm *vm); > > bool amdgpu_vm_is_bo_always_valid(struct amdgpu_vm *vm, struct > amdgpu_bo *bo); > > +enum amdgpu_process_stat_type { > + AMDGPU_VM_VMFAULT_COUNTER, > + AMDGPU_VM_QUEUE_EVICTION_COUNTER, > + AMDGPU_VM_SVM_MIGRATE_COUNTER, > + AMDGPU_VM_SVM_PAGE_FAULT_COUNTER, > + AMDGPU_VM_SVM_UNMAP_COUNTER, > +}; > + > +void 
amdgpu_vm_increment_process_counter(struct amdgpu_vm *vm, > enum > +amdgpu_process_stat_type stat_type); > + > /** > * amdgpu_vm_tlb_seq - return tlb flush sequence number > * @vm: the amdgpu_vm structure to query diff --git > a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c > b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c > index b3d304aab686..c341b6842460 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c > @@ -427,6 +427,9 @@ svm_migrate_vma_to_vram(struct kfd_node *node, > struct svm_range *prange, > start >> PAGE_SHIFT, end >> PAGE_SHIFT, > 0, node->id, prange->prefetch_loc, > prange->preferred_loc, trigger); > + pdd = svm_range_get_pdd_by_node(prange, node); > + if (pdd) > + > amdgpu_vm_increment_process_counter(drm_priv_to_vm(pdd- > >drm_priv), > +AMDGPU_VM_SVM_MIGRATE_COUNTER); > > r = migrate_vma_setup(&migrate); > if (r) { > @@ -729,6 +732,9 @@ svm_migrate_vma_to_ram(struct kfd_node *node, > struct svm_range *prange, > start >> PAGE_SHIFT, end >> PAGE_SHIFT, > node->id, 0, prange->prefetch_loc, > prange->preferred_loc, trigger); > + pdd = svm_range_get_pdd_by_node(prange, node); > + if (pdd) > + > amdgpu_vm_increment_process_counter(drm_priv_to_vm(pdd- > >drm_priv), > +AMDGPU_VM_SVM_MIGRATE_COUNTER); > > r = migrate_vma_setup(&migrate); > if (r) { > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c > b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c > index fcddb54a439f..499882a76581 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c > @@ -1375,9 +1375,11 @@ svm_range_unmap_from_gpus(struct svm_range > *prange, unsigned long start, > struct kfd_process_device *pdd; > struct dma_fence *fence = NULL; > struct kfd_process *p; > + struct amdgpu_vm *vm; > uint32_t gpuidx; > int r = 0; > > + > if (!prange->mapped_to_gpu) { > pr_debug("prange 0x%p [0x%lx 0x%lx] not mapped to > GPU\n", > prange, prange->start, prange->last); @@ -1398,13 > +1400,14 @@ svm_range_unmap_from_gpus(struct 
svm_range *prange, > unsigned long start, > pr_debug("failed to find device idx %d\n", gpuidx); > return -EINVAL; > } > + vm = drm_priv_to_vm(pdd->drm_priv); > > kfd_smi_event_unmap_from_gpu(pdd->dev, p->lead_thread- > >pid, > start, last, trigger); > + amdgpu_vm_increment_process_counter(vm, > AMDGPU_VM_SVM_UNMAP_COUNTER); > > r = svm_range_unmap_from_gpu(pdd->dev->adev, > - drm_priv_to_vm(pdd->drm_priv), > - start, last, &fence); > + vm, start, last, &fence); > if (r) > break; > > @@ -3039,6 +3042,7 @@ svm_range_restore_pages(struct amdgpu_device > *adev, unsigned int pasid, > struct svm_range_list *svms; > struct svm_range *prange; > struct kfd_process *p; > + struct kfd_process_device *pdd; > ktime_t timestamp = ktime_get_boottime(); > struct kfd_node *node; > int32_t best_loc; > @@ -3193,6 +3197,9 @@ svm_range_restore_pages(struct amdgpu_device > *adev, unsigned int pasid, > > kfd_smi_event_page_fault_start(node, p->lead_thread->pid, addr, > write_fault, timestamp); > + pdd = svm_range_get_pdd_by_node(prange, node); > + if (pdd) > + > amdgpu_vm_increment_process_counter(drm_priv_to_vm(pdd- > >drm_priv), > +AMDGPU_VM_SVM_PAGE_FAULT_COUNTER); > > /* Align migration range start and size to granularity size */ > size = 1UL << prange->granularity; > -- > 2.34.1
