Register the VRAM manager with the dmem cgroup reclaim infrastructure so that lowering dmem.max below current VRAM usage triggers TTM eviction rather than failing with -EBUSY.
Guard the place->flags dereference in amdgpu_ttm_bo_eviction_valuable() against a NULL place, as the TTM reclaim path passes a NULL place in cgroup drain mode.

v3:
- Rebased on fix for uninitialized list and buddy allocator on the drmm_cgroup_register_region() error path.
v5:
- Rebased on the introduction of struct dmem_cgroup_init.
- Clear the reclaim callback in amdgpu_vram_mgr_fini() to prevent use-after-free if cgroup reclaim is triggered after driver unbind while userspace holds an open DRM file descriptor. (Sashiko-bot)
- Switch from drmm_cgroup_register_region() to the raw dmem_cgroup_register_region() and store the region in amdgpu_vram_mgr.cg_region. Call dmem_cgroup_unregister_region() in amdgpu_vram_mgr_fini() after ttm_resource_manager_evict_all() to drain in-flight reclaim callbacks, and clear man->cg afterwards. This is required because amdgpu's vram manager fini is called explicitly during driver unbind, which may precede the DRM device release and thus precede any drmm-based cleanup. (Sashiko-bot)
v6:
- Fix mgr->cg_region never being assigned, which made dmem_cgroup_unregister_region() in fini silently no-op on NULL and leak the region. (Sashiko-bot)
- Reorder fini to call set_used(false) and evict_all() before dmem_cgroup_unregister_region(), so ttm_resource_free() can uncharge via man->cg during eviction; clear man->cg after unregister.
(Sashiko-bot) Assisted-by: GitHub_Copilot:claude-sonnet-4.6 Signed-off-by: Thomas Hellström <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 31 ++++++++++++++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h | 2 ++ 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index 2740de94e93c..8cbcd33f51a5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -1488,7 +1488,7 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo, dma_resv_for_each_fence(&resv_cursor, bo->base.resv, DMA_RESV_USAGE_BOOKKEEP, f) { if (amdkfd_fence_check_mm(f, current->mm) && - !(place->flags & TTM_PL_FLAG_CONTIGUOUS)) + !(place && (place->flags & TTM_PL_FLAG_CONTIGUOUS))) return false; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c index 08f05c3aed1d..2250bab0970d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c @@ -906,6 +906,10 @@ static const struct ttm_resource_manager_func amdgpu_vram_mgr_func = { .debug = amdgpu_vram_mgr_debug }; +static const struct dmem_cgroup_ops amdgpu_vram_mgr_dmem_ops = { + .reclaim = ttm_resource_manager_dmem_reclaim, +}; + /** * amdgpu_vram_mgr_init - init VRAM manager and DRM MM * @@ -917,6 +921,7 @@ int amdgpu_vram_mgr_init(struct amdgpu_device *adev) { struct amdgpu_vram_mgr *mgr = &adev->mman.vram_mgr; struct ttm_resource_manager *man = &mgr->manager; + struct dmem_cgroup_region *cg; int err; ttm_resource_manager_init(man, &adev->mman.bdev, @@ -933,12 +938,16 @@ int amdgpu_vram_mgr_init(struct amdgpu_device *adev) if (err) return err; - man->cg = drmm_cgroup_register_region(adev_to_drm(adev), "vram", - &(struct dmem_cgroup_init){ - .size = adev->gmc.real_vram_size, - }); - if (IS_ERR(man->cg)) - return 
PTR_ERR(man->cg); + cg = dmem_cgroup_register_region(&(struct dmem_cgroup_init){ + .size = adev->gmc.real_vram_size, + .ops = &amdgpu_vram_mgr_dmem_ops, + .reclaim_priv = man, + }, "vram"); + if (IS_ERR(cg)) + return PTR_ERR(cg); + + mgr->cg_region = cg; + ttm_resource_manager_set_dmem_region(man, cg); ttm_set_driver_manager(&adev->mman.bdev, TTM_PL_VRAM, &mgr->manager); ttm_resource_manager_set_used(man, true); @@ -966,6 +975,16 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev) if (ret) return; + /* + * Drain any in-flight dmem cgroup reclaim callbacks and remove the + * region from the global list. This must happen after evict_all() + * so that ttm_resource_free() can still uncharge via man->cg while + * BOs are being evicted. + */ + dmem_cgroup_unregister_region(mgr->cg_region); + mgr->cg_region = NULL; + man->cg = NULL; + mutex_lock(&mgr->lock); list_for_each_entry_safe(rsv, temp, &mgr->reservations_pending, blocks) kfree(rsv); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h index 429a21a2e9b2..07103cddb335 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h @@ -36,6 +36,8 @@ struct amdgpu_vram_mgr { atomic64_t vis_usage; u64 default_page_size; struct list_head allocated_vres_list; + /** @cg_region: dmem cgroup region for VRAM; unregistered in fini. */ + struct dmem_cgroup_region *cg_region; }; struct amdgpu_vres_task { -- 2.54.0
