On 12/2/25 10:47, Pierre-Eric Pelloux-Prayer wrote:
> If multiple entities share the same GART window, we must make sure
> that jobs using it are executed sequentially.
>
> This commit gives separate windows to each entity, so jobs from
> multiple entities can execute in parallel if needed (for now they
> all use the first SDMA engine, so it makes no difference yet).
> The entity stores the GART window offsets to centralize the
> "window id" to "window offset" translation in a single place.
>
> default_entity doesn't get any windows reserved since there is
> no use for them.
>
> ---
> v3:
> - renamed gart_window_lock -> lock (Christian)
> - added amdgpu_ttm_buffer_entity_init (Christian)
> - fixed gart_addr in svm_migrate_gart_map (Felix)
> - renamed gart_window_idX -> gart_window_offs[]
> - added amdgpu_compute_gart_address
> v4:
> - u32 -> u64
> - added kerneldoc
> ---
>
> Signed-off-by: Pierre-Eric Pelloux-Prayer <[email protected]>
> Acked-by: Felix Kuehling <[email protected]>
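A note for readers mapping window ids to actual addresses: with
AMDGPU_GTT_MAX_TRANSFER_SIZE = 512 and the 4 KiB AMDGPU_GPU_PAGE_SIZE,
each window covers 2 MiB of GART space, so the per-window offset the
patch stores in the entity is simply (a sketch, assuming the current
constants):

	/* Window n starts n * 512 pages * 4 KiB = n * 2 MiB into the
	 * GART transfer area: window 0 -> 0x000000, window 1 -> 0x200000,
	 * window 2 -> 0x400000.
	 */
	u64 offs = (u64)n * AMDGPU_GTT_MAX_TRANSFER_SIZE * AMDGPU_GPU_PAGE_SIZE;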
Reviewed-by: Christian König <[email protected]>

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c  |  6 +--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c  | 61 +++++++++++++++++-------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h  | 19 +++++++-
>  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |  9 ++--
>  4 files changed, 69 insertions(+), 26 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index 94e07b9ec7b4..0d2784fe0be3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -686,7 +686,7 @@ void amdgpu_gmc_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>  	 * translation. Avoid this by doing the invalidation from the SDMA
>  	 * itself at least for GART.
>  	 */
> -	mutex_lock(&adev->mman.gtt_window_lock);
> +	mutex_lock(&adev->mman.default_entity.lock);
>  	r = amdgpu_job_alloc_with_ib(ring->adev, &adev->mman.default_entity.base,
>  				     AMDGPU_FENCE_OWNER_UNDEFINED,
>  				     16 * 4, AMDGPU_IB_POOL_IMMEDIATE,
> @@ -699,7 +699,7 @@ void amdgpu_gmc_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>  	job->ibs->ptr[job->ibs->length_dw++] = ring->funcs->nop;
>  	amdgpu_ring_pad_ib(ring, &job->ibs[0]);
>  	fence = amdgpu_job_submit(job);
> -	mutex_unlock(&adev->mman.gtt_window_lock);
> +	mutex_unlock(&adev->mman.default_entity.lock);
>
>  	dma_fence_wait(fence, false);
>  	dma_fence_put(fence);
> @@ -707,7 +707,7 @@ void amdgpu_gmc_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>  	return;
>
> error_alloc:
> -	mutex_unlock(&adev->mman.gtt_window_lock);
> +	mutex_unlock(&adev->mman.default_entity.lock);
>  	dev_err(adev->dev, "Error flushing GPU TLB using the SDMA (%d)!\n", r);
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index a5048cd8e10d..d3d0419397c5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -228,9 +228,7 @@ static int amdgpu_ttm_map_buffer(struct amdgpu_ttm_buffer_entity *entity,
>
>  	*size = min(*size, (uint64_t)num_pages * PAGE_SIZE - offset);
>
> -	*addr = adev->gmc.gart_start;
> -	*addr += (u64)window * AMDGPU_GTT_MAX_TRANSFER_SIZE *
> -		AMDGPU_GPU_PAGE_SIZE;
> +	*addr = amdgpu_compute_gart_address(&adev->gmc, entity, window);
>  	*addr += offset;
>
>  	num_dw = ALIGN(adev->mman.buffer_funcs->copy_num_dw, 8);
> @@ -248,7 +246,7 @@ static int amdgpu_ttm_map_buffer(struct amdgpu_ttm_buffer_entity *entity,
>  	src_addr += job->ibs[0].gpu_addr;
>
>  	dst_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
> -	dst_addr += window * AMDGPU_GTT_MAX_TRANSFER_SIZE * 8;
> +	dst_addr += (entity->gart_window_offs[window] >> AMDGPU_GPU_PAGE_SHIFT) * 8;
>  	amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr,
>  				dst_addr, num_bytes, 0);
>
> @@ -313,7 +311,7 @@ static int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev,
>  	amdgpu_res_first(src->mem, src->offset, size, &src_mm);
>  	amdgpu_res_first(dst->mem, dst->offset, size, &dst_mm);
>
> -	mutex_lock(&adev->mman.gtt_window_lock);
> +	mutex_lock(&entity->lock);
>  	while (src_mm.remaining) {
>  		uint64_t from, to, cur_size, tiling_flags;
>  		uint32_t num_type, data_format, max_com, write_compress_disable;
> @@ -368,7 +366,7 @@ static int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev,
>  		amdgpu_res_next(&dst_mm, cur_size);
>  	}
>  error:
> -	mutex_unlock(&adev->mman.gtt_window_lock);
> +	mutex_unlock(&entity->lock);
>  	*f = fence;
>  	return r;
>  }
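On the dst_addr computation above, for anyone diffing it against the old
code: the GART table stores one 8-byte PTE per GPU page, so shifting the
entity's byte offset back down to a page index and multiplying by 8 lands
on the window's first PTE. A worked sketch (my numbers, assuming
AMDGPU_GPU_PAGE_SHIFT == 12):

	/* move_entity's dst window sits at byte offset 0x200000:
	 * (0x200000 >> 12) * 8 = 512 * 8 = 4096 bytes into the GART table,
	 * exactly the old "window * AMDGPU_GTT_MAX_TRANSFER_SIZE * 8"
	 * with window == 1.
	 */
	dst_addr += (entity->gart_window_offs[window] >> AMDGPU_GPU_PAGE_SHIFT) * 8;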
> @@ -1500,7 +1498,7 @@ static int amdgpu_ttm_access_memory_sdma(struct ttm_buffer_object *bo,
>  	if (r)
>  		goto out;
>
> -	mutex_lock(&adev->mman.gtt_window_lock);
> +	mutex_lock(&adev->mman.default_entity.lock);
>  	amdgpu_res_first(abo->tbo.resource, offset, len, &src_mm);
>  	src_addr = amdgpu_ttm_domain_start(adev, bo->resource->mem_type) +
>  		src_mm.start;
> @@ -1512,7 +1510,7 @@ static int amdgpu_ttm_access_memory_sdma(struct ttm_buffer_object *bo,
>  			  PAGE_SIZE, 0);
>
>  	fence = amdgpu_ttm_job_submit(adev, job, num_dw);
> -	mutex_unlock(&adev->mman.gtt_window_lock);
> +	mutex_unlock(&adev->mman.default_entity.lock);
>
>  	if (!dma_fence_wait_timeout(fence, false, adev->sdma_timeout))
>  		r = -ETIMEDOUT;
> @@ -1875,6 +1873,27 @@ static void amdgpu_ttm_mmio_remap_bo_fini(struct amdgpu_device *adev)
>  	adev->rmmio_remap.bo = NULL;
>  }
>
> +static int amdgpu_ttm_buffer_entity_init(struct amdgpu_ttm_buffer_entity *entity,
> +					 int starting_gart_window,
> +					 bool needs_src_gart_window,
> +					 bool needs_dst_gart_window)
> +{
> +	mutex_init(&entity->lock);
> +	if (needs_src_gart_window) {
> +		entity->gart_window_offs[0] =
> +			(u64)starting_gart_window * AMDGPU_GTT_MAX_TRANSFER_SIZE *
> +			AMDGPU_GPU_PAGE_SIZE;
> +		starting_gart_window++;
> +	}
> +	if (needs_dst_gart_window) {
> +		entity->gart_window_offs[1] =
> +			(u64)starting_gart_window * AMDGPU_GTT_MAX_TRANSFER_SIZE *
> +			AMDGPU_GPU_PAGE_SIZE;
> +		starting_gart_window++;
> +	}
> +	return starting_gart_window;
> +}
> +
>  /*
>   * amdgpu_ttm_init - Init the memory management (ttm) as well as various
>   * gtt/vram related fields.
> @@ -1889,8 +1908,6 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
>  	uint64_t gtt_size;
>  	int r;
>
> -	mutex_init(&adev->mman.gtt_window_lock);
> -
>  	dma_set_max_seg_size(adev->dev, UINT_MAX);
>  	/* No others user of address space so set it to 0 */
>  	r = ttm_device_init(&adev->mman.bdev, &amdgpu_bo_driver, adev->dev,
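One convention worth spelling out for future readers: index 0 of
gart_window_offs is the source window, index 1 the destination window, and
the helper hands windows out in that order. A small sketch of what the
helper above produces (my example values, not from the patch):

	struct amdgpu_ttm_buffer_entity e;
	int next;

	/* Requesting both windows starting at window 2 would give:
	 *   e.gart_window_offs[0] = 2 * 512 * 4096 = 0x400000 (src)
	 *   e.gart_window_offs[1] = 3 * 512 * 4096 = 0x600000 (dst)
	 * and next == 4, the first window free for the next entity.
	 */
	next = amdgpu_ttm_buffer_entity_init(&e, 2, true, true);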
> @@ -2160,6 +2177,7 @@ void amdgpu_ttm_fini(struct amdgpu_device *adev)
>  void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, bool enable)
>  {
>  	struct ttm_resource_manager *man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
> +	u32 used_windows;
>  	uint64_t size;
>  	int r;
>
> @@ -2203,6 +2221,14 @@ void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, bool enable)
>  			drm_sched_entity_destroy(&adev->mman.clear_entity.base);
>  			goto error_free_entity;
>  		}
> +
> +		/* Statically assign GART windows to each entity. */
> +		used_windows = amdgpu_ttm_buffer_entity_init(&adev->mman.default_entity,
> +							     0, false, false);
> +		used_windows = amdgpu_ttm_buffer_entity_init(&adev->mman.move_entity,
> +							     used_windows, true, true);
> +		used_windows = amdgpu_ttm_buffer_entity_init(&adev->mman.clear_entity,
> +							     used_windows, false, true);
>  	} else {
>  		drm_sched_entity_destroy(&adev->mman.default_entity.base);
>  		drm_sched_entity_destroy(&adev->mman.clear_entity.base);
> @@ -2361,6 +2387,7 @@ int amdgpu_ttm_clear_buffer(struct amdgpu_bo *bo,
>  			    struct dma_fence **fence)
>  {
>  	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
> +	struct amdgpu_ttm_buffer_entity *entity;
>  	struct amdgpu_res_cursor cursor;
>  	u64 addr;
>  	int r = 0;
> @@ -2371,11 +2398,12 @@ int amdgpu_ttm_clear_buffer(struct amdgpu_bo *bo,
>  	if (!fence)
>  		return -EINVAL;
>
> +	entity = &adev->mman.clear_entity;
>  	*fence = dma_fence_get_stub();
>
>  	amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &cursor);
>
> -	mutex_lock(&adev->mman.gtt_window_lock);
> +	mutex_lock(&entity->lock);
>  	while (cursor.remaining) {
>  		struct dma_fence *next = NULL;
>  		u64 size;
> @@ -2388,13 +2416,12 @@ int amdgpu_ttm_clear_buffer(struct amdgpu_bo *bo,
>  		/* Never clear more than 256MiB at once to avoid timeouts */
>  		size = min(cursor.size, 256ULL << 20);
>
> -		r = amdgpu_ttm_map_buffer(&adev->mman.clear_entity,
> -					  &bo->tbo, bo->tbo.resource, &cursor,
> +		r = amdgpu_ttm_map_buffer(entity, &bo->tbo, bo->tbo.resource, &cursor,
>  					  1, false, &size, &addr);
>  		if (r)
>  			goto err;
>
> -		r = amdgpu_ttm_fill_mem(adev, &adev->mman.clear_entity, 0, addr, size, resv,
> +		r = amdgpu_ttm_fill_mem(adev, entity, 0, addr, size, resv,
>  					&next, true,
>  					AMDGPU_KERNEL_JOB_ID_TTM_CLEAR_BUFFER);
>  		if (r)
> @@ -2406,7 +2433,7 @@ int amdgpu_ttm_clear_buffer(struct amdgpu_bo *bo,
>  		amdgpu_res_next(&cursor, size);
>  	}
>  err:
> -	mutex_unlock(&adev->mman.gtt_window_lock);
> +	mutex_unlock(&entity->lock);
>
>  	return r;
>  }
> @@ -2431,7 +2458,7 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_buffer_entity *entity,
>
>  	amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &dst);
>
> -	mutex_lock(&adev->mman.gtt_window_lock);
> +	mutex_lock(&entity->lock);
>  	while (dst.remaining) {
>  		struct dma_fence *next;
>  		uint64_t cur_size, to;
> @@ -2456,7 +2483,7 @@ int amdgpu_fill_buffer(struct amdgpu_ttm_buffer_entity *entity,
>  		amdgpu_res_next(&dst, cur_size);
>  	}
>  error:
> -	mutex_unlock(&adev->mman.gtt_window_lock);
> +	mutex_unlock(&entity->lock);
>  	if (f)
>  		*f = dma_fence_get(fence);
>  	dma_fence_put(fence);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> index 9288599c9c46..6e04f80b6a75 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> @@ -29,6 +29,7 @@
>  #include <drm/ttm/ttm_placement.h>
>  #include "amdgpu_vram_mgr.h"
>  #include "amdgpu_hmm.h"
> +#include "amdgpu_gmc.h"
>
>  #define AMDGPU_PL_GDS		(TTM_PL_PRIV + 0)
>  #define AMDGPU_PL_GWS		(TTM_PL_PRIV + 1)
> @@ -39,7 +40,7 @@
>  #define __AMDGPU_PL_NUM	(TTM_PL_PRIV + 6)
>
>  #define AMDGPU_GTT_MAX_TRANSFER_SIZE	512
> -#define AMDGPU_GTT_NUM_TRANSFER_WINDOWS	2
> +#define AMDGPU_GTT_NUM_TRANSFER_WINDOWS	3
>
>  extern const struct attribute_group amdgpu_vram_mgr_attr_group;
>  extern const struct attribute_group amdgpu_gtt_mgr_attr_group;
> @@ -54,6 +55,8 @@ struct amdgpu_gtt_mgr {
>
>  struct amdgpu_ttm_buffer_entity {
>  	struct drm_sched_entity base;
> +	struct mutex lock;
> +	u64 gart_window_offs[2];
>  };
>
>  struct amdgpu_mman {
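For reference, the static assignment above leaves the windows laid out
like this (derived from the entity_init calls; offsets assume 4 KiB GPU
pages):

	entity          src (offs[0])         dst (offs[1])
	default_entity  none                  none
	move_entity     window 0 @ 0x000000   window 1 @ 0x200000
	clear_entity    none                  window 2 @ 0x400000

which is why AMDGPU_GTT_NUM_TRANSFER_WINDOWS grows to 3 above.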
> @@ -69,6 +72,7 @@ struct amdgpu_mman {
>
>  	struct mutex gtt_window_lock;
>
> +	/* @default_entity: for workarounds, has no gart windows */
>  	struct amdgpu_ttm_buffer_entity default_entity;
>  	struct amdgpu_ttm_buffer_entity clear_entity;
>  	struct amdgpu_ttm_buffer_entity move_entity;
> @@ -199,6 +203,19 @@ static inline int amdgpu_ttm_tt_get_user_pages(struct amdgpu_bo *bo,
>  }
>  #endif
>
> +/**
> + * amdgpu_compute_gart_address() - Returns GART address of an entity's window
> + * @gmc: The &struct amdgpu_gmc instance to use
> + * @entity: The &struct amdgpu_ttm_buffer_entity owning the GART window
> + * @index: The window to use (must be 0 or 1)
> + */
> +static inline u64 amdgpu_compute_gart_address(struct amdgpu_gmc *gmc,
> +					      struct amdgpu_ttm_buffer_entity *entity,
> +					      int index)
> +{
> +	return gmc->gart_start + entity->gart_window_offs[index];
> +}
> +
>  void amdgpu_ttm_tt_set_user_pages(struct ttm_tt *ttm, struct amdgpu_hmm_range *range);
>  int amdgpu_ttm_tt_get_userptr(const struct ttm_buffer_object *tbo,
>  			      uint64_t *user_addr);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> index 9c76f1ba0e55..0cc1d2b35026 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
> @@ -59,8 +59,7 @@ svm_migrate_gart_map(struct amdgpu_ring *ring,
>  	void *cpu_addr;
>  	int r;
>
> -	/* use gart window 0 */
> -	*gart_addr = adev->gmc.gart_start;
> +	*gart_addr = amdgpu_compute_gart_address(&adev->gmc, entity, 0);
>
>  	num_dw = ALIGN(adev->mman.buffer_funcs->copy_num_dw, 8);
>  	num_bytes = npages * 8;
> @@ -116,7 +115,7 @@ svm_migrate_gart_map(struct amdgpu_ring *ring,
>   * multiple GTT_MAX_PAGES transfer, all sdma operations are serialized, wait for
>   * the last sdma finish fence which is returned to check copy memory is done.
>   *
> - * Context: Process context, takes and releases gtt_window_lock
> + * Context: Process context
>   *
>   * Return:
>   *   0 - OK, otherwise error code
> @@ -138,7 +137,7 @@ svm_migrate_copy_memory_gart(struct amdgpu_device *adev, dma_addr_t *sys,
>
>  	entity = &adev->mman.move_entity;
>
> -	mutex_lock(&adev->mman.gtt_window_lock);
> +	mutex_lock(&entity->lock);
>
>  	while (npages) {
>  		size = min(GTT_MAX_PAGES, npages);
> @@ -175,7 +174,7 @@ svm_migrate_copy_memory_gart(struct amdgpu_device *adev, dma_addr_t *sys,
>  	}
>
> out_unlock:
> -	mutex_unlock(&adev->mman.gtt_window_lock);
> +	mutex_unlock(&entity->lock);
>
>  	return r;
>  }
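To close the loop on how the pieces compose: a minimal sketch (mine, not
part of the patch) of fetching clear_entity's destination window address,
assuming the constants discussed above:

	/* clear_entity only owns a dst window (index 1, which the static
	 * assignment maps to window 2), so this yields
	 * gart_start + 0x400000; the entity gets a private 2 MiB window
	 * and no longer contends with move_entity.
	 */
	u64 addr = amdgpu_compute_gart_address(&adev->gmc,
					       &adev->mman.clear_entity, 1);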
