Hello,
I want to prepend that I've never done kernel programming and am not familiar
with the amdgpu driver. That said, I read through the issues on Mesa regarding
its behavior when RADV runs out of VRAM [1] and the related information.
Thereby I stumbled upon Christian König mentioning that it would be buggy
behavior if the preferred domain included GTT when the initial domain was
updated [2]. I read some of the related code in the kernel driver, trying to
understand it, and got the impression that this is actually the case. When
amdgpu_gem_object_create() fails in amdgpu_gem_create_ioctl(),
"AMDGPU_GEM_DOMAIN_GTT" is added to "initial_domain" which during the retry
gets written into both, "domain" and "preferred_domain" in
amdgpu_gem_object_create(). Is this intended or a bug?
I've attached a minimal proof-of-concept patch (applied to kernel 6.14-rc7)
that runs fine for me. I don't know though if this has any real effect on the
driver's memory management. So please excuse me if this is a dumb
question/patch and I did steal your time. Any insight would still be
appreciated.
Thanks and best regards
Malte
[1] https://gitlab.freedesktop.org/mesa/mesa/-/issues/8763
[2] https://lists.freedesktop.org/archives/amd-gfx/2024-April/107401.html
______________________________________________________________________________
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 2025-03-16 23:55:17.000000000 +0100
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 2025-03-20 22:58:59.849195645 +0100
@@ -93,7 +93,7 @@
}
int amdgpu_gem_object_create(struct amdgpu_device *adev, unsigned long size,
- int alignment, u32 initial_domain,
+ int alignment, u32 initial_domain, u32 domain,
u64 flags, enum ttm_bo_type type,
struct dma_resv *resv,
struct drm_gem_object **obj, int8_t xcp_id_plus1)
@@ -113,7 +113,7 @@
bp.resv = resv;
bp.preferred_domain = initial_domain;
bp.flags = flags;
- bp.domain = initial_domain;
+ bp.domain = domain;
bp.bo_ptr_size = sizeof(struct amdgpu_bo);
bp.xcp_id_plus1 = xcp_id_plus1;
@@ -318,7 +318,7 @@
uint64_t size = args->in.bo_size;
struct dma_resv *resv = NULL;
struct drm_gem_object *gobj;
- uint32_t handle, initial_domain;
+ uint32_t handle, initial_domain, domain;
int r;
/* reject DOORBELLs until userspace code to use it is available */
@@ -371,9 +371,9 @@
}
- initial_domain = (u32)(0xffffffff & args->in.domains);
+ domain = initial_domain = (u32)(0xffffffff & args->in.domains);
retry:
r = amdgpu_gem_object_create(adev, size, args->in.alignment,
- initial_domain,
+ initial_domain, domain,
flags, ttm_bo_type_device, resv, &gobj, fpriv->xcp_id + 1);
if (r && r != -ERESTARTSYS) {
if (flags & AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED) {
@@ -381,8 +382,8 @@
goto retry;
}
- if (initial_domain == AMDGPU_GEM_DOMAIN_VRAM) {
- initial_domain |= AMDGPU_GEM_DOMAIN_GTT;
+ if (domain == AMDGPU_GEM_DOMAIN_VRAM) {
+ domain |= AMDGPU_GEM_DOMAIN_GTT;
goto retry;
}
DRM_DEBUG("Failed to allocate GEM object (%llu, %d, %llu, %d)\n",
@@ -443,7 +444,7 @@
}
/* create a gem object to contain this object in */
- r = amdgpu_gem_object_create(adev, args->size, 0, AMDGPU_GEM_DOMAIN_CPU,
+ r = amdgpu_gem_object_create(adev, args->size, 0, AMDGPU_GEM_DOMAIN_CPU, AMDGPU_GEM_DOMAIN_CPU,
0, ttm_bo_type_device, NULL, &gobj, fpriv->xcp_id + 1);
if (r)
return r;
@@ -967,7 +968,7 @@
args->size = ALIGN(args->size, PAGE_SIZE);
domain = amdgpu_bo_get_preferred_domain(adev,
amdgpu_display_supported_domains(adev, flags));
- r = amdgpu_gem_object_create(adev, args->size, 0, domain, flags,
+ r = amdgpu_gem_object_create(adev, args->size, 0, domain, domain, flags,
ttm_bo_type_device, NULL, &gobj, fpriv->xcp_id + 1);
if (r)
return -ENOMEM;
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.h 2025-03-16 23:55:17.000000000 +0100
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.h 2025-03-20 21:48:44.416738211 +0100
@@ -42,7 +42,7 @@
*/
void amdgpu_gem_force_release(struct amdgpu_device *adev);
int amdgpu_gem_object_create(struct amdgpu_device *adev, unsigned long size,
- int alignment, u32 initial_domain,
+ int alignment, u32 initial_domain, u32 domain,
u64 flags, enum ttm_bo_type type,
struct dma_resv *resv,
struct drm_gem_object **obj, int8_t xcp_id_plus1);
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 2025-03-16 23:55:17.000000000 +0100
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 2025-03-20 22:32:53.057709080 +0100
@@ -328,7 +328,7 @@
AMDGPU_GEM_CREATE_UNCACHED);
ret = amdgpu_gem_object_create(adev, mem->bo->tbo.base.size, 1,
- AMDGPU_GEM_DOMAIN_CPU, AMDGPU_GEM_CREATE_PREEMPTIBLE | flags,
+ AMDGPU_GEM_DOMAIN_CPU, AMDGPU_GEM_DOMAIN_CPU, AMDGPU_GEM_CREATE_PREEMPTIBLE | flags,
ttm_bo_type_sg, mem->bo->tbo.base.resv, &gem_obj, 0);
amdgpu_bo_unreserve(mem->bo);
@@ -1815,7 +1815,7 @@
va, (*mem)->aql_queue ? size << 1 : size,
domain_string(alloc_domain), xcp_id);
- ret = amdgpu_gem_object_create(adev, aligned_size, 1, alloc_domain, alloc_flags,
+ ret = amdgpu_gem_object_create(adev, aligned_size, 1, alloc_domain, alloc_domain, alloc_flags,
bo_type, NULL, &gobj, xcp_id + 1);
if (ret) {
pr_debug("Failed to create BO on domain %s. ret %d\n",
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c 2025-03-20 22:31:38.770603335 +0100
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c 2025-03-20 22:31:51.866509502 +0100
@@ -308,7 +308,7 @@
}
ret = amdgpu_gem_object_create(adev, dma_buf->size, PAGE_SIZE,
- AMDGPU_GEM_DOMAIN_CPU, flags,
+ AMDGPU_GEM_DOMAIN_CPU, AMDGPU_GEM_DOMAIN_CPU, flags,
ttm_bo_type_sg, resv, &gobj, 0);
if (ret)
goto error;