Am 20.03.2018 um 07:29 schrieb Emily Deng:
Under SR-IOV, the CPU-based TLB flush sometimes times out
within the given 100ms period. Instead of letting it fail and
continuing, we can give it more chances by repeating the
TLB flush on the failed VMHUB.
This could fix the massive "Timeout waiting for VM flush ACK"
errors seen during the vk_encoder test.
Well, that one is a big NAK, since it once more just hides the real
problem: we sometimes drop register writes.
What we did during debugging to avoid the problem is the following:
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index a70cbc45c4c1..3536d50375fa 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -338,6 +338,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct
amdgpu_device *adev,
u32 tmp = gmc_v9_0_get_invalidate_req(vmid);
WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
+ while (RREG32_NO_KIQ(hub->vm_inv_eng0_req + eng) != tmp) {
+ DRM_ERROR("Need one more try to write the
VMHUB flush request!");
+ WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
+ }
/* Busy wait for ACK.*/
for (j = 0; j < 100; j++) {
But that can only be a temporary workaround as well.
The question is rather: can you reliably reproduce this issue with the
vk_encoder test?
Thanks,
Christian.
Signed-off-by: Monk Liu <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 24 +++++++++++++++++++-----
1 file changed, 19 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index a70cbc4..517712b 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -329,13 +329,18 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device
*adev,
{
/* Use register 17 for GART */
const unsigned eng = 17;
- unsigned i, j;
+ unsigned i, j, loop = 0;
+ unsigned flush_done = 0;
+
+retry:
spin_lock(&adev->gmc.invalidate_lock);
for (i = 0; i < AMDGPU_MAX_VMHUBS; ++i) {
struct amdgpu_vmhub *hub = &adev->vmhub[i];
u32 tmp = gmc_v9_0_get_invalidate_req(vmid);
+ if (flush_done & (1 << i)) /* this vmhub flushed */
+ continue;
WREG32_NO_KIQ(hub->vm_inv_eng0_req + eng, tmp);
@@ -347,8 +352,10 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device *adev,
break;
cpu_relax();
}
- if (j < 100)
+ if (j < 100) {
+ flush_done |= (1 << i);
continue;
+ }
/* Wait for ACK with a delay.*/
for (j = 0; j < adev->usec_timeout; j++) {
@@ -358,15 +365,22 @@ static void gmc_v9_0_flush_gpu_tlb(struct amdgpu_device
*adev,
break;
udelay(1);
}
- if (j < adev->usec_timeout)
+ if (j < adev->usec_timeout) {
+ flush_done |= (1 << i);
continue;
-
- DRM_ERROR("Timeout waiting for VM flush ACK!\n");
+ }
}
spin_unlock(&adev->gmc.invalidate_lock);
+ if (flush_done != 3) {
+ if (loop++ < 3)
+ goto retry;
+ else
+ DRM_ERROR("Timeout waiting for VM flush ACK!\n");
+ }
}
+
static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
unsigned vmid, uint64_t pd_addr)
{
_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx