Rework hangcheck code to work well toghether with preemption. Track the time a job has spent in a ring by storing timestamps of the `CP_ALWAYS_ON_CONTEXT` register at the beginning of a job and when switching rings as well as reading it back if the ring is currently active.
Signed-off-by: Anna Maniscalco <[email protected]> --- drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 3 +- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 3 +- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 28 +++++++++++++++-- drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 1 + drivers/gpu/drm/msm/adreno/a6xx_preempt.c | 25 +++++++++++---- drivers/gpu/drm/msm/adreno/adreno_gpu.c | 3 +- drivers/gpu/drm/msm/msm_gpu.c | 51 +++++++++++++++++++++++++------ drivers/gpu/drm/msm/msm_gpu.h | 3 ++ drivers/gpu/drm/msm/msm_ringbuffer.h | 6 ++++ 9 files changed, 102 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c index 4a04dc43a8e6764a113d0ade3dee94bd4c0083af..cb4775a35da0706e571eb27ce617044de84ca118 100644 --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c @@ -1255,7 +1255,8 @@ static void a5xx_fault_detect_irq(struct msm_gpu *gpu) gpu_read(gpu, REG_A5XX_CP_IB2_BUFSZ)); /* Turn off the hangcheck timer to keep it from bothering us */ - timer_delete(&gpu->hangcheck_timer); + for (int i = 0; i < gpu->nr_rings; i++) + timer_delete(&gpu->rb[i]->hangcheck_timer); kthread_queue_work(gpu->worker, &gpu->recover_work); } diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index fc62fef2fed87f065cb8fa4e997abefe4ff11cd5..103c19fa8669f06a6c1627ced1daf2bcd60415db 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -28,7 +28,8 @@ static void a6xx_gmu_fault(struct a6xx_gmu *gmu) gmu->hung = true; /* Turn off the hangcheck timer while we are resetting */ - timer_delete(&gpu->hangcheck_timer); + for (int i = 0; i < gpu->nr_rings; i++) + timer_delete(&gpu->rb[i]->hangcheck_timer); /* Queue the GPU handler because we need to treat this as a recovery */ kthread_queue_work(gpu->worker, &gpu->recover_work); diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index b8f8ae940b55f5578abdbdec6bf1e90a53e721a5..7647e3dfd50db7446589e67949ed08d0a422f543 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -465,6 +465,9 @@ static void a7xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) get_stats_counter(ring, REG_A6XX_CP_ALWAYS_ON_COUNTER, rbmemptr_stats(ring, index, alwayson_start)); + get_stats_counter(ring, REG_A6XX_CP_ALWAYS_ON_CONTEXT, + rbmemptr(ring, last_job_start_ctx)); + OUT_PKT7(ring, CP_THREAD_CONTROL, 1); OUT_RING(ring, CP_SET_THREAD_BOTH); @@ -1816,7 +1819,8 @@ static void a6xx_fault_detect_irq(struct msm_gpu *gpu) gpu_read(gpu, REG_A6XX_CP_IB2_REM_SIZE)); /* Turn off the hangcheck timer to keep it from bothering us */ - timer_delete(&gpu->hangcheck_timer); + for (int i = 0; i < gpu->nr_rings; i++) + timer_delete(&gpu->rb[i]->hangcheck_timer); /* Turn off interrupts to avoid triggering recovery again */ gpu_write(gpu, REG_A6XX_RBBM_INT_0_MASK, 0); @@ -1839,7 +1843,8 @@ static void a7xx_sw_fuse_violation_irq(struct msm_gpu *gpu) */ if (status & (A7XX_CX_MISC_SW_FUSE_VALUE_RAYTRACING | A7XX_CX_MISC_SW_FUSE_VALUE_LPAC)) { - timer_delete(&gpu->hangcheck_timer); + for (int i = 0; i < gpu->nr_rings; i++) + timer_delete(&gpu->rb[i]->hangcheck_timer); kthread_queue_work(gpu->worker, &gpu->recover_work); } @@ -2327,6 +2332,22 @@ static int a6xx_get_timestamp(struct msm_gpu *gpu, uint64_t *value) return 0; } +static int a6xx_get_ctx_timestamp(struct msm_ringbuffer *ring, uint64_t *value) +{ + struct msm_gpu *gpu = ring->gpu; + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); + struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu); + + guard(spinlock_irqsave)(&a6xx_gpu->eval_lock); + + if (a6xx_in_preempt(a6xx_gpu) || ring != a6xx_gpu->cur_ring) + return -EBUSY; + + *value = gpu_read64(gpu, REG_A6XX_CP_ALWAYS_ON_CONTEXT); + + return 0; +} + static struct msm_ringbuffer *a6xx_active_ring(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); @@ -2555,6 +2576,7 @@ static const struct adreno_gpu_funcs funcs = { .get_rptr = a6xx_get_rptr, .progress = a6xx_progress, .sysprof_setup = a6xx_gmu_sysprof_setup, + .get_ctx_timestamp = a6xx_get_ctx_timestamp, }, .get_timestamp = a6xx_gmu_get_timestamp, }; @@ -2584,6 +2606,7 @@ static const struct adreno_gpu_funcs funcs_gmuwrapper = { .create_private_vm = a6xx_create_private_vm, .get_rptr = a6xx_get_rptr, .progress = a6xx_progress, + .get_ctx_timestamp = a6xx_get_ctx_timestamp, }, .get_timestamp = a6xx_get_timestamp, }; @@ -2616,6 +2639,7 @@ static const struct adreno_gpu_funcs funcs_a7xx = { .get_rptr = a6xx_get_rptr, .progress = a6xx_progress, .sysprof_setup = a6xx_gmu_sysprof_setup, + .get_ctx_timestamp = a6xx_get_ctx_timestamp, }, .get_timestamp = a6xx_gmu_get_timestamp, }; diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h index 0b17d36c36a9567e6afa4269ae7783ed3578e40e..7248d3d38c6d8a06cb4a536043bf4877179447cc 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h @@ -80,6 +80,7 @@ struct a6xx_gpu { struct drm_gem_object *preempt_postamble_bo; void *preempt_postamble_ptr; uint64_t preempt_postamble_iova; + uint64_t preempt_postamble_cntreset_end; uint64_t preempt_postamble_len; bool postamble_enabled; diff --git a/drivers/gpu/drm/msm/adreno/a6xx_preempt.c b/drivers/gpu/drm/msm/adreno/a6xx_preempt.c index afc5f4aa3b17334027f3c20072cc3f059a9733b7..88a65549fa8038d4836eb5aeaea775d679415315 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_preempt.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_preempt.c @@ -99,11 +99,15 @@ static void a6xx_preempt_timer(struct timer_list *t) kthread_queue_work(gpu->worker, &gpu->recover_work); } -static void preempt_prepare_postamble(struct a6xx_gpu *a6xx_gpu) +static void preempt_prepare_postamble(struct a6xx_gpu *a6xx_gpu, struct msm_ringbuffer *ring) { u32 *postamble = a6xx_gpu->preempt_postamble_ptr; + uint64_t last_active_ctxcycles; u32 count = 0; + if (ring) + last_active_ctxcycles = rbmemptr(ring, last_active_ctxcycles); + postamble[count++] = PKT7(CP_REG_RMW, 3); postamble[count++] = REG_A6XX_RBBM_PERFCTR_SRAM_INIT_CMD; postamble[count++] = 0; @@ -118,6 +122,15 @@ static void preempt_prepare_postamble(struct a6xx_gpu *a6xx_gpu) postamble[count++] = CP_WAIT_REG_MEM_4_MASK(0x1); postamble[count++] = CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(0); + a6xx_gpu->preempt_postamble_cntreset_end = count; + + postamble[count++] = PKT7(ring ? CP_REG_TO_MEM : CP_NOP, 3); + postamble[count++] = CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_CONTEXT) | + CP_REG_TO_MEM_0_CNT(2) | + CP_REG_TO_MEM_0_64B; + postamble[count++] = lower_32_bits(last_active_ctxcycles); + postamble[count++] = upper_32_bits(last_active_ctxcycles); + a6xx_gpu->preempt_postamble_len = count; a6xx_gpu->postamble_enabled = true; @@ -129,9 +142,9 @@ static void preempt_disable_postamble(struct a6xx_gpu *a6xx_gpu) /* * Disable the postamble by replacing the first packet header with a NOP - * that covers the whole buffer. + * that skips the counters reset part. */ - *postamble = PKT7(CP_NOP, (a6xx_gpu->preempt_postamble_len - 1)); + *postamble = PKT7(CP_NOP, (a6xx_gpu->preempt_postamble_cntreset_end - 1)); a6xx_gpu->postamble_enabled = false; } @@ -338,8 +351,8 @@ void a6xx_preempt_trigger(struct msm_gpu *gpu) /* Enable or disable postamble as needed */ sysprof = refcount_read(&a6xx_gpu->base.base.sysprof_active) > 1; - if (!sysprof && !a6xx_gpu->postamble_enabled) - preempt_prepare_postamble(a6xx_gpu); + if (!sysprof) + preempt_prepare_postamble(a6xx_gpu, ring); if (sysprof && a6xx_gpu->postamble_enabled) preempt_disable_postamble(a6xx_gpu); @@ -454,7 +467,7 @@ void a6xx_preempt_init(struct msm_gpu *gpu) gpu->vm, &a6xx_gpu->preempt_postamble_bo, &a6xx_gpu->preempt_postamble_iova); - preempt_prepare_postamble(a6xx_gpu); + preempt_prepare_postamble(a6xx_gpu, NULL); if (IS_ERR(a6xx_gpu->preempt_postamble_ptr)) goto fail; diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c index afaa3cfefd357dc0230994c8b5830a14c6d7a352..58f1e2a95bbfb00feb5a3bb91853e6bb533ec631 100644 --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c @@ -334,7 +334,8 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags, struct msm_gpu_fault_info fault_info = {}; /* Turn off the hangcheck timer to keep it from bothering us */ - timer_delete(&gpu->hangcheck_timer); + for (int i = 0; i < gpu->nr_rings; i++) + timer_delete(&gpu->rb[i]->hangcheck_timer); fault_info.ttbr0 = info->ttbr0; fault_info.iova = iova; diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c index 17759abc46d7d7af4117b1d71f1d5fba6ba0b61c..a3c5073aca1f65e450e0673262e8ca4bc7a5be6f 100644 --- a/drivers/gpu/drm/msm/msm_gpu.c +++ b/drivers/gpu/drm/msm/msm_gpu.c @@ -463,7 +463,9 @@ static void recover_worker(struct kthread_work *work) struct drm_device *dev = gpu->dev; struct msm_drm_private *priv = dev->dev_private; struct msm_gem_submit *submit; - struct msm_ringbuffer *cur_ring = gpu->funcs->active_ring(gpu); + struct msm_ringbuffer *cur_ring = gpu->hung_ring ? + gpu->hung_ring : gpu->funcs->active_ring(gpu); + gpu->hung_ring = NULL; char *comm = NULL, *cmd = NULL; struct task_struct *task; int i; @@ -613,11 +615,17 @@ void msm_gpu_fault_crashstate_capture(struct msm_gpu *gpu, struct msm_gpu_fault_ mutex_unlock(&gpu->lock); } -static void hangcheck_timer_reset(struct msm_gpu *gpu) +static void hangcheck_ring_timer_reset(struct msm_gpu *gpu, struct msm_ringbuffer *ring) { struct msm_drm_private *priv = gpu->dev->dev_private; - mod_timer(&gpu->hangcheck_timer, - round_jiffies_up(jiffies + msecs_to_jiffies(priv->hangcheck_period))); + mod_timer(&ring->hangcheck_timer, + round_jiffies_up(jiffies + msecs_to_jiffies(priv->hangcheck_period))); +} + +static void hangcheck_timer_reset(struct msm_gpu *gpu) +{ + for (int i = 0; i < gpu->nr_rings; i++) + hangcheck_ring_timer_reset(gpu, gpu->rb[i]); } static bool made_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring) @@ -635,11 +643,33 @@ static bool made_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring) return true; } +static bool check_ring_timeout(struct msm_ringbuffer *ring, unsigned long timeout) +{ + struct msm_gpu *gpu = ring->gpu; + struct msm_ringbuffer *curr_ring = gpu->funcs->active_ring(gpu); + u64 start, end; + int ret; + + if (!gpu->funcs->get_ctx_timestamp) + return !made_progress(gpu, ring); + + start = ring->memptrs->last_job_start_ctx; + + if (!gpu->funcs->get_ctx_timestamp(ring, &end)) + end = ring->memptrs->last_active_ctxcycles; + + if (end >= start) + return (end - start) < timeout; + else + return false; +} + static void hangcheck_handler(struct timer_list *t) { - struct msm_gpu *gpu = timer_container_of(gpu, t, hangcheck_timer); + struct msm_ringbuffer *ring = timer_container_of(ring, t, hangcheck_timer); + struct msm_gpu *gpu = ring->gpu; + struct msm_drm_private *priv = gpu->dev->dev_private; struct drm_device *dev = gpu->dev; - struct msm_ringbuffer *ring = gpu->funcs->active_ring(gpu); uint32_t fence = ring->memptrs->fence; if (fence != ring->hangcheck_fence) { @@ -647,7 +677,7 @@ static void hangcheck_handler(struct timer_list *t) ring->hangcheck_fence = fence; ring->hangcheck_progress_retries = 0; } else if (fence_before(fence, ring->fctx->last_fence) && - !made_progress(gpu, ring)) { + check_ring_timeout(ring, priv->hangcheck_period * 192000)) { /* no progress and not done.. hung! */ ring->hangcheck_fence = fence; ring->hangcheck_progress_retries = 0; @@ -658,6 +688,7 @@ static void hangcheck_handler(struct timer_list *t) DRM_DEV_ERROR(dev->dev, "%s: submitted fence: %u\n", gpu->name, ring->fctx->last_fence); + gpu->hung_ring = ring; kthread_queue_work(gpu->worker, &gpu->recover_work); } @@ -911,7 +942,7 @@ void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) submit->ring->cur_ctx_seqno = submit->queue->ctx->seqno; pm_runtime_put(&gpu->pdev->dev); - hangcheck_timer_reset(gpu); + hangcheck_ring_timer_reset(gpu, submit->ring); } /* @@ -1011,8 +1042,6 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, if (funcs->progress) priv->hangcheck_period /= 2; - timer_setup(&gpu->hangcheck_timer, hangcheck_handler, 0); - spin_lock_init(&gpu->perf_lock); @@ -1097,6 +1126,8 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev, goto fail; } + timer_setup(&gpu->rb[i]->hangcheck_timer, hangcheck_handler, 0); + memptrs += sizeof(struct msm_rbmemptrs); memptrs_iova += sizeof(struct msm_rbmemptrs); } diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h index a597f2bee30b6370ecc3639bfe1072c85993e789..7bf1b7f4bc4b61338bfa4c1463eb549f8c22d5c3 100644 --- a/drivers/gpu/drm/msm/msm_gpu.h +++ b/drivers/gpu/drm/msm/msm_gpu.h @@ -93,6 +93,7 @@ struct msm_gpu_funcs { */ bool (*progress)(struct msm_gpu *gpu, struct msm_ringbuffer *ring); void (*sysprof_setup)(struct msm_gpu *gpu); + int (*get_ctx_timestamp)(struct msm_ringbuffer *ring, uint64_t *value); }; /* Additional state for iommu faults: */ @@ -257,6 +258,8 @@ struct msm_gpu { /* work for handling GPU recovery: */ struct kthread_work recover_work; + struct msm_ringbuffer *hung_ring; + /** retire_event: notified when submits are retired: */ wait_queue_head_t retire_event; diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h index d1e49f701c8176e50d2b9a5cca35acee67f75209..316247fb089f26bd657ccf8464a5039e1cd1ac45 100644 --- a/drivers/gpu/drm/msm/msm_ringbuffer.h +++ b/drivers/gpu/drm/msm/msm_ringbuffer.h @@ -37,6 +37,8 @@ struct msm_rbmemptrs { volatile struct msm_gpu_submit_stats stats[MSM_GPU_SUBMIT_STATS_COUNT]; volatile u64 ttbr0; volatile u32 context_idr; + volatile u64 last_job_start_ctx; + volatile u64 last_active_ctxcycles; }; struct msm_cp_state { @@ -73,6 +75,10 @@ struct msm_ringbuffer { uint64_t memptrs_iova; struct msm_fence_context *fctx; + /* Hang and Inactivity Detection: + */ + struct timer_list hangcheck_timer; + /** * hangcheck_progress_retries: * -- 2.51.0
