How would the APP/UMD be aware that a context is guilty, or that it has
triggered too many hang/recovery loops?

And why would the APP/UMD voluntarily call amdgpu_ctx_query() to check whether
a GPU reset occurred or not?

Please be aware that for another CSP customer this "loose mode" is 100% welcome
and wanted by them. And more importantly:

This approach doesn't involve the X server at all; only the guilty
process/context is rejected when it submits.
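To make the effect concrete, under loose mode the guilty application simply
sees its next submission fail with -ENODEV. A minimal userspace sketch,
assuming the libdrm amdgpu wrapper (the recovery policy itself is up to the
app):

#include <errno.h>
#include <amdgpu.h>	/* libdrm_amdgpu */

/* Sketch only: a context flagged guilty during the reset gets -ENODEV
 * back from the CS ioctl, so the app drops it and continues with a
 * fresh context; every other client keeps running untouched. */
static int submit_or_recreate(amdgpu_device_handle dev,
			      amdgpu_context_handle *ctx,
			      struct amdgpu_cs_request *req)
{
	int r = amdgpu_cs_submit(*ctx, 0, req, 1);

	if (r == -ENODEV) {
		amdgpu_cs_ctx_free(*ctx);
		r = amdgpu_cs_ctx_create(dev, ctx);
	}
	return r;
}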


I don't agree that we should rely on amdgpu_ctx_query(); no one is obliged to
call it from time to time.
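For reference, "relying on ctx_query" means every UMD has to poll the CTX
ioctl on its own initiative, along the lines of the sketch below (using the
uapi from amdgpu_drm.h; error handling trimmed). If the app never makes this
call, it never learns that a reset happened:

#include <string.h>
#include <stdint.h>
#include <xf86drm.h>	/* drmCommandWriteRead(), from libdrm */
#include <amdgpu_drm.h>	/* union drm_amdgpu_ctx, AMDGPU_CTX_* */

/* Voluntary poll of a context's reset state; reset_status comes back
 * as AMDGPU_CTX_NO_RESET, AMDGPU_CTX_GUILTY_RESET,
 * AMDGPU_CTX_INNOCENT_RESET or AMDGPU_CTX_UNKNOWN_RESET. */
static int query_reset_status(int fd, uint32_t ctx_id, uint32_t *reset_status)
{
	union drm_amdgpu_ctx args;
	int r;

	memset(&args, 0, sizeof(args));
	args.in.op = AMDGPU_CTX_OP_QUERY_STATE;
	args.in.ctx_id = ctx_id;

	r = drmCommandWriteRead(fd, DRM_AMDGPU_CTX, &args, sizeof(args));
	if (r)
		return r;

	*reset_status = args.out.state.reset_status;
	return 0;
}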



-----Original Message-----
From: Christian König [mailto:[email protected]] 
Sent: October 9, 2017 16:28
To: Liu, Monk <[email protected]>; [email protected]
Subject: Re: [PATCH 10/12] drm/amdgpu/sriov:implement guilty ctx for loose reset

On 30.09.2017 at 08:03, Monk Liu wrote:
> Change-Id: I7904f362aa0f578a5cbf5d40c7a242c2c6680a92
> Signed-off-by: Monk Liu <[email protected]>

NAK, whether a context is guilty of a GPU reset should be determined in
amdgpu_ctx_query() by looking at the fences in the ring buffer.

Not when the GPU reset itself occurs.

Regards,
Christian.
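
For reference, determining guilt lazily in amdgpu_ctx_query() would look
roughly like the hypothetical helper below; it walks the fences the context
still holds and reports guilt from the fence errors (a sketch against the
4.14-era amdgpu_ctx structures, not the actual implementation):

/* Hypothetical: amd_sched_job_kickout() in the patch below stamps
 * -ETIME on the kicked-out job's finished fence, so amdgpu_ctx_query()
 * could detect a guilty context by finding a fence with a non-zero
 * error in the context's per-ring fence arrays. */
static bool amdgpu_ctx_has_faulted_fence(struct amdgpu_ctx *ctx)
{
	unsigned i, j;

	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
		struct amdgpu_ctx_ring *cring = &ctx->rings[i];

		for (j = 0; j < amdgpu_sched_jobs; ++j) {
			struct dma_fence *fence = cring->fences[j];

			if (fence && fence->error)
				return true;
		}
	}
	return false;
}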

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        | 16 +++++++++-------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c       |  1 +
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 22 ++++++++++++++++++++++
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.h |  1 +
>   5 files changed, 34 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index b40d4ba..b63e602 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -737,6 +737,7 @@ struct amdgpu_ctx {
>       struct dma_fence        **fences;
>       struct amdgpu_ctx_ring  rings[AMDGPU_MAX_RINGS];
>       bool preamble_presented;
> +     bool guilty;
>   };
>   
>   struct amdgpu_ctx_mgr {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 6a1515e..f92962e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -79,16 +79,19 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
>       if (cs->in.num_chunks == 0)
>               return 0;
>   
> +     p->ctx = amdgpu_ctx_get(fpriv, cs->in.ctx_id);
> +     if (!p->ctx)
> +             return -EINVAL;
> +
> +     if (amdgpu_sriov_vf(p->adev) &&
> +             amdgpu_sriov_reset_level == 0 &&
> +             p->ctx->guilty)
> +             return -ENODEV;
> +
>       chunk_array = kmalloc_array(cs->in.num_chunks, sizeof(uint64_t), GFP_KERNEL);
>       if (!chunk_array)
>               return -ENOMEM;
>   
> -     p->ctx = amdgpu_ctx_get(fpriv, cs->in.ctx_id);
> -     if (!p->ctx) {
> -             ret = -EINVAL;
> -             goto free_chunk;
> -     }
> -
>       /* get chunks */
>       chunk_array_user = u64_to_user_ptr(cs->in.chunks);
>       if (copy_from_user(chunk_array, chunk_array_user,
> @@ -184,7 +187,6 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, void *data)
>       p->nchunks = 0;
>   put_ctx:
>       amdgpu_ctx_put(p->ctx);
> -free_chunk:
>       kfree(chunk_array);
>   
>       return ret;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index 75c933b..028e9f1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -60,6 +60,7 @@ static int amdgpu_ctx_init(struct amdgpu_device *adev, struct amdgpu_ctx *ctx)
>                                         rq, amdgpu_sched_jobs);
>               if (r)
>                       goto failed;
> +             ctx->rings[i].entity.guilty = &ctx->guilty;
>       }
>   
>       r = amdgpu_queue_mgr_init(adev, &ctx->queue_mgr);
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> index 12c3092..89b0573 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> @@ -493,10 +493,32 @@ void amd_sched_set_queue_hang(struct amd_gpu_scheduler *sched)
>   void amd_sched_job_kickout(struct amd_sched_job *s_job)
>   {
>       struct amd_gpu_scheduler *sched = s_job->sched;
> +     struct amd_sched_entity *entity, *tmp;
> +     struct amd_sched_rq *rq;
> +     int i;
> +     bool found = false;
>   
>       spin_lock(&sched->job_list_lock);
>       list_del_init(&s_job->node);
>       spin_unlock(&sched->job_list_lock);
> +
> +     dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
> +
> +     for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_KERNEL; i++) {
> +             rq = &sched->sched_rq[i];
> +
> +             spin_lock(&rq->lock);
> +             list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
> +                     if (s_job->s_entity == entity && entity->guilty) {
> +                             *entity->guilty = true;
> +                             found = true;
> +                             break;
> +                     }
> +             }
> +             spin_unlock(&rq->lock);
> +             if (found)
> +                     break;
> +     }
>   }
>   
>   void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> index f0242aa..16c2244 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> @@ -49,6 +49,7 @@ struct amd_sched_entity {
>   
>       struct dma_fence                *dependency;
>       struct dma_fence_cb             cb;
> +     bool *guilty; /* points to the owning amdgpu_ctx's guilty flag */
>   };
>   
>   /**

