scheduler:imple job skip feature(v3)

Liu, Monk Mon, 30 Oct 2017 03:34:01 -0700

Yeah, my typo, thanks !

-----Original Message-----
From: Christian König [mailto:[email protected]] 
Sent: 2017年10月30日 18:00
To: Liu, Monk <[email protected]>; [email protected]
Subject: Re: [PATCH 1/6] amd/scheduler:imple job skip feature(v3)


Am 30.10.2017 um 05:15 schrieb Monk Liu:
> jobs are skipped under two cases
> 1)when the entity behind this job marked guilty, the job poped from 
> this entity's queue will be dropped in sched_main loop.
>
> 2)in job_recovery(), skip the scheduling job if its karma detected 
> above limit, and also skipped as well for other jobs sharing the same 
> fence context. this approach is becuase job_recovery() cannot access 
> job->entity due to entity may already dead.
>
> v2:
> some logic fix
>
> v3:
> when entity detected guilty, don't drop the job in the poping stage, 
> instead set its fence error as -ECANCELED
>
> in run_job(), skip the scheduling either:1) fence->error < 0 or 2) 
> there was a VRAM LOST occurred on this job.
> this way we can unify the job skipping logic.
>
> with this feature we can introduce new gpu recover feature.
>
> Change-Id: I268b1c752c94e6ecd4ea78c87eb226ea3f52908a
> Signed-off-by: Monk Liu <[email protected]>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       | 13 +++++----
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.c | 39 
> ++++++++++++++++-----------
>   2 files changed, 31 insertions(+), 21 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index f60662e..0a90c76 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -180,7 +180,7 @@ static struct dma_fence 
> *amdgpu_job_dependency(struct amd_sched_job *sched_job,
>   
>   static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
>   {
> -     struct dma_fence *fence = NULL;
> +     struct dma_fence *fence = NULL, *finished;
>       struct amdgpu_device *adev;
>       struct amdgpu_job *job;
>       int r;
> @@ -190,15 +190,18 @@ static struct dma_fence *amdgpu_job_run(struct 
> amd_sched_job *sched_job)
>               return NULL;
>       }
>       job = to_amdgpu_job(sched_job);
> +     finished = &job->base.s_fence->finished;
>       adev = job->adev;
>   
>       BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
>   
>       trace_amdgpu_sched_run_job(job);
> -     /* skip ib schedule when vram is lost */
> -     if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) {
> -             dma_fence_set_error(&job->base.s_fence->finished, -ECANCELED);
> -             DRM_ERROR("Skip scheduling IBs!\n");
> +
> +     if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
> +             dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if 
> +VRAM lost */
> +
> +     if (finished->error < 0) {
> +             DRM_INFO("Skip scheduling IBs!\n");
>       } else {
>               r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job,
>                                      &fence);
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c 
> b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> index 903ef8b..3d8c994 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> @@ -344,6 +344,10 @@ amd_sched_entity_pop_job(struct amd_sched_entity *entity)
>               if (amd_sched_entity_add_dependency_cb(entity))
>                       return NULL;
>   
> +     /* skip jobs from entity that marked guilty */
> +     if (entity->guilty && atomic_read(entity->guilty))
> +             dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);
> +
>       spsc_queue_pop(&entity->job_queue);
>       return sched_job;
>   }
> @@ -440,14 +444,6 @@ static void amd_sched_job_timedout(struct work_struct 
> *work)
>       job->sched->ops->timedout_job(job);
>   }
>   
> -static void amd_sched_set_guilty(struct amd_sched_job *s_job,
> -                              struct amd_sched_entity *s_entity)
> -{
> -     if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
> -             if (s_entity->guilty)
> -                     atomic_set(s_entity->guilty, 1);
> -}
> -
>   void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct 
> amd_sched_job *bad)
>   {
>       struct amd_sched_job *s_job;
> @@ -467,21 +463,24 @@ void amd_sched_hw_job_reset(struct amd_gpu_scheduler 
> *sched, struct amd_sched_jo
>       spin_unlock(&sched->job_list_lock);
>   
>       if (bad) {
> -             bool found = false;
> -
> -             for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; 
> i++ ) {
> +             /* don't increase @bad's karma if it's from KERNEL RQ,
> +              * becuase sometimes GPU hang would cause kernel jobs (like VM 
> updating jobs)
> +              * corrupt but keep in mind that kernel jobs always considered 
> good.
> +              */
> +             for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_KERNEL; 
> i++ 
> +) {
>                       struct amd_sched_rq *rq = &sched->sched_rq[i];
>   
>                       spin_lock(&rq->lock);
>                       list_for_each_entry_safe(entity, tmp, &rq->entities, 
> list) {
>                               if (bad->s_fence->scheduled.context == 
> entity->fence_context) {
> -                                     found = true;
> -                                     amd_sched_set_guilty(bad, entity);
> +                                 if (atomic_inc_return(&bad->karma) > 
> bad->sched->hang_limit)
> +                                             if (entity->guilty)
> +                                                     
> atomic_set(entity->guilty, 1);
>                                       break;
>                               }
>                       }
>                       spin_unlock(&rq->lock);
> -                     if (found)
> +                     if (&entity->list == &rq->entities)

That needs to be "&entity->list != &rq->entities", or otherwise we only check 
on the first round.

With that fixed the patch is Reviewed-by: Christian König 
<[email protected]>.

Nice work,
Christian.

>                               break;
>               }
>       }
> @@ -499,6 +498,7 @@ void amd_sched_job_kickout(struct amd_sched_job *s_job)
>   void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
>   {
>       struct amd_sched_job *s_job, *tmp;
> +     bool found_guilty = false;
>       int r;
>   
>       spin_lock(&sched->job_list_lock);
> @@ -510,6 +510,15 @@ void amd_sched_job_recovery(struct amd_gpu_scheduler 
> *sched)
>       list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
>               struct amd_sched_fence *s_fence = s_job->s_fence;
>               struct dma_fence *fence;
> +             uint64_t guilty_context;
> +
> +             if (!found_guilty && atomic_read(&s_job->karma) > 
> sched->hang_limit) {
> +                     found_guilty = true;
> +                     guilty_context = s_job->s_fence->scheduled.context;
> +             }
> +
> +             if (found_guilty && s_job->s_fence->scheduled.context == 
> guilty_context)
> +                     dma_fence_set_error(&s_fence->finished, -ECANCELED);
>   
>               spin_unlock(&sched->job_list_lock);
>               fence = sched->ops->run_job(s_job); @@ -525,7 +534,6 @@ void 
> amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
>                                         r);
>                       dma_fence_put(fence);
>               } else {
> -                     DRM_ERROR("Failed to run job!\n");
>                       amd_sched_process_job(NULL, &s_fence->cb);
>               }
>               spin_lock(&sched->job_list_lock);
> @@ -663,7 +671,6 @@ static int amd_sched_main(void *param)
>                                         r);
>                       dma_fence_put(fence);
>               } else {
> -                     DRM_ERROR("Failed to run job!\n");
>                       amd_sched_process_job(NULL, &s_fence->cb);
>               }
>   


_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH 1/6] amd/scheduler:imple job skip feature(v3)

Reply via email to