amdgpu: Fix deadlock during GPU reset.

Liu, Monk Mon, 23 Oct 2017 00:24:45 -0700

For VM updates, the situation is same as commands submission, deadlock can also 
be solved by moving ttm_eu_fence_buffer_objects() ahead of push_job()
For TTM buffers moves can we pardon gpu_reset routine and wait till the TTM 
moves complete ?


-----Original Message-----
From: Christian König [mailto:[email protected]] 
Sent: 2017年10月23日 15:09
To: Liu, Monk <[email protected]>; Grodzovsky, Andrey 
<[email protected]>; [email protected]
Cc: Koenig, Christian <[email protected]>
Subject: Re: [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset.

We discussed that as well, problem is that this won't be sufficient.

We push to the kfifo not only during command submission, but also for VM 
updates and TTM buffers moves.

So we can still deadlock because of them.

Regards,
Christian.

Am 23.10.2017 um 05:03 schrieb Liu, Monk:
> Why not use a more simple way ?
>
> Like moving ttm_eu_fence_buffer_objects() to before 
> amd_sched_entity_push_job() ?
> That could solve the deadlock from your description
>
> And the push order is already guaranteed by context->mutex (which is 
> also a patch from you)
>
>
> BR Monk
>
> -----Original Message-----
> From: amd-gfx [mailto:[email protected]] On Behalf 
> Of Andrey Grodzovsky
> Sent: 2017年10月20日 21:32
> To: [email protected]
> Cc: Grodzovsky, Andrey <[email protected]>; Koenig, Christian 
> <[email protected]>
> Subject: [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset.
>
> Switch from kfifo to SPSC queue.
>
> Bug:
> Kfifo is limited at size, during GPU reset it would fill up to limit and the 
> pushing thread (producer) would wait for the scheduler worker to consume the 
> items in the fifo while holding reservation lock on a BO. The gpu reset 
> thread on the other hand blocks the scheduler during reset. Before it 
> unblocks the sceduler it might want to recover VRAM and so will try to 
> reserve the same BO the producer thread is already holding creating a 
> deadlock.
>
> Fix:
> Switch from kfifo to SPSC queue which is unlimited in size.
>
> Signed-off-by: Andrey Grodzovsky <[email protected]>
> ---
>   drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h |  4 +-
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.c   | 51 
> ++++++++++---------------
>   drivers/gpu/drm/amd/scheduler/gpu_scheduler.h   |  4 +-
>   3 files changed, 26 insertions(+), 33 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h 
> b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
> index 8bd3810..86838a8 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_sched_trace.h
> @@ -28,8 +28,8 @@ TRACE_EVENT(amd_sched_job,
>                          __entry->id = sched_job->id;
>                          __entry->fence = &sched_job->s_fence->finished;
>                          __entry->name = sched_job->sched->name;
> -                        __entry->job_count = kfifo_len(
> -                                &sched_job->s_entity->job_queue) / 
> sizeof(sched_job);
> +                        __entry->job_count = spsc_queue_count(
> +                                &sched_job->s_entity->job_queue);
>                          __entry->hw_job_count = atomic_read(
>                                  &sched_job->sched->hw_rq_count);
>                          ),
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c 
> b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> index 1bbbce2..0c9cdc0 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.c
> @@ -28,9 +28,14 @@
>   #include <drm/drmP.h>
>   #include "gpu_scheduler.h"
>   
> +#include "spsc_queue.h"
> +
>   #define CREATE_TRACE_POINTS
>   #include "gpu_sched_trace.h"
>   
> +#define to_amd_sched_job(sched_job)          \
> +             container_of((sched_job), struct amd_sched_job, queue_node)
> +
>   static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity);  
> static void amd_sched_wakeup(struct amd_gpu_scheduler *sched);  static void 
> amd_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb); @@ 
> -123,8 +128,6 @@ int amd_sched_entity_init(struct amd_gpu_scheduler *sched,
>                         struct amd_sched_rq *rq,
>                         uint32_t jobs)
>   {
> -     int r;
> -
>       if (!(sched && entity && rq))
>               return -EINVAL;
>   
> @@ -135,9 +138,7 @@ int amd_sched_entity_init(struct amd_gpu_scheduler 
> *sched,
>   
>       spin_lock_init(&entity->rq_lock);
>       spin_lock_init(&entity->queue_lock);
> -     r = kfifo_alloc(&entity->job_queue, jobs * sizeof(void *), GFP_KERNEL);
> -     if (r)
> -             return r;
> +     spsc_queue_init(&entity->job_queue);
>   
>       atomic_set(&entity->fence_seq, 0);
>       entity->fence_context = dma_fence_context_alloc(2); @@ -170,7 +171,7 @@ 
> static bool amd_sched_entity_is_initialized(struct amd_gpu_scheduler *sched,  
> static bool amd_sched_entity_is_idle(struct amd_sched_entity *entity)  {
>       rmb();
> -     if (kfifo_is_empty(&entity->job_queue))
> +     if (spsc_queue_peek(&entity->job_queue) == NULL)
>               return true;
>   
>       return false;
> @@ -185,7 +186,7 @@ static bool amd_sched_entity_is_idle(struct 
> amd_sched_entity *entity)
>    */
>   static bool amd_sched_entity_is_ready(struct amd_sched_entity *entity)  {
> -     if (kfifo_is_empty(&entity->job_queue))
> +     if (spsc_queue_peek(&entity->job_queue) == NULL)
>               return false;
>   
>       if (ACCESS_ONCE(entity->dependency)) @@ -227,7 +228,7 @@ void 
> amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
>                */
>               kthread_park(sched->thread);
>               kthread_unpark(sched->thread);
> -             while (kfifo_out(&entity->job_queue, &job, sizeof(job))) {
> +             while ((job = 
> +to_amd_sched_job(spsc_queue_pop(&entity->job_queue))))
> +{
>                       struct amd_sched_fence *s_fence = job->s_fence;
>                       amd_sched_fence_scheduled(s_fence);
>                       dma_fence_set_error(&s_fence->finished, -ESRCH); @@ 
> -235,9 +236,7 @@ void amd_sched_entity_fini(struct amd_gpu_scheduler *sched,
>                       dma_fence_put(&s_fence->finished);
>                       sched->ops->free_job(job);
>               }
> -
>       }
> -     kfifo_free(&entity->job_queue);
>   }
>   
>   static void amd_sched_entity_wakeup(struct dma_fence *f, struct 
> dma_fence_cb *cb) @@ -332,18 +331,21 @@ static bool 
> amd_sched_entity_add_dependency_cb(struct amd_sched_entity *entity)  }
>   
>   static struct amd_sched_job *
> -amd_sched_entity_peek_job(struct amd_sched_entity *entity)
> +amd_sched_entity_pop_job(struct amd_sched_entity *entity)
>   {
>       struct amd_gpu_scheduler *sched = entity->sched;
> -     struct amd_sched_job *sched_job;
> +     struct amd_sched_job *sched_job = to_amd_sched_job(
> +                                             
> spsc_queue_peek(&entity->job_queue));
>   
> -     if (!kfifo_out_peek(&entity->job_queue, &sched_job, sizeof(sched_job)))
> +     if (!sched_job)
>               return NULL;
>   
>       while ((entity->dependency = sched->ops->dependency(sched_job)))
>               if (amd_sched_entity_add_dependency_cb(entity))
>                       return NULL;
>   
> +     sched_job->s_entity = NULL;
> +     spsc_queue_pop(&entity->job_queue);
>       return sched_job;
>   }
>   
> @@ -354,18 +356,14 @@ amd_sched_entity_peek_job(struct amd_sched_entity 
> *entity)
>    *
>    * Returns true if we could submit the job.
>    */
> -static bool amd_sched_entity_in(struct amd_sched_job *sched_job)
> +static void amd_sched_entity_in(struct amd_sched_job *sched_job)
>   {
>       struct amd_gpu_scheduler *sched = sched_job->sched;
>       struct amd_sched_entity *entity = sched_job->s_entity;
> -     bool added, first = false;
> +     bool first = false;
>   
>       spin_lock(&entity->queue_lock);
> -     added = kfifo_in(&entity->job_queue, &sched_job,
> -                     sizeof(sched_job)) == sizeof(sched_job);
> -
> -     if (added && kfifo_len(&entity->job_queue) == sizeof(sched_job))
> -             first = true;
> +     first = spsc_queue_push(&entity->job_queue, &sched_job->queue_node);
>   
>       spin_unlock(&entity->queue_lock);
>   
> @@ -377,7 +375,6 @@ static bool amd_sched_entity_in(struct amd_sched_job 
> *sched_job)
>               spin_unlock(&entity->rq_lock);
>               amd_sched_wakeup(sched);
>       }
> -     return added;
>   }
>   
>   /* job_finish is called after hw fence signaled @@ -514,11 +511,8 @@ void 
> amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
>    */
>   void amd_sched_entity_push_job(struct amd_sched_job *sched_job)  {
> -     struct amd_sched_entity *entity = sched_job->s_entity;
> -
>       trace_amd_sched_job(sched_job);
> -     wait_event(entity->sched->job_scheduled,
> -                amd_sched_entity_in(sched_job));
> +     amd_sched_entity_in(sched_job);
>   }
>   
>   /* init a sched_job with basic field */ @@ -611,7 +605,7 @@ static int 
> amd_sched_main(void *param)  {
>       struct sched_param sparam = {.sched_priority = 1};
>       struct amd_gpu_scheduler *sched = (struct amd_gpu_scheduler *)param;
> -     int r, count;
> +     int r;
>   
>       sched_setscheduler(current, SCHED_FIFO, &sparam);
>   
> @@ -629,7 +623,7 @@ static int amd_sched_main(void *param)
>               if (!entity)
>                       continue;
>   
> -             sched_job = amd_sched_entity_peek_job(entity);
> +             sched_job = amd_sched_entity_pop_job(entity);
>               if (!sched_job)
>                       continue;
>   
> @@ -656,9 +650,6 @@ static int amd_sched_main(void *param)
>                       amd_sched_process_job(NULL, &s_fence->cb);
>               }
>   
> -             count = kfifo_out(&entity->job_queue, &sched_job,
> -                             sizeof(sched_job));
> -             WARN_ON(count != sizeof(sched_job));
>               wake_up(&sched->job_scheduled);
>       }
>       return 0;
> diff --git a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h 
> b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> index 3f75b45..8844ce7 100644
> --- a/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> +++ b/drivers/gpu/drm/amd/scheduler/gpu_scheduler.h
> @@ -26,6 +26,7 @@
>   
>   #include <linux/kfifo.h>
>   #include <linux/dma-fence.h>
> +#include "spsc_queue.h"
>   
>   struct amd_gpu_scheduler;
>   struct amd_sched_rq;
> @@ -56,7 +57,7 @@ struct amd_sched_entity {
>       struct amd_gpu_scheduler        *sched;
>   
>       spinlock_t                      queue_lock;
> -     struct kfifo                    job_queue;
> +     struct spsc_queue       job_queue;
>   
>       atomic_t                        fence_seq;
>       uint64_t                        fence_context;
> @@ -87,6 +88,7 @@ struct amd_sched_fence {  };
>   
>   struct amd_sched_job {
> +     struct spsc_node queue_node;
>       struct amd_gpu_scheduler        *sched;
>       struct amd_sched_entity         *s_entity;
>       struct amd_sched_fence          *s_fence;
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> [email protected]
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
> _______________________________________________
> amd-gfx mailing list
> [email protected]
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx


_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

RE: [PATCH 3/3] drm/amdgpu: Fix deadlock during GPU reset.

Reply via email to