On Tue, Nov 18, 2025 at 01:17:22PM -0800, Niranjana Vishwanathapura wrote:
> On Tue, Nov 18, 2025 at 09:59:32AM -0800, Matthew Brost wrote:
> > On Mon, Nov 17, 2025 at 10:39:42PM -0800, Niranjana Vishwanathapura wrote:
> > > On Thu, Oct 16, 2025 at 01:48:23PM -0700, Matthew Brost wrote:
> > > > Use new pending job list iterator and new helper functions in Xe to
> > > > avoid reaching into DRM scheduler internals.
> > > >
> > > > Part of this change involves removing pending jobs debug information
> > > > from debugfs and devcoredump. As agreed, the pending job list should
> > > > only be accessed when the scheduler is stopped. However, it's not
> > > > straightforward to determine whether the scheduler is stopped from the
> > > > shared debugfs/devcoredump code path. Additionally, the pending job list
> > > > provides little useful information, as pending jobs can be inferred from
> > > > seqnos and ring head/tail positions. Therefore, this debug information
> > > > is being removed.
> > > >
> > > > Signed-off-by: Matthew Brost <[email protected]>
> > > > ---
> > > > drivers/gpu/drm/xe/xe_gpu_scheduler.c    |  4 +-
> > > > drivers/gpu/drm/xe/xe_gpu_scheduler.h    | 34 +++--------
> > > > drivers/gpu/drm/xe/xe_guc_submit.c       | 74 ++++--------------------
> > > > drivers/gpu/drm/xe/xe_guc_submit_types.h | 11 ----
> > > > drivers/gpu/drm/xe/xe_hw_fence.c         | 16 -----
> > > > drivers/gpu/drm/xe/xe_hw_fence.h         |  2 -
> > > > 6 files changed, 20 insertions(+), 121 deletions(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/xe/xe_gpu_scheduler.c b/drivers/gpu/drm/xe/xe_gpu_scheduler.c
> > > > index f4f23317191f..9c8004d5dd91 100644
> > > > --- a/drivers/gpu/drm/xe/xe_gpu_scheduler.c
> > > > +++ b/drivers/gpu/drm/xe/xe_gpu_scheduler.c
> > > > @@ -7,7 +7,7 @@
> > > >
> > > > static void xe_sched_process_msg_queue(struct xe_gpu_scheduler *sched)
> > > > {
> > > > -       if (!READ_ONCE(sched->base.pause_submit))
> > > > +       if (!drm_sched_is_stopped(&sched->base))
> > > >                 queue_work(sched->base.submit_wq, &sched->work_process_msg);
> > > > }
> > > >
> > > > @@ -43,7 +43,7 @@ static void xe_sched_process_msg_work(struct work_struct *w)
> > > >                 container_of(w, struct xe_gpu_scheduler, work_process_msg);
> > > >         struct xe_sched_msg *msg;
> > > >
> > > > -       if (READ_ONCE(sched->base.pause_submit))
> > > > +       if (drm_sched_is_stopped(&sched->base))
> > > >                 return;
> > > >
> > > >         msg = xe_sched_get_msg(sched);
> > > > diff --git a/drivers/gpu/drm/xe/xe_gpu_scheduler.h b/drivers/gpu/drm/xe/xe_gpu_scheduler.h
> > > > index b971b6b69419..583372a78140 100644
> > > > --- a/drivers/gpu/drm/xe/xe_gpu_scheduler.h
> > > > +++ b/drivers/gpu/drm/xe/xe_gpu_scheduler.h
> > > > @@ -55,14 +55,10 @@ static inline void xe_sched_resubmit_jobs(struct xe_gpu_scheduler *sched)
> > > > {
> > > >         struct drm_sched_job *s_job;
> > > >
> > > > -       list_for_each_entry(s_job, &sched->base.pending_list, list) {
> > > > -               struct drm_sched_fence *s_fence = s_job->s_fence;
> > > > -               struct dma_fence *hw_fence = s_fence->parent;
> > > > -
> > > > +       drm_sched_for_each_pending_job(s_job, &sched->base, NULL)
> > > >                 if (to_xe_sched_job(s_job)->skip_emit ||
> > > > -                   (hw_fence && !dma_fence_is_signaled(hw_fence)))
> > > > +                   !drm_sched_job_is_signaled(s_job))
> > > >                         sched->base.ops->run_job(s_job);
> > > > -       }
> > > > }
> > > >
> > > > static inline bool
> > > > @@ -71,14 +67,6 @@ xe_sched_invalidate_job(struct xe_sched_job *job, int threshold)
> > > >         return drm_sched_invalidate_job(&job->drm, threshold);
> > > > }
> > > >
> > > > -static inline void xe_sched_add_pending_job(struct xe_gpu_scheduler *sched,
> > > > -                                           struct xe_sched_job *job)
> > > > -{
> > > > -       spin_lock(&sched->base.job_list_lock);
> > > > -       list_add(&job->drm.list, &sched->base.pending_list);
> > > > -       spin_unlock(&sched->base.job_list_lock);
> > > > -}
> > > > -
> > > > /**
> > > >  * xe_sched_first_pending_job() - Find first pending job which is unsignaled
> > > >  * @sched: Xe GPU scheduler
> > > > @@ -88,21 +76,13 @@ static inline void xe_sched_add_pending_job(struct xe_gpu_scheduler *sched,
> > > > static inline
> > > > struct xe_sched_job *xe_sched_first_pending_job(struct xe_gpu_scheduler *sched)
> > > > {
> > > > -       struct xe_sched_job *job, *r_job = NULL;
> > > > -
> > > > -       spin_lock(&sched->base.job_list_lock);
> > > > -       list_for_each_entry(job, &sched->base.pending_list, drm.list) {
> > > > -               struct drm_sched_fence *s_fence = job->drm.s_fence;
> > > > -               struct dma_fence *hw_fence = s_fence->parent;
> > > > +       struct drm_sched_job *job;
> > > >
> > > > -               if (hw_fence && !dma_fence_is_signaled(hw_fence)) {
> > > > -                       r_job = job;
> > > > -                       break;
> > > > -               }
> > > > -       }
> > > > -       spin_unlock(&sched->base.job_list_lock);
> > > > +       drm_sched_for_each_pending_job(job, &sched->base, NULL)
> > > > +               if (!drm_sched_job_is_signaled(job))
> > > > +                       return to_xe_sched_job(job);
> > > >
> > > > -       return r_job;
> > > > +       return NULL;
> > > > }
> > > >
> > > > static inline int
> > > > diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> > > > index 0ef67d3523a7..680696efc434 100644
> > > > --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> > > > +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> > > > @@ -1032,7 +1032,7 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
> > > >         struct xe_exec_queue *q = ge->q;
> > > >         struct xe_guc *guc = exec_queue_to_guc(q);
> > > >         struct xe_gpu_scheduler *sched = &ge->sched;
> > > > -       struct xe_sched_job *job;
> > > > +       struct drm_sched_job *job;
> > > >         bool wedged = false;
> > > >
> > > >         xe_gt_assert(guc_to_gt(guc), xe_exec_queue_is_lr(q));
> > > > @@ -1091,16 +1091,10 @@ static void xe_guc_exec_queue_lr_cleanup(struct work_struct *w)
> > > >         if (!exec_queue_killed(q) && !xe_lrc_ring_is_idle(q->lrc[0]))
> > > >                 xe_devcoredump(q, NULL, "LR job cleanup, guc_id=%d", q->guc->id);
> > > >
> > > > -       xe_hw_fence_irq_stop(q->fence_irq);
> > > > +       drm_sched_for_each_pending_job(job, &sched->base, NULL)
> > > > +               xe_sched_job_set_error(to_xe_sched_job(job), -ECANCELED);
> > > >
> > > >         xe_sched_submission_start(sched);
> > > > -
> > > > -       spin_lock(&sched->base.job_list_lock);
> > > > -       list_for_each_entry(job, &sched->base.pending_list, drm.list)
> > > > -               xe_sched_job_set_error(job, -ECANCELED);
> > > > -       spin_unlock(&sched->base.job_list_lock);
> > > > -
> > > > -       xe_hw_fence_irq_start(q->fence_irq);
> > > > }
> > > >
> > > > #define ADJUST_FIVE_PERCENT(__t)        mul_u64_u32_div(__t, 105, 100)
> > > > @@ -1219,7 +1213,7 @@ static enum drm_gpu_sched_stat
> > > > guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> > > > {
> > > >         struct xe_sched_job *job = to_xe_sched_job(drm_job);
> > > > -       struct xe_sched_job *tmp_job;
> > > > +       struct drm_sched_job *tmp_job;
> > > >         struct xe_exec_queue *q = job->q;
> > > >         struct xe_gpu_scheduler *sched = &q->guc->sched;
> > > >         struct xe_guc *guc = exec_queue_to_guc(q);
> > > > @@ -1228,7 +1222,6 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> > > >         unsigned int fw_ref;
> > > >         int err = -ETIME;
> > > >         pid_t pid = -1;
> > > > -       int i = 0;
> > > >         bool wedged = false, skip_timeout_check;
> > > >
> > > >         xe_gt_assert(guc_to_gt(guc), !xe_exec_queue_is_lr(q));
> > > > @@ -1395,28 +1388,15 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> > > >                 __deregister_exec_queue(guc, q);
> > > >         }
> > > >
> > > > -       /* Stop fence signaling */
> > > > -       xe_hw_fence_irq_stop(q->fence_irq);
> > > > +       /* Mark all outstanding jobs as bad, thus completing them */
> > > > +       xe_sched_job_set_error(job, err);
> > > 
> > > Setting the error on this timed-out job is newly added.
> > > Why was it not there before, and why is it being added now?
> > > 
> > 
> > Because the TDR job was added back into the pending list first, we did
> > in fact set the error on the job before -- via the list walk.
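> > 
> > Roughly, old flow vs. new flow in guc_exec_queue_timedout_job() (a
> > simplified sketch pulled from the hunks quoted below, not the exact
> > code):
> > 
> > 	/* Old: re-add the TDR job at the head of the pending list, then
> > 	 * the list walk sets err on it and -ECANCELED on the rest.
> > 	 */
> > 	xe_sched_add_pending_job(sched, job);
> > 	list_for_each_entry(tmp_job, &sched->base.pending_list, drm.list)
> > 		xe_sched_job_set_error(tmp_job, !i++ ? err : -ECANCELED);
> > 
> > 	/* New: the scheduler core pops the timed-out job off the pending
> > 	 * list before calling timedout_job(), so it is no longer covered
> > 	 * by the pending-job walk and gets its error set explicitly.
> > 	 */
> > 	xe_sched_job_set_error(job, err);
> > 	drm_sched_for_each_pending_job(tmp_job, &sched->base, NULL)
> > 		xe_sched_job_set_error(to_xe_sched_job(tmp_job), -ECANCELED);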
> > 
> 
> Ok, got it. Thanks.
> 
> > > > +       drm_sched_for_each_pending_job(tmp_job, &sched->base, NULL)
> > > > +               xe_sched_job_set_error(to_xe_sched_job(tmp_job), -ECANCELED);
> > > >
> > > > -       /*
> > > > -        * Fence state now stable, stop / start scheduler which cleans up any
> > > > -        * fences that are complete
> > > > -        */
> > > > -       xe_sched_add_pending_job(sched, job);
> > > 
> > > Why was xe_sched_add_pending_job() there before?
> > > 
> > 
> > We (the DRM scheduler maintainers) agreed that drivers shouldn't touch
> > the pending list; returning DRM_GPU_SCHED_STAT_NO_HANG below defers this
> > step (re-adding the job) to the DRM scheduler core.
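> > 
> > For reference, my understanding of what the core does with that status
> > in drm_sched_job_timedout() -- a sketch from memory, so the helper name
> > may not be exact:
> > 
> > 	/* The core removes the timed-out job from the pending list
> > 	 * before calling back into the driver.
> > 	 */
> > 	list_del_init(&job->list);
> > 	...
> > 	status = job->sched->ops->timedout_job(job);
> > 	...
> > 	if (status == DRM_GPU_SCHED_STAT_NO_HANG)
> > 		/* Core puts the job back on the pending list for us */
> > 		drm_sched_job_reinsert_on_false_timeout(sched, job);
> > 
> > So with NO_HANG the re-add happens in the core and Xe no longer needs
> > its own xe_sched_add_pending_job().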
> > 
> > > >         xe_sched_submission_start(sched);
> > > > -
> > > >         xe_guc_exec_queue_trigger_cleanup(q);
> > > 
> > > Why do we need to trigger cleanup again here?
> > > 
> > 
> > This is existing code, and it should only be called once in this
> > function. At this point we don't know whether the TDR fired naturally
> > with a normal timeout value or whether we are already in the process of
> > cleaning up. If it is the former, we switch to immediate-cleanup mode,
> > which is why this call is needed.
> > 
> > > >
> > > > -       /* Mark all outstanding jobs as bad, thus completing them */
> > > > -       spin_lock(&sched->base.job_list_lock);
> > > > -               list_for_each_entry(tmp_job, &sched->base.pending_list, drm.list)
> > > > -               xe_sched_job_set_error(tmp_job, !i++ ? err : -ECANCELED);
> > > > -       spin_unlock(&sched->base.job_list_lock);
> > > > -
> > > > -       /* Start fence signaling */
> > > > -       xe_hw_fence_irq_start(q->fence_irq);
> > > > -
> > > > -       return DRM_GPU_SCHED_STAT_RESET;
> > > > +       return DRM_GPU_SCHED_STAT_NO_HANG;
> > > 
> > > This is error case. So, why return is changed to NO_HANG?
> > > 
> > 
> > See above; this is how we can delete xe_sched_add_pending_job().
> > 
> 
> Ok, returning NO_HANG here so that the DRM scheduler adds the job
> back into the pending list. It is a bit confusing to the reader why
> we return NO_HANG even in the case of a hang (error) condition here.
> Maybe a comment would help.
> 

This is in the DRM scheduler doc, but I will add a comment here too.
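
Something along these lines next to the return (exact wording TBD):

	/*
	 * Returning NO_HANG here, even though this is the timeout/error
	 * path, tells the DRM scheduler core to put the job back on the
	 * pending list for us rather than treating this as a reset. The
	 * jobs have already been marked with an error above, so they
	 * will complete as signaled rather than being resubmitted.
	 */
	return DRM_GPU_SCHED_STAT_NO_HANG;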

Matt

> Niranjana
> 
> > > Niranjana
> > > 
> > > >
> > > > sched_enable:
> > > >         set_exec_queue_pending_tdr_exit(q);
> > > > @@ -2244,7 +2224,7 @@ static void guc_exec_queue_unpause_prepare(struct xe_guc *guc,
> > > >         struct drm_sched_job *s_job;
> > > >         struct xe_sched_job *job = NULL;
> > > >
> > > > -       list_for_each_entry(s_job, &sched->base.pending_list, list) {
> > > > +       drm_sched_for_each_pending_job(s_job, &sched->base, NULL) {
> > > >                 job = to_xe_sched_job(s_job);
> > > >
> > > >                 xe_gt_dbg(guc_to_gt(guc), "Replay JOB - guc_id=%d, seqno=%d",
> > > > @@ -2349,7 +2329,7 @@ void xe_guc_submit_unpause(struct xe_guc *guc)
> > > >                  * created after resfix done.
> > > >                  */
> > > >                 if (q->guc->id != index ||
> > > > -                   !READ_ONCE(q->guc->sched.base.pause_submit))
> > > > +                   !drm_sched_is_stopped(&q->guc->sched.base))
> > > >                         continue;
> > > >
> > > >                 guc_exec_queue_unpause(guc, q);
> > > > @@ -2771,30 +2751,6 @@ xe_guc_exec_queue_snapshot_capture(struct xe_exec_queue *q)
> > > >         if (snapshot->parallel_execution)
> > > >                 guc_exec_queue_wq_snapshot_capture(q, snapshot);
> > > >
> > > > -       spin_lock(&sched->base.job_list_lock);
> > > > -       snapshot->pending_list_size = list_count_nodes(&sched->base.pending_list);
> > > > -       snapshot->pending_list = kmalloc_array(snapshot->pending_list_size,
> > > > -                                              sizeof(struct pending_list_snapshot),
> > > > -                                              GFP_ATOMIC);
> > > > -
> > > > -       if (snapshot->pending_list) {
> > > > -               struct xe_sched_job *job_iter;
> > > > -
> > > > -               i = 0;
> > > > -               list_for_each_entry(job_iter, &sched->base.pending_list, drm.list) {
> > > > -                       snapshot->pending_list[i].seqno =
> > > > -                               xe_sched_job_seqno(job_iter);
> > > > -                       snapshot->pending_list[i].fence =
> > > > -                               dma_fence_is_signaled(job_iter->fence) ? 1 : 0;
> > > > -                       snapshot->pending_list[i].finished =
> > > > -                               dma_fence_is_signaled(&job_iter->drm.s_fence->finished)
> > > > -                               ? 1 : 0;
> > > > -                       i++;
> > > > -               }
> > > > -       }
> > > > -
> > > > -       spin_unlock(&sched->base.job_list_lock);
> > > > -
> > > >         return snapshot;
> > > > }
> > > >
> > > > @@ -2852,13 +2808,6 @@ xe_guc_exec_queue_snapshot_print(struct xe_guc_submit_exec_queue_snapshot *snaps
> > > >
> > > >         if (snapshot->parallel_execution)
> > > >                 guc_exec_queue_wq_snapshot_print(snapshot, p);
> > > > -
> > > > -       for (i = 0; snapshot->pending_list && i < snapshot->pending_list_size;
> > > > -            i++)
> > > > -               drm_printf(p, "\tJob: seqno=%d, fence=%d, finished=%d\n",
> > > > -                          snapshot->pending_list[i].seqno,
> > > > -                          snapshot->pending_list[i].fence,
> > > > -                          snapshot->pending_list[i].finished);
> > > > }
> > > >
> > > > /**
> > > > @@ -2881,7 +2830,6 @@ void xe_guc_exec_queue_snapshot_free(struct xe_guc_submit_exec_queue_snapshot *s
> > > >                         xe_lrc_snapshot_free(snapshot->lrc[i]);
> > > >                 kfree(snapshot->lrc);
> > > >         }
> > > > -       kfree(snapshot->pending_list);
> > > >         kfree(snapshot);
> > > > }
> > > >
> > > > diff --git a/drivers/gpu/drm/xe/xe_guc_submit_types.h b/drivers/gpu/drm/xe/xe_guc_submit_types.h
> > > > index dc7456c34583..0b08c79cf3b9 100644
> > > > --- a/drivers/gpu/drm/xe/xe_guc_submit_types.h
> > > > +++ b/drivers/gpu/drm/xe/xe_guc_submit_types.h
> > > > @@ -61,12 +61,6 @@ struct guc_submit_parallel_scratch {
> > > >         u32 wq[WQ_SIZE / sizeof(u32)];
> > > > };
> > > >
> > > > -struct pending_list_snapshot {
> > > > -       u32 seqno;
> > > > -       bool fence;
> > > > -       bool finished;
> > > > -};
> > > > -
> > > > /**
> > > >  * struct xe_guc_submit_exec_queue_snapshot - Snapshot for devcoredump
> > > >  */
> > > > @@ -134,11 +128,6 @@ struct xe_guc_submit_exec_queue_snapshot {
> > > >                 /** @wq: Workqueue Items */
> > > >                 u32 wq[WQ_SIZE / sizeof(u32)];
> > > >         } parallel;
> > > > -
> > > > -       /** @pending_list_size: Size of the pending list snapshot array */
> > > > -       int pending_list_size;
> > > > -       /** @pending_list: snapshot of the pending list info */
> > > > -       struct pending_list_snapshot *pending_list;
> > > > };
> > > >
> > > > #endif
> > > > diff --git a/drivers/gpu/drm/xe/xe_hw_fence.c b/drivers/gpu/drm/xe/xe_hw_fence.c
> > > > index b2a0c46dfcd4..e65dfcdfdbc5 100644
> > > > --- a/drivers/gpu/drm/xe/xe_hw_fence.c
> > > > +++ b/drivers/gpu/drm/xe/xe_hw_fence.c
> > > > @@ -110,22 +110,6 @@ void xe_hw_fence_irq_run(struct xe_hw_fence_irq *irq)
> > > >         irq_work_queue(&irq->work);
> > > > }
> > > >
> > > > -void xe_hw_fence_irq_stop(struct xe_hw_fence_irq *irq)
> > > > -{
> > > > -       spin_lock_irq(&irq->lock);
> > > > -       irq->enabled = false;
> > > > -       spin_unlock_irq(&irq->lock);
> > > > -}
> > > > -
> > > > -void xe_hw_fence_irq_start(struct xe_hw_fence_irq *irq)
> > > > -{
> > > > -       spin_lock_irq(&irq->lock);
> > > > -       irq->enabled = true;
> > > > -       spin_unlock_irq(&irq->lock);
> > > > -
> > > > -       irq_work_queue(&irq->work);
> > > > -}
> > > > -
> > > > void xe_hw_fence_ctx_init(struct xe_hw_fence_ctx *ctx, struct xe_gt *gt,
> > > >                           struct xe_hw_fence_irq *irq, const char *name)
> > > > {
> > > > diff --git a/drivers/gpu/drm/xe/xe_hw_fence.h b/drivers/gpu/drm/xe/xe_hw_fence.h
> > > > index f13a1c4982c7..599492c13f80 100644
> > > > --- a/drivers/gpu/drm/xe/xe_hw_fence.h
> > > > +++ b/drivers/gpu/drm/xe/xe_hw_fence.h
> > > > @@ -17,8 +17,6 @@ void xe_hw_fence_module_exit(void);
> > > > void xe_hw_fence_irq_init(struct xe_hw_fence_irq *irq);
> > > > void xe_hw_fence_irq_finish(struct xe_hw_fence_irq *irq);
> > > > void xe_hw_fence_irq_run(struct xe_hw_fence_irq *irq);
> > > > -void xe_hw_fence_irq_stop(struct xe_hw_fence_irq *irq);
> > > > -void xe_hw_fence_irq_start(struct xe_hw_fence_irq *irq);
> > > >
> > > > void xe_hw_fence_ctx_init(struct xe_hw_fence_ctx *ctx, struct xe_gt *gt,
> > > >                           struct xe_hw_fence_irq *irq, const char *name);
> > > > --
> > > > 2.34.1
> > > >
