On Tue, Nov 18, 2025 at 10:02:00AM -0800, Matthew Brost wrote:
On Mon, Nov 17, 2025 at 10:41:52PM -0800, Niranjana Vishwanathapura wrote:
On Thu, Oct 16, 2025 at 01:48:24PM -0700, Matthew Brost wrote:
> Deregistering queues in the TDR introduces unnecessary complexity,
> requiring reference counting tricks to function correctly. All that's
> needed in the TDR is to kick the queue off the hardware, which is
> achieved by disabling scheduling. Queue deregistration should be handled
> in a single, well-defined point in the cleanup path, tied to the queue's
> reference count.
>

Overall looks good to me.
But it would help if the commit text describes why this extra reference
taking was there before for lr jobs and why it is not needed now.


This patch isn't related to LR jobs, the following patch is.


I was talking about the set/clear_exec_queue_extra_ref() and its usage
being removed in this patchset.

The deregistering queues in TDR was never required, and this patches
removes that flow.


Ok, thanks.

Niranjana

Matt

Niranjana

> Signed-off-by: Matthew Brost <[email protected]>
> ---
> drivers/gpu/drm/xe/xe_guc_submit.c | 57 +++---------------------------
> 1 file changed, 5 insertions(+), 52 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c 
b/drivers/gpu/drm/xe/xe_guc_submit.c
> index 680696efc434..ab0f1a2d4871 100644
> --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> @@ -69,9 +69,8 @@ exec_queue_to_guc(struct xe_exec_queue *q)
> #define EXEC_QUEUE_STATE_WEDGED                    (1 << 8)
> #define EXEC_QUEUE_STATE_BANNED                    (1 << 9)
> #define EXEC_QUEUE_STATE_CHECK_TIMEOUT             (1 << 10)
> -#define EXEC_QUEUE_STATE_EXTRA_REF                (1 << 11)
> -#define EXEC_QUEUE_STATE_PENDING_RESUME           (1 << 12)
> -#define EXEC_QUEUE_STATE_PENDING_TDR_EXIT (1 << 13)
> +#define EXEC_QUEUE_STATE_PENDING_RESUME           (1 << 11)
> +#define EXEC_QUEUE_STATE_PENDING_TDR_EXIT (1 << 12)
>
> static bool exec_queue_registered(struct xe_exec_queue *q)
> {
> @@ -218,21 +217,6 @@ static void clear_exec_queue_check_timeout(struct 
xe_exec_queue *q)
>    atomic_and(~EXEC_QUEUE_STATE_CHECK_TIMEOUT, &q->guc->state);
> }
>
> -static bool exec_queue_extra_ref(struct xe_exec_queue *q)
> -{
> -  return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_EXTRA_REF;
> -}
> -
> -static void set_exec_queue_extra_ref(struct xe_exec_queue *q)
> -{
> -  atomic_or(EXEC_QUEUE_STATE_EXTRA_REF, &q->guc->state);
> -}
> -
> -static void clear_exec_queue_extra_ref(struct xe_exec_queue *q)
> -{
> -  atomic_and(~EXEC_QUEUE_STATE_EXTRA_REF, &q->guc->state);
> -}
> -
> static bool exec_queue_pending_resume(struct xe_exec_queue *q)
> {
>    return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_PENDING_RESUME;
> @@ -1190,25 +1174,6 @@ static void disable_scheduling(struct xe_exec_queue 
*q, bool immediate)
>                   G2H_LEN_DW_SCHED_CONTEXT_MODE_SET, 1);
> }
>
> -static void __deregister_exec_queue(struct xe_guc *guc, struct xe_exec_queue 
*q)
> -{
> -  u32 action[] = {
> -          XE_GUC_ACTION_DEREGISTER_CONTEXT,
> -          q->guc->id,
> -  };
> -
> -  xe_gt_assert(guc_to_gt(guc), !exec_queue_destroyed(q));
> -  xe_gt_assert(guc_to_gt(guc), exec_queue_registered(q));
> -  xe_gt_assert(guc_to_gt(guc), !exec_queue_pending_enable(q));
> -  xe_gt_assert(guc_to_gt(guc), !exec_queue_pending_disable(q));
> -
> -  set_exec_queue_destroyed(q);
> -  trace_xe_exec_queue_deregister(q);
> -
> -  xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action),
> -                 G2H_LEN_DW_DEREGISTER_CONTEXT, 1);
> -}
> -
> static enum drm_gpu_sched_stat
> guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> {
> @@ -1326,8 +1291,6 @@ guc_exec_queue_timedout_job(struct drm_sched_job 
*drm_job)
>                    xe_devcoredump(q, job,
>                                   "Schedule disable failed to respond, guc_id=%d, 
ret=%d, guc_read=%d",
>                                   q->guc->id, ret, xe_guc_read_stopped(guc));
> -                  set_exec_queue_extra_ref(q);
> -                  xe_exec_queue_get(q);   /* GT reset owns this */
>                    set_exec_queue_banned(q);
>                    xe_gt_reset_async(q->gt);
>                    xe_sched_tdr_queue_imm(sched);
> @@ -1380,13 +1343,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job 
*drm_job)
>            }
>    }
>
> -  /* Finish cleaning up exec queue via deregister */
>    set_exec_queue_banned(q);
> -  if (!wedged && exec_queue_registered(q) && !exec_queue_destroyed(q)) {
> -          set_exec_queue_extra_ref(q);
> -          xe_exec_queue_get(q);
> -          __deregister_exec_queue(guc, q);
> -  }
>
>    /* Mark all outstanding jobs as bad, thus completing them */
>    xe_sched_job_set_error(job, err);
> @@ -1928,7 +1885,7 @@ static void guc_exec_queue_stop(struct xe_guc *guc, 
struct xe_exec_queue *q)
>
>    /* Clean up lost G2H + reset engine state */
>    if (exec_queue_registered(q)) {
> -          if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q))
> +          if (xe_exec_queue_is_lr(q))
>                    xe_exec_queue_put(q);
>            else if (exec_queue_destroyed(q))
>                    __guc_exec_queue_destroy(guc, q);
> @@ -2062,11 +2019,7 @@ static void 
guc_exec_queue_revert_pending_state_change(struct xe_guc *guc,
>
>    if (exec_queue_destroyed(q) && exec_queue_registered(q)) {
>            clear_exec_queue_destroyed(q);
> -          if (exec_queue_extra_ref(q))
> -                  xe_exec_queue_put(q);
> -          else
> -                  q->guc->needs_cleanup = true;
> -          clear_exec_queue_extra_ref(q);
> +          q->guc->needs_cleanup = true;
>            xe_gt_dbg(guc_to_gt(guc), "Replay CLEANUP - guc_id=%d",
>                      q->guc->id);
>    }
> @@ -2483,7 +2436,7 @@ static void handle_deregister_done(struct xe_guc *guc, 
struct xe_exec_queue *q)
>
>    clear_exec_queue_registered(q);
>
> -  if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q))
> +  if (xe_exec_queue_is_lr(q))
>            xe_exec_queue_put(q);
>    else
>            __guc_exec_queue_destroy(guc, q);
> --
> 2.34.1
>

Reply via email to