On Tue, Nov 18, 2025 at 01:19:16PM -0800, Niranjana Vishwanathapura wrote:
> On Tue, Nov 18, 2025 at 10:02:00AM -0800, Matthew Brost wrote:
> > On Mon, Nov 17, 2025 at 10:41:52PM -0800, Niranjana Vishwanathapura wrote:
> > > On Thu, Oct 16, 2025 at 01:48:24PM -0700, Matthew Brost wrote:
> > > > Deregistering queues in the TDR introduces unnecessary complexity,
> > > > requiring reference counting tricks to function correctly. All that's
> > > > needed in the TDR is to kick the queue off the hardware, which is
> > > > achieved by disabling scheduling. Queue deregistration should be
> > > > handled at a single, well-defined point in the cleanup path, tied to
> > > > the queue's reference count.
> > > >
> > >
> > > Overall looks good to me.
> > > But it would help if the commit text described why this extra reference
> > > taking was there before for LR jobs and why it is not needed now.
> > >
> >
> > This patch isn't related to LR jobs; the following patch is.
> >
>
> I was talking about the set/clear_exec_queue_extra_ref() and its usage
> being removed in this patchset.
>
Oh, the extra reference was needed before to prevent the queue from
disappearing on the final put, or via a GT reset, while a deregister issued
from the TDR was in flight. It was a pretty hacky workaround for that odd
UAF case. I can adjust the commit message to include this information (a
rough sketch of the old flow is at the bottom of this mail for reference).

Matt

> > Deregistering queues in the TDR was never required, and this patch
> > removes that flow.
> >
>
> Ok, thanks.
>
> Niranjana
>
> > Matt
> >
> > > Niranjana
> > >
> > > > Signed-off-by: Matthew Brost <[email protected]>
> > > > ---
> > > >  drivers/gpu/drm/xe/xe_guc_submit.c | 57 +++---------------------------
> > > >  1 file changed, 5 insertions(+), 52 deletions(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/xe/xe_guc_submit.c b/drivers/gpu/drm/xe/xe_guc_submit.c
> > > > index 680696efc434..ab0f1a2d4871 100644
> > > > --- a/drivers/gpu/drm/xe/xe_guc_submit.c
> > > > +++ b/drivers/gpu/drm/xe/xe_guc_submit.c
> > > > @@ -69,9 +69,8 @@ exec_queue_to_guc(struct xe_exec_queue *q)
> > > >  #define EXEC_QUEUE_STATE_WEDGED		(1 << 8)
> > > >  #define EXEC_QUEUE_STATE_BANNED		(1 << 9)
> > > >  #define EXEC_QUEUE_STATE_CHECK_TIMEOUT	(1 << 10)
> > > > -#define EXEC_QUEUE_STATE_EXTRA_REF		(1 << 11)
> > > > -#define EXEC_QUEUE_STATE_PENDING_RESUME	(1 << 12)
> > > > -#define EXEC_QUEUE_STATE_PENDING_TDR_EXIT	(1 << 13)
> > > > +#define EXEC_QUEUE_STATE_PENDING_RESUME	(1 << 11)
> > > > +#define EXEC_QUEUE_STATE_PENDING_TDR_EXIT	(1 << 12)
> > > >
> > > >  static bool exec_queue_registered(struct xe_exec_queue *q)
> > > >  {
> > > > @@ -218,21 +217,6 @@ static void clear_exec_queue_check_timeout(struct xe_exec_queue *q)
> > > >  	atomic_and(~EXEC_QUEUE_STATE_CHECK_TIMEOUT, &q->guc->state);
> > > >  }
> > > >
> > > > -static bool exec_queue_extra_ref(struct xe_exec_queue *q)
> > > > -{
> > > > -	return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_EXTRA_REF;
> > > > -}
> > > > -
> > > > -static void set_exec_queue_extra_ref(struct xe_exec_queue *q)
> > > > -{
> > > > -	atomic_or(EXEC_QUEUE_STATE_EXTRA_REF, &q->guc->state);
> > > > -}
> > > > -
> > > > -static void clear_exec_queue_extra_ref(struct xe_exec_queue *q)
> > > > -{
> > > > -	atomic_and(~EXEC_QUEUE_STATE_EXTRA_REF, &q->guc->state);
> > > > -}
> > > > -
> > > >  static bool exec_queue_pending_resume(struct xe_exec_queue *q)
> > > >  {
> > > >  	return atomic_read(&q->guc->state) & EXEC_QUEUE_STATE_PENDING_RESUME;
> > > > @@ -1190,25 +1174,6 @@ static void disable_scheduling(struct xe_exec_queue *q, bool immediate)
> > > >  			     G2H_LEN_DW_SCHED_CONTEXT_MODE_SET, 1);
> > > >  }
> > > >
> > > > -static void __deregister_exec_queue(struct xe_guc *guc, struct xe_exec_queue *q)
> > > > -{
> > > > -	u32 action[] = {
> > > > -		XE_GUC_ACTION_DEREGISTER_CONTEXT,
> > > > -		q->guc->id,
> > > > -	};
> > > > -
> > > > -	xe_gt_assert(guc_to_gt(guc), !exec_queue_destroyed(q));
> > > > -	xe_gt_assert(guc_to_gt(guc), exec_queue_registered(q));
> > > > -	xe_gt_assert(guc_to_gt(guc), !exec_queue_pending_enable(q));
> > > > -	xe_gt_assert(guc_to_gt(guc), !exec_queue_pending_disable(q));
> > > > -
> > > > -	set_exec_queue_destroyed(q);
> > > > -	trace_xe_exec_queue_deregister(q);
> > > > -
> > > > -	xe_guc_ct_send(&guc->ct, action, ARRAY_SIZE(action),
> > > > -		       G2H_LEN_DW_DEREGISTER_CONTEXT, 1);
> > > > -}
> > > > -
> > > >  static enum drm_gpu_sched_stat
> > > >  guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> > > >  {
> > > > @@ -1326,8 +1291,6 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> > > >  			xe_devcoredump(q, job, "Schedule disable failed to respond, guc_id=%d, ret=%d, guc_read=%d",
> > > >  				       q->guc->id, ret, xe_guc_read_stopped(guc));
> > > > -			set_exec_queue_extra_ref(q);
> > > > -			xe_exec_queue_get(q);	/* GT reset owns this */
> > > >  			set_exec_queue_banned(q);
> > > >  			xe_gt_reset_async(q->gt);
> > > >  			xe_sched_tdr_queue_imm(sched);
> > > > @@ -1380,13 +1343,7 @@ guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
> > > >  		}
> > > >  	}
> > > >
> > > > -	/* Finish cleaning up exec queue via deregister */
> > > >  	set_exec_queue_banned(q);
> > > > -	if (!wedged && exec_queue_registered(q) && !exec_queue_destroyed(q)) {
> > > > -		set_exec_queue_extra_ref(q);
> > > > -		xe_exec_queue_get(q);
> > > > -		__deregister_exec_queue(guc, q);
> > > > -	}
> > > >
> > > >  	/* Mark all outstanding jobs as bad, thus completing them */
> > > >  	xe_sched_job_set_error(job, err);
> > > > @@ -1928,7 +1885,7 @@ static void guc_exec_queue_stop(struct xe_guc *guc, struct xe_exec_queue *q)
> > > >
> > > >  	/* Clean up lost G2H + reset engine state */
> > > >  	if (exec_queue_registered(q)) {
> > > > -		if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q))
> > > > +		if (xe_exec_queue_is_lr(q))
> > > >  			xe_exec_queue_put(q);
> > > >  		else if (exec_queue_destroyed(q))
> > > >  			__guc_exec_queue_destroy(guc, q);
> > > > @@ -2062,11 +2019,7 @@ static void guc_exec_queue_revert_pending_state_change(struct xe_guc *guc,
> > > >
> > > >  	if (exec_queue_destroyed(q) && exec_queue_registered(q)) {
> > > >  		clear_exec_queue_destroyed(q);
> > > > -		if (exec_queue_extra_ref(q))
> > > > -			xe_exec_queue_put(q);
> > > > -		else
> > > > -			q->guc->needs_cleanup = true;
> > > > -		clear_exec_queue_extra_ref(q);
> > > > +		q->guc->needs_cleanup = true;
> > > >  		xe_gt_dbg(guc_to_gt(guc), "Replay CLEANUP - guc_id=%d",
> > > >  			  q->guc->id);
> > > >  	}
> > > > @@ -2483,7 +2436,7 @@ static void handle_deregister_done(struct xe_guc *guc, struct xe_exec_queue *q)
> > > >
> > > >  	clear_exec_queue_registered(q);
> > > >
> > > > -	if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q))
> > > > +	if (xe_exec_queue_is_lr(q))
> > > >  		xe_exec_queue_put(q);
> > > >  	else
> > > >  		__guc_exec_queue_destroy(guc, q);
> > > > --
> > > > 2.34.1
> > > >
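
For reference, the rough sketch of the old extra-ref dance this patch
deletes, paraphrased from the removed code above, so treat the exact call
sites as approximate:

	/* Old TDR path: deregister directly from the timeout handler */
	set_exec_queue_extra_ref(q);		/* note: TDR holds an extra ref */
	xe_exec_queue_get(q);			/* keep q alive across async deregister */
	__deregister_exec_queue(guc, q);	/* H2G sent, completion arrives via G2H */

	/* Later, in handle_deregister_done() / guc_exec_queue_stop() */
	if (exec_queue_extra_ref(q) || xe_exec_queue_is_lr(q))
		xe_exec_queue_put(q);		/* drop the TDR's extra ref */
	else
		__guc_exec_queue_destroy(guc, q);

Without that extra ref, the final xe_exec_queue_put() (or a GT reset tearing
the queue down) could free q while the DEREGISTER_CONTEXT G2H was still in
flight - the UAF mentioned above. With deregistration now done at a single
point in the cleanup path, keyed off the queue's reference count, none of
this bookkeeping is needed.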
