rocket: Submit one drm_sched_job per task

Philipp Stanner Mon, 08 Jun 2026 03:02:36 -0700

Hi,

On Fri, 2026-06-05 at 13:06 -0300, Maíra Canal wrote:
> The current submission model breaks the DRM scheduler's design in two
> ways.
> 
> First, a job spawns further hardware work from outside the scheduler.
> rocket_job_run() submits only the first task of an inference; every
> subsequent task is submitted by the threaded IRQ handler, which calls
> rocket_job_hw_submit() directly.
>  The scheduler expects all of a job's
> hardware submission to happen in run_job(). Driving it from the IRQ
> handler instead is invisible to the scheduler, and drm_sched_stop() only
> synchronizes the scheduler's workqueue, not the threaded IRQ, so the
> reset path races these IRQ-driven submissions. The job_lock mutex and the
> reset.pending flag exist only as a workaround that self-inflicted race.


nit: missing word "for"

> 
> Second, the submission path returns after arming a job. rocket_job_push()
> calls drm_sched_job_arm() and only then acquires the BO fences, bailing
> out and cleaning the job up if that fails. But arming is a point of no
> return: per its documentation, "Once this function was called, you *must*
> submit @job with drm_sched_entity_push_job()", because it publishes the
> job's fences.

Oh dear. Yeah, got catch.

> 
> Redesign the submission so each task is its own drm_sched_job, which is
> what the scheduler's model actually expects:
> 
>   - Every submission to the NPU flows through run_job(), and the IRQ
>     handler only signals the task's fence. Nothing is started without the
>     knowledge of the scheduler, so it can serialize submission against
>     reset.

+1

> 
>   - The BO reservations and implicit dependencies are acquired before any
>     task is armed, so the only fallible step precedes the point of no
>     return.
> 
> struct rocket_job becomes the refcounted inference shared by its tasks;

If they're now shared, do they also require synchronization?

> struct rocket_task becomes the per-task scheduled unit, and each task
> holds a reference to the inference, so the last task to be freed marks it
> done. Ordering between tasks comes from the per-file entity's FIFO and
> arming all tasks up front pins the inference to one core. The implicit
> BO dependencies and the completion fence are anchored on the last task.
> 
> The IOMMU domain is attached once by the first task to run and detached
> when the last task drops its reference. On timeout the inference is marked
> cancelled before the scheduler restarts; the remaining tasks observe the
> flag in run_job() and fail with -ECANCELED without touching hardware, so
> only the affected inference is abandoned.
> 
> With this, the Rocket driver can comply to the DRM scheduler's
> expectations.
> 
> Signed-off-by: Maíra Canal <[email protected]>

Some more fly-by comments below, but not a super deep review:

> ---
>  drivers/accel/rocket/rocket_core.h |   5 +-
>  drivers/accel/rocket/rocket_job.c  | 257 
> ++++++++++++++++++++-----------------
>  drivers/accel/rocket/rocket_job.h  |  26 +++-
>  3 files changed, 158 insertions(+), 130 deletions(-)
> 
> diff --git a/drivers/accel/rocket/rocket_core.h 
> b/drivers/accel/rocket/rocket_core.h
> index 4d163a6ac653..f74dc99d07bb 100644
> --- a/drivers/accel/rocket/rocket_core.h
> +++ b/drivers/accel/rocket/rocket_core.h
> @@ -41,14 +41,13 @@ struct rocket_core {
>  
>       struct iommu_group *iommu_group;
>  
> -     struct mutex job_lock;
> -     struct rocket_job *in_flight_job;
> +     /* Task currently running on the hardware. */
> +     struct rocket_task *in_flight_task;
>  
>       spinlock_t fence_lock;
>  
>       struct {
>               struct workqueue_struct *wq;
> -             atomic_t pending;
>       } reset;
>  
>       struct drm_gpu_scheduler sched;
> diff --git a/drivers/accel/rocket/rocket_job.c 
> b/drivers/accel/rocket/rocket_job.c
> index 8a26139a44f4..871041eb7a1d 100644
> --- a/drivers/accel/rocket/rocket_job.c
> +++ b/drivers/accel/rocket/rocket_job.c
> @@ -20,10 +20,10 @@
>  
>  #define JOB_TIMEOUT_MS 500
>  
> -static struct rocket_job *
> -to_rocket_job(struct drm_sched_job *sched_job)
> +static struct rocket_task *
> +to_rocket_task(struct drm_sched_job *sched_job)
>  {
> -     return container_of(sched_job, struct rocket_job, base);
> +     return container_of(sched_job, struct rocket_task, base);
>  }
>  
>  static const char *rocket_fence_get_driver_name(struct dma_fence *fence)
> @@ -61,8 +61,6 @@ rocket_copy_tasks(struct drm_device *dev,
>                 struct drm_rocket_job *job,
>                 struct rocket_job *rjob)
>  {
> -     int ret = 0;
> -
>       if (job->task_struct_size < sizeof(struct drm_rocket_task))
>               return -EINVAL;
>  
> @@ -71,7 +69,7 @@ rocket_copy_tasks(struct drm_device *dev,
>       if (!rjob->task_count)
>               return 0;
>  
> -     rjob->tasks = kvmalloc_objs(*rjob->tasks, job->task_count);
> +     rjob->tasks = kvzalloc_objs(*rjob->tasks, job->task_count);
>       if (!rjob->tasks) {
>               drm_dbg(dev, "Failed to allocate task array\n");
>               return -ENOMEM;
> @@ -84,14 +82,12 @@ rocket_copy_tasks(struct drm_device *dev,
>                                  u64_to_user_ptr(job->tasks) + i * 
> job->task_struct_size,
>                                  sizeof(task))) {
>                       drm_dbg(dev, "Failed to copy incoming tasks\n");
> -                     ret = -EFAULT;
> -                     goto fail;
> +                     return -EFAULT;
>               }
>  
>               if (task.regcmd_count == 0) {
>                       drm_dbg(dev, "regcmd_count field in drm_rocket_task 
> should be > 0.\n");
> -                     ret = -EINVAL;
> -                     goto fail;
> +                     return -EINVAL;
>               }
>  
>               rjob->tasks[i].regcmd = task.regcmd;
> @@ -99,26 +95,14 @@ rocket_copy_tasks(struct drm_device *dev,
>       }
>  
>       return 0;
> -
> -fail:
> -     kvfree(rjob->tasks);
> -     return ret;

Not super obvious to the reader why this is now unnecessary?

Could these changes be a separate patch?

>  }
>  

[…]

> 
>  static struct rocket_core *sched_to_core(struct rocket_device *rdev,
> @@ -286,108 +307,102 @@ static struct rocket_core *sched_to_core(struct 
> rocket_device *rdev,
>  
>  static struct dma_fence *rocket_job_run(struct drm_sched_job *sched_job)
>  {
> -     struct rocket_job *job = to_rocket_job(sched_job);
> -     struct rocket_device *rdev = job->rdev;
> -     struct rocket_core *core = sched_to_core(rdev, sched_job->sched);
> +     struct rocket_task *task = to_rocket_task(sched_job);
> +     struct rocket_job *job = task->job;
> +     struct rocket_core *core = sched_to_core(job->rdev, sched_job->sched);
>       struct dma_fence *fence = NULL;
>       int ret;
>  
> -     if (unlikely(job->base.s_fence->finished.error))
> +     if (unlikely(sched_job->s_fence->finished.error))
>               return NULL;
>  
> -     /*
> -      * Nothing to execute: can happen if the job has finished while
> -      * we were resetting the NPU.
> -      */
> -     if (job->next_task_idx == job->task_count)
> +     /* An earlier task of this inference timed out, abandon the inference. 
> */
> +     if (atomic_read(&job->cancelled)) {

I see that you touch some mutexes in this patch. Some seem to
disappear. These atomics stay.

They look a bit racy, too, don't they?

> +             dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);
>               return NULL;
> +     }
>  
>       fence = rocket_fence_create(core);
>       if (IS_ERR(fence))
>               return fence;
>  
> -     if (job->done_fence)
> -             dma_fence_put(job->done_fence);
> -     job->done_fence = dma_fence_get(fence);
> +     if (task->done_fence)
> +             dma_fence_put(task->done_fence);
> +     task->done_fence = dma_fence_get(fence);
>  
> -     ret = pm_runtime_get_sync(core->dev);
> -     if (ret < 0)
> -             return fence;
> -
> -     ret = iommu_attach_group(job->domain->domain, core->iommu_group);
> -     if (ret < 0)
> -             return fence;
> -
> -     scoped_guard(mutex, &core->job_lock) {
> -             core->in_flight_job = job;
> -             rocket_job_hw_submit(core, job);
> +     ret = pm_runtime_resume_and_get(core->dev);
> +     if (ret < 0) {
> +             dma_fence_put(fence);
> +             return ERR_PTR(ret);
>       }
>  
> +     /* Attach the domain once for the whole inference. */
> +     if (!job->core) {
> +             ret = iommu_attach_group(job->domain->domain, 
> core->iommu_group);
> +             if (ret < 0) {
> +                     pm_runtime_put_autosuspend(core->dev);
> +                     dma_fence_put(fence);
> +                     return ERR_PTR(ret);
> +             }
> +             job->core = core;
> +     }
> +
> +     WRITE_ONCE(core->in_flight_task, task);

The scoped_guard above disappears, and now a WRITE_ONCE appears. Hmm.
The core doesn't have a lock?

> +     rocket_job_hw_submit(core, task);
> +
>       return fence;
>  }
>  
>  static void rocket_job_handle_irq(struct rocket_core *core)
>  {
> +     struct rocket_task *task;
> +
>       pm_runtime_mark_last_busy(core->dev);
>  
>       rocket_pc_writel(core, OPERATION_ENABLE, 0x0);
>       rocket_pc_writel(core, INTERRUPT_CLEAR, 0x1ffff);
>  
> -     scoped_guard(mutex, &core->job_lock)
> -             if (core->in_flight_job) {
> -                     if (core->in_flight_job->next_task_idx < 
> core->in_flight_job->task_count) {
> -                             rocket_job_hw_submit(core, core->in_flight_job);
> -                             return;
> -                     }
> -
> -                     iommu_detach_group(NULL, iommu_group_get(core->dev));
> -                     dma_fence_signal(core->in_flight_job->done_fence);
> -                     pm_runtime_put_autosuspend(core->dev);
> -                     core->in_flight_job = NULL;
> -             }
> +     /*
> +      * Claim the in-flight task: the reset path may run concurrently, so
> +      * whichever of us wins owns the PM put.
> +      */
> +     task = xchg(&core->in_flight_task, NULL);

Why is it sometimes WRITE_ONCE and sometimes xchg?

I don't understand much of rocket's design, but this makes me feel
uncomfortable about the overall locking / synchronization design.


Regards
P.

Re: [PATCH 2/3] drm/rocket: Submit one drm_sched_job per task

Reply via email to