Hi, On Fri, 2026-06-05 at 13:06 -0300, Maíra Canal wrote: > The current submission model breaks the DRM scheduler's design in two > ways. > > First, a job spawns further hardware work from outside the scheduler. > rocket_job_run() submits only the first task of an inference; every > subsequent task is submitted by the threaded IRQ handler, which calls > rocket_job_hw_submit() directly. > The scheduler expects all of a job's > hardware submission to happen in run_job(). Driving it from the IRQ > handler instead is invisible to the scheduler, and drm_sched_stop() only > synchronizes the scheduler's workqueue, not the threaded IRQ, so the > reset path races these IRQ-driven submissions. The job_lock mutex and the > reset.pending flag exist only as a workaround that self-inflicted race.
nit: missing word "for" > > Second, the submission path returns after arming a job. rocket_job_push() > calls drm_sched_job_arm() and only then acquires the BO fences, bailing > out and cleaning the job up if that fails. But arming is a point of no > return: per its documentation, "Once this function was called, you *must* > submit @job with drm_sched_entity_push_job()", because it publishes the > job's fences. Oh dear. Yeah, got catch. > > Redesign the submission so each task is its own drm_sched_job, which is > what the scheduler's model actually expects: > > - Every submission to the NPU flows through run_job(), and the IRQ > handler only signals the task's fence. Nothing is started without the > knowledge of the scheduler, so it can serialize submission against > reset. +1 > > - The BO reservations and implicit dependencies are acquired before any > task is armed, so the only fallible step precedes the point of no > return. > > struct rocket_job becomes the refcounted inference shared by its tasks; If they're now shared, do they also require synchronization? > struct rocket_task becomes the per-task scheduled unit, and each task > holds a reference to the inference, so the last task to be freed marks it > done. Ordering between tasks comes from the per-file entity's FIFO and > arming all tasks up front pins the inference to one core. The implicit > BO dependencies and the completion fence are anchored on the last task. > > The IOMMU domain is attached once by the first task to run and detached > when the last task drops its reference. On timeout the inference is marked > cancelled before the scheduler restarts; the remaining tasks observe the > flag in run_job() and fail with -ECANCELED without touching hardware, so > only the affected inference is abandoned. > > With this, the Rocket driver can comply to the DRM scheduler's > expectations. > > Signed-off-by: Maíra Canal <[email protected]> Some more fly-by comments below, but not a super deep review: > --- > drivers/accel/rocket/rocket_core.h | 5 +- > drivers/accel/rocket/rocket_job.c | 257 > ++++++++++++++++++++----------------- > drivers/accel/rocket/rocket_job.h | 26 +++- > 3 files changed, 158 insertions(+), 130 deletions(-) > > diff --git a/drivers/accel/rocket/rocket_core.h > b/drivers/accel/rocket/rocket_core.h > index 4d163a6ac653..f74dc99d07bb 100644 > --- a/drivers/accel/rocket/rocket_core.h > +++ b/drivers/accel/rocket/rocket_core.h > @@ -41,14 +41,13 @@ struct rocket_core { > > struct iommu_group *iommu_group; > > - struct mutex job_lock; > - struct rocket_job *in_flight_job; > + /* Task currently running on the hardware. */ > + struct rocket_task *in_flight_task; > > spinlock_t fence_lock; > > struct { > struct workqueue_struct *wq; > - atomic_t pending; > } reset; > > struct drm_gpu_scheduler sched; > diff --git a/drivers/accel/rocket/rocket_job.c > b/drivers/accel/rocket/rocket_job.c > index 8a26139a44f4..871041eb7a1d 100644 > --- a/drivers/accel/rocket/rocket_job.c > +++ b/drivers/accel/rocket/rocket_job.c > @@ -20,10 +20,10 @@ > > #define JOB_TIMEOUT_MS 500 > > -static struct rocket_job * > -to_rocket_job(struct drm_sched_job *sched_job) > +static struct rocket_task * > +to_rocket_task(struct drm_sched_job *sched_job) > { > - return container_of(sched_job, struct rocket_job, base); > + return container_of(sched_job, struct rocket_task, base); > } > > static const char *rocket_fence_get_driver_name(struct dma_fence *fence) > @@ -61,8 +61,6 @@ rocket_copy_tasks(struct drm_device *dev, > struct drm_rocket_job *job, > struct rocket_job *rjob) > { > - int ret = 0; > - > if (job->task_struct_size < sizeof(struct drm_rocket_task)) > return -EINVAL; > > @@ -71,7 +69,7 @@ rocket_copy_tasks(struct drm_device *dev, > if (!rjob->task_count) > return 0; > > - rjob->tasks = kvmalloc_objs(*rjob->tasks, job->task_count); > + rjob->tasks = kvzalloc_objs(*rjob->tasks, job->task_count); > if (!rjob->tasks) { > drm_dbg(dev, "Failed to allocate task array\n"); > return -ENOMEM; > @@ -84,14 +82,12 @@ rocket_copy_tasks(struct drm_device *dev, > u64_to_user_ptr(job->tasks) + i * > job->task_struct_size, > sizeof(task))) { > drm_dbg(dev, "Failed to copy incoming tasks\n"); > - ret = -EFAULT; > - goto fail; > + return -EFAULT; > } > > if (task.regcmd_count == 0) { > drm_dbg(dev, "regcmd_count field in drm_rocket_task > should be > 0.\n"); > - ret = -EINVAL; > - goto fail; > + return -EINVAL; > } > > rjob->tasks[i].regcmd = task.regcmd; > @@ -99,26 +95,14 @@ rocket_copy_tasks(struct drm_device *dev, > } > > return 0; > - > -fail: > - kvfree(rjob->tasks); > - return ret; Not super obvious to the reader why this is now unnecessary? Could these changes be a separate patch? > } > […] > > static struct rocket_core *sched_to_core(struct rocket_device *rdev, > @@ -286,108 +307,102 @@ static struct rocket_core *sched_to_core(struct > rocket_device *rdev, > > static struct dma_fence *rocket_job_run(struct drm_sched_job *sched_job) > { > - struct rocket_job *job = to_rocket_job(sched_job); > - struct rocket_device *rdev = job->rdev; > - struct rocket_core *core = sched_to_core(rdev, sched_job->sched); > + struct rocket_task *task = to_rocket_task(sched_job); > + struct rocket_job *job = task->job; > + struct rocket_core *core = sched_to_core(job->rdev, sched_job->sched); > struct dma_fence *fence = NULL; > int ret; > > - if (unlikely(job->base.s_fence->finished.error)) > + if (unlikely(sched_job->s_fence->finished.error)) > return NULL; > > - /* > - * Nothing to execute: can happen if the job has finished while > - * we were resetting the NPU. > - */ > - if (job->next_task_idx == job->task_count) > + /* An earlier task of this inference timed out, abandon the inference. > */ > + if (atomic_read(&job->cancelled)) { I see that you touch some mutexes in this patch. Some seem to disappear. These atomics stay. They look a bit racy, too, don't they? > + dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED); > return NULL; > + } > > fence = rocket_fence_create(core); > if (IS_ERR(fence)) > return fence; > > - if (job->done_fence) > - dma_fence_put(job->done_fence); > - job->done_fence = dma_fence_get(fence); > + if (task->done_fence) > + dma_fence_put(task->done_fence); > + task->done_fence = dma_fence_get(fence); > > - ret = pm_runtime_get_sync(core->dev); > - if (ret < 0) > - return fence; > - > - ret = iommu_attach_group(job->domain->domain, core->iommu_group); > - if (ret < 0) > - return fence; > - > - scoped_guard(mutex, &core->job_lock) { > - core->in_flight_job = job; > - rocket_job_hw_submit(core, job); > + ret = pm_runtime_resume_and_get(core->dev); > + if (ret < 0) { > + dma_fence_put(fence); > + return ERR_PTR(ret); > } > > + /* Attach the domain once for the whole inference. */ > + if (!job->core) { > + ret = iommu_attach_group(job->domain->domain, > core->iommu_group); > + if (ret < 0) { > + pm_runtime_put_autosuspend(core->dev); > + dma_fence_put(fence); > + return ERR_PTR(ret); > + } > + job->core = core; > + } > + > + WRITE_ONCE(core->in_flight_task, task); The scoped_guard above disappears, and now a WRITE_ONCE appears. Hmm. The core doesn't have a lock? > + rocket_job_hw_submit(core, task); > + > return fence; > } > > static void rocket_job_handle_irq(struct rocket_core *core) > { > + struct rocket_task *task; > + > pm_runtime_mark_last_busy(core->dev); > > rocket_pc_writel(core, OPERATION_ENABLE, 0x0); > rocket_pc_writel(core, INTERRUPT_CLEAR, 0x1ffff); > > - scoped_guard(mutex, &core->job_lock) > - if (core->in_flight_job) { > - if (core->in_flight_job->next_task_idx < > core->in_flight_job->task_count) { > - rocket_job_hw_submit(core, core->in_flight_job); > - return; > - } > - > - iommu_detach_group(NULL, iommu_group_get(core->dev)); > - dma_fence_signal(core->in_flight_job->done_fence); > - pm_runtime_put_autosuspend(core->dev); > - core->in_flight_job = NULL; > - } > + /* > + * Claim the in-flight task: the reset path may run concurrently, so > + * whichever of us wins owns the PM put. > + */ > + task = xchg(&core->in_flight_task, NULL); Why is it sometimes WRITE_ONCE and sometimes xchg? I don't understand much of rocket's design, but this makes me feel uncomfortable about the overall locking / synchronization design. Regards P.
