On Thu, Aug 28, 2025 at 5:01 PM Dave Airlie <[email protected]> wrote:
>
> From: Dave Airlie <[email protected]>
>
> Nouveau has code that, when it gets an IRQ with no allowed handler,
> disables it to avoid storms.
>
> However with nonstall interrupts, we often disable them from
> the drm driver, but still request their emission via the push submission.
>
> Just don't disable nonstall irqs ever in normal operation, the
> event handling code will filter them out, and the driver will
> just enable/disable them at load time.
>
> This fixes timeouts we've been seeing on/off for a long time,
> but they became a lot more noticeable on Blackwell.
>
> This doesn't fix all of them; there is a subsequent fence emission
> fix to fix the last few.
>
> Fixes: 3ebd64aa3c4f ("drm/nouveau/intr: support multiple trees, and explicit
> interfaces")
> Cc: [email protected]
> Signed-off-by: Dave Airlie <[email protected]>
> ---
> .../gpu/drm/nouveau/nvkm/engine/fifo/base.c | 2 ++
> .../gpu/drm/nouveau/nvkm/engine/fifo/ga100.c | 22 ++++++++++++-------
> .../gpu/drm/nouveau/nvkm/engine/fifo/priv.h | 2 ++
> .../nouveau/nvkm/subdev/gsp/rm/r535/fifo.c | 2 +-
> 4 files changed, 19 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/base.c
> b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/base.c
> index fdffa0391b31..6fd4e60634fb 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/base.c
> +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/base.c
> @@ -350,6 +350,8 @@ nvkm_fifo_dtor(struct nvkm_engine *engine)
> nvkm_chid_unref(&fifo->chid);
>
> nvkm_event_fini(&fifo->nonstall.event);
> + if (fifo->func->nonstall_dtor)
> + fifo->func->nonstall_dtor(fifo);
> mutex_destroy(&fifo->mutex);
>
> if (fifo->func->dtor)
> diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga100.c
> b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga100.c
> index e74493a4569e..81beae473122 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga100.c
> +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/ga100.c
> @@ -517,19 +517,11 @@ ga100_fifo_nonstall_intr(struct nvkm_inth *inth)
> static void
> ga100_fifo_nonstall_block(struct nvkm_event *event, int type, int index)
> {
> - struct nvkm_fifo *fifo = container_of(event, typeof(*fifo),
> nonstall.event);
> - struct nvkm_runl *runl = nvkm_runl_get(fifo, index, 0);
> -
> - nvkm_inth_block(&runl->nonstall.inth);
> }
>
> static void
> ga100_fifo_nonstall_allow(struct nvkm_event *event, int type, int index)
> {
> - struct nvkm_fifo *fifo = container_of(event, typeof(*fifo),
> nonstall.event);
> - struct nvkm_runl *runl = nvkm_runl_get(fifo, index, 0);
> -
> - nvkm_inth_allow(&runl->nonstall.inth);
> }
>
> const struct nvkm_event_func
> @@ -564,12 +556,25 @@ ga100_fifo_nonstall_ctor(struct nvkm_fifo *fifo)
> if (ret)
> return ret;
>
> + nvkm_inth_allow(&runl->nonstall.inth);
> +
> nr = max(nr, runl->id + 1);
> }
>
> return nr;
> }
>
> +void
> +ga100_fifo_nonstall_dtor(struct nvkm_fifo *fifo)
> +{
> + struct nvkm_runl *runl;
> + nvkm_runl_foreach(runl, fifo) {
> + if (runl->nonstall.vector < 0)
> + continue;
> + nvkm_inth_block(&runl->nonstall.inth);
> + }
> +}
> +
> int
> ga100_fifo_runl_ctor(struct nvkm_fifo *fifo)
> {
> @@ -599,6 +604,7 @@ ga100_fifo = {
> .runl_ctor = ga100_fifo_runl_ctor,
> .mmu_fault = &tu102_fifo_mmu_fault,
> .nonstall_ctor = ga100_fifo_nonstall_ctor,
> + .nonstall_dtor = ga100_fifo_nonstall_dtor,
You're missing the corresponding update in ga102.c, which is what
actually covers most of the drivers. Honestly, I'm not even sure why
there are two files. They look identical to me.
~Faith
> .nonstall = &ga100_fifo_nonstall,
> .runl = &ga100_runl,
> .runq = &ga100_runq,
> diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/priv.h
> b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/priv.h
> index 5e81ae195329..fff1428ef267 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/priv.h
> +++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/priv.h
> @@ -41,6 +41,7 @@ struct nvkm_fifo_func {
> void (*start)(struct nvkm_fifo *, unsigned long *);
>
> int (*nonstall_ctor)(struct nvkm_fifo *);
> + void (*nonstall_dtor)(struct nvkm_fifo *);
> const struct nvkm_event_func *nonstall;
>
> const struct nvkm_runl_func *runl;
> @@ -200,6 +201,7 @@ u32 tu102_chan_doorbell_handle(struct nvkm_chan *);
>
> int ga100_fifo_runl_ctor(struct nvkm_fifo *);
> int ga100_fifo_nonstall_ctor(struct nvkm_fifo *);
> +void ga100_fifo_nonstall_dtor(struct nvkm_fifo *);
> extern const struct nvkm_event_func ga100_fifo_nonstall;
> extern const struct nvkm_runl_func ga100_runl;
> extern const struct nvkm_runq_func ga100_runq;
> diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/fifo.c
> b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/fifo.c
> index 1ac5628c5140..b8be0a872e7a 100644
> --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/fifo.c
> +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/fifo.c
> @@ -601,7 +601,7 @@ r535_fifo_new(const struct nvkm_fifo_func *hw, struct
> nvkm_device *device,
> rm->chan.func = &r535_chan;
> rm->nonstall = &ga100_fifo_nonstall;
> rm->nonstall_ctor = ga100_fifo_nonstall_ctor;
> -
> + rm->nonstall_dtor = ga100_fifo_nonstall_dtor;
> return nvkm_fifo_new_(rm, device, type, inst, pfifo);
> }
>
> --
> 2.50.1
>