On Wed, Oct 29, 2025 at 01:05:42AM -0700, Jingyi Wang wrote: > From: Gokul krishna Krishnakumar <[email protected]> > > Subsystems can be brought out of reset by entities such as > bootloaders.
Good start of the commit message. > Before attaching such subsystems, it is important to > check the state of the subsystem. Why? I see three possible outcomes: 1) The subsystem was booted and is still running just fine. 2) The subsystem hit fatal error and flagged this in smp2p 3) The subsystem hit a wdog and I presume there would be a interrupt waiting for us as soon as we register a handler? Perhaps I'm wrong about the semantics of #3? If so this should be clearly documented in the commit message. Also, at this point in the commit message you've established the problem, the remainder is a description of how you're addressing the problem. A paragraph break would be suitable. > This patch adds support to attach > to a subsystem by ensuring that the subsystem is in a sane state by > reading SMP2P bits and pinging the subsystem. > > Signed-off-by: Gokul krishna Krishnakumar <[email protected]> I would prefer email addresses to be all lowercase, but more importantly, you lost the tail end of that address (same with author). > Co-developed-by: Jingyi Wang <[email protected]> > Signed-off-by: Jingyi Wang <[email protected]> > --- > drivers/remoteproc/qcom_q6v5.c | 89 > ++++++++++++++++++++++++++++++++++++- > drivers/remoteproc/qcom_q6v5.h | 14 +++++- > drivers/remoteproc/qcom_q6v5_adsp.c | 2 +- > drivers/remoteproc/qcom_q6v5_mss.c | 2 +- > drivers/remoteproc/qcom_q6v5_pas.c | 63 +++++++++++++++++++++++++- > 5 files changed, 165 insertions(+), 5 deletions(-) > > diff --git a/drivers/remoteproc/qcom_q6v5.c b/drivers/remoteproc/qcom_q6v5.c > index 58d5b85e58cd..4ce9e43fc5c7 100644 > --- a/drivers/remoteproc/qcom_q6v5.c > +++ b/drivers/remoteproc/qcom_q6v5.c > @@ -94,6 +94,9 @@ static irqreturn_t q6v5_wdog_interrupt(int irq, void *data) > size_t len; > char *msg; > > + if (q6v5->early_boot) > + complete(&q6v5->subsys_booted); > + > /* Sometimes the stop triggers a watchdog rather than a stop-ack */ > if (!q6v5->running) { > complete(&q6v5->stop_done); > @@ -118,6 +121,9 @@ static irqreturn_t q6v5_fatal_interrupt(int irq, void > *data) > size_t len; > char *msg; > > + if (q6v5->early_boot) > + complete(&q6v5->subsys_booted); > + > if (!q6v5->running) > return IRQ_HANDLED; > > @@ -139,6 +145,9 @@ static irqreturn_t q6v5_ready_interrupt(int irq, void > *data) > > complete(&q6v5->start_done); > > + if (q6v5->early_boot) > + complete(&q6v5->subsys_booted); > + > return IRQ_HANDLED; > } > > @@ -172,6 +181,9 @@ static irqreturn_t q6v5_handover_interrupt(int irq, void > *data) > if (q6v5->handover) > q6v5->handover(q6v5); > > + if (q6v5->early_boot) > + complete(&q6v5->subsys_booted); > + > icc_set_bw(q6v5->path, 0, 0); > > q6v5->handover_issued = true; > @@ -234,6 +246,77 @@ unsigned long qcom_q6v5_panic(struct qcom_q6v5 *q6v5) > } > EXPORT_SYMBOL_GPL(qcom_q6v5_panic); > > +static irqreturn_t q6v5_pong_interrupt(int irq, void *data) > +{ > + struct qcom_q6v5 *q6v5 = data; > + > + complete(&q6v5->ping_done); > + > + return IRQ_HANDLED; > +} > + > +int qcom_q6v5_ping_subsystem(struct qcom_q6v5 *q6v5) > +{ > + int ret; > + int ping_failed = 0; > + > + reinit_completion(&q6v5->ping_done); > + > + /* Set master kernel Ping bit */ > + ret = qcom_smem_state_update_bits(q6v5->ping_state, > + BIT(q6v5->ping_bit), > BIT(q6v5->ping_bit)); > + if (ret) { > + dev_err(q6v5->dev, "Failed to update ping bits\n"); > + return ret; > + } > + > + ret = wait_for_completion_timeout(&q6v5->ping_done, > msecs_to_jiffies(PING_TIMEOUT)); > + if (!ret) { > + ping_failed = -ETIMEDOUT; > + dev_err(q6v5->dev, "Failed to get back pong\n"); > + } > + > + /* Clear ping bit master kernel */ > + ret = qcom_smem_state_update_bits(q6v5->ping_state, > BIT(q6v5->ping_bit), 0); > + if (ret) { > + pr_err("Failed to clear master kernel bits\n"); Two dev_err and one pr_err? > + return ret; > + } > + > + if (ping_failed) > + return ping_failed; > + > + return 0; > +} > +EXPORT_SYMBOL_GPL(qcom_q6v5_ping_subsystem); > + > +int qcom_q6v5_ping_subsystem_init(struct qcom_q6v5 *q6v5, struct > platform_device *pdev) > +{ > + int ret = -ENODEV; > + > + q6v5->ping_state = devm_qcom_smem_state_get(&pdev->dev, "ping", > &q6v5->ping_bit); > + if (IS_ERR(q6v5->ping_state)) { > + dev_err(&pdev->dev, "failed to acquire smem state %ld\n", > + PTR_ERR(q6v5->ping_state)); > + return ret; > + } > + > + q6v5->pong_irq = platform_get_irq_byname(pdev, "pong"); > + if (q6v5->pong_irq < 0) > + return q6v5->pong_irq; > + > + ret = devm_request_threaded_irq(&pdev->dev, q6v5->pong_irq, NULL, > + q6v5_pong_interrupt, > IRQF_TRIGGER_RISING | IRQF_ONESHOT, > + "q6v5 pong", q6v5); > + if (ret) > + dev_err(&pdev->dev, "failed to acquire pong IRQ\n"); > + > + init_completion(&q6v5->ping_done); > + > + return ret; > +} > +EXPORT_SYMBOL_GPL(qcom_q6v5_ping_subsystem_init); > + > /** > * qcom_q6v5_init() - initializer of the q6v5 common struct > * @q6v5: handle to be initialized > @@ -247,7 +330,7 @@ EXPORT_SYMBOL_GPL(qcom_q6v5_panic); > */ > int qcom_q6v5_init(struct qcom_q6v5 *q6v5, struct platform_device *pdev, > struct rproc *rproc, int crash_reason, const char > *load_state, > - void (*handover)(struct qcom_q6v5 *q6v5)) > + bool early_boot, void (*handover)(struct qcom_q6v5 *q6v5)) > { > int ret; > > @@ -255,10 +338,14 @@ int qcom_q6v5_init(struct qcom_q6v5 *q6v5, struct > platform_device *pdev, > q6v5->dev = &pdev->dev; > q6v5->crash_reason = crash_reason; > q6v5->handover = handover; > + q6v5->early_boot = early_boot; > > init_completion(&q6v5->start_done); > init_completion(&q6v5->stop_done); > > + if (early_boot) > + init_completion(&q6v5->subsys_booted); > + > q6v5->wdog_irq = platform_get_irq_byname(pdev, "wdog"); > if (q6v5->wdog_irq < 0) > return q6v5->wdog_irq; > diff --git a/drivers/remoteproc/qcom_q6v5.h b/drivers/remoteproc/qcom_q6v5.h > index 5a859c41896e..8a227bf70d7e 100644 > --- a/drivers/remoteproc/qcom_q6v5.h > +++ b/drivers/remoteproc/qcom_q6v5.h > @@ -12,27 +12,35 @@ struct rproc; > struct qcom_smem_state; > struct qcom_sysmon; > > +#define PING_TIMEOUT 500 /* in milliseconds */ > +#define PING_TEST_WAIT 500 /* in milliseconds */ > + > struct qcom_q6v5 { > struct device *dev; > struct rproc *rproc; > > struct qcom_smem_state *state; > + struct qcom_smem_state *ping_state; > struct qmp *qmp; > > struct icc_path *path; > > unsigned stop_bit; > + unsigned int ping_bit; > > int wdog_irq; > int fatal_irq; > int ready_irq; > int handover_irq; > int stop_irq; > + int pong_irq; > > bool handover_issued; > > struct completion start_done; > struct completion stop_done; > + struct completion subsys_booted; > + struct completion ping_done; > > int crash_reason; > > @@ -40,11 +48,13 @@ struct qcom_q6v5 { > > const char *load_state; > void (*handover)(struct qcom_q6v5 *q6v5); > + > + bool early_boot; > }; > > int qcom_q6v5_init(struct qcom_q6v5 *q6v5, struct platform_device *pdev, > struct rproc *rproc, int crash_reason, const char > *load_state, > - void (*handover)(struct qcom_q6v5 *q6v5)); > + bool early_boot, void (*handover)(struct qcom_q6v5 *q6v5)); > void qcom_q6v5_deinit(struct qcom_q6v5 *q6v5); > > int qcom_q6v5_prepare(struct qcom_q6v5 *q6v5); > @@ -52,5 +62,7 @@ int qcom_q6v5_unprepare(struct qcom_q6v5 *q6v5); > int qcom_q6v5_request_stop(struct qcom_q6v5 *q6v5, struct qcom_sysmon > *sysmon); > int qcom_q6v5_wait_for_start(struct qcom_q6v5 *q6v5, int timeout); > unsigned long qcom_q6v5_panic(struct qcom_q6v5 *q6v5); > +int qcom_q6v5_ping_subsystem(struct qcom_q6v5 *q6v5); > +int qcom_q6v5_ping_subsystem_init(struct qcom_q6v5 *q6v5, struct > platform_device *pdev); > > #endif > diff --git a/drivers/remoteproc/qcom_q6v5_adsp.c > b/drivers/remoteproc/qcom_q6v5_adsp.c > index e98b7e03162c..1576b435b921 100644 > --- a/drivers/remoteproc/qcom_q6v5_adsp.c > +++ b/drivers/remoteproc/qcom_q6v5_adsp.c > @@ -717,7 +717,7 @@ static int adsp_probe(struct platform_device *pdev) > goto disable_pm; > > ret = qcom_q6v5_init(&adsp->q6v5, pdev, rproc, desc->crash_reason_smem, > - desc->load_state, qcom_adsp_pil_handover); > + desc->load_state, false, qcom_adsp_pil_handover); > if (ret) > goto disable_pm; > > diff --git a/drivers/remoteproc/qcom_q6v5_mss.c > b/drivers/remoteproc/qcom_q6v5_mss.c > index 3087d895b87f..ee9bf048820a 100644 > --- a/drivers/remoteproc/qcom_q6v5_mss.c > +++ b/drivers/remoteproc/qcom_q6v5_mss.c > @@ -2165,7 +2165,7 @@ static int q6v5_probe(struct platform_device *pdev) > qproc->has_mba_logs = desc->has_mba_logs; > > ret = qcom_q6v5_init(&qproc->q6v5, pdev, rproc, MPSS_CRASH_REASON_SMEM, > "modem", > - qcom_msa_handover); > + false, qcom_msa_handover); > if (ret) > goto detach_proxy_pds; > > diff --git a/drivers/remoteproc/qcom_q6v5_pas.c > b/drivers/remoteproc/qcom_q6v5_pas.c > index 158bcd6cc85c..b667c11aadb5 100644 > --- a/drivers/remoteproc/qcom_q6v5_pas.c > +++ b/drivers/remoteproc/qcom_q6v5_pas.c > @@ -35,6 +35,8 @@ > > #define MAX_ASSIGN_COUNT 3 > > +#define EARLY_BOOT_RETRY_INTERVAL_MS 5000 "retry" makes it sounds like we're doing something in a loop, but as far as I can tell this is the "attach timeout". > + > struct qcom_pas_data { > int crash_reason_smem; > const char *firmware_name; > @@ -59,6 +61,7 @@ struct qcom_pas_data { > int region_assign_count; > bool region_assign_shared; > int region_assign_vmid; > + bool early_boot; > }; > > struct qcom_pas { > @@ -409,6 +412,8 @@ static int qcom_pas_stop(struct rproc *rproc) > if (pas->smem_host_id) > ret = qcom_smem_bust_hwspin_lock_by_host(pas->smem_host_id); > > + pas->q6v5.early_boot = false; > + > return ret; > } > > @@ -434,6 +439,51 @@ static unsigned long qcom_pas_panic(struct rproc *rproc) > return qcom_q6v5_panic(&pas->q6v5); > } > > +static int qcom_pas_attach(struct rproc *rproc) > +{ > + int ret; > + struct qcom_pas *adsp = rproc->priv; Why is this variable named "adsp"? > + bool ready_state; > + bool crash_state; > + > + if (!adsp->q6v5.early_boot) This would imply that rproc->state == RPROC_DETACHED for a subsystem with !early_boot. That shouldn't be possible, I think you should be more vocal about this. E.g. by making this: if (WARN_ON(!adsp->q6v5.early_boot)) Or just decide that it's too defensive and drop the check. > + return -EINVAL; > + > + ret = irq_get_irqchip_state(adsp->q6v5.fatal_irq, > + IRQCHIP_STATE_LINE_LEVEL, &crash_state); > + > + if (crash_state) { > + dev_err(adsp->dev, "Sub system has crashed before driver > probe\n"); > + adsp->rproc->state = RPROC_CRASHED; We're attaching to a running subsystem, we conclude that it has crashed, and we're doing nothing? Why don't we call rproc_report_crash() here? > + return -EINVAL; > + } > + > + ret = irq_get_irqchip_state(adsp->q6v5.ready_irq, > + IRQCHIP_STATE_LINE_LEVEL, &ready_state); > + > + if (ready_state) { > + dev_info(adsp->dev, "Sub system has boot-up before driver > probe\n"); What does this mean? Success? > + adsp->rproc->state = RPROC_DETACHED; Why do we just put it back in RPROC_DETACHED? What will happen now? Isn't this the point where we should mark it as RPROC_RUNNING? > + } else { /* * The early-boot subsystem has not crashed, nor signalled that it's * ready, wait for 5 more seconds in case it's late. */ That said, this begs the questions: can this really happen and why 5 seconds? > + ret = wait_for_completion_timeout(&adsp->q6v5.subsys_booted, > + > msecs_to_jiffies(EARLY_BOOT_RETRY_INTERVAL_MS)); > + if (!ret) { > + dev_err(adsp->dev, "Timeout on waiting for subsystem > interrupt\n"); So, we gave the subsystem timeof(kernel-boot) + 5 seconds to show sign of life, and it didn't show up. To me there are two possible reasons: 1) The bootloader never started the subsystem 2) The bootloader started it, it somehow ended up in a faulty scenario and we're failing to detect that. #1 seems like a perfectly reasonable scenario e.g. during bringup and/or development, and the path out of it seems to be to start the subsystem. #2 would be a faulty scenario that we want to debug, but the user probably want their subsystem (re)booted in the meantime. > + return -ETIMEDOUT; > + } We didn't time out, the subsystem did tell us something happen. There are four sources of this event, one of those correspond to our "start_done" state during normal startup. Should we really treat all four sources the same way here? For two of the events, we can immediately attempt a recovery, no need to ping the subsystem and wait for the timeout. > + } > + > + ret = qcom_q6v5_ping_subsystem(&adsp->q6v5); > + if (ret) { > + dev_err(adsp->dev, "Failed to ping subsystem, assuming device > crashed\n"); > + rproc->state = RPROC_CRASHED; As above, you mark it as crashed...and then what? > + return ret; > + } > + > + adsp->q6v5.running = true; > + return ret; > +} > + > static const struct rproc_ops qcom_pas_ops = { > .unprepare = qcom_pas_unprepare, > .start = qcom_pas_start, > @@ -442,6 +492,7 @@ static const struct rproc_ops qcom_pas_ops = { > .parse_fw = qcom_register_dump_segments, > .load = qcom_pas_load, > .panic = qcom_pas_panic, > + .attach = qcom_pas_attach, > }; > > static const struct rproc_ops qcom_pas_minidump_ops = { > @@ -765,7 +816,7 @@ static int qcom_pas_probe(struct platform_device *pdev) > pas->proxy_pd_count = ret; > > ret = qcom_q6v5_init(&pas->q6v5, pdev, rproc, desc->crash_reason_smem, > - desc->load_state, qcom_pas_handover); > + desc->load_state, desc->early_boot, > qcom_pas_handover); > if (ret) > goto detach_proxy_pds; > > @@ -779,6 +830,16 @@ static int qcom_pas_probe(struct platform_device *pdev) > } > > qcom_add_ssr_subdev(rproc, &pas->ssr_subdev, desc->ssr_name); > + > + if (pas->q6v5.early_boot) { > + ret = qcom_q6v5_ping_subsystem_init(&pas->q6v5, pdev); > + if (ret) > + dev_err(&pdev->dev, > + "Unable to find ping/pong bits, falling back to > firmware load\n"); You already printed more specific errors in qcom_q6v5_ping_subsystem_init(), no need to print once more here. Regards, Bjorn > + else > + pas->rproc->state = RPROC_DETACHED; > + } > + > ret = rproc_add(rproc); > if (ret) > goto remove_ssr_sysmon; > > -- > 2.25.1 >

