On Thu, Feb 26, 2026 at 07:48:31PM +0000, Long Li wrote: > > The GF stats periodic query is used as a mechanism to monitor HWC health > > check. > > If this HWC command times out, it is a strong indication that the > > device/SoC is in a > > faulty state and requires recovery. > > > > Today, when a timeout is detected, the driver marks hwc_timeout_occurred, > > clears cached stats, and stops rescheduling the periodic work. However, the > > device itself is left in the same failing state. > > > > Extend the timeout handling path to trigger the existing MANA VF recovery > > service by queueing a GDMA_EQE_HWC_RESET_REQUEST work item. > > This is expected to initiate the appropriate recovery flow by suspend/resume > > first and if it fails then trigger a bus rescan. > > > > This change is intentionally limited to HWC command timeouts and does not > > trigger recovery for errors reported by the SoC as a normal command > > response. > > > > Signed-off-by: Dipayaan Roy <[email protected]> > > --- > > .../net/ethernet/microsoft/mana/gdma_main.c | 14 +++------- > > drivers/net/ethernet/microsoft/mana/mana_en.c | 28 ++++++++++++++++++- > > include/net/mana/gdma.h | 16 +++++++++-- > > 3 files changed, 45 insertions(+), 13 deletions(-) > > > > diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c > > b/drivers/net/ethernet/microsoft/mana/gdma_main.c > > index 0055c231acf6..16c438d2aaa3 100644 > > --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c > > +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c > > @@ -490,15 +490,9 @@ static void mana_serv_reset(struct pci_dev *pdev) > > dev_info(&pdev->dev, "MANA reset cycle completed\n"); > > > > out: > > - gc->in_service = false; > > + clear_bit(GC_IN_SERVICE, &gc->flags); > > } > > > > -struct mana_serv_work { > > - struct work_struct serv_work; > > - struct pci_dev *pdev; > > - enum gdma_eqe_type type; > > -}; > > - > > static void mana_do_service(enum gdma_eqe_type type, struct pci_dev *pdev) > > { > > switch 
(type) { > > @@ -542,7 +536,7 @@ static void mana_recovery_delayed_func(struct > > work_struct *w) > > spin_unlock_irqrestore(&work->lock, flags); } > > > > -static void mana_serv_func(struct work_struct *w) > > +void mana_serv_func(struct work_struct *w) > > { > > struct mana_serv_work *mns_wk; > > struct pci_dev *pdev; > > @@ -624,7 +618,7 @@ static void mana_gd_process_eqe(struct gdma_queue > > *eq) > > break; > > } > > > > - if (gc->in_service) { > > + if (test_bit(GC_IN_SERVICE, &gc->flags)) { > > dev_info(gc->dev, "Already in service\n"); > > break; > > } > > @@ -641,7 +635,7 @@ static void mana_gd_process_eqe(struct gdma_queue > > *eq) > > } > > > > dev_info(gc->dev, "Start MANA service type:%d\n", type); > > - gc->in_service = true; > > + set_bit(GC_IN_SERVICE, &gc->flags); > > mns_wk->pdev = to_pci_dev(gc->dev); > > mns_wk->type = type; > > pci_dev_get(mns_wk->pdev); > > diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c > > b/drivers/net/ethernet/microsoft/mana/mana_en.c > > index 91c418097284..8da574cf06f2 100644 > > --- a/drivers/net/ethernet/microsoft/mana/mana_en.c > > +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c > > @@ -879,7 +879,7 @@ static void mana_tx_timeout(struct net_device *netdev, > > unsigned int txqueue) > > struct gdma_context *gc = ac->gdma_dev->gdma_context; > > > > /* Already in service, hence tx queue reset is not required.*/ > > - if (gc->in_service) > > + if (test_bit(GC_IN_SERVICE, &gc->flags)) > > return; > > > > /* Note: If there are pending queue reset work for this port(apc), @@ - > > 3533,6 +3533,8 @@ static void mana_gf_stats_work_handler(struct work_struct > > *work) { > > struct mana_context *ac = > > container_of(to_delayed_work(work), struct mana_context, > > gf_stats_work); > > + struct gdma_context *gc = ac->gdma_dev->gdma_context; > > + struct mana_serv_work *mns_wk; > > int err; > > > > err = mana_query_gf_stats(ac); > > @@ -3540,6 +3542,30 @@ static void mana_gf_stats_work_handler(struct > > 
work_struct *work) > > /* HWC timeout detected - reset stats and stop rescheduling */ > > ac->hwc_timeout_occurred = true; > > memset(&ac->hc_stats, 0, sizeof(ac->hc_stats)); > > + dev_warn(gc->dev, > > + "Gf stats wk handler: gf stats query timed out.\n"); > > + > > + /* As HWC timed out, indicating a faulty HW state and needs a > > + * reset. > > + */ > > + if (!test_and_set_bit(GC_IN_SERVICE, &gc->flags)) { > > + if (!try_module_get(THIS_MODULE)) { > > + dev_info(gc->dev, "Module is unloading\n"); > > + return; > > + } > > + > > + mns_wk = kzalloc(sizeof(*mns_wk), GFP_ATOMIC); > > + if (!mns_wk) { > > + module_put(THIS_MODULE); > > Maybe it's not necessary: check if you want to call clear_bit(GC_IN_SERVICE, > &gc->flags) here? > Yes, it makes sense to clear it here. > > + return; > > + } > > + > > + mns_wk->pdev = to_pci_dev(gc->dev); > > + mns_wk->type = GDMA_EQE_HWC_RESET_REQUEST; > > + pci_dev_get(mns_wk->pdev); > > + INIT_WORK(&mns_wk->serv_work, mana_serv_func); > > + schedule_work(&mns_wk->serv_work); > > + } > > return; > > } > > schedule_delayed_work(&ac->gf_stats_work, > > MANA_GF_STATS_PERIOD); diff --git a/include/net/mana/gdma.h > 
Regards

