> The GF stats periodic query is used as a mechanism to monitor HWC health. > If this HWC command times out, it is a strong indication that the device/SoC > is in a faulty state and requires recovery. > > Today, when a timeout is detected, the driver marks hwc_timeout_occurred, > clears cached stats, and stops rescheduling the periodic work. However, the > device itself is left in the same failing state. > > Extend the timeout handling path to trigger the existing MANA VF recovery > service by queueing a GDMA_EQE_HWC_RESET_REQUEST work item. > This is expected to initiate the appropriate recovery flow by attempting suspend/resume > first and, if that fails, triggering a bus rescan. > > This change is intentionally limited to HWC command timeouts and does not > trigger recovery for errors reported by the SoC as a normal command response. > > Signed-off-by: Dipayaan Roy <[email protected]> > --- > .../net/ethernet/microsoft/mana/gdma_main.c | 14 +++------- > drivers/net/ethernet/microsoft/mana/mana_en.c | 28 ++++++++++++++++++- > include/net/mana/gdma.h | 16 +++++++++-- > 3 files changed, 45 insertions(+), 13 deletions(-) > > diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c > b/drivers/net/ethernet/microsoft/mana/gdma_main.c > index 0055c231acf6..16c438d2aaa3 100644 > --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c > +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c > @@ -490,15 +490,9 @@ static void mana_serv_reset(struct pci_dev *pdev) > dev_info(&pdev->dev, "MANA reset cycle completed\n"); > > out: > - gc->in_service = false; > + clear_bit(GC_IN_SERVICE, &gc->flags); > } > > -struct mana_serv_work { > - struct work_struct serv_work; > - struct pci_dev *pdev; > - enum gdma_eqe_type type; > -}; > - > static void mana_do_service(enum gdma_eqe_type type, struct pci_dev *pdev) > { > switch (type) { > @@ -542,7 +536,7 @@ static void mana_recovery_delayed_func(struct > work_struct *w) > spin_unlock_irqrestore(&work->lock, flags); } > > -static void 
mana_serv_func(struct work_struct *w) > +void mana_serv_func(struct work_struct *w) > { > struct mana_serv_work *mns_wk; > struct pci_dev *pdev; > @@ -624,7 +618,7 @@ static void mana_gd_process_eqe(struct gdma_queue > *eq) > break; > } > > - if (gc->in_service) { > + if (test_bit(GC_IN_SERVICE, &gc->flags)) { > dev_info(gc->dev, "Already in service\n"); > break; > } > @@ -641,7 +635,7 @@ static void mana_gd_process_eqe(struct gdma_queue > *eq) > } > > dev_info(gc->dev, "Start MANA service type:%d\n", type); > - gc->in_service = true; > + set_bit(GC_IN_SERVICE, &gc->flags); > mns_wk->pdev = to_pci_dev(gc->dev); > mns_wk->type = type; > pci_dev_get(mns_wk->pdev); > diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c > b/drivers/net/ethernet/microsoft/mana/mana_en.c > index 91c418097284..8da574cf06f2 100644 > --- a/drivers/net/ethernet/microsoft/mana/mana_en.c > +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c > @@ -879,7 +879,7 @@ static void mana_tx_timeout(struct net_device *netdev, > unsigned int txqueue) > struct gdma_context *gc = ac->gdma_dev->gdma_context; > > /* Already in service, hence tx queue reset is not required.*/ > - if (gc->in_service) > + if (test_bit(GC_IN_SERVICE, &gc->flags)) > return; > > /* Note: If there are pending queue reset work for this port(apc), @@ - > 3533,6 +3533,8 @@ static void mana_gf_stats_work_handler(struct work_struct > *work) { > struct mana_context *ac = > container_of(to_delayed_work(work), struct mana_context, > gf_stats_work); > + struct gdma_context *gc = ac->gdma_dev->gdma_context; > + struct mana_serv_work *mns_wk; > int err; > > err = mana_query_gf_stats(ac); > @@ -3540,6 +3542,30 @@ static void mana_gf_stats_work_handler(struct > work_struct *work) > /* HWC timeout detected - reset stats and stop rescheduling */ > ac->hwc_timeout_occurred = true; > memset(&ac->hc_stats, 0, sizeof(ac->hc_stats)); > + dev_warn(gc->dev, > + "Gf stats wk handler: gf stats query timed out.\n"); > + > + /* As HWC timed out, 
indicating a faulty HW state and needs a > + * reset. > + */ > + if (!test_and_set_bit(GC_IN_SERVICE, &gc->flags)) { > + if (!try_module_get(THIS_MODULE)) { > + dev_info(gc->dev, "Module is unloading\n"); > + return; > + } > + > + mns_wk = kzalloc(sizeof(*mns_wk), GFP_ATOMIC); > + if (!mns_wk) { > + module_put(THIS_MODULE);
This may not be strictly necessary, but should clear_bit(GC_IN_SERVICE, &gc->flags) be called here? test_and_set_bit() above has already set the bit, and this path returns without queueing the service work. > + return; > + } > + > + mns_wk->pdev = to_pci_dev(gc->dev); > + mns_wk->type = GDMA_EQE_HWC_RESET_REQUEST; > + pci_dev_get(mns_wk->pdev); > + INIT_WORK(&mns_wk->serv_work, mana_serv_func); > + schedule_work(&mns_wk->serv_work); > + } > return; > } > schedule_delayed_work(&ac->gf_stats_work, > MANA_GF_STATS_PERIOD); diff --git a/include/net/mana/gdma.h

