On Thu, Feb 26, 2026 at 07:48:31PM +0000, Long Li wrote:
> > The periodic GF stats query is used as a mechanism to monitor HWC health.
> > If this HWC command times out, it is a strong indication that the
> > device/SoC is in a faulty state and requires recovery.
> > 
> > Today, when a timeout is detected, the driver marks hwc_timeout_occurred,
> > clears cached stats, and stops rescheduling the periodic work. However, the
> > device itself is left in the same failing state.
> > 
> > Extend the timeout handling path to trigger the existing MANA VF recovery
> > service by queueing a GDMA_EQE_HWC_RESET_REQUEST work item.
> > This is expected to initiate the appropriate recovery flow: a suspend/resume
> > is attempted first and, if that fails, a bus rescan is triggered.
> > 
> > This change is intentionally limited to HWC command timeouts and does not
> > trigger recovery for errors reported by the SoC as a normal command 
> > response.
> > 
> > Signed-off-by: Dipayaan Roy <[email protected]>
> > ---
> >  .../net/ethernet/microsoft/mana/gdma_main.c   | 14 +++-------
> >  drivers/net/ethernet/microsoft/mana/mana_en.c | 28 ++++++++++++++++++-
> >  include/net/mana/gdma.h                       | 16 +++++++++--
> >  3 files changed, 45 insertions(+), 13 deletions(-)
> > 
> > diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > index 0055c231acf6..16c438d2aaa3 100644
> > --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > @@ -490,15 +490,9 @@ static void mana_serv_reset(struct pci_dev *pdev)
> >             dev_info(&pdev->dev, "MANA reset cycle completed\n");
> > 
> >  out:
> > -   gc->in_service = false;
> > +   clear_bit(GC_IN_SERVICE, &gc->flags);
> >  }
> > 
> > -struct mana_serv_work {
> > -   struct work_struct serv_work;
> > -   struct pci_dev *pdev;
> > -   enum gdma_eqe_type type;
> > -};
> > -
> >  static void mana_do_service(enum gdma_eqe_type type, struct pci_dev *pdev)
> > {
> >     switch (type) {
> > @@ -542,7 +536,7 @@ static void mana_recovery_delayed_func(struct
> > work_struct *w)
> >     spin_unlock_irqrestore(&work->lock, flags);  }
> > 
> > -static void mana_serv_func(struct work_struct *w)
> > +void mana_serv_func(struct work_struct *w)
> >  {
> >     struct mana_serv_work *mns_wk;
> >     struct pci_dev *pdev;
> > @@ -624,7 +618,7 @@ static void mana_gd_process_eqe(struct gdma_queue
> > *eq)
> >                     break;
> >             }
> > 
> > -           if (gc->in_service) {
> > +           if (test_bit(GC_IN_SERVICE, &gc->flags)) {
> >                     dev_info(gc->dev, "Already in service\n");
> >                     break;
> >             }
> > @@ -641,7 +635,7 @@ static void mana_gd_process_eqe(struct gdma_queue
> > *eq)
> >             }
> > 
> >             dev_info(gc->dev, "Start MANA service type:%d\n", type);
> > -           gc->in_service = true;
> > +           set_bit(GC_IN_SERVICE, &gc->flags);
> >             mns_wk->pdev = to_pci_dev(gc->dev);
> >             mns_wk->type = type;
> >             pci_dev_get(mns_wk->pdev);
> > diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c
> > b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > index 91c418097284..8da574cf06f2 100644
> > --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> > +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> > @@ -879,7 +879,7 @@ static void mana_tx_timeout(struct net_device *netdev,
> > unsigned int txqueue)
> >     struct gdma_context *gc = ac->gdma_dev->gdma_context;
> > 
> >     /* Already in service, hence tx queue reset is not required.*/
> > -   if (gc->in_service)
> > +   if (test_bit(GC_IN_SERVICE, &gc->flags))
> >             return;
> > 
> >     /* Note: If there are pending queue reset work for this port(apc), @@ -
> > 3533,6 +3533,8 @@ static void mana_gf_stats_work_handler(struct work_struct
> > *work)  {
> >     struct mana_context *ac =
> >             container_of(to_delayed_work(work), struct mana_context,
> > gf_stats_work);
> > +   struct gdma_context *gc = ac->gdma_dev->gdma_context;
> > +   struct mana_serv_work *mns_wk;
> >     int err;
> > 
> >     err = mana_query_gf_stats(ac);
> > @@ -3540,6 +3542,30 @@ static void mana_gf_stats_work_handler(struct
> > work_struct *work)
> >             /* HWC timeout detected - reset stats and stop rescheduling */
> >             ac->hwc_timeout_occurred = true;
> >             memset(&ac->hc_stats, 0, sizeof(ac->hc_stats));
> > +           dev_warn(gc->dev,
> > +                    "Gf stats wk handler: gf stats query timed out.\n");
> > +
> > +           /* As HWC timed out, indicating a faulty HW state and needs a
> > +            * reset.
> > +            */
> > +           if (!test_and_set_bit(GC_IN_SERVICE, &gc->flags)) {
> > +                   if (!try_module_get(THIS_MODULE)) {
> > +                           dev_info(gc->dev, "Module is unloading\n");
> > +                           return;
> > +                   }
> > +
> > +                   mns_wk = kzalloc(sizeof(*mns_wk), GFP_ATOMIC);
> > +                   if (!mns_wk) {
> > +                           module_put(THIS_MODULE);
> 
> Maybe it's not necessary, but check whether you want to call
> clear_bit(GC_IN_SERVICE, &gc->flags) here.
>
Yes, it makes sense to clear it here.
> > +                           return;
> > +                   }
> > +
> > +                   mns_wk->pdev = to_pci_dev(gc->dev);
> > +                   mns_wk->type = GDMA_EQE_HWC_RESET_REQUEST;
> > +                   pci_dev_get(mns_wk->pdev);
> > +                   INIT_WORK(&mns_wk->serv_work, mana_serv_func);
> > +                   schedule_work(&mns_wk->serv_work);
> > +           }
> >             return;
> >     }
> >     schedule_delayed_work(&ac->gf_stats_work,
> > MANA_GF_STATS_PERIOD); diff --git a/include/net/mana/gdma.h
> 

Regards


Reply via email to