> The periodic GF stats query is used as a mechanism to monitor HWC health.
> If this HWC command times out, it is a strong indication that the device/SoC
> is in a faulty state and requires recovery.
> 
> Today, when a timeout is detected, the driver marks hwc_timeout_occurred,
> clears cached stats, and stops rescheduling the periodic work. However, the
> device itself is left in the same failing state.
> 
> Extend the timeout handling path to trigger the existing MANA VF recovery
> service by queueing a GDMA_EQE_HWC_RESET_REQUEST work item.
> This is expected to initiate the appropriate recovery flow by attempting a
> suspend/resume first and, if that fails, triggering a bus rescan.
> 
> This change is intentionally limited to HWC command timeouts and does not
> trigger recovery for errors reported by the SoC as a normal command response.
> 
> Signed-off-by: Dipayaan Roy <[email protected]>
> ---
>  .../net/ethernet/microsoft/mana/gdma_main.c   | 14 +++-------
>  drivers/net/ethernet/microsoft/mana/mana_en.c | 28 ++++++++++++++++++-
>  include/net/mana/gdma.h                       | 16 +++++++++--
>  3 files changed, 45 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> index 0055c231acf6..16c438d2aaa3 100644
> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> @@ -490,15 +490,9 @@ static void mana_serv_reset(struct pci_dev *pdev)
>               dev_info(&pdev->dev, "MANA reset cycle completed\n");
> 
>  out:
> -     gc->in_service = false;
> +     clear_bit(GC_IN_SERVICE, &gc->flags);
>  }
> 
> -struct mana_serv_work {
> -     struct work_struct serv_work;
> -     struct pci_dev *pdev;
> -     enum gdma_eqe_type type;
> -};
> -
>  static void mana_do_service(enum gdma_eqe_type type, struct pci_dev *pdev)
> {
>       switch (type) {
> @@ -542,7 +536,7 @@ static void mana_recovery_delayed_func(struct
> work_struct *w)
>       spin_unlock_irqrestore(&work->lock, flags);  }
> 
> -static void mana_serv_func(struct work_struct *w)
> +void mana_serv_func(struct work_struct *w)
>  {
>       struct mana_serv_work *mns_wk;
>       struct pci_dev *pdev;
> @@ -624,7 +618,7 @@ static void mana_gd_process_eqe(struct gdma_queue
> *eq)
>                       break;
>               }
> 
> -             if (gc->in_service) {
> +             if (test_bit(GC_IN_SERVICE, &gc->flags)) {
>                       dev_info(gc->dev, "Already in service\n");
>                       break;
>               }
> @@ -641,7 +635,7 @@ static void mana_gd_process_eqe(struct gdma_queue
> *eq)
>               }
> 
>               dev_info(gc->dev, "Start MANA service type:%d\n", type);
> -             gc->in_service = true;
> +             set_bit(GC_IN_SERVICE, &gc->flags);
>               mns_wk->pdev = to_pci_dev(gc->dev);
>               mns_wk->type = type;
>               pci_dev_get(mns_wk->pdev);
> diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c
> b/drivers/net/ethernet/microsoft/mana/mana_en.c
> index 91c418097284..8da574cf06f2 100644
> --- a/drivers/net/ethernet/microsoft/mana/mana_en.c
> +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
> @@ -879,7 +879,7 @@ static void mana_tx_timeout(struct net_device *netdev,
> unsigned int txqueue)
>       struct gdma_context *gc = ac->gdma_dev->gdma_context;
> 
>       /* Already in service, hence tx queue reset is not required.*/
> -     if (gc->in_service)
> +     if (test_bit(GC_IN_SERVICE, &gc->flags))
>               return;
> 
>       /* Note: If there are pending queue reset work for this port(apc), @@ -
> 3533,6 +3533,8 @@ static void mana_gf_stats_work_handler(struct work_struct
> *work)  {
>       struct mana_context *ac =
>               container_of(to_delayed_work(work), struct mana_context,
> gf_stats_work);
> +     struct gdma_context *gc = ac->gdma_dev->gdma_context;
> +     struct mana_serv_work *mns_wk;
>       int err;
> 
>       err = mana_query_gf_stats(ac);
> @@ -3540,6 +3542,30 @@ static void mana_gf_stats_work_handler(struct
> work_struct *work)
>               /* HWC timeout detected - reset stats and stop rescheduling */
>               ac->hwc_timeout_occurred = true;
>               memset(&ac->hc_stats, 0, sizeof(ac->hc_stats));
> +             dev_warn(gc->dev,
> +                      "Gf stats wk handler: gf stats query timed out.\n");
> +
> +             /* As HWC timed out, indicating a faulty HW state and needs a
> +              * reset.
> +              */
> +             if (!test_and_set_bit(GC_IN_SERVICE, &gc->flags)) {
> +                     if (!try_module_get(THIS_MODULE)) {
> +                             dev_info(gc->dev, "Module is unloading\n");
> +                             return;
> +                     }
> +
> +                     mns_wk = kzalloc(sizeof(*mns_wk), GFP_ATOMIC);
> +                     if (!mns_wk) {
> +                             module_put(THIS_MODULE);

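This may not be necessary, but please check whether you want to call
clear_bit(GC_IN_SERVICE, &gc->flags) here: the bit was set by
test_and_set_bit() above, and on this error path no service work is queued,
so it looks like the bit would stay set.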

> +                             return;
> +                     }
> +
> +                     mns_wk->pdev = to_pci_dev(gc->dev);
> +                     mns_wk->type = GDMA_EQE_HWC_RESET_REQUEST;
> +                     pci_dev_get(mns_wk->pdev);
> +                     INIT_WORK(&mns_wk->serv_work, mana_serv_func);
> +                     schedule_work(&mns_wk->serv_work);
> +             }
>               return;
>       }
>       schedule_delayed_work(&ac->gf_stats_work,
> MANA_GF_STATS_PERIOD); diff --git a/include/net/mana/gdma.h


Reply via email to