The TX reporter reports an error in two scenarios:
- TX timeout on a specific tx queue
- TX completion error on a specific send queue
Prior to this patch, no dump data was supported by the tx reporter. This
patch adds support for a SW data dump of the related SQ context. The dump
is simply the SQ's raw memory snapshot taken right after the error was
reported, before any recovery procedure was launched. With this
approach, no maintenance is needed, as the driver fetches the actual data
according to the layout with which the SQ was compiled. By providing
a SW context, one can easily debug an error on a given SQ.

In order to translate the raw memory offline into a human-readable
format, the user can use an out-of-kernel script which receives the
following as input:
- Object raw memory
- Driver object compiled with debug info (can be taken/generated at any time
  from the machine)
- Object name

An example of such a script's output can be seen below.
Note: the script is not offered as part of this patch as it does not
belong in the kernel; I describe it only to convey the general
idea of how/what can be fetched from a SW dump via devlink health.

The output of the SW dump can be extracted by the devlink health command:
$ sudo devlink health dump show pci/0000:00:0b.0 reporter tx
 mlx5e_txqsq: sqn: 6336
 memory:
   00 00 00 00 00 00 00 00
   01 00 00 00 00 00 00 00
   00 00 00 00 00 00 00 00
   45 f4 88 cb 09 00 00 00
   00 00 00 00 00 00 00 00
   00 00 00 00 00 00 00 00
   c0 ff ff ff 1f 00 00 00
   f8 18 1e 89 81 88 ff ff
   ...

Script output below, with struct member names and actual values:

struct  mlx5e_txqsq {
        short unsigned int         cc    0x5 ;
        unsigned int               dma_fifo_cc   0x5 ;
        struct  net_dim {
                unsigned char      state         0x1 ;
                struct  net_dim_stats {
                        int        ppms          0x0 ;
                        int        bpms          0x0 ;
                        int        epms          0x0 ;
                } prev_stats;
                struct  net_dim_sample {
                        long long int time       0x90766ef9d ;
                        unsigned int pkt_ctr     0x0 ;
                        unsigned int byte_ctr    0x0 ;
                        short unsigned int event_ctr     0x0 ;
                } start_sample;
                struct  work_struct {
                        struct   {
                                long int counter         0x1fffffffc0 ;
                        } data;
                        struct  list_head {
                                struct list_head * next          
0xffff8881b08998f8 ;
                                struct list_head * prev          
0xffff8881b08998f8 ;
                        } entry;
                        void       (*func)(struct work_struct *)         
0xffffffffa02d0e30 ;
                } work;
                unsigned char      profile_ix    0x60 ;
                unsigned char      mode          0x72 ;
                unsigned char      tune_state    0x35 ;
                unsigned char      steps_right   0xa0 ;
                unsigned char      steps_left    0xff ;
                unsigned char      tired         0xff ;
        } dim;
        short unsigned int         pc    0x0 ;
        unsigned int               dma_fifo_pc   0x0 ;
        struct  mlx5e_cq {
                struct  mlx5_cqwq {
                        struct  mlx5_frag_buf_ctrl {
                                struct mlx5_buf_list * frags     0x500000005 ;
                                unsigned int sz_m1       0x0 ;
                                short unsigned int frag_sz_m1    0x0 ;
                                short unsigned int strides_offset        0x0 ;
                                unsigned char log_sz     0x0 ;
                                unsigned char log_stride         0x0 ;
                                unsigned char log_frag_strides   0x0 ;
                        } fbc;
                        __be32 *   db    0x0 ;
                        unsigned int cc          0x0 ;
                } wq;
                short unsigned int event_ctr     0x0 ;
                struct napi_struct * napi        0x0 ;
                struct  mlx5_core_cq {
                        unsigned int cqn         0x0 ;
                        int        cqe_sz        0x0 ;
                        __be32 *   set_ci_db     0xffff8881b1aa4988 ;
                        __be32 *   arm_db        0x3f000003ff ;
                        struct mlx5_uars_page * uar      0x6060a ;
                        struct  refcount_struct {
                                struct   {
                                        int    counter   0xa1814500 ;
                                } refs;
                        } refcount;
                        struct  completion {
                                unsigned int done        0x5 ;
                                struct  wait_queue_head {
                                        struct  spinlock {
                                                union   {
                                                        struct  raw_spinlock {
                                                                struct  qspinlock {
                                                                        union   {
                                                                                struct   {
                                                                                        int                                                    counter   0x5 ;
                                                                                } val;
                                                                                struct   {
                                                                                        unsigned char                                          locked    0x5 ;
                                                                                        unsigned char                                          pending   0x0 ;
                                                                                } ;
                                                                                struct   {
                                                                                        short unsigned int                                     locked_pending    0x5 ;
                                                                                        short unsigned int                                     tail      0x0 ;
                                                                                } ;
                                                                        } ;
                                                                } raw_lock;
                                                        } rlock;
                                                } ;
                                        } lock;
                                        struct  list_head {
                                                struct list_head * next         
 0xffff8881b089bb88 ;
                                                struct list_head * prev         
 0x4000000c0a ;
                                        } head;
                                } wait;
                        } free;
                        unsigned int vector      0xa1814500 ;
                        unsigned int irqn        0xffff8881 ;
                        void       (*comp)(struct mlx5_core_cq *)        
0xffff8881a1814504 ;
                        void       (*event)(struct mlx5_core_cq *, enum 
mlx5_event)      0xffff8881a2cdea08 ;
                        unsigned int cons_index          0x1 ;
                        unsigned int arm_sn      0x0 ;
                        struct mlx5_rsc_debug * dbg      0x0 ;
                        int        pid   0x0 ;
                        struct   {
                                struct  list_head {
                                        struct list_head * next          
0xffffffff ;
                                        struct list_head * prev          
0xffffffffffffffff ;
                                } list;
                                void (*comp)(struct mlx5_core_cq *)      
0xffffffffa0356940 ;
                                void * priv      0x0 ;
                        } tasklet_ctx;
                        int        reset_notify_added    0x0 ;
                        struct  list_head {
                                struct list_head * next          
0xffffffffa0300700 ;
                                struct list_head * prev          0xd ;
                        } reset_notify;
                        struct mlx5_eq_comp * eq         0x0 ;
                        short unsigned int uid   0x9a70 ;
                } mcq;
                struct mlx5e_channel * channel   0xffff8881b0899a70 ;
                struct mlx5_core_dev * mdev      0x4800000001 ;
                struct  mlx5_wq_ctrl {
                        struct mlx5_core_dev * mdev      0xffffffffa02d5350 ;
                        struct  mlx5_frag_buf {
                                struct mlx5_buf_list * frags     
0xffffffffa02d5460 ;
                                int npages       0x0 ;
                                int size         0x5 ;
                                unsigned char page_shift         0x8 ;
                        } buf;
                        struct  mlx5_db {
                                __be32 * db      0x1c6 ;
                                union   {
                                        struct mlx5_db_pgdir * pgdir     0x0 ;
                                        struct mlx5_ib_user_db_page * user_page 
         0x0 ;
                                } u;
                                long long unsigned int dma       
0xffff8881b0899ab0 ;
                                int index        0x0 ;
                        } db;
                } wq_ctrl;
        } cq;
        struct  mlx5_wq_cyc {
                struct  mlx5_frag_buf_ctrl {
                        struct mlx5_buf_list * frags     0xffff8881a7600160 ;
                        unsigned int sz_m1       0xa7600160 ;
                        short unsigned int frag_sz_m1    0x8881 ;
                        short unsigned int strides_offset        0xffff ;
                        unsigned char log_sz     0x88 ;
                        unsigned char log_stride         0x49 ;
                        unsigned char log_frag_strides   0xaa ;
                } fbc;
                __be32 *           db    0x1000000000010 ;
                short unsigned int sz    0xc ;
                short unsigned int wqe_ctr       0x0 ;
                short unsigned int cur_sz        0x0 ;
        } wq;
        unsigned int               dma_fifo_mask         0xa1814500 ;
        struct mlx5e_sq_stats *    stats         0xffff8881a33a0348 ;
        struct   {
                struct mlx5e_sq_dma * dma_fifo   0x1a1814500 ;
                struct mlx5e_tx_wqe_info * wqe_info      0x14 ;
        } db;
        void *                     uar_map       0x0 ;
        struct netdev_queue *      txq   0x0 ;
        unsigned int               sqn   0x18c0 ;
        unsigned char              min_inline_mode       0x0 ;
        struct device *            pdev          0x0 ;
        unsigned int               mkey_be       0x0 ;
        long unsigned int          state         0x0 ;
        struct hwtstamp_config *   tstamp        0x0 ;
        struct mlx5_clock *        clock         0xffff8881b1aa6f88 ;
        struct  mlx5_wq_ctrl {
                struct mlx5_core_dev * mdev      0x3f000003ff ;
                struct  mlx5_frag_buf {
                        struct mlx5_buf_list * frags     0x6060a ;
                        int        npages        0xa1814604 ;
                        int        size          0xffff8881 ;
                        unsigned char page_shift         0x0 ;
                } buf;
                struct  mlx5_db {
                        __be32 *   db    0xfff ;
                        union   {
                                struct mlx5_db_pgdir * pgdir     0x0 ;
                                struct mlx5_ib_user_db_page * user_page         
 0x0 ;
                        } u;
                        long long unsigned int dma       0xffff888188440000 ;
                        int        index         0x8b074000 ;
                } db;
        } wq_ctrl;
        struct mlx5e_channel *     channel       0xffffc9000010d800 ;
        int                        txq_ix        0xa0020180 ;
        unsigned int               rate_limit    0xffff8881 ;
        struct  work_struct {
                struct   {
                        long int   counter       0x1000018c0 ;
                } data;
                struct  list_head {
                        struct list_head * next          0xffff8881c32b68e8 ;
                        struct list_head * prev          0x800 ;
                } entry;
                void               (*func)(struct work_struct *)         0x9 ;
        } recover_work;
} ;

Signed-off-by: Aya Levin <a...@mellanox.com>
---
 .../ethernet/mellanox/mlx5/core/en/reporter_tx.c   | 100 +++++++++++++++++++++
 1 file changed, 100 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
index 476dd97f7f2f..8a39f5525e57 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
@@ -9,6 +9,7 @@
 
 struct mlx5e_tx_err_ctx {
        int (*recover)(struct mlx5e_txqsq *sq);
+       int (*dump)(struct mlx5e_txqsq *sq); /* NOTE(review): unused in this patch */
        struct mlx5e_txqsq *sq;
 };
 
@@ -281,10 +282,109 @@ static int mlx5e_tx_reporter_diagnose(struct 
devlink_health_reporter *reporter,
        return err;
 }
 
+/*
+ * Dump the raw memory of @sq into @fmsg as a "memory" array nested in a
+ * "mlx5e_txqsq" pair. Nothing is emitted (and 0 is returned) unless
+ * MLX5E_STATE_OPENED is set in priv->state.
+ */
+static int mlx5e_tx_reporter_sw_dump_from_ctx(struct mlx5e_priv *priv,
+                                             struct mlx5e_txqsq *sq,
+                                             struct devlink_fmsg *fmsg)
+{
+       u64 *chunk = (u64 *)sq;
+       int off, len, err;
+
+       if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
+               return 0;
+
+       err = devlink_fmsg_pair_nest_start(fmsg, "mlx5e_txqsq");
+       if (err)
+               return err;
+
+       err = devlink_fmsg_obj_nest_start(fmsg);
+       if (err)
+               return err;
+
+       err = devlink_fmsg_arr_pair_nest_start(fmsg, "memory");
+       if (err)
+               return err;
+
+       /* One u64 per put; the final chunk is clipped to the struct size. */
+       for (off = 0; off < sizeof(*sq); off += len, chunk++) {
+               len = sizeof(u64);
+               if (off + len > sizeof(*sq))
+                       len = sizeof(*sq) - off;
+
+               err = devlink_fmsg_binary_put(fmsg, chunk, len);
+               if (err)
+                       return err;
+       }
+
+       err = devlink_fmsg_arr_pair_nest_end(fmsg);
+       if (err)
+               return err;
+
+       err = devlink_fmsg_obj_nest_end(fmsg);
+       if (err)
+               return err;
+
+       return devlink_fmsg_pair_nest_end(fmsg);
+}
+
+/* Dump the raw memory of every SQ as an "SQs" array, under state_lock. */
+static int mlx5e_tx_reporter_sw_dump_all(struct mlx5e_priv *priv,
+                                        struct devlink_fmsg *fmsg)
+{
+       int i, err = 0;
+
+       mutex_lock(&priv->state_lock);
+
+       if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
+               goto unlock;
+
+       err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
+       if (err)
+               goto unlock;
+
+       for (i = 0; i < priv->channels.num * priv->channels.params.num_tc;
+            i++) {
+               err = devlink_fmsg_obj_nest_start(fmsg);
+               if (err)
+                       goto unlock;
+
+               err = mlx5e_tx_reporter_sw_dump_from_ctx(priv, priv->txq2sq[i],
+                                                        fmsg);
+               if (err)
+                       goto unlock;
+
+               /* Close the per-SQ object opened above (not a pair). */
+               err = devlink_fmsg_obj_nest_end(fmsg);
+               if (err)
+                       goto unlock;
+       }
+       err = devlink_fmsg_arr_pair_nest_end(fmsg);
+
+unlock:
+       mutex_unlock(&priv->state_lock);
+       return err;
+}
+
+static int mlx5e_tx_reporter_sw_dump(struct devlink_health_reporter *reporter,
+                                    struct devlink_fmsg *fmsg, void *context)
+{
+       struct mlx5e_tx_err_ctx *err_ctx = context;
+       struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
+
+       if (!err_ctx) /* no error context supplied: dump every SQ */
+               return mlx5e_tx_reporter_sw_dump_all(priv, fmsg);
+       return mlx5e_tx_reporter_sw_dump_from_ctx(priv, err_ctx->sq, fmsg);
+}
+
 static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
                 .name = "tx",
                 .recover = mlx5e_tx_reporter_recover,
                 .diagnose = mlx5e_tx_reporter_diagnose,
+               .dump = mlx5e_tx_reporter_sw_dump, /* raw SQ memory dump */
 };
 
 #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
-- 
2.14.1

Reply via email to