On 5/14/2019 3:07 PM, Jiri Pirko wrote:
> Sun, May 12, 2019 at 10:37:35AM CEST, a...@mellanox.com wrote:
>>
>>
>> On 5/9/2019 11:23 AM, Jiri Pirko wrote:
>>> Tue, May 07, 2019 at 02:58:32PM CEST, a...@mellanox.com wrote:
>>>>
>>>>
>>>> On 5/7/2019 3:41 PM, Jiri Pirko wrote:
>>>>> Mon, Apr 29, 2019 at 04:17:39PM CEST, a...@mellanox.com wrote:
>>>>>> TX reporter reports an error on two scenarios:
>>>>>> - TX timeout on a specific tx queue
>>>>>> - TX completion error on a specific send queue
>>>>>> Prior to this patch, no dump data was supported by the tx reporter. This
>>>>>> patch adds support for SW data dump of the related SQ context. The dump
>>>>>> is simply the SQ's raw memory snapshot taken right after the error was
>>>>>> reported, before any recovery procedure was launched. With this
>>>>>> approach, no maintenance is needed as the driver fetches the actual data
>>>>>> according to the layout with which the SQ was compiled.  By providing
>>>>>> a SW context, one can easily debug errors on a given SQ.
>>>>>>
>>>>>> In order to offline translate the raw memory into a human readable
>>>>>> format, the user can use an out-of-kernel script which receives as
>>>>>> input the following:
>>>>>> - Object raw memory
>>>>>> - Driver object compiled with debug info (can be taken/generated at any 
>>>>>> time from the machine)
>>>>>> - Object name
>>>>>>
>>>>>> An example of such script output can be seen below.
>>>>>> Note: the script is not offered as part of this patch as it does not
>>>>>> belong to the kernel; I just described it in order to convey the general
>>>>>> idea of how/what can be fetched from SW dump via devlink health.
>>>>>>
>>>>>> The output of the SW dump can be extracted by devlink health command:
>>>>>> $ sudo devlink health dump show pci/0000:00:0b.0 reporter tx.
>>>>>> mlx5e_txqsq: sqn: 6336
>>>>>> memory:
>>>>>>      00 00 00 00 00 00 00 00
>>>>>>      01 00 00 00 00 00 00 00
>>>>>>      00 00 00 00 00 00 00 00
>>>>>>      45 f4 88 cb 09 00 00 00
>>>>>>      00 00 00 00 00 00 00 00
>>>>>>      00 00 00 00 00 00 00 00
>>>>>>      c0 ff ff ff 1f 00 00 00
>>>>>>      f8 18 1e 89 81 88 ff ff
>>>>>>      ...
>>>>>>
>>>>>> script output below, with struct members names and actual values:
>>>>>>
>>>>>> struct  mlx5e_txqsq {
>>>>>>  short unsigned int         cc    0x5 ;
>>>>>>  unsigned int               dma_fifo_cc   0x5 ;
>>>>>>  struct  net_dim {
>>>>>>          unsigned char      state         0x1 ;
>>>>>>          struct  net_dim_stats {
>>>>>>                  int        ppms          0x0 ;
>>>>>>                  int        bpms          0x0 ;
>>>>>>                  int        epms          0x0 ;
>>>>>>          } prev_stats;
>>>>>>          struct  net_dim_sample {
>>>>>>                  long long int time       0x90766ef9d ;
>>>>>>                  unsigned int pkt_ctr     0x0 ;
>>>>>>                  unsigned int byte_ctr    0x0 ;
>>>>>>                  short unsigned int event_ctr     0x0 ;
>>>>>>          } start_sample;
>>>>>>          struct  work_struct {
>>>>>>                  struct   {
>>>>>>                          long int counter         0x1fffffffc0 ;
>>>>>>                  } data;
>>>>>>                  struct  list_head {
>>>>>>                          struct list_head * next          
>>>>>> 0xffff8881b08998f8 ;
>>>>>>                          struct list_head * prev          
>>>>>> 0xffff8881b08998f8 ;
>>>>>>                  } entry;
>>>>>>                  void       (*func)(struct work_struct *)         
>>>>>> 0xffffffffa02d0e30 ;
>>>>>>          } work;
>>>>>>          unsigned char      profile_ix    0x60 ;
>>>>>>          unsigned char      mode          0x72 ;
>>>>>>          unsigned char      tune_state    0x35 ;
>>>>>>          unsigned char      steps_right   0xa0 ;
>>>>>>          unsigned char      steps_left    0xff ;
>>>>>>          unsigned char      tired         0xff ;
>>>>>>  } dim;
>>>>>>  short unsigned int         pc    0x0 ;
>>>>>>  unsigned int               dma_fifo_pc   0x0 ;
>>>>>>  struct  mlx5e_cq {
>>>>>>          struct  mlx5_cqwq {
>>>>>>                  struct  mlx5_frag_buf_ctrl {
>>>>>>                          struct mlx5_buf_list * frags     0x500000005 ;
>>>>>>                          unsigned int sz_m1       0x0 ;
>>>>>>                          short unsigned int frag_sz_m1    0x0 ;
>>>>>>                          short unsigned int strides_offset        0x0 ;
>>>>>>                          unsigned char log_sz     0x0 ;
>>>>>>                          unsigned char log_stride         0x0 ;
>>>>>>                          unsigned char log_frag_strides   0x0 ;
>>>>>>                  } fbc;
>>>>>>                  __be32 *   db    0x0 ;
>>>>>>                  unsigned int cc          0x0 ;
>>>>>>          } wq;
>>>>>>          short unsigned int event_ctr     0x0 ;
>>>>>>          struct napi_struct * napi        0x0 ;
>>>>>>          struct  mlx5_core_cq {
>>>>>>                  unsigned int cqn         0x0 ;
>>>>>>                  int        cqe_sz        0x0 ;
>>>>>>                  __be32 *   set_ci_db     0xffff8881b1aa4988 ;
>>>>>>                  __be32 *   arm_db        0x3f000003ff ;
>>>>>>                  struct mlx5_uars_page * uar      0x6060a ;
>>>>>>                  struct  refcount_struct {
>>>>>>                          struct   {
>>>>>>                                  int    counter   0xa1814500 ;
>>>>>>                          } refs;
>>>>>>                  } refcount;
>>>>>>                  struct  completion {
>>>>>>                          unsigned int done        0x5 ;
>>>>>>                          struct  wait_queue_head {
>>>>>>                                  struct  spinlock {
>>>>>>                                          union   {
>>>>>>                                                  struct  raw_spinlock {
>>>>>>                                                          struct  
>>>>>> qspinlock {
>>>>>>                                                                  union   
>>>>>> {
>>>>>>                                                                          
>>>>>> struct   {
>>>>>>                                                                          
>>>>>>         int                                                    counter   
>>>>>> 0x5 ;
>>>>>>                                                                          
>>>>>> } val;
>>>>>>                                                                          
>>>>>> struct   {
>>>>>>                                                                          
>>>>>>         unsigned char                                          locked    
>>>>>> 0x5 ;
>>>>>>                                                                          
>>>>>>         unsigned char                                          pending   
>>>>>> 0x0 ;
>>>>>>                                                                          
>>>>>> } ;
>>>>>>                                                                          
>>>>>> struct   {
>>>>>>                                                                          
>>>>>>         short unsigned int                                     
>>>>>> locked_pending    0x5 ;
>>>>>>                                                                          
>>>>>>         short unsigned int                                     tail      
>>>>>> 0x0 ;
>>>>>>                                                                          
>>>>>> } ;
>>>>>>                                                                  } ;
>>>>>>                                                          } raw_lock;
>>>>>>                                                  } rlock;
>>>>>>                                          } ;
>>>>>>                                  } lock;
>>>>>>                                  struct  list_head {
>>>>>>                                          struct list_head * next         
>>>>>>  0xffff8881b089bb88 ;
>>>>>>                                          struct list_head * prev         
>>>>>>  0x4000000c0a ;
>>>>>>                                  } head;
>>>>>>                          } wait;
>>>>>>                  } free;
>>>>>>                  unsigned int vector      0xa1814500 ;
>>>>>>                  unsigned int irqn        0xffff8881 ;
>>>>>>                  void       (*comp)(struct mlx5_core_cq *)        
>>>>>> 0xffff8881a1814504 ;
>>>>>>                  void       (*event)(struct mlx5_core_cq *, enum 
>>>>>> mlx5_event)      0xffff8881a2cdea08 ;
>>>>>>                  unsigned int cons_index          0x1 ;
>>>>>>                  unsigned int arm_sn      0x0 ;
>>>>>>                  struct mlx5_rsc_debug * dbg      0x0 ;
>>>>>>                  int        pid   0x0 ;
>>>>>>                  struct   {
>>>>>>                          struct  list_head {
>>>>>>                                  struct list_head * next          
>>>>>> 0xffffffff ;
>>>>>>                                  struct list_head * prev          
>>>>>> 0xffffffffffffffff ;
>>>>>>                          } list;
>>>>>>                          void (*comp)(struct mlx5_core_cq *)      
>>>>>> 0xffffffffa0356940 ;
>>>>>>                          void * priv      0x0 ;
>>>>>>                  } tasklet_ctx;
>>>>>>                  int        reset_notify_added    0x0 ;
>>>>>>                  struct  list_head {
>>>>>>                          struct list_head * next          
>>>>>> 0xffffffffa0300700 ;
>>>>>>                          struct list_head * prev          0xd ;
>>>>>>                  } reset_notify;
>>>>>>                  struct mlx5_eq_comp * eq         0x0 ;
>>>>>>                  short unsigned int uid   0x9a70 ;
>>>>>>          } mcq;
>>>>>>          struct mlx5e_channel * channel   0xffff8881b0899a70 ;
>>>>>>          struct mlx5_core_dev * mdev      0x4800000001 ;
>>>>>>          struct  mlx5_wq_ctrl {
>>>>>>                  struct mlx5_core_dev * mdev      0xffffffffa02d5350 ;
>>>>>>                  struct  mlx5_frag_buf {
>>>>>>                          struct mlx5_buf_list * frags     
>>>>>> 0xffffffffa02d5460 ;
>>>>>>                          int npages       0x0 ;
>>>>>>                          int size         0x5 ;
>>>>>>                          unsigned char page_shift         0x8 ;
>>>>>>                  } buf;
>>>>>>                  struct  mlx5_db {
>>>>>>                          __be32 * db      0x1c6 ;
>>>>>>                          union   {
>>>>>>                                  struct mlx5_db_pgdir * pgdir     0x0 ;
>>>>>>                                  struct mlx5_ib_user_db_page * user_page 
>>>>>>          0x0 ;
>>>>>>                          } u;
>>>>>>                          long long unsigned int dma       
>>>>>> 0xffff8881b0899ab0 ;
>>>>>>                          int index        0x0 ;
>>>>>>                  } db;
>>>>>>          } wq_ctrl;
>>>>>>  } cq;
>>>>>>  struct  mlx5_wq_cyc {
>>>>>>          struct  mlx5_frag_buf_ctrl {
>>>>>>                  struct mlx5_buf_list * frags     0xffff8881a7600160 ;
>>>>>>                  unsigned int sz_m1       0xa7600160 ;
>>>>>>                  short unsigned int frag_sz_m1    0x8881 ;
>>>>>>                  short unsigned int strides_offset        0xffff ;
>>>>>>                  unsigned char log_sz     0x88 ;
>>>>>>                  unsigned char log_stride         0x49 ;
>>>>>>                  unsigned char log_frag_strides   0xaa ;
>>>>>>          } fbc;
>>>>>>          __be32 *           db    0x1000000000010 ;
>>>>>>          short unsigned int sz    0xc ;
>>>>>>          short unsigned int wqe_ctr       0x0 ;
>>>>>>          short unsigned int cur_sz        0x0 ;
>>>>>>  } wq;
>>>>>>  unsigned int               dma_fifo_mask         0xa1814500 ;
>>>>>>  struct mlx5e_sq_stats *    stats         0xffff8881a33a0348 ;
>>>>>>  struct   {
>>>>>>          struct mlx5e_sq_dma * dma_fifo   0x1a1814500 ;
>>>>>>          struct mlx5e_tx_wqe_info * wqe_info      0x14 ;
>>>>>>  } db;
>>>>>>  void *                     uar_map       0x0 ;
>>>>>>  struct netdev_queue *      txq   0x0 ;
>>>>>>  unsigned int               sqn   0x18c0 ;
>>>>>>  unsigned char              min_inline_mode       0x0 ;
>>>>>>  struct device *            pdev          0x0 ;
>>>>>>  unsigned int               mkey_be       0x0 ;
>>>>>>  long unsigned int          state         0x0 ;
>>>>>>  struct hwtstamp_config *   tstamp        0x0 ;
>>>>>>  struct mlx5_clock *        clock         0xffff8881b1aa6f88 ;
>>>>>>  struct  mlx5_wq_ctrl {
>>>>>>          struct mlx5_core_dev * mdev      0x3f000003ff ;
>>>>>>          struct  mlx5_frag_buf {
>>>>>>                  struct mlx5_buf_list * frags     0x6060a ;
>>>>>>                  int        npages        0xa1814604 ;
>>>>>>                  int        size          0xffff8881 ;
>>>>>>                  unsigned char page_shift         0x0 ;
>>>>>>          } buf;
>>>>>>          struct  mlx5_db {
>>>>>>                  __be32 *   db    0xfff ;
>>>>>>                  union   {
>>>>>>                          struct mlx5_db_pgdir * pgdir     0x0 ;
>>>>>>                          struct mlx5_ib_user_db_page * user_page         
>>>>>>  0x0 ;
>>>>>>                  } u;
>>>>>>                  long long unsigned int dma       0xffff888188440000 ;
>>>>>>                  int        index         0x8b074000 ;
>>>>>>          } db;
>>>>>>  } wq_ctrl;
>>>>>>  struct mlx5e_channel *     channel       0xffffc9000010d800 ;
>>>>>>  int                        txq_ix        0xa0020180 ;
>>>>>>  unsigned int               rate_limit    0xffff8881 ;
>>>>>>  struct  work_struct {
>>>>>>          struct   {
>>>>>>                  long int   counter       0x1000018c0 ;
>>>>>>          } data;
>>>>>>          struct  list_head {
>>>>>>                  struct list_head * next          0xffff8881c32b68e8 ;
>>>>>>                  struct list_head * prev          0x800 ;
>>>>>>          } entry;
>>>>>>          void               (*func)(struct work_struct *)         0x9 ;
>>>>>>  } recover_work;
>>>>>> } ;
>>>>>
>>>>> I don't get it. You are dumping live kernel memory? There are already
>>>>> facilities to do that in place. Why to replicate it?
>>>> I am dumping the driver's memory under a lock so I can ensure its
>>>> consistency (as opposed to /dev/mem)
>>>> vmcore cannot be taken from a live kernel (without crashing).
>>>> I need the memory's snapshot right after the error from the driver's
>>>> context.
>>>
>>> Got it. However, this sounds like a generic problem not specific to
>>> nic drivers. How other subsystems resolve this (if they do at all)?
>>>
>>>
>> Correct, this is a suggested debugging solution for a generic problem:
>> enabling the user of a run time memory snapshot for kernel modules (at a
>> given error event). My research shows that other subsystems deal with
>> errors either by panicking (too much) or by debug/log prints (too little).
>> This solution is (a) low in maintenance (b) consistent in memory (c) has
>> a small performance impact (d) uses an existing infrastructure between the
>> kernel module and the user space.
> 
> I'm still not convinced that dumping kernel memory over devlink health dump
> is a good idea :/
> 
> 
>> It might be ported to other subsystems using their own user-space vs.
>> kernel tools. Regardless of how the memory output was generated to the
>> user, the parsing script can work on it.
> 
> Could you share the script? How is it going to be distributed?
I thought that the script should be available on the Mellanox website. 
The script is still pending review but I will be happy to share it when 
it's ready.
> 
> 
>>
>>>
>>>> Which other tools do you mean?
>>>>>
>>>>>
>>>>>>
>>>>>> Signed-off-by: Aya Levin <a...@mellanox.com>
>>>>>> ---
>>>>>> .../ethernet/mellanox/mlx5/core/en/reporter_tx.c   | 100 
>>>>>> +++++++++++++++++++++
>>>>>> 1 file changed, 100 insertions(+)
>>>>>>
>>>>>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c 
>>>>>> b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>>>> index 476dd97f7f2f..8a39f5525e57 100644
>>>>>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>>>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>>>> @@ -9,6 +9,7 @@
>>>>>>
>>>>>> struct mlx5e_tx_err_ctx {
>>>>>>  int (*recover)(struct mlx5e_txqsq *sq);
>>>>>> +        int (*dump)(struct mlx5e_txqsq *sq);
>>>>>>  struct mlx5e_txqsq *sq;
>>>>>> };
>>>>>>
>>>>>> @@ -281,10 +282,109 @@ static int mlx5e_tx_reporter_diagnose(struct 
>>>>>> devlink_health_reporter *reporter,
>>>>>>  return err;
>>>>>> }
>>>>>>
>>>>>> +static int mlx5e_tx_reporter_sw_dump_from_ctx(struct mlx5e_priv *priv,
>>>>>> +                                              struct mlx5e_txqsq *sq,
>>>>>> +                                              struct devlink_fmsg *fmsg)
>>>>>> +{
>>>>>> +        u64 *ptr = (u64 *)sq;
>>>>>> +        int copy, err;
>>>>>> +        int i = 0;
>>>>>> +
>>>>>> +        if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>>>>>> +                return 0;
>>>>>> +
>>>>>> +        err = devlink_fmsg_pair_nest_start(fmsg, "mlx5e_txqsq");
>>>>>> +        if (err)
>>>>>> +                return err;
>>>>>> +
>>>>>> +        err = devlink_fmsg_obj_nest_start(fmsg);
>>>>>> +        if (err)
>>>>>> +                return err;
>>>>>> +
>>>>>> +        err = devlink_fmsg_arr_pair_nest_start(fmsg, "memory");
>>>>>> +        if (err)
>>>>>> +                return err;
>>>>>> +
>>>>>> +        while (i < sizeof(struct mlx5e_txqsq)) {
>>>>>> +                copy = sizeof(u64);
>>>>>> +
>>>>>> +                if (i + copy > sizeof(struct mlx5e_txqsq))
>>>>>> +                        copy = sizeof(struct mlx5e_txqsq) - i;
>>>>>> +
>>>>>> +                err = devlink_fmsg_binary_put(fmsg, ptr, copy);
>>>>>> +                if (err)
>>>>>> +                        return err;
>>>>>> +                ptr++;
>>>>>> +                i += copy;
>>>>>> +        }
>>>>>> +
>>>>>> +        err = devlink_fmsg_arr_pair_nest_end(fmsg);
>>>>>> +        if (err)
>>>>>> +                return err;
>>>>>> +
>>>>>> +        err = devlink_fmsg_obj_nest_end(fmsg);
>>>>>> +        if (err)
>>>>>> +                return err;
>>>>>> +
>>>>>> +        err = devlink_fmsg_pair_nest_end(fmsg);
>>>>>> +
>>>>>> +        return err;
>>>>>> +}
>>>>>> +
>>>>>> +static int mlx5e_tx_reporter_sw_dump_all(struct mlx5e_priv *priv,
>>>>>> +                                         struct devlink_fmsg *fmsg)
>>>>>> +{
>>>>>> +        int i, err = 0;
>>>>>> +
>>>>>> +        mutex_lock(&priv->state_lock);
>>>>>> +
>>>>>> +        if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>>>>>> +                goto unlock;
>>>>>> +
>>>>>> +        err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
>>>>>> +        if (err)
>>>>>> +                goto unlock;
>>>>>> +
>>>>>> +        for (i = 0; i < priv->channels.num * 
>>>>>> priv->channels.params.num_tc;
>>>>>> +             i++) {
>>>>>> +                err = devlink_fmsg_obj_nest_start(fmsg);
>>>>>> +                if (err)
>>>>>> +                        goto unlock;
>>>>>> +
>>>>>> +                err = mlx5e_tx_reporter_sw_dump_from_ctx(priv, 
>>>>>> priv->txq2sq[i],
>>>>>> +                                                         fmsg);
>>>>>> +                if (err)
>>>>>> +                        goto unlock;
>>>>>> +
>>>>>> +                err = devlink_fmsg_pair_nest_end(fmsg);
>>>>>> +                if (err)
>>>>>> +                        goto unlock;
>>>>>> +        }
>>>>>> +        err = devlink_fmsg_arr_pair_nest_end(fmsg);
>>>>>> +        if (err)
>>>>>> +                goto unlock;
>>>>>> +
>>>>>> +unlock:
>>>>>> +        mutex_unlock(&priv->state_lock);
>>>>>> +        return err;
>>>>>> +}
>>>>>> +
>>>>>> +static int mlx5e_tx_reporter_sw_dump(struct devlink_health_reporter 
>>>>>> *reporter,
>>>>>> +                                     struct devlink_fmsg *fmsg, void 
>>>>>> *context)
>>>>>> +{
>>>>>> +        struct mlx5e_priv *priv = 
>>>>>> devlink_health_reporter_priv(reporter);
>>>>>> +        struct mlx5e_tx_err_ctx *err_ctx = context;
>>>>>> +
>>>>>> +        return err_ctx ? mlx5e_tx_reporter_sw_dump_from_ctx(priv, 
>>>>>> err_ctx->sq,
>>>>>> +                                                            fmsg) :
>>>>>> +                         mlx5e_tx_reporter_sw_dump_all(priv, fmsg);
>>>>>> +}
>>>>>> +
>>>>>> static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
>>>>>>          .name = "tx",
>>>>>>          .recover = mlx5e_tx_reporter_recover,
>>>>>>          .diagnose = mlx5e_tx_reporter_diagnose,
>>>>>> +                .dump = mlx5e_tx_reporter_sw_dump,
>>>>>> };
>>>>>>
>>>>>> #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
>>>>>> -- 
>>>>>> 2.14.1
>>>>>>

Reply via email to