Sun, May 12, 2019 at 10:37:35AM CEST, a...@mellanox.com wrote:
>
>
>On 5/9/2019 11:23 AM, Jiri Pirko wrote:
>> Tue, May 07, 2019 at 02:58:32PM CEST, a...@mellanox.com wrote:
>>>
>>>
>>> On 5/7/2019 3:41 PM, Jiri Pirko wrote:
>>>> Mon, Apr 29, 2019 at 04:17:39PM CEST, a...@mellanox.com wrote:
>>>>> TX reporter reports an error on two scenarios:
>>>>> - TX timeout on a specific tx queue
>>>>> - TX completion error on a specific send queue
>>>>> Prior to this patch, no dump data was supported by the tx reporter. This
>>>>> patch adds support for SW data dump of the related SQ context. The dump
>>>>> is simply the SQ's raw memory snapshot taken right after the error was
>>>>> reported, before any recovery procedure was launched. With this
>>>>> approach, no maintenance is needed as the driver fetches the actual data
>>>>> according to the layout with which the SQ was compiled.  By providing
>>>>> a SW context, one can easily debug an error on a given SQ.
>>>>>
>>>>> In order to offline translate the raw memory into a human readable
>>>>> format, the user can use some out-of-kernel scripts which receive as
>>>>> input the following:
>>>>> - Object raw memory
>>>>> - Driver object compiled with debug info (can be taken/generated at any 
>>>>> time from the machine)
>>>>> - Object name
>>>>>
>>>>> An example of such script output can be seen below.
>>>>> Note: the script is not offered as part of this patch as it does not
>>>>> belong to the kernel; I just described it in order to convey the general
>>>>> idea of how/what can be fetched from SW dump via devlink health.
>>>>>
>>>>> The output of the SW dump can be extracted by devlink health command:
>>>>> $ sudo devlink health dump show pci/0000:00:0b.0 reporter tx.
>>>>> mlx5e_txqsq: sqn: 6336
>>>>> memory:
>>>>>     00 00 00 00 00 00 00 00
>>>>>     01 00 00 00 00 00 00 00
>>>>>     00 00 00 00 00 00 00 00
>>>>>     45 f4 88 cb 09 00 00 00
>>>>>     00 00 00 00 00 00 00 00
>>>>>     00 00 00 00 00 00 00 00
>>>>>     c0 ff ff ff 1f 00 00 00
>>>>>     f8 18 1e 89 81 88 ff ff
>>>>>     ...
>>>>>
>>>>> script output below, with struct members names and actual values:
>>>>>
>>>>> struct  mlx5e_txqsq {
>>>>>   short unsigned int         cc    0x5 ;
>>>>>   unsigned int               dma_fifo_cc   0x5 ;
>>>>>   struct  net_dim {
>>>>>           unsigned char      state         0x1 ;
>>>>>           struct  net_dim_stats {
>>>>>                   int        ppms          0x0 ;
>>>>>                   int        bpms          0x0 ;
>>>>>                   int        epms          0x0 ;
>>>>>           } prev_stats;
>>>>>           struct  net_dim_sample {
>>>>>                   long long int time       0x90766ef9d ;
>>>>>                   unsigned int pkt_ctr     0x0 ;
>>>>>                   unsigned int byte_ctr    0x0 ;
>>>>>                   short unsigned int event_ctr     0x0 ;
>>>>>           } start_sample;
>>>>>           struct  work_struct {
>>>>>                   struct   {
>>>>>                           long int counter         0x1fffffffc0 ;
>>>>>                   } data;
>>>>>                   struct  list_head {
>>>>>                           struct list_head * next          
>>>>> 0xffff8881b08998f8 ;
>>>>>                           struct list_head * prev          
>>>>> 0xffff8881b08998f8 ;
>>>>>                   } entry;
>>>>>                   void       (*func)(struct work_struct *)         
>>>>> 0xffffffffa02d0e30 ;
>>>>>           } work;
>>>>>           unsigned char      profile_ix    0x60 ;
>>>>>           unsigned char      mode          0x72 ;
>>>>>           unsigned char      tune_state    0x35 ;
>>>>>           unsigned char      steps_right   0xa0 ;
>>>>>           unsigned char      steps_left    0xff ;
>>>>>           unsigned char      tired         0xff ;
>>>>>   } dim;
>>>>>   short unsigned int         pc    0x0 ;
>>>>>   unsigned int               dma_fifo_pc   0x0 ;
>>>>>   struct  mlx5e_cq {
>>>>>           struct  mlx5_cqwq {
>>>>>                   struct  mlx5_frag_buf_ctrl {
>>>>>                           struct mlx5_buf_list * frags     0x500000005 ;
>>>>>                           unsigned int sz_m1       0x0 ;
>>>>>                           short unsigned int frag_sz_m1    0x0 ;
>>>>>                           short unsigned int strides_offset        0x0 ;
>>>>>                           unsigned char log_sz     0x0 ;
>>>>>                           unsigned char log_stride         0x0 ;
>>>>>                           unsigned char log_frag_strides   0x0 ;
>>>>>                   } fbc;
>>>>>                   __be32 *   db    0x0 ;
>>>>>                   unsigned int cc          0x0 ;
>>>>>           } wq;
>>>>>           short unsigned int event_ctr     0x0 ;
>>>>>           struct napi_struct * napi        0x0 ;
>>>>>           struct  mlx5_core_cq {
>>>>>                   unsigned int cqn         0x0 ;
>>>>>                   int        cqe_sz        0x0 ;
>>>>>                   __be32 *   set_ci_db     0xffff8881b1aa4988 ;
>>>>>                   __be32 *   arm_db        0x3f000003ff ;
>>>>>                   struct mlx5_uars_page * uar      0x6060a ;
>>>>>                   struct  refcount_struct {
>>>>>                           struct   {
>>>>>                                   int    counter   0xa1814500 ;
>>>>>                           } refs;
>>>>>                   } refcount;
>>>>>                   struct  completion {
>>>>>                           unsigned int done        0x5 ;
>>>>>                           struct  wait_queue_head {
>>>>>                                   struct  spinlock {
>>>>>                                           union   {
>>>>>                                                   struct  raw_spinlock {
>>>>>                                                           struct  
>>>>> qspinlock {
>>>>>                                                                   union   
>>>>> {
>>>>>                                                                           
>>>>> struct   {
>>>>>                                                                           
>>>>>         int                                                    counter   
>>>>> 0x5 ;
>>>>>                                                                           
>>>>> } val;
>>>>>                                                                           
>>>>> struct   {
>>>>>                                                                           
>>>>>         unsigned char                                          locked    
>>>>> 0x5 ;
>>>>>                                                                           
>>>>>         unsigned char                                          pending   
>>>>> 0x0 ;
>>>>>                                                                           
>>>>> } ;
>>>>>                                                                           
>>>>> struct   {
>>>>>                                                                           
>>>>>         short unsigned int                                     
>>>>> locked_pending    0x5 ;
>>>>>                                                                           
>>>>>         short unsigned int                                     tail      
>>>>> 0x0 ;
>>>>>                                                                           
>>>>> } ;
>>>>>                                                                   } ;
>>>>>                                                           } raw_lock;
>>>>>                                                   } rlock;
>>>>>                                           } ;
>>>>>                                   } lock;
>>>>>                                   struct  list_head {
>>>>>                                           struct list_head * next         
>>>>>  0xffff8881b089bb88 ;
>>>>>                                           struct list_head * prev         
>>>>>  0x4000000c0a ;
>>>>>                                   } head;
>>>>>                           } wait;
>>>>>                   } free;
>>>>>                   unsigned int vector      0xa1814500 ;
>>>>>                   unsigned int irqn        0xffff8881 ;
>>>>>                   void       (*comp)(struct mlx5_core_cq *)        
>>>>> 0xffff8881a1814504 ;
>>>>>                   void       (*event)(struct mlx5_core_cq *, enum 
>>>>> mlx5_event)      0xffff8881a2cdea08 ;
>>>>>                   unsigned int cons_index          0x1 ;
>>>>>                   unsigned int arm_sn      0x0 ;
>>>>>                   struct mlx5_rsc_debug * dbg      0x0 ;
>>>>>                   int        pid   0x0 ;
>>>>>                   struct   {
>>>>>                           struct  list_head {
>>>>>                                   struct list_head * next          
>>>>> 0xffffffff ;
>>>>>                                   struct list_head * prev          
>>>>> 0xffffffffffffffff ;
>>>>>                           } list;
>>>>>                           void (*comp)(struct mlx5_core_cq *)      
>>>>> 0xffffffffa0356940 ;
>>>>>                           void * priv      0x0 ;
>>>>>                   } tasklet_ctx;
>>>>>                   int        reset_notify_added    0x0 ;
>>>>>                   struct  list_head {
>>>>>                           struct list_head * next          
>>>>> 0xffffffffa0300700 ;
>>>>>                           struct list_head * prev          0xd ;
>>>>>                   } reset_notify;
>>>>>                   struct mlx5_eq_comp * eq         0x0 ;
>>>>>                   short unsigned int uid   0x9a70 ;
>>>>>           } mcq;
>>>>>           struct mlx5e_channel * channel   0xffff8881b0899a70 ;
>>>>>           struct mlx5_core_dev * mdev      0x4800000001 ;
>>>>>           struct  mlx5_wq_ctrl {
>>>>>                   struct mlx5_core_dev * mdev      0xffffffffa02d5350 ;
>>>>>                   struct  mlx5_frag_buf {
>>>>>                           struct mlx5_buf_list * frags     
>>>>> 0xffffffffa02d5460 ;
>>>>>                           int npages       0x0 ;
>>>>>                           int size         0x5 ;
>>>>>                           unsigned char page_shift         0x8 ;
>>>>>                   } buf;
>>>>>                   struct  mlx5_db {
>>>>>                           __be32 * db      0x1c6 ;
>>>>>                           union   {
>>>>>                                   struct mlx5_db_pgdir * pgdir     0x0 ;
>>>>>                                   struct mlx5_ib_user_db_page * user_page 
>>>>>          0x0 ;
>>>>>                           } u;
>>>>>                           long long unsigned int dma       
>>>>> 0xffff8881b0899ab0 ;
>>>>>                           int index        0x0 ;
>>>>>                   } db;
>>>>>           } wq_ctrl;
>>>>>   } cq;
>>>>>   struct  mlx5_wq_cyc {
>>>>>           struct  mlx5_frag_buf_ctrl {
>>>>>                   struct mlx5_buf_list * frags     0xffff8881a7600160 ;
>>>>>                   unsigned int sz_m1       0xa7600160 ;
>>>>>                   short unsigned int frag_sz_m1    0x8881 ;
>>>>>                   short unsigned int strides_offset        0xffff ;
>>>>>                   unsigned char log_sz     0x88 ;
>>>>>                   unsigned char log_stride         0x49 ;
>>>>>                   unsigned char log_frag_strides   0xaa ;
>>>>>           } fbc;
>>>>>           __be32 *           db    0x1000000000010 ;
>>>>>           short unsigned int sz    0xc ;
>>>>>           short unsigned int wqe_ctr       0x0 ;
>>>>>           short unsigned int cur_sz        0x0 ;
>>>>>   } wq;
>>>>>   unsigned int               dma_fifo_mask         0xa1814500 ;
>>>>>   struct mlx5e_sq_stats *    stats         0xffff8881a33a0348 ;
>>>>>   struct   {
>>>>>           struct mlx5e_sq_dma * dma_fifo   0x1a1814500 ;
>>>>>           struct mlx5e_tx_wqe_info * wqe_info      0x14 ;
>>>>>   } db;
>>>>>   void *                     uar_map       0x0 ;
>>>>>   struct netdev_queue *      txq   0x0 ;
>>>>>   unsigned int               sqn   0x18c0 ;
>>>>>   unsigned char              min_inline_mode       0x0 ;
>>>>>   struct device *            pdev          0x0 ;
>>>>>   unsigned int               mkey_be       0x0 ;
>>>>>   long unsigned int          state         0x0 ;
>>>>>   struct hwtstamp_config *   tstamp        0x0 ;
>>>>>   struct mlx5_clock *        clock         0xffff8881b1aa6f88 ;
>>>>>   struct  mlx5_wq_ctrl {
>>>>>           struct mlx5_core_dev * mdev      0x3f000003ff ;
>>>>>           struct  mlx5_frag_buf {
>>>>>                   struct mlx5_buf_list * frags     0x6060a ;
>>>>>                   int        npages        0xa1814604 ;
>>>>>                   int        size          0xffff8881 ;
>>>>>                   unsigned char page_shift         0x0 ;
>>>>>           } buf;
>>>>>           struct  mlx5_db {
>>>>>                   __be32 *   db    0xfff ;
>>>>>                   union   {
>>>>>                           struct mlx5_db_pgdir * pgdir     0x0 ;
>>>>>                           struct mlx5_ib_user_db_page * user_page         
>>>>>  0x0 ;
>>>>>                   } u;
>>>>>                   long long unsigned int dma       0xffff888188440000 ;
>>>>>                   int        index         0x8b074000 ;
>>>>>           } db;
>>>>>   } wq_ctrl;
>>>>>   struct mlx5e_channel *     channel       0xffffc9000010d800 ;
>>>>>   int                        txq_ix        0xa0020180 ;
>>>>>   unsigned int               rate_limit    0xffff8881 ;
>>>>>   struct  work_struct {
>>>>>           struct   {
>>>>>                   long int   counter       0x1000018c0 ;
>>>>>           } data;
>>>>>           struct  list_head {
>>>>>                   struct list_head * next          0xffff8881c32b68e8 ;
>>>>>                   struct list_head * prev          0x800 ;
>>>>>           } entry;
>>>>>           void               (*func)(struct work_struct *)         0x9 ;
>>>>>   } recover_work;
>>>>> } ;
>>>>
>>>> I don't get it. You are dumping live kernel memory? There are already
>>>> facilities to do that in place. Why replicate it?
>>> I am dumping the driver's memory under a lock so I can ensure its
>>> consistency (as opposed to /dev/mem)
>>> vmcore cannot be taken from a live kernel (without crashing).
>>> I need the memory's snapshot right after the error from the driver's
>>> context.
>> 
>> Got it. However, this sounds like a generic problem not specific to
>> nic drivers. How do other subsystems resolve this (if they do at all)?
>> 
>> 
>Correct, this is a suggested debugging solution for a generic problem: 
>enabling the user of a run time memory snapshot for kernel modules (at a 
>given error event). My research shows that other subsystems deal with 
>errors either by panicking (too much) or by debug/log prints (too little).
>This solution (a) is low in maintenance (b) is consistent in memory (c) has 
>a small performance impact (d) uses an existing infrastructure between the 
>kernel module and the user space.

I'm still not convinced that dumping kernel memory over devlink health dump
is a good idea :/


>It might be ported to other subsystems using their own user-space vs. 
>kernel tools. Regardless of how the memory output was generated to the 
>user, the parsing script can work on it.

Could you share the script? How is it going to be distributed?


>
>> 
>>> Which other tools do you mean?
>>>>
>>>>
>>>>>
>>>>> Signed-off-by: Aya Levin <a...@mellanox.com>
>>>>> ---
>>>>> .../ethernet/mellanox/mlx5/core/en/reporter_tx.c   | 100 
>>>>> +++++++++++++++++++++
>>>>> 1 file changed, 100 insertions(+)
>>>>>
>>>>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c 
>>>>> b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>>> index 476dd97f7f2f..8a39f5525e57 100644
>>>>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>>> @@ -9,6 +9,7 @@
>>>>>
>>>>> struct mlx5e_tx_err_ctx {
>>>>>   int (*recover)(struct mlx5e_txqsq *sq);
>>>>> + int (*dump)(struct mlx5e_txqsq *sq);
>>>>>   struct mlx5e_txqsq *sq;
>>>>> };
>>>>>
>>>>> @@ -281,10 +282,109 @@ static int mlx5e_tx_reporter_diagnose(struct 
>>>>> devlink_health_reporter *reporter,
>>>>>   return err;
>>>>> }
>>>>>
>>>>> +static int mlx5e_tx_reporter_sw_dump_from_ctx(struct mlx5e_priv *priv,
>>>>> +                                       struct mlx5e_txqsq *sq,
>>>>> +                                       struct devlink_fmsg *fmsg)
>>>>> +{
>>>>> + u64 *ptr = (u64 *)sq;
>>>>> + int copy, err;
>>>>> + int i = 0;
>>>>> +
>>>>> + if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>>>>> +         return 0;
>>>>> +
>>>>> + err = devlink_fmsg_pair_nest_start(fmsg, "mlx5e_txqsq");
>>>>> + if (err)
>>>>> +         return err;
>>>>> +
>>>>> + err = devlink_fmsg_obj_nest_start(fmsg);
>>>>> + if (err)
>>>>> +         return err;
>>>>> +
>>>>> + err = devlink_fmsg_arr_pair_nest_start(fmsg, "memory");
>>>>> + if (err)
>>>>> +         return err;
>>>>> +
>>>>> + while (i < sizeof(struct mlx5e_txqsq)) {
>>>>> +         copy = sizeof(u64);
>>>>> +
>>>>> +         if (i + copy > sizeof(struct mlx5e_txqsq))
>>>>> +                 copy = sizeof(struct mlx5e_txqsq) - i;
>>>>> +
>>>>> +         err = devlink_fmsg_binary_put(fmsg, ptr, copy);
>>>>> +         if (err)
>>>>> +                 return err;
>>>>> +         ptr++;
>>>>> +         i += copy;
>>>>> + }
>>>>> +
>>>>> + err = devlink_fmsg_arr_pair_nest_end(fmsg);
>>>>> + if (err)
>>>>> +         return err;
>>>>> +
>>>>> + err = devlink_fmsg_obj_nest_end(fmsg);
>>>>> + if (err)
>>>>> +         return err;
>>>>> +
>>>>> + err = devlink_fmsg_pair_nest_end(fmsg);
>>>>> +
>>>>> + return err;
>>>>> +}
>>>>> +
>>>>> +static int mlx5e_tx_reporter_sw_dump_all(struct mlx5e_priv *priv,
>>>>> +                                  struct devlink_fmsg *fmsg)
>>>>> +{
>>>>> + int i, err = 0;
>>>>> +
>>>>> + mutex_lock(&priv->state_lock);
>>>>> +
>>>>> + if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>>>>> +         goto unlock;
>>>>> +
>>>>> + err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
>>>>> + if (err)
>>>>> +         goto unlock;
>>>>> +
>>>>> + for (i = 0; i < priv->channels.num * priv->channels.params.num_tc;
>>>>> +      i++) {
>>>>> +         err = devlink_fmsg_obj_nest_start(fmsg);
>>>>> +         if (err)
>>>>> +                 goto unlock;
>>>>> +
>>>>> +         err = mlx5e_tx_reporter_sw_dump_from_ctx(priv, priv->txq2sq[i],
>>>>> +                                                  fmsg);
>>>>> +         if (err)
>>>>> +                 goto unlock;
>>>>> +
>>>>> +         err = devlink_fmsg_pair_nest_end(fmsg);
>>>>> +         if (err)
>>>>> +                 goto unlock;
>>>>> + }
>>>>> + err = devlink_fmsg_arr_pair_nest_end(fmsg);
>>>>> + if (err)
>>>>> +         goto unlock;
>>>>> +
>>>>> +unlock:
>>>>> + mutex_unlock(&priv->state_lock);
>>>>> + return err;
>>>>> +}
>>>>> +
>>>>> +static int mlx5e_tx_reporter_sw_dump(struct devlink_health_reporter 
>>>>> *reporter,
>>>>> +                              struct devlink_fmsg *fmsg, void *context)
>>>>> +{
>>>>> + struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
>>>>> + struct mlx5e_tx_err_ctx *err_ctx = context;
>>>>> +
>>>>> + return err_ctx ? mlx5e_tx_reporter_sw_dump_from_ctx(priv, err_ctx->sq,
>>>>> +                                                     fmsg) :
>>>>> +                  mlx5e_tx_reporter_sw_dump_all(priv, fmsg);
>>>>> +}
>>>>> +
>>>>> static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
>>>>>           .name = "tx",
>>>>>           .recover = mlx5e_tx_reporter_recover,
>>>>>           .diagnose = mlx5e_tx_reporter_diagnose,
>>>>> +         .dump = mlx5e_tx_reporter_sw_dump,
>>>>> };
>>>>>
>>>>> #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
>>>>> -- 
>>>>> 2.14.1
>>>>>

Reply via email to