Sun, May 12, 2019 at 10:37:35AM CEST, a...@mellanox.com wrote: > > >On 5/9/2019 11:23 AM, Jiri Pirko wrote: >> Tue, May 07, 2019 at 02:58:32PM CEST, a...@mellanox.com wrote: >>> >>> >>> On 5/7/2019 3:41 PM, Jiri Pirko wrote: >>>> Mon, Apr 29, 2019 at 04:17:39PM CEST, a...@mellanox.com wrote: >>>>> TX reporter reports an error on two scenarios: >>>>> - TX timeout on a specific tx queue >>>>> - TX completion error on a specific send queue >>>>> Prior to this patch, no dump data was supported by the tx reporter. This >>>>> patch adds support for SW data dump of the related SQ context. The dump >>>>> is simply the SQ's raw memory snapshot taken right after the error was >>>>> reported, before any recovery procedure was launched. With this >>>>> approach, no maintenance is needed as the driver fetch the actual data >>>>> according to the layout on which the SQ was compiled with. By providing >>>>> a SW context, one can easily debug error on a given SQ. >>>>> >>>>> In order to offline translate the raw memory into a human readable >>>>> format, the user can use some out-of-kernel scripts which receives as an >>>>> input the following: >>>>> - Object raw memory >>>>> - Driver object compiled with debug info (can be taken/generated at any >>>>> time from the machine) >>>>> - Object name >>>>> >>>>> An example of such script output can be seen below. >>>>> Note: the script is not offered as part of this patch as it do not >>>>> belong to the kernel, I just described it in order to grasp the general >>>>> idea of how/what can be fetched from SW dump via devlink health. >>>>> >>>>> The output of the SW dump can be extracted by devlink health command: >>>>> $ sudo devlink health dump show pci/0000:00:0b.0 reporter tx. 
>>>>> mlx5e_txqsq: sqn: 6336 >>>>> memory: >>>>> 00 00 00 00 00 00 00 00 >>>>> 01 00 00 00 00 00 00 00 >>>>> 00 00 00 00 00 00 00 00 >>>>> 45 f4 88 cb 09 00 00 00 >>>>> 00 00 00 00 00 00 00 00 >>>>> 00 00 00 00 00 00 00 00 >>>>> c0 ff ff ff 1f 00 00 00 >>>>> f8 18 1e 89 81 88 ff ff >>>>> ... >>>>> >>>>> script output below, with struct members names and actual values: >>>>> >>>>> struct mlx5e_txqsq { >>>>> short unsigned int cc 0x5 ; >>>>> unsigned int dma_fifo_cc 0x5 ; >>>>> struct net_dim { >>>>> unsigned char state 0x1 ; >>>>> struct net_dim_stats { >>>>> int ppms 0x0 ; >>>>> int bpms 0x0 ; >>>>> int epms 0x0 ; >>>>> } prev_stats; >>>>> struct net_dim_sample { >>>>> long long int time 0x90766ef9d ; >>>>> unsigned int pkt_ctr 0x0 ; >>>>> unsigned int byte_ctr 0x0 ; >>>>> short unsigned int event_ctr 0x0 ; >>>>> } start_sample; >>>>> struct work_struct { >>>>> struct { >>>>> long int counter 0x1fffffffc0 ; >>>>> } data; >>>>> struct list_head { >>>>> struct list_head * next >>>>> 0xffff8881b08998f8 ; >>>>> struct list_head * prev >>>>> 0xffff8881b08998f8 ; >>>>> } entry; >>>>> void (*func)(struct work_struct *) >>>>> 0xffffffffa02d0e30 ; >>>>> } work; >>>>> unsigned char profile_ix 0x60 ; >>>>> unsigned char mode 0x72 ; >>>>> unsigned char tune_state 0x35 ; >>>>> unsigned char steps_right 0xa0 ; >>>>> unsigned char steps_left 0xff ; >>>>> unsigned char tired 0xff ; >>>>> } dim; >>>>> short unsigned int pc 0x0 ; >>>>> unsigned int dma_fifo_pc 0x0 ; >>>>> struct mlx5e_cq { >>>>> struct mlx5_cqwq { >>>>> struct mlx5_frag_buf_ctrl { >>>>> struct mlx5_buf_list * frags 0x500000005 ; >>>>> unsigned int sz_m1 0x0 ; >>>>> short unsigned int frag_sz_m1 0x0 ; >>>>> short unsigned int strides_offset 0x0 ; >>>>> unsigned char log_sz 0x0 ; >>>>> unsigned char log_stride 0x0 ; >>>>> unsigned char log_frag_strides 0x0 ; >>>>> } fbc; >>>>> __be32 * db 0x0 ; >>>>> unsigned int cc 0x0 ; >>>>> } wq; >>>>> short unsigned int event_ctr 0x0 ; >>>>> struct napi_struct * napi 0x0 ; >>>>> 
struct mlx5_core_cq { >>>>> unsigned int cqn 0x0 ; >>>>> int cqe_sz 0x0 ; >>>>> __be32 * set_ci_db 0xffff8881b1aa4988 ; >>>>> __be32 * arm_db 0x3f000003ff ; >>>>> struct mlx5_uars_page * uar 0x6060a ; >>>>> struct refcount_struct { >>>>> struct { >>>>> int counter 0xa1814500 ; >>>>> } refs; >>>>> } refcount; >>>>> struct completion { >>>>> unsigned int done 0x5 ; >>>>> struct wait_queue_head { >>>>> struct spinlock { >>>>> union { >>>>> struct raw_spinlock { >>>>> struct >>>>> qspinlock { >>>>> union >>>>> { >>>>> >>>>> struct { >>>>> >>>>> int counter >>>>> 0x5 ; >>>>> >>>>> } val; >>>>> >>>>> struct { >>>>> >>>>> unsigned char locked >>>>> 0x5 ; >>>>> >>>>> unsigned char pending >>>>> 0x0 ; >>>>> >>>>> } ; >>>>> >>>>> struct { >>>>> >>>>> short unsigned int >>>>> locked_pending 0x5 ; >>>>> >>>>> short unsigned int tail >>>>> 0x0 ; >>>>> >>>>> } ; >>>>> } ; >>>>> } raw_lock; >>>>> } rlock; >>>>> } ; >>>>> } lock; >>>>> struct list_head { >>>>> struct list_head * next >>>>> 0xffff8881b089bb88 ; >>>>> struct list_head * prev >>>>> 0x4000000c0a ; >>>>> } head; >>>>> } wait; >>>>> } free; >>>>> unsigned int vector 0xa1814500 ; >>>>> unsigned int irqn 0xffff8881 ; >>>>> void (*comp)(struct mlx5_core_cq *) >>>>> 0xffff8881a1814504 ; >>>>> void (*event)(struct mlx5_core_cq *, enum >>>>> mlx5_event) 0xffff8881a2cdea08 ; >>>>> unsigned int cons_index 0x1 ; >>>>> unsigned int arm_sn 0x0 ; >>>>> struct mlx5_rsc_debug * dbg 0x0 ; >>>>> int pid 0x0 ; >>>>> struct { >>>>> struct list_head { >>>>> struct list_head * next >>>>> 0xffffffff ; >>>>> struct list_head * prev >>>>> 0xffffffffffffffff ; >>>>> } list; >>>>> void (*comp)(struct mlx5_core_cq *) >>>>> 0xffffffffa0356940 ; >>>>> void * priv 0x0 ; >>>>> } tasklet_ctx; >>>>> int reset_notify_added 0x0 ; >>>>> struct list_head { >>>>> struct list_head * next >>>>> 0xffffffffa0300700 ; >>>>> struct list_head * prev 0xd ; >>>>> } reset_notify; >>>>> struct mlx5_eq_comp * eq 0x0 ; >>>>> short unsigned int uid 0x9a70 ; >>>>> } mcq; 
>>>>> struct mlx5e_channel * channel 0xffff8881b0899a70 ; >>>>> struct mlx5_core_dev * mdev 0x4800000001 ; >>>>> struct mlx5_wq_ctrl { >>>>> struct mlx5_core_dev * mdev 0xffffffffa02d5350 ; >>>>> struct mlx5_frag_buf { >>>>> struct mlx5_buf_list * frags >>>>> 0xffffffffa02d5460 ; >>>>> int npages 0x0 ; >>>>> int size 0x5 ; >>>>> unsigned char page_shift 0x8 ; >>>>> } buf; >>>>> struct mlx5_db { >>>>> __be32 * db 0x1c6 ; >>>>> union { >>>>> struct mlx5_db_pgdir * pgdir 0x0 ; >>>>> struct mlx5_ib_user_db_page * user_page >>>>> 0x0 ; >>>>> } u; >>>>> long long unsigned int dma >>>>> 0xffff8881b0899ab0 ; >>>>> int index 0x0 ; >>>>> } db; >>>>> } wq_ctrl; >>>>> } cq; >>>>> struct mlx5_wq_cyc { >>>>> struct mlx5_frag_buf_ctrl { >>>>> struct mlx5_buf_list * frags 0xffff8881a7600160 ; >>>>> unsigned int sz_m1 0xa7600160 ; >>>>> short unsigned int frag_sz_m1 0x8881 ; >>>>> short unsigned int strides_offset 0xffff ; >>>>> unsigned char log_sz 0x88 ; >>>>> unsigned char log_stride 0x49 ; >>>>> unsigned char log_frag_strides 0xaa ; >>>>> } fbc; >>>>> __be32 * db 0x1000000000010 ; >>>>> short unsigned int sz 0xc ; >>>>> short unsigned int wqe_ctr 0x0 ; >>>>> short unsigned int cur_sz 0x0 ; >>>>> } wq; >>>>> unsigned int dma_fifo_mask 0xa1814500 ; >>>>> struct mlx5e_sq_stats * stats 0xffff8881a33a0348 ; >>>>> struct { >>>>> struct mlx5e_sq_dma * dma_fifo 0x1a1814500 ; >>>>> struct mlx5e_tx_wqe_info * wqe_info 0x14 ; >>>>> } db; >>>>> void * uar_map 0x0 ; >>>>> struct netdev_queue * txq 0x0 ; >>>>> unsigned int sqn 0x18c0 ; >>>>> unsigned char min_inline_mode 0x0 ; >>>>> struct device * pdev 0x0 ; >>>>> unsigned int mkey_be 0x0 ; >>>>> long unsigned int state 0x0 ; >>>>> struct hwtstamp_config * tstamp 0x0 ; >>>>> struct mlx5_clock * clock 0xffff8881b1aa6f88 ; >>>>> struct mlx5_wq_ctrl { >>>>> struct mlx5_core_dev * mdev 0x3f000003ff ; >>>>> struct mlx5_frag_buf { >>>>> struct mlx5_buf_list * frags 0x6060a ; >>>>> int npages 0xa1814604 ; >>>>> int size 0xffff8881 ; >>>>> unsigned 
char page_shift 0x0 ; >>>>> } buf; >>>>> struct mlx5_db { >>>>> __be32 * db 0xfff ; >>>>> union { >>>>> struct mlx5_db_pgdir * pgdir 0x0 ; >>>>> struct mlx5_ib_user_db_page * user_page >>>>> 0x0 ; >>>>> } u; >>>>> long long unsigned int dma 0xffff888188440000 ; >>>>> int index 0x8b074000 ; >>>>> } db; >>>>> } wq_ctrl; >>>>> struct mlx5e_channel * channel 0xffffc9000010d800 ; >>>>> int txq_ix 0xa0020180 ; >>>>> unsigned int rate_limit 0xffff8881 ; >>>>> struct work_struct { >>>>> struct { >>>>> long int counter 0x1000018c0 ; >>>>> } data; >>>>> struct list_head { >>>>> struct list_head * next 0xffff8881c32b68e8 ; >>>>> struct list_head * prev 0x800 ; >>>>> } entry; >>>>> void (*func)(struct work_struct *) 0x9 ; >>>>> } recover_work; >>>>> } ; >>>> >>>> I don't get it. You are dumping live kernel memory? There are already >>>> facilities to do that in place. Why replicate it? >>> I am dumping the driver's memory under a lock so I can ensure its >>> consistency (as opposed to /dev/mem) >>> vmcore cannot be taken from a live kernel (without crashing). >>> I need the memory's snapshot right after the error from the driver's >>> context. >> >> Got it. However, this sounds like a generic problem not specific to >> nic drivers. How do other subsystems resolve this (if they do at all)? >> >> >Correct, this is a suggested debugging solution for a generic problem: >enabling the use of a run-time memory snapshot for kernel modules (at a >given error event). My research shows that other subsystems deal with >errors either by panicking (too much) or by debug/log prints (too little). >This solution is (a) low in maintenance (b) consistent in memory (c) has >small performance impact (d) uses an existing infrastructure between the >kernel module and the user space.
I'm still not convinced that dumping kernel memory over devlink health dump is a good idea :/ >It might be ported to other subsystems using their own user-space vs. >kernel tools. Regardless of how the memory output was generated to the >user, the parsing script can work on it. Could you share the script? How is it going to be distributed? > >> >>> Which other tools do you mean? >>>> >>>> >>>>> >>>>> Signed-off-by: Aya Levin <a...@mellanox.com> >>>>> --- >>>>> .../ethernet/mellanox/mlx5/core/en/reporter_tx.c | 100 >>>>> +++++++++++++++++++++ >>>>> 1 file changed, 100 insertions(+) >>>>> >>>>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c >>>>> b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c >>>>> index 476dd97f7f2f..8a39f5525e57 100644 >>>>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c >>>>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c >>>>> @@ -9,6 +9,7 @@ >>>>> >>>>> struct mlx5e_tx_err_ctx { >>>>> int (*recover)(struct mlx5e_txqsq *sq); >>>>> + int (*dump)(struct mlx5e_txqsq *sq); >>>>> struct mlx5e_txqsq *sq; >>>>> }; >>>>> >>>>> @@ -281,10 +282,109 @@ static int mlx5e_tx_reporter_diagnose(struct >>>>> devlink_health_reporter *reporter, >>>>> return err; >>>>> } >>>>> >>>>> +static int mlx5e_tx_reporter_sw_dump_from_ctx(struct mlx5e_priv *priv, >>>>> + struct mlx5e_txqsq *sq, >>>>> + struct devlink_fmsg *fmsg) >>>>> +{ >>>>> + u64 *ptr = (u64 *)sq; >>>>> + int copy, err; >>>>> + int i = 0; >>>>> + >>>>> + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) >>>>> + return 0; >>>>> + >>>>> + err = devlink_fmsg_pair_nest_start(fmsg, "mlx5e_txqsq"); >>>>> + if (err) >>>>> + return err; >>>>> + >>>>> + err = devlink_fmsg_obj_nest_start(fmsg); >>>>> + if (err) >>>>> + return err; >>>>> + >>>>> + err = devlink_fmsg_arr_pair_nest_start(fmsg, "memory"); >>>>> + if (err) >>>>> + return err; >>>>> + >>>>> + while (i < sizeof(struct mlx5e_txqsq)) { >>>>> + copy = sizeof(u64); >>>>> + >>>>> + if (i + copy > 
sizeof(struct mlx5e_txqsq)) >>>>> + copy = sizeof(struct mlx5e_txqsq) - i; >>>>> + >>>>> + err = devlink_fmsg_binary_put(fmsg, ptr, copy); >>>>> + if (err) >>>>> + return err; >>>>> + ptr++; >>>>> + i += copy; >>>>> + } >>>>> + >>>>> + err = devlink_fmsg_arr_pair_nest_end(fmsg); >>>>> + if (err) >>>>> + return err; >>>>> + >>>>> + err = devlink_fmsg_obj_nest_end(fmsg); >>>>> + if (err) >>>>> + return err; >>>>> + >>>>> + err = devlink_fmsg_pair_nest_end(fmsg); >>>>> + >>>>> + return err; >>>>> +} >>>>> + >>>>> +static int mlx5e_tx_reporter_sw_dump_all(struct mlx5e_priv *priv, >>>>> + struct devlink_fmsg *fmsg) >>>>> +{ >>>>> + int i, err = 0; >>>>> + >>>>> + mutex_lock(&priv->state_lock); >>>>> + >>>>> + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) >>>>> + goto unlock; >>>>> + >>>>> + err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); >>>>> + if (err) >>>>> + goto unlock; >>>>> + >>>>> + for (i = 0; i < priv->channels.num * priv->channels.params.num_tc; >>>>> + i++) { >>>>> + err = devlink_fmsg_obj_nest_start(fmsg); >>>>> + if (err) >>>>> + goto unlock; >>>>> + >>>>> + err = mlx5e_tx_reporter_sw_dump_from_ctx(priv, priv->txq2sq[i], >>>>> + fmsg); >>>>> + if (err) >>>>> + goto unlock; >>>>> + >>>>> + err = devlink_fmsg_pair_nest_end(fmsg); >>>>> + if (err) >>>>> + goto unlock; >>>>> + } >>>>> + err = devlink_fmsg_arr_pair_nest_end(fmsg); >>>>> + if (err) >>>>> + goto unlock; >>>>> + >>>>> +unlock: >>>>> + mutex_unlock(&priv->state_lock); >>>>> + return err; >>>>> +} >>>>> + >>>>> +static int mlx5e_tx_reporter_sw_dump(struct devlink_health_reporter >>>>> *reporter, >>>>> + struct devlink_fmsg *fmsg, void *context) >>>>> +{ >>>>> + struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); >>>>> + struct mlx5e_tx_err_ctx *err_ctx = context; >>>>> + >>>>> + return err_ctx ? 
mlx5e_tx_reporter_sw_dump_from_ctx(priv, err_ctx->sq, >>>>> + fmsg) : >>>>> + mlx5e_tx_reporter_sw_dump_all(priv, fmsg); >>>>> +} >>>>> + >>>>> static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { >>>>> .name = "tx", >>>>> .recover = mlx5e_tx_reporter_recover, >>>>> .diagnose = mlx5e_tx_reporter_diagnose, >>>>> + .dump = mlx5e_tx_reporter_sw_dump, >>>>> }; >>>>> >>>>> #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 >>>>> -- >>>>> 2.14.1 >>>>>