On Fri, 17 Jul 2020 16:34:47 +0800 Luo bin wrote:
> add support to handle hw abnormal event such as hardware failure,
> cable unplugged,link error
> 
> Signed-off-by: Luo bin <luob...@huawei.com>
> Reported-by: kernel test robot <l...@intel.com>

> +static void hinic_comm_recv_mgmt_self_cmd_reg(struct hinic_pfhwdev *pfhwdev,
> +                                           u8 cmd,
> +                                           comm_mgmt_self_msg_proc proc)
> +{
> +     u8 cmd_idx;
> +
> +     cmd_idx = pfhwdev->proc.cmd_num;
> +     if (cmd_idx >= HINIC_COMM_SELF_CMD_MAX) {
> +             dev_err(&pfhwdev->hwdev.hwif->pdev->dev,
> +                     "Register recv mgmt process failed, cmd: 0x%x\n", cmd);
> +             return;
> +     }
> +
> +     pfhwdev->proc.info[cmd_idx].cmd = cmd;
> +     pfhwdev->proc.info[cmd_idx].proc = proc;
> +     pfhwdev->proc.cmd_num++;
> +}
> +
> +static void hinic_comm_recv_mgmt_self_cmd_unreg(struct hinic_pfhwdev *pfhwdev,
> +                                             u8 cmd)
> +{
> +     u8 cmd_idx;
> +
> +     cmd_idx = pfhwdev->proc.cmd_num;
> +     if (cmd_idx >= HINIC_COMM_SELF_CMD_MAX) {
> +             dev_err(&pfhwdev->hwdev.hwif->pdev->dev, "Unregister recv mgmt process failed, cmd: 0x%x\n",
> +                     cmd);
> +             return;
> +     }
> +
> +     for (cmd_idx = 0; cmd_idx < HINIC_COMM_SELF_CMD_MAX; cmd_idx++) {
> +             if (cmd == pfhwdev->proc.info[cmd_idx].cmd) {
> +                     pfhwdev->proc.info[cmd_idx].cmd = 0;
> +                     pfhwdev->proc.info[cmd_idx].proc = NULL;
> +                     pfhwdev->proc.cmd_num--;
> +             }
> +     }
> +}
> +
> +static void comm_mgmt_msg_handler(void *handle, u8 cmd, void *buf_in,
> +                               u16 in_size, void *buf_out, u16 *out_size)
> +{
> +     struct hinic_pfhwdev *pfhwdev = handle;
> +     u8 cmd_idx;
> +
> +     for (cmd_idx = 0; cmd_idx < pfhwdev->proc.cmd_num; cmd_idx++) {
> +             if (cmd == pfhwdev->proc.info[cmd_idx].cmd) {
> +                     if (!pfhwdev->proc.info[cmd_idx].proc) {
> +                             dev_warn(&pfhwdev->hwdev.hwif->pdev->dev,
> +                                      "PF recv mgmt comm msg handle null, cmd: 0x%x\n",
> +                                      cmd);
> +                     } else {
> +                             pfhwdev->proc.info[cmd_idx].proc
> +                                     (&pfhwdev->hwdev, buf_in, in_size,
> +                                      buf_out, out_size);
> +                     }
> +
> +                     return;
> +             }
> +     }
> +
> +     dev_warn(&pfhwdev->hwdev.hwif->pdev->dev, "Received unknown mgmt cpu event: 0x%x\n",
> +              cmd);
> +
> +     *out_size = 0;
> +}
> +
> +static void chip_fault_show(struct hinic_hwdev *hwdev,
> +                         struct hinic_fault_event *event)
> +{
> +     char fault_level[FAULT_TYPE_MAX][FAULT_SHOW_STR_LEN + 1] = {
> +             "fatal", "reset", "flr", "general", "suggestion"};
> +     char level_str[FAULT_SHOW_STR_LEN + 1] = {0};
> +     u8 level;
> +
> +     level = event->event.chip.err_level;
> +     if (level < FAULT_LEVEL_MAX)
> +             strncpy(level_str, fault_level[level],
> +                     FAULT_SHOW_STR_LEN);
> +     else
> +             strncpy(level_str, "Unknown", FAULT_SHOW_STR_LEN);
> +
> +     if (level == FAULT_LEVEL_SERIOUS_FLR)
> +             dev_err(&hwdev->hwif->pdev->dev, "err_level: %d [%s], flr func_id: %d\n",
> +                     level, level_str, event->event.chip.func_id);
> +
> +     dev_err(&hwdev->hwif->pdev->dev, "Module_id: 0x%x, err_type: 0x%x, err_level: %d[%s], err_csr_addr: 0x%08x, err_csr_value: 0x%08x\n",
> +             event->event.chip.node_id,
> +             event->event.chip.err_type, level, level_str,
> +             event->event.chip.err_csr_addr,
> +             event->event.chip.err_csr_value);
> +}
> +
> +static void fault_report_show(struct hinic_hwdev *hwdev,
> +                           struct hinic_fault_event *event)
> +{
> +     char fault_type[FAULT_TYPE_MAX][FAULT_SHOW_STR_LEN + 1] = {
> +             "chip", "ucode", "mem rd timeout", "mem wr timeout",
> +             "reg rd timeout", "reg wr timeout", "phy fault"};
> +     char type_str[FAULT_SHOW_STR_LEN + 1] = {0};
> +
> +     dev_err(&hwdev->hwif->pdev->dev, "Fault event report received, func_id: %d\n",
> +             HINIC_HWIF_FUNC_IDX(hwdev->hwif));
> +
> +     if (event->type < FAULT_TYPE_MAX)
> +             strncpy(type_str, fault_type[event->type], FAULT_SHOW_STR_LEN);
> +     else
> +             strncpy(type_str, "Unknown", FAULT_SHOW_STR_LEN);
> +
> +     dev_err(&hwdev->hwif->pdev->dev, "Fault type: %d [%s]\n", event->type,
> +             type_str);
> +     dev_err(&hwdev->hwif->pdev->dev,  "Fault val[0]: 0x%08x, val[1]: 0x%08x, val[2]: 0x%08x, val[3]: 0x%08x\n",
> +             event->event.val[0], event->event.val[1], event->event.val[2],
> +             event->event.val[3]);
> +
> +     switch (event->type) {
> +     case FAULT_TYPE_CHIP:
> +             chip_fault_show(hwdev, event);
> +             break;
> +     case FAULT_TYPE_UCODE:
> +             dev_err(&hwdev->hwif->pdev->dev, "Cause_id: %d, core_id: %d, c_id: %d, epc: 0x%08x\n",
> +                     event->event.ucode.cause_id, event->event.ucode.core_id,
> +                     event->event.ucode.c_id, event->event.ucode.epc);
> +             break;
> +     case FAULT_TYPE_MEM_RD_TIMEOUT:
> +     case FAULT_TYPE_MEM_WR_TIMEOUT:
> +             dev_err(&hwdev->hwif->pdev->dev, "Err_csr_ctrl: 0x%08x, err_csr_data: 0x%08x, ctrl_tab: 0x%08x, mem_index: 0x%08x\n",
> +                     event->event.mem_timeout.err_csr_ctrl,
> +                     event->event.mem_timeout.err_csr_data,
> +                     event->event.mem_timeout.ctrl_tab,
> +                     event->event.mem_timeout.mem_index);
> +             break;
> +     case FAULT_TYPE_REG_RD_TIMEOUT:
> +     case FAULT_TYPE_REG_WR_TIMEOUT:
> +             dev_err(&hwdev->hwif->pdev->dev, "Err_csr: 0x%08x\n",
> +                     event->event.reg_timeout.err_csr);
> +             break;
> +     case FAULT_TYPE_PHY_FAULT:
> +             dev_err(&hwdev->hwif->pdev->dev, "Op_type: %u, port_id: %u, dev_ad: %u, csr_addr: 0x%08x, op_data: 0x%08x\n",
> +                     event->event.phy_fault.op_type,
> +                     event->event.phy_fault.port_id,
> +                     event->event.phy_fault.dev_ad,
> +                     event->event.phy_fault.csr_addr,
> +                     event->event.phy_fault.op_data);
> +             break;
> +     default:
> +             break;
> +     }
> +}
> +
> +/* pf fault report event */
> +static void pf_fault_event_handler(void *hwdev, void *buf_in, u16 in_size,
> +                                void *buf_out, u16 *out_size)
> +{
> +     struct hinic_cmd_fault_event *fault_event = buf_in;
> +
> +     fault_report_show(hwdev, &fault_event->event);
> +}
> +
> +static void mgmt_watchdog_timeout_event_handler(void *dev,
> +                                             void *buf_in, u16 in_size,
> +                                             void *buf_out, u16 *out_size)
> +{
> +     struct hinic_mgmt_watchdog_info *watchdog_info = NULL;
> +     struct hinic_hwdev *hwdev = dev;
> +     u32 *dump_addr = NULL;
> +     u32 stack_len, i, j;
> +     u32 *reg = NULL;
> +
> +     if (in_size != sizeof(*watchdog_info)) {
> +             dev_err(&hwdev->hwif->pdev->dev, "Invalid mgmt watchdog report, length: %d, should be %zu\n",
> +                     in_size, sizeof(*watchdog_info));
> +             return;
> +     }
> +
> +     watchdog_info = buf_in;
> +
> +     dev_err(&hwdev->hwif->pdev->dev, "Mgmt deadloop time: 0x%x 0x%x, task id: 0x%x, sp: 0x%x\n",
> +             watchdog_info->curr_time_h, watchdog_info->curr_time_l,
> +             watchdog_info->task_id, watchdog_info->sp);
> +     dev_err(&hwdev->hwif->pdev->dev, "Stack current used: 0x%x, peak used: 0x%x, overflow flag: 0x%x, top: 0x%x, bottom: 0x%x\n",
> +             watchdog_info->curr_used, watchdog_info->peak_used,
> +             watchdog_info->is_overflow, watchdog_info->stack_top,
> +             watchdog_info->stack_bottom);
> +
> +     dev_err(&hwdev->hwif->pdev->dev, "Mgmt pc: 0x%08x, lr: 0x%08x, cpsr:0x%08x\n",
> +             watchdog_info->pc, watchdog_info->lr, watchdog_info->cpsr);
> +
> +     dev_err(&hwdev->hwif->pdev->dev, "Mgmt register info\n");
> +
> +     for (i = 0; i < 3; i++) {
> +             reg = watchdog_info->reg + (u64)(u32)(4 * i);
> +             dev_err(&hwdev->hwif->pdev->dev, "0x%08x 0x%08x 0x%08x 0x%08x\n",
> +                     *(reg), *(reg + 1), *(reg + 2), *(reg + 3));
> +     }
> +
> +     dev_err(&hwdev->hwif->pdev->dev, "0x%08x\n", watchdog_info->reg[12]);
> +
> +     if (watchdog_info->stack_actlen <= 1024) {
> +             stack_len = watchdog_info->stack_actlen;
> +     } else {
> +             dev_err(&hwdev->hwif->pdev->dev, "Oops stack length: 0x%x is wrong\n",
> +                     watchdog_info->stack_actlen);
> +             stack_len = 1024;
> +     }
> +
> +     dev_err(&hwdev->hwif->pdev->dev, "Mgmt dump stack, 16Bytes per line(start from sp)\n");
> +     for (i = 0; i < (stack_len / 16); i++) {
> +             dump_addr = (u32 *)(watchdog_info->data + ((u64)(u32)(i * 16)));
> +             dev_err(&hwdev->hwif->pdev->dev, "0x%08x 0x%08x 0x%08x 0x%08x\n",
> +                     *dump_addr, *(dump_addr + 1), *(dump_addr + 2),
> +                     *(dump_addr + 3));
> +     }
> +
> +     for (j = 0; j < ((stack_len % 16) / 4); j++) {
> +             dump_addr = (u32 *)(watchdog_info->data +
> +                         ((u64)(u32)(i * 16 + j * 4)));
> +             dev_err(&hwdev->hwif->pdev->dev, "0x%08x ", *dump_addr);
> +     }
> +
> +     *out_size = sizeof(*watchdog_info);
> +     watchdog_info = buf_out;
> +     watchdog_info->status = 0;
> +}

These fault-report and watchdog-timeout handlers look like things that
should be implemented as devlink health reporters.

> diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c b/drivers/net/ethernet/huawei/hinic/hinic_main.c
> index c4c6f9c29f0e..98d2133a268b 100644
> --- a/drivers/net/ethernet/huawei/hinic/hinic_main.c
> +++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c
> @@ -75,6 +75,10 @@ MODULE_PARM_DESC(rx_weight, "Number Rx packets for NAPI budget (default=64)");
>  #define HINIC_DEAULT_TXRX_MSIX_COALESC_TIMER_CFG     32
>  #define HINIC_DEAULT_TXRX_MSIX_RESEND_TIMER_CFG              7
>  
> +static const char *hinic_module_link_err[LINK_ERR_NUM] = {
> +     "Unrecognized module",
> +};
> +
>  static int change_mac_addr(struct net_device *netdev, const u8 *addr);
>  
>  static int set_features(struct hinic_dev *nic_dev,
> @@ -971,6 +975,44 @@ static void link_status_event_handler(void *handle, void *buf_in, u16 in_size,
>       *out_size = sizeof(*ret_link_status);
>  }
>  
> +static void cable_plug_event(void *handle,
> +                          void *buf_in, u16 in_size,
> +                          void *buf_out, u16 *out_size)
> +{
> +     struct hinic_cable_plug_event *plug_event = buf_in;
> +     struct hinic_dev *nic_dev = handle;
> +
> +     netif_info(nic_dev, link, nic_dev->netdev,
> +                "Port module event: Cable %s\n",
> +                plug_event->plugged ? "plugged" : "unplugged");
> +
> +     *out_size = sizeof(*plug_event);
> +     plug_event = buf_out;
> +     plug_event->status = 0;
> +}
> +
> +static void link_err_event(void *handle,
> +                        void *buf_in, u16 in_size,
> +                        void *buf_out, u16 *out_size)
> +{
> +     struct hinic_link_err_event *link_err = buf_in;
> +     struct hinic_dev *nic_dev = handle;
> +
> +     if (link_err->err_type >= LINK_ERR_NUM)
> +             netif_info(nic_dev, link, nic_dev->netdev,
> +                        "Link failed, Unknown error type: 0x%x\n",
> +                        link_err->err_type);
> +     else
> +             netif_info(nic_dev, link, nic_dev->netdev,
> +                        "Link failed, error type: 0x%x: %s\n",
> +                        link_err->err_type,
> +                        hinic_module_link_err[link_err->err_type]);
> +
> +     *out_size = sizeof(*link_err);
> +     link_err = buf_out;
> +     link_err->status = 0;
> +}

Please expose link error information via the infrastructure added in
commit ecc31c60240b ("ethtool: Add link extended state") rather than
printing it to the logs.

Reply via email to