On Thu, 2020-12-03 at 12:01 +0800, Can Guo wrote:
> On 2020-12-03 10:21, Stanley Chu wrote:
> > On Wed, 2020-12-02 at 04:04 -0800, Can Guo wrote:
> >> In current task abort routine, if task abort happens to the device 
> >> W-LU,
> >> the code directly jumps to ufshcd_eh_host_reset_handler() to perform a
> >> full reset and restore then returns FAIL or SUCCESS. Commands sent to 
> >> the
> >> device W-LU are most likely the SSU cmds sent during UFS PM 
> >> operations. If
> >> such SSU cmd enters task abort routine, when 
> >> ufshcd_eh_host_reset_handler()
> >> flushes eh_work, it will get stuck there since err_handler is 
> >> serialized
> >> with PM operations.
> >> 
> >> In order to unblock above call path, we merely clean up the lrb taken 
> >> by
> >> this cmd, queue the eh_work and return SUCCESS. Once the cmd is 
> >> aborted,
> >> the PM operation which sends out the cmd just errors out, then 
> >> err_handler
> >> shall be able to proceed with the full reset and restore.
> >> 
> >> In this scenario, the cmd is aborted even before it is actually 
> >> cleared by
> >> HW, set the lrb->in_use flag to prevent subsequent cmds, including 
> >> SCSI
> >> cmds and dev cmds, from taking the lrb released from abort. The flag 
> >> shall
> >> eventually be cleared in __ufshcd_transfer_req_compl() invoked by the 
> >> full
> >> reset and restore from err_handler.
> >> 
> >> Reviewed-by: Asutosh Das <[email protected]>
> >> Signed-off-by: Can Guo <[email protected]>
> >> ---
> >>  drivers/scsi/ufs/ufshcd.c | 60 
> >> +++++++++++++++++++++++++++++++++++++----------
> >>  drivers/scsi/ufs/ufshcd.h |  2 ++
> >>  2 files changed, 49 insertions(+), 13 deletions(-)
> >> 
> >> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> >> index f0bb3fc..26c1fa0 100644
> >> --- a/drivers/scsi/ufs/ufshcd.c
> >> +++ b/drivers/scsi/ufs/ufshcd.c
> >> @@ -2539,6 +2539,14 @@ static int ufshcd_queuecommand(struct Scsi_Host 
> >> *host, struct scsi_cmnd *cmd)
> >>            (hba->clk_gating.state != CLKS_ON));
> >> 
> >>    lrbp = &hba->lrb[tag];
> >> +  if (unlikely(lrbp->in_use)) {
> >> +          if (hba->pm_op_in_progress)
> >> +                  set_host_byte(cmd, DID_BAD_TARGET);
> >> +          else
> >> +                  err = SCSI_MLQUEUE_HOST_BUSY;
> >> +          ufshcd_release(hba);
> >> +          goto out;
> >> +  }
> >> 
> >>    WARN_ON(lrbp->cmd);
> >>    lrbp->cmd = cmd;
> >> @@ -2781,6 +2789,11 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba 
> >> *hba,
> >> 
> >>    init_completion(&wait);
> >>    lrbp = &hba->lrb[tag];
> >> +  if (unlikely(lrbp->in_use)) {
> >> +          err = -EBUSY;
> >> +          goto out;
> >> +  }
> >> +
> >>    WARN_ON(lrbp->cmd);
> >>    err = ufshcd_compose_dev_cmd(hba, lrbp, cmd_type, tag);
> >>    if (unlikely(err))
> >> @@ -2797,6 +2810,7 @@ static int ufshcd_exec_dev_cmd(struct ufs_hba 
> >> *hba,
> >> 
> >>    err = ufshcd_wait_for_dev_cmd(hba, lrbp, timeout);
> >> 
> >> +out:
> >>    ufshcd_add_query_upiu_trace(hba, tag,
> >>                    err ? "query_complete_err" : "query_complete");
> >> 
> >> @@ -4929,9 +4943,11 @@ static void __ufshcd_transfer_req_compl(struct 
> >> ufs_hba *hba,
> >>    struct scsi_cmnd *cmd;
> >>    int result;
> >>    int index;
> >> +  bool update_scaling = false;
> >> 
> >>    for_each_set_bit(index, &completed_reqs, hba->nutrs) {
> >>            lrbp = &hba->lrb[index];
> >> +          lrbp->in_use = false;
> >>            lrbp->compl_time_stamp = ktime_get();
> >>            cmd = lrbp->cmd;
> >>            if (cmd) {
> >> @@ -4944,15 +4960,17 @@ static void __ufshcd_transfer_req_compl(struct 
> >> ufs_hba *hba,
> >>                    /* Do not touch lrbp after scsi done */
> >>                    cmd->scsi_done(cmd);
> >>                    __ufshcd_release(hba);
> >> +                  update_scaling = true;
> >>            } else if (lrbp->command_type == UTP_CMD_TYPE_DEV_MANAGE ||
> >>                    lrbp->command_type == UTP_CMD_TYPE_UFS_STORAGE) {
> >>                    if (hba->dev_cmd.complete) {
> >>                            ufshcd_add_command_trace(hba, index,
> >>                                            "dev_complete");
> >>                            complete(hba->dev_cmd.complete);
> >> +                          update_scaling = true;
> >>                    }
> >>            }
> >> -          if (ufshcd_is_clkscaling_supported(hba))
> >> +          if (ufshcd_is_clkscaling_supported(hba) && update_scaling)
> >>                    hba->clk_scaling.active_reqs--;
> >>    }
> >> 
> >> @@ -6374,8 +6392,12 @@ static int ufshcd_issue_devman_upiu_cmd(struct 
> >> ufs_hba *hba,
> >> 
> >>    init_completion(&wait);
> >>    lrbp = &hba->lrb[tag];
> >> -  WARN_ON(lrbp->cmd);
> >> +  if (unlikely(lrbp->in_use)) {
> >> +          err = -EBUSY;
> >> +          goto out;
> >> +  }
> >> 
> >> +  WARN_ON(lrbp->cmd);
> >>    lrbp->cmd = NULL;
> >>    lrbp->sense_bufflen = 0;
> >>    lrbp->sense_buffer = NULL;
> >> @@ -6447,6 +6469,7 @@ static int ufshcd_issue_devman_upiu_cmd(struct 
> >> ufs_hba *hba,
> >>            }
> >>    }
> >> 
> >> +out:
> >>    blk_put_request(req);
> >>  out_unlock:
> >>    up_read(&hba->clk_scaling_lock);
> >> @@ -6696,16 +6719,6 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
> >>            BUG();
> >>    }
> >> 
> >> -  /*
> >> -   * Task abort to the device W-LUN is illegal. When this command
> >> -   * will fail, due to spec violation, scsi err handling next step
> >> -   * will be to send LU reset which, again, is a spec violation.
> >> -   * To avoid these unnecessary/illegal step we skip to the last error
> >> -   * handling stage: reset and restore.
> >> -   */
> >> -  if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN)
> >> -          return ufshcd_eh_host_reset_handler(cmd);
> >> -
> >>    ufshcd_hold(hba, false);
> >>    reg = ufshcd_readl(hba, REG_UTP_TRANSFER_REQ_DOOR_BELL);
> >>    /* If command is already aborted/completed, return SUCCESS */
> >> @@ -6726,7 +6739,7 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
> >>     * to reduce repeated printouts. For other aborted requests only 
> >> print
> >>     * basic details.
> >>     */
> >> -  scsi_print_command(hba->lrb[tag].cmd);
> >> +  scsi_print_command(cmd);
> >>    if (!hba->req_abort_count) {
> >>            ufshcd_update_reg_hist(&hba->ufs_stats.task_abort, 0);
> >>            ufshcd_print_host_regs(hba);
> >> @@ -6745,6 +6758,27 @@ static int ufshcd_abort(struct scsi_cmnd *cmd)
> >>            goto cleanup;
> >>    }
> >> 
> >> +  /*
> >> +   * Task abort to the device W-LUN is illegal. When this command
> >> +   * will fail, due to spec violation, scsi err handling next step
> >> +   * will be to send LU reset which, again, is a spec violation.
> >> +   * To avoid these unnecessary/illegal steps, first we clean up
> >> +   * the lrb taken by this cmd and mark the lrb as in_use, then
> >> +   * queue the eh_work and bail.
> >> +   */
> >> +  if (lrbp->lun == UFS_UPIU_UFS_DEVICE_WLUN) {
> >> +          spin_lock_irqsave(host->host_lock, flags);
> >> +          if (lrbp->cmd) {
> >> +                  __ufshcd_transfer_req_compl(hba, (1UL << tag));
> >> +                  __set_bit(tag, &hba->outstanding_reqs);
> >> +                  lrbp->in_use = true;
> >> +                  hba->force_reset = true;
> >> +                  ufshcd_schedule_eh_work(hba);
> > 
> > ufshcd_schedule_eh_work() will set hba->ufshcd_state as
> > UFSHCD_STATE_EH_SCHEDULED_FATAL. While in this state,
> > ufshcd_queuecommand() will set_host_byte(DID_BAD_TARGET) which is
> > similar to what you would like to do in this patch.
> > 
> > Is this enough to avoid the tag re-use issue? I just wonder whether
> > the lrbp->in_use flag really needs to be added.
> 
> Hi Stanley,
> 
> Thanks for the discussion.
> 
> To be accurate, it is to prevent lrb from being re-used, not the
> tag.
> Block layer and/or scsi layer can re-use the tag right after
> we abort the cmd, but the lrb is empty since we cleared it from
> abort path and we need to make sure the lrb stays empty before the
> full reset and restore happens.

What is the definition of "empty" here?

If it means lrb->cmd shall be empty (to avoid invoking scsi_done again),
then the hba->ufshcd_state check in ufshcd_queuecommand() will also
clear the re-used lrb->cmd if ufshcd_state is
UFSHCD_STATE_EH_SCHEDULED_FATAL.

However ufshcd_state cannot protect other paths now, for example,
ufshcd_exec_dev_cmd(), so lrbp->in_use may be required for this usage,
or ufshcd_state check can be added to help.

BTW, would you also need to consider ufshcd_issue_devman_upiu_cmd() that
is another possible path to re-use lrb?

Thanks,
Stanley Chu

> So, in queuecommand path, we have
> below checks to prevent the lrb being re-used. This is before
> hba->ufshcd_state checks.
> 
> +    if (unlikely(lrbp->in_use)) {
> +        if (hba->pm_op_in_progress)
> +            set_host_byte(cmd, DID_BAD_TARGET);
> +        else
> +            err = SCSI_MLQUEUE_HOST_BUSY;
> +        ufshcd_release(hba);
> +        goto out;
> +    }
> 
> In above checks, below exception is for the case that a SSU cmd
> sent from PM ops is trying to re-use the lrb. In this case, we
> should simply let it fail so that PM ops errors out to unblock
> error handling (since error handling is serialized with PM ops).
> 
> +        if (hba->pm_op_in_progress)
> +            set_host_byte(cmd, DID_BAD_TARGET);
> 
> Thanks,
> 
> Can Guo.
> 
> > 
> >> +          }
> >> +          spin_unlock_irqrestore(host->host_lock, flags);
> >> +          goto out;
> >> +  }
> >> +
> >>    /* Skip task abort in case previous aborts failed and report failure 
> >> */
> >>    if (lrbp->req_abort_skip)
> >>            err = -EIO;
> >> diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
> >> index 1e680bf..66e5338 100644
> >> --- a/drivers/scsi/ufs/ufshcd.h
> >> +++ b/drivers/scsi/ufs/ufshcd.h
> >> @@ -163,6 +163,7 @@ struct ufs_pm_lvl_states {
> >>   * @crypto_key_slot: the key slot to use for inline crypto (-1 if 
> >> none)
> >>   * @data_unit_num: the data unit number for the first block for 
> >> inline crypto
> >>   * @req_abort_skip: skip request abort task flag
> >> + * @in_use: indicates that this lrb is still in use
> >>   */
> >>  struct ufshcd_lrb {
> >>    struct utp_transfer_req_desc *utr_descriptor_ptr;
> >> @@ -192,6 +193,7 @@ struct ufshcd_lrb {
> >>  #endif
> >> 
> >>    bool req_abort_skip;
> >> +  bool in_use;
> >>  };
> >> 
> >>  /**

Reply via email to