A prior patch inadvertently caused lpfc_sli_sum_iocb to exclude
counting of outstanding aborted I/Os and ABORT iocbs. Thus,
lpfc_reset_flush_io_context called from any TMF routine does not
properly wait to flush all outstanding FCP iocbs leading to a
block layer crash on an invalid scsi_cmnd->request pointer.
kernel BUG at ../block/blk-core.c:1489!
RIP: 0010:blk_requeue_request+0xaf/0xc0
...
Call Trace:
<IRQ>
__scsi_queue_insert+0x90/0xe0 [scsi_mod]
blk_done_softirq+0x7e/0x90
__do_softirq+0xd2/0x280
irq_exit+0xd5/0xe0
do_IRQ+0x4c/0xd0
common_interrupt+0x87/0x87
</IRQ>
Fix by separating out the LPFC_IO_FCP, LPFC_IO_ON_TXCMPLQ,
LPFC_DRIVER_ABORTED, and CMD_ABORT_XRI_CN || CMD_CLOSE_XRI_CN checks
into a new lpfc_sli_validate_fcp_iocb_for_abort routine when determining
to build an ABORT iocb.
Restore lpfc_reset_flush_io_context functionality by including
counting of outstanding aborted iocbs and ABORT iocbs in lpfc_sli_sum_iocb.
Fixes: e1364711359f ("scsi: lpfc: Fix illegal memory access on Abort IOCBs")
Cc: <stable(a)vger.kernel.org> # v5.12+
Co-developed-by: Justin Tee <justin.tee(a)broadcom.com>
Signed-off-by: Justin Tee <justin.tee(a)broadcom.com>
Signed-off-by: James Smart <jsmart2021(a)gmail.com>
---
drivers/scsi/lpfc/lpfc_sli.c | 101 +++++++++++++++++++++++++++--------
1 file changed, 78 insertions(+), 23 deletions(-)
diff --git a/drivers/scsi/lpfc/lpfc_sli.c b/drivers/scsi/lpfc/lpfc_sli.c
index 546c851938bc..e8f6ad484768 100644
--- a/drivers/scsi/lpfc/lpfc_sli.c
+++ b/drivers/scsi/lpfc/lpfc_sli.c
@@ -12485,15 +12485,54 @@ lpfc_sli_hba_iocb_abort(struct lpfc_hba *phba)
}
/**
- * lpfc_sli_validate_fcp_iocb - find commands associated with a vport or LUN
+ * lpfc_sli_validate_fcp_iocb_for_abort - filter iocbs appropriate for FCP aborts
+ * @iocbq: Pointer to iocb object.
+ * @vport: Pointer to driver virtual port object.
+ *
+ * This function acts as an iocb filter for functions which abort FCP iocbs.
+ *
+ * Return values
+ * -ENODEV, if a null iocb or vport ptr is encountered
+ * -EINVAL, if the iocb is not an FCP I/O, not on the TX cmpl queue, premarked as
+ * driver already started the abort process, or is an abort iocb itself
+ * 0, passes criteria for aborting the FCP I/O iocb
+ **/
+static int
+lpfc_sli_validate_fcp_iocb_for_abort(struct lpfc_iocbq *iocbq,
+ struct lpfc_vport *vport)
+{
+ IOCB_t *icmd = NULL;
+
+ /* No null ptr vports */
+ if (!iocbq || iocbq->vport != vport)
+ return -ENODEV;
+
+ /* iocb must be for FCP IO, already exists on the TX cmpl queue,
+ * can't be premarked as driver aborted, nor be an ABORT iocb itself
+ */
+ icmd = &iocbq->iocb;
+ if (!(iocbq->iocb_flag & LPFC_IO_FCP) ||
+ !(iocbq->iocb_flag & LPFC_IO_ON_TXCMPLQ) ||
+ (iocbq->iocb_flag & LPFC_DRIVER_ABORTED) ||
+ (icmd->ulpCommand == CMD_ABORT_XRI_CN ||
+ icmd->ulpCommand == CMD_CLOSE_XRI_CN))
+ return -EINVAL;
+
+ return 0;
+}
+
+/**
+ * lpfc_sli_validate_fcp_iocb - validate commands associated with a SCSI target
* @iocbq: Pointer to driver iocb object.
* @vport: Pointer to driver virtual port object.
* @tgt_id: SCSI ID of the target.
* @lun_id: LUN ID of the scsi device.
* @ctx_cmd: LPFC_CTX_LUN/LPFC_CTX_TGT/LPFC_CTX_HOST
*
- * This function acts as an iocb filter for functions which abort or count
- * all FCP iocbs pending on a lun/SCSI target/SCSI host. It will return
+ * This function acts as an iocb filter for validating a lun/SCSI target/SCSI
+ * host.
+ *
+ * It will return
* 0 if the filtering criteria is met for the given iocb and will return
* 1 if the filtering criteria is not met.
* If ctx_cmd == LPFC_CTX_LUN, the function returns 0 only if the
@@ -12512,22 +12551,8 @@ lpfc_sli_validate_fcp_iocb(struct lpfc_iocbq *iocbq, struct lpfc_vport *vport,
lpfc_ctx_cmd ctx_cmd)
{
struct lpfc_io_buf *lpfc_cmd;
- IOCB_t *icmd = NULL;
int rc = 1;
- if (!iocbq || iocbq->vport != vport)
- return rc;
-
- if (!(iocbq->iocb_flag & LPFC_IO_FCP) ||
- !(iocbq->iocb_flag & LPFC_IO_ON_TXCMPLQ) ||
- iocbq->iocb_flag & LPFC_DRIVER_ABORTED)
- return rc;
-
- icmd = &iocbq->iocb;
- if (icmd->ulpCommand == CMD_ABORT_XRI_CN ||
- icmd->ulpCommand == CMD_CLOSE_XRI_CN)
- return rc;
-
lpfc_cmd = container_of(iocbq, struct lpfc_io_buf, cur_iocbq);
if (lpfc_cmd->pCmd == NULL)
@@ -12582,17 +12607,33 @@ lpfc_sli_sum_iocb(struct lpfc_vport *vport, uint16_t tgt_id, uint64_t lun_id,
{
struct lpfc_hba *phba = vport->phba;
struct lpfc_iocbq *iocbq;
+ IOCB_t *icmd = NULL;
int sum, i;
+ unsigned long iflags;
- spin_lock_irq(&phba->hbalock);
+ spin_lock_irqsave(&phba->hbalock, iflags);
for (i = 1, sum = 0; i <= phba->sli.last_iotag; i++) {
iocbq = phba->sli.iocbq_lookup[i];
- if (lpfc_sli_validate_fcp_iocb (iocbq, vport, tgt_id, lun_id,
- ctx_cmd) == 0)
+ if (!iocbq || iocbq->vport != vport)
+ continue;
+ if (!(iocbq->iocb_flag & LPFC_IO_FCP) ||
+ !(iocbq->iocb_flag & LPFC_IO_ON_TXCMPLQ))
+ continue;
+
+ /* Include counting outstanding aborts */
+ icmd = &iocbq->iocb;
+ if (icmd->ulpCommand == CMD_ABORT_XRI_CN ||
+ icmd->ulpCommand == CMD_CLOSE_XRI_CN) {
+ sum++;
+ continue;
+ }
+
+ if (lpfc_sli_validate_fcp_iocb(iocbq, vport, tgt_id, lun_id,
+ ctx_cmd) == 0)
sum++;
}
- spin_unlock_irq(&phba->hbalock);
+ spin_unlock_irqrestore(&phba->hbalock, iflags);
return sum;
}
@@ -12659,7 +12700,11 @@ lpfc_sli_abort_fcp_cmpl(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
*
* This function sends an abort command for every SCSI command
* associated with the given virtual port pending on the ring
- * filtered by lpfc_sli_validate_fcp_iocb function.
+ * filtered by lpfc_sli_validate_fcp_iocb_for_abort and then
+ * lpfc_sli_validate_fcp_iocb function. The ordering for validation before
+ * submitting abort iocbs must be lpfc_sli_validate_fcp_iocb_for_abort
+ * followed by lpfc_sli_validate_fcp_iocb.
+ *
* When abort_cmd == LPFC_CTX_LUN, the function sends abort only to the
* FCP iocbs associated with lun specified by tgt_id and lun_id
* parameters
@@ -12691,6 +12736,9 @@ lpfc_sli_abort_iocb(struct lpfc_vport *vport, u16 tgt_id, u64 lun_id,
for (i = 1; i <= phba->sli.last_iotag; i++) {
iocbq = phba->sli.iocbq_lookup[i];
+ if (lpfc_sli_validate_fcp_iocb_for_abort(iocbq, vport))
+ continue;
+
if (lpfc_sli_validate_fcp_iocb(iocbq, vport, tgt_id, lun_id,
abort_cmd) != 0)
continue;
@@ -12723,7 +12771,11 @@ lpfc_sli_abort_iocb(struct lpfc_vport *vport, u16 tgt_id, u64 lun_id,
*
* This function sends an abort command for every SCSI command
* associated with the given virtual port pending on the ring
- * filtered by lpfc_sli_validate_fcp_iocb function.
+ * filtered by lpfc_sli_validate_fcp_iocb_for_abort and then
+ * lpfc_sli_validate_fcp_iocb function. The ordering for validation before
+ * submitting abort iocbs must be lpfc_sli_validate_fcp_iocb_for_abort
+ * followed by lpfc_sli_validate_fcp_iocb.
+ *
* When taskmgmt_cmd == LPFC_CTX_LUN, the function sends abort only to the
* FCP iocbs associated with lun specified by tgt_id and lun_id
* parameters
@@ -12761,6 +12813,9 @@ lpfc_sli_abort_taskmgmt(struct lpfc_vport *vport, struct lpfc_sli_ring *pring,
for (i = 1; i <= phba->sli.last_iotag; i++) {
iocbq = phba->sli.iocbq_lookup[i];
+ if (lpfc_sli_validate_fcp_iocb_for_abort(iocbq, vport))
+ continue;
+
if (lpfc_sli_validate_fcp_iocb(iocbq, vport, tgt_id, lun_id,
cmd) != 0)
continue;
--
2.26.2
In a rarely executed path, FLOGI failure, there is a refcounting error.
If FLOGI completed with an error, typically a timeout, the initial
completion handler would remove the job reference. However, the job
completion isn't the actual end of the job/exchange as the timeout
usually initiates an ABTS, and upon that ABTS completion, a final
completion is sent. The driver removes the reference again in the
final completion. Thus the imbalance.
In the buggy cases, if there was a link bounce while the delayed
response is outstanding, the fport node may be referenced again
but there was no additional reference as it is already present. The
delayed completion then occurs and removes the last reference freeing
the node and causing issues in the link up processed that is using the
node.
Fix this scenario by removing the snippet that removed the reference
in the initial flogi completion. The bad snippet was poorly trying to
identify the flogi as ok to do so by realizing the node was not
registered with either SCSI or NVME transport.
Fixes: 618e2ee146d4 ("scsi: lpfc: Fix FLOGI failure due to accessing a freed node")
Cc: <stable(a)vger.kernel.org> # v5.13+
Co-developed-by: Justin Tee <justin.tee(a)broadcom.com>
Signed-off-by: Justin Tee <justin.tee(a)broadcom.com>
Signed-off-by: James Smart <jsmart2021(a)gmail.com>
---
drivers/scsi/lpfc/lpfc_els.c | 11 +++++------
drivers/scsi/lpfc/lpfc_hbadisc.c | 10 ++++++----
drivers/scsi/lpfc/lpfc_nvme.c | 5 +++--
3 files changed, 14 insertions(+), 12 deletions(-)
diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c
index 1254a575fd47..df5fc223ddb2 100644
--- a/drivers/scsi/lpfc/lpfc_els.c
+++ b/drivers/scsi/lpfc/lpfc_els.c
@@ -1059,9 +1059,10 @@ lpfc_cmpl_els_flogi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
lpfc_printf_vlog(vport, KERN_WARNING, LOG_TRACE_EVENT,
"0150 FLOGI failure Status:x%x/x%x "
- "xri x%x TMO:x%x\n",
+ "xri x%x TMO:x%x refcnt %d\n",
irsp->ulpStatus, irsp->un.ulpWord[4],
- cmdiocb->sli4_xritag, irsp->ulpTimeout);
+ cmdiocb->sli4_xritag, irsp->ulpTimeout,
+ kref_read(&ndlp->kref));
/* If this is not a loop open failure, bail out */
if (!(irsp->ulpStatus == IOSTAT_LOCAL_REJECT &&
@@ -1122,12 +1123,12 @@ lpfc_cmpl_els_flogi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
/* FLOGI completes successfully */
lpfc_printf_vlog(vport, KERN_INFO, LOG_ELS,
"0101 FLOGI completes successfully, I/O tag:x%x, "
- "xri x%x Data: x%x x%x x%x x%x x%x x%x x%x\n",
+ "xri x%x Data: x%x x%x x%x x%x x%x x%x x%x %d\n",
cmdiocb->iotag, cmdiocb->sli4_xritag,
irsp->un.ulpWord[4], sp->cmn.e_d_tov,
sp->cmn.w2.r_a_tov, sp->cmn.edtovResolution,
vport->port_state, vport->fc_flag,
- sp->cmn.priority_tagging);
+ sp->cmn.priority_tagging, kref_read(&ndlp->kref));
if (sp->cmn.priority_tagging)
vport->vmid_flag |= LPFC_VMID_ISSUE_QFPA;
@@ -1205,8 +1206,6 @@ lpfc_cmpl_els_flogi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb,
phba->fcf.fcf_flag &= ~FCF_DISCOVERY;
spin_unlock_irq(&phba->hbalock);
- if (!(ndlp->fc4_xpt_flags & (SCSI_XPT_REGD | NVME_XPT_REGD)))
- lpfc_nlp_put(ndlp);
if (!lpfc_error_lost_link(irsp)) {
/* FLOGI failed, so just use loop map to make discovery list */
lpfc_disc_list_loopmap(vport);
diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c
index 7195ca0275f9..6f2e07c30f98 100644
--- a/drivers/scsi/lpfc/lpfc_hbadisc.c
+++ b/drivers/scsi/lpfc/lpfc_hbadisc.c
@@ -4449,8 +4449,9 @@ lpfc_register_remote_port(struct lpfc_vport *vport, struct lpfc_nodelist *ndlp)
fc_remote_port_rolechg(rport, rport_ids.roles);
lpfc_printf_vlog(ndlp->vport, KERN_INFO, LOG_NODE,
- "3183 %s rport x%px DID x%x, role x%x\n",
- __func__, rport, rport->port_id, rport->roles);
+ "3183 %s rport x%px DID x%x, role x%x refcnt %d\n",
+ __func__, rport, rport->port_id, rport->roles,
+ kref_read(&ndlp->kref));
if ((rport->scsi_target_id != -1) &&
(rport->scsi_target_id < LPFC_MAX_TARGET)) {
@@ -4475,8 +4476,9 @@ lpfc_unregister_remote_port(struct lpfc_nodelist *ndlp)
lpfc_printf_vlog(vport, KERN_INFO, LOG_NODE,
"3184 rport unregister x%06x, rport x%px "
- "xptflg x%x\n",
- ndlp->nlp_DID, rport, ndlp->fc4_xpt_flags);
+ "xptflg x%x refcnt %d\n",
+ ndlp->nlp_DID, rport, ndlp->fc4_xpt_flags,
+ kref_read(&ndlp->kref));
fc_remote_port_delete(rport);
lpfc_nlp_put(ndlp);
diff --git a/drivers/scsi/lpfc/lpfc_nvme.c b/drivers/scsi/lpfc/lpfc_nvme.c
index 73a3568ff17e..bd88477f9b82 100644
--- a/drivers/scsi/lpfc/lpfc_nvme.c
+++ b/drivers/scsi/lpfc/lpfc_nvme.c
@@ -209,8 +209,9 @@ lpfc_nvme_remoteport_delete(struct nvme_fc_remote_port *remoteport)
* calling state machine to remove the node.
*/
lpfc_printf_vlog(vport, KERN_INFO, LOG_NVME_DISC,
- "6146 remoteport delete of remoteport x%px\n",
- remoteport);
+ "6146 remoteport delete of remoteport x%px, ndlp x%px "
+ "DID x%x xflags x%x\n",
+ remoteport, ndlp, ndlp->nlp_DID, ndlp->fc4_xpt_flags);
spin_lock_irq(&ndlp->lock);
/* The register rebind might have occurred before the delete
--
2.26.2