Skip to content

Commit 219d27d

Browse files
bvanasschemartinkpetersen
authored andcommitted
scsi: qla2xxx: Fix race conditions in the code for aborting SCSI commands
In the *_done() functions, instead of returning early if sp->ref_count >= 2, only decrement sp->ref_count. In qla2xxx_eh_abort(), instead of deciding what to do based on the value of sp->ref_count, decide which action to take depending on the completion status of the firmware abort. Remove srb.cwaitq and use srb.comp instead. In qla2x00_abort_srb(), call isp_ops->abort_command() directly instead of calling qla2xxx_eh_abort(). Cc: Himanshu Madhani <[email protected]> Cc: Giridhar Malavali <[email protected]> Signed-off-by: Bart Van Assche <[email protected]> Acked-by: Himanshu Madhani <[email protected]> Signed-off-by: Martin K. Petersen <[email protected]>
1 parent 982cc4b commit 219d27d

File tree

4 files changed

+55
-131
lines changed

4 files changed

+55
-131
lines changed

drivers/scsi/qla2xxx/qla_def.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -546,7 +546,6 @@ typedef struct srb {
546546
int rc;
547547
int retry_count;
548548
struct completion *comp;
549-
wait_queue_head_t *cwaitq;
550549
union {
551550
struct srb_iocb iocb_cmd;
552551
struct bsg_job *bsg_job;

drivers/scsi/qla2xxx/qla_nvme.c

Lines changed: 2 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -137,8 +137,7 @@ static void qla_nvme_sp_ls_done(void *ptr, int res)
137137
return;
138138
}
139139

140-
if (!atomic_dec_and_test(&sp->ref_count))
141-
return;
140+
atomic_dec(&sp->ref_count);
142141

143142
if (res)
144143
res = -EINVAL;
@@ -161,8 +160,7 @@ static void qla_nvme_sp_done(void *ptr, int res)
161160
nvme = &sp->u.iocb_cmd;
162161
fd = nvme->u.nvme.desc;
163162

164-
if (!atomic_dec_and_test(&sp->ref_count))
165-
return;
163+
atomic_dec(&sp->ref_count);
166164

167165
if (res == QLA_SUCCESS) {
168166
fd->rcv_rsplen = nvme->u.nvme.rsp_pyld_len;
@@ -599,34 +597,6 @@ static struct nvme_fc_port_template qla_nvme_fc_transport = {
599597
.fcprqst_priv_sz = sizeof(struct nvme_private),
600598
};
601599

602-
#define NVME_ABORT_POLLING_PERIOD 2
603-
static int qla_nvme_wait_on_command(srb_t *sp)
604-
{
605-
int ret = QLA_SUCCESS;
606-
607-
wait_event_timeout(sp->nvme_ls_waitq, (atomic_read(&sp->ref_count) > 1),
608-
NVME_ABORT_POLLING_PERIOD*HZ);
609-
610-
if (atomic_read(&sp->ref_count) > 1)
611-
ret = QLA_FUNCTION_FAILED;
612-
613-
return ret;
614-
}
615-
616-
void qla_nvme_abort(struct qla_hw_data *ha, struct srb *sp, int res)
617-
{
618-
int rval;
619-
620-
if (ha->flags.fw_started) {
621-
rval = ha->isp_ops->abort_command(sp);
622-
if (!rval && !qla_nvme_wait_on_command(sp))
623-
ql_log(ql_log_warn, NULL, 0x2112,
624-
"timed out waiting on sp=%p\n", sp);
625-
} else {
626-
sp->done(sp, res);
627-
}
628-
}
629-
630600
static void qla_nvme_unregister_remote_port(struct work_struct *work)
631601
{
632602
struct fc_port *fcport = container_of(work, struct fc_port,

drivers/scsi/qla2xxx/qla_nvme.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,6 @@ struct pt_ls4_rx_unsol {
145145
int qla_nvme_register_hba(struct scsi_qla_host *);
146146
int qla_nvme_register_remote(struct scsi_qla_host *, struct fc_port *);
147147
void qla_nvme_delete(struct scsi_qla_host *);
148-
void qla_nvme_abort(struct qla_hw_data *, struct srb *sp, int res);
149148
void qla24xx_nvme_ls4_iocb(struct scsi_qla_host *, struct pt_ls4_request *,
150149
struct req_que *);
151150
void qla24xx_async_gffid_sp_done(void *, int);

drivers/scsi/qla2xxx/qla_os.c

Lines changed: 53 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -714,7 +714,7 @@ qla2x00_sp_compl(void *ptr, int res)
714714
{
715715
srb_t *sp = ptr;
716716
struct scsi_cmnd *cmd = GET_CMD_SP(sp);
717-
wait_queue_head_t *cwaitq = sp->cwaitq;
717+
struct completion *comp = sp->comp;
718718

719719
if (atomic_read(&sp->ref_count) == 0) {
720720
ql_dbg(ql_dbg_io, sp->vha, 0x3015,
@@ -724,15 +724,15 @@ qla2x00_sp_compl(void *ptr, int res)
724724
WARN_ON(atomic_read(&sp->ref_count) == 0);
725725
return;
726726
}
727-
if (!atomic_dec_and_test(&sp->ref_count))
728-
return;
727+
728+
atomic_dec(&sp->ref_count);
729729

730730
sp->free(sp);
731731
cmd->result = res;
732732
CMD_SP(cmd) = NULL;
733733
cmd->scsi_done(cmd);
734-
if (cwaitq)
735-
wake_up(cwaitq);
734+
if (comp)
735+
complete(comp);
736736
qla2x00_rel_sp(sp);
737737
}
738738

@@ -825,7 +825,7 @@ qla2xxx_qpair_sp_compl(void *ptr, int res)
825825
{
826826
srb_t *sp = ptr;
827827
struct scsi_cmnd *cmd = GET_CMD_SP(sp);
828-
wait_queue_head_t *cwaitq = sp->cwaitq;
828+
struct completion *comp = sp->comp;
829829

830830
if (atomic_read(&sp->ref_count) == 0) {
831831
ql_dbg(ql_dbg_io, sp->fcport->vha, 0x3079,
@@ -835,15 +835,15 @@ qla2xxx_qpair_sp_compl(void *ptr, int res)
835835
WARN_ON(atomic_read(&sp->ref_count) == 0);
836836
return;
837837
}
838-
if (!atomic_dec_and_test(&sp->ref_count))
839-
return;
838+
839+
atomic_dec(&sp->ref_count);
840840

841841
sp->free(sp);
842842
cmd->result = res;
843843
CMD_SP(cmd) = NULL;
844844
cmd->scsi_done(cmd);
845-
if (cwaitq)
846-
wake_up(cwaitq);
845+
if (comp)
846+
complete(comp);
847847
qla2xxx_rel_qpair_sp(sp->qpair, sp);
848848
}
849849

@@ -1286,7 +1286,7 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd)
12861286
unsigned int id;
12871287
uint64_t lun;
12881288
unsigned long flags;
1289-
int rval, wait = 0;
1289+
int rval;
12901290
struct qla_hw_data *ha = vha->hw;
12911291
struct qla_qpair *qpair;
12921292

@@ -1299,7 +1299,6 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd)
12991299
ret = fc_block_scsi_eh(cmd);
13001300
if (ret != 0)
13011301
return ret;
1302-
ret = SUCCESS;
13031302

13041303
sp = (srb_t *) CMD_SP(cmd);
13051304
if (!sp)
@@ -1310,7 +1309,7 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd)
13101309
return SUCCESS;
13111310

13121311
spin_lock_irqsave(qpair->qp_lock_ptr, flags);
1313-
if (!CMD_SP(cmd)) {
1312+
if (sp->type != SRB_SCSI_CMD || GET_CMD_SP(sp) != cmd) {
13141313
/* there's a chance an interrupt could clear
13151314
the ptr as part of done & free */
13161315
spin_unlock_irqrestore(qpair->qp_lock_ptr, flags);
@@ -1331,66 +1330,31 @@ qla2xxx_eh_abort(struct scsi_cmnd *cmd)
13311330
"Aborting from RISC nexus=%ld:%d:%llu sp=%p cmd=%p handle=%x\n",
13321331
vha->host_no, id, lun, sp, cmd, sp->handle);
13331332

1334-
/* Get a reference to the sp and drop the lock.*/
13351333
rval = ha->isp_ops->abort_command(sp);
1336-
if (rval) {
1337-
if (rval == QLA_FUNCTION_PARAMETER_ERROR)
1338-
ret = SUCCESS;
1339-
else
1340-
ret = FAILED;
1341-
1342-
ql_dbg(ql_dbg_taskm, vha, 0x8003,
1343-
"Abort command mbx failed cmd=%p, rval=%x.\n", cmd, rval);
1344-
} else {
1345-
ql_dbg(ql_dbg_taskm, vha, 0x8004,
1346-
"Abort command mbx success cmd=%p.\n", cmd);
1347-
wait = 1;
1348-
}
1334+
ql_dbg(ql_dbg_taskm, vha, 0x8003,
1335+
"Abort command mbx cmd=%p, rval=%x.\n", cmd, rval);
13491336

1350-
spin_lock_irqsave(qpair->qp_lock_ptr, flags);
1351-
1352-
/*
1353-
* Releasing of the SRB and associated command resources
1354-
* is managed through ref_count.
1355-
* Whether we need to wait for the abort completion or complete
1356-
* the abort handler should be based on the ref_count.
1357-
*/
1358-
if (atomic_read(&sp->ref_count) > 1) {
1337+
switch (rval) {
1338+
case QLA_SUCCESS:
13591339
/*
1360-
* The command is not yet completed. We need to wait for either
1361-
* command completion or abort completion.
1340+
* The command has been aborted. That means that the firmware
1341+
* won't report a completion.
13621342
*/
1363-
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(eh_waitq);
1364-
uint32_t ratov = ha->r_a_tov/10;
1365-
1366-
/* Go ahead and release the extra ref_count obtained earlier */
1367-
sp->done(sp, DID_RESET << 16);
1368-
sp->cwaitq = &eh_waitq;
1369-
1370-
if (!wait_event_lock_irq_timeout(eh_waitq,
1371-
CMD_SP(cmd) == NULL, *qpair->qp_lock_ptr,
1372-
msecs_to_jiffies(4 * ratov * 1000))) {
1373-
/*
1374-
* The abort got dropped, LOGO will be sent and the
1375-
* original command will be completed with CS_TIMEOUT
1376-
* completion
1377-
*/
1378-
ql_dbg(ql_dbg_taskm, vha, 0xffff,
1379-
"%s: Abort wait timer (4 * R_A_TOV[%d]) expired\n",
1380-
__func__, ha->r_a_tov);
1381-
sp->cwaitq = NULL;
1382-
ret = FAILED;
1383-
goto end;
1384-
}
1385-
} else {
1386-
/* Command completed while processing the abort */
1387-
sp->done(sp, DID_RESET << 16);
1343+
sp->done(sp, DID_ABORT << 16);
1344+
ret = SUCCESS;
1345+
break;
1346+
default:
1347+
/*
1348+
* Either abort failed or abort and completion raced. Let
1349+
* the SCSI core retry the abort in the former case.
1350+
*/
1351+
ret = FAILED;
1352+
break;
13881353
}
1389-
end:
1390-
spin_unlock_irqrestore(qpair->qp_lock_ptr, flags);
1354+
13911355
ql_log(ql_log_info, vha, 0x801c,
1392-
"Abort command issued nexus=%ld:%d:%llu -- %d %x.\n",
1393-
vha->host_no, id, lun, wait, ret);
1356+
"Abort command issued nexus=%ld:%d:%llu -- %x.\n",
1357+
vha->host_no, id, lun, ret);
13941358

13951359
return ret;
13961360
}
@@ -1766,42 +1730,34 @@ static void qla2x00_abort_srb(struct qla_qpair *qp, srb_t *sp, const int res,
17661730
__releases(qp->qp_lock_ptr)
17671731
__acquires(qp->qp_lock_ptr)
17681732
{
1733+
DECLARE_COMPLETION_ONSTACK(comp);
17691734
scsi_qla_host_t *vha = qp->vha;
17701735
struct qla_hw_data *ha = vha->hw;
1736+
int rval;
17711737

1772-
if (sp->type == SRB_NVME_CMD || sp->type == SRB_NVME_LS) {
1773-
if (!sp_get(sp)) {
1774-
/* got sp */
1775-
spin_unlock_irqrestore(qp->qp_lock_ptr, *flags);
1776-
qla_nvme_abort(ha, sp, res);
1777-
spin_lock_irqsave(qp->qp_lock_ptr, *flags);
1778-
}
1779-
} else if (GET_CMD_SP(sp) && !ha->flags.eeh_busy &&
1780-
!test_bit(ABORT_ISP_ACTIVE, &vha->dpc_flags) &&
1781-
!qla2x00_isp_reg_stat(ha) && sp->type == SRB_SCSI_CMD) {
1782-
/*
1783-
* Don't abort commands in adapter during EEH recovery as it's
1784-
* not accessible/responding.
1785-
*
1786-
* Get a reference to the sp and drop the lock. The reference
1787-
* ensures this sp->done() call and not the call in
1788-
* qla2xxx_eh_abort() ends the SCSI cmd (with result 'res').
1789-
*/
1790-
if (!sp_get(sp)) {
1791-
int status;
1738+
if (sp_get(sp))
1739+
return;
17921740

1793-
spin_unlock_irqrestore(qp->qp_lock_ptr, *flags);
1794-
status = qla2xxx_eh_abort(GET_CMD_SP(sp));
1795-
spin_lock_irqsave(qp->qp_lock_ptr, *flags);
1796-
/*
1797-
* Get rid of extra reference caused
1798-
* by early exit from qla2xxx_eh_abort
1799-
*/
1800-
if (status == FAST_IO_FAIL)
1801-
atomic_dec(&sp->ref_count);
1741+
if (sp->type == SRB_NVME_CMD || sp->type == SRB_NVME_LS ||
1742+
(sp->type == SRB_SCSI_CMD && !ha->flags.eeh_busy &&
1743+
!test_bit(ABORT_ISP_ACTIVE, &vha->dpc_flags) &&
1744+
!qla2x00_isp_reg_stat(ha))) {
1745+
sp->comp = &comp;
1746+
rval = ha->isp_ops->abort_command(sp);
1747+
spin_unlock_irqrestore(qp->qp_lock_ptr, *flags);
1748+
1749+
switch (rval) {
1750+
case QLA_SUCCESS:
1751+
sp->done(sp, res);
1752+
break;
1753+
case QLA_FUNCTION_PARAMETER_ERROR:
1754+
wait_for_completion(&comp);
1755+
break;
18021756
}
1757+
1758+
spin_lock_irqsave(qp->qp_lock_ptr, *flags);
1759+
sp->comp = NULL;
18031760
}
1804-
sp->done(sp, res);
18051761
}
18061762

18071763
static void

0 commit comments

Comments
 (0)