Merge tag 'for-linus-20180616' of git://git.kernel.dk/linux-block

tjh.dev / kernel

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Merge tag 'for-linus-20180616' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe:
"A collection of fixes that should go into -rc1. This contains:

- bsg_open vs bsg_unregister race fix (Anatoliy)

- NVMe pull request from Christoph, with fixes for regressions in
this window, FC connect/reconnect path code unification, and a
trace point addition.

- timeout fix (Christoph)

- remove a few unused functions (Christoph)

- blk-mq tag_set reinit fix (Roman)"

* tag 'for-linus-20180616' of git://git.kernel.dk/linux-block:
bsg: fix race of bsg_open and bsg_unregister
block: remove blk_queue_invalidate_tags
nvme-fabrics: fix and refine state checks in __nvmf_check_ready
nvme-fabrics: handle the admin-only case properly in nvmf_check_ready
nvme-fabrics: refactor queue ready check
blk-mq: remove blk_mq_tagset_iter
nvme: remove nvme_reinit_tagset
nvme-fc: fix nulling of queue data on reconnect
nvme-fc: remove reinit_request routine
blk-mq: don't time out requests again that are in the timeout handler
nvme-fc: change controllers first connect to use reconnect path
nvme: don't rely on the changed namespace list log
nvmet: free smart-log buffer after use
nvme-rdma: fix error flow during mapping request data
nvme: add bio remapping tracepoint
nvme: fix NULL pointer dereference in nvme_init_subsystem
blk-mq: reinit q->tag_set_list entry only after grace period

Linus Torvalds 8 years ago 265c5596 5e7b9212

+182 -283

16 changed files

expand all collapse all

Documentation

block

biodoc.txt

block

blk-mq-tag.c

blk-mq.c

blk-tag.c

bsg.c

drivers

nvme

host

core.c

fabrics.c

fabrics.h

fc.c

multipath.c

nvme.h

rdma.c

target

admin-cmd.c

loop.c

include

linux

blk-mq.h

blkdev.h

+1 -14

Documentation/block/biodoc.txt

reviewed

··· 752 752 operations before calling end_that_request_last()! For an example of a user 753 753 of these helpers, see the IDE tagged command queueing support. 754 754 755 755 - Certain hardware conditions may dictate a need to invalidate the block tag 756 756 - queue. For instance, on IDE any tagged request error needs to clear both 757 757 - the hardware and software block queue and enable the driver to sanely restart 758 758 - all the outstanding requests. There's a third helper to do that: 759 759 - 760 760 - blk_queue_invalidate_tags(struct request_queue *q) 761 761 - 762 762 - Clear the internal block tag queue and re-add all the pending requests 763 763 - to the request queue. The driver will receive them again on the 764 764 - next request_fn run, just like it did the first time it encountered 765 765 - them. 766 766 - 767 755 3.2.5.2 Tag info 768 756 769 757 Some block functions exist to query current tag status or to go from a ··· 793 805 Most of the above is simple and straight forward, however busy_list may need 794 806 a bit of explaining. Normally we don't care too much about request ordering, 795 807 but in the event of any barrier requests in the tag queue we need to ensure 796 796 - that requests are restarted in the order they were queue. This may happen 797 797 - if the driver needs to use blk_queue_invalidate_tags(). 808 808 + that requests are restarted in the order they were queue. 798 809 799 810 3.3 I/O Submission 800 811

-29

block/blk-mq-tag.c

reviewed

··· 311 311 } 312 312 EXPORT_SYMBOL(blk_mq_tagset_busy_iter); 313 313 314 314 - int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, 315 315 - int (fn)(void *, struct request *)) 316 316 - { 317 317 - int i, j, ret = 0; 318 318 - 319 319 - if (WARN_ON_ONCE(!fn)) 320 320 - goto out; 321 321 - 322 322 - for (i = 0; i < set->nr_hw_queues; i++) { 323 323 - struct blk_mq_tags *tags = set->tags[i]; 324 324 - 325 325 - if (!tags) 326 326 - continue; 327 327 - 328 328 - for (j = 0; j < tags->nr_tags; j++) { 329 329 - if (!tags->static_rqs[j]) 330 330 - continue; 331 331 - 332 332 - ret = fn(data, tags->static_rqs[j]); 333 333 - if (ret) 334 334 - goto out; 335 335 - } 336 336 - } 337 337 - 338 338 - out: 339 339 - return ret; 340 340 - } 341 341 - EXPORT_SYMBOL_GPL(blk_mq_tagset_iter); 342 342 - 343 314 void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, 344 315 void *priv) 345 316 {

+6 -2

block/blk-mq.c

reviewed

··· 671 671 672 672 if (blk_mq_request_started(rq)) { 673 673 WRITE_ONCE(rq->state, MQ_RQ_IDLE); 674 674 + rq->rq_flags &= ~RQF_TIMED_OUT; 674 675 if (q->dma_drain_size && blk_rq_bytes(rq)) 675 676 rq->nr_phys_segments--; 676 677 } ··· 771 770 772 771 static void blk_mq_rq_timed_out(struct request *req, bool reserved) 773 772 { 773 773 + req->rq_flags |= RQF_TIMED_OUT; 774 774 if (req->q->mq_ops->timeout) { 775 775 enum blk_eh_timer_return ret; 776 776 ··· 781 779 WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER); 782 780 } 783 781 782 782 + req->rq_flags &= ~RQF_TIMED_OUT; 784 783 blk_add_timer(req); 785 784 } 786 785 ··· 790 787 unsigned long deadline; 791 788 792 789 if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT) 790 790 + return false; 791 791 + if (rq->rq_flags & RQF_TIMED_OUT) 793 792 return false; 794 793 795 794 deadline = blk_rq_deadline(rq); ··· 2354 2349 2355 2350 mutex_lock(&set->tag_list_lock); 2356 2351 list_del_rcu(&q->tag_set_list); 2357 2357 - INIT_LIST_HEAD(&q->tag_set_list); 2358 2352 if (list_is_singular(&set->tag_list)) { 2359 2353 /* just transitioned to unshared */ 2360 2354 set->flags &= ~BLK_MQ_F_TAG_SHARED; ··· 2361 2357 blk_mq_update_tag_set_depth(set, false); 2362 2358 } 2363 2359 mutex_unlock(&set->tag_list_lock); 2364 2364 - 2365 2360 synchronize_rcu(); 2361 2361 + INIT_LIST_HEAD(&q->tag_set_list); 2366 2362 } 2367 2363 2368 2364 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,

-22

block/blk-tag.c

reviewed

··· 188 188 */ 189 189 q->queue_tags = tags; 190 190 queue_flag_set_unlocked(QUEUE_FLAG_QUEUED, q); 191 191 - INIT_LIST_HEAD(&q->tag_busy_list); 192 191 return 0; 193 192 } 194 193 EXPORT_SYMBOL(blk_queue_init_tags); ··· 373 374 rq->tag = tag; 374 375 bqt->tag_index[tag] = rq; 375 376 blk_start_request(rq); 376 376 - list_add(&rq->queuelist, &q->tag_busy_list); 377 377 return 0; 378 378 } 379 379 EXPORT_SYMBOL(blk_queue_start_tag); 380 380 - 381 381 - /** 382 382 - * blk_queue_invalidate_tags - invalidate all pending tags 383 383 - * @q: the request queue for the device 384 384 - * 385 385 - * Description: 386 386 - * Hardware conditions may dictate a need to stop all pending requests. 387 387 - * In this case, we will safely clear the block side of the tag queue and 388 388 - * readd all requests to the request queue in the right order. 389 389 - **/ 390 390 - void blk_queue_invalidate_tags(struct request_queue *q) 391 391 - { 392 392 - struct list_head *tmp, *n; 393 393 - 394 394 - lockdep_assert_held(q->queue_lock); 395 395 - 396 396 - list_for_each_safe(tmp, n, &q->tag_busy_list) 397 397 - blk_requeue_request(q, list_entry_rq(tmp)); 398 398 - } 399 399 - EXPORT_SYMBOL(blk_queue_invalidate_tags);

+11 -11

block/bsg.c

reviewed

··· 693 693 struct bsg_device *bd; 694 694 unsigned char buf[32]; 695 695 696 696 + lockdep_assert_held(&bsg_mutex); 697 697 + 696 698 if (!blk_get_queue(rq)) 697 699 return ERR_PTR(-ENXIO); 698 700 ··· 709 707 bsg_set_block(bd, file); 710 708 711 709 atomic_set(&bd->ref_count, 1); 712 712 - mutex_lock(&bsg_mutex); 713 710 hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode))); 714 711 715 712 strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1); 716 713 bsg_dbg(bd, "bound to <%s>, max queue %d\n", 717 714 format_dev_t(buf, inode->i_rdev), bd->max_queue); 718 715 719 719 - mutex_unlock(&bsg_mutex); 720 716 return bd; 721 717 } 722 718 ··· 722 722 { 723 723 struct bsg_device *bd; 724 724 725 725 - mutex_lock(&bsg_mutex); 725 725 + lockdep_assert_held(&bsg_mutex); 726 726 727 727 hlist_for_each_entry(bd, bsg_dev_idx_hash(minor), dev_list) { 728 728 if (bd->queue == q) { ··· 732 732 } 733 733 bd = NULL; 734 734 found: 735 735 - mutex_unlock(&bsg_mutex); 736 735 return bd; 737 736 } 738 737 ··· 745 746 */ 746 747 mutex_lock(&bsg_mutex); 747 748 bcd = idr_find(&bsg_minor_idr, iminor(inode)); 748 748 - mutex_unlock(&bsg_mutex); 749 749 750 750 - if (!bcd) 751 751 - return ERR_PTR(-ENODEV); 750 750 + if (!bcd) { 751 751 + bd = ERR_PTR(-ENODEV); 752 752 + goto out_unlock; 753 753 + } 752 754 753 755 bd = __bsg_get_device(iminor(inode), bcd->queue); 754 754 - if (bd) 755 755 - return bd; 756 756 + if (!bd) 757 757 + bd = bsg_add_device(inode, bcd->queue, file); 756 758 757 757 - bd = bsg_add_device(inode, bcd->queue, file); 758 758 - 759 759 + out_unlock: 760 760 + mutex_unlock(&bsg_mutex); 759 761 return bd; 760 762 } 761 763

+12 -36

drivers/nvme/host/core.c

reviewed

··· 2208 2208 * Verify that the subsystem actually supports multiple 2209 2209 * controllers, else bail out. 2210 2210 */ 2211 2211 - if (!ctrl->opts->discovery_nqn && 2211 2211 + if (!(ctrl->opts && ctrl->opts->discovery_nqn) && 2212 2212 nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) { 2213 2213 dev_err(ctrl->device, 2214 2214 "ignoring ctrl due to duplicate subnqn (%s).\n", ··· 3197 3197 nvme_remove_invalid_namespaces(ctrl, nn); 3198 3198 } 3199 3199 3200 3200 - static bool nvme_scan_changed_ns_log(struct nvme_ctrl *ctrl) 3200 3200 + static void nvme_clear_changed_ns_log(struct nvme_ctrl *ctrl) 3201 3201 { 3202 3202 size_t log_size = NVME_MAX_CHANGED_NAMESPACES * sizeof(__le32); 3203 3203 __le32 *log; 3204 3204 - int error, i; 3205 3205 - bool ret = false; 3204 3204 + int error; 3206 3205 3207 3206 log = kzalloc(log_size, GFP_KERNEL); 3208 3207 if (!log) 3209 3209 - return false; 3208 3208 + return; 3210 3209 3210 3210 + /* 3211 3211 + * We need to read the log to clear the AEN, but we don't want to rely 3212 3212 + * on it for the changed namespace information as userspace could have 3213 3213 + * raced with us in reading the log page, which could cause us to miss 3214 3214 + * updates. 
3215 3215 + */ 3211 3216 error = nvme_get_log(ctrl, NVME_LOG_CHANGED_NS, log, log_size); 3212 3212 - if (error) { 3217 3217 + if (error) 3213 3218 dev_warn(ctrl->device, 3214 3219 "reading changed ns log failed: %d\n", error); 3215 3215 - goto out_free_log; 3216 3216 - } 3217 3220 3218 3218 - if (log[0] == cpu_to_le32(0xffffffff)) 3219 3219 - goto out_free_log; 3220 3220 - 3221 3221 - for (i = 0; i < NVME_MAX_CHANGED_NAMESPACES; i++) { 3222 3222 - u32 nsid = le32_to_cpu(log[i]); 3223 3223 - 3224 3224 - if (nsid == 0) 3225 3225 - break; 3226 3226 - dev_info(ctrl->device, "rescanning namespace %d.\n", nsid); 3227 3227 - nvme_validate_ns(ctrl, nsid); 3228 3228 - } 3229 3229 - ret = true; 3230 3230 - 3231 3231 - out_free_log: 3232 3221 kfree(log); 3233 3233 - return ret; 3234 3222 } 3235 3223 3236 3224 static void nvme_scan_work(struct work_struct *work) ··· 3234 3246 WARN_ON_ONCE(!ctrl->tagset); 3235 3247 3236 3248 if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) { 3237 3237 - if (nvme_scan_changed_ns_log(ctrl)) 3238 3238 - goto out_sort_namespaces; 3239 3249 dev_info(ctrl->device, "rescanning namespaces.\n"); 3250 3250 + nvme_clear_changed_ns_log(ctrl); 3240 3251 } 3241 3252 3242 3253 if (nvme_identify_ctrl(ctrl, &id)) ··· 3250 3263 nvme_scan_ns_sequential(ctrl, nn); 3251 3264 out_free_id: 3252 3265 kfree(id); 3253 3253 - out_sort_namespaces: 3254 3266 down_write(&ctrl->namespaces_rwsem); 3255 3267 list_sort(NULL, &ctrl->namespaces, ns_cmp); 3256 3268 up_write(&ctrl->namespaces_rwsem); ··· 3626 3640 up_read(&ctrl->namespaces_rwsem); 3627 3641 } 3628 3642 EXPORT_SYMBOL_GPL(nvme_start_queues); 3629 3629 - 3630 3630 - int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set) 3631 3631 - { 3632 3632 - if (!ctrl->ops->reinit_request) 3633 3633 - return 0; 3634 3634 - 3635 3635 - return blk_mq_tagset_iter(set, set->driver_data, 3636 3636 - ctrl->ops->reinit_request); 3637 3637 - } 3638 3638 - EXPORT_SYMBOL_GPL(nvme_reinit_tagset); 3639 
3643 3640 3644 int __init nvme_core_init(void) 3641 3645 {

+43 -55

drivers/nvme/host/fabrics.c

reviewed

··· 536 536 return NULL; 537 537 } 538 538 539 539 - blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl, struct request *rq, 540 540 - bool queue_live, bool is_connected) 539 539 + /* 540 540 + * For something we're not in a state to send to the device the default action 541 541 + * is to busy it and retry it after the controller state is recovered. However, 542 542 + * anything marked for failfast or nvme multipath is immediately failed. 543 543 + * 544 544 + * Note: commands used to initialize the controller will be marked for failfast. 545 545 + * Note: nvme cli/ioctl commands are marked for failfast. 546 546 + */ 547 547 + blk_status_t nvmf_fail_nonready_command(struct request *rq) 541 548 { 542 542 - struct nvme_command *cmd = nvme_req(rq)->cmd; 543 543 - 544 544 - if (likely(ctrl->state == NVME_CTRL_LIVE && is_connected)) 545 545 - return BLK_STS_OK; 546 546 - 547 547 - switch (ctrl->state) { 548 548 - case NVME_CTRL_NEW: 549 549 - case NVME_CTRL_CONNECTING: 550 550 - case NVME_CTRL_DELETING: 551 551 - /* 552 552 - * This is the case of starting a new or deleting an association 553 553 - * but connectivity was lost before it was fully created or torn 554 554 - * down. We need to error the commands used to initialize the 555 555 - * controller so the reconnect can go into a retry attempt. The 556 556 - * commands should all be marked REQ_FAILFAST_DRIVER, which will 557 557 - * hit the reject path below. Anything else will be queued while 558 558 - * the state settles. 559 559 - */ 560 560 - if (!is_connected) 561 561 - break; 562 562 - 563 563 - /* 564 564 - * If queue is live, allow only commands that are internally 565 565 - * generated pass through. These are commands on the admin 566 566 - * queue to initialize the controller. This will reject any 567 567 - * ioctl admin cmds received while initializing. 
568 568 - */ 569 569 - if (queue_live && !(nvme_req(rq)->flags & NVME_REQ_USERCMD)) 570 570 - return BLK_STS_OK; 571 571 - 572 572 - /* 573 573 - * If the queue is not live, allow only a connect command. This 574 574 - * will reject any ioctl admin cmd as well as initialization 575 575 - * commands if the controller reverted the queue to non-live. 576 576 - */ 577 577 - if (!queue_live && blk_rq_is_passthrough(rq) && 578 578 - cmd->common.opcode == nvme_fabrics_command && 579 579 - cmd->fabrics.fctype == nvme_fabrics_type_connect) 580 580 - return BLK_STS_OK; 581 581 - break; 582 582 - default: 583 583 - break; 584 584 - } 585 585 - 586 586 - /* 587 587 - * Any other new io is something we're not in a state to send to the 588 588 - * device. Default action is to busy it and retry it after the 589 589 - * controller state is recovered. However, anything marked for failfast 590 590 - * or nvme multipath is immediately failed. Note: commands used to 591 591 - * initialize the controller will be marked for failfast. 592 592 - * Note: nvme cli/ioctl commands are marked for failfast. 593 593 - */ 594 549 if (!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) 595 550 return BLK_STS_RESOURCE; 596 551 nvme_req(rq)->status = NVME_SC_ABORT_REQ; 597 552 return BLK_STS_IOERR; 598 553 } 599 599 - EXPORT_SYMBOL_GPL(nvmf_check_if_ready); 554 554 + EXPORT_SYMBOL_GPL(nvmf_fail_nonready_command); 555 555 + 556 556 + bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, 557 557 + bool queue_live) 558 558 + { 559 559 + struct nvme_request *req = nvme_req(rq); 560 560 + 561 561 + /* 562 562 + * If we are in some state of setup or teardown only allow 563 563 + * internally generated commands. 
564 564 + */ 565 565 + if (!blk_rq_is_passthrough(rq) || (req->flags & NVME_REQ_USERCMD)) 566 566 + return false; 567 567 + 568 568 + /* 569 569 + * Only allow commands on a live queue, except for the connect command, 570 570 + * which is require to set the queue live in the appropinquate states. 571 571 + */ 572 572 + switch (ctrl->state) { 573 573 + case NVME_CTRL_NEW: 574 574 + case NVME_CTRL_CONNECTING: 575 575 + if (req->cmd->common.opcode == nvme_fabrics_command && 576 576 + req->cmd->fabrics.fctype == nvme_fabrics_type_connect) 577 577 + return true; 578 578 + break; 579 579 + default: 580 580 + break; 581 581 + case NVME_CTRL_DEAD: 582 582 + return false; 583 583 + } 584 584 + 585 585 + return queue_live; 586 586 + } 587 587 + EXPORT_SYMBOL_GPL(__nvmf_check_ready); 600 588 601 589 static const match_table_t opt_tokens = { 602 590 { NVMF_OPT_TRANSPORT, "transport=%s" },

+12 -2

drivers/nvme/host/fabrics.h

reviewed

··· 162 162 void nvmf_free_options(struct nvmf_ctrl_options *opts); 163 163 int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size); 164 164 bool nvmf_should_reconnect(struct nvme_ctrl *ctrl); 165 165 - blk_status_t nvmf_check_if_ready(struct nvme_ctrl *ctrl, 166 166 - struct request *rq, bool queue_live, bool is_connected); 165 165 + blk_status_t nvmf_fail_nonready_command(struct request *rq); 166 166 + bool __nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, 167 167 + bool queue_live); 168 168 + 169 169 + static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, 170 170 + bool queue_live) 171 171 + { 172 172 + if (likely(ctrl->state == NVME_CTRL_LIVE || 173 173 + ctrl->state == NVME_CTRL_ADMIN_ONLY)) 174 174 + return true; 175 175 + return __nvmf_check_ready(ctrl, rq, queue_live); 176 176 + } 167 177 168 178 #endif /* _NVME_FABRICS_H */

+58 -90

drivers/nvme/host/fc.c

reviewed

··· 142 142 struct nvme_fc_rport *rport; 143 143 u32 cnum; 144 144 145 145 + bool ioq_live; 145 146 bool assoc_active; 146 147 u64 association_id; 147 148 ··· 1471 1470 1472 1471 static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg); 1473 1472 1474 1474 - static int 1475 1475 - nvme_fc_reinit_request(void *data, struct request *rq) 1476 1476 - { 1477 1477 - struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); 1478 1478 - struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu; 1479 1479 - 1480 1480 - memset(cmdiu, 0, sizeof(*cmdiu)); 1481 1481 - cmdiu->scsi_id = NVME_CMD_SCSI_ID; 1482 1482 - cmdiu->fc_id = NVME_CMD_FC_ID; 1483 1483 - cmdiu->iu_len = cpu_to_be16(sizeof(*cmdiu) / sizeof(u32)); 1484 1484 - memset(&op->rsp_iu, 0, sizeof(op->rsp_iu)); 1485 1485 - 1486 1486 - return 0; 1487 1487 - } 1488 1488 - 1489 1473 static void 1490 1474 __nvme_fc_exit_request(struct nvme_fc_ctrl *ctrl, 1491 1475 struct nvme_fc_fcp_op *op) ··· 1879 1893 */ 1880 1894 1881 1895 queue->connection_id = 0; 1896 1896 + atomic_set(&queue->csn, 1); 1882 1897 } 1883 1898 1884 1899 static void ··· 2266 2279 struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu; 2267 2280 struct nvme_command *sqe = &cmdiu->sqe; 2268 2281 enum nvmefc_fcp_datadir io_dir; 2282 2282 + bool queue_ready = test_bit(NVME_FC_Q_LIVE, &queue->flags); 2269 2283 u32 data_len; 2270 2284 blk_status_t ret; 2271 2285 2272 2272 - ret = nvmf_check_if_ready(&queue->ctrl->ctrl, rq, 2273 2273 - test_bit(NVME_FC_Q_LIVE, &queue->flags), 2274 2274 - ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE); 2275 2275 - if (unlikely(ret)) 2276 2276 - return ret; 2286 2286 + if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE || 2287 2287 + !nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) 2288 2288 + return nvmf_fail_nonready_command(rq); 2277 2289 2278 2290 ret = nvme_setup_cmd(ns, rq, sqe); 2279 2291 if (ret) ··· 2449 2463 if (ret) 2450 2464 goto out_delete_hw_queues; 2451 2465 2466 2466 + ctrl->ioq_live = true; 2467 2467 + 
2452 2468 return 0; 2453 2469 2454 2470 out_delete_hw_queues: ··· 2468 2480 } 2469 2481 2470 2482 static int 2471 2471 - nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) 2483 2483 + nvme_fc_recreate_io_queues(struct nvme_fc_ctrl *ctrl) 2472 2484 { 2473 2485 struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; 2474 2486 unsigned int nr_io_queues; ··· 2487 2499 /* check for io queues existing */ 2488 2500 if (ctrl->ctrl.queue_count == 1) 2489 2501 return 0; 2490 2490 - 2491 2491 - nvme_fc_init_io_queues(ctrl); 2492 2492 - 2493 2493 - ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); 2494 2494 - if (ret) 2495 2495 - goto out_free_io_queues; 2496 2502 2497 2503 ret = nvme_fc_create_hw_io_queues(ctrl, ctrl->ctrl.sqsize + 1); 2498 2504 if (ret) ··· 2585 2603 * Create the admin queue 2586 2604 */ 2587 2605 2588 2588 - nvme_fc_init_queue(ctrl, 0); 2589 2589 - 2590 2606 ret = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0, 2591 2607 NVME_AQ_DEPTH); 2592 2608 if (ret) ··· 2595 2615 if (ret) 2596 2616 goto out_delete_hw_queue; 2597 2617 2598 2598 - if (ctrl->ctrl.state != NVME_CTRL_NEW) 2599 2599 - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 2618 2618 + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 2600 2619 2601 2620 ret = nvmf_connect_admin_queue(&ctrl->ctrl); 2602 2621 if (ret) ··· 2668 2689 */ 2669 2690 2670 2691 if (ctrl->ctrl.queue_count > 1) { 2671 2671 - if (ctrl->ctrl.state == NVME_CTRL_NEW) 2692 2692 + if (!ctrl->ioq_live) 2672 2693 ret = nvme_fc_create_io_queues(ctrl); 2673 2694 else 2674 2674 - ret = nvme_fc_reinit_io_queues(ctrl); 2695 2695 + ret = nvme_fc_recreate_io_queues(ctrl); 2675 2696 if (ret) 2676 2697 goto out_term_aen_ops; 2677 2698 } ··· 2755 2776 * use blk_mq_tagset_busy_itr() and the transport routine to 2756 2777 * terminate the exchanges. 
2757 2778 */ 2758 2758 - if (ctrl->ctrl.state != NVME_CTRL_NEW) 2759 2759 - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); 2779 2779 + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); 2760 2780 blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, 2761 2781 nvme_fc_terminate_exchange, &ctrl->ctrl); 2762 2782 ··· 2895 2917 .submit_async_event = nvme_fc_submit_async_event, 2896 2918 .delete_ctrl = nvme_fc_delete_ctrl, 2897 2919 .get_address = nvmf_get_address, 2898 2898 - .reinit_request = nvme_fc_reinit_request, 2899 2920 }; 2900 2921 2901 2922 static void ··· 2911 2934 nvme_fc_reconnect_or_delete(ctrl, ret); 2912 2935 else 2913 2936 dev_info(ctrl->ctrl.device, 2914 2914 - "NVME-FC{%d}: controller reconnect complete\n", 2937 2937 + "NVME-FC{%d}: controller connect complete\n", 2915 2938 ctrl->cnum); 2916 2939 } 2917 2940 ··· 2959 2982 { 2960 2983 struct nvme_fc_ctrl *ctrl; 2961 2984 unsigned long flags; 2962 2962 - int ret, idx, retry; 2985 2985 + int ret, idx; 2963 2986 2964 2987 if (!(rport->remoteport.port_role & 2965 2988 (FC_PORT_ROLE_NVME_DISCOVERY | FC_PORT_ROLE_NVME_TARGET))) { ··· 2986 3009 } 2987 3010 2988 3011 ctrl->ctrl.opts = opts; 3012 3012 + ctrl->ctrl.nr_reconnects = 0; 2989 3013 INIT_LIST_HEAD(&ctrl->ctrl_list); 2990 3014 ctrl->lport = lport; 2991 3015 ctrl->rport = rport; 2992 3016 ctrl->dev = lport->dev; 2993 3017 ctrl->cnum = idx; 3018 3018 + ctrl->ioq_live = false; 2994 3019 ctrl->assoc_active = false; 2995 3020 init_waitqueue_head(&ctrl->ioabort_wait); 2996 3021 ··· 3011 3032 3012 3033 ctrl->ctrl.sqsize = opts->queue_size - 1; 3013 3034 ctrl->ctrl.kato = opts->kato; 3035 3035 + ctrl->ctrl.cntlid = 0xffff; 3014 3036 3015 3037 ret = -ENOMEM; 3016 3038 ctrl->queues = kcalloc(ctrl->ctrl.queue_count, 3017 3039 sizeof(struct nvme_fc_queue), GFP_KERNEL); 3018 3040 if (!ctrl->queues) 3019 3041 goto out_free_ida; 3042 3042 + 3043 3043 + nvme_fc_init_queue(ctrl, 0); 3020 3044 3021 3045 memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); 3022 3046 
ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops; ··· 3063 3081 list_add_tail(&ctrl->ctrl_list, &rport->ctrl_list); 3064 3082 spin_unlock_irqrestore(&rport->lock, flags); 3065 3083 3066 3066 - /* 3067 3067 - * It's possible that transactions used to create the association 3068 3068 - * may fail. Examples: CreateAssociation LS or CreateIOConnection 3069 3069 - * LS gets dropped/corrupted/fails; or a frame gets dropped or a 3070 3070 - * command times out for one of the actions to init the controller 3071 3071 - * (Connect, Get/Set_Property, Set_Features, etc). Many of these 3072 3072 - * transport errors (frame drop, LS failure) inherently must kill 3073 3073 - * the association. The transport is coded so that any command used 3074 3074 - * to create the association (prior to a LIVE state transition 3075 3075 - * while NEW or CONNECTING) will fail if it completes in error or 3076 3076 - * times out. 3077 3077 - * 3078 3078 - * As such: as the connect request was mostly likely due to a 3079 3079 - * udev event that discovered the remote port, meaning there is 3080 3080 - * not an admin or script there to restart if the connect 3081 3081 - * request fails, retry the initial connection creation up to 3082 3082 - * three times before giving up and declaring failure. 
3083 3083 - */ 3084 3084 - for (retry = 0; retry < 3; retry++) { 3085 3085 - ret = nvme_fc_create_association(ctrl); 3086 3086 - if (!ret) 3087 3087 - break; 3088 3088 - } 3089 3089 - 3090 3090 - if (ret) { 3091 3091 - nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING); 3092 3092 - cancel_work_sync(&ctrl->ctrl.reset_work); 3093 3093 - cancel_delayed_work_sync(&ctrl->connect_work); 3094 3094 - 3095 3095 - /* couldn't schedule retry - fail out */ 3084 3084 + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING) || 3085 3085 + !nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { 3096 3086 dev_err(ctrl->ctrl.device, 3097 3097 - "NVME-FC{%d}: Connect retry failed\n", ctrl->cnum); 3098 3098 - 3099 3099 - ctrl->ctrl.opts = NULL; 3100 3100 - 3101 3101 - /* initiate nvme ctrl ref counting teardown */ 3102 3102 - nvme_uninit_ctrl(&ctrl->ctrl); 3103 3103 - 3104 3104 - /* Remove core ctrl ref. */ 3105 3105 - nvme_put_ctrl(&ctrl->ctrl); 3106 3106 - 3107 3107 - /* as we're past the point where we transition to the ref 3108 3108 - * counting teardown path, if we return a bad pointer here, 3109 3109 - * the calling routine, thinking it's prior to the 3110 3110 - * transition, will do an rport put. Since the teardown 3111 3111 - * path also does a rport put, we do an extra get here to 3112 3112 - * so proper order/teardown happens. 
3113 3113 - */ 3114 3114 - nvme_fc_rport_get(rport); 3115 3115 - 3116 3116 - if (ret > 0) 3117 3117 - ret = -EIO; 3118 3118 - return ERR_PTR(ret); 3087 3087 + "NVME-FC{%d}: failed to init ctrl state\n", ctrl->cnum); 3088 3088 + goto fail_ctrl; 3119 3089 } 3120 3090 3121 3091 nvme_get_ctrl(&ctrl->ctrl); 3092 3092 + 3093 3093 + if (!queue_delayed_work(nvme_wq, &ctrl->connect_work, 0)) { 3094 3094 + nvme_put_ctrl(&ctrl->ctrl); 3095 3095 + dev_err(ctrl->ctrl.device, 3096 3096 + "NVME-FC{%d}: failed to schedule initial connect\n", 3097 3097 + ctrl->cnum); 3098 3098 + goto fail_ctrl; 3099 3099 + } 3100 3100 + 3101 3101 + flush_delayed_work(&ctrl->connect_work); 3122 3102 3123 3103 dev_info(ctrl->ctrl.device, 3124 3104 "NVME-FC{%d}: new ctrl: NQN \"%s\"\n", 3125 3105 ctrl->cnum, ctrl->ctrl.opts->subsysnqn); 3126 3106 3127 3107 return &ctrl->ctrl; 3108 3108 + 3109 3109 + fail_ctrl: 3110 3110 + nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING); 3111 3111 + cancel_work_sync(&ctrl->ctrl.reset_work); 3112 3112 + cancel_delayed_work_sync(&ctrl->connect_work); 3113 3113 + 3114 3114 + ctrl->ctrl.opts = NULL; 3115 3115 + 3116 3116 + /* initiate nvme ctrl ref counting teardown */ 3117 3117 + nvme_uninit_ctrl(&ctrl->ctrl); 3118 3118 + 3119 3119 + /* Remove core ctrl ref. */ 3120 3120 + nvme_put_ctrl(&ctrl->ctrl); 3121 3121 + 3122 3122 + /* as we're past the point where we transition to the ref 3123 3123 + * counting teardown path, if we return a bad pointer here, 3124 3124 + * the calling routine, thinking it's prior to the 3125 3125 + * transition, will do an rport put. Since the teardown 3126 3126 + * path also does a rport put, we do an extra get here to 3127 3127 + * so proper order/teardown happens. 3128 3128 + */ 3129 3129 + nvme_fc_rport_get(rport); 3130 3130 + 3131 3131 + return ERR_PTR(-EIO); 3128 3132 3129 3133 out_cleanup_admin_q: 3130 3134 blk_cleanup_queue(ctrl->ctrl.admin_q);

drivers/nvme/host/multipath.c

reviewed

··· 12 12 */ 13 13 14 14 #include <linux/moduleparam.h> 15 15 + #include <trace/events/block.h> 15 16 #include "nvme.h" 16 17 17 18 static bool multipath = true; ··· 112 111 if (likely(ns)) { 113 112 bio->bi_disk = ns->disk; 114 113 bio->bi_opf |= REQ_NVME_MPATH; 114 114 + trace_block_bio_remap(bio->bi_disk->queue, bio, 115 115 + disk_devt(ns->head->disk), 116 116 + bio->bi_iter.bi_sector); 115 117 ret = direct_make_request(bio); 116 118 } else if (!list_empty_careful(&head->list)) { 117 119 dev_warn_ratelimited(dev, "no path available - requeuing I/O\n");

-2

drivers/nvme/host/nvme.h

reviewed

··· 321 321 void (*submit_async_event)(struct nvme_ctrl *ctrl); 322 322 void (*delete_ctrl)(struct nvme_ctrl *ctrl); 323 323 int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); 324 324 - int (*reinit_request)(void *data, struct request *rq); 325 324 void (*stop_ctrl)(struct nvme_ctrl *ctrl); 326 325 }; 327 326 ··· 415 416 void nvme_wait_freeze(struct nvme_ctrl *ctrl); 416 417 void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); 417 418 void nvme_start_freeze(struct nvme_ctrl *ctrl); 418 418 - int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set); 419 419 420 420 #define NVME_QID_ANY -1 421 421 struct request *nvme_alloc_request(struct request_queue *q,

+27 -11

drivers/nvme/host/rdma.c

reviewed

··· 1189 1189 count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents, 1190 1190 rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); 1191 1191 if (unlikely(count <= 0)) { 1192 1192 - sg_free_table_chained(&req->sg_table, true); 1193 1193 - return -EIO; 1192 1192 + ret = -EIO; 1193 1193 + goto out_free_table; 1194 1194 } 1195 1195 1196 1196 if (count == 1) { 1197 1197 if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) && 1198 1198 blk_rq_payload_bytes(rq) <= 1199 1199 - nvme_rdma_inline_data_size(queue)) 1200 1200 - return nvme_rdma_map_sg_inline(queue, req, c); 1199 1199 + nvme_rdma_inline_data_size(queue)) { 1200 1200 + ret = nvme_rdma_map_sg_inline(queue, req, c); 1201 1201 + goto out; 1202 1202 + } 1201 1203 1202 1202 - if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) 1203 1203 - return nvme_rdma_map_sg_single(queue, req, c); 1204 1204 + if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) { 1205 1205 + ret = nvme_rdma_map_sg_single(queue, req, c); 1206 1206 + goto out; 1207 1207 + } 1204 1208 } 1205 1209 1206 1206 - return nvme_rdma_map_sg_fr(queue, req, c, count); 1210 1210 + ret = nvme_rdma_map_sg_fr(queue, req, c, count); 1211 1211 + out: 1212 1212 + if (unlikely(ret)) 1213 1213 + goto out_unmap_sg; 1214 1214 + 1215 1215 + return 0; 1216 1216 + 1217 1217 + out_unmap_sg: 1218 1218 + ib_dma_unmap_sg(ibdev, req->sg_table.sgl, 1219 1219 + req->nents, rq_data_dir(rq) == 1220 1220 + WRITE ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE); 1221 1221 + out_free_table: 1222 1222 + sg_free_table_chained(&req->sg_table, true); 1223 1223 + return ret; 1207 1224 } 1208 1225 1209 1226 static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) ··· 1630 1613 struct nvme_rdma_qe *sqe = &req->sqe; 1631 1614 struct nvme_command *c = sqe->data; 1632 1615 struct ib_device *dev; 1616 1616 + bool queue_ready = test_bit(NVME_RDMA_Q_LIVE, &queue->flags); 1633 1617 blk_status_t ret; 1634 1618 int err; 1635 1619 1636 1620 WARN_ON_ONCE(rq->tag < 0); 1637 1621 1638 1638 - ret = nvmf_check_if_ready(&queue->ctrl->ctrl, rq, 1639 1639 - test_bit(NVME_RDMA_Q_LIVE, &queue->flags), true); 1640 1640 - if (unlikely(ret)) 1641 1641 - return ret; 1622 1622 + if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) 1623 1623 + return nvmf_fail_nonready_command(rq); 1642 1624 1643 1625 dev = queue->device->dev; 1644 1626 ib_dma_sync_single_for_cpu(dev, sqe->dma,

+3 -1

drivers/nvme/target/admin-cmd.c

reviewed

··· 119 119 else 120 120 status = nvmet_get_smart_log_nsid(req, log); 121 121 if (status) 122 122 - goto out; 122 122 + goto out_free_log; 123 123 124 124 status = nvmet_copy_to_sgl(req, 0, log, sizeof(*log)); 125 125 + out_free_log: 126 126 + kfree(log); 125 127 out: 126 128 nvmet_req_complete(req, status); 127 129 }

+3 -4

drivers/nvme/target/loop.c

reviewed

··· 158 158 struct nvme_loop_queue *queue = hctx->driver_data; 159 159 struct request *req = bd->rq; 160 160 struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); 161 161 + bool queue_ready = test_bit(NVME_LOOP_Q_LIVE, &queue->flags); 161 162 blk_status_t ret; 162 163 163 163 - ret = nvmf_check_if_ready(&queue->ctrl->ctrl, req, 164 164 - test_bit(NVME_LOOP_Q_LIVE, &queue->flags), true); 165 165 - if (unlikely(ret)) 166 166 - return ret; 164 164 + if (!nvmf_check_ready(&queue->ctrl->ctrl, req, queue_ready)) 165 165 + return nvmf_fail_nonready_command(req); 167 166 168 167 ret = nvme_setup_cmd(ns, req, &iod->cmd); 169 168 if (ret)

-2

include/linux/blk-mq.h

reviewed

··· 281 281 void blk_mq_freeze_queue_wait(struct request_queue *q); 282 282 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, 283 283 unsigned long timeout); 284 284 - int blk_mq_tagset_iter(struct blk_mq_tag_set *set, void *data, 285 285 - int (reinit_request)(void *, struct request *)); 286 284 287 285 int blk_mq_map_queues(struct blk_mq_tag_set *set); 288 286 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);

+2 -2

include/linux/blkdev.h

reviewed

··· 127 127 #define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) 128 128 /* already slept for hybrid poll */ 129 129 #define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 20)) 130 130 + /* ->timeout has been called, don't expire again */ 131 131 + #define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) 130 132 131 133 /* flags that prevent us from merging requests: */ 132 134 #define RQF_NOMERGE_FLAGS \ ··· 562 560 unsigned int dma_alignment; 563 561 564 562 struct blk_queue_tag *queue_tags; 565 565 - struct list_head tag_busy_list; 566 563 567 564 unsigned int nr_sorted; 568 565 unsigned int in_flight[2]; ··· 1374 1373 extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *, int); 1375 1374 extern void blk_queue_free_tags(struct request_queue *); 1376 1375 extern int blk_queue_resize_tags(struct request_queue *, int); 1377 1377 - extern void blk_queue_invalidate_tags(struct request_queue *); 1378 1376 extern struct blk_queue_tag *blk_init_tags(int, int); 1379 1377 extern void blk_free_tags(struct blk_queue_tag *); 1380 1378