Merge tag 'block-5.9-2020-09-04' of git://git.kernel.dk/linux-block

tjh.dev / kernel

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Merge tag 'block-5.9-2020-09-04' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe:
"A bit larger than usual this week, mostly due to the NVMe fixes
arriving late for -rc3 and hence didn't make last week's pull request.

- NVMe:
   - instance leak and io boundary fixes from Keith
   - fc locking fix from Christophe
   - various tcp/rdma reset during traffic fixes from Sagi
   - pci use-after-free fix from Tong
   - tcp target null deref fix from Ziye

- Locking fix for partition removal (Christoph)

- Ensure bdi->io_pages is always set (me)

- Fixup for hd struct reference (Ming)

- Fix for zero length bvecs (Ming)

- Two small blk-iocost fixes (Tejun)"

* tag 'block-5.9-2020-09-04' of git://git.kernel.dk/linux-block:
block: allow for_each_bvec to support zero len bvec
blk-stat: make q->stats->lock irqsafe
blk-iocost: ioc_pd_free() shouldn't assume irq disabled
block: fix locking in bdev_del_partition
block: release disk reference in hd_struct_free_work
block: ensure bdi->io_pages is always initialized
nvme-pci: cancel nvme device request before disabling
nvme: only use power of two io boundaries
nvme: fix controller instance leak
nvmet-fc: Fix a missed _irqsave version of spin_lock in 'nvmet_fc_fod_op_done()'
nvme: Fix NULL dereference for pci nvme controllers
nvme-rdma: fix reset hang if controller died in the middle of a reset
nvme-rdma: fix timeout handler
nvme-rdma: serialize controller teardown sequences
nvme-tcp: fix reset hang if controller died in the middle of a reset
nvme-tcp: fix timeout handler
nvme-tcp: serialize controller teardown sequences
nvme: have nvme_wait_freeze_timeout return if it timed out
nvme-fabrics: don't check state NVME_CTRL_NEW for request acceptance
nvmet-tcp: Fix NULL dereference when a connect data comes in h2cdata pdu

Linus Torvalds 5 years ago 8075fc3b d849ca48

+212 -82

13 changed files

expand all collapse all

block

blk-core.c

blk-iocost.c

blk-stat.c

partitions

core.c

drivers

nvme

host

core.c

fabrics.c

nvme.h

pci.c

rdma.c

tcp.c

target

fc.c

tcp.c

include

linux

bvec.h

block/blk-core.c

reviewed

··· 539 539 goto fail_stats; 540 540 541 541 q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES; 542 542 + q->backing_dev_info->io_pages = VM_READAHEAD_PAGES; 542 543 q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK; 543 544 q->node = node_id; 544 545

+3 -2

block/blk-iocost.c

reviewed

··· 2092 2092 { 2093 2093 struct ioc_gq *iocg = pd_to_iocg(pd); 2094 2094 struct ioc *ioc = iocg->ioc; 2095 2095 + unsigned long flags; 2095 2096 2096 2097 if (ioc) { 2097 2097 - spin_lock(&ioc->lock); 2098 2098 + spin_lock_irqsave(&ioc->lock, flags); 2098 2099 if (!list_empty(&iocg->active_list)) { 2099 2100 propagate_active_weight(iocg, 0, 0); 2100 2101 list_del_init(&iocg->active_list); 2101 2102 } 2102 2102 - spin_unlock(&ioc->lock); 2103 2103 + spin_unlock_irqrestore(&ioc->lock, flags); 2103 2104 2104 2105 hrtimer_cancel(&iocg->waitq_timer); 2105 2106 hrtimer_cancel(&iocg->delay_timer);

+11 -6

block/blk-stat.c

reviewed

··· 137 137 struct blk_stat_callback *cb) 138 138 { 139 139 unsigned int bucket; 140 140 + unsigned long flags; 140 141 int cpu; 141 142 142 143 for_each_possible_cpu(cpu) { ··· 148 147 blk_rq_stat_init(&cpu_stat[bucket]); 149 148 } 150 149 151 151 - spin_lock(&q->stats->lock); 150 150 + spin_lock_irqsave(&q->stats->lock, flags); 152 151 list_add_tail_rcu(&cb->list, &q->stats->callbacks); 153 152 blk_queue_flag_set(QUEUE_FLAG_STATS, q); 154 154 - spin_unlock(&q->stats->lock); 153 153 + spin_unlock_irqrestore(&q->stats->lock, flags); 155 154 } 156 155 157 156 void blk_stat_remove_callback(struct request_queue *q, 158 157 struct blk_stat_callback *cb) 159 158 { 160 160 - spin_lock(&q->stats->lock); 159 159 + unsigned long flags; 160 160 + 161 161 + spin_lock_irqsave(&q->stats->lock, flags); 161 162 list_del_rcu(&cb->list); 162 163 if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting) 163 164 blk_queue_flag_clear(QUEUE_FLAG_STATS, q); 164 164 - spin_unlock(&q->stats->lock); 165 165 + spin_unlock_irqrestore(&q->stats->lock, flags); 165 166 166 167 del_timer_sync(&cb->timer); 167 168 } ··· 186 183 187 184 void blk_stat_enable_accounting(struct request_queue *q) 188 185 { 189 189 - spin_lock(&q->stats->lock); 186 186 + unsigned long flags; 187 187 + 188 188 + spin_lock_irqsave(&q->stats->lock, flags); 190 189 q->stats->enable_accounting = true; 191 190 blk_queue_flag_set(QUEUE_FLAG_STATS, q); 192 192 - spin_unlock(&q->stats->lock); 191 191 + spin_unlock_irqrestore(&q->stats->lock, flags); 193 192 } 194 193 EXPORT_SYMBOL_GPL(blk_stat_enable_accounting); 195 194

+22 -15

block/partitions/core.c

reviewed

··· 278 278 { 279 279 struct hd_struct *part = 280 280 container_of(to_rcu_work(work), struct hd_struct, rcu_work); 281 281 + struct gendisk *disk = part_to_disk(part); 282 282 + 283 283 + /* 284 284 + * Release the disk reference acquired in delete_partition here. 285 285 + * We can't release it in hd_struct_free because the final put_device 286 286 + * needs process context and thus can't be run directly from a 287 287 + * percpu_ref ->release handler. 288 288 + */ 289 289 + put_device(disk_to_dev(disk)); 281 290 282 291 part->start_sect = 0; 283 292 part->nr_sects = 0; ··· 302 293 rcu_dereference_protected(disk->part_tbl, 1); 303 294 304 295 rcu_assign_pointer(ptbl->last_lookup, NULL); 305 305 - put_device(disk_to_dev(disk)); 306 296 307 297 INIT_RCU_WORK(&part->rcu_work, hd_struct_free_work); 308 298 queue_rcu_work(system_wq, &part->rcu_work); ··· 532 524 int bdev_del_partition(struct block_device *bdev, int partno) 533 525 { 534 526 struct block_device *bdevp; 535 535 - struct hd_struct *part; 536 536 - int ret = 0; 527 527 + struct hd_struct *part = NULL; 528 528 + int ret; 537 529 538 538 - part = disk_get_part(bdev->bd_disk, partno); 539 539 - if (!part) 540 540 - return -ENXIO; 541 541 - 542 542 - ret = -ENOMEM; 543 543 - bdevp = bdget(part_devt(part)); 530 530 + bdevp = bdget_disk(bdev->bd_disk, partno); 544 531 if (!bdevp) 545 545 - goto out_put_part; 532 532 + return -ENOMEM; 546 533 547 534 mutex_lock(&bdevp->bd_mutex); 535 535 + mutex_lock_nested(&bdev->bd_mutex, 1); 536 536 + 537 537 + ret = -ENXIO; 538 538 + part = disk_get_part(bdev->bd_disk, partno); 539 539 + if (!part) 540 540 + goto out_unlock; 548 541 549 542 ret = -EBUSY; 550 543 if (bdevp->bd_openers) ··· 554 545 sync_blockdev(bdevp); 555 546 invalidate_bdev(bdevp); 556 547 557 557 - mutex_lock_nested(&bdev->bd_mutex, 1); 558 548 delete_partition(bdev->bd_disk, part); 559 559 - mutex_unlock(&bdev->bd_mutex); 560 560 - 561 549 ret = 0; 562 550 out_unlock: 551 551 + 
mutex_unlock(&bdev->bd_mutex); 563 552 mutex_unlock(&bdevp->bd_mutex); 564 553 bdput(bdevp); 565 565 - out_put_part: 566 566 - disk_put_part(part); 554 554 + if (part) 555 555 + disk_put_part(part); 567 556 return ret; 568 557 } 569 558

+45 -11

drivers/nvme/host/core.c

reviewed

··· 2026 2026 blk_mq_unfreeze_queue(disk->queue); 2027 2027 } 2028 2028 2029 2029 + static inline bool nvme_first_scan(struct gendisk *disk) 2030 2030 + { 2031 2031 + /* nvme_alloc_ns() scans the disk prior to adding it */ 2032 2032 + return !(disk->flags & GENHD_FL_UP); 2033 2033 + } 2034 2034 + 2035 2035 + static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id) 2036 2036 + { 2037 2037 + struct nvme_ctrl *ctrl = ns->ctrl; 2038 2038 + u32 iob; 2039 2039 + 2040 2040 + if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && 2041 2041 + is_power_of_2(ctrl->max_hw_sectors)) 2042 2042 + iob = ctrl->max_hw_sectors; 2043 2043 + else 2044 2044 + iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob)); 2045 2045 + 2046 2046 + if (!iob) 2047 2047 + return; 2048 2048 + 2049 2049 + if (!is_power_of_2(iob)) { 2050 2050 + if (nvme_first_scan(ns->disk)) 2051 2051 + pr_warn("%s: ignoring unaligned IO boundary:%u\n", 2052 2052 + ns->disk->disk_name, iob); 2053 2053 + return; 2054 2054 + } 2055 2055 + 2056 2056 + if (blk_queue_is_zoned(ns->disk->queue)) { 2057 2057 + if (nvme_first_scan(ns->disk)) 2058 2058 + pr_warn("%s: ignoring zoned namespace IO boundary\n", 2059 2059 + ns->disk->disk_name); 2060 2060 + return; 2061 2061 + } 2062 2062 + 2063 2063 + blk_queue_chunk_sectors(ns->queue, iob); 2064 2064 + } 2065 2065 + 2029 2066 static int __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) 2030 2067 { 2031 2068 unsigned lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; 2032 2069 struct nvme_ns *ns = disk->private_data; 2033 2070 struct nvme_ctrl *ctrl = ns->ctrl; 2034 2071 int ret; 2035 2035 - u32 iob; 2036 2072 2037 2073 /* 2038 2074 * If identify namespace failed, use default 512 byte block size so ··· 2095 2059 ns->head->ids.csi, ns->head->ns_id); 2096 2060 return -ENODEV; 2097 2061 } 2098 2098 - 2099 2099 - if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && 2100 2100 - is_power_of_2(ctrl->max_hw_sectors)) 2101 2101 - iob = ctrl->max_hw_sectors; 2102 2102 - else 2103 
2103 - iob = nvme_lba_to_sect(ns, le16_to_cpu(id->noiob)); 2104 2062 2105 2063 ns->features = 0; 2106 2064 ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); ··· 2127 2097 } 2128 2098 } 2129 2099 2130 2130 - if (iob && !blk_queue_is_zoned(ns->queue)) 2131 2131 - blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(iob)); 2100 2100 + nvme_set_chunk_sectors(ns, id); 2132 2101 nvme_update_disk_info(disk, ns, id); 2133 2102 #ifdef CONFIG_NVME_MULTIPATH 2134 2103 if (ns->head->disk) { ··· 3705 3676 return 0; 3706 3677 if (a == &dev_attr_hostid.attr && !ctrl->opts) 3707 3678 return 0; 3679 3679 + if (a == &dev_attr_ctrl_loss_tmo.attr && !ctrl->opts) 3680 3680 + return 0; 3681 3681 + if (a == &dev_attr_reconnect_delay.attr && !ctrl->opts) 3682 3682 + return 0; 3708 3683 3709 3684 return a->mode; 3710 3685 } ··· 4423 4390 struct nvme_subsystem *subsys = ctrl->subsys; 4424 4391 struct nvme_cel *cel, *next; 4425 4392 4426 4426 - if (subsys && ctrl->instance != subsys->instance) 4393 4393 + if (!subsys || ctrl->instance != subsys->instance) 4427 4394 ida_simple_remove(&nvme_instance_ida, ctrl->instance); 4428 4395 4429 4396 list_for_each_entry_safe(cel, next, &ctrl->cels, entry) { ··· 4567 4534 } 4568 4535 EXPORT_SYMBOL_GPL(nvme_unfreeze); 4569 4536 4570 4570 - void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) 4537 4537 + int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout) 4571 4538 { 4572 4539 struct nvme_ns *ns; 4573 4540 ··· 4578 4545 break; 4579 4546 } 4580 4547 up_read(&ctrl->namespaces_rwsem); 4548 4548 + return timeout; 4581 4549 } 4582 4550 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout); 4583 4551

-1

drivers/nvme/host/fabrics.c

reviewed

··· 576 576 * which is require to set the queue live in the appropinquate states. 577 577 */ 578 578 switch (ctrl->state) { 579 579 - case NVME_CTRL_NEW: 580 579 case NVME_CTRL_CONNECTING: 581 580 if (nvme_is_fabrics(req->cmd) && 582 581 req->cmd->fabrics.fctype == nvme_fabrics_type_connect)

+1 -1

drivers/nvme/host/nvme.h

reviewed

··· 605 605 void nvme_sync_queues(struct nvme_ctrl *ctrl); 606 606 void nvme_unfreeze(struct nvme_ctrl *ctrl); 607 607 void nvme_wait_freeze(struct nvme_ctrl *ctrl); 608 608 - void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); 608 608 + int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); 609 609 void nvme_start_freeze(struct nvme_ctrl *ctrl); 610 610 611 611 #define NVME_QID_ANY -1

+2 -2

drivers/nvme/host/pci.c

reviewed

··· 1249 1249 dev_warn_ratelimited(dev->ctrl.device, 1250 1250 "I/O %d QID %d timeout, disable controller\n", 1251 1251 req->tag, nvmeq->qid); 1252 1252 - nvme_dev_disable(dev, true); 1253 1252 nvme_req(req)->flags |= NVME_REQ_CANCELLED; 1253 1253 + nvme_dev_disable(dev, true); 1254 1254 return BLK_EH_DONE; 1255 1255 case NVME_CTRL_RESETTING: 1256 1256 return BLK_EH_RESET_TIMER; ··· 1267 1267 dev_warn(dev->ctrl.device, 1268 1268 "I/O %d QID %d timeout, reset controller\n", 1269 1269 req->tag, nvmeq->qid); 1270 1270 + nvme_req(req)->flags |= NVME_REQ_CANCELLED; 1270 1271 nvme_dev_disable(dev, false); 1271 1272 nvme_reset_ctrl(&dev->ctrl); 1272 1273 1273 1273 - nvme_req(req)->flags |= NVME_REQ_CANCELLED; 1274 1274 return BLK_EH_DONE; 1275 1275 } 1276 1276

+51 -17

drivers/nvme/host/rdma.c

reviewed

··· 122 122 struct sockaddr_storage src_addr; 123 123 124 124 struct nvme_ctrl ctrl; 125 125 + struct mutex teardown_lock; 125 126 bool use_inline_data; 126 127 u32 io_queues[HCTX_MAX_TYPES]; 127 128 }; ··· 976 975 977 976 if (!new) { 978 977 nvme_start_queues(&ctrl->ctrl); 979 979 - nvme_wait_freeze(&ctrl->ctrl); 978 978 + if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) { 979 979 + /* 980 980 + * If we timed out waiting for freeze we are likely to 981 981 + * be stuck. Fail the controller initialization just 982 982 + * to be safe. 983 983 + */ 984 984 + ret = -ENODEV; 985 985 + goto out_wait_freeze_timed_out; 986 986 + } 980 987 blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset, 981 988 ctrl->ctrl.queue_count - 1); 982 989 nvme_unfreeze(&ctrl->ctrl); ··· 992 983 993 984 return 0; 994 985 986 986 + out_wait_freeze_timed_out: 987 987 + nvme_stop_queues(&ctrl->ctrl); 988 988 + nvme_rdma_stop_io_queues(ctrl); 995 989 out_cleanup_connect_q: 996 990 if (new) 997 991 blk_cleanup_queue(ctrl->ctrl.connect_q); ··· 1009 997 static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, 1010 998 bool remove) 1011 999 { 1000 1000 + mutex_lock(&ctrl->teardown_lock); 1012 1001 blk_mq_quiesce_queue(ctrl->ctrl.admin_q); 1013 1002 nvme_rdma_stop_queue(&ctrl->queues[0]); 1014 1003 if (ctrl->ctrl.admin_tagset) { ··· 1020 1007 if (remove) 1021 1008 blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); 1022 1009 nvme_rdma_destroy_admin_queue(ctrl, remove); 1010 1010 + mutex_unlock(&ctrl->teardown_lock); 1023 1011 } 1024 1012 1025 1013 static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, 1026 1014 bool remove) 1027 1015 { 1016 1016 + mutex_lock(&ctrl->teardown_lock); 1028 1017 if (ctrl->ctrl.queue_count > 1) { 1029 1018 nvme_start_freeze(&ctrl->ctrl); 1030 1019 nvme_stop_queues(&ctrl->ctrl); ··· 1040 1025 nvme_start_queues(&ctrl->ctrl); 1041 1026 nvme_rdma_destroy_io_queues(ctrl, remove); 1042 1027 } 1028 1028 + mutex_unlock(&ctrl->teardown_lock); 1043 1029 } 
1044 1030 1045 1031 static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl) ··· 1196 1180 if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) 1197 1181 return; 1198 1182 1183 1183 + dev_warn(ctrl->ctrl.device, "starting error recovery\n"); 1199 1184 queue_work(nvme_reset_wq, &ctrl->err_work); 1200 1185 } 1201 1186 ··· 1963 1946 return 0; 1964 1947 } 1965 1948 1949 1949 + static void nvme_rdma_complete_timed_out(struct request *rq) 1950 1950 + { 1951 1951 + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); 1952 1952 + struct nvme_rdma_queue *queue = req->queue; 1953 1953 + struct nvme_rdma_ctrl *ctrl = queue->ctrl; 1954 1954 + 1955 1955 + /* fence other contexts that may complete the command */ 1956 1956 + mutex_lock(&ctrl->teardown_lock); 1957 1957 + nvme_rdma_stop_queue(queue); 1958 1958 + if (!blk_mq_request_completed(rq)) { 1959 1959 + nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; 1960 1960 + blk_mq_complete_request(rq); 1961 1961 + } 1962 1962 + mutex_unlock(&ctrl->teardown_lock); 1963 1963 + } 1964 1964 + 1966 1965 static enum blk_eh_timer_return 1967 1966 nvme_rdma_timeout(struct request *rq, bool reserved) 1968 1967 { ··· 1989 1956 dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n", 1990 1957 rq->tag, nvme_rdma_queue_idx(queue)); 1991 1958 1992 1992 - /* 1993 1993 - * Restart the timer if a controller reset is already scheduled. Any 1994 1994 - * timed out commands would be handled before entering the connecting 1995 1995 - * state. 1996 1996 - */ 1997 1997 - if (ctrl->ctrl.state == NVME_CTRL_RESETTING) 1998 1998 - return BLK_EH_RESET_TIMER; 1999 1999 - 2000 1959 if (ctrl->ctrl.state != NVME_CTRL_LIVE) { 2001 1960 /* 2002 2002 - * Teardown immediately if controller times out while starting 2003 2003 - * or we are already started error recovery. all outstanding 2004 2004 - * requests are completed on shutdown, so we return BLK_EH_DONE. 
1961 1961 + * If we are resetting, connecting or deleting we should 1962 1962 + * complete immediately because we may block controller 1963 1963 + * teardown or setup sequence 1964 1964 + * - ctrl disable/shutdown fabrics requests 1965 1965 + * - connect requests 1966 1966 + * - initialization admin requests 1967 1967 + * - I/O requests that entered after unquiescing and 1968 1968 + * the controller stopped responding 1969 1969 + * 1970 1970 + * All other requests should be cancelled by the error 1971 1971 + * recovery work, so it's fine that we fail it here. 2005 1972 */ 2006 2006 - flush_work(&ctrl->err_work); 2007 2007 - nvme_rdma_teardown_io_queues(ctrl, false); 2008 2008 - nvme_rdma_teardown_admin_queue(ctrl, false); 1973 1973 + nvme_rdma_complete_timed_out(rq); 2009 1974 return BLK_EH_DONE; 2010 1975 } 2011 1976 2012 2012 - dev_warn(ctrl->ctrl.device, "starting error recovery\n"); 1977 1977 + /* 1978 1978 + * LIVE state should trigger the normal error recovery which will 1979 1979 + * handle completing this request. 1980 1980 + */ 2013 1981 nvme_rdma_error_recovery(ctrl); 2014 2014 - 2015 1982 return BLK_EH_RESET_TIMER; 2016 1983 } 2017 1984 ··· 2311 2278 return ERR_PTR(-ENOMEM); 2312 2279 ctrl->ctrl.opts = opts; 2313 2280 INIT_LIST_HEAD(&ctrl->list); 2281 2281 + mutex_init(&ctrl->teardown_lock); 2314 2282 2315 2283 if (!(opts->mask & NVMF_OPT_TRSVCID)) { 2316 2284 opts->trsvcid =

+57 -23

drivers/nvme/host/tcp.c

reviewed

··· 124 124 struct sockaddr_storage src_addr; 125 125 struct nvme_ctrl ctrl; 126 126 127 127 + struct mutex teardown_lock; 127 128 struct work_struct err_work; 128 129 struct delayed_work connect_work; 129 130 struct nvme_tcp_request async_req; ··· 465 464 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) 466 465 return; 467 466 467 467 + dev_warn(ctrl->device, "starting error recovery\n"); 468 468 queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work); 469 469 } 470 470 ··· 1528 1526 1529 1527 if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags)) 1530 1528 return; 1531 1531 - 1532 1529 __nvme_tcp_stop_queue(queue); 1533 1530 } 1534 1531 ··· 1782 1781 1783 1782 if (!new) { 1784 1783 nvme_start_queues(ctrl); 1785 1785 - nvme_wait_freeze(ctrl); 1784 1784 + if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) { 1785 1785 + /* 1786 1786 + * If we timed out waiting for freeze we are likely to 1787 1787 + * be stuck. Fail the controller initialization just 1788 1788 + * to be safe. 
1789 1789 + */ 1790 1790 + ret = -ENODEV; 1791 1791 + goto out_wait_freeze_timed_out; 1792 1792 + } 1786 1793 blk_mq_update_nr_hw_queues(ctrl->tagset, 1787 1794 ctrl->queue_count - 1); 1788 1795 nvme_unfreeze(ctrl); ··· 1798 1789 1799 1790 return 0; 1800 1791 1792 1792 + out_wait_freeze_timed_out: 1793 1793 + nvme_stop_queues(ctrl); 1794 1794 + nvme_tcp_stop_io_queues(ctrl); 1801 1795 out_cleanup_connect_q: 1802 1796 if (new) 1803 1797 blk_cleanup_queue(ctrl->connect_q); ··· 1886 1874 static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, 1887 1875 bool remove) 1888 1876 { 1877 1877 + mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock); 1889 1878 blk_mq_quiesce_queue(ctrl->admin_q); 1890 1879 nvme_tcp_stop_queue(ctrl, 0); 1891 1880 if (ctrl->admin_tagset) { ··· 1897 1884 if (remove) 1898 1885 blk_mq_unquiesce_queue(ctrl->admin_q); 1899 1886 nvme_tcp_destroy_admin_queue(ctrl, remove); 1887 1887 + mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock); 1900 1888 } 1901 1889 1902 1890 static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, 1903 1891 bool remove) 1904 1892 { 1893 1893 + mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock); 1905 1894 if (ctrl->queue_count <= 1) 1906 1906 - return; 1895 1895 + goto out; 1896 1896 + blk_mq_quiesce_queue(ctrl->admin_q); 1907 1897 nvme_start_freeze(ctrl); 1908 1898 nvme_stop_queues(ctrl); 1909 1899 nvme_tcp_stop_io_queues(ctrl); ··· 1918 1902 if (remove) 1919 1903 nvme_start_queues(ctrl); 1920 1904 nvme_tcp_destroy_io_queues(ctrl, remove); 1905 1905 + out: 1906 1906 + mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock); 1921 1907 } 1922 1908 1923 1909 static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl) ··· 2166 2148 nvme_tcp_queue_request(&ctrl->async_req, true, true); 2167 2149 } 2168 2150 2151 2151 + static void nvme_tcp_complete_timed_out(struct request *rq) 2152 2152 + { 2153 2153 + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); 2154 2154 + struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; 2155 2155 + 2156 
2156 + /* fence other contexts that may complete the command */ 2157 2157 + mutex_lock(&to_tcp_ctrl(ctrl)->teardown_lock); 2158 2158 + nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue)); 2159 2159 + if (!blk_mq_request_completed(rq)) { 2160 2160 + nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD; 2161 2161 + blk_mq_complete_request(rq); 2162 2162 + } 2163 2163 + mutex_unlock(&to_tcp_ctrl(ctrl)->teardown_lock); 2164 2164 + } 2165 2165 + 2169 2166 static enum blk_eh_timer_return 2170 2167 nvme_tcp_timeout(struct request *rq, bool reserved) 2171 2168 { 2172 2169 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); 2173 2173 - struct nvme_tcp_ctrl *ctrl = req->queue->ctrl; 2170 2170 + struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; 2174 2171 struct nvme_tcp_cmd_pdu *pdu = req->pdu; 2175 2172 2176 2176 - /* 2177 2177 - * Restart the timer if a controller reset is already scheduled. Any 2178 2178 - * timed out commands would be handled before entering the connecting 2179 2179 - * state. 2180 2180 - */ 2181 2181 - if (ctrl->ctrl.state == NVME_CTRL_RESETTING) 2182 2182 - return BLK_EH_RESET_TIMER; 2183 2183 - 2184 2184 - dev_warn(ctrl->ctrl.device, 2173 2173 + dev_warn(ctrl->device, 2185 2174 "queue %d: timeout request %#x type %d\n", 2186 2175 nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type); 2187 2176 2188 2188 - if (ctrl->ctrl.state != NVME_CTRL_LIVE) { 2177 2177 + if (ctrl->state != NVME_CTRL_LIVE) { 2189 2178 /* 2190 2190 - * Teardown immediately if controller times out while starting 2191 2191 - * or we are already started error recovery. all outstanding 2192 2192 - * requests are completed on shutdown, so we return BLK_EH_DONE. 
2179 2179 + * If we are resetting, connecting or deleting we should 2180 2180 + * complete immediately because we may block controller 2181 2181 + * teardown or setup sequence 2182 2182 + * - ctrl disable/shutdown fabrics requests 2183 2183 + * - connect requests 2184 2184 + * - initialization admin requests 2185 2185 + * - I/O requests that entered after unquiescing and 2186 2186 + * the controller stopped responding 2187 2187 + * 2188 2188 + * All other requests should be cancelled by the error 2189 2189 + * recovery work, so it's fine that we fail it here. 2193 2190 */ 2194 2194 - flush_work(&ctrl->err_work); 2195 2195 - nvme_tcp_teardown_io_queues(&ctrl->ctrl, false); 2196 2196 - nvme_tcp_teardown_admin_queue(&ctrl->ctrl, false); 2191 2191 + nvme_tcp_complete_timed_out(rq); 2197 2192 return BLK_EH_DONE; 2198 2193 } 2199 2194 2200 2200 - dev_warn(ctrl->ctrl.device, "starting error recovery\n"); 2201 2201 - nvme_tcp_error_recovery(&ctrl->ctrl); 2202 2202 - 2195 2195 + /* 2196 2196 + * LIVE state should trigger the normal error recovery which will 2197 2197 + * handle completing this request. 2198 2198 + */ 2199 2199 + nvme_tcp_error_recovery(ctrl); 2203 2200 return BLK_EH_RESET_TIMER; 2204 2201 } 2205 2202 ··· 2455 2422 nvme_tcp_reconnect_ctrl_work); 2456 2423 INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work); 2457 2424 INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work); 2425 2425 + mutex_init(&ctrl->teardown_lock); 2458 2426 2459 2427 if (!(opts->mask & NVMF_OPT_TRSVCID)) { 2460 2428 opts->trsvcid =

+2 -2

drivers/nvme/target/fc.c

reviewed

··· 2342 2342 return; 2343 2343 if (fcpreq->fcp_error || 2344 2344 fcpreq->transferred_length != fcpreq->transfer_length) { 2345 2345 - spin_lock(&fod->flock); 2345 2345 + spin_lock_irqsave(&fod->flock, flags); 2346 2346 fod->abort = true; 2347 2347 - spin_unlock(&fod->flock); 2347 2347 + spin_unlock_irqrestore(&fod->flock, flags); 2348 2348 2349 2349 nvmet_req_complete(&fod->req, NVME_SC_INTERNAL); 2350 2350 return;

+9 -1

drivers/nvme/target/tcp.c

reviewed

··· 160 160 static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue, 161 161 struct nvmet_tcp_cmd *cmd) 162 162 { 163 163 + if (unlikely(!queue->nr_cmds)) { 164 164 + /* We didn't allocate cmds yet, send 0xffff */ 165 165 + return USHRT_MAX; 166 166 + } 167 167 + 163 168 return cmd - queue->cmds; 164 169 } 165 170 ··· 871 866 struct nvme_tcp_data_pdu *data = &queue->pdu.data; 872 867 struct nvmet_tcp_cmd *cmd; 873 868 874 874 - cmd = &queue->cmds[data->ttag]; 869 869 + if (likely(queue->nr_cmds)) 870 870 + cmd = &queue->cmds[data->ttag]; 871 871 + else 872 872 + cmd = &queue->connect; 875 873 876 874 if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) { 877 875 pr_err("ttag %u unexpected data offset %u (expected %u)\n",

+8 -1

include/linux/bvec.h

reviewed

··· 117 117 return true; 118 118 } 119 119 120 120 + static inline void bvec_iter_skip_zero_bvec(struct bvec_iter *iter) 121 121 + { 122 122 + iter->bi_bvec_done = 0; 123 123 + iter->bi_idx++; 124 124 + } 125 125 + 120 126 #define for_each_bvec(bvl, bio_vec, iter, start) \ 121 127 for (iter = (start); \ 122 128 (iter).bi_size && \ 123 129 ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \ 124 124 - bvec_iter_advance((bio_vec), &(iter), (bvl).bv_len)) 130 130 + (bvl).bv_len ? (void)bvec_iter_advance((bio_vec), &(iter), \ 131 131 + (bvl).bv_len) : bvec_iter_skip_zero_bvec(&(iter))) 125 132 126 133 /* for iterating one bio from start to end */ 127 134 #define BVEC_ITER_ALL_INIT (struct bvec_iter) \