Merge tag 'for-linus-2019-10-18' of git://git.kernel.dk/linux-block

tjh.dev / kernel

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Merge tag 'for-linus-2019-10-18' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe:

- NVMe pull request from Keith that addresses deadlocks, double resets,
memory leaks, and other regressions.

- Fixup elv_support_iosched() for bio based devices (Damien)

- Fixup for the ahci PCS quirk (Dan)

- Socket O_NONBLOCK handling fix for io_uring (me)

- Timeout sequence io_uring fixes (yangerkun)

- MD warning fix for parameter default_layout (Song)

- blkcg activation fixes (Tejun)

- blk-rq-qos node deletion fix (Tejun)

* tag 'for-linus-2019-10-18' of git://git.kernel.dk/linux-block:
nvme-pci: Set the prp2 correctly when using more than 4k page
io_uring: fix logic error in io_timeout
io_uring: fix up O_NONBLOCK handling for sockets
md/raid0: fix warning message for parameter default_layout
libata/ahci: Fix PCS quirk application
blk-rq-qos: fix first node deletion of rq_qos_del()
blkcg: Fix multiple bugs in blkcg_activate_policy()
io_uring: consider the overflow of sequence for timeout req
nvme-tcp: fix possible leakage during error flow
nvmet-loop: fix possible leakage during error flow
block: Fix elv_support_iosched()
nvme-tcp: Initialize sk->sk_ll_usec only with NET_RX_BUSY_POLL
nvme: Wait for reset state when required
nvme: Prevent resets during paused controller state
nvme: Restart request timers in resetting state
nvme: Remove ADMIN_ONLY state
nvme-pci: Free tagset if no IO queues
nvme: retain split access workaround for capability reads
nvme: fix possible deadlock when nvme_update_formats fails

Linus Torvalds 6 years ago d418d070 dfdcff32

+266 -117

13 changed files

expand all collapse all

block

blk-cgroup.c

blk-rq-qos.h

elevator.c

drivers

ata

ahci.c

raid0.c

nvme

host

core.c

fabrics.h

nvme.h

pci.c

rdma.c

tcp.c

target

loop.c

io_uring.c

+51 -18

block/blk-cgroup.c

reviewed

··· 1362 1362 const struct blkcg_policy *pol) 1363 1363 { 1364 1364 struct blkg_policy_data *pd_prealloc = NULL; 1365 1365 - struct blkcg_gq *blkg; 1365 1365 + struct blkcg_gq *blkg, *pinned_blkg = NULL; 1366 1366 int ret; 1367 1367 1368 1368 if (blkcg_policy_enabled(q, pol)) ··· 1370 1370 1371 1371 if (queue_is_mq(q)) 1372 1372 blk_mq_freeze_queue(q); 1373 1373 - pd_prealloc: 1374 1374 - if (!pd_prealloc) { 1375 1375 - pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q, &blkcg_root); 1376 1376 - if (!pd_prealloc) { 1377 1377 - ret = -ENOMEM; 1378 1378 - goto out_bypass_end; 1379 1379 - } 1380 1380 - } 1381 1381 - 1373 1373 + retry: 1382 1374 spin_lock_irq(&q->queue_lock); 1383 1375 1384 1384 - /* blkg_list is pushed at the head, reverse walk to init parents first */ 1376 1376 + /* blkg_list is pushed at the head, reverse walk to allocate parents first */ 1385 1377 list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { 1386 1378 struct blkg_policy_data *pd; 1387 1379 1388 1380 if (blkg->pd[pol->plid]) 1389 1381 continue; 1390 1382 1391 1391 - pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q, &blkcg_root); 1392 1392 - if (!pd) 1393 1393 - swap(pd, pd_prealloc); 1383 1383 + /* If prealloc matches, use it; otherwise try GFP_NOWAIT */ 1384 1384 + if (blkg == pinned_blkg) { 1385 1385 + pd = pd_prealloc; 1386 1386 + pd_prealloc = NULL; 1387 1387 + } else { 1388 1388 + pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q, 1389 1389 + blkg->blkcg); 1390 1390 + } 1391 1391 + 1394 1392 if (!pd) { 1393 1393 + /* 1394 1394 + * GFP_NOWAIT failed. Free the existing one and 1395 1395 + * prealloc for @blkg w/ GFP_KERNEL. 
1396 1396 + */ 1397 1397 + if (pinned_blkg) 1398 1398 + blkg_put(pinned_blkg); 1399 1399 + blkg_get(blkg); 1400 1400 + pinned_blkg = blkg; 1401 1401 + 1395 1402 spin_unlock_irq(&q->queue_lock); 1396 1396 - goto pd_prealloc; 1403 1403 + 1404 1404 + if (pd_prealloc) 1405 1405 + pol->pd_free_fn(pd_prealloc); 1406 1406 + pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q, 1407 1407 + blkg->blkcg); 1408 1408 + if (pd_prealloc) 1409 1409 + goto retry; 1410 1410 + else 1411 1411 + goto enomem; 1397 1412 } 1398 1413 1399 1414 blkg->pd[pol->plid] = pd; 1400 1415 pd->blkg = blkg; 1401 1416 pd->plid = pol->plid; 1402 1402 - if (pol->pd_init_fn) 1403 1403 - pol->pd_init_fn(pd); 1404 1417 } 1418 1418 + 1419 1419 + /* all allocated, init in the same order */ 1420 1420 + if (pol->pd_init_fn) 1421 1421 + list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) 1422 1422 + pol->pd_init_fn(blkg->pd[pol->plid]); 1405 1423 1406 1424 __set_bit(pol->plid, q->blkcg_pols); 1407 1425 ret = 0; 1408 1426 1409 1427 spin_unlock_irq(&q->queue_lock); 1410 1410 - out_bypass_end: 1428 1428 + out: 1411 1429 if (queue_is_mq(q)) 1412 1430 blk_mq_unfreeze_queue(q); 1431 1431 + if (pinned_blkg) 1432 1432 + blkg_put(pinned_blkg); 1413 1433 if (pd_prealloc) 1414 1434 pol->pd_free_fn(pd_prealloc); 1415 1435 return ret; 1436 1436 + 1437 1437 + enomem: 1438 1438 + /* alloc failed, nothing's initialized yet, free everything */ 1439 1439 + spin_lock_irq(&q->queue_lock); 1440 1440 + list_for_each_entry(blkg, &q->blkg_list, q_node) { 1441 1441 + if (blkg->pd[pol->plid]) { 1442 1442 + pol->pd_free_fn(blkg->pd[pol->plid]); 1443 1443 + blkg->pd[pol->plid] = NULL; 1444 1444 + } 1445 1445 + } 1446 1446 + spin_unlock_irq(&q->queue_lock); 1447 1447 + ret = -ENOMEM; 1448 1448 + goto out; 1416 1449 } 1417 1450 EXPORT_SYMBOL_GPL(blkcg_activate_policy); 1418 1451

+5 -8

block/blk-rq-qos.h

reviewed

··· 108 108 109 109 static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) 110 110 { 111 111 - struct rq_qos *cur, *prev = NULL; 112 112 - for (cur = q->rq_qos; cur; cur = cur->next) { 113 113 - if (cur == rqos) { 114 114 - if (prev) 115 115 - prev->next = rqos->next; 116 116 - else 117 117 - q->rq_qos = cur; 111 111 + struct rq_qos **cur; 112 112 + 113 113 + for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) { 114 114 + if (*cur == rqos) { 115 115 + *cur = rqos->next; 118 116 break; 119 117 } 120 120 - prev = cur; 121 118 } 122 119 123 120 blk_mq_debugfs_unregister_rqos(rqos);

+2 -1

block/elevator.c

reviewed

··· 616 616 617 617 static inline bool elv_support_iosched(struct request_queue *q) 618 618 { 619 619 - if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED)) 619 619 + if (!q->mq_ops || 620 620 + (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))) 620 621 return false; 621 622 return true; 622 623 }

+3 -1

drivers/ata/ahci.c

reviewed

··· 1600 1600 */ 1601 1601 if (!id || id->vendor != PCI_VENDOR_ID_INTEL) 1602 1602 return; 1603 1603 - if (((enum board_ids) id->driver_data) < board_ahci_pcs7) 1603 1603 + 1604 1604 + /* Skip applying the quirk on Denverton and beyond */ 1605 1605 + if (((enum board_ids) id->driver_data) >= board_ahci_pcs7) 1604 1606 return; 1605 1607 1606 1608 /*

+1 -1

drivers/md/raid0.c

reviewed

··· 154 154 } else { 155 155 pr_err("md/raid0:%s: cannot assemble multi-zone RAID0 with default_layout setting\n", 156 156 mdname(mddev)); 157 157 - pr_err("md/raid0: please set raid.default_layout to 1 or 2\n"); 157 157 + pr_err("md/raid0: please set raid0.default_layout to 1 or 2\n"); 158 158 err = -ENOTSUPP; 159 159 goto abort; 160 160 }

+68 -26

drivers/nvme/host/core.c

reviewed

··· 116 116 /* 117 117 * Only new queue scan work when admin and IO queues are both alive 118 118 */ 119 119 - if (ctrl->state == NVME_CTRL_LIVE) 119 119 + if (ctrl->state == NVME_CTRL_LIVE && ctrl->tagset) 120 120 queue_work(nvme_wq, &ctrl->scan_work); 121 121 } 122 122 + 123 123 + /* 124 124 + * Use this function to proceed with scheduling reset_work for a controller 125 125 + * that had previously been set to the resetting state. This is intended for 126 126 + * code paths that can't be interrupted by other reset attempts. A hot removal 127 127 + * may prevent this from succeeding. 128 128 + */ 129 129 + int nvme_try_sched_reset(struct nvme_ctrl *ctrl) 130 130 + { 131 131 + if (ctrl->state != NVME_CTRL_RESETTING) 132 132 + return -EBUSY; 133 133 + if (!queue_work(nvme_reset_wq, &ctrl->reset_work)) 134 134 + return -EBUSY; 135 135 + return 0; 136 136 + } 137 137 + EXPORT_SYMBOL_GPL(nvme_try_sched_reset); 122 138 123 139 int nvme_reset_ctrl(struct nvme_ctrl *ctrl) 124 140 { ··· 153 137 ret = nvme_reset_ctrl(ctrl); 154 138 if (!ret) { 155 139 flush_work(&ctrl->reset_work); 156 156 - if (ctrl->state != NVME_CTRL_LIVE && 157 157 - ctrl->state != NVME_CTRL_ADMIN_ONLY) 140 140 + if (ctrl->state != NVME_CTRL_LIVE) 158 141 ret = -ENETRESET; 159 142 } 160 143 ··· 330 315 331 316 old_state = ctrl->state; 332 317 switch (new_state) { 333 333 - case NVME_CTRL_ADMIN_ONLY: 334 334 - switch (old_state) { 335 335 - case NVME_CTRL_CONNECTING: 336 336 - changed = true; 337 337 - /* FALLTHRU */ 338 338 - default: 339 339 - break; 340 340 - } 341 341 - break; 342 318 case NVME_CTRL_LIVE: 343 319 switch (old_state) { 344 320 case NVME_CTRL_NEW: ··· 345 339 switch (old_state) { 346 340 case NVME_CTRL_NEW: 347 341 case NVME_CTRL_LIVE: 348 348 - case NVME_CTRL_ADMIN_ONLY: 349 342 changed = true; 350 343 /* FALLTHRU */ 351 344 default: ··· 364 359 case NVME_CTRL_DELETING: 365 360 switch (old_state) { 366 361 case NVME_CTRL_LIVE: 367 367 - case NVME_CTRL_ADMIN_ONLY: 368 362 case 
NVME_CTRL_RESETTING: 369 363 case NVME_CTRL_CONNECTING: 370 364 changed = true; ··· 385 381 break; 386 382 } 387 383 388 388 - if (changed) 384 384 + if (changed) { 389 385 ctrl->state = new_state; 386 386 + wake_up_all(&ctrl->state_wq); 387 387 + } 390 388 391 389 spin_unlock_irqrestore(&ctrl->lock, flags); 392 390 if (changed && ctrl->state == NVME_CTRL_LIVE) ··· 396 390 return changed; 397 391 } 398 392 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); 393 393 + 394 394 + /* 395 395 + * Returns true for sink states that can't ever transition back to live. 396 396 + */ 397 397 + static bool nvme_state_terminal(struct nvme_ctrl *ctrl) 398 398 + { 399 399 + switch (ctrl->state) { 400 400 + case NVME_CTRL_NEW: 401 401 + case NVME_CTRL_LIVE: 402 402 + case NVME_CTRL_RESETTING: 403 403 + case NVME_CTRL_CONNECTING: 404 404 + return false; 405 405 + case NVME_CTRL_DELETING: 406 406 + case NVME_CTRL_DEAD: 407 407 + return true; 408 408 + default: 409 409 + WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state); 410 410 + return true; 411 411 + } 412 412 + } 413 413 + 414 414 + /* 415 415 + * Waits for the controller state to be resetting, or returns false if it is 416 416 + * not possible to ever transition to that state. 
417 417 + */ 418 418 + bool nvme_wait_reset(struct nvme_ctrl *ctrl) 419 419 + { 420 420 + wait_event(ctrl->state_wq, 421 421 + nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING) || 422 422 + nvme_state_terminal(ctrl)); 423 423 + return ctrl->state == NVME_CTRL_RESETTING; 424 424 + } 425 425 + EXPORT_SYMBOL_GPL(nvme_wait_reset); 399 426 400 427 static void nvme_free_ns_head(struct kref *ref) 401 428 { ··· 1345 1306 if (ns->disk && nvme_revalidate_disk(ns->disk)) 1346 1307 nvme_set_queue_dying(ns); 1347 1308 up_read(&ctrl->namespaces_rwsem); 1348 1348 - 1349 1349 - nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); 1350 1309 } 1351 1310 1352 1311 static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) ··· 1360 1323 nvme_unfreeze(ctrl); 1361 1324 nvme_mpath_unfreeze(ctrl->subsys); 1362 1325 mutex_unlock(&ctrl->subsys->lock); 1326 1326 + nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL); 1363 1327 mutex_unlock(&ctrl->scan_lock); 1364 1328 } 1365 1329 if (effects & NVME_CMD_EFFECTS_CCC) ··· 2912 2874 2913 2875 switch (ctrl->state) { 2914 2876 case NVME_CTRL_LIVE: 2915 2915 - case NVME_CTRL_ADMIN_ONLY: 2916 2877 break; 2917 2878 default: 2918 2879 return -EWOULDBLOCK; ··· 3205 3168 static const char *const state_name[] = { 3206 3169 [NVME_CTRL_NEW] = "new", 3207 3170 [NVME_CTRL_LIVE] = "live", 3208 3208 - [NVME_CTRL_ADMIN_ONLY] = "only-admin", 3209 3171 [NVME_CTRL_RESETTING] = "resetting", 3210 3172 [NVME_CTRL_CONNECTING] = "connecting", 3211 3173 [NVME_CTRL_DELETING] = "deleting", ··· 3715 3679 struct nvme_id_ctrl *id; 3716 3680 unsigned nn; 3717 3681 3718 3718 - if (ctrl->state != NVME_CTRL_LIVE) 3682 3682 + /* No tagset on a live ctrl means IO queues could not created */ 3683 3683 + if (ctrl->state != NVME_CTRL_LIVE || !ctrl->tagset) 3719 3684 return; 3720 3720 - 3721 3721 - WARN_ON_ONCE(!ctrl->tagset); 3722 3685 3723 3686 if (test_and_clear_bit(NVME_AER_NOTICE_NS_CHANGED, &ctrl->events)) { 3724 3687 dev_info(ctrl->device, "rescanning namespaces.\n"); 
··· 3879 3844 if (time_after(jiffies, fw_act_timeout)) { 3880 3845 dev_warn(ctrl->device, 3881 3846 "Fw activation timeout, reset controller\n"); 3882 3882 - nvme_reset_ctrl(ctrl); 3883 3883 - break; 3847 3847 + nvme_try_sched_reset(ctrl); 3848 3848 + return; 3884 3849 } 3885 3850 msleep(100); 3886 3851 } 3887 3852 3888 3888 - if (ctrl->state != NVME_CTRL_LIVE) 3853 3853 + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) 3889 3854 return; 3890 3855 3891 3856 nvme_start_queues(ctrl); ··· 3905 3870 nvme_queue_scan(ctrl); 3906 3871 break; 3907 3872 case NVME_AER_NOTICE_FW_ACT_STARTING: 3908 3908 - queue_work(nvme_wq, &ctrl->fw_act_work); 3873 3873 + /* 3874 3874 + * We are (ab)using the RESETTING state to prevent subsequent 3875 3875 + * recovery actions from interfering with the controller's 3876 3876 + * firmware activation. 3877 3877 + */ 3878 3878 + if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) 3879 3879 + queue_work(nvme_wq, &ctrl->fw_act_work); 3909 3880 break; 3910 3881 #ifdef CONFIG_NVME_MULTIPATH 3911 3882 case NVME_AER_NOTICE_ANA: ··· 4034 3993 INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); 4035 3994 INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); 4036 3995 INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work); 3996 3996 + init_waitqueue_head(&ctrl->state_wq); 4037 3997 4038 3998 INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); 4039 3999 memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));

+1 -2

drivers/nvme/host/fabrics.h

reviewed

··· 182 182 static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq, 183 183 bool queue_live) 184 184 { 185 185 - if (likely(ctrl->state == NVME_CTRL_LIVE || 186 186 - ctrl->state == NVME_CTRL_ADMIN_ONLY)) 185 185 + if (likely(ctrl->state == NVME_CTRL_LIVE)) 187 186 return true; 188 187 return __nvmf_check_ready(ctrl, rq, queue_live); 189 188 }

+4 -1

drivers/nvme/host/nvme.h

reviewed

··· 15 15 #include <linux/sed-opal.h> 16 16 #include <linux/fault-inject.h> 17 17 #include <linux/rcupdate.h> 18 18 + #include <linux/wait.h> 18 19 19 20 #include <trace/events/block.h> 20 21 ··· 162 161 enum nvme_ctrl_state { 163 162 NVME_CTRL_NEW, 164 163 NVME_CTRL_LIVE, 165 165 - NVME_CTRL_ADMIN_ONLY, /* Only admin queue live */ 166 164 NVME_CTRL_RESETTING, 167 165 NVME_CTRL_CONNECTING, 168 166 NVME_CTRL_DELETING, ··· 199 199 struct cdev cdev; 200 200 struct work_struct reset_work; 201 201 struct work_struct delete_work; 202 202 + wait_queue_head_t state_wq; 202 203 203 204 struct nvme_subsystem *subsys; 204 205 struct list_head subsys_entry; ··· 450 449 bool nvme_cancel_request(struct request *req, void *data, bool reserved); 451 450 bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, 452 451 enum nvme_ctrl_state new_state); 452 452 + bool nvme_wait_reset(struct nvme_ctrl *ctrl); 453 453 int nvme_disable_ctrl(struct nvme_ctrl *ctrl); 454 454 int nvme_enable_ctrl(struct nvme_ctrl *ctrl); 455 455 int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); ··· 501 499 void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); 502 500 int nvme_reset_ctrl(struct nvme_ctrl *ctrl); 503 501 int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); 502 502 + int nvme_try_sched_reset(struct nvme_ctrl *ctrl); 504 503 int nvme_delete_ctrl(struct nvme_ctrl *ctrl); 505 504 506 505 int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,

+49 -34

drivers/nvme/host/pci.c

reviewed

··· 773 773 struct bio_vec *bv) 774 774 { 775 775 struct nvme_iod *iod = blk_mq_rq_to_pdu(req); 776 776 - unsigned int first_prp_len = dev->ctrl.page_size - bv->bv_offset; 776 776 + unsigned int offset = bv->bv_offset & (dev->ctrl.page_size - 1); 777 777 + unsigned int first_prp_len = dev->ctrl.page_size - offset; 777 778 778 779 iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0); 779 780 if (dma_mapping_error(dev->dev, iod->first_dma)) ··· 2264 2263 return true; 2265 2264 } 2266 2265 2267 2267 - /* 2268 2268 - * return error value only when tagset allocation failed 2269 2269 - */ 2270 2270 - static int nvme_dev_add(struct nvme_dev *dev) 2266 2266 + static void nvme_dev_add(struct nvme_dev *dev) 2271 2267 { 2272 2268 int ret; 2273 2269 ··· 2294 2296 if (ret) { 2295 2297 dev_warn(dev->ctrl.device, 2296 2298 "IO queues tagset allocation failed %d\n", ret); 2297 2297 - return ret; 2299 2299 + return; 2298 2300 } 2299 2301 dev->ctrl.tagset = &dev->tagset; 2300 2302 } else { ··· 2305 2307 } 2306 2308 2307 2309 nvme_dbbuf_set(dev); 2308 2308 - return 0; 2309 2310 } 2310 2311 2311 2312 static int nvme_pci_enable(struct nvme_dev *dev) ··· 2464 2467 mutex_unlock(&dev->shutdown_lock); 2465 2468 } 2466 2469 2470 2470 + static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown) 2471 2471 + { 2472 2472 + if (!nvme_wait_reset(&dev->ctrl)) 2473 2473 + return -EBUSY; 2474 2474 + nvme_dev_disable(dev, shutdown); 2475 2475 + return 0; 2476 2476 + } 2477 2477 + 2467 2478 static int nvme_setup_prp_pools(struct nvme_dev *dev) 2468 2479 { 2469 2480 dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, ··· 2495 2490 dma_pool_destroy(dev->prp_small_pool); 2496 2491 } 2497 2492 2493 2493 + static void nvme_free_tagset(struct nvme_dev *dev) 2494 2494 + { 2495 2495 + if (dev->tagset.tags) 2496 2496 + blk_mq_free_tag_set(&dev->tagset); 2497 2497 + dev->ctrl.tagset = NULL; 2498 2498 + } 2499 2499 + 2498 2500 static void nvme_pci_free_ctrl(struct nvme_ctrl 
*ctrl) 2499 2501 { 2500 2502 struct nvme_dev *dev = to_nvme_dev(ctrl); 2501 2503 2502 2504 nvme_dbbuf_dma_free(dev); 2503 2505 put_device(dev->dev); 2504 2504 - if (dev->tagset.tags) 2505 2505 - blk_mq_free_tag_set(&dev->tagset); 2506 2506 + nvme_free_tagset(dev); 2506 2507 if (dev->ctrl.admin_q) 2507 2508 blk_put_queue(dev->ctrl.admin_q); 2508 2509 kfree(dev->queues); ··· 2519 2508 2520 2509 static void nvme_remove_dead_ctrl(struct nvme_dev *dev) 2521 2510 { 2511 2511 + /* 2512 2512 + * Set state to deleting now to avoid blocking nvme_wait_reset(), which 2513 2513 + * may be holding this pci_dev's device lock. 2514 2514 + */ 2515 2515 + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); 2522 2516 nvme_get_ctrl(&dev->ctrl); 2523 2517 nvme_dev_disable(dev, false); 2524 2518 nvme_kill_queues(&dev->ctrl); ··· 2537 2521 container_of(work, struct nvme_dev, ctrl.reset_work); 2538 2522 bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); 2539 2523 int result; 2540 2540 - enum nvme_ctrl_state new_state = NVME_CTRL_LIVE; 2541 2524 2542 2525 if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING)) { 2543 2526 result = -ENODEV; ··· 2630 2615 dev_warn(dev->ctrl.device, "IO queues not created\n"); 2631 2616 nvme_kill_queues(&dev->ctrl); 2632 2617 nvme_remove_namespaces(&dev->ctrl); 2633 2633 - new_state = NVME_CTRL_ADMIN_ONLY; 2618 2618 + nvme_free_tagset(dev); 2634 2619 } else { 2635 2620 nvme_start_queues(&dev->ctrl); 2636 2621 nvme_wait_freeze(&dev->ctrl); 2637 2637 - /* hit this only when allocate tagset fails */ 2638 2638 - if (nvme_dev_add(dev)) 2639 2639 - new_state = NVME_CTRL_ADMIN_ONLY; 2622 2622 + nvme_dev_add(dev); 2640 2623 nvme_unfreeze(&dev->ctrl); 2641 2624 } 2642 2625 ··· 2642 2629 * If only admin queue live, keep it to do further investigation or 2643 2630 * recovery. 
2644 2631 */ 2645 2645 - if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) { 2632 2632 + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { 2646 2633 dev_warn(dev->ctrl.device, 2647 2647 - "failed to mark controller state %d\n", new_state); 2634 2634 + "failed to mark controller live state\n"); 2648 2635 result = -ENODEV; 2649 2636 goto out; 2650 2637 } ··· 2685 2672 2686 2673 static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) 2687 2674 { 2688 2688 - *val = readq(to_nvme_dev(ctrl)->bar + off); 2675 2675 + *val = lo_hi_readq(to_nvme_dev(ctrl)->bar + off); 2689 2676 return 0; 2690 2677 } 2691 2678 ··· 2849 2836 static void nvme_reset_prepare(struct pci_dev *pdev) 2850 2837 { 2851 2838 struct nvme_dev *dev = pci_get_drvdata(pdev); 2852 2852 - nvme_dev_disable(dev, false); 2839 2839 + 2840 2840 + /* 2841 2841 + * We don't need to check the return value from waiting for the reset 2842 2842 + * state as pci_dev device lock is held, making it impossible to race 2843 2843 + * with ->remove(). 
2844 2844 + */ 2845 2845 + nvme_disable_prepare_reset(dev, false); 2846 2846 + nvme_sync_queues(&dev->ctrl); 2853 2847 } 2854 2848 2855 2849 static void nvme_reset_done(struct pci_dev *pdev) 2856 2850 { 2857 2851 struct nvme_dev *dev = pci_get_drvdata(pdev); 2858 2858 - nvme_reset_ctrl_sync(&dev->ctrl); 2852 2852 + 2853 2853 + if (!nvme_try_sched_reset(&dev->ctrl)) 2854 2854 + flush_work(&dev->ctrl.reset_work); 2859 2855 } 2860 2856 2861 2857 static void nvme_shutdown(struct pci_dev *pdev) 2862 2858 { 2863 2859 struct nvme_dev *dev = pci_get_drvdata(pdev); 2864 2864 - nvme_dev_disable(dev, true); 2860 2860 + nvme_disable_prepare_reset(dev, true); 2865 2861 } 2866 2862 2867 2863 /* ··· 2923 2901 2924 2902 if (ndev->last_ps == U32_MAX || 2925 2903 nvme_set_power_state(ctrl, ndev->last_ps) != 0) 2926 2926 - nvme_reset_ctrl(ctrl); 2904 2904 + return nvme_try_sched_reset(&ndev->ctrl); 2927 2905 return 0; 2928 2906 } 2929 2907 ··· 2951 2929 */ 2952 2930 if (pm_suspend_via_firmware() || !ctrl->npss || 2953 2931 !pcie_aspm_enabled(pdev) || 2954 2954 - (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND)) { 2955 2955 - nvme_dev_disable(ndev, true); 2956 2956 - return 0; 2957 2957 - } 2932 2932 + (ndev->ctrl.quirks & NVME_QUIRK_SIMPLE_SUSPEND)) 2933 2933 + return nvme_disable_prepare_reset(ndev, true); 2958 2934 2959 2935 nvme_start_freeze(ctrl); 2960 2936 nvme_wait_freeze(ctrl); 2961 2937 nvme_sync_queues(ctrl); 2962 2938 2963 2963 - if (ctrl->state != NVME_CTRL_LIVE && 2964 2964 - ctrl->state != NVME_CTRL_ADMIN_ONLY) 2939 2939 + if (ctrl->state != NVME_CTRL_LIVE) 2965 2940 goto unfreeze; 2966 2941 2967 2942 ret = nvme_get_power_state(ctrl, &ndev->last_ps); ··· 2984 2965 * Clearing npss forces a controller reset on resume. The 2985 2966 * correct value will be resdicovered then. 
2986 2967 */ 2987 2987 - nvme_dev_disable(ndev, true); 2968 2968 + ret = nvme_disable_prepare_reset(ndev, true); 2988 2969 ctrl->npss = 0; 2989 2989 - ret = 0; 2990 2970 } 2991 2971 unfreeze: 2992 2972 nvme_unfreeze(ctrl); ··· 2995 2977 static int nvme_simple_suspend(struct device *dev) 2996 2978 { 2997 2979 struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev)); 2998 2998 - 2999 2999 - nvme_dev_disable(ndev, true); 3000 3000 - return 0; 2980 2980 + return nvme_disable_prepare_reset(ndev, true); 3001 2981 } 3002 2982 3003 2983 static int nvme_simple_resume(struct device *dev) ··· 3003 2987 struct pci_dev *pdev = to_pci_dev(dev); 3004 2988 struct nvme_dev *ndev = pci_get_drvdata(pdev); 3005 2989 3006 3006 - nvme_reset_ctrl(&ndev->ctrl); 3007 3007 - return 0; 2990 2990 + return nvme_try_sched_reset(&ndev->ctrl); 3008 2991 } 3009 2992 3010 2993 static const struct dev_pm_ops nvme_dev_pm_ops = {

drivers/nvme/host/rdma.c

reviewed

··· 1701 1701 dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n", 1702 1702 rq->tag, nvme_rdma_queue_idx(queue)); 1703 1703 1704 1704 + /* 1705 1705 + * Restart the timer if a controller reset is already scheduled. Any 1706 1706 + * timed out commands would be handled before entering the connecting 1707 1707 + * state. 1708 1708 + */ 1709 1709 + if (ctrl->ctrl.state == NVME_CTRL_RESETTING) 1710 1710 + return BLK_EH_RESET_TIMER; 1711 1711 + 1704 1712 if (ctrl->ctrl.state != NVME_CTRL_LIVE) { 1705 1713 /* 1706 1714 * Teardown immediately if controller times out while starting

+11

drivers/nvme/host/tcp.c

reviewed

··· 1386 1386 queue->sock->sk->sk_data_ready = nvme_tcp_data_ready; 1387 1387 queue->sock->sk->sk_state_change = nvme_tcp_state_change; 1388 1388 queue->sock->sk->sk_write_space = nvme_tcp_write_space; 1389 1389 + #ifdef CONFIG_NET_RX_BUSY_POLL 1389 1390 queue->sock->sk->sk_ll_usec = 1; 1391 1391 + #endif 1390 1392 write_unlock_bh(&queue->sock->sk->sk_callback_lock); 1391 1393 1392 1394 return 0; ··· 2046 2044 struct nvme_tcp_ctrl *ctrl = req->queue->ctrl; 2047 2045 struct nvme_tcp_cmd_pdu *pdu = req->pdu; 2048 2046 2047 2047 + /* 2048 2048 + * Restart the timer if a controller reset is already scheduled. Any 2049 2049 + * timed out commands would be handled before entering the connecting 2050 2050 + * state. 2051 2051 + */ 2052 2052 + if (ctrl->ctrl.state == NVME_CTRL_RESETTING) 2053 2053 + return BLK_EH_RESET_TIMER; 2054 2054 + 2049 2055 dev_warn(ctrl->ctrl.device, 2050 2056 "queue %d: timeout request %#x type %d\n", 2051 2057 nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type); ··· 2136 2126 2137 2127 ret = nvme_tcp_map_data(queue, rq); 2138 2128 if (unlikely(ret)) { 2129 2129 + nvme_cleanup_cmd(rq); 2139 2130 dev_err(queue->ctrl->ctrl.device, 2140 2131 "Failed to map data (%d)\n", ret); 2141 2132 return ret;

+3 -1

drivers/nvme/target/loop.c

reviewed

··· 157 157 iod->sg_table.sgl = iod->first_sgl; 158 158 if (sg_alloc_table_chained(&iod->sg_table, 159 159 blk_rq_nr_phys_segments(req), 160 160 - iod->sg_table.sgl, SG_CHUNK_SIZE)) 160 160 + iod->sg_table.sgl, SG_CHUNK_SIZE)) { 161 161 + nvme_cleanup_cmd(req); 161 162 return BLK_STS_RESOURCE; 163 163 + } 162 164 163 165 iod->req.sg = iod->sg_table.sgl; 164 166 iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl);

+60 -24

fs/io_uring.c

reviewed

··· 322 322 #define REQ_F_FAIL_LINK 256 /* fail rest of links */ 323 323 #define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */ 324 324 #define REQ_F_TIMEOUT 1024 /* timeout request */ 325 325 + #define REQ_F_ISREG 2048 /* regular file */ 326 326 + #define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ 325 327 u64 user_data; 326 328 u32 result; 327 329 u32 sequence; ··· 916 914 return ret; 917 915 } 918 916 919 919 - static void kiocb_end_write(struct kiocb *kiocb) 917 917 + static void kiocb_end_write(struct io_kiocb *req) 920 918 { 921 921 - if (kiocb->ki_flags & IOCB_WRITE) { 922 922 - struct inode *inode = file_inode(kiocb->ki_filp); 919 919 + /* 920 920 + * Tell lockdep we inherited freeze protection from submission 921 921 + * thread. 922 922 + */ 923 923 + if (req->flags & REQ_F_ISREG) { 924 924 + struct inode *inode = file_inode(req->file); 923 925 924 924 - /* 925 925 - * Tell lockdep we inherited freeze protection from submission 926 926 - * thread. 927 927 - */ 928 928 - if (S_ISREG(inode->i_mode)) 929 929 - __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); 930 930 - file_end_write(kiocb->ki_filp); 926 926 + __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); 931 927 } 928 928 + file_end_write(req->file); 932 929 } 933 930 934 931 static void io_complete_rw(struct kiocb *kiocb, long res, long res2) 935 932 { 936 933 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); 937 934 938 938 - kiocb_end_write(kiocb); 935 935 + if (kiocb->ki_flags & IOCB_WRITE) 936 936 + kiocb_end_write(req); 939 937 940 938 if ((req->flags & REQ_F_LINK) && res != req->result) 941 939 req->flags |= REQ_F_FAIL_LINK; ··· 947 945 { 948 946 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); 949 947 950 950 - kiocb_end_write(kiocb); 948 948 + if (kiocb->ki_flags & IOCB_WRITE) 949 949 + kiocb_end_write(req); 951 950 952 951 if ((req->flags & REQ_F_LINK) && res != req->result) 953 952 req->flags |= REQ_F_FAIL_LINK; ··· 1062 1059 if 
(!req->file) 1063 1060 return -EBADF; 1064 1061 1065 1065 - if (force_nonblock && !io_file_supports_async(req->file)) 1066 1066 - force_nonblock = false; 1062 1062 + if (S_ISREG(file_inode(req->file)->i_mode)) 1063 1063 + req->flags |= REQ_F_ISREG; 1064 1064 + 1065 1065 + /* 1066 1066 + * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so 1067 1067 + * we know to async punt it even if it was opened O_NONBLOCK 1068 1068 + */ 1069 1069 + if (force_nonblock && !io_file_supports_async(req->file)) { 1070 1070 + req->flags |= REQ_F_MUST_PUNT; 1071 1071 + return -EAGAIN; 1072 1072 + } 1067 1073 1068 1074 kiocb->ki_pos = READ_ONCE(sqe->off); 1069 1075 kiocb->ki_flags = iocb_flags(kiocb->ki_filp); ··· 1093 1081 return ret; 1094 1082 1095 1083 /* don't allow async punt if RWF_NOWAIT was requested */ 1096 1096 - if (kiocb->ki_flags & IOCB_NOWAIT) 1084 1084 + if ((kiocb->ki_flags & IOCB_NOWAIT) || 1085 1085 + (req->file->f_flags & O_NONBLOCK)) 1097 1086 req->flags |= REQ_F_NOWAIT; 1098 1087 1099 1088 if (force_nonblock) ··· 1395 1382 * need async punt anyway, so it's more efficient to do it 1396 1383 * here. 1397 1384 */ 1398 1398 - if (force_nonblock && ret2 > 0 && ret2 < read_size) 1385 1385 + if (force_nonblock && !(req->flags & REQ_F_NOWAIT) && 1386 1386 + (req->flags & REQ_F_ISREG) && 1387 1387 + ret2 > 0 && ret2 < read_size) 1399 1388 ret2 = -EAGAIN; 1400 1389 /* Catch -EAGAIN return for forced non-blocking submission */ 1401 1390 if (!force_nonblock || ret2 != -EAGAIN) { ··· 1462 1447 * released so that it doesn't complain about the held lock when 1463 1448 * we return to userspace. 
1464 1449 */ 1465 1465 - if (S_ISREG(file_inode(file)->i_mode)) { 1450 1450 + if (req->flags & REQ_F_ISREG) { 1466 1451 __sb_start_write(file_inode(file)->i_sb, 1467 1452 SB_FREEZE_WRITE, true); 1468 1453 __sb_writers_release(file_inode(file)->i_sb, ··· 1899 1884 1900 1885 static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) 1901 1886 { 1902 1902 - unsigned count, req_dist, tail_index; 1887 1887 + unsigned count; 1903 1888 struct io_ring_ctx *ctx = req->ctx; 1904 1889 struct list_head *entry; 1905 1890 struct timespec64 ts; ··· 1922 1907 count = 1; 1923 1908 1924 1909 req->sequence = ctx->cached_sq_head + count - 1; 1910 1910 + /* reuse it to store the count */ 1911 1911 + req->submit.sequence = count; 1925 1912 req->flags |= REQ_F_TIMEOUT; 1926 1913 1927 1914 /* 1928 1915 * Insertion sort, ensuring the first entry in the list is always 1929 1916 * the one we need first. 1930 1917 */ 1931 1931 - tail_index = ctx->cached_cq_tail - ctx->rings->sq_dropped; 1932 1932 - req_dist = req->sequence - tail_index; 1933 1918 spin_lock_irq(&ctx->completion_lock); 1934 1919 list_for_each_prev(entry, &ctx->timeout_list) { 1935 1920 struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); 1936 1936 - unsigned dist; 1921 1921 + unsigned nxt_sq_head; 1922 1922 + long long tmp, tmp_nxt; 1937 1923 1938 1938 - dist = nxt->sequence - tail_index; 1939 1939 - if (req_dist >= dist) 1924 1924 + /* 1925 1925 + * Since cached_sq_head + count - 1 can overflow, use type long 1926 1926 + * long to store it. 1927 1927 + */ 1928 1928 + tmp = (long long)ctx->cached_sq_head + count - 1; 1929 1929 + nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1; 1930 1930 + tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1; 1931 1931 + 1932 1932 + /* 1933 1933 + * cached_sq_head may overflow, and it will never overflow twice 1934 1934 + * once there is some timeout req still be valid. 
1935 1935 + */ 1936 1936 + if (ctx->cached_sq_head < nxt_sq_head) 1937 1937 + tmp += UINT_MAX; 1938 1938 + 1939 1939 + if (tmp >= tmp_nxt) 1940 1940 break; 1941 1941 } 1942 1942 list_add(&req->list, entry); ··· 2297 2267 int ret; 2298 2268 2299 2269 ret = __io_submit_sqe(ctx, req, s, force_nonblock); 2300 2300 - if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { 2270 2270 + 2271 2271 + /* 2272 2272 + * We async punt it if the file wasn't marked NOWAIT, or if the file 2273 2273 + * doesn't support non-blocking read/write attempts 2274 2274 + */ 2275 2275 + if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || 2276 2276 + (req->flags & REQ_F_MUST_PUNT))) { 2301 2277 struct io_uring_sqe *sqe_copy; 2302 2278 2303 2279 sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL);