Merge tag 'for-linus-20190524' of git://git.kernel.dk/linux-block

tjh.dev / kernel

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

fork

Configure Feed

Issues Pull Requests Commits Tags

Feed URL

Select the types of activity you want to include in your feed.

Merge tag 'for-linus-20190524' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe:

- NVMe pull request from Keith, with fixes from a few folks.

- bio and sbitmap before atomic barrier fixes (Andrea)

- Hang fix for blk-mq freeze and unfreeze (Bob)

- Single segment count regression fix (Christoph)

- AoE now has a new maintainer

- tools/io_uring/ Makefile fix, and sync with liburing (me)

* tag 'for-linus-20190524' of git://git.kernel.dk/linux-block: (23 commits)
tools/io_uring: sync with liburing
tools/io_uring: fix Makefile for pthread library link
blk-mq: fix hang caused by freeze/unfreeze sequence
block: remove the bi_seg_{front,back}_size fields in struct bio
block: remove the segment size check in bio_will_gap
block: force an unlimited segment size on queues with a virt boundary
block: don't decrement nr_phys_segments for physically contigous segments
sbitmap: fix improper use of smp_mb__before_atomic()
bio: fix improper use of smp_mb__before_atomic()
aoe: list new maintainer for aoe driver
nvme-pci: use blk-mq mapping for unmanaged irqs
nvme: update MAINTAINERS
nvme: copy MTFA field from identify controller
nvme: fix memory leak for power latency tolerance
nvme: release namespace SRCU protection before performing controller ioctls
nvme: merge nvme_ns_ioctl into nvme_ioctl
nvme: remove the ifdef around nvme_nvm_ioctl
nvme: fix srcu locking on error return in nvme_get_ns_from_disk
nvme: Fix known effects
nvme-pci: Sync queues on reset
...

Linus Torvalds 7 years ago 7fbc78e3 7f8b40e3

+243 -248

18 changed files

expand all collapse all

MAINTAINERS

block

blk-core.c

blk-merge.c

blk-mq.c

blk-settings.c

drivers

nvme

host

core.c

nvme.h

pci.c

include

linux

bio.h

blk_types.h

blkdev.h

lib

sbitmap.c

tools

io_uring

Makefile

io_uring-cp.c

liburing.h

queue.c

setup.c

syscall.c

+2 -2

MAINTAINERS

reviewed

··· 2627 2627 F: drivers/misc/eeprom/at24.c 2628 2628 2629 2629 ATA OVER ETHERNET (AOE) DRIVER 2630 2630 - M: "Ed L. Cashin" <ed.cashin@acm.org> 2630 2630 + M: "Justin Sanders" <justin@coraid.com> 2631 2631 W: http://www.openaoe.org/ 2632 2632 S: Supported 2633 2633 F: Documentation/aoe/ ··· 11226 11226 F: drivers/video/fbdev/nvidia/ 11227 11227 11228 11228 NVM EXPRESS DRIVER 11229 11229 - M: Keith Busch <keith.busch@intel.com> 11229 11229 + M: Keith Busch <kbusch@kernel.org> 11230 11230 M: Jens Axboe <axboe@fb.com> 11231 11231 M: Christoph Hellwig <hch@lst.de> 11232 11232 M: Sagi Grimberg <sagi@grimberg.me>

+2 -1

block/blk-core.c

reviewed

··· 413 413 smp_rmb(); 414 414 415 415 wait_event(q->mq_freeze_wq, 416 416 - (atomic_read(&q->mq_freeze_depth) == 0 && 416 416 + (!q->mq_freeze_depth && 417 417 (pm || (blk_pm_request_resume(q), 418 418 !blk_queue_pm_only(q)))) || 419 419 blk_queue_dying(q)); ··· 503 503 spin_lock_init(&q->queue_lock); 504 504 505 505 init_waitqueue_head(&q->mq_freeze_wq); 506 506 + mutex_init(&q->mq_freeze_lock); 506 507 507 508 /* 508 509 * Init percpu_ref in atomic mode so that it's faster to shutdown.

+13 -121

block/blk-merge.c

reviewed

··· 12 12 13 13 #include "blk.h" 14 14 15 15 - /* 16 16 - * Check if the two bvecs from two bios can be merged to one segment. If yes, 17 17 - * no need to check gap between the two bios since the 1st bio and the 1st bvec 18 18 - * in the 2nd bio can be handled in one segment. 19 19 - */ 20 20 - static inline bool bios_segs_mergeable(struct request_queue *q, 21 21 - struct bio *prev, struct bio_vec *prev_last_bv, 22 22 - struct bio_vec *next_first_bv) 23 23 - { 24 24 - if (!biovec_phys_mergeable(q, prev_last_bv, next_first_bv)) 25 25 - return false; 26 26 - if (prev->bi_seg_back_size + next_first_bv->bv_len > 27 27 - queue_max_segment_size(q)) 28 28 - return false; 29 29 - return true; 30 30 - } 31 31 - 32 15 static inline bool bio_will_gap(struct request_queue *q, 33 16 struct request *prev_rq, struct bio *prev, struct bio *next) 34 17 { ··· 43 60 */ 44 61 bio_get_last_bvec(prev, &pb); 45 62 bio_get_first_bvec(next, &nb); 46 46 - if (bios_segs_mergeable(q, prev, &pb, &nb)) 63 63 + if (biovec_phys_mergeable(q, &pb, &nb)) 47 64 return false; 48 65 return __bvec_gap_to_prev(q, &pb, nb.bv_offset); 49 66 } ··· 162 179 * variables. 
163 180 */ 164 181 static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv, 165 165 - unsigned *nsegs, unsigned *last_seg_size, 166 166 - unsigned *front_seg_size, unsigned *sectors, unsigned max_segs) 182 182 + unsigned *nsegs, unsigned *sectors, unsigned max_segs) 167 183 { 168 184 unsigned len = bv->bv_len; 169 185 unsigned total_len = 0; ··· 184 202 break; 185 203 } 186 204 187 187 - if (!new_nsegs) 188 188 - return !!len; 189 189 - 190 190 - /* update front segment size */ 191 191 - if (!*nsegs) { 192 192 - unsigned first_seg_size; 193 193 - 194 194 - if (new_nsegs == 1) 195 195 - first_seg_size = get_max_segment_size(q, bv->bv_offset); 196 196 - else 197 197 - first_seg_size = queue_max_segment_size(q); 198 198 - 199 199 - if (*front_seg_size < first_seg_size) 200 200 - *front_seg_size = first_seg_size; 205 205 + if (new_nsegs) { 206 206 + *nsegs += new_nsegs; 207 207 + if (sectors) 208 208 + *sectors += total_len >> 9; 201 209 } 202 202 - 203 203 - /* update other varibles */ 204 204 - *last_seg_size = seg_size; 205 205 - *nsegs += new_nsegs; 206 206 - if (sectors) 207 207 - *sectors += total_len >> 9; 208 210 209 211 /* split in the middle of the bvec if len != 0 */ 210 212 return !!len; ··· 201 235 { 202 236 struct bio_vec bv, bvprv, *bvprvp = NULL; 203 237 struct bvec_iter iter; 204 204 - unsigned seg_size = 0, nsegs = 0, sectors = 0; 205 205 - unsigned front_seg_size = bio->bi_seg_front_size; 238 238 + unsigned nsegs = 0, sectors = 0; 206 239 bool do_split = true; 207 240 struct bio *new = NULL; 208 241 const unsigned max_sectors = get_max_io_size(q, bio); ··· 225 260 /* split in the middle of bvec */ 226 261 bv.bv_len = (max_sectors - sectors) << 9; 227 262 bvec_split_segs(q, &bv, &nsegs, 228 228 - &seg_size, 229 229 - &front_seg_size, 230 263 &sectors, max_segs); 231 264 } 232 265 goto split; ··· 238 275 239 276 if (bv.bv_offset + bv.bv_len <= PAGE_SIZE) { 240 277 nsegs++; 241 241 - seg_size = bv.bv_len; 242 278 sectors += bv.bv_len >> 
9; 243 243 - if (nsegs == 1 && seg_size > front_seg_size) 244 244 - front_seg_size = seg_size; 245 245 - } else if (bvec_split_segs(q, &bv, &nsegs, &seg_size, 246 246 - &front_seg_size, &sectors, max_segs)) { 279 279 + } else if (bvec_split_segs(q, &bv, &nsegs, &sectors, 280 280 + max_segs)) { 247 281 goto split; 248 282 } 249 283 } ··· 254 294 if (new) 255 295 bio = new; 256 296 } 257 257 - 258 258 - bio->bi_seg_front_size = front_seg_size; 259 259 - if (seg_size > bio->bi_seg_back_size) 260 260 - bio->bi_seg_back_size = seg_size; 261 297 262 298 return do_split ? new : NULL; 263 299 } ··· 309 353 static unsigned int __blk_recalc_rq_segments(struct request_queue *q, 310 354 struct bio *bio) 311 355 { 312 312 - struct bio_vec uninitialized_var(bv), bvprv = { NULL }; 313 313 - unsigned int seg_size, nr_phys_segs; 314 314 - unsigned front_seg_size; 315 315 - struct bio *fbio, *bbio; 356 356 + unsigned int nr_phys_segs = 0; 316 357 struct bvec_iter iter; 317 317 - bool new_bio = false; 358 358 + struct bio_vec bv; 318 359 319 360 if (!bio) 320 361 return 0; 321 321 - 322 322 - front_seg_size = bio->bi_seg_front_size; 323 362 324 363 switch (bio_op(bio)) { 325 364 case REQ_OP_DISCARD: ··· 325 374 return 1; 326 375 } 327 376 328 328 - fbio = bio; 329 329 - seg_size = 0; 330 330 - nr_phys_segs = 0; 331 377 for_each_bio(bio) { 332 332 - bio_for_each_bvec(bv, bio, iter) { 333 333 - if (new_bio) { 334 334 - if (seg_size + bv.bv_len 335 335 - > queue_max_segment_size(q)) 336 336 - goto new_segment; 337 337 - if (!biovec_phys_mergeable(q, &bvprv, &bv)) 338 338 - goto new_segment; 339 339 - 340 340 - seg_size += bv.bv_len; 341 341 - 342 342 - if (nr_phys_segs == 1 && seg_size > 343 343 - front_seg_size) 344 344 - front_seg_size = seg_size; 345 345 - 346 346 - continue; 347 347 - } 348 348 - new_segment: 349 349 - bvec_split_segs(q, &bv, &nr_phys_segs, &seg_size, 350 350 - &front_seg_size, NULL, UINT_MAX); 351 351 - new_bio = false; 352 352 - } 353 353 - bbio = bio; 354 354 - 
if (likely(bio->bi_iter.bi_size)) { 355 355 - bvprv = bv; 356 356 - new_bio = true; 357 357 - } 378 378 + bio_for_each_bvec(bv, bio, iter) 379 379 + bvec_split_segs(q, &bv, &nr_phys_segs, NULL, UINT_MAX); 358 380 } 359 359 - 360 360 - fbio->bi_seg_front_size = front_seg_size; 361 361 - if (seg_size > bbio->bi_seg_back_size) 362 362 - bbio->bi_seg_back_size = seg_size; 363 381 364 382 return nr_phys_segs; 365 383 } ··· 347 427 bio->bi_next = nxt; 348 428 349 429 bio_set_flag(bio, BIO_SEG_VALID); 350 350 - } 351 351 - 352 352 - static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, 353 353 - struct bio *nxt) 354 354 - { 355 355 - struct bio_vec end_bv = { NULL }, nxt_bv; 356 356 - 357 357 - if (bio->bi_seg_back_size + nxt->bi_seg_front_size > 358 358 - queue_max_segment_size(q)) 359 359 - return 0; 360 360 - 361 361 - if (!bio_has_data(bio)) 362 362 - return 1; 363 363 - 364 364 - bio_get_last_bvec(bio, &end_bv); 365 365 - bio_get_first_bvec(nxt, &nxt_bv); 366 366 - 367 367 - return biovec_phys_mergeable(q, &end_bv, &nxt_bv); 368 430 } 369 431 370 432 static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, ··· 608 706 struct request *next) 609 707 { 610 708 int total_phys_segments; 611 611 - unsigned int seg_size = 612 612 - req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size; 613 709 614 710 if (req_gap_back_merge(req, next->bio)) 615 711 return 0; ··· 620 720 return 0; 621 721 622 722 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; 623 623 - if (blk_phys_contig_segment(q, req->biotail, next->bio)) { 624 624 - if (req->nr_phys_segments == 1) 625 625 - req->bio->bi_seg_front_size = seg_size; 626 626 - if (next->nr_phys_segments == 1) 627 627 - next->biotail->bi_seg_back_size = seg_size; 628 628 - total_phys_segments--; 629 629 - } 630 630 - 631 723 if (total_phys_segments > queue_max_segments(q)) 632 724 return 0; 633 725

+10 -9

block/blk-mq.c

reviewed

··· 144 144 145 145 void blk_freeze_queue_start(struct request_queue *q) 146 146 { 147 147 - int freeze_depth; 148 148 - 149 149 - freeze_depth = atomic_inc_return(&q->mq_freeze_depth); 150 150 - if (freeze_depth == 1) { 147 147 + mutex_lock(&q->mq_freeze_lock); 148 148 + if (++q->mq_freeze_depth == 1) { 151 149 percpu_ref_kill(&q->q_usage_counter); 150 150 + mutex_unlock(&q->mq_freeze_lock); 152 151 if (queue_is_mq(q)) 153 152 blk_mq_run_hw_queues(q, false); 153 153 + } else { 154 154 + mutex_unlock(&q->mq_freeze_lock); 154 155 } 155 156 } 156 157 EXPORT_SYMBOL_GPL(blk_freeze_queue_start); ··· 200 199 201 200 void blk_mq_unfreeze_queue(struct request_queue *q) 202 201 { 203 203 - int freeze_depth; 204 204 - 205 205 - freeze_depth = atomic_dec_return(&q->mq_freeze_depth); 206 206 - WARN_ON_ONCE(freeze_depth < 0); 207 207 - if (!freeze_depth) { 202 202 + mutex_lock(&q->mq_freeze_lock); 203 203 + q->mq_freeze_depth--; 204 204 + WARN_ON_ONCE(q->mq_freeze_depth < 0); 205 205 + if (!q->mq_freeze_depth) { 208 206 percpu_ref_resurrect(&q->q_usage_counter); 209 207 wake_up_all(&q->mq_freeze_wq); 210 208 } 209 209 + mutex_unlock(&q->mq_freeze_lock); 211 210 } 212 211 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); 213 212

+11

block/blk-settings.c

reviewed

··· 310 310 __func__, max_size); 311 311 } 312 312 313 313 + /* see blk_queue_virt_boundary() for the explanation */ 314 314 + WARN_ON_ONCE(q->limits.virt_boundary_mask); 315 315 + 313 316 q->limits.max_segment_size = max_size; 314 317 } 315 318 EXPORT_SYMBOL(blk_queue_max_segment_size); ··· 745 742 void blk_queue_virt_boundary(struct request_queue *q, unsigned long mask) 746 743 { 747 744 q->limits.virt_boundary_mask = mask; 745 745 + 746 746 + /* 747 747 + * Devices that require a virtual boundary do not support scatter/gather 748 748 + * I/O natively, but instead require a descriptor list entry for each 749 749 + * page (which might not be identical to the Linux PAGE_SIZE). Because 750 750 + * of that they are not limited by our notion of "segment size". 751 751 + */ 752 752 + q->limits.max_segment_size = UINT_MAX; 748 753 } 749 754 EXPORT_SYMBOL(blk_queue_virt_boundary); 750 755

+62 -29

drivers/nvme/host/core.c

reviewed

··· 1257 1257 return 0; 1258 1258 } 1259 1259 1260 1260 - effects |= nvme_known_admin_effects(opcode); 1261 1260 if (ctrl->effects) 1262 1261 effects = le32_to_cpu(ctrl->effects->acs[opcode]); 1262 1262 + effects |= nvme_known_admin_effects(opcode); 1263 1263 1264 1264 /* 1265 1265 * For simplicity, IO to all namespaces is quiesced even if the command ··· 1361 1361 { 1362 1362 #ifdef CONFIG_NVME_MULTIPATH 1363 1363 if (disk->fops == &nvme_ns_head_ops) { 1364 1364 + struct nvme_ns *ns; 1365 1365 + 1364 1366 *head = disk->private_data; 1365 1367 *srcu_idx = srcu_read_lock(&(*head)->srcu); 1366 1366 - return nvme_find_path(*head); 1368 1368 + ns = nvme_find_path(*head); 1369 1369 + if (!ns) 1370 1370 + srcu_read_unlock(&(*head)->srcu, *srcu_idx); 1371 1371 + return ns; 1367 1372 } 1368 1373 #endif 1369 1374 *head = NULL; ··· 1382 1377 srcu_read_unlock(&head->srcu, idx); 1383 1378 } 1384 1379 1385 1385 - static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned cmd, unsigned long arg) 1386 1386 - { 1387 1387 - switch (cmd) { 1388 1388 - case NVME_IOCTL_ID: 1389 1389 - force_successful_syscall_return(); 1390 1390 - return ns->head->ns_id; 1391 1391 - case NVME_IOCTL_ADMIN_CMD: 1392 1392 - return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg); 1393 1393 - case NVME_IOCTL_IO_CMD: 1394 1394 - return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg); 1395 1395 - case NVME_IOCTL_SUBMIT_IO: 1396 1396 - return nvme_submit_io(ns, (void __user *)arg); 1397 1397 - default: 1398 1398 - #ifdef CONFIG_NVM 1399 1399 - if (ns->ndev) 1400 1400 - return nvme_nvm_ioctl(ns, cmd, arg); 1401 1401 - #endif 1402 1402 - if (is_sed_ioctl(cmd)) 1403 1403 - return sed_ioctl(ns->ctrl->opal_dev, cmd, 1404 1404 - (void __user *) arg); 1405 1405 - return -ENOTTY; 1406 1406 - } 1407 1407 - } 1408 1408 - 1409 1380 static int nvme_ioctl(struct block_device *bdev, fmode_t mode, 1410 1381 unsigned int cmd, unsigned long arg) 1411 1382 { 1412 1383 struct nvme_ns_head *head = NULL; 1384 1384 + void __user 
*argp = (void __user *)arg; 1413 1385 struct nvme_ns *ns; 1414 1386 int srcu_idx, ret; 1415 1387 1416 1388 ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); 1417 1389 if (unlikely(!ns)) 1418 1418 - ret = -EWOULDBLOCK; 1419 1419 - else 1420 1420 - ret = nvme_ns_ioctl(ns, cmd, arg); 1390 1390 + return -EWOULDBLOCK; 1391 1391 + 1392 1392 + /* 1393 1393 + * Handle ioctls that apply to the controller instead of the namespace 1394 1394 + * seperately and drop the ns SRCU reference early. This avoids a 1395 1395 + * deadlock when deleting namespaces using the passthrough interface. 1396 1396 + */ 1397 1397 + if (cmd == NVME_IOCTL_ADMIN_CMD || is_sed_ioctl(cmd)) { 1398 1398 + struct nvme_ctrl *ctrl = ns->ctrl; 1399 1399 + 1400 1400 + nvme_get_ctrl(ns->ctrl); 1401 1401 + nvme_put_ns_from_disk(head, srcu_idx); 1402 1402 + 1403 1403 + if (cmd == NVME_IOCTL_ADMIN_CMD) 1404 1404 + ret = nvme_user_cmd(ctrl, NULL, argp); 1405 1405 + else 1406 1406 + ret = sed_ioctl(ctrl->opal_dev, cmd, argp); 1407 1407 + 1408 1408 + nvme_put_ctrl(ctrl); 1409 1409 + return ret; 1410 1410 + } 1411 1411 + 1412 1412 + switch (cmd) { 1413 1413 + case NVME_IOCTL_ID: 1414 1414 + force_successful_syscall_return(); 1415 1415 + ret = ns->head->ns_id; 1416 1416 + break; 1417 1417 + case NVME_IOCTL_IO_CMD: 1418 1418 + ret = nvme_user_cmd(ns->ctrl, ns, argp); 1419 1419 + break; 1420 1420 + case NVME_IOCTL_SUBMIT_IO: 1421 1421 + ret = nvme_submit_io(ns, argp); 1422 1422 + break; 1423 1423 + default: 1424 1424 + if (ns->ndev) 1425 1425 + ret = nvme_nvm_ioctl(ns, cmd, arg); 1426 1426 + else 1427 1427 + ret = -ENOTTY; 1428 1428 + } 1429 1429 + 1421 1430 nvme_put_ns_from_disk(head, srcu_idx); 1422 1431 return ret; 1423 1432 } ··· 2576 2557 2577 2558 ctrl->oacs = le16_to_cpu(id->oacs); 2578 2559 ctrl->oncs = le16_to_cpu(id->oncs); 2560 2560 + ctrl->mtfa = le16_to_cpu(id->mtfa); 2579 2561 ctrl->oaes = le32_to_cpu(id->oaes); 2580 2562 atomic_set(&ctrl->abort_limit, id->acl + 1); 2581 2563 ctrl->vwc = 
id->vwc; ··· 3701 3681 3702 3682 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) 3703 3683 { 3684 3684 + dev_pm_qos_hide_latency_tolerance(ctrl->device); 3704 3685 cdev_device_del(&ctrl->cdev, ctrl->device); 3705 3686 } 3706 3687 EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); ··· 3900 3879 up_read(&ctrl->namespaces_rwsem); 3901 3880 } 3902 3881 EXPORT_SYMBOL_GPL(nvme_start_queues); 3882 3882 + 3883 3883 + 3884 3884 + void nvme_sync_queues(struct nvme_ctrl *ctrl) 3885 3885 + { 3886 3886 + struct nvme_ns *ns; 3887 3887 + 3888 3888 + down_read(&ctrl->namespaces_rwsem); 3889 3889 + list_for_each_entry(ns, &ctrl->namespaces, list) 3890 3890 + blk_sync_queue(ns->queue); 3891 3891 + up_read(&ctrl->namespaces_rwsem); 3892 3892 + } 3893 3893 + EXPORT_SYMBOL_GPL(nvme_sync_queues); 3903 3894 3904 3895 /* 3905 3896 * Check we didn't inadvertently grow the command structure sizes:

drivers/nvme/host/nvme.h

reviewed

··· 441 441 void nvme_stop_queues(struct nvme_ctrl *ctrl); 442 442 void nvme_start_queues(struct nvme_ctrl *ctrl); 443 443 void nvme_kill_queues(struct nvme_ctrl *ctrl); 444 444 + void nvme_sync_queues(struct nvme_ctrl *ctrl); 444 445 void nvme_unfreeze(struct nvme_ctrl *ctrl); 445 446 void nvme_wait_freeze(struct nvme_ctrl *ctrl); 446 447 void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);

+14 -13

drivers/nvme/host/pci.c

reviewed

··· 464 464 * affinity), so use the regular blk-mq cpu mapping 465 465 */ 466 466 map->queue_offset = qoff; 467 467 - if (i != HCTX_TYPE_POLL) 467 467 + if (i != HCTX_TYPE_POLL && offset) 468 468 blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset); 469 469 else 470 470 blk_mq_map_queues(map); ··· 1257 1257 struct nvme_dev *dev = nvmeq->dev; 1258 1258 struct request *abort_req; 1259 1259 struct nvme_command cmd; 1260 1260 - bool shutdown = false; 1261 1260 u32 csts = readl(dev->bar + NVME_REG_CSTS); 1262 1261 1263 1262 /* If PCI error recovery process is happening, we cannot reset or ··· 1293 1294 * shutdown, so we return BLK_EH_DONE. 1294 1295 */ 1295 1296 switch (dev->ctrl.state) { 1296 1296 - case NVME_CTRL_DELETING: 1297 1297 - shutdown = true; 1298 1298 - /* fall through */ 1299 1297 case NVME_CTRL_CONNECTING: 1300 1300 - case NVME_CTRL_RESETTING: 1298 1298 + nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING); 1299 1299 + /* fall through */ 1300 1300 + case NVME_CTRL_DELETING: 1301 1301 dev_warn_ratelimited(dev->ctrl.device, 1302 1302 "I/O %d QID %d timeout, disable controller\n", 1303 1303 req->tag, nvmeq->qid); 1304 1304 - nvme_dev_disable(dev, shutdown); 1304 1304 + nvme_dev_disable(dev, true); 1305 1305 nvme_req(req)->flags |= NVME_REQ_CANCELLED; 1306 1306 return BLK_EH_DONE; 1307 1307 + case NVME_CTRL_RESETTING: 1308 1308 + return BLK_EH_RESET_TIMER; 1307 1309 default: 1308 1310 break; 1309 1311 } ··· 2376 2376 2377 2377 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) 2378 2378 { 2379 2379 - bool dead = true; 2379 2379 + bool dead = true, freeze = false; 2380 2380 struct pci_dev *pdev = to_pci_dev(dev->dev); 2381 2381 2382 2382 mutex_lock(&dev->shutdown_lock); ··· 2384 2384 u32 csts = readl(dev->bar + NVME_REG_CSTS); 2385 2385 2386 2386 if (dev->ctrl.state == NVME_CTRL_LIVE || 2387 2387 - dev->ctrl.state == NVME_CTRL_RESETTING) 2387 2387 + dev->ctrl.state == NVME_CTRL_RESETTING) { 2388 2388 + freeze = true; 2388 2389 
nvme_start_freeze(&dev->ctrl); 2390 2390 + } 2389 2391 dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) || 2390 2392 pdev->error_state != pci_channel_io_normal); 2391 2393 } ··· 2396 2394 * Give the controller a chance to complete all entered requests if 2397 2395 * doing a safe shutdown. 2398 2396 */ 2399 2399 - if (!dead) { 2400 2400 - if (shutdown) 2401 2401 - nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); 2402 2402 - } 2397 2397 + if (!dead && shutdown && freeze) 2398 2398 + nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); 2403 2399 2404 2400 nvme_stop_queues(&dev->ctrl); 2405 2401 ··· 2492 2492 */ 2493 2493 if (dev->ctrl.ctrl_config & NVME_CC_ENABLE) 2494 2494 nvme_dev_disable(dev, false); 2495 2495 + nvme_sync_queues(&dev->ctrl); 2495 2496 2496 2497 mutex_lock(&dev->shutdown_lock); 2497 2498 result = nvme_pci_enable(dev);

+1 -1

include/linux/bio.h

reviewed

··· 210 210 { 211 211 if (count != 1) { 212 212 bio->bi_flags |= (1 << BIO_REFFED); 213 213 - smp_mb__before_atomic(); 213 213 + smp_mb(); 214 214 } 215 215 atomic_set(&bio->__bi_cnt, count); 216 216 }

-7

include/linux/blk_types.h

reviewed

··· 159 159 */ 160 160 unsigned int bi_phys_segments; 161 161 162 162 - /* 163 163 - * To keep track of the max segment size, we account for the 164 164 - * sizes of the first and last mergeable segments in this bio. 165 165 - */ 166 166 - unsigned int bi_seg_front_size; 167 167 - unsigned int bi_seg_back_size; 168 168 - 169 162 struct bvec_iter bi_iter; 170 163 171 164 atomic_t __bi_remaining;

+6 -1

include/linux/blkdev.h

reviewed

··· 542 542 struct list_head unused_hctx_list; 543 543 spinlock_t unused_hctx_lock; 544 544 545 545 - atomic_t mq_freeze_depth; 545 545 + int mq_freeze_depth; 546 546 547 547 #if defined(CONFIG_BLK_DEV_BSG) 548 548 struct bsg_class_device bsg_dev; ··· 554 554 #endif 555 555 struct rcu_head rcu_head; 556 556 wait_queue_head_t mq_freeze_wq; 557 557 + /* 558 558 + * Protect concurrent access to q_usage_counter by 559 559 + * percpu_ref_kill() and percpu_ref_resurrect(). 560 560 + */ 561 561 + struct mutex mq_freeze_lock; 557 562 struct percpu_ref q_usage_counter; 558 563 559 564 struct blk_mq_tag_set *tag_set;

+1 -1

lib/sbitmap.c

reviewed

··· 435 435 * to ensure that the batch size is updated before the wait 436 436 * counts. 437 437 */ 438 438 - smp_mb__before_atomic(); 438 438 + smp_mb(); 439 439 for (i = 0; i < SBQ_WAIT_QUEUES; i++) 440 440 atomic_set(&sbq->ws[i].wait_cnt, 1); 441 441 }

+1 -1

tools/io_uring/Makefile

reviewed

··· 8 8 $(CC) $(CFLAGS) -o $@ $^ 9 9 10 10 io_uring-bench: syscall.o io_uring-bench.o 11 11 - $(CC) $(CFLAGS) $(LDLIBS) -o $@ $^ 11 11 + $(CC) $(CFLAGS) -o $@ $^ $(LDLIBS) 12 12 13 13 io_uring-cp: setup.o syscall.o queue.o 14 14

+16 -7

tools/io_uring/io_uring-cp.c

reviewed

··· 13 13 #include <assert.h> 14 14 #include <errno.h> 15 15 #include <inttypes.h> 16 16 + #include <sys/types.h> 16 17 #include <sys/stat.h> 17 18 #include <sys/ioctl.h> 18 19 ··· 86 85 struct io_uring_sqe *sqe; 87 86 struct io_data *data; 88 87 89 89 - sqe = io_uring_get_sqe(ring); 90 90 - if (!sqe) 88 88 + data = malloc(size + sizeof(*data)); 89 89 + if (!data) 91 90 return 1; 92 91 93 93 - data = malloc(size + sizeof(*data)); 92 92 + sqe = io_uring_get_sqe(ring); 93 93 + if (!sqe) { 94 94 + free(data); 95 95 + return 1; 96 96 + } 97 97 + 94 98 data->read = 1; 95 99 data->offset = data->first_offset = offset; 96 100 ··· 172 166 struct io_data *data; 173 167 174 168 if (!got_comp) { 175 175 - ret = io_uring_wait_completion(ring, &cqe); 169 169 + ret = io_uring_wait_cqe(ring, &cqe); 176 170 got_comp = 1; 177 171 } else 178 178 - ret = io_uring_get_completion(ring, &cqe); 172 172 + ret = io_uring_peek_cqe(ring, &cqe); 179 173 if (ret < 0) { 180 180 - fprintf(stderr, "io_uring_get_completion: %s\n", 174 174 + fprintf(stderr, "io_uring_peek_cqe: %s\n", 181 175 strerror(-ret)); 182 176 return 1; 183 177 } 184 178 if (!cqe) 185 179 break; 186 180 187 187 - data = (struct io_data *) (uintptr_t) cqe->user_data; 181 181 + data = io_uring_cqe_get_data(cqe); 188 182 if (cqe->res < 0) { 189 183 if (cqe->res == -EAGAIN) { 190 184 queue_prepped(ring, data); 185 185 + io_uring_cqe_seen(ring, cqe); 191 186 continue; 192 187 } 193 188 fprintf(stderr, "cqe failed: %s\n", ··· 200 193 data->iov.iov_len -= cqe->res; 201 194 data->offset += cqe->res; 202 195 queue_prepped(ring, data); 196 196 + io_uring_cqe_seen(ring, cqe); 203 197 continue; 204 198 } 205 199 ··· 217 209 free(data); 218 210 writes--; 219 211 } 212 212 + io_uring_cqe_seen(ring, cqe); 220 213 } 221 214 } 222 215

+52 -12

tools/io_uring/liburing.h

reviewed

··· 1 1 #ifndef LIB_URING_H 2 2 #define LIB_URING_H 3 3 4 4 + #ifdef __cplusplus 5 5 + extern "C" { 6 6 + #endif 7 7 + 4 8 #include <sys/uio.h> 5 9 #include <signal.h> 6 10 #include <string.h> 7 11 #include "../../include/uapi/linux/io_uring.h" 12 12 + #include <inttypes.h> 13 13 + #include "barrier.h" 8 14 9 15 /* 10 16 * Library interface to io_uring ··· 52 46 * System calls 53 47 */ 54 48 extern int io_uring_setup(unsigned entries, struct io_uring_params *p); 55 55 - extern int io_uring_enter(unsigned fd, unsigned to_submit, 49 49 + extern int io_uring_enter(int fd, unsigned to_submit, 56 50 unsigned min_complete, unsigned flags, sigset_t *sig); 57 51 extern int io_uring_register(int fd, unsigned int opcode, void *arg, 58 52 unsigned int nr_args); ··· 65 59 extern int io_uring_queue_mmap(int fd, struct io_uring_params *p, 66 60 struct io_uring *ring); 67 61 extern void io_uring_queue_exit(struct io_uring *ring); 68 68 - extern int io_uring_get_completion(struct io_uring *ring, 62 62 + extern int io_uring_peek_cqe(struct io_uring *ring, 69 63 struct io_uring_cqe **cqe_ptr); 70 70 - extern int io_uring_wait_completion(struct io_uring *ring, 64 64 + extern int io_uring_wait_cqe(struct io_uring *ring, 71 65 struct io_uring_cqe **cqe_ptr); 72 66 extern int io_uring_submit(struct io_uring *ring); 73 67 extern struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring); 68 68 + 69 69 + /* 70 70 + * Must be called after io_uring_{peek,wait}_cqe() after the cqe has 71 71 + * been processed by the application. 72 72 + */ 73 73 + static inline void io_uring_cqe_seen(struct io_uring *ring, 74 74 + struct io_uring_cqe *cqe) 75 75 + { 76 76 + if (cqe) { 77 77 + struct io_uring_cq *cq = &ring->cq; 78 78 + 79 79 + (*cq->khead)++; 80 80 + /* 81 81 + * Ensure that the kernel sees our new head, the kernel has 82 82 + * the matching read barrier. 
83 83 + */ 84 84 + write_barrier(); 85 85 + } 86 86 + } 74 87 75 88 /* 76 89 * Command prep helpers ··· 99 74 sqe->user_data = (unsigned long) data; 100 75 } 101 76 77 77 + static inline void *io_uring_cqe_get_data(struct io_uring_cqe *cqe) 78 78 + { 79 79 + return (void *) (uintptr_t) cqe->user_data; 80 80 + } 81 81 + 102 82 static inline void io_uring_prep_rw(int op, struct io_uring_sqe *sqe, int fd, 103 103 - void *addr, unsigned len, off_t offset) 83 83 + const void *addr, unsigned len, 84 84 + off_t offset) 104 85 { 105 86 memset(sqe, 0, sizeof(*sqe)); 106 87 sqe->opcode = op; ··· 117 86 } 118 87 119 88 static inline void io_uring_prep_readv(struct io_uring_sqe *sqe, int fd, 120 120 - struct iovec *iovecs, unsigned nr_vecs, 121 121 - off_t offset) 89 89 + const struct iovec *iovecs, 90 90 + unsigned nr_vecs, off_t offset) 122 91 { 123 92 io_uring_prep_rw(IORING_OP_READV, sqe, fd, iovecs, nr_vecs, offset); 124 93 } ··· 131 100 } 132 101 133 102 static inline void io_uring_prep_writev(struct io_uring_sqe *sqe, int fd, 134 134 - struct iovec *iovecs, unsigned nr_vecs, 135 135 - off_t offset) 103 103 + const struct iovec *iovecs, 104 104 + unsigned nr_vecs, off_t offset) 136 105 { 137 106 io_uring_prep_rw(IORING_OP_WRITEV, sqe, fd, iovecs, nr_vecs, offset); 138 107 } 139 108 140 109 static inline void io_uring_prep_write_fixed(struct io_uring_sqe *sqe, int fd, 141 141 - void *buf, unsigned nbytes, 110 110 + const void *buf, unsigned nbytes, 142 111 off_t offset) 143 112 { 144 113 io_uring_prep_rw(IORING_OP_WRITE_FIXED, sqe, fd, buf, nbytes, offset); ··· 162 131 } 163 132 164 133 static inline void io_uring_prep_fsync(struct io_uring_sqe *sqe, int fd, 165 165 - int datasync) 134 134 + unsigned fsync_flags) 166 135 { 167 136 memset(sqe, 0, sizeof(*sqe)); 168 137 sqe->opcode = IORING_OP_FSYNC; 169 138 sqe->fd = fd; 170 170 - if (datasync) 171 171 - sqe->fsync_flags = IORING_FSYNC_DATASYNC; 139 139 + sqe->fsync_flags = fsync_flags; 172 140 } 141 141 + 142 142 + static 
inline void io_uring_prep_nop(struct io_uring_sqe *sqe) 143 143 + { 144 144 + memset(sqe, 0, sizeof(*sqe)); 145 145 + sqe->opcode = IORING_OP_NOP; 146 146 + } 147 147 + 148 148 + #ifdef __cplusplus 149 149 + } 150 150 + #endif 173 151 174 152 #endif

+14 -22

tools/io_uring/queue.c

reviewed

··· 8 8 #include "liburing.h" 9 9 #include "barrier.h" 10 10 11 11 - static int __io_uring_get_completion(struct io_uring *ring, 12 12 - struct io_uring_cqe **cqe_ptr, int wait) 11 11 + static int __io_uring_get_cqe(struct io_uring *ring, 12 12 + struct io_uring_cqe **cqe_ptr, int wait) 13 13 { 14 14 struct io_uring_cq *cq = &ring->cq; 15 15 const unsigned mask = *cq->kring_mask; ··· 39 39 return -errno; 40 40 } while (1); 41 41 42 42 - if (*cqe_ptr) { 43 43 - *cq->khead = head + 1; 44 44 - /* 45 45 - * Ensure that the kernel sees our new head, the kernel has 46 46 - * the matching read barrier. 47 47 - */ 48 48 - write_barrier(); 49 49 - } 50 50 - 51 42 return 0; 52 43 } 53 44 54 45 /* 55 55 - * Return an IO completion, if one is readily available 46 46 + * Return an IO completion, if one is readily available. Returns 0 with 47 47 + * cqe_ptr filled in on success, -errno on failure. 56 48 */ 57 57 - int io_uring_get_completion(struct io_uring *ring, 58 58 - struct io_uring_cqe **cqe_ptr) 49 49 + int io_uring_peek_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) 59 50 { 60 60 - return __io_uring_get_completion(ring, cqe_ptr, 0); 51 51 + return __io_uring_get_cqe(ring, cqe_ptr, 0); 61 52 } 62 53 63 54 /* 64 64 - * Return an IO completion, waiting for it if necessary 55 55 + * Return an IO completion, waiting for it if necessary. Returns 0 with 56 56 + * cqe_ptr filled in on success, -errno on failure. 
65 57 */ 66 66 - int io_uring_wait_completion(struct io_uring *ring, 67 67 - struct io_uring_cqe **cqe_ptr) 58 58 + int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr) 68 59 { 69 69 - return __io_uring_get_completion(ring, cqe_ptr, 1); 60 60 + return __io_uring_get_cqe(ring, cqe_ptr, 1); 70 61 } 71 62 72 63 /* ··· 69 78 { 70 79 struct io_uring_sq *sq = &ring->sq; 71 80 const unsigned mask = *sq->kring_mask; 72 72 - unsigned ktail, ktail_next, submitted; 81 81 + unsigned ktail, ktail_next, submitted, to_submit; 73 82 int ret; 74 83 75 84 /* ··· 91 100 */ 92 101 submitted = 0; 93 102 ktail = ktail_next = *sq->ktail; 94 94 - while (sq->sqe_head < sq->sqe_tail) { 103 103 + to_submit = sq->sqe_tail - sq->sqe_head; 104 104 + while (to_submit--) { 95 105 ktail_next++; 96 106 read_barrier(); 97 107 ··· 128 136 if (ret < 0) 129 137 return -errno; 130 138 131 131 - return 0; 139 139 + return ret; 132 140 } 133 141 134 142 /*

+7 -3

tools/io_uring/setup.c

reviewed

··· 27 27 sq->kdropped = ptr + p->sq_off.dropped; 28 28 sq->array = ptr + p->sq_off.array; 29 29 30 30 - size = p->sq_entries * sizeof(struct io_uring_sqe), 30 30 + size = p->sq_entries * sizeof(struct io_uring_sqe); 31 31 sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE, 32 32 MAP_SHARED | MAP_POPULATE, fd, 33 33 IORING_OFF_SQES); ··· 79 79 int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags) 80 80 { 81 81 struct io_uring_params p; 82 82 - int fd; 82 82 + int fd, ret; 83 83 84 84 memset(&p, 0, sizeof(p)); 85 85 p.flags = flags; ··· 88 88 if (fd < 0) 89 89 return fd; 90 90 91 91 - return io_uring_queue_mmap(fd, &p, ring); 91 91 + ret = io_uring_queue_mmap(fd, &p, ring); 92 92 + if (ret) 93 93 + close(fd); 94 94 + 95 95 + return ret; 92 96 } 93 97 94 98 void io_uring_queue_exit(struct io_uring *ring)

+30 -18

tools/io_uring/syscall.c

reviewed

··· 7 7 #include <signal.h> 8 8 #include "liburing.h" 9 9 10 10 - #if defined(__x86_64) || defined(__i386__) 11 11 - #ifndef __NR_sys_io_uring_setup 12 12 - #define __NR_sys_io_uring_setup 425 13 13 - #endif 14 14 - #ifndef __NR_sys_io_uring_enter 15 15 - #define __NR_sys_io_uring_enter 426 16 16 - #endif 17 17 - #ifndef __NR_sys_io_uring_register 18 18 - #define __NR_sys_io_uring_register 427 19 19 - #endif 20 20 - #else 21 21 - #error "Arch not supported yet" 10 10 + #ifdef __alpha__ 11 11 + /* 12 12 + * alpha is the only exception, all other architectures 13 13 + * have common numbers for new system calls. 14 14 + */ 15 15 + # ifndef __NR_io_uring_setup 16 16 + # define __NR_io_uring_setup 535 17 17 + # endif 18 18 + # ifndef __NR_io_uring_enter 19 19 + # define __NR_io_uring_enter 536 20 20 + # endif 21 21 + # ifndef __NR_io_uring_register 22 22 + # define __NR_io_uring_register 537 23 23 + # endif 24 24 + #else /* !__alpha__ */ 25 25 + # ifndef __NR_io_uring_setup 26 26 + # define __NR_io_uring_setup 425 27 27 + # endif 28 28 + # ifndef __NR_io_uring_enter 29 29 + # define __NR_io_uring_enter 426 30 30 + # endif 31 31 + # ifndef __NR_io_uring_register 32 32 + # define __NR_io_uring_register 427 33 33 + # endif 22 34 #endif 23 35 24 36 int io_uring_register(int fd, unsigned int opcode, void *arg, 25 37 unsigned int nr_args) 26 38 { 27 27 - return syscall(__NR_sys_io_uring_register, fd, opcode, arg, nr_args); 39 39 + return syscall(__NR_io_uring_register, fd, opcode, arg, nr_args); 28 40 } 29 41 30 30 - int io_uring_setup(unsigned entries, struct io_uring_params *p) 42 42 + int io_uring_setup(unsigned int entries, struct io_uring_params *p) 31 43 { 32 32 - return syscall(__NR_sys_io_uring_setup, entries, p); 44 44 + return syscall(__NR_io_uring_setup, entries, p); 33 45 } 34 46 35 35 - int io_uring_enter(unsigned fd, unsigned to_submit, unsigned min_complete, 36 36 - unsigned flags, sigset_t *sig) 47 47 + int io_uring_enter(int fd, unsigned int to_submit, 
unsigned int min_complete, 48 48 + unsigned int flags, sigset_t *sig) 37 49 { 38 38 - return syscall(__NR_sys_io_uring_enter, fd, to_submit, min_complete, 50 50 + return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, 39 51 flags, sig, _NSIG / 8); 40 52 }