Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'for-linus' of git://git.kernel.dk/linux-block

Pull block fixes from Jens Axboe:
"Here's a pull request for 4.11-rc, fixing a set of issues mostly
centered around the new scheduling framework. These have been brewing
for a while, but were split up into what we absolutely need in 4.11, and
what we can defer until 4.12. These are well tested, on both single
queue and multiqueue setups, and with and without shared tags. They
fix several hangs that have happened in testing.

This is obviously larger than I would have preferred at this point in
time, but I don't think we can shave much off this and still get the
desired results.

In detail, this pull request contains:

- a set of five fixes for NVMe, mostly from Christoph and one from
Roland.

- a series from Bart, fixing issues with dm-mq and SCSI shared tags
and scheduling. Note that one of those patches' commit messages may
read like an optimization, but it is in fact an important fix for
queue restarts in particular.

- a series from Omar, most importantly fixing a hang with multiple
hardware queues when we fail to get a driver tag. Another important
fix in there is for resizing hardware queues, which nbd does when
handling multiple sockets for one connection.

- a fix from Minchan for an imbalance in putting the ctx for hctx
request allocations"

* 'for-linus' of git://git.kernel.dk/linux-block:
blk-mq: Restart a single queue if tag sets are shared
dm rq: Avoid that request processing stalls sporadically
scsi: Avoid that SCSI queues get stuck
blk-mq: Introduce blk_mq_delay_run_hw_queue()
blk-mq: remap queues when adding/removing hardware queues
blk-mq-sched: fix crash in switch error path
blk-mq-sched: set up scheduler tags when bringing up new queues
blk-mq-sched: refactor scheduler initialization
blk-mq: use the right hctx when getting a driver tag fails
nvmet: fix byte swap in nvmet_parse_io_cmd
nvmet: fix byte swap in nvmet_execute_write_zeroes
nvmet: add missing byte swap in nvmet_get_smart_log
nvme: add missing byte swap in nvme_setup_discard
nvme: Correct NVMF enum values to match NVMe-oF rev 1.0
block: do not put mq context in blk_mq_alloc_request_hctx

+286 -169
+136 -55
block/blk-mq-sched.c
··· 171 171 172 172 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) 173 173 { 174 - struct elevator_queue *e = hctx->queue->elevator; 174 + struct request_queue *q = hctx->queue; 175 + struct elevator_queue *e = q->elevator; 175 176 const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; 176 177 bool did_work = false; 177 178 LIST_HEAD(rq_list); ··· 204 203 */ 205 204 if (!list_empty(&rq_list)) { 206 205 blk_mq_sched_mark_restart_hctx(hctx); 207 - did_work = blk_mq_dispatch_rq_list(hctx, &rq_list); 206 + did_work = blk_mq_dispatch_rq_list(q, &rq_list); 208 207 } else if (!has_sched_dispatch) { 209 208 blk_mq_flush_busy_ctxs(hctx, &rq_list); 210 - blk_mq_dispatch_rq_list(hctx, &rq_list); 209 + blk_mq_dispatch_rq_list(q, &rq_list); 211 210 } 212 211 213 212 /* ··· 223 222 if (!rq) 224 223 break; 225 224 list_add(&rq->queuelist, &rq_list); 226 - } while (blk_mq_dispatch_rq_list(hctx, &rq_list)); 225 + } while (blk_mq_dispatch_rq_list(q, &rq_list)); 227 226 } 228 227 } 229 228 ··· 318 317 return true; 319 318 } 320 319 321 - static void blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) 320 + static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) 322 321 { 323 322 if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) { 324 323 clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); 325 - if (blk_mq_hctx_has_pending(hctx)) 324 + if (blk_mq_hctx_has_pending(hctx)) { 326 325 blk_mq_run_hw_queue(hctx, true); 326 + return true; 327 + } 327 328 } 329 + return false; 328 330 } 329 331 330 - void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx) 331 - { 332 - struct request_queue *q = hctx->queue; 333 - unsigned int i; 332 + /** 333 + * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list 334 + * @pos: loop cursor. 335 + * @skip: the list element that will not be examined. Iteration starts at 336 + * @skip->next. 337 + * @head: head of the list to examine. 
This list must have at least one 338 + * element, namely @skip. 339 + * @member: name of the list_head structure within typeof(*pos). 340 + */ 341 + #define list_for_each_entry_rcu_rr(pos, skip, head, member) \ 342 + for ((pos) = (skip); \ 343 + (pos = (pos)->member.next != (head) ? list_entry_rcu( \ 344 + (pos)->member.next, typeof(*pos), member) : \ 345 + list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \ 346 + (pos) != (skip); ) 334 347 335 - if (test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) { 336 - if (test_and_clear_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) { 337 - queue_for_each_hw_ctx(q, hctx, i) 338 - blk_mq_sched_restart_hctx(hctx); 348 + /* 349 + * Called after a driver tag has been freed to check whether a hctx needs to 350 + * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware 351 + * queues in a round-robin fashion if the tag set of @hctx is shared with other 352 + * hardware queues. 353 + */ 354 + void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx) 355 + { 356 + struct blk_mq_tags *const tags = hctx->tags; 357 + struct blk_mq_tag_set *const set = hctx->queue->tag_set; 358 + struct request_queue *const queue = hctx->queue, *q; 359 + struct blk_mq_hw_ctx *hctx2; 360 + unsigned int i, j; 361 + 362 + if (set->flags & BLK_MQ_F_TAG_SHARED) { 363 + rcu_read_lock(); 364 + list_for_each_entry_rcu_rr(q, queue, &set->tag_list, 365 + tag_set_list) { 366 + queue_for_each_hw_ctx(q, hctx2, i) 367 + if (hctx2->tags == tags && 368 + blk_mq_sched_restart_hctx(hctx2)) 369 + goto done; 339 370 } 371 + j = hctx->queue_num + 1; 372 + for (i = 0; i < queue->nr_hw_queues; i++, j++) { 373 + if (j == queue->nr_hw_queues) 374 + j = 0; 375 + hctx2 = queue->queue_hw_ctx[j]; 376 + if (hctx2->tags == tags && 377 + blk_mq_sched_restart_hctx(hctx2)) 378 + break; 379 + } 380 + done: 381 + rcu_read_unlock(); 340 382 } else { 341 383 blk_mq_sched_restart_hctx(hctx); 342 384 } ··· 475 431 } 476 432 } 477 433 478 - int 
blk_mq_sched_setup(struct request_queue *q) 434 + static int blk_mq_sched_alloc_tags(struct request_queue *q, 435 + struct blk_mq_hw_ctx *hctx, 436 + unsigned int hctx_idx) 479 437 { 480 438 struct blk_mq_tag_set *set = q->tag_set; 481 - struct blk_mq_hw_ctx *hctx; 482 - int ret, i; 439 + int ret; 483 440 484 - /* 485 - * Default to 256, since we don't split into sync/async like the 486 - * old code did. Additionally, this is a per-hw queue depth. 487 - */ 488 - q->nr_requests = 2 * BLKDEV_MAX_RQ; 441 + hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests, 442 + set->reserved_tags); 443 + if (!hctx->sched_tags) 444 + return -ENOMEM; 489 445 490 - /* 491 - * We're switching to using an IO scheduler, so setup the hctx 492 - * scheduler tags and switch the request map from the regular 493 - * tags to scheduler tags. First allocate what we need, so we 494 - * can safely fail and fallback, if needed. 495 - */ 496 - ret = 0; 497 - queue_for_each_hw_ctx(q, hctx, i) { 498 - hctx->sched_tags = blk_mq_alloc_rq_map(set, i, 499 - q->nr_requests, set->reserved_tags); 500 - if (!hctx->sched_tags) { 501 - ret = -ENOMEM; 502 - break; 503 - } 504 - ret = blk_mq_alloc_rqs(set, hctx->sched_tags, i, q->nr_requests); 505 - if (ret) 506 - break; 507 - } 446 + ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests); 447 + if (ret) 448 + blk_mq_sched_free_tags(set, hctx, hctx_idx); 508 449 509 - /* 510 - * If we failed, free what we did allocate 511 - */ 512 - if (ret) { 513 - queue_for_each_hw_ctx(q, hctx, i) { 514 - if (!hctx->sched_tags) 515 - continue; 516 - blk_mq_sched_free_tags(set, hctx, i); 517 - } 518 - 519 - return ret; 520 - } 521 - 522 - return 0; 450 + return ret; 523 451 } 524 452 525 - void blk_mq_sched_teardown(struct request_queue *q) 453 + static void blk_mq_sched_tags_teardown(struct request_queue *q) 526 454 { 527 455 struct blk_mq_tag_set *set = q->tag_set; 528 456 struct blk_mq_hw_ctx *hctx; ··· 502 486 503 487 
queue_for_each_hw_ctx(q, hctx, i) 504 488 blk_mq_sched_free_tags(set, hctx, i); 489 + } 490 + 491 + int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, 492 + unsigned int hctx_idx) 493 + { 494 + struct elevator_queue *e = q->elevator; 495 + 496 + if (!e) 497 + return 0; 498 + 499 + return blk_mq_sched_alloc_tags(q, hctx, hctx_idx); 500 + } 501 + 502 + void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, 503 + unsigned int hctx_idx) 504 + { 505 + struct elevator_queue *e = q->elevator; 506 + 507 + if (!e) 508 + return; 509 + 510 + blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx); 511 + } 512 + 513 + int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) 514 + { 515 + struct blk_mq_hw_ctx *hctx; 516 + unsigned int i; 517 + int ret; 518 + 519 + if (!e) { 520 + q->elevator = NULL; 521 + return 0; 522 + } 523 + 524 + /* 525 + * Default to 256, since we don't split into sync/async like the 526 + * old code did. Additionally, this is a per-hw queue depth. 527 + */ 528 + q->nr_requests = 2 * BLKDEV_MAX_RQ; 529 + 530 + queue_for_each_hw_ctx(q, hctx, i) { 531 + ret = blk_mq_sched_alloc_tags(q, hctx, i); 532 + if (ret) 533 + goto err; 534 + } 535 + 536 + ret = e->ops.mq.init_sched(q, e); 537 + if (ret) 538 + goto err; 539 + 540 + return 0; 541 + 542 + err: 543 + blk_mq_sched_tags_teardown(q); 544 + q->elevator = NULL; 545 + return ret; 546 + } 547 + 548 + void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) 549 + { 550 + if (e->type->ops.mq.exit_sched) 551 + e->type->ops.mq.exit_sched(e); 552 + blk_mq_sched_tags_teardown(q); 553 + q->elevator = NULL; 505 554 } 506 555 507 556 int blk_mq_sched_init(struct request_queue *q)
+8 -17
block/blk-mq-sched.h
··· 19 19 struct request **merged_request); 20 20 bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); 21 21 bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); 22 - void blk_mq_sched_restart_queues(struct blk_mq_hw_ctx *hctx); 22 + void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); 23 23 24 24 void blk_mq_sched_insert_request(struct request *rq, bool at_head, 25 25 bool run_queue, bool async, bool can_block); ··· 32 32 struct list_head *rq_list, 33 33 struct request *(*get_rq)(struct blk_mq_hw_ctx *)); 34 34 35 - int blk_mq_sched_setup(struct request_queue *q); 36 - void blk_mq_sched_teardown(struct request_queue *q); 35 + int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); 36 + void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); 37 + 38 + int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, 39 + unsigned int hctx_idx); 40 + void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx, 41 + unsigned int hctx_idx); 37 42 38 43 int blk_mq_sched_init(struct request_queue *q); 39 44 ··· 134 129 { 135 130 if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) 136 131 set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); 137 - } 138 - 139 - /* 140 - * Mark a hardware queue and the request queue it belongs to as needing a 141 - * restart. 142 - */ 143 - static inline void blk_mq_sched_mark_restart_queue(struct blk_mq_hw_ctx *hctx) 144 - { 145 - struct request_queue *q = hctx->queue; 146 - 147 - if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) 148 - set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); 149 - if (!test_bit(QUEUE_FLAG_RESTART, &q->queue_flags)) 150 - set_bit(QUEUE_FLAG_RESTART, &q->queue_flags); 151 132 } 152 133 153 134 static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
+62 -23
block/blk-mq.c
··· 321 321 322 322 rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data); 323 323 324 - blk_mq_put_ctx(alloc_data.ctx); 325 324 blk_queue_exit(q); 326 325 327 326 if (!rq) ··· 348 349 blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); 349 350 if (sched_tag != -1) 350 351 blk_mq_sched_completed_request(hctx, rq); 351 - blk_mq_sched_restart_queues(hctx); 352 + blk_mq_sched_restart(hctx); 352 353 blk_queue_exit(q); 353 354 } 354 355 ··· 845 846 .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT, 846 847 }; 847 848 848 - if (rq->tag != -1) { 849 - done: 850 - if (hctx) 851 - *hctx = data.hctx; 852 - return true; 853 - } 849 + if (rq->tag != -1) 850 + goto done; 854 851 855 852 if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) 856 853 data.flags |= BLK_MQ_REQ_RESERVED; ··· 858 863 atomic_inc(&data.hctx->nr_active); 859 864 } 860 865 data.hctx->tags->rqs[rq->tag] = rq; 861 - goto done; 862 866 } 863 867 864 - return false; 868 + done: 869 + if (hctx) 870 + *hctx = data.hctx; 871 + return rq->tag != -1; 865 872 } 866 873 867 874 static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, ··· 960 963 return true; 961 964 } 962 965 963 - bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list) 966 + bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) 964 967 { 965 - struct request_queue *q = hctx->queue; 968 + struct blk_mq_hw_ctx *hctx; 966 969 struct request *rq; 967 970 LIST_HEAD(driver_list); 968 971 struct list_head *dptr; 969 972 int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK; 973 + 974 + if (list_empty(list)) 975 + return false; 970 976 971 977 /* 972 978 * Start off with dptr being NULL, so we start the first request ··· 981 981 * Now process all the entries, sending them to the driver. 
982 982 */ 983 983 errors = queued = 0; 984 - while (!list_empty(list)) { 984 + do { 985 985 struct blk_mq_queue_data bd; 986 986 987 987 rq = list_first_entry(list, struct request, queuelist); ··· 1052 1052 */ 1053 1053 if (!dptr && list->next != list->prev) 1054 1054 dptr = &driver_list; 1055 - } 1055 + } while (!list_empty(list)); 1056 1056 1057 1057 hctx->dispatched[queued_to_index(queued)]++; 1058 1058 ··· 1135 1135 return hctx->next_cpu; 1136 1136 } 1137 1137 1138 - void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 1138 + static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, 1139 + unsigned long msecs) 1139 1140 { 1140 1141 if (unlikely(blk_mq_hctx_stopped(hctx) || 1141 1142 !blk_mq_hw_queue_mapped(hctx))) ··· 1153 1152 put_cpu(); 1154 1153 } 1155 1154 1156 - kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work); 1155 + if (msecs == 0) 1156 + kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), 1157 + &hctx->run_work); 1158 + else 1159 + kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), 1160 + &hctx->delayed_run_work, 1161 + msecs_to_jiffies(msecs)); 1162 + } 1163 + 1164 + void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) 1165 + { 1166 + __blk_mq_delay_run_hw_queue(hctx, true, msecs); 1167 + } 1168 + EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); 1169 + 1170 + void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 1171 + { 1172 + __blk_mq_delay_run_hw_queue(hctx, async, 0); 1157 1173 } 1158 1174 1159 1175 void blk_mq_run_hw_queues(struct request_queue *q, bool async) ··· 1269 1251 struct blk_mq_hw_ctx *hctx; 1270 1252 1271 1253 hctx = container_of(work, struct blk_mq_hw_ctx, run_work); 1254 + 1255 + __blk_mq_run_hw_queue(hctx); 1256 + } 1257 + 1258 + static void blk_mq_delayed_run_work_fn(struct work_struct *work) 1259 + { 1260 + struct blk_mq_hw_ctx *hctx; 1261 + 1262 + hctx = container_of(work, struct blk_mq_hw_ctx, delayed_run_work.work); 1272 1263 1273 
1264 __blk_mq_run_hw_queue(hctx); 1274 1265 } ··· 1951 1924 hctx->fq->flush_rq, hctx_idx, 1952 1925 flush_start_tag + hctx_idx); 1953 1926 1927 + blk_mq_sched_exit_hctx(q, hctx, hctx_idx); 1928 + 1954 1929 if (set->ops->exit_hctx) 1955 1930 set->ops->exit_hctx(hctx, hctx_idx); 1956 1931 ··· 1989 1960 node = hctx->numa_node = set->numa_node; 1990 1961 1991 1962 INIT_WORK(&hctx->run_work, blk_mq_run_work_fn); 1963 + INIT_DELAYED_WORK(&hctx->delayed_run_work, blk_mq_delayed_run_work_fn); 1992 1964 INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); 1993 1965 spin_lock_init(&hctx->lock); 1994 1966 INIT_LIST_HEAD(&hctx->dispatch); ··· 2020 1990 set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) 2021 1991 goto free_bitmap; 2022 1992 1993 + if (blk_mq_sched_init_hctx(q, hctx, hctx_idx)) 1994 + goto exit_hctx; 1995 + 2023 1996 hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size); 2024 1997 if (!hctx->fq) 2025 - goto exit_hctx; 1998 + goto sched_exit_hctx; 2026 1999 2027 2000 if (set->ops->init_request && 2028 2001 set->ops->init_request(set->driver_data, ··· 2040 2007 2041 2008 free_fq: 2042 2009 kfree(hctx->fq); 2010 + sched_exit_hctx: 2011 + blk_mq_sched_exit_hctx(q, hctx, hctx_idx); 2043 2012 exit_hctx: 2044 2013 if (set->ops->exit_hctx) 2045 2014 set->ops->exit_hctx(hctx, hctx_idx); ··· 2267 2232 { 2268 2233 struct blk_mq_hw_ctx *hctx; 2269 2234 unsigned int i; 2270 - 2271 - blk_mq_sched_teardown(q); 2272 2235 2273 2236 /* hctx kobj stays in hctx */ 2274 2237 queue_for_each_hw_ctx(q, hctx, i) { ··· 2598 2565 return 0; 2599 2566 } 2600 2567 2568 + static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) 2569 + { 2570 + if (set->ops->map_queues) 2571 + return set->ops->map_queues(set); 2572 + else 2573 + return blk_mq_map_queues(set); 2574 + } 2575 + 2601 2576 /* 2602 2577 * Alloc a tag set to be associated with one or more request queues. 2603 2578 * May fail with EINVAL for various error conditions. 
May adjust the ··· 2660 2619 if (!set->mq_map) 2661 2620 goto out_free_tags; 2662 2621 2663 - if (set->ops->map_queues) 2664 - ret = set->ops->map_queues(set); 2665 - else 2666 - ret = blk_mq_map_queues(set); 2622 + ret = blk_mq_update_queue_map(set); 2667 2623 if (ret) 2668 2624 goto out_free_mq_map; 2669 2625 ··· 2752 2714 blk_mq_freeze_queue(q); 2753 2715 2754 2716 set->nr_hw_queues = nr_hw_queues; 2717 + blk_mq_update_queue_map(set); 2755 2718 list_for_each_entry(q, &set->tag_list, tag_set_list) { 2756 2719 blk_mq_realloc_hw_ctxs(set, q); 2757 2720
+1 -1
block/blk-mq.h
··· 31 31 void blk_mq_free_queue(struct request_queue *q); 32 32 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); 33 33 void blk_mq_wake_waiters(struct request_queue *q); 34 - bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *); 34 + bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *); 35 35 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); 36 36 bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx); 37 37 bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
+1 -1
block/blk-sysfs.c
··· 816 816 817 817 if (q->elevator) { 818 818 ioc_clear_queue(q); 819 - elevator_exit(q->elevator); 819 + elevator_exit(q, q->elevator); 820 820 } 821 821 822 822 blk_exit_rl(&q->root_rl);
+59 -55
block/elevator.c
··· 242 242 } 243 243 } 244 244 245 - if (e->uses_mq) { 246 - err = blk_mq_sched_setup(q); 247 - if (!err) 248 - err = e->ops.mq.init_sched(q, e); 249 - } else 245 + if (e->uses_mq) 246 + err = blk_mq_init_sched(q, e); 247 + else 250 248 err = e->ops.sq.elevator_init_fn(q, e); 251 - if (err) { 252 - if (e->uses_mq) 253 - blk_mq_sched_teardown(q); 249 + if (err) 254 250 elevator_put(e); 255 - } 256 251 return err; 257 252 } 258 253 EXPORT_SYMBOL(elevator_init); 259 254 260 - void elevator_exit(struct elevator_queue *e) 255 + void elevator_exit(struct request_queue *q, struct elevator_queue *e) 261 256 { 262 257 mutex_lock(&e->sysfs_lock); 263 258 if (e->uses_mq && e->type->ops.mq.exit_sched) 264 - e->type->ops.mq.exit_sched(e); 259 + blk_mq_exit_sched(q, e); 265 260 else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn) 266 261 e->type->ops.sq.elevator_exit_fn(e); 267 262 mutex_unlock(&e->sysfs_lock); ··· 941 946 } 942 947 EXPORT_SYMBOL_GPL(elv_unregister); 943 948 949 + static int elevator_switch_mq(struct request_queue *q, 950 + struct elevator_type *new_e) 951 + { 952 + int ret; 953 + 954 + blk_mq_freeze_queue(q); 955 + blk_mq_quiesce_queue(q); 956 + 957 + if (q->elevator) { 958 + if (q->elevator->registered) 959 + elv_unregister_queue(q); 960 + ioc_clear_queue(q); 961 + elevator_exit(q, q->elevator); 962 + } 963 + 964 + ret = blk_mq_init_sched(q, new_e); 965 + if (ret) 966 + goto out; 967 + 968 + if (new_e) { 969 + ret = elv_register_queue(q); 970 + if (ret) { 971 + elevator_exit(q, q->elevator); 972 + goto out; 973 + } 974 + } 975 + 976 + if (new_e) 977 + blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); 978 + else 979 + blk_add_trace_msg(q, "elv switch: none"); 980 + 981 + out: 982 + blk_mq_unfreeze_queue(q); 983 + blk_mq_start_stopped_hw_queues(q, true); 984 + return ret; 985 + 986 + } 987 + 944 988 /* 945 989 * switch to new_e io scheduler. 
be careful not to introduce deadlocks - 946 990 * we don't free the old io scheduler, before we have allocated what we ··· 992 958 bool old_registered = false; 993 959 int err; 994 960 995 - if (q->mq_ops) { 996 - blk_mq_freeze_queue(q); 997 - blk_mq_quiesce_queue(q); 998 - } 961 + if (q->mq_ops) 962 + return elevator_switch_mq(q, new_e); 999 963 1000 964 /* 1001 965 * Turn on BYPASS and drain all requests w/ elevator private data. ··· 1005 973 if (old) { 1006 974 old_registered = old->registered; 1007 975 1008 - if (old->uses_mq) 1009 - blk_mq_sched_teardown(q); 1010 - 1011 - if (!q->mq_ops) 1012 - blk_queue_bypass_start(q); 976 + blk_queue_bypass_start(q); 1013 977 1014 978 /* unregister and clear all auxiliary data of the old elevator */ 1015 979 if (old_registered) ··· 1015 987 } 1016 988 1017 989 /* allocate, init and register new elevator */ 1018 - if (new_e) { 1019 - if (new_e->uses_mq) { 1020 - err = blk_mq_sched_setup(q); 1021 - if (!err) 1022 - err = new_e->ops.mq.init_sched(q, new_e); 1023 - } else 1024 - err = new_e->ops.sq.elevator_init_fn(q, new_e); 1025 - if (err) 1026 - goto fail_init; 990 + err = new_e->ops.sq.elevator_init_fn(q, new_e); 991 + if (err) 992 + goto fail_init; 1027 993 1028 - err = elv_register_queue(q); 1029 - if (err) 1030 - goto fail_register; 1031 - } else 1032 - q->elevator = NULL; 994 + err = elv_register_queue(q); 995 + if (err) 996 + goto fail_register; 1033 997 1034 998 /* done, kill the old one and finish */ 1035 999 if (old) { 1036 - elevator_exit(old); 1037 - if (!q->mq_ops) 1038 - blk_queue_bypass_end(q); 1000 + elevator_exit(q, old); 1001 + blk_queue_bypass_end(q); 1039 1002 } 1040 1003 1041 - if (q->mq_ops) { 1042 - blk_mq_unfreeze_queue(q); 1043 - blk_mq_start_stopped_hw_queues(q, true); 1044 - } 1045 - 1046 - if (new_e) 1047 - blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); 1048 - else 1049 - blk_add_trace_msg(q, "elv switch: none"); 1004 + blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); 
1050 1005 1051 1006 return 0; 1052 1007 1053 1008 fail_register: 1054 - if (q->mq_ops) 1055 - blk_mq_sched_teardown(q); 1056 - elevator_exit(q->elevator); 1009 + elevator_exit(q, q->elevator); 1057 1010 fail_init: 1058 1011 /* switch failed, restore and re-register old elevator */ 1059 1012 if (old) { 1060 1013 q->elevator = old; 1061 1014 elv_register_queue(q); 1062 - if (!q->mq_ops) 1063 - blk_queue_bypass_end(q); 1064 - } 1065 - if (q->mq_ops) { 1066 - blk_mq_unfreeze_queue(q); 1067 - blk_mq_start_stopped_hw_queues(q, true); 1015 + blk_queue_bypass_end(q); 1068 1016 } 1069 1017 1070 1018 return err;
+1
drivers/md/dm-rq.c
··· 755 755 /* Undo dm_start_request() before requeuing */ 756 756 rq_end_stats(md, rq); 757 757 rq_completed(md, rq_data_dir(rq), false); 758 + blk_mq_delay_run_hw_queue(hctx, 100/*ms*/); 758 759 return BLK_MQ_RQ_QUEUE_BUSY; 759 760 } 760 761
+1 -1
drivers/nvme/host/core.c
··· 270 270 memset(cmnd, 0, sizeof(*cmnd)); 271 271 cmnd->dsm.opcode = nvme_cmd_dsm; 272 272 cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); 273 - cmnd->dsm.nr = segments - 1; 273 + cmnd->dsm.nr = cpu_to_le32(segments - 1); 274 274 cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); 275 275 276 276 req->special_vec.bv_page = virt_to_page(range);
+1 -1
drivers/nvme/target/admin-cmd.c
··· 100 100 u16 status; 101 101 102 102 WARN_ON(req == NULL || slog == NULL); 103 - if (req->cmd->get_log_page.nsid == 0xFFFFFFFF) 103 + if (req->cmd->get_log_page.nsid == cpu_to_le32(0xFFFFFFFF)) 104 104 status = nvmet_get_smart_log_all(req, slog); 105 105 else 106 106 status = nvmet_get_smart_log_nsid(req, slog);
+2 -2
drivers/nvme/target/io-cmd.c
··· 180 180 181 181 sector = le64_to_cpu(write_zeroes->slba) << 182 182 (req->ns->blksize_shift - 9); 183 - nr_sector = (((sector_t)le32_to_cpu(write_zeroes->length)) << 183 + nr_sector = (((sector_t)le16_to_cpu(write_zeroes->length)) << 184 184 (req->ns->blksize_shift - 9)) + 1; 185 185 186 186 if (__blkdev_issue_zeroout(req->ns->bdev, sector, nr_sector, ··· 230 230 return 0; 231 231 case nvme_cmd_dsm: 232 232 req->execute = nvmet_execute_dsm; 233 - req->data_len = le32_to_cpu(cmd->dsm.nr + 1) * 233 + req->data_len = (le32_to_cpu(cmd->dsm.nr) + 1) * 234 234 sizeof(struct nvme_dsm_range); 235 235 return 0; 236 236 case nvme_cmd_write_zeroes:
+3 -3
drivers/scsi/scsi_lib.c
··· 496 496 scsi_starved_list_run(sdev->host); 497 497 498 498 if (q->mq_ops) 499 - blk_mq_start_stopped_hw_queues(q, false); 499 + blk_mq_run_hw_queues(q, false); 500 500 else 501 501 blk_run_queue(q); 502 502 } ··· 667 667 !list_empty(&sdev->host->starved_list)) 668 668 kblockd_schedule_work(&sdev->requeue_work); 669 669 else 670 - blk_mq_start_stopped_hw_queues(q, true); 670 + blk_mq_run_hw_queues(q, true); 671 671 } else { 672 672 unsigned long flags; 673 673 ··· 1974 1974 case BLK_MQ_RQ_QUEUE_BUSY: 1975 1975 if (atomic_read(&sdev->device_busy) == 0 && 1976 1976 !scsi_device_blocked(sdev)) 1977 - blk_mq_delay_queue(hctx, SCSI_QUEUE_DELAY); 1977 + blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY); 1978 1978 break; 1979 1979 case BLK_MQ_RQ_QUEUE_ERROR: 1980 1980 /*
+2
include/linux/blk-mq.h
··· 51 51 52 52 atomic_t nr_active; 53 53 54 + struct delayed_work delayed_run_work; 54 55 struct delayed_work delay_work; 55 56 56 57 struct hlist_node cpuhp_dead; ··· 239 238 void blk_mq_start_hw_queues(struct request_queue *q); 240 239 void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); 241 240 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); 241 + void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); 242 242 void blk_mq_run_hw_queues(struct request_queue *q, bool async); 243 243 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); 244 244 void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
-1
include/linux/blkdev.h
··· 610 610 #define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */ 611 611 #define QUEUE_FLAG_DAX 26 /* device supports DAX */ 612 612 #define QUEUE_FLAG_STATS 27 /* track rq completion times */ 613 - #define QUEUE_FLAG_RESTART 28 /* queue needs restart at completion */ 614 613 615 614 #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ 616 615 (1 << QUEUE_FLAG_STACKABLE) | \
+1 -1
include/linux/elevator.h
··· 211 211 extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t); 212 212 213 213 extern int elevator_init(struct request_queue *, char *); 214 - extern void elevator_exit(struct elevator_queue *); 214 + extern void elevator_exit(struct request_queue *, struct elevator_queue *); 215 215 extern int elevator_change(struct request_queue *, const char *); 216 216 extern bool elv_bio_merge_ok(struct request *, struct bio *); 217 217 extern struct elevator_queue *elevator_alloc(struct request_queue *,
+8 -8
include/linux/nvme.h
··· 64 64 * RDMA_QPTYPE field 65 65 */ 66 66 enum { 67 - NVMF_RDMA_QPTYPE_CONNECTED = 0, /* Reliable Connected */ 68 - NVMF_RDMA_QPTYPE_DATAGRAM = 1, /* Reliable Datagram */ 67 + NVMF_RDMA_QPTYPE_CONNECTED = 1, /* Reliable Connected */ 68 + NVMF_RDMA_QPTYPE_DATAGRAM = 2, /* Reliable Datagram */ 69 69 }; 70 70 71 71 /* RDMA QP Service Type codes for Discovery Log Page entry TSAS 72 72 * RDMA_QPTYPE field 73 73 */ 74 74 enum { 75 - NVMF_RDMA_PRTYPE_NOT_SPECIFIED = 0, /* No Provider Specified */ 76 - NVMF_RDMA_PRTYPE_IB = 1, /* InfiniBand */ 77 - NVMF_RDMA_PRTYPE_ROCE = 2, /* InfiniBand RoCE */ 78 - NVMF_RDMA_PRTYPE_ROCEV2 = 3, /* InfiniBand RoCEV2 */ 79 - NVMF_RDMA_PRTYPE_IWARP = 4, /* IWARP */ 75 + NVMF_RDMA_PRTYPE_NOT_SPECIFIED = 1, /* No Provider Specified */ 76 + NVMF_RDMA_PRTYPE_IB = 2, /* InfiniBand */ 77 + NVMF_RDMA_PRTYPE_ROCE = 3, /* InfiniBand RoCE */ 78 + NVMF_RDMA_PRTYPE_ROCEV2 = 4, /* InfiniBand RoCEV2 */ 79 + NVMF_RDMA_PRTYPE_IWARP = 5, /* IWARP */ 80 80 }; 81 81 82 82 /* RDMA Connection Management Service Type codes for Discovery Log Page 83 83 * entry TSAS RDMA_CMS field 84 84 */ 85 85 enum { 86 - NVMF_RDMA_CMS_RDMA_CM = 0, /* Sockets based enpoint addressing */ 86 + NVMF_RDMA_CMS_RDMA_CM = 1, /* Sockets based endpoint addressing */ 87 87 }; 88 88 89 89 #define NVMF_AQ_DEPTH 32