Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

block: mq-deadline: Remove support for zone write locking

With the block layer generic plugging of write operations for zoned
block devices, mq-deadline, or any other scheduler, can only ever
see at most one write operation per zone at any time. There are thus no
sequentiality requirements for these writes and thus no need to tightly
control the dispatching of write requests using zone write locking.

Remove all the code that implements this control in the mq-deadline
scheduler and remove advertising support for the
ELEVATOR_F_ZBD_SEQ_WRITE elevator feature.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Hannes Reinecke <hare@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Tested-by: Hans Holmberg <hans.holmberg@wdc.com>
Tested-by: Dennis Maisenbacher <dennis.maisenbacher@wdc.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Link: https://lore.kernel.org/r/20240408014128.205141-22-dlemoal@kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

Damien Le Moal and committed by
Jens Axboe
fde02699 9b3c08b9

+6 -170
+6 -170
block/mq-deadline.c
··· 102 102 int prio_aging_expire; 103 103 104 104 spinlock_t lock; 105 - spinlock_t zone_lock; 106 105 }; 107 106 108 107 /* Maps an I/O priority class to a deadline scheduler priority. */ ··· 156 157 } 157 158 158 159 /* 159 - * Return the first request for which blk_rq_pos() >= @pos. For zoned devices, 160 - * return the first request after the start of the zone containing @pos. 160 + * Return the first request for which blk_rq_pos() >= @pos. 161 161 */ 162 162 static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio, 163 163 enum dd_data_dir data_dir, sector_t pos) ··· 168 170 return NULL; 169 171 170 172 rq = rb_entry_rq(node); 171 - /* 172 - * A zoned write may have been requeued with a starting position that 173 - * is below that of the most recently dispatched request. Hence, for 174 - * zoned writes, start searching from the start of a zone. 175 - */ 176 - if (blk_rq_is_seq_zoned_write(rq)) 177 - pos = round_down(pos, rq->q->limits.chunk_sectors); 178 - 179 173 while (node) { 180 174 rq = rb_entry_rq(node); 181 175 if (blk_rq_pos(rq) >= pos) { ··· 299 309 } 300 310 301 311 /* 302 - * Check if rq has a sequential request preceding it. 303 - */ 304 - static bool deadline_is_seq_write(struct deadline_data *dd, struct request *rq) 305 - { 306 - struct request *prev = deadline_earlier_request(rq); 307 - 308 - if (!prev) 309 - return false; 310 - 311 - return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq); 312 - } 313 - 314 - /* 315 - * Skip all write requests that are sequential from @rq, even if we cross 316 - * a zone boundary. 
317 - */ 318 - static struct request *deadline_skip_seq_writes(struct deadline_data *dd, 319 - struct request *rq) 320 - { 321 - sector_t pos = blk_rq_pos(rq); 322 - 323 - do { 324 - pos += blk_rq_sectors(rq); 325 - rq = deadline_latter_request(rq); 326 - } while (rq && blk_rq_pos(rq) == pos); 327 - 328 - return rq; 329 - } 330 - 331 - /* 332 312 * For the specified data direction, return the next request to 333 313 * dispatch using arrival ordered lists. 334 314 */ ··· 306 346 deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, 307 347 enum dd_data_dir data_dir) 308 348 { 309 - struct request *rq, *rb_rq, *next; 310 - unsigned long flags; 311 - 312 349 if (list_empty(&per_prio->fifo_list[data_dir])) 313 350 return NULL; 314 351 315 - rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); 316 - if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) 317 - return rq; 318 - 319 - /* 320 - * Look for a write request that can be dispatched, that is one with 321 - * an unlocked target zone. For some HDDs, breaking a sequential 322 - * write stream can lead to lower throughput, so make sure to preserve 323 - * sequential write streams, even if that stream crosses into the next 324 - * zones and these zones are unlocked. 325 - */ 326 - spin_lock_irqsave(&dd->zone_lock, flags); 327 - list_for_each_entry_safe(rq, next, &per_prio->fifo_list[DD_WRITE], 328 - queuelist) { 329 - /* Check whether a prior request exists for the same zone. 
*/ 330 - rb_rq = deadline_from_pos(per_prio, data_dir, blk_rq_pos(rq)); 331 - if (rb_rq && blk_rq_pos(rb_rq) < blk_rq_pos(rq)) 332 - rq = rb_rq; 333 - if (blk_req_can_dispatch_to_zone(rq) && 334 - (blk_queue_nonrot(rq->q) || 335 - !deadline_is_seq_write(dd, rq))) 336 - goto out; 337 - } 338 - rq = NULL; 339 - out: 340 - spin_unlock_irqrestore(&dd->zone_lock, flags); 341 - 342 - return rq; 352 + return rq_entry_fifo(per_prio->fifo_list[data_dir].next); 343 353 } 344 354 345 355 /* ··· 320 390 deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, 321 391 enum dd_data_dir data_dir) 322 392 { 323 - struct request *rq; 324 - unsigned long flags; 325 - 326 - rq = deadline_from_pos(per_prio, data_dir, 327 - per_prio->latest_pos[data_dir]); 328 - if (!rq) 329 - return NULL; 330 - 331 - if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) 332 - return rq; 333 - 334 - /* 335 - * Look for a write request that can be dispatched, that is one with 336 - * an unlocked target zone. For some HDDs, breaking a sequential 337 - * write stream can lead to lower throughput, so make sure to preserve 338 - * sequential write streams, even if that stream crosses into the next 339 - * zones and these zones are unlocked. 340 - */ 341 - spin_lock_irqsave(&dd->zone_lock, flags); 342 - while (rq) { 343 - if (blk_req_can_dispatch_to_zone(rq)) 344 - break; 345 - if (blk_queue_nonrot(rq->q)) 346 - rq = deadline_latter_request(rq); 347 - else 348 - rq = deadline_skip_seq_writes(dd, rq); 349 - } 350 - spin_unlock_irqrestore(&dd->zone_lock, flags); 351 - 352 - return rq; 393 + return deadline_from_pos(per_prio, data_dir, 394 + per_prio->latest_pos[data_dir]); 353 395 } 354 396 355 397 /* ··· 427 525 rq = next_rq; 428 526 } 429 527 430 - /* 431 - * For a zoned block device, if we only have writes queued and none of 432 - * them can be dispatched, rq will be NULL. 
433 - */ 434 528 if (!rq) 435 529 return NULL; 436 530 ··· 447 549 prio = ioprio_class_to_prio[ioprio_class]; 448 550 dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq); 449 551 dd->per_prio[prio].stats.dispatched++; 450 - /* 451 - * If the request needs its target zone locked, do it. 452 - */ 453 - blk_req_zone_write_lock(rq); 454 552 rq->rq_flags |= RQF_STARTED; 455 553 return rq; 456 554 } ··· 616 722 dd->fifo_batch = fifo_batch; 617 723 dd->prio_aging_expire = prio_aging_expire; 618 724 spin_lock_init(&dd->lock); 619 - spin_lock_init(&dd->zone_lock); 620 725 621 726 /* We dispatch from request queue wide instead of hw queue */ 622 727 blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); ··· 697 804 698 805 lockdep_assert_held(&dd->lock); 699 806 700 - /* 701 - * This may be a requeue of a write request that has locked its 702 - * target zone. If it is the case, this releases the zone lock. 703 - */ 704 - blk_req_zone_write_unlock(rq); 705 - 706 807 prio = ioprio_class_to_prio[ioprio_class]; 707 808 per_prio = &dd->per_prio[prio]; 708 809 if (!rq->elv.priv[0]) { ··· 728 841 */ 729 842 rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; 730 843 insert_before = &per_prio->fifo_list[data_dir]; 731 - #ifdef CONFIG_BLK_DEV_ZONED 732 - /* 733 - * Insert zoned writes such that requests are sorted by 734 - * position per zone. 
735 - */ 736 - if (blk_rq_is_seq_zoned_write(rq)) { 737 - struct request *rq2 = deadline_latter_request(rq); 738 - 739 - if (rq2 && blk_rq_zone_no(rq2) == blk_rq_zone_no(rq)) 740 - insert_before = &rq2->queuelist; 741 - } 742 - #endif 743 844 list_add_tail(&rq->queuelist, insert_before); 744 845 } 745 846 } ··· 762 887 rq->elv.priv[0] = NULL; 763 888 } 764 889 765 - static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx) 766 - { 767 - struct deadline_data *dd = hctx->queue->elevator->elevator_data; 768 - enum dd_prio p; 769 - 770 - for (p = 0; p <= DD_PRIO_MAX; p++) 771 - if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE])) 772 - return true; 773 - 774 - return false; 775 - } 776 - 777 890 /* 778 891 * Callback from inside blk_mq_free_request(). 779 - * 780 - * For zoned block devices, write unlock the target zone of 781 - * completed write requests. Do this while holding the zone lock 782 - * spinlock so that the zone is never unlocked while deadline_fifo_request() 783 - * or deadline_next_request() are executing. This function is called for 784 - * all requests, whether or not these requests complete successfully. 785 - * 786 - * For a zoned block device, __dd_dispatch_request() may have stopped 787 - * dispatching requests if all the queued requests are write requests directed 788 - * at zones that are already locked due to on-going write requests. To ensure 789 - * write request dispatch progress in this case, mark the queue as needing a 790 - * restart to ensure that the queue is run again after completion of the 791 - * request and zones being unlocked. 792 892 */ 793 893 static void dd_finish_request(struct request *rq) 794 894 { ··· 778 928 * called dd_insert_requests(). Skip requests that bypassed I/O 779 929 * scheduling. See also blk_mq_request_bypass_insert(). 
780 930 */ 781 - if (!rq->elv.priv[0]) 782 - return; 783 - 784 - atomic_inc(&per_prio->stats.completed); 785 - 786 - if (blk_queue_is_zoned(q)) { 787 - unsigned long flags; 788 - 789 - spin_lock_irqsave(&dd->zone_lock, flags); 790 - blk_req_zone_write_unlock(rq); 791 - spin_unlock_irqrestore(&dd->zone_lock, flags); 792 - 793 - if (dd_has_write_work(rq->mq_hctx)) 794 - blk_mq_sched_mark_restart_hctx(rq->mq_hctx); 795 - } 931 + if (rq->elv.priv[0]) 932 + atomic_inc(&per_prio->stats.completed); 796 933 } 797 934 798 935 static bool dd_has_work_for_prio(struct dd_per_prio *per_prio) ··· 1103 1266 .elevator_attrs = deadline_attrs, 1104 1267 .elevator_name = "mq-deadline", 1105 1268 .elevator_alias = "deadline", 1106 - .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE, 1107 1269 .elevator_owner = THIS_MODULE, 1108 1270 }; 1109 1271 MODULE_ALIAS("mq-deadline-iosched");