Merge tag 'for-6.10/block-20240511' of git://git.kernel.dk/linux

+10 -12

Documentation/ABI/stable/sysfs-block

··· 101 101 devices that support receiving integrity metadata. 102 102 103 103 104 + What: /sys/block/<disk>/partscan 105 + Date: May 2024 106 + Contact: Christoph Hellwig <hch@lst.de> 107 + Description: 108 + The /sys/block/<disk>/partscan file reports if partition 109 + scanning is enabled for the disk. It returns "1" if partition 110 + scanning is enabled, or "0" if not. The value type is a 32-bit 111 + unsigned integer, but only "0" and "1" are valid values. 112 + 113 + 104 114 What: /sys/block/<disk>/<partition>/alignment_offset 105 115 Date: April 2009 106 116 Contact: Martin K. Petersen <martin.petersen@oracle.com> ··· 593 583 example, once to calculate a checksum and once to actually write 594 584 the data. If no such restriction exists, this file will contain 595 585 '0'. This file is writable for testing purposes. 596 - 597 - 598 - What: /sys/block/<disk>/queue/throttle_sample_time 599 - Date: March 2017 600 - Contact: linux-block@vger.kernel.org 601 - Description: 602 - [RW] This is the time window that blk-throttle samples data, in 603 - millisecond. blk-throttle makes decision based on the 604 - samplings. Lower time means cgroups have more smooth throughput, 605 - but higher CPU overhead. This exists only when 606 - CONFIG_BLK_DEV_THROTTLING_LOW is enabled. 607 - 608 586 609 587 What: /sys/block/<disk>/queue/virt_boundary_mask 610 588 Date: April 2021

-1

arch/loongarch/configs/loongson3_defconfig

··· 76 76 CONFIG_MODVERSIONS=y 77 77 CONFIG_BLK_DEV_ZONED=y 78 78 CONFIG_BLK_DEV_THROTTLING=y 79 - CONFIG_BLK_DEV_THROTTLING_LOW=y 80 79 CONFIG_BLK_WBT=y 81 80 CONFIG_BLK_CGROUP_IOLATENCY=y 82 81 CONFIG_BLK_CGROUP_FC_APPID=y

-16

block/Kconfig

··· 100 100 101 101 config BLK_DEV_ZONED 102 102 bool "Zoned block device support" 103 - select MQ_IOSCHED_DEADLINE 104 103 help 105 104 Block layer zoned block device support. This option enables 106 105 support for ZAC/ZBC/ZNS host-managed and host-aware zoned block ··· 118 119 cgroups and specifying per device IO rate policies. 119 120 120 121 See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information. 121 - 122 - config BLK_DEV_THROTTLING_LOW 123 - bool "Block throttling .low limit interface support (EXPERIMENTAL)" 124 - depends on BLK_DEV_THROTTLING 125 - help 126 - Add .low limit interface for block throttling. The low limit is a best 127 - effort limit to prioritize cgroups. Depending on the setting, the limit 128 - can be used to protect cgroups in terms of bandwidth/iops and better 129 - utilize disk resource. 130 - 131 - Note, this is an experimental interface and could be changed someday. 132 122 133 123 config BLK_WBT 134 124 bool "Enable support for block device writeback throttling" ··· 185 197 186 198 Unless you are building a kernel for a tiny system, you should 187 199 say Y here. 188 - 189 - config BLK_DEBUG_FS_ZONED 190 - bool 191 - default BLK_DEBUG_FS && BLK_DEV_ZONED 192 200 193 201 config BLK_SED_OPAL 194 202 bool "Logic for interfacing with Opal enabled SEDs"

-1

block/Makefile

··· 33 33 obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o 34 34 obj-$(CONFIG_BLK_WBT) += blk-wbt.o 35 35 obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o 36 - obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o 37 36 obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o 38 37 obj-$(CONFIG_BLK_PM) += blk-pm.o 39 38 obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \

+41 -9

block/bio.c

··· 345 345 } 346 346 EXPORT_SYMBOL(bio_chain); 347 347 348 + /** 349 + * bio_chain_and_submit - submit a bio after chaining it to another one 350 + * @prev: bio to chain and submit 351 + * @new: bio to chain to 352 + * 353 + * If @prev is non-NULL, chain it to @new and submit it. 354 + * 355 + * Return: @new. 356 + */ 357 + struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new) 358 + { 359 + if (prev) { 360 + bio_chain(prev, new); 361 + submit_bio(prev); 362 + } 363 + return new; 364 + } 365 + 348 366 struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, 349 367 unsigned int nr_pages, blk_opf_t opf, gfp_t gfp) 350 368 { 351 - struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp); 352 - 353 - if (bio) { 354 - bio_chain(bio, new); 355 - submit_bio(bio); 356 - } 357 - 358 - return new; 369 + return bio_chain_and_submit(bio, bio_alloc(bdev, nr_pages, opf, gfp)); 359 370 } 360 371 EXPORT_SYMBOL_GPL(blk_next_bio); 361 372 ··· 1395 1384 } 1396 1385 EXPORT_SYMBOL(submit_bio_wait); 1397 1386 1387 + static void bio_wait_end_io(struct bio *bio) 1388 + { 1389 + complete(bio->bi_private); 1390 + bio_put(bio); 1391 + } 1392 + 1393 + /* 1394 + * bio_await_chain - ends @bio and waits for every chained bio to complete 1395 + */ 1396 + void bio_await_chain(struct bio *bio) 1397 + { 1398 + DECLARE_COMPLETION_ONSTACK_MAP(done, 1399 + bio->bi_bdev->bd_disk->lockdep_map); 1400 + 1401 + bio->bi_private = &done; 1402 + bio->bi_end_io = bio_wait_end_io; 1403 + bio_endio(bio); 1404 + blk_wait_io(&done); 1405 + } 1406 + 1398 1407 void __bio_advance(struct bio *bio, unsigned bytes) 1399 1408 { 1400 1409 if (bio_integrity(bio)) ··· 1607 1576 if (!bio_integrity_endio(bio)) 1608 1577 return; 1609 1578 1579 + blk_zone_bio_endio(bio); 1580 + 1610 1581 rq_qos_done_bio(bio); 1611 1582 1612 1583 if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { ··· 1629 1596 goto again; 1630 1597 } 1631 1598 1632 - blk_throtl_bio_endio(bio); 1633 1599 /* release cgroup info */ 
1634 1600 bio_uninit(bio); 1635 1601 if (bio->bi_end_io)

+6 -12

block/blk-cgroup-rwstat.c

··· 9 9 { 10 10 int i, ret; 11 11 12 - for (i = 0; i < BLKG_RWSTAT_NR; i++) { 13 - ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp); 14 - if (ret) { 15 - while (--i >= 0) 16 - percpu_counter_destroy(&rwstat->cpu_cnt[i]); 17 - return ret; 18 - } 12 + ret = percpu_counter_init_many(rwstat->cpu_cnt, 0, gfp, BLKG_RWSTAT_NR); 13 + if (ret) 14 + return ret; 15 + 16 + for (i = 0; i < BLKG_RWSTAT_NR; i++) 19 17 atomic64_set(&rwstat->aux_cnt[i], 0); 20 - } 21 18 return 0; 22 19 } 23 20 EXPORT_SYMBOL_GPL(blkg_rwstat_init); 24 21 25 22 void blkg_rwstat_exit(struct blkg_rwstat *rwstat) 26 23 { 27 - int i; 28 - 29 - for (i = 0; i < BLKG_RWSTAT_NR; i++) 30 - percpu_counter_destroy(&rwstat->cpu_cnt[i]); 24 + percpu_counter_destroy_many(rwstat->cpu_cnt, BLKG_RWSTAT_NR); 31 25 } 32 26 EXPORT_SYMBOL_GPL(blkg_rwstat_exit); 33 27

+1 -8

block/blk-cgroup.c

··· 218 218 219 219 /* as long as there are pending bios, @blkg can't go away */ 220 220 spin_lock(&blkg->async_bio_lock); 221 - bio_list_merge(&bios, &blkg->async_bios); 222 - bio_list_init(&blkg->async_bios); 221 + bio_list_merge_init(&bios, &blkg->async_bios); 223 222 spin_unlock(&blkg->async_bio_lock); 224 223 225 224 /* start plug only when bio_list contains at least 2 bios */ ··· 1443 1444 if (ret) 1444 1445 goto err_destroy_all; 1445 1446 1446 - ret = blk_throtl_init(disk); 1447 - if (ret) 1448 - goto err_ioprio_exit; 1449 - 1450 1447 return 0; 1451 1448 1452 - err_ioprio_exit: 1453 - blk_ioprio_exit(disk); 1454 1449 err_destroy_all: 1455 1450 blkg_destroy_all(disk); 1456 1451 return ret;

+13 -13

block/blk-core.c

··· 591 591 return BLK_STS_NOTSUPP; 592 592 593 593 /* The bio sector must point to the start of a sequential zone */ 594 - if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector) || 595 - !bio_zone_is_seq(bio)) 594 + if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector)) 596 595 return BLK_STS_IOERR; 597 596 598 597 /* ··· 603 604 return BLK_STS_IOERR; 604 605 605 606 /* Make sure the BIO is small enough and will not get split */ 606 - if (nr_sectors > q->limits.max_zone_append_sectors) 607 + if (nr_sectors > queue_max_zone_append_sectors(q)) 607 608 return BLK_STS_IOERR; 608 609 609 610 bio->bi_opf |= REQ_NOMERGE; ··· 648 649 static void __submit_bio_noacct(struct bio *bio) 649 650 { 650 651 struct bio_list bio_list_on_stack[2]; 652 + struct blk_plug plug; 651 653 652 654 BUG_ON(bio->bi_next); 653 655 654 656 bio_list_init(&bio_list_on_stack[0]); 655 657 current->bio_list = bio_list_on_stack; 658 + blk_start_plug(&plug); 656 659 657 660 do { 658 661 struct request_queue *q = bdev_get_queue(bio->bi_bdev); ··· 688 687 bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); 689 688 } while ((bio = bio_list_pop(&bio_list_on_stack[0]))); 690 689 690 + blk_finish_plug(&plug); 691 691 current->bio_list = NULL; 692 692 } 693 693 694 694 static void __submit_bio_noacct_mq(struct bio *bio) 695 695 { 696 696 struct bio_list bio_list[2] = { }; 697 + struct blk_plug plug; 697 698 698 699 current->bio_list = bio_list; 700 + blk_start_plug(&plug); 699 701 700 702 do { 701 703 __submit_bio(bio); 702 704 } while ((bio = bio_list_pop(&bio_list[0]))); 703 705 706 + blk_finish_plug(&plug); 704 707 current->bio_list = NULL; 705 708 } 706 709 ··· 915 910 !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) 916 911 return 0; 917 912 918 - /* 919 - * As the requests that require a zone lock are not plugged in the 920 - * first place, directly accessing the plug instead of using 921 - * blk_mq_plug() should not have any consequences during flushing for 922 - * zoned devices. 
923 - */ 924 913 blk_flush_plug(current->plug, false); 925 914 926 915 /* ··· 986 987 unsigned long stamp; 987 988 again: 988 989 stamp = READ_ONCE(part->bd_stamp); 989 - if (unlikely(time_after(now, stamp))) { 990 - if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now))) 991 - __part_stat_add(part, io_ticks, end ? now - stamp : 1); 992 - } 990 + if (unlikely(time_after(now, stamp)) && 991 + likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) && 992 + (end || part_in_flight(part))) 993 + __part_stat_add(part, io_ticks, now - stamp); 994 + 993 995 if (part->bd_partno) { 994 996 part = bdev_whole(part); 995 997 goto again;

+2

block/blk-flush.c

··· 130 130 * original @rq->bio. Restore it. 131 131 */ 132 132 rq->bio = rq->biotail; 133 + if (rq->bio) 134 + rq->__sector = rq->bio->bi_iter.bi_sector; 133 135 134 136 /* make @rq a normal request */ 135 137 rq->rq_flags &= ~RQF_FLUSH_SEQ;

+29 -41

block/blk-lib.c

··· 35 35 return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT; 36 36 } 37 37 38 + struct bio *blk_alloc_discard_bio(struct block_device *bdev, 39 + sector_t *sector, sector_t *nr_sects, gfp_t gfp_mask) 40 + { 41 + sector_t bio_sects = min(*nr_sects, bio_discard_limit(bdev, *sector)); 42 + struct bio *bio; 43 + 44 + if (!bio_sects) 45 + return NULL; 46 + 47 + bio = bio_alloc(bdev, 0, REQ_OP_DISCARD, gfp_mask); 48 + if (!bio) 49 + return NULL; 50 + bio->bi_iter.bi_sector = *sector; 51 + bio->bi_iter.bi_size = bio_sects << SECTOR_SHIFT; 52 + *sector += bio_sects; 53 + *nr_sects -= bio_sects; 54 + /* 55 + * We can loop for a long time in here if someone does full device 56 + * discards (like mkfs). Be nice and allow us to schedule out to avoid 57 + * softlocking if preempt is disabled. 58 + */ 59 + cond_resched(); 60 + return bio; 61 + } 62 + 38 63 int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, 39 64 sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) 40 65 { 41 - struct bio *bio = *biop; 42 - sector_t bs_mask; 66 + struct bio *bio; 43 67 44 - if (bdev_read_only(bdev)) 45 - return -EPERM; 46 - if (!bdev_max_discard_sectors(bdev)) 47 - return -EOPNOTSUPP; 48 - 49 - /* In case the discard granularity isn't set by buggy device driver */ 50 - if (WARN_ON_ONCE(!bdev_discard_granularity(bdev))) { 51 - pr_err_ratelimited("%pg: Error: discard_granularity is 0.\n", 52 - bdev); 53 - return -EOPNOTSUPP; 54 - } 55 - 56 - bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; 57 - if ((sector | nr_sects) & bs_mask) 58 - return -EINVAL; 59 - 60 - if (!nr_sects) 61 - return -EINVAL; 62 - 63 - while (nr_sects) { 64 - sector_t req_sects = 65 - min(nr_sects, bio_discard_limit(bdev, sector)); 66 - 67 - bio = blk_next_bio(bio, bdev, 0, REQ_OP_DISCARD, gfp_mask); 68 - bio->bi_iter.bi_sector = sector; 69 - bio->bi_iter.bi_size = req_sects << 9; 70 - sector += req_sects; 71 - nr_sects -= req_sects; 72 - 73 - /* 74 - * We can loop for a long time in here, 
if someone does 75 - * full device discards (like mkfs). Be nice and allow 76 - * us to schedule out to avoid softlocking if preempt 77 - * is disabled. 78 - */ 79 - cond_resched(); 80 - } 81 - 82 - *biop = bio; 68 + while ((bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects, 69 + gfp_mask))) 70 + *biop = bio_chain_and_submit(*biop, bio); 83 71 return 0; 84 72 } 85 73 EXPORT_SYMBOL(__blkdev_issue_discard);

+16 -9

block/blk-merge.c

··· 377 377 blkcg_bio_issue_init(split); 378 378 bio_chain(split, bio); 379 379 trace_block_split(split, bio->bi_iter.bi_sector); 380 + WARN_ON_ONCE(bio_zone_write_plugging(bio)); 380 381 submit_bio_noacct(bio); 381 382 return split; 382 383 } ··· 780 779 if (blk_do_io_stat(req)) { 781 780 part_stat_lock(); 782 781 part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); 782 + part_stat_local_dec(req->part, 783 + in_flight[op_is_write(req_op(req))]); 783 784 part_stat_unlock(); 784 785 } 785 786 } ··· 975 972 part_stat_unlock(); 976 973 } 977 974 978 - enum bio_merge_status { 979 - BIO_MERGE_OK, 980 - BIO_MERGE_NONE, 981 - BIO_MERGE_FAILED, 982 - }; 983 - 984 - static enum bio_merge_status bio_attempt_back_merge(struct request *req, 975 + enum bio_merge_status bio_attempt_back_merge(struct request *req, 985 976 struct bio *bio, unsigned int nr_segs) 986 977 { 987 978 const blk_opf_t ff = bio_failfast(bio); ··· 991 994 992 995 blk_update_mixed_merge(req, bio, false); 993 996 997 + if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING) 998 + blk_zone_write_plug_bio_merged(bio); 999 + 994 1000 req->biotail->bi_next = bio; 995 1001 req->biotail = bio; 996 1002 req->__data_len += bio->bi_iter.bi_size; ··· 1008 1008 struct bio *bio, unsigned int nr_segs) 1009 1009 { 1010 1010 const blk_opf_t ff = bio_failfast(bio); 1011 + 1012 + /* 1013 + * A front merge for writes to sequential zones of a zoned block device 1014 + * can happen only if the user submitted writes out of order. Do not 1015 + * merge such a write, so that it fails. 
1016 + */ 1017 + if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING) 1018 + return BIO_MERGE_FAILED; 1011 1019 1012 1020 if (!ll_front_merge_fn(req, bio, nr_segs)) 1013 1021 return BIO_MERGE_FAILED; ··· 1115 1107 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 1116 1108 unsigned int nr_segs) 1117 1109 { 1118 - struct blk_plug *plug; 1110 + struct blk_plug *plug = current->plug; 1119 1111 struct request *rq; 1120 1112 1121 - plug = blk_mq_plug(bio); 1122 1113 if (!plug || rq_list_empty(plug->mq_list)) 1123 1114 return false; 1124 1115

-22

block/blk-mq-debugfs-zoned.c

··· 1 - // SPDX-License-Identifier: GPL-2.0 2 - /* 3 - * Copyright (C) 2017 Western Digital Corporation or its affiliates. 4 - */ 5 - 6 - #include <linux/blkdev.h> 7 - #include "blk-mq-debugfs.h" 8 - 9 - int queue_zone_wlock_show(void *data, struct seq_file *m) 10 - { 11 - struct request_queue *q = data; 12 - unsigned int i; 13 - 14 - if (!q->disk->seq_zones_wlock) 15 - return 0; 16 - 17 - for (i = 0; i < q->disk->nr_zones; i++) 18 - if (test_bit(i, q->disk->seq_zones_wlock)) 19 - seq_printf(m, "%u\n", i); 20 - 21 - return 0; 22 - }

+1 -2

block/blk-mq-debugfs.c

··· 160 160 { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, 161 161 { "pm_only", 0600, queue_pm_only_show, NULL }, 162 162 { "state", 0600, queue_state_show, queue_state_write }, 163 - { "zone_wlock", 0400, queue_zone_wlock_show, NULL }, 163 + { "zone_wplugs", 0400, queue_zone_wplugs_show, NULL }, 164 164 { }, 165 165 }; 166 166 ··· 256 256 RQF_NAME(HASHED), 257 257 RQF_NAME(STATS), 258 258 RQF_NAME(SPECIAL_PAYLOAD), 259 - RQF_NAME(ZONE_WRITE_LOCKED), 260 259 RQF_NAME(TIMED_OUT), 261 260 RQF_NAME(RESV), 262 261 };

+3 -3

block/blk-mq-debugfs.h

··· 83 83 } 84 84 #endif 85 85 86 - #ifdef CONFIG_BLK_DEBUG_FS_ZONED 87 - int queue_zone_wlock_show(void *data, struct seq_file *m); 86 + #if defined(CONFIG_BLK_DEV_ZONED) && defined(CONFIG_BLK_DEBUG_FS) 87 + int queue_zone_wplugs_show(void *data, struct seq_file *m); 88 88 #else 89 - static inline int queue_zone_wlock_show(void *data, struct seq_file *m) 89 + static inline int queue_zone_wplugs_show(void *data, struct seq_file *m) 90 90 { 91 91 return 0; 92 92 }

+107 -77

block/blk-mq.c

··· 28 28 #include <linux/prefetch.h> 29 29 #include <linux/blk-crypto.h> 30 30 #include <linux/part_stat.h> 31 + #include <linux/sched/isolation.h> 31 32 32 33 #include <trace/events/block.h> 33 34 ··· 691 690 { 692 691 struct request_queue *q = rq->q; 693 692 693 + blk_zone_finish_request(rq); 694 + 694 695 if (rq->rq_flags & RQF_USE_SCHED) { 695 696 q->elevator->type->ops.finish_request(rq); 696 697 /* ··· 764 761 } 765 762 EXPORT_SYMBOL(blk_dump_rq_flags); 766 763 767 - static void req_bio_endio(struct request *rq, struct bio *bio, 768 - unsigned int nbytes, blk_status_t error) 769 - { 770 - if (unlikely(error)) { 771 - bio->bi_status = error; 772 - } else if (req_op(rq) == REQ_OP_ZONE_APPEND) { 773 - /* 774 - * Partial zone append completions cannot be supported as the 775 - * BIO fragments may end up not being written sequentially. 776 - */ 777 - if (bio->bi_iter.bi_size != nbytes) 778 - bio->bi_status = BLK_STS_IOERR; 779 - else 780 - bio->bi_iter.bi_sector = rq->__sector; 781 - } 782 - 783 - bio_advance(bio, nbytes); 784 - 785 - if (unlikely(rq->rq_flags & RQF_QUIET)) 786 - bio_set_flag(bio, BIO_QUIET); 787 - /* don't actually finish bio if it's part of flush sequence */ 788 - if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) 789 - bio_endio(bio); 790 - } 791 - 792 764 static void blk_account_io_completion(struct request *req, unsigned int bytes) 793 765 { 794 766 if (req->part && blk_do_io_stat(req)) { ··· 823 845 /* Completion has already been traced */ 824 846 bio_clear_flag(bio, BIO_TRACE_COMPLETION); 825 847 826 - if (req_op(req) == REQ_OP_ZONE_APPEND) 827 - bio->bi_iter.bi_sector = req->__sector; 848 + blk_zone_update_request_bio(req, bio); 828 849 829 850 if (!is_flush) 830 851 bio_endio(bio); ··· 866 889 bool blk_update_request(struct request *req, blk_status_t error, 867 890 unsigned int nr_bytes) 868 891 { 892 + bool is_flush = req->rq_flags & RQF_FLUSH_SEQ; 893 + bool quiet = req->rq_flags & RQF_QUIET; 869 894 int total_bytes; 870 
895 871 896 trace_block_rq_complete(req, error, nr_bytes); ··· 888 909 if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req)) 889 910 __blk_crypto_rq_put_keyslot(req); 890 911 891 - if (unlikely(error && !blk_rq_is_passthrough(req) && 892 - !(req->rq_flags & RQF_QUIET)) && 893 - !test_bit(GD_DEAD, &req->q->disk->state)) { 912 + if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) && 913 + !test_bit(GD_DEAD, &req->q->disk->state)) { 894 914 blk_print_req_error(req, error); 895 915 trace_block_rq_error(req, error, nr_bytes); 896 916 } ··· 901 923 struct bio *bio = req->bio; 902 924 unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); 903 925 904 - if (bio_bytes == bio->bi_iter.bi_size) 926 + if (unlikely(error)) 927 + bio->bi_status = error; 928 + 929 + if (bio_bytes == bio->bi_iter.bi_size) { 905 930 req->bio = bio->bi_next; 931 + } else if (bio_is_zone_append(bio) && error == BLK_STS_OK) { 932 + /* 933 + * Partial zone append completions cannot be supported 934 + * as the BIO fragments may end up not being written 935 + * sequentially. 
936 + */ 937 + bio->bi_status = BLK_STS_IOERR; 938 + } 906 939 907 940 /* Completion has already been traced */ 908 941 bio_clear_flag(bio, BIO_TRACE_COMPLETION); 909 - req_bio_endio(req, bio, bio_bytes, error); 942 + if (unlikely(quiet)) 943 + bio_set_flag(bio, BIO_QUIET); 944 + 945 + bio_advance(bio, bio_bytes); 946 + 947 + /* Don't actually finish bio if it's part of flush sequence */ 948 + if (!bio->bi_iter.bi_size) { 949 + blk_zone_update_request_bio(req, bio); 950 + if (!is_flush) 951 + bio_endio(bio); 952 + } 910 953 911 954 total_bytes += bio_bytes; 912 955 nr_bytes -= bio_bytes; ··· 996 997 update_io_ticks(req->part, jiffies, true); 997 998 part_stat_inc(req->part, ios[sgrp]); 998 999 part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); 1000 + part_stat_local_dec(req->part, 1001 + in_flight[op_is_write(req_op(req))]); 999 1002 part_stat_unlock(); 1000 1003 } 1001 1004 } ··· 1020 1019 1021 1020 part_stat_lock(); 1022 1021 update_io_ticks(req->part, jiffies, false); 1022 + part_stat_local_inc(req->part, 1023 + in_flight[op_is_write(req_op(req))]); 1023 1024 part_stat_unlock(); 1024 1025 } 1025 1026 } ··· 1333 1330 1334 1331 blk_account_io_start(rq); 1335 1332 1336 - /* 1337 - * As plugging can be enabled for passthrough requests on a zoned 1338 - * device, directly accessing the plug instead of using blk_mq_plug() 1339 - * should not have any consequences. 1340 - */ 1341 1333 if (current->plug && !at_head) { 1342 1334 blk_add_rq_to_plug(current->plug, rq); 1343 1335 return; ··· 1919 1921 __blk_mq_requeue_request(rq); 1920 1922 } 1921 1923 1922 - static void blk_mq_handle_zone_resource(struct request *rq, 1923 - struct list_head *zone_list) 1924 - { 1925 - /* 1926 - * If we end up here it is because we cannot dispatch a request to a 1927 - * specific zone due to LLD level zone-write locking or other zone 1928 - * related resource not being available. In this case, set the request 1929 - * aside in zone_list for retrying it later. 
1930 - */ 1931 - list_add(&rq->queuelist, zone_list); 1932 - __blk_mq_requeue_request(rq); 1933 - } 1934 - 1935 1924 enum prep_dispatch { 1936 1925 PREP_DISPATCH_OK, 1937 1926 PREP_DISPATCH_NO_TAG, ··· 2004 2019 struct request *rq; 2005 2020 int queued; 2006 2021 blk_status_t ret = BLK_STS_OK; 2007 - LIST_HEAD(zone_list); 2008 2022 bool needs_resource = false; 2009 2023 2010 2024 if (list_empty(list)) ··· 2045 2061 case BLK_STS_DEV_RESOURCE: 2046 2062 blk_mq_handle_dev_resource(rq, list); 2047 2063 goto out; 2048 - case BLK_STS_ZONE_RESOURCE: 2049 - /* 2050 - * Move the request to zone_list and keep going through 2051 - * the dispatch list to find more requests the drive can 2052 - * accept. 2053 - */ 2054 - blk_mq_handle_zone_resource(rq, &zone_list); 2055 - needs_resource = true; 2056 - break; 2057 2064 default: 2058 2065 blk_mq_end_request(rq, ret); 2059 2066 } 2060 2067 } while (!list_empty(list)); 2061 2068 out: 2062 - if (!list_empty(&zone_list)) 2063 - list_splice_tail_init(&zone_list, list); 2064 - 2065 2069 /* If we didn't flush the entire list, we could have told the driver 2066 2070 * there was more coming, but that turned out to be a lie. 2067 2071 */ ··· 2136 2164 } 2137 2165 2138 2166 /* 2167 + * ->next_cpu is always calculated from hctx->cpumask, so simply use 2168 + * it for speeding up the check 2169 + */ 2170 + static bool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx) 2171 + { 2172 + return hctx->next_cpu >= nr_cpu_ids; 2173 + } 2174 + 2175 + /* 2139 2176 * It'd be great if the workqueue API had a way to pass 2140 2177 * in a mask and had some smarts for more clever placement. 
2141 2178 * For now we just round-robin here, switching for every ··· 2155 2174 bool tried = false; 2156 2175 int next_cpu = hctx->next_cpu; 2157 2176 2158 - if (hctx->queue->nr_hw_queues == 1) 2177 + /* Switch to unbound if no allowable CPUs in this hctx */ 2178 + if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx)) 2159 2179 return WORK_CPU_UNBOUND; 2160 2180 2161 2181 if (--hctx->next_cpu_batch <= 0) { ··· 2930 2948 void blk_mq_submit_bio(struct bio *bio) 2931 2949 { 2932 2950 struct request_queue *q = bdev_get_queue(bio->bi_bdev); 2933 - struct blk_plug *plug = blk_mq_plug(bio); 2951 + struct blk_plug *plug = current->plug; 2934 2952 const int is_sync = op_is_sync(bio->bi_opf); 2935 2953 struct blk_mq_hw_ctx *hctx; 2936 2954 unsigned int nr_segs = 1; 2937 2955 struct request *rq; 2938 2956 blk_status_t ret; 2939 2957 2958 + /* 2959 + * If the plug has a cached request for this queue, try to use it. 2960 + */ 2961 + rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); 2962 + 2963 + /* 2964 + * A BIO that was released from a zone write plug has already been 2965 + * through the preparation in this function, already holds a reference 2966 + * on the queue usage counter, and is the only write BIO in-flight for 2967 + * the target zone. Go straight to preparing a request for it. 2968 + */ 2969 + if (bio_zone_write_plugging(bio)) { 2970 + nr_segs = bio->__bi_nr_segments; 2971 + if (rq) 2972 + blk_queue_exit(q); 2973 + goto new_request; 2974 + } 2975 + 2940 2976 bio = blk_queue_bounce(bio, q); 2941 2977 2942 2978 /* 2943 - * If the plug has a cached request for this queue, try use it. 2944 - * 2945 2979 * The cached request already holds a q_usage_counter reference and we 2946 2980 * don't have to acquire a new one if we use it. 
2947 2981 */ 2948 - rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); 2949 2982 if (!rq) { 2950 2983 if (unlikely(bio_queue_enter(bio))) 2951 2984 return; ··· 2977 2980 if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) 2978 2981 goto queue_exit; 2979 2982 2983 + if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs)) 2984 + goto queue_exit; 2985 + 2986 + new_request: 2980 2987 if (!rq) { 2981 2988 rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); 2982 2989 if (unlikely(!rq)) ··· 3002 3001 blk_mq_free_request(rq); 3003 3002 return; 3004 3003 } 3004 + 3005 + if (bio_zone_write_plugging(bio)) 3006 + blk_zone_write_plug_init_request(rq); 3005 3007 3006 3008 if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) 3007 3009 return; ··· 3487 3483 return data.has_rq; 3488 3484 } 3489 3485 3490 - static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu, 3491 - struct blk_mq_hw_ctx *hctx) 3486 + static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx, 3487 + unsigned int this_cpu) 3492 3488 { 3493 - if (cpumask_first_and(hctx->cpumask, cpu_online_mask) != cpu) 3494 - return false; 3495 - if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids) 3496 - return false; 3497 - return true; 3489 + enum hctx_type type = hctx->type; 3490 + int cpu; 3491 + 3492 + /* 3493 + * hctx->cpumask has to rule out isolated CPUs, but userspace still 3494 + * might submit IOs on these isolated CPUs, so use the queue map to 3495 + * check if all CPUs mapped to this hctx are offline 3496 + */ 3497 + for_each_online_cpu(cpu) { 3498 + struct blk_mq_hw_ctx *h = blk_mq_map_queue_type(hctx->queue, 3499 + type, cpu); 3500 + 3501 + if (h != hctx) 3502 + continue; 3503 + 3504 + /* this hctx has at least one online CPU */ 3505 + if (this_cpu != cpu) 3506 + return true; 3507 + } 3508 + 3509 + return false; 3498 3510 } 3499 3511 3500 3512 static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) ··· 3518 3498 struct blk_mq_hw_ctx *hctx = 
hlist_entry_safe(node, 3519 3499 struct blk_mq_hw_ctx, cpuhp_online); 3520 3500 3521 - if (!cpumask_test_cpu(cpu, hctx->cpumask) || 3522 - !blk_mq_last_cpu_in_hctx(cpu, hctx)) 3501 + if (blk_mq_hctx_has_online_cpu(hctx, cpu)) 3523 3502 return 0; 3524 3503 3525 3504 /* ··· 3926 3907 } 3927 3908 3928 3909 queue_for_each_hw_ctx(q, hctx, i) { 3910 + int cpu; 3911 + 3929 3912 /* 3930 3913 * If no software queues are mapped to this hardware queue, 3931 3914 * disable it and free the request entries. ··· 3953 3932 * over all possibly mapped software queues. 3954 3933 */ 3955 3934 sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); 3935 + 3936 + /* 3937 + * Rule out isolated CPUs from hctx->cpumask to avoid 3938 + * running block kworker on isolated CPUs 3939 + */ 3940 + for_each_cpu(cpu, hctx->cpumask) { 3941 + if (cpu_is_isolated(cpu)) 3942 + cpumask_clear_cpu(cpu, hctx->cpumask); 3943 + } 3956 3944 3957 3945 /* 3958 3946 * Initialize batch roundrobin counts

-31

block/blk-mq.h

··· 365 365 qmap->mq_map[cpu] = 0; 366 366 } 367 367 368 - /* 369 - * blk_mq_plug() - Get caller context plug 370 - * @bio : the bio being submitted by the caller context 371 - * 372 - * Plugging, by design, may delay the insertion of BIOs into the elevator in 373 - * order to increase BIO merging opportunities. This however can cause BIO 374 - * insertion order to change from the order in which submit_bio() is being 375 - * executed in the case of multiple contexts concurrently issuing BIOs to a 376 - * device, even if these context are synchronized to tightly control BIO issuing 377 - * order. While this is not a problem with regular block devices, this ordering 378 - * change can cause write BIO failures with zoned block devices as these 379 - * require sequential write patterns to zones. Prevent this from happening by 380 - * ignoring the plug state of a BIO issuing context if it is for a zoned block 381 - * device and the BIO to plug is a write operation. 382 - * 383 - * Return current->plug if the bio can be plugged and NULL otherwise 384 - */ 385 - static inline struct blk_plug *blk_mq_plug( struct bio *bio) 386 - { 387 - /* Zoned block device write operation case: do not plug the BIO */ 388 - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && 389 - bdev_op_is_zoned_write(bio->bi_bdev, bio_op(bio))) 390 - return NULL; 391 - 392 - /* 393 - * For regular block devices or read operations, use the context plug 394 - * which may be NULL if blk_start_plug() was not executed. 395 - */ 396 - return current->plug; 397 - } 398 - 399 368 /* Free all requests on the list */ 400 369 static inline void blk_mq_free_requests(struct list_head *list) 401 370 {

+19 -27

block/blk-settings.c

··· 411 411 * blk_queue_max_zone_append_sectors - set max sectors for a single zone append 412 412 * @q: the request queue for the device 413 413 * @max_zone_append_sectors: maximum number of sectors to write per command 414 + * 415 + * Sets the maximum number of sectors allowed for zone append commands. 416 + * Specifying 0 for @max_zone_append_sectors indicates that the queue does 417 + * not natively support zone append operations and that the block layer must 418 + * emulate these operations using regular writes. 414 419 **/ 415 420 void blk_queue_max_zone_append_sectors(struct request_queue *q, 416 421 unsigned int max_zone_append_sectors) 417 422 { 418 - unsigned int max_sectors; 423 + unsigned int max_sectors = 0; 419 424 420 425 if (WARN_ON(!blk_queue_is_zoned(q))) 421 426 return; 422 427 423 - max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors); 424 - max_sectors = min(q->limits.chunk_sectors, max_sectors); 428 + if (max_zone_append_sectors) { 429 + max_sectors = min(q->limits.max_hw_sectors, 430 + max_zone_append_sectors); 431 + max_sectors = min(q->limits.chunk_sectors, max_sectors); 425 432 426 - /* 427 - * Signal eventual driver bugs resulting in the max_zone_append sectors limit 428 - * being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set, 429 - * or the max_hw_sectors limit not set. 430 - */ 431 - WARN_ON(!max_sectors); 433 + /* 434 + * Signal eventual driver bugs resulting in the max_zone_append 435 + * sectors limit being 0 due to the chunk_sectors limit (zone 436 + * size) not set or the max_hw_sectors limit not set. 
437 + */ 438 + WARN_ON_ONCE(!max_sectors); 439 + } 432 440 433 441 q->limits.max_zone_append_sectors = max_sectors; 434 442 } ··· 763 755 t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); 764 756 t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, 765 757 b->max_write_zeroes_sectors); 766 - t->max_zone_append_sectors = min(t->max_zone_append_sectors, 767 - b->max_zone_append_sectors); 758 + t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t), 759 + queue_limits_max_zone_append_sectors(b)); 768 760 t->bounce = max(t->bounce, b->bounce); 769 761 770 762 t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, ··· 1050 1042 blk_queue_flag_clear(QUEUE_FLAG_FUA, q); 1051 1043 } 1052 1044 EXPORT_SYMBOL_GPL(blk_queue_write_cache); 1053 - 1054 - /** 1055 - * blk_queue_required_elevator_features - Set a queue required elevator features 1056 - * @q: the request queue for the target device 1057 - * @features: Required elevator features OR'ed together 1058 - * 1059 - * Tell the block layer that for the device controlled through @q, only the 1060 - * only elevators that can be used are those that implement at least the set of 1061 - * features specified by @features. 1062 - */ 1063 - void blk_queue_required_elevator_features(struct request_queue *q, 1064 - unsigned int features) 1065 - { 1066 - q->required_elevator_features = features; 1067 - } 1068 - EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features); 1069 1045 1070 1046 /** 1071 1047 * blk_queue_can_use_dma_map_merging - configure queue for merging segments.

-3

block/blk-stat.c

··· 57 57 58 58 value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0; 59 59 60 - if (req_op(rq) == REQ_OP_READ || req_op(rq) == REQ_OP_WRITE) 61 - blk_throtl_stat_add(rq, value); 62 - 63 60 rcu_read_lock(); 64 61 cpu = get_cpu(); 65 62 list_for_each_entry_rcu(cb, &q->stats->callbacks, list) {

+1 -9

block/blk-sysfs.c

··· 224 224 225 225 static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page) 226 226 { 227 - unsigned long long max_sectors = q->limits.max_zone_append_sectors; 227 + unsigned long long max_sectors = queue_max_zone_append_sectors(q); 228 228 229 229 return sprintf(page, "%llu\n", max_sectors << SECTOR_SHIFT); 230 230 } ··· 516 516 QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); 517 517 QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment"); 518 518 519 - #ifdef CONFIG_BLK_DEV_THROTTLING_LOW 520 - QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time"); 521 - #endif 522 - 523 519 /* legacy alias for logical_block_size: */ 524 520 static struct queue_sysfs_entry queue_hw_sector_size_entry = { 525 521 .attr = {.name = "hw_sector_size", .mode = 0444 }, ··· 636 640 &queue_fua_entry.attr, 637 641 &queue_dax_entry.attr, 638 642 &queue_poll_delay_entry.attr, 639 - #ifdef CONFIG_BLK_DEV_THROTTLING_LOW 640 - &blk_throtl_sample_time_entry.attr, 641 - #endif 642 643 &queue_virt_boundary_mask_entry.attr, 643 644 &queue_dma_alignment_entry.attr, 644 645 NULL, ··· 807 814 808 815 blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); 809 816 wbt_enable_default(disk); 810 - blk_throtl_register(disk); 811 817 812 818 /* Now everything is ready and send out KOBJ_ADD uevent */ 813 819 kobject_uevent(&disk->queue_kobj, KOBJ_ADD);

+120 -899

block/blk-throttle.c

··· 25 25 #define DFL_THROTL_SLICE_HD (HZ / 10) 26 26 #define DFL_THROTL_SLICE_SSD (HZ / 50) 27 27 #define MAX_THROTL_SLICE (HZ) 28 - #define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */ 29 - #define MIN_THROTL_BPS (320 * 1024) 30 - #define MIN_THROTL_IOPS (10) 31 - #define DFL_LATENCY_TARGET (-1L) 32 - #define DFL_IDLE_THRESHOLD (0) 33 - #define DFL_HD_BASELINE_LATENCY (4000L) /* 4ms */ 34 - #define LATENCY_FILTERED_SSD (0) 35 - /* 36 - * For HD, very small latency comes from sequential IO. Such IO is helpless to 37 - * help determine if its IO is impacted by others, hence we ignore the IO 38 - */ 39 - #define LATENCY_FILTERED_HD (1000L) /* 1ms */ 40 28 41 29 /* A workqueue to queue throttle related work */ 42 30 static struct workqueue_struct *kthrotld_workqueue; ··· 58 70 59 71 /* Work for dispatching throttled bios */ 60 72 struct work_struct dispatch_work; 61 - unsigned int limit_index; 62 - bool limit_valid[LIMIT_CNT]; 63 - 64 - unsigned long low_upgrade_time; 65 - unsigned long low_downgrade_time; 66 - 67 - unsigned int scale; 68 - 69 - struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE]; 70 - struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE]; 71 - struct latency_bucket __percpu *latency_buckets[2]; 72 - unsigned long last_calculate_time; 73 - unsigned long filtered_latency; 74 73 75 74 bool track_bio_latency; 76 75 }; ··· 101 126 return container_of(sq, struct throtl_data, service_queue); 102 127 } 103 128 104 - /* 105 - * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to 106 - * make the IO dispatch more smooth. 107 - * Scale up: linearly scale up according to elapsed time since upgrade. 
For 108 - * every throtl_slice, the limit scales up 1/2 .low limit till the 109 - * limit hits .max limit 110 - * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit 111 - */ 112 - static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td) 113 - { 114 - /* arbitrary value to avoid too big scale */ 115 - if (td->scale < 4096 && time_after_eq(jiffies, 116 - td->low_upgrade_time + td->scale * td->throtl_slice)) 117 - td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice; 118 - 119 - return low + (low >> 1) * td->scale; 120 - } 121 - 122 129 static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw) 123 130 { 124 131 struct blkcg_gq *blkg = tg_to_blkg(tg); 125 - struct throtl_data *td; 126 - uint64_t ret; 127 132 128 133 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) 129 134 return U64_MAX; 130 135 131 - td = tg->td; 132 - ret = tg->bps[rw][td->limit_index]; 133 - if (ret == 0 && td->limit_index == LIMIT_LOW) { 134 - /* intermediate node or iops isn't 0 */ 135 - if (!list_empty(&blkg->blkcg->css.children) || 136 - tg->iops[rw][td->limit_index]) 137 - return U64_MAX; 138 - else 139 - return MIN_THROTL_BPS; 140 - } 141 - 142 - if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] && 143 - tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) { 144 - uint64_t adjusted; 145 - 146 - adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td); 147 - ret = min(tg->bps[rw][LIMIT_MAX], adjusted); 148 - } 149 - return ret; 136 + return tg->bps[rw]; 150 137 } 151 138 152 139 static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) 153 140 { 154 141 struct blkcg_gq *blkg = tg_to_blkg(tg); 155 - struct throtl_data *td; 156 - unsigned int ret; 157 142 158 143 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) 159 144 return UINT_MAX; 160 145 161 - td = tg->td; 162 - ret = tg->iops[rw][td->limit_index]; 163 - if (ret == 0 && tg->td->limit_index == LIMIT_LOW) { 164 - /* intermediate node or bps isn't 0 
*/ 165 - if (!list_empty(&blkg->blkcg->css.children) || 166 - tg->bps[rw][td->limit_index]) 167 - return UINT_MAX; 168 - else 169 - return MIN_THROTL_IOPS; 170 - } 171 - 172 - if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] && 173 - tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) { 174 - uint64_t adjusted; 175 - 176 - adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td); 177 - if (adjusted > UINT_MAX) 178 - adjusted = UINT_MAX; 179 - ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted); 180 - } 181 - return ret; 146 + return tg->iops[rw]; 182 147 } 183 148 184 149 #define request_bucket_index(sectors) \ ··· 274 359 } 275 360 276 361 RB_CLEAR_NODE(&tg->rb_node); 277 - tg->bps[READ][LIMIT_MAX] = U64_MAX; 278 - tg->bps[WRITE][LIMIT_MAX] = U64_MAX; 279 - tg->iops[READ][LIMIT_MAX] = UINT_MAX; 280 - tg->iops[WRITE][LIMIT_MAX] = UINT_MAX; 281 - tg->bps_conf[READ][LIMIT_MAX] = U64_MAX; 282 - tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX; 283 - tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX; 284 - tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX; 285 - /* LIMIT_LOW will have default value 0 */ 286 - 287 - tg->latency_target = DFL_LATENCY_TARGET; 288 - tg->latency_target_conf = DFL_LATENCY_TARGET; 289 - tg->idletime_threshold = DFL_IDLE_THRESHOLD; 290 - tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD; 362 + tg->bps[READ] = U64_MAX; 363 + tg->bps[WRITE] = U64_MAX; 364 + tg->iops[READ] = UINT_MAX; 365 + tg->iops[WRITE] = UINT_MAX; 291 366 292 367 return &tg->pd; 293 368 ··· 323 418 static void tg_update_has_rules(struct throtl_grp *tg) 324 419 { 325 420 struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); 326 - struct throtl_data *td = tg->td; 327 421 int rw; 328 422 329 423 for (rw = READ; rw <= WRITE; rw++) { 330 424 tg->has_rules_iops[rw] = 331 425 (parent_tg && parent_tg->has_rules_iops[rw]) || 332 - (td->limit_valid[td->limit_index] && 333 - tg_iops_limit(tg, rw) != UINT_MAX); 426 + tg_iops_limit(tg, rw) != UINT_MAX; 334 427 
tg->has_rules_bps[rw] = 335 428 (parent_tg && parent_tg->has_rules_bps[rw]) || 336 - (td->limit_valid[td->limit_index] && 337 - (tg_bps_limit(tg, rw) != U64_MAX)); 429 + tg_bps_limit(tg, rw) != U64_MAX; 338 430 } 339 431 } 340 432 ··· 343 441 * Update has_rules[] after a new group is brought online. 344 442 */ 345 443 tg_update_has_rules(tg); 346 - } 347 - 348 - #ifdef CONFIG_BLK_DEV_THROTTLING_LOW 349 - static void blk_throtl_update_limit_valid(struct throtl_data *td) 350 - { 351 - struct cgroup_subsys_state *pos_css; 352 - struct blkcg_gq *blkg; 353 - bool low_valid = false; 354 - 355 - rcu_read_lock(); 356 - blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { 357 - struct throtl_grp *tg = blkg_to_tg(blkg); 358 - 359 - if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] || 360 - tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) { 361 - low_valid = true; 362 - break; 363 - } 364 - } 365 - rcu_read_unlock(); 366 - 367 - td->limit_valid[LIMIT_LOW] = low_valid; 368 - } 369 - #else 370 - static inline void blk_throtl_update_limit_valid(struct throtl_data *td) 371 - { 372 - } 373 - #endif 374 - 375 - static void throtl_upgrade_state(struct throtl_data *td); 376 - static void throtl_pd_offline(struct blkg_policy_data *pd) 377 - { 378 - struct throtl_grp *tg = pd_to_tg(pd); 379 - 380 - tg->bps[READ][LIMIT_LOW] = 0; 381 - tg->bps[WRITE][LIMIT_LOW] = 0; 382 - tg->iops[READ][LIMIT_LOW] = 0; 383 - tg->iops[WRITE][LIMIT_LOW] = 0; 384 - 385 - blk_throtl_update_limit_valid(tg->td); 386 - 387 - if (!tg->td->limit_valid[tg->td->limit_index]) 388 - throtl_upgrade_state(tg->td); 389 444 } 390 445 391 446 static void throtl_pd_free(struct blkg_policy_data *pd) ··· 1010 1151 return nr_disp; 1011 1152 } 1012 1153 1013 - static bool throtl_can_upgrade(struct throtl_data *td, 1014 - struct throtl_grp *this_tg); 1015 1154 /** 1016 1155 * throtl_pending_timer_fn - timer function for service_queue->pending_timer 1017 1156 * @t: the pending_timer member of the 
throtl_service_queue being serviced ··· 1045 1188 1046 1189 if (!q->root_blkg) 1047 1190 goto out_unlock; 1048 - 1049 - if (throtl_can_upgrade(td, NULL)) 1050 - throtl_upgrade_state(td); 1051 1191 1052 1192 again: 1053 1193 parent_sq = sq->parent_sq; ··· 1185 1331 blkg_for_each_descendant_pre(blkg, pos_css, 1186 1332 global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) { 1187 1333 struct throtl_grp *this_tg = blkg_to_tg(blkg); 1188 - struct throtl_grp *parent_tg; 1189 1334 1190 1335 tg_update_has_rules(this_tg); 1191 1336 /* ignore root/second level */ 1192 1337 if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent || 1193 1338 !blkg->parent->parent) 1194 1339 continue; 1195 - parent_tg = blkg_to_tg(blkg->parent); 1196 - /* 1197 - * make sure all children has lower idle time threshold and 1198 - * higher latency target 1199 - */ 1200 - this_tg->idletime_threshold = min(this_tg->idletime_threshold, 1201 - parent_tg->idletime_threshold); 1202 - this_tg->latency_target = max(this_tg->latency_target, 1203 - parent_tg->latency_target); 1204 1340 } 1205 1341 rcu_read_unlock(); 1206 1342 ··· 1211 1367 } 1212 1368 } 1213 1369 1370 + static int blk_throtl_init(struct gendisk *disk) 1371 + { 1372 + struct request_queue *q = disk->queue; 1373 + struct throtl_data *td; 1374 + int ret; 1375 + 1376 + td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); 1377 + if (!td) 1378 + return -ENOMEM; 1379 + 1380 + INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); 1381 + throtl_service_queue_init(&td->service_queue); 1382 + 1383 + /* 1384 + * Freeze queue before activating policy, to synchronize with IO path, 1385 + * which is protected by 'q_usage_counter'. 
1386 + */ 1387 + blk_mq_freeze_queue(disk->queue); 1388 + blk_mq_quiesce_queue(disk->queue); 1389 + 1390 + q->td = td; 1391 + td->queue = q; 1392 + 1393 + /* activate policy */ 1394 + ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); 1395 + if (ret) { 1396 + q->td = NULL; 1397 + kfree(td); 1398 + goto out; 1399 + } 1400 + 1401 + if (blk_queue_nonrot(q)) 1402 + td->throtl_slice = DFL_THROTL_SLICE_SSD; 1403 + else 1404 + td->throtl_slice = DFL_THROTL_SLICE_HD; 1405 + td->track_bio_latency = !queue_is_mq(q); 1406 + if (!td->track_bio_latency) 1407 + blk_stat_enable_accounting(q); 1408 + 1409 + out: 1410 + blk_mq_unquiesce_queue(disk->queue); 1411 + blk_mq_unfreeze_queue(disk->queue); 1412 + 1413 + return ret; 1414 + } 1415 + 1416 + 1214 1417 static ssize_t tg_set_conf(struct kernfs_open_file *of, 1215 1418 char *buf, size_t nbytes, loff_t off, bool is_u64) 1216 1419 { ··· 1268 1377 u64 v; 1269 1378 1270 1379 blkg_conf_init(&ctx, buf); 1380 + 1381 + ret = blkg_conf_open_bdev(&ctx); 1382 + if (ret) 1383 + goto out_finish; 1384 + 1385 + if (!blk_throtl_activated(ctx.bdev->bd_queue)) { 1386 + ret = blk_throtl_init(ctx.bdev->bd_disk); 1387 + if (ret) 1388 + goto out_finish; 1389 + } 1271 1390 1272 1391 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); 1273 1392 if (ret) ··· 1345 1444 static struct cftype throtl_legacy_files[] = { 1346 1445 { 1347 1446 .name = "throttle.read_bps_device", 1348 - .private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]), 1447 + .private = offsetof(struct throtl_grp, bps[READ]), 1349 1448 .seq_show = tg_print_conf_u64, 1350 1449 .write = tg_set_conf_u64, 1351 1450 }, 1352 1451 { 1353 1452 .name = "throttle.write_bps_device", 1354 - .private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]), 1453 + .private = offsetof(struct throtl_grp, bps[WRITE]), 1355 1454 .seq_show = tg_print_conf_u64, 1356 1455 .write = tg_set_conf_u64, 1357 1456 }, 1358 1457 { 1359 1458 .name = "throttle.read_iops_device", 1360 - .private = 
offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]), 1459 + .private = offsetof(struct throtl_grp, iops[READ]), 1361 1460 .seq_show = tg_print_conf_uint, 1362 1461 .write = tg_set_conf_uint, 1363 1462 }, 1364 1463 { 1365 1464 .name = "throttle.write_iops_device", 1366 - .private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]), 1465 + .private = offsetof(struct throtl_grp, iops[WRITE]), 1367 1466 .seq_show = tg_print_conf_uint, 1368 1467 .write = tg_set_conf_uint, 1369 1468 }, ··· 1395 1494 { 1396 1495 struct throtl_grp *tg = pd_to_tg(pd); 1397 1496 const char *dname = blkg_dev_name(pd->blkg); 1398 - char bufs[4][21] = { "max", "max", "max", "max" }; 1399 1497 u64 bps_dft; 1400 1498 unsigned int iops_dft; 1401 - char idle_time[26] = ""; 1402 - char latency_time[26] = ""; 1403 1499 1404 1500 if (!dname) 1405 1501 return 0; 1406 1502 1407 - if (off == LIMIT_LOW) { 1408 - bps_dft = 0; 1409 - iops_dft = 0; 1410 - } else { 1411 - bps_dft = U64_MAX; 1412 - iops_dft = UINT_MAX; 1413 - } 1503 + bps_dft = U64_MAX; 1504 + iops_dft = UINT_MAX; 1414 1505 1415 - if (tg->bps_conf[READ][off] == bps_dft && 1416 - tg->bps_conf[WRITE][off] == bps_dft && 1417 - tg->iops_conf[READ][off] == iops_dft && 1418 - tg->iops_conf[WRITE][off] == iops_dft && 1419 - (off != LIMIT_LOW || 1420 - (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD && 1421 - tg->latency_target_conf == DFL_LATENCY_TARGET))) 1506 + if (tg->bps_conf[READ] == bps_dft && 1507 + tg->bps_conf[WRITE] == bps_dft && 1508 + tg->iops_conf[READ] == iops_dft && 1509 + tg->iops_conf[WRITE] == iops_dft) 1422 1510 return 0; 1423 1511 1424 - if (tg->bps_conf[READ][off] != U64_MAX) 1425 - snprintf(bufs[0], sizeof(bufs[0]), "%llu", 1426 - tg->bps_conf[READ][off]); 1427 - if (tg->bps_conf[WRITE][off] != U64_MAX) 1428 - snprintf(bufs[1], sizeof(bufs[1]), "%llu", 1429 - tg->bps_conf[WRITE][off]); 1430 - if (tg->iops_conf[READ][off] != UINT_MAX) 1431 - snprintf(bufs[2], sizeof(bufs[2]), "%u", 1432 - tg->iops_conf[READ][off]); 1433 - if 
(tg->iops_conf[WRITE][off] != UINT_MAX) 1434 - snprintf(bufs[3], sizeof(bufs[3]), "%u", 1435 - tg->iops_conf[WRITE][off]); 1436 - if (off == LIMIT_LOW) { 1437 - if (tg->idletime_threshold_conf == ULONG_MAX) 1438 - strcpy(idle_time, " idle=max"); 1439 - else 1440 - snprintf(idle_time, sizeof(idle_time), " idle=%lu", 1441 - tg->idletime_threshold_conf); 1512 + seq_printf(sf, "%s", dname); 1513 + if (tg->bps_conf[READ] == U64_MAX) 1514 + seq_printf(sf, " rbps=max"); 1515 + else 1516 + seq_printf(sf, " rbps=%llu", tg->bps_conf[READ]); 1442 1517 1443 - if (tg->latency_target_conf == ULONG_MAX) 1444 - strcpy(latency_time, " latency=max"); 1445 - else 1446 - snprintf(latency_time, sizeof(latency_time), 1447 - " latency=%lu", tg->latency_target_conf); 1448 - } 1518 + if (tg->bps_conf[WRITE] == U64_MAX) 1519 + seq_printf(sf, " wbps=max"); 1520 + else 1521 + seq_printf(sf, " wbps=%llu", tg->bps_conf[WRITE]); 1449 1522 1450 - seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n", 1451 - dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time, 1452 - latency_time); 1523 + if (tg->iops_conf[READ] == UINT_MAX) 1524 + seq_printf(sf, " riops=max"); 1525 + else 1526 + seq_printf(sf, " riops=%u", tg->iops_conf[READ]); 1527 + 1528 + if (tg->iops_conf[WRITE] == UINT_MAX) 1529 + seq_printf(sf, " wiops=max"); 1530 + else 1531 + seq_printf(sf, " wiops=%u", tg->iops_conf[WRITE]); 1532 + 1533 + seq_printf(sf, "\n"); 1453 1534 return 0; 1454 1535 } 1455 1536 ··· 1449 1566 struct blkg_conf_ctx ctx; 1450 1567 struct throtl_grp *tg; 1451 1568 u64 v[4]; 1452 - unsigned long idle_time; 1453 - unsigned long latency_time; 1454 1569 int ret; 1455 - int index = of_cft(of)->private; 1456 1570 1457 1571 blkg_conf_init(&ctx, buf); 1572 + 1573 + ret = blkg_conf_open_bdev(&ctx); 1574 + if (ret) 1575 + goto out_finish; 1576 + 1577 + if (!blk_throtl_activated(ctx.bdev->bd_queue)) { 1578 + ret = blk_throtl_init(ctx.bdev->bd_disk); 1579 + if (ret) 1580 + goto out_finish; 1581 + } 1458 1582 1459 1583 ret = 
blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); 1460 1584 if (ret) ··· 1470 1580 tg = blkg_to_tg(ctx.blkg); 1471 1581 tg_update_carryover(tg); 1472 1582 1473 - v[0] = tg->bps_conf[READ][index]; 1474 - v[1] = tg->bps_conf[WRITE][index]; 1475 - v[2] = tg->iops_conf[READ][index]; 1476 - v[3] = tg->iops_conf[WRITE][index]; 1583 + v[0] = tg->bps[READ]; 1584 + v[1] = tg->bps[WRITE]; 1585 + v[2] = tg->iops[READ]; 1586 + v[3] = tg->iops[WRITE]; 1477 1587 1478 - idle_time = tg->idletime_threshold_conf; 1479 - latency_time = tg->latency_target_conf; 1480 1588 while (true) { 1481 1589 char tok[27]; /* wiops=18446744073709551616 */ 1482 1590 char *p; ··· 1506 1618 v[2] = min_t(u64, val, UINT_MAX); 1507 1619 else if (!strcmp(tok, "wiops") && val > 1) 1508 1620 v[3] = min_t(u64, val, UINT_MAX); 1509 - else if (off == LIMIT_LOW && !strcmp(tok, "idle")) 1510 - idle_time = val; 1511 - else if (off == LIMIT_LOW && !strcmp(tok, "latency")) 1512 - latency_time = val; 1513 1621 else 1514 1622 goto out_finish; 1515 1623 } 1516 1624 1517 - tg->bps_conf[READ][index] = v[0]; 1518 - tg->bps_conf[WRITE][index] = v[1]; 1519 - tg->iops_conf[READ][index] = v[2]; 1520 - tg->iops_conf[WRITE][index] = v[3]; 1625 + tg->bps[READ] = v[0]; 1626 + tg->bps[WRITE] = v[1]; 1627 + tg->iops[READ] = v[2]; 1628 + tg->iops[WRITE] = v[3]; 1521 1629 1522 - if (index == LIMIT_MAX) { 1523 - tg->bps[READ][index] = v[0]; 1524 - tg->bps[WRITE][index] = v[1]; 1525 - tg->iops[READ][index] = v[2]; 1526 - tg->iops[WRITE][index] = v[3]; 1527 - } 1528 - tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW], 1529 - tg->bps_conf[READ][LIMIT_MAX]); 1530 - tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW], 1531 - tg->bps_conf[WRITE][LIMIT_MAX]); 1532 - tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW], 1533 - tg->iops_conf[READ][LIMIT_MAX]); 1534 - tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW], 1535 - tg->iops_conf[WRITE][LIMIT_MAX]); 1536 - tg->idletime_threshold_conf = 
idle_time; 1537 - tg->latency_target_conf = latency_time; 1538 - 1539 - /* force user to configure all settings for low limit */ 1540 - if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] || 1541 - tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) || 1542 - tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD || 1543 - tg->latency_target_conf == DFL_LATENCY_TARGET) { 1544 - tg->bps[READ][LIMIT_LOW] = 0; 1545 - tg->bps[WRITE][LIMIT_LOW] = 0; 1546 - tg->iops[READ][LIMIT_LOW] = 0; 1547 - tg->iops[WRITE][LIMIT_LOW] = 0; 1548 - tg->idletime_threshold = DFL_IDLE_THRESHOLD; 1549 - tg->latency_target = DFL_LATENCY_TARGET; 1550 - } else if (index == LIMIT_LOW) { 1551 - tg->idletime_threshold = tg->idletime_threshold_conf; 1552 - tg->latency_target = tg->latency_target_conf; 1553 - } 1554 - 1555 - blk_throtl_update_limit_valid(tg->td); 1556 - if (tg->td->limit_valid[LIMIT_LOW]) { 1557 - if (index == LIMIT_LOW) 1558 - tg->td->limit_index = LIMIT_LOW; 1559 - } else 1560 - tg->td->limit_index = LIMIT_MAX; 1561 - tg_conf_updated(tg, index == LIMIT_LOW && 1562 - tg->td->limit_valid[LIMIT_LOW]); 1630 + tg_conf_updated(tg, false); 1563 1631 ret = 0; 1564 1632 out_finish: 1565 1633 blkg_conf_exit(&ctx); ··· 1523 1679 } 1524 1680 1525 1681 static struct cftype throtl_files[] = { 1526 - #ifdef CONFIG_BLK_DEV_THROTTLING_LOW 1527 - { 1528 - .name = "low", 1529 - .flags = CFTYPE_NOT_ON_ROOT, 1530 - .seq_show = tg_print_limit, 1531 - .write = tg_set_limit, 1532 - .private = LIMIT_LOW, 1533 - }, 1534 - #endif 1535 1682 { 1536 1683 .name = "max", 1537 1684 .flags = CFTYPE_NOT_ON_ROOT, 1538 1685 .seq_show = tg_print_limit, 1539 1686 .write = tg_set_limit, 1540 - .private = LIMIT_MAX, 1541 1687 }, 1542 1688 { } /* terminate */ 1543 1689 }; ··· 1546 1712 .pd_alloc_fn = throtl_pd_alloc, 1547 1713 .pd_init_fn = throtl_pd_init, 1548 1714 .pd_online_fn = throtl_pd_online, 1549 - .pd_offline_fn = throtl_pd_offline, 1550 1715 .pd_free_fn = throtl_pd_free, 1551 1716 }; 1552 1717 ··· 1554 
1721 struct request_queue *q = disk->queue; 1555 1722 struct cgroup_subsys_state *pos_css; 1556 1723 struct blkcg_gq *blkg; 1724 + 1725 + if (!blk_throtl_activated(q)) 1726 + return; 1557 1727 1558 1728 spin_lock_irq(&q->queue_lock); 1559 1729 /* ··· 1597 1761 spin_unlock_irq(&q->queue_lock); 1598 1762 } 1599 1763 1600 - #ifdef CONFIG_BLK_DEV_THROTTLING_LOW 1601 - static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg) 1602 - { 1603 - unsigned long rtime = jiffies, wtime = jiffies; 1604 - 1605 - if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW]) 1606 - rtime = tg->last_low_overflow_time[READ]; 1607 - if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) 1608 - wtime = tg->last_low_overflow_time[WRITE]; 1609 - return min(rtime, wtime); 1610 - } 1611 - 1612 - static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg) 1613 - { 1614 - struct throtl_service_queue *parent_sq; 1615 - struct throtl_grp *parent = tg; 1616 - unsigned long ret = __tg_last_low_overflow_time(tg); 1617 - 1618 - while (true) { 1619 - parent_sq = parent->service_queue.parent_sq; 1620 - parent = sq_to_tg(parent_sq); 1621 - if (!parent) 1622 - break; 1623 - 1624 - /* 1625 - * The parent doesn't have low limit, it always reaches low 1626 - * limit. 
Its overflow time is useless for children 1627 - */ 1628 - if (!parent->bps[READ][LIMIT_LOW] && 1629 - !parent->iops[READ][LIMIT_LOW] && 1630 - !parent->bps[WRITE][LIMIT_LOW] && 1631 - !parent->iops[WRITE][LIMIT_LOW]) 1632 - continue; 1633 - if (time_after(__tg_last_low_overflow_time(parent), ret)) 1634 - ret = __tg_last_low_overflow_time(parent); 1635 - } 1636 - return ret; 1637 - } 1638 - 1639 - static bool throtl_tg_is_idle(struct throtl_grp *tg) 1640 - { 1641 - /* 1642 - * cgroup is idle if: 1643 - * - single idle is too long, longer than a fixed value (in case user 1644 - * configure a too big threshold) or 4 times of idletime threshold 1645 - * - average think time is more than threshold 1646 - * - IO latency is largely below threshold 1647 - */ 1648 - unsigned long time; 1649 - bool ret; 1650 - 1651 - time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold); 1652 - ret = tg->latency_target == DFL_LATENCY_TARGET || 1653 - tg->idletime_threshold == DFL_IDLE_THRESHOLD || 1654 - (blk_time_get_ns() >> 10) - tg->last_finish_time > time || 1655 - tg->avg_idletime > tg->idletime_threshold || 1656 - (tg->latency_target && tg->bio_cnt && 1657 - tg->bad_bio_cnt * 5 < tg->bio_cnt); 1658 - throtl_log(&tg->service_queue, 1659 - "avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d", 1660 - tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt, 1661 - tg->bio_cnt, ret, tg->td->scale); 1662 - return ret; 1663 - } 1664 - 1665 - static bool throtl_low_limit_reached(struct throtl_grp *tg, int rw) 1666 - { 1667 - struct throtl_service_queue *sq = &tg->service_queue; 1668 - bool limit = tg->bps[rw][LIMIT_LOW] || tg->iops[rw][LIMIT_LOW]; 1669 - 1670 - /* 1671 - * if low limit is zero, low limit is always reached. 1672 - * if low limit is non-zero, we can check if there is any request 1673 - * is queued to determine if low limit is reached as we throttle 1674 - * request according to limit. 
1675 - */ 1676 - return !limit || sq->nr_queued[rw]; 1677 - } 1678 - 1679 - static bool throtl_tg_can_upgrade(struct throtl_grp *tg) 1680 - { 1681 - /* 1682 - * cgroup reaches low limit when low limit of READ and WRITE are 1683 - * both reached, it's ok to upgrade to next limit if cgroup reaches 1684 - * low limit 1685 - */ 1686 - if (throtl_low_limit_reached(tg, READ) && 1687 - throtl_low_limit_reached(tg, WRITE)) 1688 - return true; 1689 - 1690 - if (time_after_eq(jiffies, 1691 - tg_last_low_overflow_time(tg) + tg->td->throtl_slice) && 1692 - throtl_tg_is_idle(tg)) 1693 - return true; 1694 - return false; 1695 - } 1696 - 1697 - static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg) 1698 - { 1699 - while (true) { 1700 - if (throtl_tg_can_upgrade(tg)) 1701 - return true; 1702 - tg = sq_to_tg(tg->service_queue.parent_sq); 1703 - if (!tg || !tg_to_blkg(tg)->parent) 1704 - return false; 1705 - } 1706 - return false; 1707 - } 1708 - 1709 - static bool throtl_can_upgrade(struct throtl_data *td, 1710 - struct throtl_grp *this_tg) 1711 - { 1712 - struct cgroup_subsys_state *pos_css; 1713 - struct blkcg_gq *blkg; 1714 - 1715 - if (td->limit_index != LIMIT_LOW) 1716 - return false; 1717 - 1718 - if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice)) 1719 - return false; 1720 - 1721 - rcu_read_lock(); 1722 - blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { 1723 - struct throtl_grp *tg = blkg_to_tg(blkg); 1724 - 1725 - if (tg == this_tg) 1726 - continue; 1727 - if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children)) 1728 - continue; 1729 - if (!throtl_hierarchy_can_upgrade(tg)) { 1730 - rcu_read_unlock(); 1731 - return false; 1732 - } 1733 - } 1734 - rcu_read_unlock(); 1735 - return true; 1736 - } 1737 - 1738 - static void throtl_upgrade_check(struct throtl_grp *tg) 1739 - { 1740 - unsigned long now = jiffies; 1741 - 1742 - if (tg->td->limit_index != LIMIT_LOW) 1743 - return; 1744 - 1745 - if (time_after(tg->last_check_time + 
tg->td->throtl_slice, now)) 1746 - return; 1747 - 1748 - tg->last_check_time = now; 1749 - 1750 - if (!time_after_eq(now, 1751 - __tg_last_low_overflow_time(tg) + tg->td->throtl_slice)) 1752 - return; 1753 - 1754 - if (throtl_can_upgrade(tg->td, NULL)) 1755 - throtl_upgrade_state(tg->td); 1756 - } 1757 - 1758 - static void throtl_upgrade_state(struct throtl_data *td) 1759 - { 1760 - struct cgroup_subsys_state *pos_css; 1761 - struct blkcg_gq *blkg; 1762 - 1763 - throtl_log(&td->service_queue, "upgrade to max"); 1764 - td->limit_index = LIMIT_MAX; 1765 - td->low_upgrade_time = jiffies; 1766 - td->scale = 0; 1767 - rcu_read_lock(); 1768 - blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { 1769 - struct throtl_grp *tg = blkg_to_tg(blkg); 1770 - struct throtl_service_queue *sq = &tg->service_queue; 1771 - 1772 - tg->disptime = jiffies - 1; 1773 - throtl_select_dispatch(sq); 1774 - throtl_schedule_next_dispatch(sq, true); 1775 - } 1776 - rcu_read_unlock(); 1777 - throtl_select_dispatch(&td->service_queue); 1778 - throtl_schedule_next_dispatch(&td->service_queue, true); 1779 - queue_work(kthrotld_workqueue, &td->dispatch_work); 1780 - } 1781 - 1782 - static void throtl_downgrade_state(struct throtl_data *td) 1783 - { 1784 - td->scale /= 2; 1785 - 1786 - throtl_log(&td->service_queue, "downgrade, scale %d", td->scale); 1787 - if (td->scale) { 1788 - td->low_upgrade_time = jiffies - td->scale * td->throtl_slice; 1789 - return; 1790 - } 1791 - 1792 - td->limit_index = LIMIT_LOW; 1793 - td->low_downgrade_time = jiffies; 1794 - } 1795 - 1796 - static bool throtl_tg_can_downgrade(struct throtl_grp *tg) 1797 - { 1798 - struct throtl_data *td = tg->td; 1799 - unsigned long now = jiffies; 1800 - 1801 - /* 1802 - * If cgroup is below low limit, consider downgrade and throttle other 1803 - * cgroups 1804 - */ 1805 - if (time_after_eq(now, tg_last_low_overflow_time(tg) + 1806 - td->throtl_slice) && 1807 - (!throtl_tg_is_idle(tg) || 1808 - 
!list_empty(&tg_to_blkg(tg)->blkcg->css.children))) 1809 - return true; 1810 - return false; 1811 - } 1812 - 1813 - static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg) 1814 - { 1815 - struct throtl_data *td = tg->td; 1816 - 1817 - if (time_before(jiffies, td->low_upgrade_time + td->throtl_slice)) 1818 - return false; 1819 - 1820 - while (true) { 1821 - if (!throtl_tg_can_downgrade(tg)) 1822 - return false; 1823 - tg = sq_to_tg(tg->service_queue.parent_sq); 1824 - if (!tg || !tg_to_blkg(tg)->parent) 1825 - break; 1826 - } 1827 - return true; 1828 - } 1829 - 1830 - static void throtl_downgrade_check(struct throtl_grp *tg) 1831 - { 1832 - uint64_t bps; 1833 - unsigned int iops; 1834 - unsigned long elapsed_time; 1835 - unsigned long now = jiffies; 1836 - 1837 - if (tg->td->limit_index != LIMIT_MAX || 1838 - !tg->td->limit_valid[LIMIT_LOW]) 1839 - return; 1840 - if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children)) 1841 - return; 1842 - if (time_after(tg->last_check_time + tg->td->throtl_slice, now)) 1843 - return; 1844 - 1845 - elapsed_time = now - tg->last_check_time; 1846 - tg->last_check_time = now; 1847 - 1848 - if (time_before(now, tg_last_low_overflow_time(tg) + 1849 - tg->td->throtl_slice)) 1850 - return; 1851 - 1852 - if (tg->bps[READ][LIMIT_LOW]) { 1853 - bps = tg->last_bytes_disp[READ] * HZ; 1854 - do_div(bps, elapsed_time); 1855 - if (bps >= tg->bps[READ][LIMIT_LOW]) 1856 - tg->last_low_overflow_time[READ] = now; 1857 - } 1858 - 1859 - if (tg->bps[WRITE][LIMIT_LOW]) { 1860 - bps = tg->last_bytes_disp[WRITE] * HZ; 1861 - do_div(bps, elapsed_time); 1862 - if (bps >= tg->bps[WRITE][LIMIT_LOW]) 1863 - tg->last_low_overflow_time[WRITE] = now; 1864 - } 1865 - 1866 - if (tg->iops[READ][LIMIT_LOW]) { 1867 - iops = tg->last_io_disp[READ] * HZ / elapsed_time; 1868 - if (iops >= tg->iops[READ][LIMIT_LOW]) 1869 - tg->last_low_overflow_time[READ] = now; 1870 - } 1871 - 1872 - if (tg->iops[WRITE][LIMIT_LOW]) { 1873 - iops = tg->last_io_disp[WRITE] * HZ / 
elapsed_time; 1874 - if (iops >= tg->iops[WRITE][LIMIT_LOW]) 1875 - tg->last_low_overflow_time[WRITE] = now; 1876 - } 1877 - 1878 - /* 1879 - * If cgroup is below low limit, consider downgrade and throttle other 1880 - * cgroups 1881 - */ 1882 - if (throtl_hierarchy_can_downgrade(tg)) 1883 - throtl_downgrade_state(tg->td); 1884 - 1885 - tg->last_bytes_disp[READ] = 0; 1886 - tg->last_bytes_disp[WRITE] = 0; 1887 - tg->last_io_disp[READ] = 0; 1888 - tg->last_io_disp[WRITE] = 0; 1889 - } 1890 - 1891 - static void blk_throtl_update_idletime(struct throtl_grp *tg) 1892 - { 1893 - unsigned long now; 1894 - unsigned long last_finish_time = tg->last_finish_time; 1895 - 1896 - if (last_finish_time == 0) 1897 - return; 1898 - 1899 - now = blk_time_get_ns() >> 10; 1900 - if (now <= last_finish_time || 1901 - last_finish_time == tg->checked_last_finish_time) 1902 - return; 1903 - 1904 - tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3; 1905 - tg->checked_last_finish_time = last_finish_time; 1906 - } 1907 - 1908 - static void throtl_update_latency_buckets(struct throtl_data *td) 1909 - { 1910 - struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE]; 1911 - int i, cpu, rw; 1912 - unsigned long last_latency[2] = { 0 }; 1913 - unsigned long latency[2]; 1914 - 1915 - if (!blk_queue_nonrot(td->queue) || !td->limit_valid[LIMIT_LOW]) 1916 - return; 1917 - if (time_before(jiffies, td->last_calculate_time + HZ)) 1918 - return; 1919 - td->last_calculate_time = jiffies; 1920 - 1921 - memset(avg_latency, 0, sizeof(avg_latency)); 1922 - for (rw = READ; rw <= WRITE; rw++) { 1923 - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { 1924 - struct latency_bucket *tmp = &td->tmp_buckets[rw][i]; 1925 - 1926 - for_each_possible_cpu(cpu) { 1927 - struct latency_bucket *bucket; 1928 - 1929 - /* this isn't race free, but ok in practice */ 1930 - bucket = per_cpu_ptr(td->latency_buckets[rw], 1931 - cpu); 1932 - tmp->total_latency += bucket[i].total_latency; 1933 - tmp->samples += 
bucket[i].samples; 1934 - bucket[i].total_latency = 0; 1935 - bucket[i].samples = 0; 1936 - } 1937 - 1938 - if (tmp->samples >= 32) { 1939 - int samples = tmp->samples; 1940 - 1941 - latency[rw] = tmp->total_latency; 1942 - 1943 - tmp->total_latency = 0; 1944 - tmp->samples = 0; 1945 - latency[rw] /= samples; 1946 - if (latency[rw] == 0) 1947 - continue; 1948 - avg_latency[rw][i].latency = latency[rw]; 1949 - } 1950 - } 1951 - } 1952 - 1953 - for (rw = READ; rw <= WRITE; rw++) { 1954 - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { 1955 - if (!avg_latency[rw][i].latency) { 1956 - if (td->avg_buckets[rw][i].latency < last_latency[rw]) 1957 - td->avg_buckets[rw][i].latency = 1958 - last_latency[rw]; 1959 - continue; 1960 - } 1961 - 1962 - if (!td->avg_buckets[rw][i].valid) 1963 - latency[rw] = avg_latency[rw][i].latency; 1964 - else 1965 - latency[rw] = (td->avg_buckets[rw][i].latency * 7 + 1966 - avg_latency[rw][i].latency) >> 3; 1967 - 1968 - td->avg_buckets[rw][i].latency = max(latency[rw], 1969 - last_latency[rw]); 1970 - td->avg_buckets[rw][i].valid = true; 1971 - last_latency[rw] = td->avg_buckets[rw][i].latency; 1972 - } 1973 - } 1974 - 1975 - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) 1976 - throtl_log(&td->service_queue, 1977 - "Latency bucket %d: read latency=%ld, read valid=%d, " 1978 - "write latency=%ld, write valid=%d", i, 1979 - td->avg_buckets[READ][i].latency, 1980 - td->avg_buckets[READ][i].valid, 1981 - td->avg_buckets[WRITE][i].latency, 1982 - td->avg_buckets[WRITE][i].valid); 1983 - } 1984 - #else 1985 - static inline void throtl_update_latency_buckets(struct throtl_data *td) 1986 - { 1987 - } 1988 - 1989 - static void blk_throtl_update_idletime(struct throtl_grp *tg) 1990 - { 1991 - } 1992 - 1993 - static void throtl_downgrade_check(struct throtl_grp *tg) 1994 - { 1995 - } 1996 - 1997 - static void throtl_upgrade_check(struct throtl_grp *tg) 1998 - { 1999 - } 2000 - 2001 - static bool throtl_can_upgrade(struct throtl_data *td, 2002 - struct 
throtl_grp *this_tg) 2003 - { 2004 - return false; 2005 - } 2006 - 2007 - static void throtl_upgrade_state(struct throtl_data *td) 2008 - { 2009 - } 2010 - #endif 2011 - 2012 1764 bool __blk_throtl_bio(struct bio *bio) 2013 1765 { 2014 1766 struct request_queue *q = bdev_get_queue(bio->bi_bdev); ··· 1609 2185 struct throtl_data *td = tg->td; 1610 2186 1611 2187 rcu_read_lock(); 1612 - 1613 2188 spin_lock_irq(&q->queue_lock); 1614 - 1615 - throtl_update_latency_buckets(td); 1616 - 1617 - blk_throtl_update_idletime(tg); 1618 - 1619 2189 sq = &tg->service_queue; 1620 2190 1621 - again: 1622 2191 while (true) { 1623 2192 if (tg->last_low_overflow_time[rw] == 0) 1624 2193 tg->last_low_overflow_time[rw] = jiffies; 1625 - throtl_downgrade_check(tg); 1626 - throtl_upgrade_check(tg); 1627 2194 /* throtl is FIFO - if bios are already queued, should queue */ 1628 2195 if (sq->nr_queued[rw]) 1629 2196 break; ··· 1622 2207 /* if above limits, break to queue */ 1623 2208 if (!tg_may_dispatch(tg, bio, NULL)) { 1624 2209 tg->last_low_overflow_time[rw] = jiffies; 1625 - if (throtl_can_upgrade(td, tg)) { 1626 - throtl_upgrade_state(td); 1627 - goto again; 1628 - } 1629 2210 break; 1630 2211 } 1631 2212 ··· 1681 2270 } 1682 2271 1683 2272 out_unlock: 1684 - #ifdef CONFIG_BLK_DEV_THROTTLING_LOW 1685 - if (throttled || !td->track_bio_latency) 1686 - bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY; 1687 - #endif 1688 2273 spin_unlock_irq(&q->queue_lock); 1689 2274 1690 2275 rcu_read_unlock(); 1691 2276 return throttled; 1692 2277 } 1693 2278 1694 - #ifdef CONFIG_BLK_DEV_THROTTLING_LOW 1695 - static void throtl_track_latency(struct throtl_data *td, sector_t size, 1696 - enum req_op op, unsigned long time) 1697 - { 1698 - const bool rw = op_is_write(op); 1699 - struct latency_bucket *latency; 1700 - int index; 1701 - 1702 - if (!td || td->limit_index != LIMIT_LOW || 1703 - !(op == REQ_OP_READ || op == REQ_OP_WRITE) || 1704 - !blk_queue_nonrot(td->queue)) 1705 - return; 1706 - 1707 - 
index = request_bucket_index(size); 1708 - 1709 - latency = get_cpu_ptr(td->latency_buckets[rw]); 1710 - latency[index].total_latency += time; 1711 - latency[index].samples++; 1712 - put_cpu_ptr(td->latency_buckets[rw]); 1713 - } 1714 - 1715 - void blk_throtl_stat_add(struct request *rq, u64 time_ns) 1716 - { 1717 - struct request_queue *q = rq->q; 1718 - struct throtl_data *td = q->td; 1719 - 1720 - throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq), 1721 - time_ns >> 10); 1722 - } 1723 - 1724 - void blk_throtl_bio_endio(struct bio *bio) 1725 - { 1726 - struct blkcg_gq *blkg; 1727 - struct throtl_grp *tg; 1728 - u64 finish_time_ns; 1729 - unsigned long finish_time; 1730 - unsigned long start_time; 1731 - unsigned long lat; 1732 - int rw = bio_data_dir(bio); 1733 - 1734 - blkg = bio->bi_blkg; 1735 - if (!blkg) 1736 - return; 1737 - tg = blkg_to_tg(blkg); 1738 - if (!tg->td->limit_valid[LIMIT_LOW]) 1739 - return; 1740 - 1741 - finish_time_ns = blk_time_get_ns(); 1742 - tg->last_finish_time = finish_time_ns >> 10; 1743 - 1744 - start_time = bio_issue_time(&bio->bi_issue) >> 10; 1745 - finish_time = __bio_issue_time(finish_time_ns) >> 10; 1746 - if (!start_time || finish_time <= start_time) 1747 - return; 1748 - 1749 - lat = finish_time - start_time; 1750 - /* this is only for bio based driver */ 1751 - if (!(bio->bi_issue.value & BIO_ISSUE_THROTL_SKIP_LATENCY)) 1752 - throtl_track_latency(tg->td, bio_issue_size(&bio->bi_issue), 1753 - bio_op(bio), lat); 1754 - 1755 - if (tg->latency_target && lat >= tg->td->filtered_latency) { 1756 - int bucket; 1757 - unsigned int threshold; 1758 - 1759 - bucket = request_bucket_index(bio_issue_size(&bio->bi_issue)); 1760 - threshold = tg->td->avg_buckets[rw][bucket].latency + 1761 - tg->latency_target; 1762 - if (lat > threshold) 1763 - tg->bad_bio_cnt++; 1764 - /* 1765 - * Not race free, could get wrong count, which means cgroups 1766 - * will be throttled 1767 - */ 1768 - tg->bio_cnt++; 1769 - } 1770 - 1771 - if 
(time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) { 1772 - tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies; 1773 - tg->bio_cnt /= 2; 1774 - tg->bad_bio_cnt /= 2; 1775 - } 1776 - } 1777 - #endif 1778 - 1779 - int blk_throtl_init(struct gendisk *disk) 1780 - { 1781 - struct request_queue *q = disk->queue; 1782 - struct throtl_data *td; 1783 - int ret; 1784 - 1785 - td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); 1786 - if (!td) 1787 - return -ENOMEM; 1788 - td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) * 1789 - LATENCY_BUCKET_SIZE, __alignof__(u64)); 1790 - if (!td->latency_buckets[READ]) { 1791 - kfree(td); 1792 - return -ENOMEM; 1793 - } 1794 - td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) * 1795 - LATENCY_BUCKET_SIZE, __alignof__(u64)); 1796 - if (!td->latency_buckets[WRITE]) { 1797 - free_percpu(td->latency_buckets[READ]); 1798 - kfree(td); 1799 - return -ENOMEM; 1800 - } 1801 - 1802 - INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); 1803 - throtl_service_queue_init(&td->service_queue); 1804 - 1805 - q->td = td; 1806 - td->queue = q; 1807 - 1808 - td->limit_valid[LIMIT_MAX] = true; 1809 - td->limit_index = LIMIT_MAX; 1810 - td->low_upgrade_time = jiffies; 1811 - td->low_downgrade_time = jiffies; 1812 - 1813 - /* activate policy */ 1814 - ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); 1815 - if (ret) { 1816 - free_percpu(td->latency_buckets[READ]); 1817 - free_percpu(td->latency_buckets[WRITE]); 1818 - kfree(td); 1819 - } 1820 - return ret; 1821 - } 1822 - 1823 2279 void blk_throtl_exit(struct gendisk *disk) 1824 2280 { 1825 2281 struct request_queue *q = disk->queue; 1826 2282 1827 - BUG_ON(!q->td); 2283 + if (!blk_throtl_activated(q)) 2284 + return; 2285 + 1828 2286 del_timer_sync(&q->td->service_queue.pending_timer); 1829 2287 throtl_shutdown_wq(q); 1830 2288 blkcg_deactivate_policy(disk, &blkcg_policy_throtl); 1831 - 
free_percpu(q->td->latency_buckets[READ]); 1832 - free_percpu(q->td->latency_buckets[WRITE]); 1833 2289 kfree(q->td); 1834 2290 } 1835 - 1836 - void blk_throtl_register(struct gendisk *disk) 1837 - { 1838 - struct request_queue *q = disk->queue; 1839 - struct throtl_data *td; 1840 - int i; 1841 - 1842 - td = q->td; 1843 - BUG_ON(!td); 1844 - 1845 - if (blk_queue_nonrot(q)) { 1846 - td->throtl_slice = DFL_THROTL_SLICE_SSD; 1847 - td->filtered_latency = LATENCY_FILTERED_SSD; 1848 - } else { 1849 - td->throtl_slice = DFL_THROTL_SLICE_HD; 1850 - td->filtered_latency = LATENCY_FILTERED_HD; 1851 - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { 1852 - td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY; 1853 - td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY; 1854 - } 1855 - } 1856 - #ifndef CONFIG_BLK_DEV_THROTTLING_LOW 1857 - /* if no low limit, use previous default */ 1858 - td->throtl_slice = DFL_THROTL_SLICE_HD; 1859 - 1860 - #else 1861 - td->track_bio_latency = !queue_is_mq(q); 1862 - if (!td->track_bio_latency) 1863 - blk_stat_enable_accounting(q); 1864 - #endif 1865 - } 1866 - 1867 - #ifdef CONFIG_BLK_DEV_THROTTLING_LOW 1868 - ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page) 1869 - { 1870 - if (!q->td) 1871 - return -EINVAL; 1872 - return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice)); 1873 - } 1874 - 1875 - ssize_t blk_throtl_sample_time_store(struct request_queue *q, 1876 - const char *page, size_t count) 1877 - { 1878 - unsigned long v; 1879 - unsigned long t; 1880 - 1881 - if (!q->td) 1882 - return -EINVAL; 1883 - if (kstrtoul(page, 10, &v)) 1884 - return -EINVAL; 1885 - t = msecs_to_jiffies(v); 1886 - if (t == 0 || t > MAX_THROTL_SLICE) 1887 - return -EINVAL; 1888 - q->td->throtl_slice = t; 1889 - return count; 1890 - } 1891 - #endif 1892 2291 1893 2292 static int __init throtl_init(void) 1894 2293 {

+19 -27

block/blk-throttle.h

··· 58 58 THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */ 59 59 }; 60 60 61 - enum { 62 - LIMIT_LOW, 63 - LIMIT_MAX, 64 - LIMIT_CNT, 65 - }; 66 - 67 61 struct throtl_grp { 68 62 /* must be the first member */ 69 63 struct blkg_policy_data pd; ··· 96 102 bool has_rules_iops[2]; 97 103 98 104 /* internally used bytes per second rate limits */ 99 - uint64_t bps[2][LIMIT_CNT]; 105 + uint64_t bps[2]; 100 106 /* user configured bps limits */ 101 - uint64_t bps_conf[2][LIMIT_CNT]; 107 + uint64_t bps_conf[2]; 102 108 103 109 /* internally used IOPS limits */ 104 - unsigned int iops[2][LIMIT_CNT]; 110 + unsigned int iops[2]; 105 111 /* user configured IOPS limits */ 106 - unsigned int iops_conf[2][LIMIT_CNT]; 112 + unsigned int iops_conf[2]; 107 113 108 114 /* Number of bytes dispatched in current slice */ 109 115 uint64_t bytes_disp[2]; ··· 126 132 127 133 unsigned long last_check_time; 128 134 129 - unsigned long latency_target; /* us */ 130 - unsigned long latency_target_conf; /* us */ 131 135 /* When did we start a new slice */ 132 136 unsigned long slice_start[2]; 133 137 unsigned long slice_end[2]; 134 - 135 - unsigned long last_finish_time; /* ns / 1024 */ 136 - unsigned long checked_last_finish_time; /* ns / 1024 */ 137 - unsigned long avg_idletime; /* ns / 1024 */ 138 - unsigned long idletime_threshold; /* us */ 139 - unsigned long idletime_threshold_conf; /* us */ 140 - 141 - unsigned int bio_cnt; /* total bios */ 142 - unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ 143 - unsigned long bio_cnt_reset_time; 144 138 145 139 struct blkg_rwstat stat_bytes; 146 140 struct blkg_rwstat stat_ios; ··· 150 168 * Internal throttling interface 151 169 */ 152 170 #ifndef CONFIG_BLK_DEV_THROTTLING 153 - static inline int blk_throtl_init(struct gendisk *disk) { return 0; } 154 171 static inline void blk_throtl_exit(struct gendisk *disk) { } 155 - static inline void blk_throtl_register(struct gendisk *disk) { } 156 172 static inline bool 
blk_throtl_bio(struct bio *bio) { return false; } 157 173 static inline void blk_throtl_cancel_bios(struct gendisk *disk) { } 158 174 #else /* CONFIG_BLK_DEV_THROTTLING */ 159 - int blk_throtl_init(struct gendisk *disk); 160 175 void blk_throtl_exit(struct gendisk *disk); 161 - void blk_throtl_register(struct gendisk *disk); 162 176 bool __blk_throtl_bio(struct bio *bio); 163 177 void blk_throtl_cancel_bios(struct gendisk *disk); 164 178 179 + static inline bool blk_throtl_activated(struct request_queue *q) 180 + { 181 + return q->td != NULL; 182 + } 183 + 165 184 static inline bool blk_should_throtl(struct bio *bio) 166 185 { 167 - struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); 186 + struct throtl_grp *tg; 168 187 int rw = bio_data_dir(bio); 169 188 189 + /* 190 + * This is called under bio_queue_enter(), and it's synchronized with 191 + * the activation of blk-throtl, which is protected by 192 + * blk_mq_freeze_queue(). 193 + */ 194 + if (!blk_throtl_activated(bio->bi_bdev->bd_queue)) 195 + return false; 196 + 197 + tg = blkg_to_tg(bio->bi_blkg); 170 198 if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { 171 199 if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { 172 200 bio_set_flag(bio, BIO_CGROUP_ACCT);

+1410 -98

block/blk-zoned.c

··· 7 7 * 8 8 * Copyright (c) 2016, Damien Le Moal 9 9 * Copyright (c) 2016, Western Digital 10 + * Copyright (c) 2024, Western Digital Corporation or its affiliates. 10 11 */ 11 12 12 13 #include <linux/kernel.h> ··· 17 16 #include <linux/mm.h> 18 17 #include <linux/vmalloc.h> 19 18 #include <linux/sched/mm.h> 19 + #include <linux/spinlock.h> 20 + #include <linux/atomic.h> 21 + #include <linux/mempool.h> 20 22 21 23 #include "blk.h" 24 + #include "blk-mq-sched.h" 25 + #include "blk-mq-debugfs.h" 22 26 23 27 #define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name 24 28 static const char *const zone_cond_name[] = { ··· 37 31 ZONE_COND_NAME(OFFLINE), 38 32 }; 39 33 #undef ZONE_COND_NAME 34 + 35 + /* 36 + * Per-zone write plug. 37 + * @node: hlist_node structure for managing the plug using a hash table. 38 + * @link: To list the plug in the zone write plug error list of the disk. 39 + * @ref: Zone write plug reference counter. A zone write plug reference is 40 + * always at least 1 when the plug is hashed in the disk plug hash table. 41 + * The reference is incremented whenever a new BIO needing plugging is 42 + * submitted and when a function needs to manipulate a plug. The 43 + * reference count is decremented whenever a plugged BIO completes and 44 + * when a function that referenced the plug returns. The initial 45 + * reference is dropped whenever the zone of the zone write plug is reset, 46 + * finished and when the zone becomes full (last write BIO to the zone 47 + * completes). 48 + * @lock: Spinlock to atomically manipulate the plug. 49 + * @flags: Flags indicating the plug state. 50 + * @zone_no: The number of the zone the plug is managing. 51 + * @wp_offset: The zone write pointer location relative to the start of the zone 52 + * as a number of 512B sectors. 53 + * @bio_list: The list of BIOs that are currently plugged. 
54 + * @bio_work: Work struct to handle issuing of plugged BIOs 55 + * @rcu_head: RCU head to free zone write plugs with an RCU grace period. 56 + * @disk: The gendisk the plug belongs to. 57 + */ 58 + struct blk_zone_wplug { 59 + struct hlist_node node; 60 + struct list_head link; 61 + atomic_t ref; 62 + spinlock_t lock; 63 + unsigned int flags; 64 + unsigned int zone_no; 65 + unsigned int wp_offset; 66 + struct bio_list bio_list; 67 + struct work_struct bio_work; 68 + struct rcu_head rcu_head; 69 + struct gendisk *disk; 70 + }; 71 + 72 + /* 73 + * Zone write plug flags bits: 74 + * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged, 75 + * that is, that write BIOs are being throttled due to a write BIO already 76 + * being executed or the zone write plug bio list is not empty. 77 + * - BLK_ZONE_WPLUG_ERROR: Indicates that a write error happened which will be 78 + * recovered with a report zone to update the zone write pointer offset. 79 + * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed 80 + * from the disk hash table and that the initial reference to the zone 81 + * write plug set when the plug was first added to the hash table has been 82 + * dropped. This flag is set when a zone is reset, finished or becomes full, 83 + * to prevent new references to the zone write plug from being taken for 84 + * newly incoming BIOs. A zone write plug flagged with this flag will be 85 + * freed once all remaining references from BIOs or functions are dropped. 86 + */ 87 + #define BLK_ZONE_WPLUG_PLUGGED (1U << 0) 88 + #define BLK_ZONE_WPLUG_ERROR (1U << 1) 89 + #define BLK_ZONE_WPLUG_UNHASHED (1U << 2) 90 + 91 + #define BLK_ZONE_WPLUG_BUSY (BLK_ZONE_WPLUG_PLUGGED | BLK_ZONE_WPLUG_ERROR) 40 92 41 93 /** 42 94 * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX. 
··· 114 50 return zone_cond_str; 115 51 } 116 52 EXPORT_SYMBOL_GPL(blk_zone_cond_str); 117 - 118 - /* 119 - * Return true if a request is a write requests that needs zone write locking. 120 - */ 121 - bool blk_req_needs_zone_write_lock(struct request *rq) 122 - { 123 - if (!rq->q->disk->seq_zones_wlock) 124 - return false; 125 - 126 - return blk_rq_is_seq_zoned_write(rq); 127 - } 128 - EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); 129 - 130 - bool blk_req_zone_write_trylock(struct request *rq) 131 - { 132 - unsigned int zno = blk_rq_zone_no(rq); 133 - 134 - if (test_and_set_bit(zno, rq->q->disk->seq_zones_wlock)) 135 - return false; 136 - 137 - WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); 138 - rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; 139 - 140 - return true; 141 - } 142 - EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock); 143 - 144 - void __blk_req_zone_write_lock(struct request *rq) 145 - { 146 - if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq), 147 - rq->q->disk->seq_zones_wlock))) 148 - return; 149 - 150 - WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); 151 - rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; 152 - } 153 - EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock); 154 - 155 - void __blk_req_zone_write_unlock(struct request *rq) 156 - { 157 - rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED; 158 - if (rq->q->disk->seq_zones_wlock) 159 - WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq), 160 - rq->q->disk->seq_zones_wlock)); 161 - } 162 - EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); 163 53 164 54 /** 165 55 * bdev_nr_zones - Get number of zones ··· 443 425 return ret; 444 426 } 445 427 446 - void disk_free_zone_bitmaps(struct gendisk *disk) 428 + static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector) 447 429 { 430 + if (!disk->conv_zones_bitmap) 431 + return false; 432 + return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap); 433 + } 434 + 435 + static bool disk_insert_zone_wplug(struct gendisk *disk, 436 + struct blk_zone_wplug 
*zwplug) 437 + { 438 + struct blk_zone_wplug *zwplg; 439 + unsigned long flags; 440 + unsigned int idx = 441 + hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits); 442 + 443 + /* 444 + * Add the new zone write plug to the hash table, but carefully as we 445 + * are racing with other submission context, so we may already have a 446 + * zone write plug for the same zone. 447 + */ 448 + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); 449 + hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) { 450 + if (zwplg->zone_no == zwplug->zone_no) { 451 + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); 452 + return false; 453 + } 454 + } 455 + hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); 456 + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); 457 + 458 + return true; 459 + } 460 + 461 + static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, 462 + sector_t sector) 463 + { 464 + unsigned int zno = disk_zone_no(disk, sector); 465 + unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits); 466 + struct blk_zone_wplug *zwplug; 467 + 468 + rcu_read_lock(); 469 + 470 + hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) { 471 + if (zwplug->zone_no == zno && 472 + atomic_inc_not_zero(&zwplug->ref)) { 473 + rcu_read_unlock(); 474 + return zwplug; 475 + } 476 + } 477 + 478 + rcu_read_unlock(); 479 + 480 + return NULL; 481 + } 482 + 483 + static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head) 484 + { 485 + struct blk_zone_wplug *zwplug = 486 + container_of(rcu_head, struct blk_zone_wplug, rcu_head); 487 + 488 + mempool_free(zwplug, zwplug->disk->zone_wplugs_pool); 489 + } 490 + 491 + static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug) 492 + { 493 + if (atomic_dec_and_test(&zwplug->ref)) { 494 + WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); 495 + WARN_ON_ONCE(!list_empty(&zwplug->link)); 496 + WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)); 497 + 498 + 
call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); 499 + } 500 + } 501 + 502 + static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, 503 + struct blk_zone_wplug *zwplug) 504 + { 505 + /* If the zone write plug was already removed, we are done. */ 506 + if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) 507 + return false; 508 + 509 + /* If the zone write plug is still busy, it cannot be removed. */ 510 + if (zwplug->flags & BLK_ZONE_WPLUG_BUSY) 511 + return false; 512 + 513 + /* 514 + * Completions of BIOs with blk_zone_write_plug_bio_endio() may 515 + * happen after handling a request completion with 516 + * blk_zone_write_plug_finish_request() (e.g. with split BIOs 517 + * that are chained). In such case, disk_zone_wplug_unplug_bio() 518 + * should not attempt to remove the zone write plug until all BIO 519 + * completions are seen. Check by looking at the zone write plug 520 + * reference count, which is 2 when the plug is unused (one reference 521 + * taken when the plug was allocated and another reference taken by the 522 + * caller context). 523 + */ 524 + if (atomic_read(&zwplug->ref) > 2) 525 + return false; 526 + 527 + /* We can remove zone write plugs for zones that are empty or full. */ 528 + return !zwplug->wp_offset || zwplug->wp_offset >= disk->zone_capacity; 529 + } 530 + 531 + static void disk_remove_zone_wplug(struct gendisk *disk, 532 + struct blk_zone_wplug *zwplug) 533 + { 534 + unsigned long flags; 535 + 536 + /* If the zone write plug was already removed, we have nothing to do. */ 537 + if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) 538 + return; 539 + 540 + /* 541 + * Mark the zone write plug as unhashed and drop the extra reference we 542 + * took when the plug was inserted in the hash table. 
543 + */ 544 + zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED; 545 + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); 546 + hlist_del_init_rcu(&zwplug->node); 547 + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); 548 + disk_put_zone_wplug(zwplug); 549 + } 550 + 551 + static void blk_zone_wplug_bio_work(struct work_struct *work); 552 + 553 + /* 554 + * Get a reference on the write plug for the zone containing @sector. 555 + * If the plug does not exist, it is allocated and hashed. 556 + * Return a pointer to the zone write plug with the plug spinlock held. 557 + */ 558 + static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk, 559 + sector_t sector, gfp_t gfp_mask, 560 + unsigned long *flags) 561 + { 562 + unsigned int zno = disk_zone_no(disk, sector); 563 + struct blk_zone_wplug *zwplug; 564 + 565 + again: 566 + zwplug = disk_get_zone_wplug(disk, sector); 567 + if (zwplug) { 568 + /* 569 + * Check that a BIO completion or a zone reset or finish 570 + * operation has not already removed the zone write plug from 571 + * the hash table and dropped its reference count. In such case, 572 + * we need to get a new plug so start over from the beginning. 573 + */ 574 + spin_lock_irqsave(&zwplug->lock, *flags); 575 + if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) { 576 + spin_unlock_irqrestore(&zwplug->lock, *flags); 577 + disk_put_zone_wplug(zwplug); 578 + goto again; 579 + } 580 + return zwplug; 581 + } 582 + 583 + /* 584 + * Allocate and initialize a zone write plug with an extra reference 585 + * so that it is not freed when the zone write plug becomes idle without 586 + * the zone being full. 
587 + */ 588 + zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask); 589 + if (!zwplug) 590 + return NULL; 591 + 592 + INIT_HLIST_NODE(&zwplug->node); 593 + INIT_LIST_HEAD(&zwplug->link); 594 + atomic_set(&zwplug->ref, 2); 595 + spin_lock_init(&zwplug->lock); 596 + zwplug->flags = 0; 597 + zwplug->zone_no = zno; 598 + zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1); 599 + bio_list_init(&zwplug->bio_list); 600 + INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work); 601 + zwplug->disk = disk; 602 + 603 + spin_lock_irqsave(&zwplug->lock, *flags); 604 + 605 + /* 606 + * Insert the new zone write plug in the hash table. This can fail only 607 + * if another context already inserted a plug. Retry from the beginning 608 + * in such case. 609 + */ 610 + if (!disk_insert_zone_wplug(disk, zwplug)) { 611 + spin_unlock_irqrestore(&zwplug->lock, *flags); 612 + mempool_free(zwplug, disk->zone_wplugs_pool); 613 + goto again; 614 + } 615 + 616 + return zwplug; 617 + } 618 + 619 + static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug, 620 + struct bio *bio) 621 + { 622 + struct request_queue *q = zwplug->disk->queue; 623 + 624 + bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING); 625 + bio_io_error(bio); 626 + disk_put_zone_wplug(zwplug); 627 + blk_queue_exit(q); 628 + } 629 + 630 + /* 631 + * Abort (fail) all plugged BIOs of a zone write plug. 632 + */ 633 + static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) 634 + { 635 + struct bio *bio; 636 + 637 + while ((bio = bio_list_pop(&zwplug->bio_list))) 638 + blk_zone_wplug_bio_io_error(zwplug, bio); 639 + } 640 + 641 + /* 642 + * Abort (fail) all plugged BIOs of a zone write plug that are not aligned 643 + * with the assumed write pointer location of the zone when the BIO will 644 + * be unplugged. 
645 + */ 646 + static void disk_zone_wplug_abort_unaligned(struct gendisk *disk, 647 + struct blk_zone_wplug *zwplug) 648 + { 649 + unsigned int zone_capacity = disk->zone_capacity; 650 + unsigned int wp_offset = zwplug->wp_offset; 651 + struct bio_list bl = BIO_EMPTY_LIST; 652 + struct bio *bio; 653 + 654 + while ((bio = bio_list_pop(&zwplug->bio_list))) { 655 + if (wp_offset >= zone_capacity || 656 + (bio_op(bio) != REQ_OP_ZONE_APPEND && 657 + bio_offset_from_zone_start(bio) != wp_offset)) { 658 + blk_zone_wplug_bio_io_error(zwplug, bio); 659 + continue; 660 + } 661 + 662 + wp_offset += bio_sectors(bio); 663 + bio_list_add(&bl, bio); 664 + } 665 + 666 + bio_list_merge(&zwplug->bio_list, &bl); 667 + } 668 + 669 + static inline void disk_zone_wplug_set_error(struct gendisk *disk, 670 + struct blk_zone_wplug *zwplug) 671 + { 672 + unsigned long flags; 673 + 674 + if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) 675 + return; 676 + 677 + /* 678 + * At this point, we already have a reference on the zone write plug. 679 + * However, since we are going to add the plug to the disk zone write 680 + * plugs work list, increase its reference count. This reference will 681 + * be dropped in disk_zone_wplugs_work() once the error state is 682 + * handled, or in disk_zone_wplug_clear_error() if the zone is reset or 683 + * finished. 684 + */ 685 + zwplug->flags |= BLK_ZONE_WPLUG_ERROR; 686 + atomic_inc(&zwplug->ref); 687 + 688 + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); 689 + list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list); 690 + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); 691 + } 692 + 693 + static inline void disk_zone_wplug_clear_error(struct gendisk *disk, 694 + struct blk_zone_wplug *zwplug) 695 + { 696 + unsigned long flags; 697 + 698 + if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR)) 699 + return; 700 + 701 + /* 702 + * We are racing with the error handling work which drops the reference 703 + * on the zone write plug after handling the error state. 
So remove the 704 + * plug from the error list and drop its reference count only if the 705 + * error handling has not yet started, that is, if the zone write plug 706 + * is still listed. 707 + */ 708 + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); 709 + if (!list_empty(&zwplug->link)) { 710 + list_del_init(&zwplug->link); 711 + zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR; 712 + disk_put_zone_wplug(zwplug); 713 + } 714 + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); 715 + } 716 + 717 + /* 718 + * Set a zone write plug write pointer offset to either 0 (zone reset case) 719 + * or to the zone size (zone finish case). This aborts all plugged BIOs, which 720 + * is fine to do as doing a zone reset or zone finish while writes are in-flight 721 + * is a mistake from the user which will most likely cause all plugged BIOs to 722 + * fail anyway. 723 + */ 724 + static void disk_zone_wplug_set_wp_offset(struct gendisk *disk, 725 + struct blk_zone_wplug *zwplug, 726 + unsigned int wp_offset) 727 + { 728 + unsigned long flags; 729 + 730 + spin_lock_irqsave(&zwplug->lock, flags); 731 + 732 + /* 733 + * Make sure that a BIO completion or another zone reset or finish 734 + * operation has not already removed the plug from the hash table. 735 + */ 736 + if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) { 737 + spin_unlock_irqrestore(&zwplug->lock, flags); 738 + return; 739 + } 740 + 741 + /* Update the zone write pointer and abort all plugged BIOs. */ 742 + zwplug->wp_offset = wp_offset; 743 + disk_zone_wplug_abort(zwplug); 744 + 745 + /* 746 + * Updating the write pointer offset puts back the zone 747 + * in a good state. So clear the error flag and decrement the 748 + * error count if we were in error state. 749 + */ 750 + disk_zone_wplug_clear_error(disk, zwplug); 751 + 752 + /* 753 + * The zone write plug now has no BIO plugged: remove it from the 754 + * hash table so that it cannot be seen. The plug will be freed 755 + * when the last reference is dropped. 
756 + */ 757 + if (disk_should_remove_zone_wplug(disk, zwplug)) 758 + disk_remove_zone_wplug(disk, zwplug); 759 + 760 + spin_unlock_irqrestore(&zwplug->lock, flags); 761 + } 762 + 763 + static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio, 764 + unsigned int wp_offset) 765 + { 766 + struct gendisk *disk = bio->bi_bdev->bd_disk; 767 + sector_t sector = bio->bi_iter.bi_sector; 768 + struct blk_zone_wplug *zwplug; 769 + 770 + /* Conventional zones cannot be reset nor finished. */ 771 + if (disk_zone_is_conv(disk, sector)) { 772 + bio_io_error(bio); 773 + return true; 774 + } 775 + 776 + /* 777 + * If we have a zone write plug, set its write pointer offset to 0 778 + * (reset case) or to the zone size (finish case). This will abort all 779 + * BIOs plugged for the target zone. It is fine as resetting or 780 + * finishing zones while writes are still in-flight will result in the 781 + * writes failing anyway. 782 + */ 783 + zwplug = disk_get_zone_wplug(disk, sector); 784 + if (zwplug) { 785 + disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset); 786 + disk_put_zone_wplug(zwplug); 787 + } 788 + 789 + return false; 790 + } 791 + 792 + static bool blk_zone_wplug_handle_reset_all(struct bio *bio) 793 + { 794 + struct gendisk *disk = bio->bi_bdev->bd_disk; 795 + struct blk_zone_wplug *zwplug; 796 + sector_t sector; 797 + 798 + /* 799 + * Set the write pointer offset of all zone write plugs to 0. This will 800 + * abort all plugged BIOs. It is fine as resetting zones while writes 801 + * are still in-flight will result in the writes failing anyway. 
802 + */ 803 + for (sector = 0; sector < get_capacity(disk); 804 + sector += disk->queue->limits.chunk_sectors) { 805 + zwplug = disk_get_zone_wplug(disk, sector); 806 + if (zwplug) { 807 + disk_zone_wplug_set_wp_offset(disk, zwplug, 0); 808 + disk_put_zone_wplug(zwplug); 809 + } 810 + } 811 + 812 + return false; 813 + } 814 + 815 + static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug, 816 + struct bio *bio, unsigned int nr_segs) 817 + { 818 + /* 819 + * Grab an extra reference on the BIO request queue usage counter. 820 + * This reference will be reused to submit a request for the BIO for 821 + * blk-mq devices and dropped when the BIO is failed and after 822 + * it is issued in the case of BIO-based devices. 823 + */ 824 + percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter); 825 + 826 + /* 827 + * The BIO is being plugged and thus will have to wait for the on-going 828 + * write and for all other writes already plugged. So polling makes 829 + * no sense. 830 + */ 831 + bio_clear_polled(bio); 832 + 833 + /* 834 + * Reuse the poll cookie field to store the number of segments when 835 + * split to the hardware limits. 836 + */ 837 + bio->__bi_nr_segments = nr_segs; 838 + 839 + /* 840 + * We always receive BIOs after they are split and ready to be issued. 841 + * The block layer passes the parts of a split BIO in order, and the 842 + * user must also issue writes sequentially. So simply add the new BIO 843 + * at the tail of the list to preserve the sequential write order. 844 + */ 845 + bio_list_add(&zwplug->bio_list, bio); 846 + } 847 + 848 + /* 849 + * Called from bio_attempt_back_merge() when a BIO was merged with a request. 850 + */ 851 + void blk_zone_write_plug_bio_merged(struct bio *bio) 852 + { 853 + struct blk_zone_wplug *zwplug; 854 + unsigned long flags; 855 + 856 + /* 857 + * If the BIO was already plugged, then we were called through 858 + * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge(). 
859 + * For this case, we already hold a reference on the zone write plug for 860 + * the BIO and blk_zone_write_plug_init_request() will handle the 861 + * zone write pointer offset update. 862 + */ 863 + if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) 864 + return; 865 + 866 + bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); 867 + 868 + /* 869 + * Get a reference on the zone write plug of the target zone and advance 870 + * the zone write pointer offset. Given that this is a merge, we already 871 + * have at least one request and one BIO referencing the zone write 872 + * plug. So this should not fail. 873 + */ 874 + zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk, 875 + bio->bi_iter.bi_sector); 876 + if (WARN_ON_ONCE(!zwplug)) 877 + return; 878 + 879 + spin_lock_irqsave(&zwplug->lock, flags); 880 + zwplug->wp_offset += bio_sectors(bio); 881 + spin_unlock_irqrestore(&zwplug->lock, flags); 882 + } 883 + 884 + /* 885 + * Attempt to merge plugged BIOs with a newly prepared request for a BIO that 886 + * already went through zone write plugging (either a new BIO or one that was 887 + * unplugged). 888 + */ 889 + void blk_zone_write_plug_init_request(struct request *req) 890 + { 891 + sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req); 892 + struct request_queue *q = req->q; 893 + struct gendisk *disk = q->disk; 894 + unsigned int zone_capacity = disk->zone_capacity; 895 + struct blk_zone_wplug *zwplug = 896 + disk_get_zone_wplug(disk, blk_rq_pos(req)); 897 + unsigned long flags; 898 + struct bio *bio; 899 + 900 + if (WARN_ON_ONCE(!zwplug)) 901 + return; 902 + 903 + /* 904 + * Indicate that completion of this request needs to be handled with 905 + * blk_zone_write_plug_finish_request(), which will drop the reference 906 + * on the zone write plug we took above on entry to this function. 
907 + */ 908 + req->rq_flags |= RQF_ZONE_WRITE_PLUGGING; 909 + 910 + if (blk_queue_nomerges(q)) 911 + return; 912 + 913 + /* 914 + * Walk through the list of plugged BIOs to check if they can be merged 915 + * into the back of the request. 916 + */ 917 + spin_lock_irqsave(&zwplug->lock, flags); 918 + while (zwplug->wp_offset < zone_capacity) { 919 + bio = bio_list_peek(&zwplug->bio_list); 920 + if (!bio) 921 + break; 922 + 923 + if (bio->bi_iter.bi_sector != req_back_sector || 924 + !blk_rq_merge_ok(req, bio)) 925 + break; 926 + 927 + WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES && 928 + !bio->__bi_nr_segments); 929 + 930 + bio_list_pop(&zwplug->bio_list); 931 + if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) != 932 + BIO_MERGE_OK) { 933 + bio_list_add_head(&zwplug->bio_list, bio); 934 + break; 935 + } 936 + 937 + /* 938 + * Drop the extra reference on the queue usage we got when 939 + * plugging the BIO and advance the write pointer offset. 940 + */ 941 + blk_queue_exit(q); 942 + zwplug->wp_offset += bio_sectors(bio); 943 + 944 + req_back_sector += bio_sectors(bio); 945 + } 946 + spin_unlock_irqrestore(&zwplug->lock, flags); 947 + } 948 + 949 + /* 950 + * Check and prepare a BIO for submission by incrementing the write pointer 951 + * offset of its zone write plug and changing zone append operations into 952 + * regular write when zone append emulation is needed. 953 + */ 954 + static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, 955 + struct bio *bio) 956 + { 957 + struct gendisk *disk = bio->bi_bdev->bd_disk; 958 + 959 + /* 960 + * Check that the user is not attempting to write to a full zone. 961 + * We know such BIO will fail, and that would potentially overflow our 962 + * write pointer offset beyond the end of the zone. 963 + */ 964 + if (zwplug->wp_offset >= disk->zone_capacity) 965 + goto err; 966 + 967 + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 968 + /* 969 + * Use a regular write starting at the current write pointer. 
970 + * Similarly to native zone append operations, do not allow 971 + * merging. 972 + */ 973 + bio->bi_opf &= ~REQ_OP_MASK; 974 + bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE; 975 + bio->bi_iter.bi_sector += zwplug->wp_offset; 976 + 977 + /* 978 + * Remember that this BIO is in fact a zone append operation 979 + * so that we can restore its operation code on completion. 980 + */ 981 + bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND); 982 + } else { 983 + /* 984 + * Check for non-sequential writes early because we avoid a 985 + * whole lot of error handling trouble if we don't send it off 986 + * to the driver. 987 + */ 988 + if (bio_offset_from_zone_start(bio) != zwplug->wp_offset) 989 + goto err; 990 + } 991 + 992 + /* Advance the zone write pointer offset. */ 993 + zwplug->wp_offset += bio_sectors(bio); 994 + 995 + return true; 996 + 997 + err: 998 + /* We detected an invalid write BIO: schedule error recovery. */ 999 + disk_zone_wplug_set_error(disk, zwplug); 1000 + kblockd_schedule_work(&disk->zone_wplugs_work); 1001 + return false; 1002 + } 1003 + 1004 + static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) 1005 + { 1006 + struct gendisk *disk = bio->bi_bdev->bd_disk; 1007 + sector_t sector = bio->bi_iter.bi_sector; 1008 + struct blk_zone_wplug *zwplug; 1009 + gfp_t gfp_mask = GFP_NOIO; 1010 + unsigned long flags; 1011 + 1012 + /* 1013 + * BIOs must be fully contained within a zone so that we use the correct 1014 + * zone write plug for the entire BIO. For blk-mq devices, the block 1015 + * layer should already have done any splitting required to ensure this 1016 + * and this BIO should thus not be straddling zone boundaries. For 1017 + * BIO-based devices, it is the responsibility of the driver to split 1018 + * the bio before submitting it. 1019 + */ 1020 + if (WARN_ON_ONCE(bio_straddles_zones(bio))) { 1021 + bio_io_error(bio); 1022 + return true; 1023 + } 1024 + 1025 + /* Conventional zones do not need write plugging. 
*/ 1026 + if (disk_zone_is_conv(disk, sector)) { 1027 + /* Zone append to conventional zones is not allowed. */ 1028 + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 1029 + bio_io_error(bio); 1030 + return true; 1031 + } 1032 + return false; 1033 + } 1034 + 1035 + if (bio->bi_opf & REQ_NOWAIT) 1036 + gfp_mask = GFP_NOWAIT; 1037 + 1038 + zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags); 1039 + if (!zwplug) { 1040 + bio_io_error(bio); 1041 + return true; 1042 + } 1043 + 1044 + /* Indicate that this BIO is being handled using zone write plugging. */ 1045 + bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); 1046 + 1047 + /* 1048 + * If the zone is already plugged or has a pending error, add the BIO 1049 + * to the plug BIO list. Otherwise, plug and let the BIO execute. 1050 + */ 1051 + if (zwplug->flags & BLK_ZONE_WPLUG_BUSY) 1052 + goto plug; 1053 + 1054 + /* 1055 + * If an error is detected when preparing the BIO, add it to the BIO 1056 + * list so that error recovery can deal with it. 1057 + */ 1058 + if (!blk_zone_wplug_prepare_bio(zwplug, bio)) 1059 + goto plug; 1060 + 1061 + zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; 1062 + 1063 + spin_unlock_irqrestore(&zwplug->lock, flags); 1064 + 1065 + return false; 1066 + 1067 + plug: 1068 + zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; 1069 + blk_zone_wplug_add_bio(zwplug, bio, nr_segs); 1070 + 1071 + spin_unlock_irqrestore(&zwplug->lock, flags); 1072 + 1073 + return true; 1074 + } 1075 + 1076 + /** 1077 + * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging 1078 + * @bio: The BIO being submitted 1079 + * @nr_segs: The number of physical segments of @bio 1080 + * 1081 + * Handle write, write zeroes and zone append operations requiring emulation 1082 + * using zone write plugging. 1083 + * 1084 + * Return true whenever @bio execution needs to be delayed through the zone 1085 + * write plug. Otherwise, return false to let the submission path process 1086 + * @bio normally. 
1087 + */ 1088 + bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) 1089 + { 1090 + struct block_device *bdev = bio->bi_bdev; 1091 + 1092 + if (!bdev->bd_disk->zone_wplugs_hash) 1093 + return false; 1094 + 1095 + /* 1096 + * If the BIO already has the plugging flag set, then it was already 1097 + * handled through this path and this is a submission from the zone 1098 + * plug bio submit work. 1099 + */ 1100 + if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) 1101 + return false; 1102 + 1103 + /* 1104 + * We do not need to do anything special for empty flush BIOs, e.g. 1105 + * BIOs such as those issued by blkdev_issue_flush(). This is because it is 1106 + * the responsibility of the user to first wait for the completion of 1107 + * write operations for flush to have any effect on the persistence of 1108 + * the written data. 1109 + */ 1110 + if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) 1111 + return false; 1112 + 1113 + /* 1114 + * Regular writes and write zeroes need to be handled through the target 1115 + * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH 1116 + * which may need to go through the flush machinery depending on the 1117 + * target device capabilities. Plugging such writes is fine as the flush 1118 + * machinery operates at the request level, below the plug, and 1119 + * completion of the flush sequence will go through the regular BIO 1120 + * completion, which will handle zone write plugging. 1121 + * Zone append operations for devices that requested emulation must 1122 + * also be plugged so that these BIOs can be changed into regular 1123 + * write BIOs. 1124 + * Zone reset, reset all and finish commands need special treatment 1125 + * to correctly track the write pointer offset of zones. These commands 1126 + * are not plugged as we do not need serialization with write 1127 + * operations. It is the responsibility of the user to not issue reset 1128 + * and finish commands when write operations are in flight. 
1129 + */ 1130 + switch (bio_op(bio)) { 1131 + case REQ_OP_ZONE_APPEND: 1132 + if (!bdev_emulates_zone_append(bdev)) 1133 + return false; 1134 + fallthrough; 1135 + case REQ_OP_WRITE: 1136 + case REQ_OP_WRITE_ZEROES: 1137 + return blk_zone_wplug_handle_write(bio, nr_segs); 1138 + case REQ_OP_ZONE_RESET: 1139 + return blk_zone_wplug_handle_reset_or_finish(bio, 0); 1140 + case REQ_OP_ZONE_FINISH: 1141 + return blk_zone_wplug_handle_reset_or_finish(bio, 1142 + bdev_zone_sectors(bdev)); 1143 + case REQ_OP_ZONE_RESET_ALL: 1144 + return blk_zone_wplug_handle_reset_all(bio); 1145 + default: 1146 + return false; 1147 + } 1148 + 1149 + return false; 1150 + } 1151 + EXPORT_SYMBOL_GPL(blk_zone_plug_bio); 1152 + 1153 + static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk, 1154 + struct blk_zone_wplug *zwplug) 1155 + { 1156 + /* 1157 + * Take a reference on the zone write plug and schedule the submission 1158 + * of the next plugged BIO. blk_zone_wplug_bio_work() will release the 1159 + * reference we take here. 1160 + */ 1161 + WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); 1162 + atomic_inc(&zwplug->ref); 1163 + queue_work(disk->zone_wplugs_wq, &zwplug->bio_work); 1164 + } 1165 + 1166 + static void disk_zone_wplug_unplug_bio(struct gendisk *disk, 1167 + struct blk_zone_wplug *zwplug) 1168 + { 1169 + unsigned long flags; 1170 + 1171 + spin_lock_irqsave(&zwplug->lock, flags); 1172 + 1173 + /* 1174 + * If we had an error, schedule error recovery. The recovery work 1175 + * will restart submission of plugged BIOs. 1176 + */ 1177 + if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) { 1178 + spin_unlock_irqrestore(&zwplug->lock, flags); 1179 + kblockd_schedule_work(&disk->zone_wplugs_work); 1180 + return; 1181 + } 1182 + 1183 + /* Schedule submission of the next plugged BIO if we have one. 
*/ 1184 + if (!bio_list_empty(&zwplug->bio_list)) { 1185 + disk_zone_wplug_schedule_bio_work(disk, zwplug); 1186 + spin_unlock_irqrestore(&zwplug->lock, flags); 1187 + return; 1188 + } 1189 + 1190 + zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; 1191 + 1192 + /* 1193 + * If the zone is full (it was fully written or finished), or empty 1194 + * (it was reset), remove its zone write plug from the hash table. 1195 + */ 1196 + if (disk_should_remove_zone_wplug(disk, zwplug)) 1197 + disk_remove_zone_wplug(disk, zwplug); 1198 + 1199 + spin_unlock_irqrestore(&zwplug->lock, flags); 1200 + } 1201 + 1202 + void blk_zone_write_plug_bio_endio(struct bio *bio) 1203 + { 1204 + struct gendisk *disk = bio->bi_bdev->bd_disk; 1205 + struct blk_zone_wplug *zwplug = 1206 + disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); 1207 + unsigned long flags; 1208 + 1209 + if (WARN_ON_ONCE(!zwplug)) 1210 + return; 1211 + 1212 + /* Make sure we do not see this BIO again by clearing the plug flag. */ 1213 + bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING); 1214 + 1215 + /* 1216 + * If this is a regular write emulating a zone append operation, 1217 + * restore the original operation code. 1218 + */ 1219 + if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) { 1220 + bio->bi_opf &= ~REQ_OP_MASK; 1221 + bio->bi_opf |= REQ_OP_ZONE_APPEND; 1222 + } 1223 + 1224 + /* 1225 + * If the BIO failed, mark the plug as having an error to trigger 1226 + * recovery. 1227 + */ 1228 + if (bio->bi_status != BLK_STS_OK) { 1229 + spin_lock_irqsave(&zwplug->lock, flags); 1230 + disk_zone_wplug_set_error(disk, zwplug); 1231 + spin_unlock_irqrestore(&zwplug->lock, flags); 1232 + } 1233 + 1234 + /* Drop the reference we took when the BIO was issued. */ 1235 + disk_put_zone_wplug(zwplug); 1236 + 1237 + /* 1238 + * For BIO-based devices, blk_zone_write_plug_finish_request() 1239 + * is not called. So we need to schedule execution of the next 1240 + * plugged BIO here. 
1241 + */ 1242 + if (bio->bi_bdev->bd_has_submit_bio) 1243 + disk_zone_wplug_unplug_bio(disk, zwplug); 1244 + 1245 + /* Drop the reference we took when entering this function. */ 1246 + disk_put_zone_wplug(zwplug); 1247 + } 1248 + 1249 + void blk_zone_write_plug_finish_request(struct request *req) 1250 + { 1251 + struct gendisk *disk = req->q->disk; 1252 + struct blk_zone_wplug *zwplug; 1253 + 1254 + zwplug = disk_get_zone_wplug(disk, req->__sector); 1255 + if (WARN_ON_ONCE(!zwplug)) 1256 + return; 1257 + 1258 + req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING; 1259 + 1260 + /* 1261 + * Drop the reference we took when the request was initialized in 1262 + * blk_zone_write_plug_init_request(). 1263 + */ 1264 + disk_put_zone_wplug(zwplug); 1265 + 1266 + disk_zone_wplug_unplug_bio(disk, zwplug); 1267 + 1268 + /* Drop the reference we took when entering this function. */ 1269 + disk_put_zone_wplug(zwplug); 1270 + } 1271 + 1272 + static void blk_zone_wplug_bio_work(struct work_struct *work) 1273 + { 1274 + struct blk_zone_wplug *zwplug = 1275 + container_of(work, struct blk_zone_wplug, bio_work); 1276 + struct block_device *bdev; 1277 + unsigned long flags; 1278 + struct bio *bio; 1279 + 1280 + /* 1281 + * Submit the next plugged BIO. If we do not have any, clear 1282 + * the plugged flag. 1283 + */ 1284 + spin_lock_irqsave(&zwplug->lock, flags); 1285 + 1286 + bio = bio_list_pop(&zwplug->bio_list); 1287 + if (!bio) { 1288 + zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; 1289 + spin_unlock_irqrestore(&zwplug->lock, flags); 1290 + goto put_zwplug; 1291 + } 1292 + 1293 + if (!blk_zone_wplug_prepare_bio(zwplug, bio)) { 1294 + /* Error recovery will decide what to do with the BIO. 
*/ 1295 + bio_list_add_head(&zwplug->bio_list, bio); 1296 + spin_unlock_irqrestore(&zwplug->lock, flags); 1297 + goto put_zwplug; 1298 + } 1299 + 1300 + spin_unlock_irqrestore(&zwplug->lock, flags); 1301 + 1302 + bdev = bio->bi_bdev; 1303 + submit_bio_noacct_nocheck(bio); 1304 + 1305 + /* 1306 + * blk-mq devices will reuse the extra reference on the request queue 1307 + * usage counter we took when the BIO was plugged, but the submission 1308 + * path for BIO-based devices will not do that. So drop this extra 1309 + * reference here. 1310 + */ 1311 + if (bdev->bd_has_submit_bio) 1312 + blk_queue_exit(bdev->bd_disk->queue); 1313 + 1314 + put_zwplug: 1315 + /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */ 1316 + disk_put_zone_wplug(zwplug); 1317 + } 1318 + 1319 + static unsigned int blk_zone_wp_offset(struct blk_zone *zone) 1320 + { 1321 + switch (zone->cond) { 1322 + case BLK_ZONE_COND_IMP_OPEN: 1323 + case BLK_ZONE_COND_EXP_OPEN: 1324 + case BLK_ZONE_COND_CLOSED: 1325 + return zone->wp - zone->start; 1326 + case BLK_ZONE_COND_FULL: 1327 + return zone->len; 1328 + case BLK_ZONE_COND_EMPTY: 1329 + return 0; 1330 + case BLK_ZONE_COND_NOT_WP: 1331 + case BLK_ZONE_COND_OFFLINE: 1332 + case BLK_ZONE_COND_READONLY: 1333 + default: 1334 + /* 1335 + * Conventional, offline and read-only zones do not have a valid 1336 + * write pointer. 
1337 + */ 1338 + return UINT_MAX; 1339 + } 1340 + } 1341 + 1342 + static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone, 1343 + unsigned int idx, void *data) 1344 + { 1345 + struct blk_zone *zonep = data; 1346 + 1347 + *zonep = *zone; 1348 + return 0; 1349 + } 1350 + 1351 + static void disk_zone_wplug_handle_error(struct gendisk *disk, 1352 + struct blk_zone_wplug *zwplug) 1353 + { 1354 + sector_t zone_start_sector = 1355 + bdev_zone_sectors(disk->part0) * zwplug->zone_no; 1356 + unsigned int noio_flag; 1357 + struct blk_zone zone; 1358 + unsigned long flags; 1359 + int ret; 1360 + 1361 + /* Get the current zone information from the device. */ 1362 + noio_flag = memalloc_noio_save(); 1363 + ret = disk->fops->report_zones(disk, zone_start_sector, 1, 1364 + blk_zone_wplug_report_zone_cb, &zone); 1365 + memalloc_noio_restore(noio_flag); 1366 + 1367 + spin_lock_irqsave(&zwplug->lock, flags); 1368 + 1369 + /* 1370 + * A zone reset or finish may have cleared the error already. In such 1371 + * case, do nothing as the report zones may have seen the "old" write 1372 + * pointer value before the reset/finish operation completed. 1373 + */ 1374 + if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR)) 1375 + goto unlock; 1376 + 1377 + zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR; 1378 + 1379 + if (ret != 1) { 1380 + /* 1381 + * We failed to get the zone information, meaning that something 1382 + * is likely really wrong with the device. Abort all remaining 1383 + * plugged BIOs as otherwise we could end up waiting forever on 1384 + * plugged BIOs to complete if there is a queue freeze on-going. 1385 + */ 1386 + disk_zone_wplug_abort(zwplug); 1387 + goto unplug; 1388 + } 1389 + 1390 + /* Update the zone write pointer offset. */ 1391 + zwplug->wp_offset = blk_zone_wp_offset(&zone); 1392 + disk_zone_wplug_abort_unaligned(disk, zwplug); 1393 + 1394 + /* Restart BIO submission if we still have any BIO left. 
*/ 1395 + if (!bio_list_empty(&zwplug->bio_list)) { 1396 + disk_zone_wplug_schedule_bio_work(disk, zwplug); 1397 + goto unlock; 1398 + } 1399 + 1400 + unplug: 1401 + zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; 1402 + if (disk_should_remove_zone_wplug(disk, zwplug)) 1403 + disk_remove_zone_wplug(disk, zwplug); 1404 + 1405 + unlock: 1406 + spin_unlock_irqrestore(&zwplug->lock, flags); 1407 + } 1408 + 1409 + static void disk_zone_wplugs_work(struct work_struct *work) 1410 + { 1411 + struct gendisk *disk = 1412 + container_of(work, struct gendisk, zone_wplugs_work); 1413 + struct blk_zone_wplug *zwplug; 1414 + unsigned long flags; 1415 + 1416 + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); 1417 + 1418 + while (!list_empty(&disk->zone_wplugs_err_list)) { 1419 + zwplug = list_first_entry(&disk->zone_wplugs_err_list, 1420 + struct blk_zone_wplug, link); 1421 + list_del_init(&zwplug->link); 1422 + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); 1423 + 1424 + disk_zone_wplug_handle_error(disk, zwplug); 1425 + disk_put_zone_wplug(zwplug); 1426 + 1427 + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); 1428 + } 1429 + 1430 + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); 1431 + } 1432 + 1433 + static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) 1434 + { 1435 + return 1U << disk->zone_wplugs_hash_bits; 1436 + } 1437 + 1438 + void disk_init_zone_resources(struct gendisk *disk) 1439 + { 1440 + spin_lock_init(&disk->zone_wplugs_lock); 1441 + INIT_LIST_HEAD(&disk->zone_wplugs_err_list); 1442 + INIT_WORK(&disk->zone_wplugs_work, disk_zone_wplugs_work); 1443 + } 1444 + 1445 + /* 1446 + * For the size of a disk zone write plug hash table, use the size of the 1447 + * zone write plug mempool, which is the maximum of the disk open zones and 1448 + * active zones limits. But do not exceed 4KB (512 hlist head entries), that is, 1449 + * 9 bits. For a disk that has no limits, mempool size defaults to 128. 
1450 + */ 1451 + #define BLK_ZONE_WPLUG_MAX_HASH_BITS 9 1452 + #define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE 128 1453 + 1454 + static int disk_alloc_zone_resources(struct gendisk *disk, 1455 + unsigned int pool_size) 1456 + { 1457 + unsigned int i; 1458 + 1459 + disk->zone_wplugs_hash_bits = 1460 + min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS); 1461 + 1462 + disk->zone_wplugs_hash = 1463 + kcalloc(disk_zone_wplugs_hash_size(disk), 1464 + sizeof(struct hlist_head), GFP_KERNEL); 1465 + if (!disk->zone_wplugs_hash) 1466 + return -ENOMEM; 1467 + 1468 + for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) 1469 + INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]); 1470 + 1471 + disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size, 1472 + sizeof(struct blk_zone_wplug)); 1473 + if (!disk->zone_wplugs_pool) 1474 + goto free_hash; 1475 + 1476 + disk->zone_wplugs_wq = 1477 + alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI, 1478 + pool_size, disk->disk_name); 1479 + if (!disk->zone_wplugs_wq) 1480 + goto destroy_pool; 1481 + 1482 + return 0; 1483 + 1484 + destroy_pool: 1485 + mempool_destroy(disk->zone_wplugs_pool); 1486 + disk->zone_wplugs_pool = NULL; 1487 + free_hash: 1488 + kfree(disk->zone_wplugs_hash); 1489 + disk->zone_wplugs_hash = NULL; 1490 + disk->zone_wplugs_hash_bits = 0; 1491 + return -ENOMEM; 1492 + } 1493 + 1494 + static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) 1495 + { 1496 + struct blk_zone_wplug *zwplug; 1497 + unsigned int i; 1498 + 1499 + if (!disk->zone_wplugs_hash) 1500 + return; 1501 + 1502 + /* Free all the zone write plugs we have. 
*/ 1503 + for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { 1504 + while (!hlist_empty(&disk->zone_wplugs_hash[i])) { 1505 + zwplug = hlist_entry(disk->zone_wplugs_hash[i].first, 1506 + struct blk_zone_wplug, node); 1507 + atomic_inc(&zwplug->ref); 1508 + disk_remove_zone_wplug(disk, zwplug); 1509 + disk_put_zone_wplug(zwplug); 1510 + } 1511 + } 1512 + 1513 + kfree(disk->zone_wplugs_hash); 1514 + disk->zone_wplugs_hash = NULL; 1515 + disk->zone_wplugs_hash_bits = 0; 1516 + } 1517 + 1518 + void disk_free_zone_resources(struct gendisk *disk) 1519 + { 1520 + cancel_work_sync(&disk->zone_wplugs_work); 1521 + 1522 + if (disk->zone_wplugs_wq) { 1523 + destroy_workqueue(disk->zone_wplugs_wq); 1524 + disk->zone_wplugs_wq = NULL; 1525 + } 1526 + 1527 + disk_destroy_zone_wplugs_hash_table(disk); 1528 + 1529 + /* 1530 + * Wait for the zone write plugs to be RCU-freed before 1531 + * destroying the mempool. 1532 + */ 1533 + rcu_barrier(); 1534 + 1535 + mempool_destroy(disk->zone_wplugs_pool); 1536 + disk->zone_wplugs_pool = NULL; 1537 + 448 1538 kfree(disk->conv_zones_bitmap); 449 1539 disk->conv_zones_bitmap = NULL; 450 - kfree(disk->seq_zones_wlock); 451 - disk->seq_zones_wlock = NULL; 1540 + disk->zone_capacity = 0; 1541 + disk->nr_zones = 0; 1542 + } 1543 + 1544 + static inline bool disk_need_zone_resources(struct gendisk *disk) 1545 + { 1546 + /* 1547 + * All mq zoned devices need zone resources so that the block layer 1548 + * can automatically handle write BIO plugging. BIO-based device drivers 1549 + * (e.g. DM devices) are normally responsible for handling zone write 1550 + * ordering and do not need zone resources, unless the driver requires 1551 + * zone append emulation. 
1552 + */ 1553 + return queue_is_mq(disk->queue) || 1554 + queue_emulates_zone_append(disk->queue); 1555 + } 1556 + 1557 + static int disk_revalidate_zone_resources(struct gendisk *disk, 1558 + unsigned int nr_zones) 1559 + { 1560 + struct queue_limits *lim = &disk->queue->limits; 1561 + unsigned int pool_size; 1562 + 1563 + if (!disk_need_zone_resources(disk)) 1564 + return 0; 1565 + 1566 + /* 1567 + * If the device has no limit on the maximum number of open and active 1568 + * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE. 1569 + */ 1570 + pool_size = max(lim->max_open_zones, lim->max_active_zones); 1571 + if (!pool_size) 1572 + pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones); 1573 + 1574 + if (!disk->zone_wplugs_hash) 1575 + return disk_alloc_zone_resources(disk, pool_size); 1576 + 1577 + return 0; 452 1578 } 453 1579 454 1580 struct blk_revalidate_zone_args { 455 1581 struct gendisk *disk; 456 1582 unsigned long *conv_zones_bitmap; 457 - unsigned long *seq_zones_wlock; 458 1583 unsigned int nr_zones; 1584 + unsigned int zone_capacity; 459 1585 sector_t sector; 460 1586 }; 1587 + 1588 + /* 1589 + * Update the disk zone resources information and device queue limits. 1590 + * The disk queue is frozen when this is executed. 
1591 + */ 1592 + static int disk_update_zone_resources(struct gendisk *disk, 1593 + struct blk_revalidate_zone_args *args) 1594 + { 1595 + struct request_queue *q = disk->queue; 1596 + unsigned int nr_seq_zones, nr_conv_zones = 0; 1597 + unsigned int pool_size; 1598 + struct queue_limits lim; 1599 + 1600 + disk->nr_zones = args->nr_zones; 1601 + disk->zone_capacity = args->zone_capacity; 1602 + swap(disk->conv_zones_bitmap, args->conv_zones_bitmap); 1603 + if (disk->conv_zones_bitmap) 1604 + nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap, 1605 + disk->nr_zones); 1606 + if (nr_conv_zones >= disk->nr_zones) { 1607 + pr_warn("%s: Invalid number of conventional zones %u / %u\n", 1608 + disk->disk_name, nr_conv_zones, disk->nr_zones); 1609 + return -ENODEV; 1610 + } 1611 + 1612 + if (!disk->zone_wplugs_pool) 1613 + return 0; 1614 + 1615 + /* 1616 + * If the device has no limit on the maximum number of open and active 1617 + * zones, set its max open zone limit to the mempool size to indicate 1618 + * to the user that there is a potential performance impact due to 1619 + * dynamic zone write plug allocation when simultaneously writing to 1620 + * more zones than the size of the mempool. 
1621 + */ 1622 + lim = queue_limits_start_update(q); 1623 + 1624 + nr_seq_zones = disk->nr_zones - nr_conv_zones; 1625 + pool_size = max(lim.max_open_zones, lim.max_active_zones); 1626 + if (!pool_size) 1627 + pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones); 1628 + 1629 + mempool_resize(disk->zone_wplugs_pool, pool_size); 1630 + 1631 + if (!lim.max_open_zones && !lim.max_active_zones) { 1632 + if (pool_size < nr_seq_zones) 1633 + lim.max_open_zones = pool_size; 1634 + else 1635 + lim.max_open_zones = 0; 1636 + } 1637 + 1638 + return queue_limits_commit_update(q, &lim); 1639 + } 1640 + 1641 + static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, 1642 + struct blk_revalidate_zone_args *args) 1643 + { 1644 + struct gendisk *disk = args->disk; 1645 + struct request_queue *q = disk->queue; 1646 + 1647 + if (zone->capacity != zone->len) { 1648 + pr_warn("%s: Invalid conventional zone capacity\n", 1649 + disk->disk_name); 1650 + return -ENODEV; 1651 + } 1652 + 1653 + if (!disk_need_zone_resources(disk)) 1654 + return 0; 1655 + 1656 + if (!args->conv_zones_bitmap) { 1657 + args->conv_zones_bitmap = 1658 + blk_alloc_zone_bitmap(q->node, args->nr_zones); 1659 + if (!args->conv_zones_bitmap) 1660 + return -ENOMEM; 1661 + } 1662 + 1663 + set_bit(idx, args->conv_zones_bitmap); 1664 + 1665 + return 0; 1666 + } 1667 + 1668 + static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, 1669 + struct blk_revalidate_zone_args *args) 1670 + { 1671 + struct gendisk *disk = args->disk; 1672 + struct blk_zone_wplug *zwplug; 1673 + unsigned int wp_offset; 1674 + unsigned long flags; 1675 + 1676 + /* 1677 + * Remember the capacity of the first sequential zone and check 1678 + * if it is constant for all zones. 
1679 + */ 1680 + if (!args->zone_capacity) 1681 + args->zone_capacity = zone->capacity; 1682 + if (zone->capacity != args->zone_capacity) { 1683 + pr_warn("%s: Invalid variable zone capacity\n", 1684 + disk->disk_name); 1685 + return -ENODEV; 1686 + } 1687 + 1688 + /* 1689 + * We need to track the write pointer of all zones that are not 1690 + * empty nor full. So make sure we have a zone write plug for 1691 + * such zone if the device has a zone write plug hash table. 1692 + */ 1693 + if (!disk->zone_wplugs_hash) 1694 + return 0; 1695 + 1696 + wp_offset = blk_zone_wp_offset(zone); 1697 + if (!wp_offset || wp_offset >= zone->capacity) 1698 + return 0; 1699 + 1700 + zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags); 1701 + if (!zwplug) 1702 + return -ENOMEM; 1703 + spin_unlock_irqrestore(&zwplug->lock, flags); 1704 + disk_put_zone_wplug(zwplug); 1705 + 1706 + return 0; 1707 + } 461 1708 462 1709 /* 463 1710 * Helper function to check the validity of zones of a zoned block device. 
··· 1732 449 { 1733 450 struct blk_revalidate_zone_args *args = data; 1734 451 struct gendisk *disk = args->disk; 1735 - struct request_queue *q = disk->queue; 1736 452 sector_t capacity = get_capacity(disk); 1737 - sector_t zone_sectors = q->limits.chunk_sectors; 453 + sector_t zone_sectors = disk->queue->limits.chunk_sectors; 454 + int ret; 1738 455 1739 456 /* Check for bad zones and holes in the zone report */ 1740 457 if (zone->start != args->sector) { ··· 1765 482 return -ENODEV; 1766 483 } 1767 484 485 + if (!zone->capacity || zone->capacity > zone->len) { 486 + pr_warn("%s: Invalid zone capacity\n", 487 + disk->disk_name); 488 + return -ENODEV; 489 + } 490 + 1768 491 /* Check zone type */ 1769 492 switch (zone->type) { 1770 493 case BLK_ZONE_TYPE_CONVENTIONAL: 1771 - if (!args->conv_zones_bitmap) { 1772 - args->conv_zones_bitmap = 1773 - blk_alloc_zone_bitmap(q->node, args->nr_zones); 1774 - if (!args->conv_zones_bitmap) 1775 - return -ENOMEM; 1776 - } 1777 - set_bit(idx, args->conv_zones_bitmap); 494 + ret = blk_revalidate_conv_zone(zone, idx, args); 1778 495 break; 1779 496 case BLK_ZONE_TYPE_SEQWRITE_REQ: 1780 - if (!args->seq_zones_wlock) { 1781 - args->seq_zones_wlock = 1782 - blk_alloc_zone_bitmap(q->node, args->nr_zones); 1783 - if (!args->seq_zones_wlock) 1784 - return -ENOMEM; 1785 - } 497 + ret = blk_revalidate_seq_zone(zone, idx, args); 1786 498 break; 1787 499 case BLK_ZONE_TYPE_SEQWRITE_PREF: 1788 500 default: 1789 501 pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", 1790 502 disk->disk_name, (int)zone->type, zone->start); 1791 - return -ENODEV; 503 + ret = -ENODEV; 1792 504 } 1793 505 1794 - args->sector += zone->len; 1795 - return 0; 506 + if (!ret) 507 + args->sector += zone->len; 508 + 509 + return ret; 1796 510 } 1797 511 1798 512 /** 1799 - * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps 513 + * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs 1800 514 * @disk: Target disk 1801 - * 
@update_driver_data: Callback to update driver data on the frozen disk 1802 515 * 1803 - * Helper function for low-level device drivers to check and (re) allocate and 1804 - * initialize a disk request queue zone bitmaps. This functions should normally 1805 - * be called within the disk ->revalidate method for blk-mq based drivers. 516 + * Helper function for low-level device drivers to check, (re) allocate and 517 + * initialize resources used for managing zoned disks. This function should 518 + * normally be called by blk-mq based drivers when a zoned gendisk is probed 519 + * and when the zone configuration of the gendisk changes (e.g. after a format). 1806 520 * Before calling this function, the device driver must already have set the 1807 521 * device zone size (chunk_sector limit) and the max zone append limit. 1808 - * For BIO based drivers, this function cannot be used. BIO based device drivers 1809 - * only need to set disk->nr_zones so that the sysfs exposed value is correct. 1810 - * If the @update_driver_data callback function is not NULL, the callback is 1811 - * executed with the device request queue frozen after all zones have been 1812 - * checked. 522 + * BIO based drivers can also use this function as long as the device queue 523 + * can be safely frozen. 
1813 524 */ 1814 - int blk_revalidate_disk_zones(struct gendisk *disk, 1815 - void (*update_driver_data)(struct gendisk *disk)) 525 + int blk_revalidate_disk_zones(struct gendisk *disk) 1816 526 { 1817 527 struct request_queue *q = disk->queue; 1818 528 sector_t zone_sectors = q->limits.chunk_sectors; 1819 529 sector_t capacity = get_capacity(disk); 1820 530 struct blk_revalidate_zone_args args = { }; 1821 531 unsigned int noio_flag; 1822 - int ret; 532 + int ret = -ENOMEM; 1823 533 1824 534 if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) 1825 - return -EIO; 1826 - if (WARN_ON_ONCE(!queue_is_mq(q))) 1827 535 return -EIO; 1828 536 1829 537 if (!capacity) ··· 1830 556 return -ENODEV; 1831 557 } 1832 558 1833 - if (!q->limits.max_zone_append_sectors) { 559 + if (!queue_max_zone_append_sectors(q)) { 1834 560 pr_warn("%s: Invalid 0 maximum zone append limit\n", 1835 561 disk->disk_name); 1836 562 return -ENODEV; ··· 1843 569 args.disk = disk; 1844 570 args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors); 1845 571 noio_flag = memalloc_noio_save(); 572 + ret = disk_revalidate_zone_resources(disk, args.nr_zones); 573 + if (ret) { 574 + memalloc_noio_restore(noio_flag); 575 + return ret; 576 + } 1846 577 ret = disk->fops->report_zones(disk, 0, UINT_MAX, 1847 578 blk_revalidate_zone_cb, &args); 1848 579 if (!ret) { ··· 1867 588 } 1868 589 1869 590 /* 1870 - * Install the new bitmaps and update nr_zones only once the queue is 1871 - * stopped and all I/Os are completed (i.e. a scheduler is not 1872 - * referencing the bitmaps). 591 + * Set the new disk zone parameters only once the queue is frozen and 592 + * all I/Os are completed. 
1873 593 */ 1874 594 blk_mq_freeze_queue(q); 1875 - if (ret > 0) { 1876 - disk->nr_zones = args.nr_zones; 1877 - swap(disk->seq_zones_wlock, args.seq_zones_wlock); 1878 - swap(disk->conv_zones_bitmap, args.conv_zones_bitmap); 1879 - if (update_driver_data) 1880 - update_driver_data(disk); 1881 - ret = 0; 1882 - } else { 595 + if (ret > 0) 596 + ret = disk_update_zone_resources(disk, &args); 597 + else 1883 598 pr_warn("%s: failed to revalidate zones\n", disk->disk_name); 1884 - disk_free_zone_bitmaps(disk); 1885 - } 599 + if (ret) 600 + disk_free_zone_resources(disk); 1886 601 blk_mq_unfreeze_queue(q); 1887 602 1888 - kfree(args.seq_zones_wlock); 1889 603 kfree(args.conv_zones_bitmap); 604 + 1890 605 return ret; 1891 606 } 1892 607 EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); 608 + 609 + #ifdef CONFIG_BLK_DEBUG_FS 610 + 611 + int queue_zone_wplugs_show(void *data, struct seq_file *m) 612 + { 613 + struct request_queue *q = data; 614 + struct gendisk *disk = q->disk; 615 + struct blk_zone_wplug *zwplug; 616 + unsigned int zwp_wp_offset, zwp_flags; 617 + unsigned int zwp_zone_no, zwp_ref; 618 + unsigned int zwp_bio_list_size, i; 619 + unsigned long flags; 620 + 621 + if (!disk->zone_wplugs_hash) 622 + return 0; 623 + 624 + rcu_read_lock(); 625 + for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { 626 + hlist_for_each_entry_rcu(zwplug, 627 + &disk->zone_wplugs_hash[i], node) { 628 + spin_lock_irqsave(&zwplug->lock, flags); 629 + zwp_zone_no = zwplug->zone_no; 630 + zwp_flags = zwplug->flags; 631 + zwp_ref = atomic_read(&zwplug->ref); 632 + zwp_wp_offset = zwplug->wp_offset; 633 + zwp_bio_list_size = bio_list_size(&zwplug->bio_list); 634 + spin_unlock_irqrestore(&zwplug->lock, flags); 635 + 636 + seq_printf(m, "%u 0x%x %u %u %u\n", 637 + zwp_zone_no, zwp_flags, zwp_ref, 638 + zwp_wp_offset, zwp_bio_list_size); 639 + } 640 + } 641 + rcu_read_unlock(); 642 + 643 + return 0; 644 + } 645 + 646 + #endif

+84 -13

block/blk.h

··· 38 38 void blk_queue_start_drain(struct request_queue *q); 39 39 int __bio_queue_enter(struct request_queue *q, struct bio *bio); 40 40 void submit_bio_noacct_nocheck(struct bio *bio); 41 + void bio_await_chain(struct bio *bio); 41 42 42 43 static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) 43 44 { ··· 270 269 unsigned long blk_rq_timeout(unsigned long timeout); 271 270 void blk_add_timer(struct request *req); 272 271 272 + enum bio_merge_status { 273 + BIO_MERGE_OK, 274 + BIO_MERGE_NONE, 275 + BIO_MERGE_FAILED, 276 + }; 277 + 278 + enum bio_merge_status bio_attempt_back_merge(struct request *req, 279 + struct bio *bio, unsigned int nr_segs); 273 280 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, 274 281 unsigned int nr_segs); 275 282 bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, ··· 366 357 } 367 358 368 359 void update_io_ticks(struct block_device *part, unsigned long now, bool end); 360 + unsigned int part_in_flight(struct block_device *part); 369 361 370 362 static inline void req_set_nomerge(struct request_queue *q, struct request *req) 371 363 { ··· 388 378 } 389 379 #endif /* CONFIG_BLK_ICQ */ 390 380 391 - #ifdef CONFIG_BLK_DEV_THROTTLING_LOW 392 - extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); 393 - extern ssize_t blk_throtl_sample_time_store(struct request_queue *q, 394 - const char *page, size_t count); 395 - extern void blk_throtl_bio_endio(struct bio *bio); 396 - extern void blk_throtl_stat_add(struct request *rq, u64 time); 397 - #else 398 - static inline void blk_throtl_bio_endio(struct bio *bio) { } 399 - static inline void blk_throtl_stat_add(struct request *rq, u64 time) { } 400 - #endif 401 - 402 381 struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q); 403 382 404 383 static inline bool blk_queue_may_bounce(struct request_queue *q) ··· 406 407 } 407 408 408 409 #ifdef CONFIG_BLK_DEV_ZONED 409 - void 
disk_free_zone_bitmaps(struct gendisk *disk); 410 + void disk_init_zone_resources(struct gendisk *disk); 411 + void disk_free_zone_resources(struct gendisk *disk); 412 + static inline bool bio_zone_write_plugging(struct bio *bio) 413 + { 414 + return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING); 415 + } 416 + static inline bool bio_is_zone_append(struct bio *bio) 417 + { 418 + return bio_op(bio) == REQ_OP_ZONE_APPEND || 419 + bio_flagged(bio, BIO_EMULATES_ZONE_APPEND); 420 + } 421 + void blk_zone_write_plug_bio_merged(struct bio *bio); 422 + void blk_zone_write_plug_init_request(struct request *rq); 423 + static inline void blk_zone_update_request_bio(struct request *rq, 424 + struct bio *bio) 425 + { 426 + /* 427 + * For zone append requests, the request sector indicates the location 428 + * at which the BIO data was written. Return this value to the BIO 429 + * issuer through the BIO iter sector. 430 + * For plugged zone writes, which include emulated zone append, we need 431 + * the original BIO sector so that blk_zone_write_plug_bio_endio() can 432 + * lookup the zone write plug. 433 + */ 434 + if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio)) 435 + bio->bi_iter.bi_sector = rq->__sector; 436 + } 437 + void blk_zone_write_plug_bio_endio(struct bio *bio); 438 + static inline void blk_zone_bio_endio(struct bio *bio) 439 + { 440 + /* 441 + * For write BIOs to zoned devices, signal the completion of the BIO so 442 + * that the next write BIO can be submitted by zone write plugging. 
443 + */ 444 + if (bio_zone_write_plugging(bio)) 445 + blk_zone_write_plug_bio_endio(bio); 446 + } 447 + 448 + void blk_zone_write_plug_finish_request(struct request *rq); 449 + static inline void blk_zone_finish_request(struct request *rq) 450 + { 451 + if (rq->rq_flags & RQF_ZONE_WRITE_PLUGGING) 452 + blk_zone_write_plug_finish_request(rq); 453 + } 410 454 int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, 411 455 unsigned long arg); 412 456 int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, 413 457 unsigned int cmd, unsigned long arg); 414 458 #else /* CONFIG_BLK_DEV_ZONED */ 415 - static inline void disk_free_zone_bitmaps(struct gendisk *disk) {} 459 + static inline void disk_init_zone_resources(struct gendisk *disk) 460 + { 461 + } 462 + static inline void disk_free_zone_resources(struct gendisk *disk) 463 + { 464 + } 465 + static inline bool bio_zone_write_plugging(struct bio *bio) 466 + { 467 + return false; 468 + } 469 + static inline bool bio_is_zone_append(struct bio *bio) 470 + { 471 + return false; 472 + } 473 + static inline void blk_zone_write_plug_bio_merged(struct bio *bio) 474 + { 475 + } 476 + static inline void blk_zone_write_plug_init_request(struct request *rq) 477 + { 478 + } 479 + static inline void blk_zone_update_request_bio(struct request *rq, 480 + struct bio *bio) 481 + { 482 + } 483 + static inline void blk_zone_bio_endio(struct bio *bio) 484 + { 485 + } 486 + static inline void blk_zone_finish_request(struct request *rq) 487 + { 488 + } 416 489 static inline int blkdev_report_zones_ioctl(struct block_device *bdev, 417 490 unsigned int cmd, unsigned long arg) 418 491 {

+5 -41

block/elevator.c

··· 83 83 } 84 84 EXPORT_SYMBOL(elv_bio_merge_ok); 85 85 86 - static inline bool elv_support_features(struct request_queue *q, 87 - const struct elevator_type *e) 88 - { 89 - return (q->required_elevator_features & e->elevator_features) == 90 - q->required_elevator_features; 91 - } 92 - 93 86 /** 94 87 * elevator_match - Check whether @e's name or alias matches @name 95 88 * @e: Scheduler to test ··· 113 120 114 121 spin_lock(&elv_list_lock); 115 122 e = __elevator_find(name); 116 - if (e && (!elv_support_features(q, e) || !elevator_tryget(e))) 123 + if (e && (!elevator_tryget(e))) 117 124 e = NULL; 118 125 spin_unlock(&elv_list_lock); 119 126 return e; ··· 573 580 } 574 581 575 582 /* 576 - * Get the first elevator providing the features required by the request queue. 577 - * Default to "none" if no matching elevator is found. 578 - */ 579 - static struct elevator_type *elevator_get_by_features(struct request_queue *q) 580 - { 581 - struct elevator_type *e, *found = NULL; 582 - 583 - spin_lock(&elv_list_lock); 584 - 585 - list_for_each_entry(e, &elv_list, list) { 586 - if (elv_support_features(q, e)) { 587 - found = e; 588 - break; 589 - } 590 - } 591 - 592 - if (found && !elevator_tryget(found)) 593 - found = NULL; 594 - 595 - spin_unlock(&elv_list_lock); 596 - return found; 597 - } 598 - 599 - /* 600 - * For a device queue that has no required features, use the default elevator 601 - * settings. Otherwise, use the first elevator available matching the required 602 - * features. If no suitable elevator is find or if the chosen elevator 603 - * initialization fails, fall back to the "none" elevator (no elevator). 583 + * Use the default elevator settings. If the chosen elevator initialization 584 + * fails, fall back to the "none" elevator (no elevator). 
604 585 */ 605 586 void elevator_init_mq(struct request_queue *q) 606 587 { ··· 589 622 if (unlikely(q->elevator)) 590 623 return; 591 624 592 - if (!q->required_elevator_features) 593 - e = elevator_get_default(q); 594 - else 595 - e = elevator_get_by_features(q); 625 + e = elevator_get_default(q); 596 626 if (!e) 597 627 return; 598 628 ··· 745 781 list_for_each_entry(e, &elv_list, list) { 746 782 if (e == cur) 747 783 len += sprintf(name+len, "[%s] ", e->elevator_name); 748 - else if (elv_support_features(q, e)) 784 + else 749 785 len += sprintf(name+len, "%s ", e->elevator_name); 750 786 } 751 787 spin_unlock(&elv_list_lock);

-1

block/elevator.h

··· 74 74 struct elv_fs_entry *elevator_attrs; 75 75 const char *elevator_name; 76 76 const char *elevator_alias; 77 - const unsigned int elevator_features; 78 77 struct module *elevator_owner; 79 78 #ifdef CONFIG_BLK_DEBUG_FS 80 79 const struct blk_mq_debugfs_attr *queue_debugfs_attrs;

+13 -18

block/fops.c

··· 44 44 #define DIO_INLINE_BIO_VECS 4 45 45 46 46 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, 47 - struct iov_iter *iter, unsigned int nr_pages) 47 + struct iov_iter *iter, struct block_device *bdev, 48 + unsigned int nr_pages) 48 49 { 49 - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); 50 50 struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; 51 51 loff_t pos = iocb->ki_pos; 52 52 bool should_dirty = false; 53 53 struct bio bio; 54 54 ssize_t ret; 55 - 56 - if (blkdev_dio_unaligned(bdev, pos, iter)) 57 - return -EINVAL; 58 55 59 56 if (nr_pages <= DIO_INLINE_BIO_VECS) 60 57 vecs = inline_vecs; ··· 158 161 } 159 162 160 163 static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, 161 - unsigned int nr_pages) 164 + struct block_device *bdev, unsigned int nr_pages) 162 165 { 163 - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); 164 166 struct blk_plug plug; 165 167 struct blkdev_dio *dio; 166 168 struct bio *bio; ··· 167 171 blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); 168 172 loff_t pos = iocb->ki_pos; 169 173 int ret = 0; 170 - 171 - if (blkdev_dio_unaligned(bdev, pos, iter)) 172 - return -EINVAL; 173 174 174 175 if (iocb->ki_flags & IOCB_ALLOC_CACHE) 175 176 opf |= REQ_ALLOC_CACHE; ··· 295 302 296 303 static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, 297 304 struct iov_iter *iter, 305 + struct block_device *bdev, 298 306 unsigned int nr_pages) 299 307 { 300 - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); 301 308 bool is_read = iov_iter_rw(iter) == READ; 302 309 blk_opf_t opf = is_read ? 
REQ_OP_READ : dio_bio_write_op(iocb); 303 310 struct blkdev_dio *dio; 304 311 struct bio *bio; 305 312 loff_t pos = iocb->ki_pos; 306 313 int ret = 0; 307 - 308 - if (blkdev_dio_unaligned(bdev, pos, iter)) 309 - return -EINVAL; 310 314 311 315 if (iocb->ki_flags & IOCB_ALLOC_CACHE) 312 316 opf |= REQ_ALLOC_CACHE; ··· 358 368 359 369 static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) 360 370 { 371 + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); 361 372 unsigned int nr_pages; 362 373 363 374 if (!iov_iter_count(iter)) 364 375 return 0; 365 376 377 + if (blkdev_dio_unaligned(bdev, iocb->ki_pos, iter)) 378 + return -EINVAL; 379 + 366 380 nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); 367 381 if (likely(nr_pages <= BIO_MAX_VECS)) { 368 382 if (is_sync_kiocb(iocb)) 369 - return __blkdev_direct_IO_simple(iocb, iter, nr_pages); 370 - return __blkdev_direct_IO_async(iocb, iter, nr_pages); 383 + return __blkdev_direct_IO_simple(iocb, iter, bdev, 384 + nr_pages); 385 + return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages); 371 386 } 372 - return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); 387 + return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages)); 373 388 } 374 389 375 390 static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, ··· 385 390 386 391 iomap->bdev = bdev; 387 392 iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); 388 - if (iomap->offset >= isize) 393 + if (offset >= isize) 389 394 return -EIO; 390 395 iomap->type = IOMAP_MAPPED; 391 396 iomap->addr = iomap->offset;

+15 -17

block/genhd.c

··· 118 118 } 119 119 } 120 120 121 - static unsigned int part_in_flight(struct block_device *part) 121 + unsigned int part_in_flight(struct block_device *part) 122 122 { 123 123 unsigned int inflight = 0; 124 124 int cpu; ··· 345 345 struct file *file; 346 346 int ret = 0; 347 347 348 - if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) 349 - return -EINVAL; 350 - if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) 348 + if (!disk_has_partscan(disk)) 351 349 return -EINVAL; 352 350 if (disk->open_partitions) 353 351 return -EBUSY; ··· 501 503 goto out_unregister_bdi; 502 504 503 505 /* Make sure the first partition scan will be proceed */ 504 - if (get_capacity(disk) && !(disk->flags & GENHD_FL_NO_PART) && 505 - !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) 506 + if (get_capacity(disk) && disk_has_partscan(disk)) 506 507 set_bit(GD_NEED_PART_SCAN, &disk->state); 507 508 508 509 bdev_add(disk->part0, ddev->devt); ··· 951 954 struct device_attribute *attr, char *buf) 952 955 { 953 956 struct block_device *bdev = dev_to_bdev(dev); 954 - struct request_queue *q = bdev_get_queue(bdev); 955 957 struct disk_stats stat; 956 958 unsigned int inflight; 957 959 958 - if (queue_is_mq(q)) 959 - inflight = blk_mq_in_flight(q, bdev); 960 - else 961 - inflight = part_in_flight(bdev); 962 - 960 + inflight = part_in_flight(bdev); 963 961 if (inflight) { 964 962 part_stat_lock(); 965 963 update_io_ticks(bdev, jiffies, true); ··· 1039 1047 return sprintf(buf, "%llu\n", disk->diskseq); 1040 1048 } 1041 1049 1050 + static ssize_t partscan_show(struct device *dev, 1051 + struct device_attribute *attr, char *buf) 1052 + { 1053 + return sprintf(buf, "%u\n", disk_has_partscan(dev_to_disk(dev))); 1054 + } 1055 + 1042 1056 static DEVICE_ATTR(range, 0444, disk_range_show, NULL); 1043 1057 static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL); 1044 1058 static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL); ··· 1058 1060 static DEVICE_ATTR(inflight, 0444, 
part_inflight_show, NULL); 1059 1061 static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); 1060 1062 static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL); 1063 + static DEVICE_ATTR(partscan, 0444, partscan_show, NULL); 1061 1064 1062 1065 #ifdef CONFIG_FAIL_MAKE_REQUEST 1063 1066 ssize_t part_fail_show(struct device *dev, ··· 1105 1106 &dev_attr_events_async.attr, 1106 1107 &dev_attr_events_poll_msecs.attr, 1107 1108 &dev_attr_diskseq.attr, 1109 + &dev_attr_partscan.attr, 1108 1110 #ifdef CONFIG_FAIL_MAKE_REQUEST 1109 1111 &dev_attr_fail.attr, 1110 1112 #endif ··· 1182 1182 1183 1183 disk_release_events(disk); 1184 1184 kfree(disk->random); 1185 - disk_free_zone_bitmaps(disk); 1185 + disk_free_zone_resources(disk); 1186 1186 xa_destroy(&disk->part_tbl); 1187 1187 1188 1188 disk->queue->disk = NULL; ··· 1251 1251 xa_for_each(&gp->part_tbl, idx, hd) { 1252 1252 if (bdev_is_partition(hd) && !bdev_nr_sectors(hd)) 1253 1253 continue; 1254 - if (queue_is_mq(gp->queue)) 1255 - inflight = blk_mq_in_flight(gp->queue, hd); 1256 - else 1257 - inflight = part_in_flight(hd); 1258 1254 1255 + inflight = part_in_flight(hd); 1259 1256 if (inflight) { 1260 1257 part_stat_lock(); 1261 1258 update_io_ticks(hd, jiffies, true); ··· 1361 1364 if (blkcg_init_disk(disk)) 1362 1365 goto out_erase_part0; 1363 1366 1367 + disk_init_zone_resources(disk); 1364 1368 rand_initialize_disk(disk); 1365 1369 disk_to_dev(disk)->class = &block_class; 1366 1370 disk_to_dev(disk)->type = &disk_type;

+36 -6

block/ioctl.c

··· 33 33 if (op == BLKPG_DEL_PARTITION) 34 34 return bdev_del_partition(disk, p.pno); 35 35 36 - if (p.start < 0 || p.length <= 0 || p.start + p.length < 0) 36 + if (p.start < 0 || p.length <= 0 || LLONG_MAX - p.length < p.start) 37 37 return -EINVAL; 38 38 /* Check that the partition is aligned to the block size */ 39 39 if (!IS_ALIGNED(p.start | p.length, bdev_logical_block_size(bdev))) ··· 95 95 static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, 96 96 unsigned long arg) 97 97 { 98 - uint64_t range[2]; 99 - uint64_t start, len, end; 98 + unsigned int bs_mask = bdev_logical_block_size(bdev) - 1; 100 99 struct inode *inode = bdev->bd_inode; 100 + uint64_t range[2], start, len, end; 101 + struct bio *prev = NULL, *bio; 102 + sector_t sector, nr_sects; 103 + struct blk_plug plug; 101 104 int err; 102 105 103 106 if (!(mode & BLK_OPEN_WRITE)) ··· 108 105 109 106 if (!bdev_max_discard_sectors(bdev)) 110 107 return -EOPNOTSUPP; 108 + if (bdev_read_only(bdev)) 109 + return -EPERM; 111 110 112 111 if (copy_from_user(range, (void __user *)arg, sizeof(range))) 113 112 return -EFAULT; ··· 117 112 start = range[0]; 118 113 len = range[1]; 119 114 120 - if (start & 511) 115 + if (!len) 121 116 return -EINVAL; 122 - if (len & 511) 117 + if ((start | len) & bs_mask) 123 118 return -EINVAL; 124 119 125 120 if (check_add_overflow(start, len, &end) || ··· 130 125 err = truncate_bdev_range(bdev, mode, start, start + len - 1); 131 126 if (err) 132 127 goto fail; 133 - err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); 128 + 129 + sector = start >> SECTOR_SHIFT; 130 + nr_sects = len >> SECTOR_SHIFT; 131 + 132 + blk_start_plug(&plug); 133 + while (1) { 134 + if (fatal_signal_pending(current)) { 135 + if (prev) 136 + bio_await_chain(prev); 137 + err = -EINTR; 138 + goto out_unplug; 139 + } 140 + bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects, 141 + GFP_KERNEL); 142 + if (!bio) 143 + break; 144 + prev = bio_chain_and_submit(prev, bio); 145 + 
} 146 + if (prev) { 147 + err = submit_bio_wait(prev); 148 + if (err == -EOPNOTSUPP) 149 + err = 0; 150 + bio_put(prev); 151 + } 152 + out_unplug: 153 + blk_finish_plug(&plug); 134 154 fail: 135 155 filemap_invalidate_unlock(inode->i_mapping); 136 156 return err;

+6 -198

block/mq-deadline.c

··· 102 102 int prio_aging_expire; 103 103 104 104 spinlock_t lock; 105 - spinlock_t zone_lock; 106 105 }; 107 106 108 107 /* Maps an I/O priority class to a deadline scheduler priority. */ ··· 128 129 } 129 130 130 131 /* 131 - * get the request before `rq' in sector-sorted order 132 - */ 133 - static inline struct request * 134 - deadline_earlier_request(struct request *rq) 135 - { 136 - struct rb_node *node = rb_prev(&rq->rb_node); 137 - 138 - if (node) 139 - return rb_entry_rq(node); 140 - 141 - return NULL; 142 - } 143 - 144 - /* 145 - * get the request after `rq' in sector-sorted order 146 - */ 147 - static inline struct request * 148 - deadline_latter_request(struct request *rq) 149 - { 150 - struct rb_node *node = rb_next(&rq->rb_node); 151 - 152 - if (node) 153 - return rb_entry_rq(node); 154 - 155 - return NULL; 156 - } 157 - 158 - /* 159 - * Return the first request for which blk_rq_pos() >= @pos. For zoned devices, 160 - * return the first request after the start of the zone containing @pos. 132 + * Return the first request for which blk_rq_pos() >= @pos. 161 133 */ 162 134 static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio, 163 135 enum dd_data_dir data_dir, sector_t pos) ··· 140 170 return NULL; 141 171 142 172 rq = rb_entry_rq(node); 143 - /* 144 - * A zoned write may have been requeued with a starting position that 145 - * is below that of the most recently dispatched request. Hence, for 146 - * zoned writes, start searching from the start of a zone. 147 - */ 148 - if (blk_rq_is_seq_zoned_write(rq)) 149 - pos = round_down(pos, rq->q->limits.chunk_sectors); 150 - 151 173 while (node) { 152 174 rq = rb_entry_rq(node); 153 175 if (blk_rq_pos(rq) >= pos) { ··· 271 309 } 272 310 273 311 /* 274 - * Check if rq has a sequential request preceding it. 
275 - */ 276 - static bool deadline_is_seq_write(struct deadline_data *dd, struct request *rq) 277 - { 278 - struct request *prev = deadline_earlier_request(rq); 279 - 280 - if (!prev) 281 - return false; 282 - 283 - return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq); 284 - } 285 - 286 - /* 287 - * Skip all write requests that are sequential from @rq, even if we cross 288 - * a zone boundary. 289 - */ 290 - static struct request *deadline_skip_seq_writes(struct deadline_data *dd, 291 - struct request *rq) 292 - { 293 - sector_t pos = blk_rq_pos(rq); 294 - 295 - do { 296 - pos += blk_rq_sectors(rq); 297 - rq = deadline_latter_request(rq); 298 - } while (rq && blk_rq_pos(rq) == pos); 299 - 300 - return rq; 301 - } 302 - 303 - /* 304 312 * For the specified data direction, return the next request to 305 313 * dispatch using arrival ordered lists. 306 314 */ ··· 278 346 deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, 279 347 enum dd_data_dir data_dir) 280 348 { 281 - struct request *rq, *rb_rq, *next; 282 - unsigned long flags; 283 - 284 349 if (list_empty(&per_prio->fifo_list[data_dir])) 285 350 return NULL; 286 351 287 - rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); 288 - if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) 289 - return rq; 290 - 291 - /* 292 - * Look for a write request that can be dispatched, that is one with 293 - * an unlocked target zone. For some HDDs, breaking a sequential 294 - * write stream can lead to lower throughput, so make sure to preserve 295 - * sequential write streams, even if that stream crosses into the next 296 - * zones and these zones are unlocked. 297 - */ 298 - spin_lock_irqsave(&dd->zone_lock, flags); 299 - list_for_each_entry_safe(rq, next, &per_prio->fifo_list[DD_WRITE], 300 - queuelist) { 301 - /* Check whether a prior request exists for the same zone. 
*/ 302 - rb_rq = deadline_from_pos(per_prio, data_dir, blk_rq_pos(rq)); 303 - if (rb_rq && blk_rq_pos(rb_rq) < blk_rq_pos(rq)) 304 - rq = rb_rq; 305 - if (blk_req_can_dispatch_to_zone(rq) && 306 - (blk_queue_nonrot(rq->q) || 307 - !deadline_is_seq_write(dd, rq))) 308 - goto out; 309 - } 310 - rq = NULL; 311 - out: 312 - spin_unlock_irqrestore(&dd->zone_lock, flags); 313 - 314 - return rq; 352 + return rq_entry_fifo(per_prio->fifo_list[data_dir].next); 315 353 } 316 354 317 355 /* ··· 292 390 deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, 293 391 enum dd_data_dir data_dir) 294 392 { 295 - struct request *rq; 296 - unsigned long flags; 297 - 298 - rq = deadline_from_pos(per_prio, data_dir, 299 - per_prio->latest_pos[data_dir]); 300 - if (!rq) 301 - return NULL; 302 - 303 - if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) 304 - return rq; 305 - 306 - /* 307 - * Look for a write request that can be dispatched, that is one with 308 - * an unlocked target zone. For some HDDs, breaking a sequential 309 - * write stream can lead to lower throughput, so make sure to preserve 310 - * sequential write streams, even if that stream crosses into the next 311 - * zones and these zones are unlocked. 312 - */ 313 - spin_lock_irqsave(&dd->zone_lock, flags); 314 - while (rq) { 315 - if (blk_req_can_dispatch_to_zone(rq)) 316 - break; 317 - if (blk_queue_nonrot(rq->q)) 318 - rq = deadline_latter_request(rq); 319 - else 320 - rq = deadline_skip_seq_writes(dd, rq); 321 - } 322 - spin_unlock_irqrestore(&dd->zone_lock, flags); 323 - 324 - return rq; 393 + return deadline_from_pos(per_prio, data_dir, 394 + per_prio->latest_pos[data_dir]); 325 395 } 326 396 327 397 /* ··· 399 525 rq = next_rq; 400 526 } 401 527 402 - /* 403 - * For a zoned block device, if we only have writes queued and none of 404 - * them can be dispatched, rq will be NULL. 
405 - */ 406 528 if (!rq) 407 529 return NULL; 408 530 ··· 419 549 prio = ioprio_class_to_prio[ioprio_class]; 420 550 dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq); 421 551 dd->per_prio[prio].stats.dispatched++; 422 - /* 423 - * If the request needs its target zone locked, do it. 424 - */ 425 - blk_req_zone_write_lock(rq); 426 552 rq->rq_flags |= RQF_STARTED; 427 553 return rq; 428 554 } ··· 588 722 dd->fifo_batch = fifo_batch; 589 723 dd->prio_aging_expire = prio_aging_expire; 590 724 spin_lock_init(&dd->lock); 591 - spin_lock_init(&dd->zone_lock); 592 725 593 726 /* We dispatch from request queue wide instead of hw queue */ 594 727 blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); ··· 669 804 670 805 lockdep_assert_held(&dd->lock); 671 806 672 - /* 673 - * This may be a requeue of a write request that has locked its 674 - * target zone. If it is the case, this releases the zone lock. 675 - */ 676 - blk_req_zone_write_unlock(rq); 677 - 678 807 prio = ioprio_class_to_prio[ioprio_class]; 679 808 per_prio = &dd->per_prio[prio]; 680 809 if (!rq->elv.priv[0]) { ··· 700 841 */ 701 842 rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; 702 843 insert_before = &per_prio->fifo_list[data_dir]; 703 - #ifdef CONFIG_BLK_DEV_ZONED 704 - /* 705 - * Insert zoned writes such that requests are sorted by 706 - * position per zone. 
707 - */ 708 - if (blk_rq_is_seq_zoned_write(rq)) { 709 - struct request *rq2 = deadline_latter_request(rq); 710 - 711 - if (rq2 && blk_rq_zone_no(rq2) == blk_rq_zone_no(rq)) 712 - insert_before = &rq2->queuelist; 713 - } 714 - #endif 715 844 list_add_tail(&rq->queuelist, insert_before); 716 845 } 717 846 } ··· 734 887 rq->elv.priv[0] = NULL; 735 888 } 736 889 737 - static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx) 738 - { 739 - struct deadline_data *dd = hctx->queue->elevator->elevator_data; 740 - enum dd_prio p; 741 - 742 - for (p = 0; p <= DD_PRIO_MAX; p++) 743 - if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE])) 744 - return true; 745 - 746 - return false; 747 - } 748 - 749 890 /* 750 891 * Callback from inside blk_mq_free_request(). 751 - * 752 - * For zoned block devices, write unlock the target zone of 753 - * completed write requests. Do this while holding the zone lock 754 - * spinlock so that the zone is never unlocked while deadline_fifo_request() 755 - * or deadline_next_request() are executing. This function is called for 756 - * all requests, whether or not these requests complete successfully. 757 - * 758 - * For a zoned block device, __dd_dispatch_request() may have stopped 759 - * dispatching requests if all the queued requests are write requests directed 760 - * at zones that are already locked due to on-going write requests. To ensure 761 - * write request dispatch progress in this case, mark the queue as needing a 762 - * restart to ensure that the queue is run again after completion of the 763 - * request and zones being unlocked. 764 892 */ 765 893 static void dd_finish_request(struct request *rq) 766 894 { ··· 750 928 * called dd_insert_requests(). Skip requests that bypassed I/O 751 929 * scheduling. See also blk_mq_request_bypass_insert(). 
752 930 */ 753 - if (!rq->elv.priv[0]) 754 - return; 755 - 756 - atomic_inc(&per_prio->stats.completed); 757 - 758 - if (blk_queue_is_zoned(q)) { 759 - unsigned long flags; 760 - 761 - spin_lock_irqsave(&dd->zone_lock, flags); 762 - blk_req_zone_write_unlock(rq); 763 - spin_unlock_irqrestore(&dd->zone_lock, flags); 764 - 765 - if (dd_has_write_work(rq->mq_hctx)) 766 - blk_mq_sched_mark_restart_hctx(rq->mq_hctx); 767 - } 931 + if (rq->elv.priv[0]) 932 + atomic_inc(&per_prio->stats.completed); 768 933 } 769 934 770 935 static bool dd_has_work_for_prio(struct dd_per_prio *per_prio) ··· 1075 1266 .elevator_attrs = deadline_attrs, 1076 1267 .elevator_name = "mq-deadline", 1077 1268 .elevator_alias = "deadline", 1078 - .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE, 1079 1269 .elevator_owner = THIS_MODULE, 1080 1270 }; 1081 1271 MODULE_ALIAS("mq-deadline-iosched");

+12 -37

block/partitions/cmdline.c

··· 70 70 } 71 71 72 72 if (*partdef == '(') { 73 - int length; 74 - char *next = strchr(++partdef, ')'); 73 + partdef++; 74 + char *next = strsep(&partdef, ")"); 75 75 76 76 if (!next) { 77 77 pr_warn("cmdline partition format is invalid."); ··· 79 79 goto fail; 80 80 } 81 81 82 - length = min_t(int, next - partdef, 83 - sizeof(new_subpart->name) - 1); 84 - strscpy(new_subpart->name, partdef, length); 85 - 86 - partdef = ++next; 82 + strscpy(new_subpart->name, next, sizeof(new_subpart->name)); 87 83 } else 88 84 new_subpart->name[0] = '\0'; 89 85 ··· 113 117 } 114 118 } 115 119 116 - static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) 120 + static int parse_parts(struct cmdline_parts **parts, char *bdevdef) 117 121 { 118 122 int ret = -EINVAL; 119 123 char *next; 120 - int length; 121 124 struct cmdline_subpart **next_subpart; 122 125 struct cmdline_parts *newparts; 123 - char buf[BDEVNAME_SIZE + 32 + 4]; 124 126 125 127 *parts = NULL; 126 128 ··· 126 132 if (!newparts) 127 133 return -ENOMEM; 128 134 129 - next = strchr(bdevdef, ':'); 135 + next = strsep(&bdevdef, ":"); 130 136 if (!next) { 131 137 pr_warn("cmdline partition has no block device."); 132 138 goto fail; 133 139 } 134 140 135 - length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); 136 - strscpy(newparts->name, bdevdef, length); 141 + strscpy(newparts->name, next, sizeof(newparts->name)); 137 142 newparts->nr_subparts = 0; 138 143 139 144 next_subpart = &newparts->subpart; 140 145 141 - while (next && *(++next)) { 142 - bdevdef = next; 143 - next = strchr(bdevdef, ','); 144 - 145 - length = (!next) ? 
(sizeof(buf) - 1) : 146 - min_t(int, next - bdevdef, sizeof(buf) - 1); 147 - 148 - strscpy(buf, bdevdef, length); 149 - 150 - ret = parse_subpart(next_subpart, buf); 146 + while ((next = strsep(&bdevdef, ","))) { 147 + ret = parse_subpart(next_subpart, next); 151 148 if (ret) 152 149 goto fail; 153 150 ··· 184 199 185 200 *parts = NULL; 186 201 187 - next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); 202 + pbuf = buf = kstrdup(cmdline, GFP_KERNEL); 188 203 if (!buf) 189 204 return -ENOMEM; 190 205 191 206 next_parts = parts; 192 207 193 - while (next && *pbuf) { 194 - next = strchr(pbuf, ';'); 195 - if (next) 196 - *next = '\0'; 197 - 198 - ret = parse_parts(next_parts, pbuf); 208 + while ((next = strsep(&pbuf, ";"))) { 209 + ret = parse_parts(next_parts, next); 199 210 if (ret) 200 211 goto fail; 201 - 202 - if (next) 203 - pbuf = ++next; 204 212 205 213 next_parts = &(*next_parts)->next_parts; 206 214 } ··· 228 250 static int add_part(int slot, struct cmdline_subpart *subpart, 229 251 struct parsed_partitions *state) 230 252 { 231 - int label_min; 232 253 struct partition_meta_info *info; 233 254 char tmp[sizeof(info->volname) + 4]; 234 255 ··· 239 262 240 263 info = &state->parts[slot].info; 241 264 242 - label_min = min_t(int, sizeof(info->volname) - 1, 243 - sizeof(subpart->name)); 244 - strscpy(info->volname, subpart->name, label_min); 265 + strscpy(info->volname, subpart->name, sizeof(info->volname)); 245 266 246 267 snprintf(tmp, sizeof(tmp), "(%s)", info->volname); 247 268 strlcat(state->pp_buf, tmp, PAGE_SIZE);

+1 -4

block/partitions/core.c

··· 573 573 struct parsed_partitions *state; 574 574 int ret = -EAGAIN, p; 575 575 576 - if (disk->flags & GENHD_FL_NO_PART) 577 - return 0; 578 - 579 - if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) 576 + if (!disk_has_partscan(disk)) 580 577 return 0; 581 578 582 579 state = check_partition(disk);

+11 -29

drivers/block/brd.c

··· 29 29 30 30 /* 31 31 * Each block ramdisk device has a xarray brd_pages of pages that stores 32 - * the pages containing the block device's contents. A brd page's ->index is 33 - * its offset in PAGE_SIZE units. This is similar to, but in no way connected 34 - * with, the kernel's pagecache or buffer cache (which sit above our block 35 - * device). 32 + * the pages containing the block device's contents. 36 33 */ 37 34 struct brd_device { 38 35 int brd_number; ··· 48 51 */ 49 52 static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) 50 53 { 51 - pgoff_t idx; 52 - struct page *page; 53 - 54 - idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */ 55 - page = xa_load(&brd->brd_pages, idx); 56 - 57 - BUG_ON(page && page->index != idx); 58 - 59 - return page; 54 + return xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT); 60 55 } 61 56 62 57 /* ··· 56 67 */ 57 68 static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp) 58 69 { 59 - pgoff_t idx; 60 - struct page *page, *cur; 70 + pgoff_t idx = sector >> PAGE_SECTORS_SHIFT; 71 + struct page *page; 61 72 int ret = 0; 62 73 63 74 page = brd_lookup_page(brd, sector); ··· 69 80 return -ENOMEM; 70 81 71 82 xa_lock(&brd->brd_pages); 72 - 73 - idx = sector >> PAGE_SECTORS_SHIFT; 74 - page->index = idx; 75 - 76 - cur = __xa_cmpxchg(&brd->brd_pages, idx, NULL, page, gfp); 77 - 78 - if (unlikely(cur)) { 79 - __free_page(page); 80 - ret = xa_err(cur); 81 - if (!ret && (cur->index != idx)) 82 - ret = -EIO; 83 - } else { 83 + ret = __xa_insert(&brd->brd_pages, idx, page, gfp); 84 + if (!ret) 84 85 brd->brd_nr_pages++; 85 - } 86 - 87 86 xa_unlock(&brd->brd_pages); 88 87 88 + if (ret < 0) { 89 + __free_page(page); 90 + if (ret == -EBUSY) 91 + ret = 0; 92 + } 89 93 return ret; 90 94 } 91 95

+30 -13

drivers/block/null_blk/main.c

··· 225 225 module_param_named(cache_size, g_cache_size, ulong, 0444); 226 226 MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)"); 227 227 228 + static bool g_fua = true; 229 + module_param_named(fua, g_fua, bool, 0444); 230 + MODULE_PARM_DESC(zoned, "Enable/disable FUA support when cache_size is used. Default: true"); 231 + 228 232 static unsigned int g_mbps; 229 233 module_param_named(mbps, g_mbps, uint, 0444); 230 234 MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)"); ··· 256 252 static unsigned int g_zone_max_active; 257 253 module_param_named(zone_max_active, g_zone_max_active, uint, 0444); 258 254 MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)"); 255 + 256 + static int g_zone_append_max_sectors = INT_MAX; 257 + module_param_named(zone_append_max_sectors, g_zone_append_max_sectors, int, 0444); 258 + MODULE_PARM_DESC(zone_append_max_sectors, 259 + "Maximum size of a zone append command (in 512B sectors). 
Specify 0 for zone append emulation"); 259 260 260 261 static struct nullb_device *null_alloc_dev(void); 261 262 static void null_free_dev(struct nullb_device *dev); ··· 445 436 NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); 446 437 NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); 447 438 NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); 439 + NULLB_DEVICE_ATTR(zone_append_max_sectors, uint, NULL); 448 440 NULLB_DEVICE_ATTR(virt_boundary, bool, NULL); 449 441 NULLB_DEVICE_ATTR(no_sched, bool, NULL); 450 442 NULLB_DEVICE_ATTR(shared_tags, bool, NULL); 451 443 NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL); 444 + NULLB_DEVICE_ATTR(fua, bool, NULL); 452 445 453 446 static ssize_t nullb_device_power_show(struct config_item *item, char *page) 454 447 { ··· 591 580 &nullb_device_attr_zone_nr_conv, 592 581 &nullb_device_attr_zone_max_open, 593 582 &nullb_device_attr_zone_max_active, 583 + &nullb_device_attr_zone_append_max_sectors, 594 584 &nullb_device_attr_zone_readonly, 595 585 &nullb_device_attr_zone_offline, 596 586 &nullb_device_attr_virt_boundary, 597 587 &nullb_device_attr_no_sched, 598 588 &nullb_device_attr_shared_tags, 599 589 &nullb_device_attr_shared_tag_bitmap, 590 + &nullb_device_attr_fua, 600 591 NULL, 601 592 }; 602 593 ··· 677 664 static ssize_t memb_group_features_show(struct config_item *item, char *page) 678 665 { 679 666 return snprintf(page, PAGE_SIZE, 680 - "badblocks,blocking,blocksize,cache_size," 667 + "badblocks,blocking,blocksize,cache_size,fua," 681 668 "completion_nsec,discard,home_node,hw_queue_depth," 682 669 "irqmode,max_sectors,mbps,memory_backed,no_sched," 683 670 "poll_queues,power,queue_mode,shared_tag_bitmap," 684 671 "shared_tags,size,submit_queues,use_per_node_hctx," 685 672 "virt_boundary,zoned,zone_capacity,zone_max_active," 686 673 "zone_max_open,zone_nr_conv,zone_offline,zone_readonly," 687 - "zone_size\n"); 674 + "zone_size,zone_append_max_sectors\n"); 688 675 } 689 676 690 677 CONFIGFS_ATTR_RO(memb_group_, features); ··· 764 751 
dev->zone_nr_conv = g_zone_nr_conv; 765 752 dev->zone_max_open = g_zone_max_open; 766 753 dev->zone_max_active = g_zone_max_active; 754 + dev->zone_append_max_sectors = g_zone_append_max_sectors; 767 755 dev->virt_boundary = g_virt_boundary; 768 756 dev->no_sched = g_no_sched; 769 757 dev->shared_tags = g_shared_tags; 770 758 dev->shared_tag_bitmap = g_shared_tag_bitmap; 759 + dev->fua = g_fua; 760 + 771 761 return dev; 772 762 } 773 763 ··· 1167 1151 return BLK_STS_OK; 1168 1152 } 1169 1153 1170 - static int null_handle_flush(struct nullb *nullb) 1154 + static blk_status_t null_handle_flush(struct nullb *nullb) 1171 1155 { 1172 1156 int err; 1173 1157 ··· 1184 1168 1185 1169 WARN_ON(!radix_tree_empty(&nullb->dev->cache)); 1186 1170 spin_unlock_irq(&nullb->lock); 1187 - return err; 1171 + return errno_to_blk_status(err); 1188 1172 } 1189 1173 1190 1174 static int null_transfer(struct nullb *nullb, struct page *page, ··· 1222 1206 { 1223 1207 struct request *rq = blk_mq_rq_from_pdu(cmd); 1224 1208 struct nullb *nullb = cmd->nq->dev->nullb; 1225 - int err; 1209 + int err = 0; 1226 1210 unsigned int len; 1227 1211 sector_t sector = blk_rq_pos(rq); 1228 1212 struct req_iterator iter; ··· 1234 1218 err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, 1235 1219 op_is_write(req_op(rq)), sector, 1236 1220 rq->cmd_flags & REQ_FUA); 1237 - if (err) { 1238 - spin_unlock_irq(&nullb->lock); 1239 - return err; 1240 - } 1221 + if (err) 1222 + break; 1241 1223 sector += len >> SECTOR_SHIFT; 1242 1224 } 1243 1225 spin_unlock_irq(&nullb->lock); 1244 1226 1245 - return 0; 1227 + return errno_to_blk_status(err); 1246 1228 } 1247 1229 1248 1230 static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) ··· 1287 1273 1288 1274 if (op == REQ_OP_DISCARD) 1289 1275 return null_handle_discard(dev, sector, nr_sectors); 1290 - return errno_to_blk_status(null_handle_rq(cmd)); 1291 1276 1277 + return null_handle_rq(cmd); 1292 1278 } 1293 1279 1294 1280 static void 
nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) ··· 1357 1343 blk_status_t sts; 1358 1344 1359 1345 if (op == REQ_OP_FLUSH) { 1360 - cmd->error = errno_to_blk_status(null_handle_flush(nullb)); 1346 + cmd->error = null_handle_flush(nullb); 1361 1347 goto out; 1362 1348 } 1363 1349 ··· 1926 1912 1927 1913 if (dev->cache_size > 0) { 1928 1914 set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); 1929 - blk_queue_write_cache(nullb->q, true, true); 1915 + blk_queue_write_cache(nullb->q, true, dev->fua); 1930 1916 } 1931 1917 1932 1918 nullb->q->queuedata = nullb; ··· 2127 2113 2128 2114 if (tag_set.ops) 2129 2115 blk_mq_free_tag_set(&tag_set); 2116 + 2117 + mutex_destroy(&lock); 2130 2118 } 2131 2119 2132 2120 module_init(null_init); 2133 2121 module_exit(null_exit); 2134 2122 2135 2123 MODULE_AUTHOR("Jens Axboe <axboe@kernel.dk>"); 2124 + MODULE_DESCRIPTION("multi queue aware block test driver"); 2136 2125 MODULE_LICENSE("GPL");

+2

drivers/block/null_blk/null_blk.h

··· 82 82 unsigned int zone_nr_conv; /* number of conventional zones */ 83 83 unsigned int zone_max_open; /* max number of open zones */ 84 84 unsigned int zone_max_active; /* max number of active zones */ 85 + unsigned int zone_append_max_sectors; /* Max sectors per zone append command */ 85 86 unsigned int submit_queues; /* number of submission queues */ 86 87 unsigned int prev_submit_queues; /* number of submission queues before change */ 87 88 unsigned int poll_queues; /* number of IOPOLL submission queues */ ··· 105 104 bool no_sched; /* no IO scheduler for the device */ 106 105 bool shared_tags; /* share tag set between devices for blk-mq */ 107 106 bool shared_tag_bitmap; /* use hostwide shared tags */ 107 + bool fua; /* Support FUA */ 108 108 }; 109 109 110 110 struct nullb {

+193 -165

drivers/block/null_blk/zoned.c

··· 9 9 #undef pr_fmt 10 10 #define pr_fmt(fmt) "null_blk: " fmt 11 11 12 + #define NULL_ZONE_INVALID_WP ((sector_t)-1) 13 + 12 14 static inline sector_t mb_to_sects(unsigned long mb) 13 15 { 14 16 return ((sector_t)mb * SZ_1M) >> SECTOR_SHIFT; ··· 19 17 static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) 20 18 { 21 19 return sect >> ilog2(dev->zone_size_sects); 22 - } 23 - 24 - static inline void null_lock_zone_res(struct nullb_device *dev) 25 - { 26 - if (dev->need_zone_res_mgmt) 27 - spin_lock_irq(&dev->zone_res_lock); 28 - } 29 - 30 - static inline void null_unlock_zone_res(struct nullb_device *dev) 31 - { 32 - if (dev->need_zone_res_mgmt) 33 - spin_unlock_irq(&dev->zone_res_lock); 34 20 } 35 21 36 22 static inline void null_init_zone_lock(struct nullb_device *dev, ··· 93 103 dev->zone_nr_conv); 94 104 } 95 105 106 + dev->zone_append_max_sectors = 107 + min(ALIGN_DOWN(dev->zone_append_max_sectors, 108 + dev->blocksize >> SECTOR_SHIFT), 109 + zone_capacity_sects); 110 + 96 111 /* Max active zones has to be < nbr of seq zones in order to be enforceable */ 97 112 if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) { 98 113 dev->zone_max_active = 0; ··· 149 154 150 155 lim->zoned = true; 151 156 lim->chunk_sectors = dev->zone_size_sects; 152 - lim->max_zone_append_sectors = dev->zone_size_sects; 157 + lim->max_zone_append_sectors = dev->zone_append_max_sectors; 153 158 lim->max_open_zones = dev->zone_max_open; 154 159 lim->max_active_zones = dev->zone_max_active; 155 160 return 0; ··· 158 163 int null_register_zoned_dev(struct nullb *nullb) 159 164 { 160 165 struct request_queue *q = nullb->q; 166 + struct gendisk *disk = nullb->disk; 161 167 162 168 blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); 163 - blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); 164 - nullb->disk->nr_zones = bdev_nr_zones(nullb->disk->part0); 165 - return blk_revalidate_disk_zones(nullb->disk, NULL); 169 + disk->nr_zones = 
bdev_nr_zones(disk->part0); 170 + 171 + pr_info("%s: using %s zone append\n", 172 + disk->disk_name, 173 + queue_emulates_zone_append(q) ? "emulated" : "native"); 174 + 175 + return blk_revalidate_disk_zones(disk); 166 176 } 167 177 168 178 void null_free_zoned_dev(struct nullb_device *dev) ··· 241 241 return (zone->wp - sector) << SECTOR_SHIFT; 242 242 } 243 243 244 - static blk_status_t __null_close_zone(struct nullb_device *dev, 245 - struct nullb_zone *zone) 246 - { 247 - switch (zone->cond) { 248 - case BLK_ZONE_COND_CLOSED: 249 - /* close operation on closed is not an error */ 250 - return BLK_STS_OK; 251 - case BLK_ZONE_COND_IMP_OPEN: 252 - dev->nr_zones_imp_open--; 253 - break; 254 - case BLK_ZONE_COND_EXP_OPEN: 255 - dev->nr_zones_exp_open--; 256 - break; 257 - case BLK_ZONE_COND_EMPTY: 258 - case BLK_ZONE_COND_FULL: 259 - default: 260 - return BLK_STS_IOERR; 261 - } 262 - 263 - if (zone->wp == zone->start) { 264 - zone->cond = BLK_ZONE_COND_EMPTY; 265 - } else { 266 - zone->cond = BLK_ZONE_COND_CLOSED; 267 - dev->nr_zones_closed++; 268 - } 269 - 270 - return BLK_STS_OK; 271 - } 272 - 273 244 static void null_close_imp_open_zone(struct nullb_device *dev) 274 245 { 275 246 struct nullb_zone *zone; ··· 257 286 zno = dev->zone_nr_conv; 258 287 259 288 if (zone->cond == BLK_ZONE_COND_IMP_OPEN) { 260 - __null_close_zone(dev, zone); 289 + dev->nr_zones_imp_open--; 290 + if (zone->wp == zone->start) { 291 + zone->cond = BLK_ZONE_COND_EMPTY; 292 + } else { 293 + zone->cond = BLK_ZONE_COND_CLOSED; 294 + dev->nr_zones_closed++; 295 + } 261 296 dev->imp_close_zone_no = zno; 262 297 return; 263 298 } ··· 351 374 352 375 null_lock_zone(dev, zone); 353 376 354 - if (zone->cond == BLK_ZONE_COND_FULL || 355 - zone->cond == BLK_ZONE_COND_READONLY || 356 - zone->cond == BLK_ZONE_COND_OFFLINE) { 357 - /* Cannot write to the zone */ 358 - ret = BLK_STS_IOERR; 359 - goto unlock; 360 - } 361 - 362 377 /* 363 - * Regular writes must be at the write pointer position. 
364 - * Zone append writes are automatically issued at the write 365 - * pointer and the position returned using the request or BIO 366 - * sector. 378 + * Regular writes must be at the write pointer position. Zone append 379 + * writes are automatically issued at the write pointer and the position 380 + * returned using the request sector. Note that we do not check the zone 381 + * condition because for FULL, READONLY and OFFLINE zones, the sector 382 + * check against the zone write pointer will always result in failing 383 + * the command. 367 384 */ 368 385 if (append) { 386 + if (WARN_ON_ONCE(!dev->zone_append_max_sectors) || 387 + zone->wp == NULL_ZONE_INVALID_WP) { 388 + ret = BLK_STS_IOERR; 389 + goto unlock_zone; 390 + } 369 391 sector = zone->wp; 370 392 blk_mq_rq_from_pdu(cmd)->__sector = sector; 371 - } else if (sector != zone->wp) { 372 - ret = BLK_STS_IOERR; 373 - goto unlock; 374 393 } 375 394 376 - if (zone->wp + nr_sectors > zone->start + zone->capacity) { 395 + if (sector != zone->wp || 396 + zone->wp + nr_sectors > zone->start + zone->capacity) { 377 397 ret = BLK_STS_IOERR; 378 - goto unlock; 398 + goto unlock_zone; 379 399 } 380 400 381 401 if (zone->cond == BLK_ZONE_COND_CLOSED || 382 402 zone->cond == BLK_ZONE_COND_EMPTY) { 383 - null_lock_zone_res(dev); 403 + if (dev->need_zone_res_mgmt) { 404 + spin_lock(&dev->zone_res_lock); 384 405 385 - ret = null_check_zone_resources(dev, zone); 386 - if (ret != BLK_STS_OK) { 387 - null_unlock_zone_res(dev); 388 - goto unlock; 406 + ret = null_check_zone_resources(dev, zone); 407 + if (ret != BLK_STS_OK) { 408 + spin_unlock(&dev->zone_res_lock); 409 + goto unlock_zone; 410 + } 411 + if (zone->cond == BLK_ZONE_COND_CLOSED) { 412 + dev->nr_zones_closed--; 413 + dev->nr_zones_imp_open++; 414 + } else if (zone->cond == BLK_ZONE_COND_EMPTY) { 415 + dev->nr_zones_imp_open++; 416 + } 417 + 418 + spin_unlock(&dev->zone_res_lock); 389 419 } 390 - if (zone->cond == BLK_ZONE_COND_CLOSED) { 391 - 
dev->nr_zones_closed--; 392 - dev->nr_zones_imp_open++; 393 - } else if (zone->cond == BLK_ZONE_COND_EMPTY) { 394 - dev->nr_zones_imp_open++; 395 - } 396 420 397 - if (zone->cond != BLK_ZONE_COND_EXP_OPEN) 398 - zone->cond = BLK_ZONE_COND_IMP_OPEN; 399 - 400 - null_unlock_zone_res(dev); 421 + zone->cond = BLK_ZONE_COND_IMP_OPEN; 401 422 } 402 423 403 424 ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); 404 425 if (ret != BLK_STS_OK) 405 - goto unlock; 426 + goto unlock_zone; 406 427 407 428 zone->wp += nr_sectors; 408 429 if (zone->wp == zone->start + zone->capacity) { 409 - null_lock_zone_res(dev); 410 - if (zone->cond == BLK_ZONE_COND_EXP_OPEN) 411 - dev->nr_zones_exp_open--; 412 - else if (zone->cond == BLK_ZONE_COND_IMP_OPEN) 413 - dev->nr_zones_imp_open--; 430 + if (dev->need_zone_res_mgmt) { 431 + spin_lock(&dev->zone_res_lock); 432 + if (zone->cond == BLK_ZONE_COND_EXP_OPEN) 433 + dev->nr_zones_exp_open--; 434 + else if (zone->cond == BLK_ZONE_COND_IMP_OPEN) 435 + dev->nr_zones_imp_open--; 436 + spin_unlock(&dev->zone_res_lock); 437 + } 414 438 zone->cond = BLK_ZONE_COND_FULL; 415 - null_unlock_zone_res(dev); 416 439 } 417 440 418 441 ret = BLK_STS_OK; 419 442 420 - unlock: 443 + unlock_zone: 421 444 null_unlock_zone(dev, zone); 422 445 423 446 return ret; ··· 431 454 if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) 432 455 return BLK_STS_IOERR; 433 456 434 - null_lock_zone_res(dev); 435 - 436 457 switch (zone->cond) { 437 458 case BLK_ZONE_COND_EXP_OPEN: 438 - /* open operation on exp open is not an error */ 439 - goto unlock; 459 + /* Open operation on exp open is not an error */ 460 + return BLK_STS_OK; 440 461 case BLK_ZONE_COND_EMPTY: 441 - ret = null_check_zone_resources(dev, zone); 442 - if (ret != BLK_STS_OK) 443 - goto unlock; 444 - break; 445 462 case BLK_ZONE_COND_IMP_OPEN: 446 - dev->nr_zones_imp_open--; 447 - break; 448 463 case BLK_ZONE_COND_CLOSED: 449 - ret = null_check_zone_resources(dev, zone); 450 - if (ret != BLK_STS_OK) 451 - 
goto unlock; 452 - dev->nr_zones_closed--; 453 464 break; 454 465 case BLK_ZONE_COND_FULL: 455 466 default: 456 - ret = BLK_STS_IOERR; 457 - goto unlock; 467 + return BLK_STS_IOERR; 468 + } 469 + 470 + if (dev->need_zone_res_mgmt) { 471 + spin_lock(&dev->zone_res_lock); 472 + 473 + switch (zone->cond) { 474 + case BLK_ZONE_COND_EMPTY: 475 + ret = null_check_zone_resources(dev, zone); 476 + if (ret != BLK_STS_OK) { 477 + spin_unlock(&dev->zone_res_lock); 478 + return ret; 479 + } 480 + break; 481 + case BLK_ZONE_COND_IMP_OPEN: 482 + dev->nr_zones_imp_open--; 483 + break; 484 + case BLK_ZONE_COND_CLOSED: 485 + ret = null_check_zone_resources(dev, zone); 486 + if (ret != BLK_STS_OK) { 487 + spin_unlock(&dev->zone_res_lock); 488 + return ret; 489 + } 490 + dev->nr_zones_closed--; 491 + break; 492 + default: 493 + break; 494 + } 495 + 496 + dev->nr_zones_exp_open++; 497 + 498 + spin_unlock(&dev->zone_res_lock); 458 499 } 459 500 460 501 zone->cond = BLK_ZONE_COND_EXP_OPEN; 461 - dev->nr_zones_exp_open++; 462 502 463 - unlock: 464 - null_unlock_zone_res(dev); 465 - 466 - return ret; 503 + return BLK_STS_OK; 467 504 } 468 505 469 506 static blk_status_t null_close_zone(struct nullb_device *dev, 470 507 struct nullb_zone *zone) 471 508 { 472 - blk_status_t ret; 473 - 474 509 if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) 475 510 return BLK_STS_IOERR; 476 511 477 - null_lock_zone_res(dev); 478 - ret = __null_close_zone(dev, zone); 479 - null_unlock_zone_res(dev); 512 + switch (zone->cond) { 513 + case BLK_ZONE_COND_CLOSED: 514 + /* close operation on closed is not an error */ 515 + return BLK_STS_OK; 516 + case BLK_ZONE_COND_IMP_OPEN: 517 + case BLK_ZONE_COND_EXP_OPEN: 518 + break; 519 + case BLK_ZONE_COND_EMPTY: 520 + case BLK_ZONE_COND_FULL: 521 + default: 522 + return BLK_STS_IOERR; 523 + } 480 524 481 - return ret; 525 + if (dev->need_zone_res_mgmt) { 526 + spin_lock(&dev->zone_res_lock); 527 + 528 + switch (zone->cond) { 529 + case BLK_ZONE_COND_IMP_OPEN: 530 + 
dev->nr_zones_imp_open--; 531 + break; 532 + case BLK_ZONE_COND_EXP_OPEN: 533 + dev->nr_zones_exp_open--; 534 + break; 535 + default: 536 + break; 537 + } 538 + 539 + if (zone->wp > zone->start) 540 + dev->nr_zones_closed++; 541 + 542 + spin_unlock(&dev->zone_res_lock); 543 + } 544 + 545 + if (zone->wp == zone->start) 546 + zone->cond = BLK_ZONE_COND_EMPTY; 547 + else 548 + zone->cond = BLK_ZONE_COND_CLOSED; 549 + 550 + return BLK_STS_OK; 482 551 } 483 552 484 553 static blk_status_t null_finish_zone(struct nullb_device *dev, ··· 535 512 if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) 536 513 return BLK_STS_IOERR; 537 514 538 - null_lock_zone_res(dev); 515 + if (dev->need_zone_res_mgmt) { 516 + spin_lock(&dev->zone_res_lock); 539 517 540 - switch (zone->cond) { 541 - case BLK_ZONE_COND_FULL: 542 - /* finish operation on full is not an error */ 543 - goto unlock; 544 - case BLK_ZONE_COND_EMPTY: 545 - ret = null_check_zone_resources(dev, zone); 546 - if (ret != BLK_STS_OK) 547 - goto unlock; 548 - break; 549 - case BLK_ZONE_COND_IMP_OPEN: 550 - dev->nr_zones_imp_open--; 551 - break; 552 - case BLK_ZONE_COND_EXP_OPEN: 553 - dev->nr_zones_exp_open--; 554 - break; 555 - case BLK_ZONE_COND_CLOSED: 556 - ret = null_check_zone_resources(dev, zone); 557 - if (ret != BLK_STS_OK) 558 - goto unlock; 559 - dev->nr_zones_closed--; 560 - break; 561 - default: 562 - ret = BLK_STS_IOERR; 563 - goto unlock; 518 + switch (zone->cond) { 519 + case BLK_ZONE_COND_FULL: 520 + /* Finish operation on full is not an error */ 521 + spin_unlock(&dev->zone_res_lock); 522 + return BLK_STS_OK; 523 + case BLK_ZONE_COND_EMPTY: 524 + ret = null_check_zone_resources(dev, zone); 525 + if (ret != BLK_STS_OK) { 526 + spin_unlock(&dev->zone_res_lock); 527 + return ret; 528 + } 529 + break; 530 + case BLK_ZONE_COND_IMP_OPEN: 531 + dev->nr_zones_imp_open--; 532 + break; 533 + case BLK_ZONE_COND_EXP_OPEN: 534 + dev->nr_zones_exp_open--; 535 + break; 536 + case BLK_ZONE_COND_CLOSED: 537 + ret = 
null_check_zone_resources(dev, zone); 538 + if (ret != BLK_STS_OK) { 539 + spin_unlock(&dev->zone_res_lock); 540 + return ret; 541 + } 542 + dev->nr_zones_closed--; 543 + break; 544 + default: 545 + spin_unlock(&dev->zone_res_lock); 546 + return BLK_STS_IOERR; 547 + } 548 + 549 + spin_unlock(&dev->zone_res_lock); 564 550 } 565 551 566 552 zone->cond = BLK_ZONE_COND_FULL; 567 553 zone->wp = zone->start + zone->len; 568 554 569 - unlock: 570 - null_unlock_zone_res(dev); 571 - 572 - return ret; 555 + return BLK_STS_OK; 573 556 } 574 557 575 558 static blk_status_t null_reset_zone(struct nullb_device *dev, ··· 584 555 if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) 585 556 return BLK_STS_IOERR; 586 557 587 - null_lock_zone_res(dev); 558 + if (dev->need_zone_res_mgmt) { 559 + spin_lock(&dev->zone_res_lock); 588 560 589 - switch (zone->cond) { 590 - case BLK_ZONE_COND_EMPTY: 591 - /* reset operation on empty is not an error */ 592 - null_unlock_zone_res(dev); 593 - return BLK_STS_OK; 594 - case BLK_ZONE_COND_IMP_OPEN: 595 - dev->nr_zones_imp_open--; 596 - break; 597 - case BLK_ZONE_COND_EXP_OPEN: 598 - dev->nr_zones_exp_open--; 599 - break; 600 - case BLK_ZONE_COND_CLOSED: 601 - dev->nr_zones_closed--; 602 - break; 603 - case BLK_ZONE_COND_FULL: 604 - break; 605 - default: 606 - null_unlock_zone_res(dev); 607 - return BLK_STS_IOERR; 561 + switch (zone->cond) { 562 + case BLK_ZONE_COND_IMP_OPEN: 563 + dev->nr_zones_imp_open--; 564 + break; 565 + case BLK_ZONE_COND_EXP_OPEN: 566 + dev->nr_zones_exp_open--; 567 + break; 568 + case BLK_ZONE_COND_CLOSED: 569 + dev->nr_zones_closed--; 570 + break; 571 + case BLK_ZONE_COND_EMPTY: 572 + case BLK_ZONE_COND_FULL: 573 + break; 574 + default: 575 + spin_unlock(&dev->zone_res_lock); 576 + return BLK_STS_IOERR; 577 + } 578 + 579 + spin_unlock(&dev->zone_res_lock); 608 580 } 609 581 610 582 zone->cond = BLK_ZONE_COND_EMPTY; 611 583 zone->wp = zone->start; 612 - 613 - null_unlock_zone_res(dev); 614 584 615 585 if (dev->memory_backed) 616 
586 return null_handle_discard(dev, zone->start, zone->len); ··· 739 711 zone->cond != BLK_ZONE_COND_OFFLINE) 740 712 null_finish_zone(dev, zone); 741 713 zone->cond = cond; 742 - zone->wp = (sector_t)-1; 714 + zone->wp = NULL_ZONE_INVALID_WP; 743 715 } 744 716 745 717 null_unlock_zone(dev, zone);

+2 -3

drivers/block/ublk_drv.c

··· 221 221 222 222 static int ublk_revalidate_disk_zones(struct ublk_device *ub) 223 223 { 224 - return blk_revalidate_disk_zones(ub->ub_disk, NULL); 224 + return blk_revalidate_disk_zones(ub->ub_disk); 225 225 } 226 226 227 227 static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) ··· 249 249 static void ublk_dev_param_zoned_apply(struct ublk_device *ub) 250 250 { 251 251 blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue); 252 - blk_queue_required_elevator_features(ub->ub_disk->queue, 253 - ELEVATOR_F_ZBD_SEQ_WRITE); 252 + 254 253 ub->ub_disk->nr_zones = ublk_get_nr_zones(ub); 255 254 } 256 255

+1 -1

drivers/block/virtio_blk.c

··· 1543 1543 */ 1544 1544 if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && lim.zoned) { 1545 1545 blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, vblk->disk->queue); 1546 - err = blk_revalidate_disk_zones(vblk->disk, NULL); 1546 + err = blk_revalidate_disk_zones(vblk->disk); 1547 1547 if (err) 1548 1548 goto out_cleanup_disk; 1549 1549 }

+22 -22

drivers/md/bcache/bset.c

··· 54 54 int __bch_count_data(struct btree_keys *b) 55 55 { 56 56 unsigned int ret = 0; 57 - struct btree_iter iter; 57 + struct btree_iter_stack iter; 58 58 struct bkey *k; 59 59 60 60 if (b->ops->is_extents) ··· 67 67 { 68 68 va_list args; 69 69 struct bkey *k, *p = NULL; 70 - struct btree_iter iter; 70 + struct btree_iter_stack iter; 71 71 const char *err; 72 72 73 73 for_each_key(b, k, &iter) { ··· 879 879 unsigned int status = BTREE_INSERT_STATUS_NO_INSERT; 880 880 struct bset *i = bset_tree_last(b)->data; 881 881 struct bkey *m, *prev = NULL; 882 - struct btree_iter iter; 882 + struct btree_iter_stack iter; 883 883 struct bkey preceding_key_on_stack = ZERO_KEY; 884 884 struct bkey *preceding_key_p = &preceding_key_on_stack; 885 885 ··· 895 895 else 896 896 preceding_key(k, &preceding_key_p); 897 897 898 - m = bch_btree_iter_init(b, &iter, preceding_key_p); 898 + m = bch_btree_iter_stack_init(b, &iter, preceding_key_p); 899 899 900 - if (b->ops->insert_fixup(b, k, &iter, replace_key)) 900 + if (b->ops->insert_fixup(b, k, &iter.iter, replace_key)) 901 901 return status; 902 902 903 903 status = BTREE_INSERT_STATUS_INSERT; ··· 1100 1100 btree_iter_cmp)); 1101 1101 } 1102 1102 1103 - static struct bkey *__bch_btree_iter_init(struct btree_keys *b, 1104 - struct btree_iter *iter, 1105 - struct bkey *search, 1106 - struct bset_tree *start) 1103 + static struct bkey *__bch_btree_iter_stack_init(struct btree_keys *b, 1104 + struct btree_iter_stack *iter, 1105 + struct bkey *search, 1106 + struct bset_tree *start) 1107 1107 { 1108 1108 struct bkey *ret = NULL; 1109 1109 1110 - iter->size = ARRAY_SIZE(iter->data); 1111 - iter->used = 0; 1110 + iter->iter.size = ARRAY_SIZE(iter->stack_data); 1111 + iter->iter.used = 0; 1112 1112 1113 1113 #ifdef CONFIG_BCACHE_DEBUG 1114 - iter->b = b; 1114 + iter->iter.b = b; 1115 1115 #endif 1116 1116 1117 1117 for (; start <= bset_tree_last(b); start++) { 1118 1118 ret = bch_bset_search(b, start, search); 1119 - 
bch_btree_iter_push(iter, ret, bset_bkey_last(start->data)); 1119 + bch_btree_iter_push(&iter->iter, ret, bset_bkey_last(start->data)); 1120 1120 } 1121 1121 1122 1122 return ret; 1123 1123 } 1124 1124 1125 - struct bkey *bch_btree_iter_init(struct btree_keys *b, 1126 - struct btree_iter *iter, 1125 + struct bkey *bch_btree_iter_stack_init(struct btree_keys *b, 1126 + struct btree_iter_stack *iter, 1127 1127 struct bkey *search) 1128 1128 { 1129 - return __bch_btree_iter_init(b, iter, search, b->set); 1129 + return __bch_btree_iter_stack_init(b, iter, search, b->set); 1130 1130 } 1131 1131 1132 1132 static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, ··· 1293 1293 struct bset_sort_state *state) 1294 1294 { 1295 1295 size_t order = b->page_order, keys = 0; 1296 - struct btree_iter iter; 1296 + struct btree_iter_stack iter; 1297 1297 int oldsize = bch_count_data(b); 1298 1298 1299 - __bch_btree_iter_init(b, &iter, NULL, &b->set[start]); 1299 + __bch_btree_iter_stack_init(b, &iter, NULL, &b->set[start]); 1300 1300 1301 1301 if (start) { 1302 1302 unsigned int i; ··· 1307 1307 order = get_order(__set_bytes(b->set->data, keys)); 1308 1308 } 1309 1309 1310 - __btree_sort(b, &iter, start, order, false, state); 1310 + __btree_sort(b, &iter.iter, start, order, false, state); 1311 1311 1312 1312 EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); 1313 1313 } ··· 1323 1323 struct bset_sort_state *state) 1324 1324 { 1325 1325 uint64_t start_time = local_clock(); 1326 - struct btree_iter iter; 1326 + struct btree_iter_stack iter; 1327 1327 1328 - bch_btree_iter_init(b, &iter, NULL); 1328 + bch_btree_iter_stack_init(b, &iter, NULL); 1329 1329 1330 - btree_mergesort(b, new->set->data, &iter, false, true); 1330 + btree_mergesort(b, new->set->data, &iter.iter, false, true); 1331 1331 1332 1332 bch_time_stats_update(&state->time, start_time); 1333 1333

+18 -10

drivers/md/bcache/bset.h

··· 321 321 #endif 322 322 struct btree_iter_set { 323 323 struct bkey *k, *end; 324 - } data[MAX_BSETS]; 324 + } data[]; 325 + }; 326 + 327 + /* Fixed-size btree_iter that can be allocated on the stack */ 328 + 329 + struct btree_iter_stack { 330 + struct btree_iter iter; 331 + struct btree_iter_set stack_data[MAX_BSETS]; 325 332 }; 326 333 327 334 typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k); ··· 340 333 341 334 void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, 342 335 struct bkey *end); 343 - struct bkey *bch_btree_iter_init(struct btree_keys *b, 344 - struct btree_iter *iter, 345 - struct bkey *search); 336 + struct bkey *bch_btree_iter_stack_init(struct btree_keys *b, 337 + struct btree_iter_stack *iter, 338 + struct bkey *search); 346 339 347 340 struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, 348 341 const struct bkey *search); ··· 357 350 return search ? __bch_bset_search(b, t, search) : t->data->start; 358 351 } 359 352 360 - #define for_each_key_filter(b, k, iter, filter) \ 361 - for (bch_btree_iter_init((b), (iter), NULL); \ 362 - ((k) = bch_btree_iter_next_filter((iter), (b), filter));) 353 + #define for_each_key_filter(b, k, stack_iter, filter) \ 354 + for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \ 355 + ((k) = bch_btree_iter_next_filter(&((stack_iter)->iter), (b), \ 356 + filter));) 363 357 364 - #define for_each_key(b, k, iter) \ 365 - for (bch_btree_iter_init((b), (iter), NULL); \ 366 - ((k) = bch_btree_iter_next(iter));) 358 + #define for_each_key(b, k, stack_iter) \ 359 + for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \ 360 + ((k) = bch_btree_iter_next(&((stack_iter)->iter)));) 367 361 368 362 /* Sorting */ 369 363

+21 -19

drivers/md/bcache/btree.c

··· 1309 1309 uint8_t stale = 0; 1310 1310 unsigned int keys = 0, good_keys = 0; 1311 1311 struct bkey *k; 1312 - struct btree_iter iter; 1312 + struct btree_iter_stack iter; 1313 1313 struct bset_tree *t; 1314 1314 1315 1315 gc->nodes++; ··· 1570 1570 static unsigned int btree_gc_count_keys(struct btree *b) 1571 1571 { 1572 1572 struct bkey *k; 1573 - struct btree_iter iter; 1573 + struct btree_iter_stack iter; 1574 1574 unsigned int ret = 0; 1575 1575 1576 1576 for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) ··· 1611 1611 int ret = 0; 1612 1612 bool should_rewrite; 1613 1613 struct bkey *k; 1614 - struct btree_iter iter; 1614 + struct btree_iter_stack iter; 1615 1615 struct gc_merge_info r[GC_MERGE_NODES]; 1616 1616 struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; 1617 1617 1618 - bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); 1618 + bch_btree_iter_stack_init(&b->keys, &iter, &b->c->gc_done); 1619 1619 1620 1620 for (i = r; i < r + ARRAY_SIZE(r); i++) 1621 1621 i->b = ERR_PTR(-EINTR); 1622 1622 1623 1623 while (1) { 1624 - k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); 1624 + k = bch_btree_iter_next_filter(&iter.iter, &b->keys, 1625 + bch_ptr_bad); 1625 1626 if (k) { 1626 1627 r->b = bch_btree_node_get(b->c, op, k, b->level - 1, 1627 1628 true, b); ··· 1912 1911 { 1913 1912 int ret = 0; 1914 1913 struct bkey *k, *p = NULL; 1915 - struct btree_iter iter; 1914 + struct btree_iter_stack iter; 1916 1915 1917 1916 for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) 1918 1917 bch_initial_mark_key(b->c, b->level, k); ··· 1920 1919 bch_initial_mark_key(b->c, b->level + 1, &b->key); 1921 1920 1922 1921 if (b->level) { 1923 - bch_btree_iter_init(&b->keys, &iter, NULL); 1922 + bch_btree_iter_stack_init(&b->keys, &iter, NULL); 1924 1923 1925 1924 do { 1926 - k = bch_btree_iter_next_filter(&iter, &b->keys, 1925 + k = bch_btree_iter_next_filter(&iter.iter, &b->keys, 1927 1926 bch_ptr_bad); 1928 1927 if (k) { 1929 1928 
btree_node_prefetch(b, k); ··· 1951 1950 struct btree_check_info *info = arg; 1952 1951 struct btree_check_state *check_state = info->state; 1953 1952 struct cache_set *c = check_state->c; 1954 - struct btree_iter iter; 1953 + struct btree_iter_stack iter; 1955 1954 struct bkey *k, *p; 1956 1955 int cur_idx, prev_idx, skip_nr; 1957 1956 ··· 1960 1959 ret = 0; 1961 1960 1962 1961 /* root node keys are checked before thread created */ 1963 - bch_btree_iter_init(&c->root->keys, &iter, NULL); 1964 - k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); 1962 + bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); 1963 + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); 1965 1964 BUG_ON(!k); 1966 1965 1967 1966 p = k; ··· 1979 1978 skip_nr = cur_idx - prev_idx; 1980 1979 1981 1980 while (skip_nr) { 1982 - k = bch_btree_iter_next_filter(&iter, 1981 + k = bch_btree_iter_next_filter(&iter.iter, 1983 1982 &c->root->keys, 1984 1983 bch_ptr_bad); 1985 1984 if (k) ··· 2052 2051 int ret = 0; 2053 2052 int i; 2054 2053 struct bkey *k = NULL; 2055 - struct btree_iter iter; 2054 + struct btree_iter_stack iter; 2056 2055 struct btree_check_state check_state; 2057 2056 2058 2057 /* check and mark root node keys */ ··· 2548 2547 2549 2548 if (b->level) { 2550 2549 struct bkey *k; 2551 - struct btree_iter iter; 2550 + struct btree_iter_stack iter; 2552 2551 2553 - bch_btree_iter_init(&b->keys, &iter, from); 2552 + bch_btree_iter_stack_init(&b->keys, &iter, from); 2554 2553 2555 - while ((k = bch_btree_iter_next_filter(&iter, &b->keys, 2554 + while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys, 2556 2555 bch_ptr_bad))) { 2557 2556 ret = bcache_btree(map_nodes_recurse, k, b, 2558 2557 op, from, fn, flags); ··· 2581 2580 { 2582 2581 int ret = MAP_CONTINUE; 2583 2582 struct bkey *k; 2584 - struct btree_iter iter; 2583 + struct btree_iter_stack iter; 2585 2584 2586 - bch_btree_iter_init(&b->keys, &iter, from); 2585 + 
bch_btree_iter_stack_init(&b->keys, &iter, from); 2587 2586 2588 - while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { 2587 + while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys, 2588 + bch_ptr_bad))) { 2589 2589 ret = !b->level 2590 2590 ? fn(op, b, k) 2591 2591 : bcache_btree(map_keys_recurse, k,

+8 -7

drivers/md/bcache/super.c

··· 881 881 bcache_device_detach(d); 882 882 883 883 if (disk) { 884 - ida_simple_remove(&bcache_device_idx, 885 - first_minor_to_idx(disk->first_minor)); 884 + ida_free(&bcache_device_idx, 885 + first_minor_to_idx(disk->first_minor)); 886 886 put_disk(disk); 887 887 } 888 888 ··· 940 940 if (!d->full_dirty_stripes) 941 941 goto out_free_stripe_sectors_dirty; 942 942 943 - idx = ida_simple_get(&bcache_device_idx, 0, 944 - BCACHE_DEVICE_IDX_MAX, GFP_KERNEL); 943 + idx = ida_alloc_max(&bcache_device_idx, BCACHE_DEVICE_IDX_MAX - 1, 944 + GFP_KERNEL); 945 945 if (idx < 0) 946 946 goto out_free_full_dirty_stripes; 947 947 ··· 986 986 out_bioset_exit: 987 987 bioset_exit(&d->bio_split); 988 988 out_ida_remove: 989 - ida_simple_remove(&bcache_device_idx, idx); 989 + ida_free(&bcache_device_idx, idx); 990 990 out_free_full_dirty_stripes: 991 991 kvfree(d->full_dirty_stripes); 992 992 out_free_stripe_sectors_dirty: ··· 1914 1914 INIT_LIST_HEAD(&c->btree_cache_freed); 1915 1915 INIT_LIST_HEAD(&c->data_buckets); 1916 1916 1917 - iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) * 1918 - sizeof(struct btree_iter_set); 1917 + iter_size = sizeof(struct btree_iter) + 1918 + ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) * 1919 + sizeof(struct btree_iter_set); 1919 1920 1920 1921 c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL); 1921 1922 if (!c->devices)

+1 -1

drivers/md/bcache/sysfs.c

··· 660 660 unsigned int bytes = 0; 661 661 struct bkey *k; 662 662 struct btree *b; 663 - struct btree_iter iter; 663 + struct btree_iter_stack iter; 664 664 665 665 goto lock_root; 666 666

+5 -5

drivers/md/bcache/writeback.c

··· 908 908 struct dirty_init_thrd_info *info = arg; 909 909 struct bch_dirty_init_state *state = info->state; 910 910 struct cache_set *c = state->c; 911 - struct btree_iter iter; 911 + struct btree_iter_stack iter; 912 912 struct bkey *k, *p; 913 913 int cur_idx, prev_idx, skip_nr; 914 914 915 915 k = p = NULL; 916 916 prev_idx = 0; 917 917 918 - bch_btree_iter_init(&c->root->keys, &iter, NULL); 919 - k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); 918 + bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); 919 + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); 920 920 BUG_ON(!k); 921 921 922 922 p = k; ··· 930 930 skip_nr = cur_idx - prev_idx; 931 931 932 932 while (skip_nr) { 933 - k = bch_btree_iter_next_filter(&iter, 933 + k = bch_btree_iter_next_filter(&iter.iter, 934 934 &c->root->keys, 935 935 bch_ptr_bad); 936 936 if (k) ··· 979 979 int i; 980 980 struct btree *b = NULL; 981 981 struct bkey *k = NULL; 982 - struct btree_iter iter; 982 + struct btree_iter_stack iter; 983 983 struct sectors_dirty_init op; 984 984 struct cache_set *c = d->c; 985 985 struct bch_dirty_init_state state;

+1 -2

drivers/md/dm-bio-prison-v2.c

··· 321 321 { 322 322 BUG_ON(!cell->exclusive_lock); 323 323 324 - bio_list_merge(bios, &cell->bios); 325 - bio_list_init(&cell->bios); 324 + bio_list_merge_init(bios, &cell->bios); 326 325 327 326 if (cell->shared_count) { 328 327 cell->exclusive_lock = false;

+4 -8

drivers/md/dm-cache-target.c

··· 115 115 */ 116 116 spin_lock_irq(&b->lock); 117 117 list_splice_init(&b->work_items, &work_items); 118 - bio_list_merge(&bios, &b->bios); 119 - bio_list_init(&b->bios); 118 + bio_list_merge_init(&bios, &b->bios); 120 119 b->commit_scheduled = false; 121 120 spin_unlock_irq(&b->lock); 122 121 ··· 564 565 static void defer_bios(struct cache *cache, struct bio_list *bios) 565 566 { 566 567 spin_lock_irq(&cache->lock); 567 - bio_list_merge(&cache->deferred_bios, bios); 568 - bio_list_init(bios); 568 + bio_list_merge_init(&cache->deferred_bios, bios); 569 569 spin_unlock_irq(&cache->lock); 570 570 571 571 wake_deferred_bio_worker(cache); ··· 1814 1816 bio_list_init(&bios); 1815 1817 1816 1818 spin_lock_irq(&cache->lock); 1817 - bio_list_merge(&bios, &cache->deferred_bios); 1818 - bio_list_init(&cache->deferred_bios); 1819 + bio_list_merge_init(&bios, &cache->deferred_bios); 1819 1820 spin_unlock_irq(&cache->lock); 1820 1821 1821 1822 while ((bio = bio_list_pop(&bios))) { ··· 1844 1847 struct bio_list bios; 1845 1848 1846 1849 bio_list_init(&bios); 1847 - bio_list_merge(&bios, &cache->deferred_bios); 1848 - bio_list_init(&cache->deferred_bios); 1850 + bio_list_merge_init(&bios, &cache->deferred_bios); 1849 1851 1850 1852 while ((bio = bio_list_pop(&bios))) { 1851 1853 bio->bi_status = BLK_STS_DM_REQUEUE;

+5 -9

drivers/md/dm-clone-target.c

··· 1181 1181 struct bio_list discards = BIO_EMPTY_LIST; 1182 1182 1183 1183 spin_lock_irq(&clone->lock); 1184 - bio_list_merge(&discards, &clone->deferred_discard_bios); 1185 - bio_list_init(&clone->deferred_discard_bios); 1184 + bio_list_merge_init(&discards, &clone->deferred_discard_bios); 1186 1185 spin_unlock_irq(&clone->lock); 1187 1186 1188 1187 if (bio_list_empty(&discards)) ··· 1214 1215 struct bio_list bios = BIO_EMPTY_LIST; 1215 1216 1216 1217 spin_lock_irq(&clone->lock); 1217 - bio_list_merge(&bios, &clone->deferred_bios); 1218 - bio_list_init(&clone->deferred_bios); 1218 + bio_list_merge_init(&bios, &clone->deferred_bios); 1219 1219 spin_unlock_irq(&clone->lock); 1220 1220 1221 1221 if (bio_list_empty(&bios)) ··· 1235 1237 * before issuing them or signaling their completion. 1236 1238 */ 1237 1239 spin_lock_irq(&clone->lock); 1238 - bio_list_merge(&bios, &clone->deferred_flush_bios); 1239 - bio_list_init(&clone->deferred_flush_bios); 1240 - 1241 - bio_list_merge(&bio_completions, &clone->deferred_flush_completions); 1242 - bio_list_init(&clone->deferred_flush_completions); 1240 + bio_list_merge_init(&bios, &clone->deferred_flush_bios); 1241 + bio_list_merge_init(&bio_completions, 1242 + &clone->deferred_flush_completions); 1243 1243 spin_unlock_irq(&clone->lock); 1244 1244 1245 1245 if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&

+1 -1

drivers/md/dm-core.h

··· 140 140 141 141 #ifdef CONFIG_BLK_DEV_ZONED 142 142 unsigned int nr_zones; 143 - unsigned int *zwp_offset; 143 + void *zone_revalidate_map; 144 144 #endif 145 145 146 146 #ifdef CONFIG_IMA

+1 -2

drivers/md/dm-era-target.c

··· 1272 1272 bio_list_init(&marked_bios); 1273 1273 1274 1274 spin_lock(&era->deferred_lock); 1275 - bio_list_merge(&deferred_bios, &era->deferred_bios); 1276 - bio_list_init(&era->deferred_bios); 1275 + bio_list_merge_init(&deferred_bios, &era->deferred_bios); 1277 1276 spin_unlock(&era->deferred_lock); 1278 1277 1279 1278 if (bio_list_empty(&deferred_bios))

+1 -2

drivers/md/dm-mpath.c

··· 704 704 return; 705 705 } 706 706 707 - bio_list_merge(&bios, &m->queued_bios); 708 - bio_list_init(&m->queued_bios); 707 + bio_list_merge_init(&bios, &m->queued_bios); 709 708 710 709 spin_unlock_irqrestore(&m->lock, flags); 711 710

+2 -1

drivers/md/dm-table.c

··· 2042 2042 r = dm_set_zones_restrictions(t, q); 2043 2043 if (r) 2044 2044 return r; 2045 - if (!static_key_enabled(&zoned_enabled.key)) 2045 + if (blk_queue_is_zoned(q) && 2046 + !static_key_enabled(&zoned_enabled.key)) 2046 2047 static_branch_enable(&zoned_enabled); 2047 2048 } 2048 2049

+3 -9

drivers/md/dm-thin.c

··· 592 592 struct dm_bio_prison_cell *cell; 593 593 }; 594 594 595 - static void __merge_bio_list(struct bio_list *bios, struct bio_list *master) 596 - { 597 - bio_list_merge(bios, master); 598 - bio_list_init(master); 599 - } 600 - 601 595 static void error_bio_list(struct bio_list *bios, blk_status_t error) 602 596 { 603 597 struct bio *bio; ··· 610 616 bio_list_init(&bios); 611 617 612 618 spin_lock_irq(&tc->lock); 613 - __merge_bio_list(&bios, master); 619 + bio_list_merge_init(&bios, master); 614 620 spin_unlock_irq(&tc->lock); 615 621 616 622 error_bio_list(&bios, error); ··· 639 645 bio_list_init(&bios); 640 646 641 647 spin_lock_irq(&tc->lock); 642 - __merge_bio_list(&bios, &tc->deferred_bio_list); 643 - __merge_bio_list(&bios, &tc->retry_on_resume_list); 648 + bio_list_merge_init(&bios, &tc->deferred_bio_list); 649 + bio_list_merge_init(&bios, &tc->retry_on_resume_list); 644 650 spin_unlock_irq(&tc->lock); 645 651 646 652 error_bio_list(&bios, BLK_STS_DM_REQUEUE);

+1 -2

drivers/md/dm-vdo/data-vio.c

··· 604 604 605 605 static void get_waiters(struct limiter *limiter) 606 606 { 607 - bio_list_merge(&limiter->waiters, &limiter->new_waiters); 608 - bio_list_init(&limiter->new_waiters); 607 + bio_list_merge_init(&limiter->waiters, &limiter->new_waiters); 609 608 } 610 609 611 610 static inline struct data_vio *get_available_data_vio(struct data_vio_pool *pool)

+1 -2

drivers/md/dm-vdo/flush.c

··· 369 369 static void initialize_flush(struct vdo_flush *flush, struct vdo *vdo) 370 370 { 371 371 bio_list_init(&flush->bios); 372 - bio_list_merge(&flush->bios, &vdo->flusher->waiting_flush_bios); 373 - bio_list_init(&vdo->flusher->waiting_flush_bios); 372 + bio_list_merge_init(&flush->bios, &vdo->flusher->waiting_flush_bios); 374 373 } 375 374 376 375 static void launch_flush(struct vdo_flush *flush)

+88 -421

drivers/md/dm-zone.c

··· 60 60 struct dm_table *map; 61 61 int srcu_idx, ret; 62 62 63 - if (dm_suspended_md(md)) 64 - return -EAGAIN; 63 + if (!md->zone_revalidate_map) { 64 + /* Regular user context */ 65 + if (dm_suspended_md(md)) 66 + return -EAGAIN; 65 67 66 - map = dm_get_live_table(md, &srcu_idx); 67 - if (!map) 68 - return -EIO; 68 + map = dm_get_live_table(md, &srcu_idx); 69 + if (!map) 70 + return -EIO; 71 + } else { 72 + /* Zone revalidation during __bind() */ 73 + map = md->zone_revalidate_map; 74 + } 69 75 70 76 ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data); 71 77 72 - dm_put_live_table(md, srcu_idx); 78 + if (!md->zone_revalidate_map) 79 + dm_put_live_table(md, srcu_idx); 73 80 74 81 return ret; 75 82 } ··· 145 138 } 146 139 } 147 140 148 - void dm_cleanup_zoned_dev(struct mapped_device *md) 141 + /* 142 + * Count conventional zones of a mapped zoned device. If the device 143 + * only has conventional zones, do not expose it as zoned. 144 + */ 145 + static int dm_check_zoned_cb(struct blk_zone *zone, unsigned int idx, 146 + void *data) 149 147 { 150 - if (md->disk) { 151 - bitmap_free(md->disk->conv_zones_bitmap); 152 - md->disk->conv_zones_bitmap = NULL; 153 - bitmap_free(md->disk->seq_zones_wlock); 154 - md->disk->seq_zones_wlock = NULL; 155 - } 148 + unsigned int *nr_conv_zones = data; 156 149 157 - kvfree(md->zwp_offset); 158 - md->zwp_offset = NULL; 159 - md->nr_zones = 0; 150 + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) 151 + (*nr_conv_zones)++; 152 + 153 + return 0; 160 154 } 161 155 162 - static unsigned int dm_get_zone_wp_offset(struct blk_zone *zone) 156 + static int dm_check_zoned(struct mapped_device *md, struct dm_table *t) 163 157 { 164 - switch (zone->cond) { 165 - case BLK_ZONE_COND_IMP_OPEN: 166 - case BLK_ZONE_COND_EXP_OPEN: 167 - case BLK_ZONE_COND_CLOSED: 168 - return zone->wp - zone->start; 169 - case BLK_ZONE_COND_FULL: 170 - return zone->len; 171 - case BLK_ZONE_COND_EMPTY: 172 - case BLK_ZONE_COND_NOT_WP: 173 - case 
BLK_ZONE_COND_OFFLINE: 174 - case BLK_ZONE_COND_READONLY: 175 - default: 176 - /* 177 - * Conventional, offline and read-only zones do not have a valid 178 - * write pointer. Use 0 as for an empty zone. 179 - */ 180 - return 0; 181 - } 182 - } 183 - 184 - static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx, 185 - void *data) 186 - { 187 - struct mapped_device *md = data; 188 158 struct gendisk *disk = md->disk; 159 + unsigned int nr_conv_zones = 0; 160 + int ret; 189 161 190 - switch (zone->type) { 191 - case BLK_ZONE_TYPE_CONVENTIONAL: 192 - if (!disk->conv_zones_bitmap) { 193 - disk->conv_zones_bitmap = bitmap_zalloc(disk->nr_zones, 194 - GFP_NOIO); 195 - if (!disk->conv_zones_bitmap) 196 - return -ENOMEM; 197 - } 198 - set_bit(idx, disk->conv_zones_bitmap); 199 - break; 200 - case BLK_ZONE_TYPE_SEQWRITE_REQ: 201 - case BLK_ZONE_TYPE_SEQWRITE_PREF: 202 - if (!disk->seq_zones_wlock) { 203 - disk->seq_zones_wlock = bitmap_zalloc(disk->nr_zones, 204 - GFP_NOIO); 205 - if (!disk->seq_zones_wlock) 206 - return -ENOMEM; 207 - } 208 - if (!md->zwp_offset) { 209 - md->zwp_offset = 210 - kvcalloc(disk->nr_zones, sizeof(unsigned int), 211 - GFP_KERNEL); 212 - if (!md->zwp_offset) 213 - return -ENOMEM; 214 - } 215 - md->zwp_offset[idx] = dm_get_zone_wp_offset(zone); 162 + /* Count conventional zones */ 163 + md->zone_revalidate_map = t; 164 + ret = dm_blk_report_zones(disk, 0, UINT_MAX, 165 + dm_check_zoned_cb, &nr_conv_zones); 166 + md->zone_revalidate_map = NULL; 167 + if (ret < 0) { 168 + DMERR("Check zoned failed %d", ret); 169 + return ret; 170 + } 216 171 217 - break; 218 - default: 219 - DMERR("Invalid zone type 0x%x at sectors %llu", 220 - (int)zone->type, zone->start); 221 - return -ENODEV; 172 + /* 173 + * If we only have conventional zones, expose the mapped device as 174 + * a regular device. 
175 + */ 176 + if (nr_conv_zones >= ret) { 177 + disk->queue->limits.max_open_zones = 0; 178 + disk->queue->limits.max_active_zones = 0; 179 + disk->queue->limits.zoned = false; 180 + clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); 181 + disk->nr_zones = 0; 222 182 } 223 183 224 184 return 0; ··· 200 226 static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t) 201 227 { 202 228 struct gendisk *disk = md->disk; 203 - unsigned int noio_flag; 204 229 int ret; 205 230 206 - /* 207 - * Check if something changed. If yes, cleanup the current resources 208 - * and reallocate everything. 209 - */ 231 + /* Revalidate only if something changed. */ 210 232 if (!disk->nr_zones || disk->nr_zones != md->nr_zones) 211 - dm_cleanup_zoned_dev(md); 233 + md->nr_zones = 0; 234 + 212 235 if (md->nr_zones) 213 236 return 0; 214 237 215 238 /* 216 - * Scan all zones to initialize everything. Ensure that all vmalloc 217 - * operations in this context are done as if GFP_NOIO was specified. 239 + * Our table is not live yet. So the call to dm_get_live_table() 240 + * in dm_blk_report_zones() will fail. Set a temporary pointer to 241 + * our table for dm_blk_report_zones() to use directly. 
218 242 */ 219 - noio_flag = memalloc_noio_save(); 220 - ret = dm_blk_do_report_zones(md, t, 0, disk->nr_zones, 221 - dm_zone_revalidate_cb, md); 222 - memalloc_noio_restore(noio_flag); 223 - if (ret < 0) 224 - goto err; 225 - if (ret != disk->nr_zones) { 226 - ret = -EIO; 227 - goto err; 243 + md->zone_revalidate_map = t; 244 + ret = blk_revalidate_disk_zones(disk); 245 + md->zone_revalidate_map = NULL; 246 + 247 + if (ret) { 248 + DMERR("Revalidate zones failed %d", ret); 249 + return ret; 228 250 } 229 251 230 252 md->nr_zones = disk->nr_zones; 231 253 232 254 return 0; 233 - 234 - err: 235 - DMERR("Revalidate zones failed %d", ret); 236 - dm_cleanup_zoned_dev(md); 237 - return ret; 238 255 } 239 256 240 257 static int device_not_zone_append_capable(struct dm_target *ti, ··· 254 289 int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) 255 290 { 256 291 struct mapped_device *md = t->md; 292 + int ret; 257 293 258 294 /* 259 - * For a zoned target, the number of zones should be updated for the 260 - * correct value to be exposed in sysfs queue/nr_zones. 295 + * Check if zone append is natively supported, and if not, set the 296 + * mapped device queue as needing zone append emulation. 261 297 */ 262 298 WARN_ON_ONCE(queue_is_mq(q)); 263 - md->disk->nr_zones = bdev_nr_zones(md->disk->part0); 264 - 265 - /* Check if zone append is natively supported */ 266 299 if (dm_table_supports_zone_append(t)) { 267 300 clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); 268 - dm_cleanup_zoned_dev(md); 269 - return 0; 301 + } else { 302 + set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); 303 + blk_queue_max_zone_append_sectors(q, 0); 270 304 } 271 305 272 - /* 273 - * Mark the mapped device as needing zone append emulation and 274 - * initialize the emulation resources once the capacity is set. 
275 - */ 276 - set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); 277 306 if (!get_capacity(md->disk)) 278 307 return 0; 279 308 309 + /* 310 + * Check that the mapped device will indeed be zoned, that is, that it 311 + * has sequential write required zones. 312 + */ 313 + ret = dm_check_zoned(md, t); 314 + if (ret) 315 + return ret; 316 + if (!blk_queue_is_zoned(q)) 317 + return 0; 318 + 319 + if (!md->disk->nr_zones) { 320 + DMINFO("%s using %s zone append", 321 + md->disk->disk_name, 322 + queue_emulates_zone_append(q) ? "emulated" : "native"); 323 + } 324 + 280 325 return dm_revalidate_zones(md, t); 281 - } 282 - 283 - static int dm_update_zone_wp_offset_cb(struct blk_zone *zone, unsigned int idx, 284 - void *data) 285 - { 286 - unsigned int *wp_offset = data; 287 - 288 - *wp_offset = dm_get_zone_wp_offset(zone); 289 - 290 - return 0; 291 - } 292 - 293 - static int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno, 294 - unsigned int *wp_ofst) 295 - { 296 - sector_t sector = zno * bdev_zone_sectors(md->disk->part0); 297 - unsigned int noio_flag; 298 - struct dm_table *t; 299 - int srcu_idx, ret; 300 - 301 - t = dm_get_live_table(md, &srcu_idx); 302 - if (!t) 303 - return -EIO; 304 - 305 - /* 306 - * Ensure that all memory allocations in this context are done as if 307 - * GFP_NOIO was specified. 308 - */ 309 - noio_flag = memalloc_noio_save(); 310 - ret = dm_blk_do_report_zones(md, t, sector, 1, 311 - dm_update_zone_wp_offset_cb, wp_ofst); 312 - memalloc_noio_restore(noio_flag); 313 - 314 - dm_put_live_table(md, srcu_idx); 315 - 316 - if (ret != 1) 317 - return -EIO; 318 - 319 - return 0; 320 - } 321 - 322 - struct orig_bio_details { 323 - enum req_op op; 324 - unsigned int nr_sectors; 325 - }; 326 - 327 - /* 328 - * First phase of BIO mapping for targets with zone append emulation: 329 - * check all BIO that change a zone writer pointer and change zone 330 - * append operations into regular write operations. 
331 - */ 332 - static bool dm_zone_map_bio_begin(struct mapped_device *md, 333 - unsigned int zno, struct bio *clone) 334 - { 335 - sector_t zsectors = bdev_zone_sectors(md->disk->part0); 336 - unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]); 337 - 338 - /* 339 - * If the target zone is in an error state, recover by inspecting the 340 - * zone to get its current write pointer position. Note that since the 341 - * target zone is already locked, a BIO issuing context should never 342 - * see the zone write in the DM_ZONE_UPDATING_WP_OFST state. 343 - */ 344 - if (zwp_offset == DM_ZONE_INVALID_WP_OFST) { 345 - if (dm_update_zone_wp_offset(md, zno, &zwp_offset)) 346 - return false; 347 - WRITE_ONCE(md->zwp_offset[zno], zwp_offset); 348 - } 349 - 350 - switch (bio_op(clone)) { 351 - case REQ_OP_ZONE_RESET: 352 - case REQ_OP_ZONE_FINISH: 353 - return true; 354 - case REQ_OP_WRITE_ZEROES: 355 - case REQ_OP_WRITE: 356 - /* Writes must be aligned to the zone write pointer */ 357 - if ((clone->bi_iter.bi_sector & (zsectors - 1)) != zwp_offset) 358 - return false; 359 - break; 360 - case REQ_OP_ZONE_APPEND: 361 - /* 362 - * Change zone append operations into a non-mergeable regular 363 - * writes directed at the current write pointer position of the 364 - * target zone. 365 - */ 366 - clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE | 367 - (clone->bi_opf & (~REQ_OP_MASK)); 368 - clone->bi_iter.bi_sector += zwp_offset; 369 - break; 370 - default: 371 - DMWARN_LIMIT("Invalid BIO operation"); 372 - return false; 373 - } 374 - 375 - /* Cannot write to a full zone */ 376 - if (zwp_offset >= zsectors) 377 - return false; 378 - 379 - return true; 380 - } 381 - 382 - /* 383 - * Second phase of BIO mapping for targets with zone append emulation: 384 - * update the zone write pointer offset array to account for the additional 385 - * data written to a zone. Note that at this point, the remapped clone BIO 386 - * may already have completed, so we do not touch it. 
387 - */ 388 - static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, unsigned int zno, 389 - struct orig_bio_details *orig_bio_details, 390 - unsigned int nr_sectors) 391 - { 392 - unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]); 393 - 394 - /* The clone BIO may already have been completed and failed */ 395 - if (zwp_offset == DM_ZONE_INVALID_WP_OFST) 396 - return BLK_STS_IOERR; 397 - 398 - /* Update the zone wp offset */ 399 - switch (orig_bio_details->op) { 400 - case REQ_OP_ZONE_RESET: 401 - WRITE_ONCE(md->zwp_offset[zno], 0); 402 - return BLK_STS_OK; 403 - case REQ_OP_ZONE_FINISH: 404 - WRITE_ONCE(md->zwp_offset[zno], 405 - bdev_zone_sectors(md->disk->part0)); 406 - return BLK_STS_OK; 407 - case REQ_OP_WRITE_ZEROES: 408 - case REQ_OP_WRITE: 409 - WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors); 410 - return BLK_STS_OK; 411 - case REQ_OP_ZONE_APPEND: 412 - /* 413 - * Check that the target did not truncate the write operation 414 - * emulating a zone append. 
415 - */ 416 - if (nr_sectors != orig_bio_details->nr_sectors) { 417 - DMWARN_LIMIT("Truncated write for zone append"); 418 - return BLK_STS_IOERR; 419 - } 420 - WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors); 421 - return BLK_STS_OK; 422 - default: 423 - DMWARN_LIMIT("Invalid BIO operation"); 424 - return BLK_STS_IOERR; 425 - } 426 - } 427 - 428 - static inline void dm_zone_lock(struct gendisk *disk, unsigned int zno, 429 - struct bio *clone) 430 - { 431 - if (WARN_ON_ONCE(bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))) 432 - return; 433 - 434 - wait_on_bit_lock_io(disk->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE); 435 - bio_set_flag(clone, BIO_ZONE_WRITE_LOCKED); 436 - } 437 - 438 - static inline void dm_zone_unlock(struct gendisk *disk, unsigned int zno, 439 - struct bio *clone) 440 - { 441 - if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)) 442 - return; 443 - 444 - WARN_ON_ONCE(!test_bit(zno, disk->seq_zones_wlock)); 445 - clear_bit_unlock(zno, disk->seq_zones_wlock); 446 - smp_mb__after_atomic(); 447 - wake_up_bit(disk->seq_zones_wlock, zno); 448 - 449 - bio_clear_flag(clone, BIO_ZONE_WRITE_LOCKED); 450 - } 451 - 452 - static bool dm_need_zone_wp_tracking(struct bio *bio) 453 - { 454 - /* 455 - * Special processing is not needed for operations that do not need the 456 - * zone write lock, that is, all operations that target conventional 457 - * zones and all operations that do not modify directly a sequential 458 - * zone write pointer. 459 - */ 460 - if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) 461 - return false; 462 - switch (bio_op(bio)) { 463 - case REQ_OP_WRITE_ZEROES: 464 - case REQ_OP_WRITE: 465 - case REQ_OP_ZONE_RESET: 466 - case REQ_OP_ZONE_FINISH: 467 - case REQ_OP_ZONE_APPEND: 468 - return bio_zone_is_seq(bio); 469 - default: 470 - return false; 471 - } 472 - } 473 - 474 - /* 475 - * Special IO mapping for targets needing zone append emulation. 
476 - */ 477 - int dm_zone_map_bio(struct dm_target_io *tio) 478 - { 479 - struct dm_io *io = tio->io; 480 - struct dm_target *ti = tio->ti; 481 - struct mapped_device *md = io->md; 482 - struct bio *clone = &tio->clone; 483 - struct orig_bio_details orig_bio_details; 484 - unsigned int zno; 485 - blk_status_t sts; 486 - int r; 487 - 488 - /* 489 - * IOs that do not change a zone write pointer do not need 490 - * any additional special processing. 491 - */ 492 - if (!dm_need_zone_wp_tracking(clone)) 493 - return ti->type->map(ti, clone); 494 - 495 - /* Lock the target zone */ 496 - zno = bio_zone_no(clone); 497 - dm_zone_lock(md->disk, zno, clone); 498 - 499 - orig_bio_details.nr_sectors = bio_sectors(clone); 500 - orig_bio_details.op = bio_op(clone); 501 - 502 - /* 503 - * Check that the bio and the target zone write pointer offset are 504 - * both valid, and if the bio is a zone append, remap it to a write. 505 - */ 506 - if (!dm_zone_map_bio_begin(md, zno, clone)) { 507 - dm_zone_unlock(md->disk, zno, clone); 508 - return DM_MAPIO_KILL; 509 - } 510 - 511 - /* Let the target do its work */ 512 - r = ti->type->map(ti, clone); 513 - switch (r) { 514 - case DM_MAPIO_SUBMITTED: 515 - /* 516 - * The target submitted the clone BIO. The target zone will 517 - * be unlocked on completion of the clone. 518 - */ 519 - sts = dm_zone_map_bio_end(md, zno, &orig_bio_details, 520 - *tio->len_ptr); 521 - break; 522 - case DM_MAPIO_REMAPPED: 523 - /* 524 - * The target only remapped the clone BIO. In case of error, 525 - * unlock the target zone here as the clone will not be 526 - * submitted. 
527 - */ 528 - sts = dm_zone_map_bio_end(md, zno, &orig_bio_details, 529 - *tio->len_ptr); 530 - if (sts != BLK_STS_OK) 531 - dm_zone_unlock(md->disk, zno, clone); 532 - break; 533 - case DM_MAPIO_REQUEUE: 534 - case DM_MAPIO_KILL: 535 - default: 536 - dm_zone_unlock(md->disk, zno, clone); 537 - sts = BLK_STS_IOERR; 538 - break; 539 - } 540 - 541 - if (sts != BLK_STS_OK) 542 - return DM_MAPIO_KILL; 543 - 544 - return r; 545 326 } 546 327 547 328 /* ··· 298 587 struct mapped_device *md = io->md; 299 588 struct gendisk *disk = md->disk; 300 589 struct bio *orig_bio = io->orig_bio; 301 - unsigned int zwp_offset; 302 - unsigned int zno; 303 590 304 591 /* 305 - * For targets that do not emulate zone append, we only need to 306 - * handle native zone-append bios. 592 + * Get the offset within the zone of the written sector 593 + * and add that to the original bio sector position. 307 594 */ 308 - if (!dm_emulate_zone_append(md)) { 309 - /* 310 - * Get the offset within the zone of the written sector 311 - * and add that to the original bio sector position. 312 - */ 313 - if (clone->bi_status == BLK_STS_OK && 314 - bio_op(clone) == REQ_OP_ZONE_APPEND) { 315 - sector_t mask = 316 - (sector_t)bdev_zone_sectors(disk->part0) - 1; 595 + if (clone->bi_status == BLK_STS_OK && 596 + bio_op(clone) == REQ_OP_ZONE_APPEND) { 597 + sector_t mask = bdev_zone_sectors(disk->part0) - 1; 317 598 318 - orig_bio->bi_iter.bi_sector += 319 - clone->bi_iter.bi_sector & mask; 320 - } 321 - 322 - return; 599 + orig_bio->bi_iter.bi_sector += clone->bi_iter.bi_sector & mask; 323 600 } 324 601 325 - /* 326 - * For targets that do emulate zone append, if the clone BIO does not 327 - * own the target zone write lock, we have nothing to do. 
328 - */ 329 - if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)) 330 - return; 331 - 332 - zno = bio_zone_no(orig_bio); 333 - 334 - if (clone->bi_status != BLK_STS_OK) { 335 - /* 336 - * BIOs that modify a zone write pointer may leave the zone 337 - * in an unknown state in case of failure (e.g. the write 338 - * pointer was only partially advanced). In this case, set 339 - * the target zone write pointer as invalid unless it is 340 - * already being updated. 341 - */ 342 - WRITE_ONCE(md->zwp_offset[zno], DM_ZONE_INVALID_WP_OFST); 343 - } else if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) { 344 - /* 345 - * Get the written sector for zone append operation that were 346 - * emulated using regular write operations. 347 - */ 348 - zwp_offset = READ_ONCE(md->zwp_offset[zno]); 349 - if (WARN_ON_ONCE(zwp_offset < bio_sectors(orig_bio))) 350 - WRITE_ONCE(md->zwp_offset[zno], 351 - DM_ZONE_INVALID_WP_OFST); 352 - else 353 - orig_bio->bi_iter.bi_sector += 354 - zwp_offset - bio_sectors(orig_bio); 355 - } 356 - 357 - dm_zone_unlock(disk, zno, clone); 602 + return; 358 603 }

+49 -23

drivers/md/dm.c

··· 1428 1428 down(&md->swap_bios_semaphore); 1429 1429 } 1430 1430 1431 - if (static_branch_unlikely(&zoned_enabled)) { 1432 - /* 1433 - * Check if the IO needs a special mapping due to zone append 1434 - * emulation on zoned target. In this case, dm_zone_map_bio() 1435 - * calls the target map operation. 1436 - */ 1437 - if (unlikely(dm_emulate_zone_append(md))) 1438 - r = dm_zone_map_bio(tio); 1439 - else 1440 - goto do_map; 1441 - } else { 1442 - do_map: 1443 - if (likely(ti->type->map == linear_map)) 1444 - r = linear_map(ti, clone); 1445 - else if (ti->type->map == stripe_map) 1446 - r = stripe_map(ti, clone); 1447 - else 1448 - r = ti->type->map(ti, clone); 1449 - } 1431 + if (likely(ti->type->map == linear_map)) 1432 + r = linear_map(ti, clone); 1433 + else if (ti->type->map == stripe_map) 1434 + r = stripe_map(ti, clone); 1435 + else 1436 + r = ti->type->map(ti, clone); 1450 1437 1451 1438 switch (r) { 1452 1439 case DM_MAPIO_SUBMITTED: ··· 1761 1774 ci->sector_count = 0; 1762 1775 } 1763 1776 1777 + #ifdef CONFIG_BLK_DEV_ZONED 1778 + static inline bool dm_zone_bio_needs_split(struct mapped_device *md, 1779 + struct bio *bio) 1780 + { 1781 + /* 1782 + * For mapped device that need zone append emulation, we must 1783 + * split any large BIO that straddles zone boundaries. 
1784 + */ 1785 + return dm_emulate_zone_append(md) && bio_straddles_zones(bio) && 1786 + !bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING); 1787 + } 1788 + static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio) 1789 + { 1790 + return dm_emulate_zone_append(md) && blk_zone_plug_bio(bio, 0); 1791 + } 1792 + #else 1793 + static inline bool dm_zone_bio_needs_split(struct mapped_device *md, 1794 + struct bio *bio) 1795 + { 1796 + return false; 1797 + } 1798 + static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio) 1799 + { 1800 + return false; 1801 + } 1802 + #endif 1803 + 1764 1804 /* 1765 1805 * Entry point to split a bio into clones and submit them to the targets. 1766 1806 */ ··· 1797 1783 struct clone_info ci; 1798 1784 struct dm_io *io; 1799 1785 blk_status_t error = BLK_STS_OK; 1800 - bool is_abnormal; 1786 + bool is_abnormal, need_split; 1801 1787 1802 - is_abnormal = is_abnormal_io(bio); 1803 - if (unlikely(is_abnormal)) { 1788 + need_split = is_abnormal = is_abnormal_io(bio); 1789 + if (static_branch_unlikely(&zoned_enabled)) 1790 + need_split = is_abnormal || dm_zone_bio_needs_split(md, bio); 1791 + 1792 + if (unlikely(need_split)) { 1804 1793 /* 1805 1794 * Use bio_split_to_limits() for abnormal IO (e.g. discard, etc) 1806 1795 * otherwise associated queue_limits won't be imposed. 1796 + * Also split the BIO for mapped devices needing zone append 1797 + * emulation to ensure that the BIO does not cross zone 1798 + * boundaries. 1807 1799 */ 1808 1800 bio = bio_split_to_limits(bio); 1809 1801 if (!bio) 1810 1802 return; 1811 1803 } 1804 + 1805 + /* 1806 + * Use the block layer zone write plugging for mapped devices that 1807 + * need zone append emulation (e.g. dm-crypt). 
1808 + */ 1809 + if (static_branch_unlikely(&zoned_enabled) && dm_zone_plug_bio(md, bio)) 1810 + return; 1812 1811 1813 1812 /* Only support nowait for normal IO */ 1814 1813 if (unlikely(bio->bi_opf & REQ_NOWAIT) && !is_abnormal) { ··· 2043 2016 md->dax_dev = NULL; 2044 2017 } 2045 2018 2046 - dm_cleanup_zoned_dev(md); 2047 2019 if (md->disk) { 2048 2020 spin_lock(&_minor_lock); 2049 2021 md->disk->private_data = NULL;

-2

drivers/md/dm.h

··· 104 104 int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q); 105 105 void dm_zone_endio(struct dm_io *io, struct bio *clone); 106 106 #ifdef CONFIG_BLK_DEV_ZONED 107 - void dm_cleanup_zoned_dev(struct mapped_device *md); 108 107 int dm_blk_report_zones(struct gendisk *disk, sector_t sector, 109 108 unsigned int nr_zones, report_zones_cb cb, void *data); 110 109 bool dm_is_zone_write(struct mapped_device *md, struct bio *bio); 111 110 int dm_zone_map_bio(struct dm_target_io *io); 112 111 #else 113 - static inline void dm_cleanup_zoned_dev(struct mapped_device *md) {} 114 112 #define dm_blk_report_zones NULL 115 113 static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) 116 114 {

+3 -3

drivers/md/md-bitmap.c

··· 1424 1424 sector_t chunk = offset >> bitmap->chunkshift; 1425 1425 unsigned long page = chunk >> PAGE_COUNTER_SHIFT; 1426 1426 unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; 1427 - sector_t csize; 1427 + sector_t csize = ((sector_t)1) << bitmap->chunkshift; 1428 1428 int err; 1429 1429 1430 1430 if (page >= bitmap->pages) { ··· 1433 1433 * End-of-device while looking for a whole page or 1434 1434 * user set a huge number to sysfs bitmap_set_bits. 1435 1435 */ 1436 + *blocks = csize - (offset & (csize - 1)); 1436 1437 return NULL; 1437 1438 } 1438 1439 err = md_bitmap_checkpage(bitmap, page, create, 0); ··· 1442 1441 bitmap->bp[page].map == NULL) 1443 1442 csize = ((sector_t)1) << (bitmap->chunkshift + 1444 1443 PAGE_COUNTER_SHIFT); 1445 - else 1446 - csize = ((sector_t)1) << bitmap->chunkshift; 1444 + 1447 1445 *blocks = csize - (offset & (csize - 1)); 1448 1446 1449 1447 if (err < 0)

+6 -1

drivers/md/md.c

··· 8087 8087 if (t) { 8088 8088 pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); 8089 8089 set_bit(THREAD_WAKEUP, &t->flags); 8090 - wake_up(&t->wqueue); 8090 + if (wq_has_sleeper(&t->wqueue)) 8091 + wake_up(&t->wqueue); 8091 8092 } 8092 8093 rcu_read_unlock(); 8093 8094 } ··· 8583 8582 rcu_read_lock(); 8584 8583 rdev_for_each_rcu(rdev, mddev) { 8585 8584 struct gendisk *disk = rdev->bdev->bd_disk; 8585 + 8586 + if (!init && !blk_queue_io_stat(disk->queue)) 8587 + continue; 8588 + 8586 8589 curr_events = (int)part_stat_read_accum(disk->part0, sectors) - 8587 8590 atomic_read(&disk->sync_io); 8588 8591 /* sync IO will cause sync_io to increase before the disk_stats

+2 -1

drivers/md/md.h

··· 621 621 622 622 static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) 623 623 { 624 - atomic_add(nr_sectors, &bdev->bd_disk->sync_io); 624 + if (blk_queue_io_stat(bdev->bd_disk->queue)) 625 + atomic_add(nr_sectors, &bdev->bd_disk->sync_io); 625 626 } 626 627 627 628 static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors)

+3 -12

drivers/md/raid5.c

··· 36 36 */ 37 37 38 38 #include <linux/blkdev.h> 39 - #include <linux/delay.h> 40 39 #include <linux/kthread.h> 41 40 #include <linux/raid/pq.h> 42 41 #include <linux/async_tx.h> ··· 6733 6734 int batch_size, released; 6734 6735 unsigned int offset; 6735 6736 6737 + if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) 6738 + break; 6739 + 6736 6740 released = release_stripe_list(conf, conf->temp_inactive_list); 6737 6741 if (released) 6738 6742 clear_bit(R5_DID_ALLOC, &conf->cache_state); ··· 6772 6770 spin_unlock_irq(&conf->device_lock); 6773 6771 md_check_recovery(mddev); 6774 6772 spin_lock_irq(&conf->device_lock); 6775 - 6776 - /* 6777 - * Waiting on MD_SB_CHANGE_PENDING below may deadlock 6778 - * seeing md_check_recovery() is needed to clear 6779 - * the flag when using mdmon. 6780 - */ 6781 - continue; 6782 6773 } 6783 - 6784 - wait_event_lock_irq(mddev->sb_wait, 6785 - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), 6786 - conf->device_lock); 6787 6774 } 6788 6775 pr_debug("%d stripes handled\n", handled); 6789 6776

+1 -1

drivers/nvme/host/core.c

··· 2132 2132 blk_mq_unfreeze_queue(ns->disk->queue); 2133 2133 2134 2134 if (blk_queue_is_zoned(ns->queue)) { 2135 - ret = blk_revalidate_disk_zones(ns->disk, NULL); 2135 + ret = blk_revalidate_disk_zones(ns->disk); 2136 2136 if (ret && !nvme_first_scan(ns->disk)) 2137 2137 goto out; 2138 2138 }

+3 -7

drivers/nvme/target/zns.c

··· 52 52 if (get_capacity(bd_disk) & (bdev_zone_sectors(ns->bdev) - 1)) 53 53 return false; 54 54 /* 55 - * ZNS does not define a conventional zone type. If the underlying 56 - * device has a bitmap set indicating the existence of conventional 57 - * zones, reject the device. Otherwise, use report zones to detect if 58 - * the device has conventional zones. 55 + * ZNS does not define a conventional zone type. Use report zones 56 + * to detect if the device has conventional zones and reject it if 57 + * it does. 59 58 */ 60 - if (ns->bdev->bd_disk->conv_zones_bitmap) 61 - return false; 62 - 63 59 ret = blkdev_report_zones(ns->bdev, 0, bdev_nr_zones(ns->bdev), 64 60 validate_conv_zones_cb, NULL); 65 61 if (ret < 0)

-1

drivers/scsi/scsi_lib.c

··· 1869 1869 case BLK_STS_OK: 1870 1870 break; 1871 1871 case BLK_STS_RESOURCE: 1872 - case BLK_STS_ZONE_RESOURCE: 1873 1872 if (scsi_device_blocked(sdev)) 1874 1873 ret = BLK_STS_DEV_RESOURCE; 1875 1874 break;

-8

drivers/scsi/sd.c

··· 1260 1260 } 1261 1261 } 1262 1262 1263 - if (req_op(rq) == REQ_OP_ZONE_APPEND) { 1264 - ret = sd_zbc_prepare_zone_append(cmd, &lba, nr_blocks); 1265 - if (ret) 1266 - goto fail; 1267 - } 1268 - 1269 1263 fua = rq->cmd_flags & REQ_FUA ? 0x8 : 0; 1270 1264 dix = scsi_prot_sg_count(cmd); 1271 1265 dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type); ··· 1342 1348 return sd_setup_flush_cmnd(cmd); 1343 1349 case REQ_OP_READ: 1344 1350 case REQ_OP_WRITE: 1345 - case REQ_OP_ZONE_APPEND: 1346 1351 return sd_setup_read_write_cmnd(cmd); 1347 1352 case REQ_OP_ZONE_RESET: 1348 1353 return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER, ··· 3974 3981 struct scsi_disk *sdkp = to_scsi_disk(dev); 3975 3982 3976 3983 ida_free(&sd_index_ida, sdkp->index); 3977 - sd_zbc_free_zone_info(sdkp); 3978 3984 put_device(&sdkp->device->sdev_gendev); 3979 3985 free_opal_dev(sdkp->opal_dev); 3980 3986

-19

drivers/scsi/sd.h

··· 104 104 * between zone starting LBAs is constant. 105 105 */ 106 106 u32 zone_starting_lba_gran; 107 - u32 *zones_wp_offset; 108 - spinlock_t zones_wp_offset_lock; 109 - u32 *rev_wp_offset; 110 - struct mutex rev_mutex; 111 - struct work_struct zone_wp_offset_work; 112 - char *zone_wp_update_buf; 113 107 #endif 114 108 atomic_t openers; 115 109 sector_t capacity; /* size in logical blocks */ ··· 239 245 240 246 #ifdef CONFIG_BLK_DEV_ZONED 241 247 242 - void sd_zbc_free_zone_info(struct scsi_disk *sdkp); 243 248 int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]); 244 249 int sd_zbc_revalidate_zones(struct scsi_disk *sdkp); 245 250 blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, ··· 248 255 int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, 249 256 unsigned int nr_zones, report_zones_cb cb, void *data); 250 257 251 - blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba, 252 - unsigned int nr_blocks); 253 - 254 258 #else /* CONFIG_BLK_DEV_ZONED */ 255 - 256 - static inline void sd_zbc_free_zone_info(struct scsi_disk *sdkp) {} 257 259 258 260 static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) 259 261 { ··· 271 283 unsigned int good_bytes, struct scsi_sense_hdr *sshdr) 272 284 { 273 285 return good_bytes; 274 - } 275 - 276 - static inline blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, 277 - sector_t *lba, 278 - unsigned int nr_blocks) 279 - { 280 - return BLK_STS_TARGET; 281 286 } 282 287 283 288 #define sd_zbc_report_zones NULL

+10 -325

drivers/scsi/sd_zbc.c

··· 23 23 #define CREATE_TRACE_POINTS 24 24 #include "sd_trace.h" 25 25 26 - /** 27 - * sd_zbc_get_zone_wp_offset - Get zone write pointer offset. 28 - * @zone: Zone for which to return the write pointer offset. 29 - * 30 - * Return: offset of the write pointer from the start of the zone. 31 - */ 32 - static unsigned int sd_zbc_get_zone_wp_offset(struct blk_zone *zone) 33 - { 34 - if (zone->type == ZBC_ZONE_TYPE_CONV) 35 - return 0; 36 - 37 - switch (zone->cond) { 38 - case BLK_ZONE_COND_IMP_OPEN: 39 - case BLK_ZONE_COND_EXP_OPEN: 40 - case BLK_ZONE_COND_CLOSED: 41 - return zone->wp - zone->start; 42 - case BLK_ZONE_COND_FULL: 43 - return zone->len; 44 - case BLK_ZONE_COND_EMPTY: 45 - case BLK_ZONE_COND_OFFLINE: 46 - case BLK_ZONE_COND_READONLY: 47 - default: 48 - /* 49 - * Offline and read-only zones do not have a valid 50 - * write pointer. Use 0 as for an empty zone. 51 - */ 52 - return 0; 53 - } 54 - } 55 - 56 26 /* Whether or not a SCSI zone descriptor describes a gap zone. */ 57 27 static bool sd_zbc_is_gap_zone(const u8 buf[64]) 58 28 { ··· 90 120 ret = cb(&zone, idx, data); 91 121 if (ret) 92 122 return ret; 93 - 94 - if (sdkp->rev_wp_offset) 95 - sdkp->rev_wp_offset[idx] = sd_zbc_get_zone_wp_offset(&zone); 96 123 97 124 return 0; 98 125 } ··· 314 347 return BLK_STS_OK; 315 348 } 316 349 317 - #define SD_ZBC_INVALID_WP_OFST (~0u) 318 - #define SD_ZBC_UPDATING_WP_OFST (SD_ZBC_INVALID_WP_OFST - 1) 319 - 320 - static int sd_zbc_update_wp_offset_cb(struct blk_zone *zone, unsigned int idx, 321 - void *data) 322 - { 323 - struct scsi_disk *sdkp = data; 324 - 325 - lockdep_assert_held(&sdkp->zones_wp_offset_lock); 326 - 327 - sdkp->zones_wp_offset[idx] = sd_zbc_get_zone_wp_offset(zone); 328 - 329 - return 0; 330 - } 331 - 332 - /* 333 - * An attempt to append a zone triggered an invalid write pointer error. 334 - * Reread the write pointer of the zone(s) in which the append failed. 
335 - */ 336 - static void sd_zbc_update_wp_offset_workfn(struct work_struct *work) 337 - { 338 - struct scsi_disk *sdkp; 339 - unsigned long flags; 340 - sector_t zno; 341 - int ret; 342 - 343 - sdkp = container_of(work, struct scsi_disk, zone_wp_offset_work); 344 - 345 - spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags); 346 - for (zno = 0; zno < sdkp->zone_info.nr_zones; zno++) { 347 - if (sdkp->zones_wp_offset[zno] != SD_ZBC_UPDATING_WP_OFST) 348 - continue; 349 - 350 - spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags); 351 - ret = sd_zbc_do_report_zones(sdkp, sdkp->zone_wp_update_buf, 352 - SD_BUF_SIZE, 353 - zno * sdkp->zone_info.zone_blocks, true); 354 - spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags); 355 - if (!ret) 356 - sd_zbc_parse_report(sdkp, sdkp->zone_wp_update_buf + 64, 357 - zno, sd_zbc_update_wp_offset_cb, 358 - sdkp); 359 - } 360 - spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags); 361 - 362 - scsi_device_put(sdkp->device); 363 - } 364 - 365 - /** 366 - * sd_zbc_prepare_zone_append() - Prepare an emulated ZONE_APPEND command. 367 - * @cmd: the command to setup 368 - * @lba: the LBA to patch 369 - * @nr_blocks: the number of LBAs to be written 370 - * 371 - * Called from sd_setup_read_write_cmnd() for REQ_OP_ZONE_APPEND. 372 - * @sd_zbc_prepare_zone_append() handles the necessary zone wrote locking and 373 - * patching of the lba for an emulated ZONE_APPEND command. 374 - * 375 - * In case the cached write pointer offset is %SD_ZBC_INVALID_WP_OFST it will 376 - * schedule a REPORT ZONES command and return BLK_STS_IOERR. 
377 - */ 378 - blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba, 379 - unsigned int nr_blocks) 380 - { 381 - struct request *rq = scsi_cmd_to_rq(cmd); 382 - struct scsi_disk *sdkp = scsi_disk(rq->q->disk); 383 - unsigned int wp_offset, zno = blk_rq_zone_no(rq); 384 - unsigned long flags; 385 - blk_status_t ret; 386 - 387 - ret = sd_zbc_cmnd_checks(cmd); 388 - if (ret != BLK_STS_OK) 389 - return ret; 390 - 391 - if (!blk_rq_zone_is_seq(rq)) 392 - return BLK_STS_IOERR; 393 - 394 - /* Unlock of the write lock will happen in sd_zbc_complete() */ 395 - if (!blk_req_zone_write_trylock(rq)) 396 - return BLK_STS_ZONE_RESOURCE; 397 - 398 - spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags); 399 - wp_offset = sdkp->zones_wp_offset[zno]; 400 - switch (wp_offset) { 401 - case SD_ZBC_INVALID_WP_OFST: 402 - /* 403 - * We are about to schedule work to update a zone write pointer 404 - * offset, which will cause the zone append command to be 405 - * requeued. So make sure that the scsi device does not go away 406 - * while the work is being processed. 407 - */ 408 - if (scsi_device_get(sdkp->device)) { 409 - ret = BLK_STS_IOERR; 410 - break; 411 - } 412 - sdkp->zones_wp_offset[zno] = SD_ZBC_UPDATING_WP_OFST; 413 - schedule_work(&sdkp->zone_wp_offset_work); 414 - fallthrough; 415 - case SD_ZBC_UPDATING_WP_OFST: 416 - ret = BLK_STS_DEV_RESOURCE; 417 - break; 418 - default: 419 - wp_offset = sectors_to_logical(sdkp->device, wp_offset); 420 - if (wp_offset + nr_blocks > sdkp->zone_info.zone_blocks) { 421 - ret = BLK_STS_IOERR; 422 - break; 423 - } 424 - 425 - trace_scsi_prepare_zone_append(cmd, *lba, wp_offset); 426 - *lba += wp_offset; 427 - } 428 - spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags); 429 - if (ret) 430 - blk_req_zone_write_unlock(rq); 431 - return ret; 432 - } 433 - 434 350 /** 435 351 * sd_zbc_setup_zone_mgmt_cmnd - Prepare a zone ZBC_OUT command. The operations 436 352 * can be RESET WRITE POINTER, OPEN, CLOSE or FINISH. 
··· 354 504 return BLK_STS_OK; 355 505 } 356 506 357 - static bool sd_zbc_need_zone_wp_update(struct request *rq) 358 - { 359 - switch (req_op(rq)) { 360 - case REQ_OP_ZONE_APPEND: 361 - case REQ_OP_ZONE_FINISH: 362 - case REQ_OP_ZONE_RESET: 363 - case REQ_OP_ZONE_RESET_ALL: 364 - return true; 365 - case REQ_OP_WRITE: 366 - case REQ_OP_WRITE_ZEROES: 367 - return blk_rq_zone_is_seq(rq); 368 - default: 369 - return false; 370 - } 371 - } 372 - 373 - /** 374 - * sd_zbc_zone_wp_update - Update cached zone write pointer upon cmd completion 375 - * @cmd: Completed command 376 - * @good_bytes: Command reply bytes 377 - * 378 - * Called from sd_zbc_complete() to handle the update of the cached zone write 379 - * pointer value in case an update is needed. 380 - */ 381 - static unsigned int sd_zbc_zone_wp_update(struct scsi_cmnd *cmd, 382 - unsigned int good_bytes) 383 - { 384 - int result = cmd->result; 385 - struct request *rq = scsi_cmd_to_rq(cmd); 386 - struct scsi_disk *sdkp = scsi_disk(rq->q->disk); 387 - unsigned int zno = blk_rq_zone_no(rq); 388 - enum req_op op = req_op(rq); 389 - unsigned long flags; 390 - 391 - /* 392 - * If we got an error for a command that needs updating the write 393 - * pointer offset cache, we must mark the zone wp offset entry as 394 - * invalid to force an update from disk the next time a zone append 395 - * command is issued. 396 - */ 397 - spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags); 398 - 399 - if (result && op != REQ_OP_ZONE_RESET_ALL) { 400 - if (op == REQ_OP_ZONE_APPEND) { 401 - /* Force complete completion (no retry) */ 402 - good_bytes = 0; 403 - scsi_set_resid(cmd, blk_rq_bytes(rq)); 404 - } 405 - 406 - /* 407 - * Force an update of the zone write pointer offset on 408 - * the next zone append access. 
409 - */ 410 - if (sdkp->zones_wp_offset[zno] != SD_ZBC_UPDATING_WP_OFST) 411 - sdkp->zones_wp_offset[zno] = SD_ZBC_INVALID_WP_OFST; 412 - goto unlock_wp_offset; 413 - } 414 - 415 - switch (op) { 416 - case REQ_OP_ZONE_APPEND: 417 - trace_scsi_zone_wp_update(cmd, rq->__sector, 418 - sdkp->zones_wp_offset[zno], good_bytes); 419 - rq->__sector += sdkp->zones_wp_offset[zno]; 420 - fallthrough; 421 - case REQ_OP_WRITE_ZEROES: 422 - case REQ_OP_WRITE: 423 - if (sdkp->zones_wp_offset[zno] < sd_zbc_zone_sectors(sdkp)) 424 - sdkp->zones_wp_offset[zno] += 425 - good_bytes >> SECTOR_SHIFT; 426 - break; 427 - case REQ_OP_ZONE_RESET: 428 - sdkp->zones_wp_offset[zno] = 0; 429 - break; 430 - case REQ_OP_ZONE_FINISH: 431 - sdkp->zones_wp_offset[zno] = sd_zbc_zone_sectors(sdkp); 432 - break; 433 - case REQ_OP_ZONE_RESET_ALL: 434 - memset(sdkp->zones_wp_offset, 0, 435 - sdkp->zone_info.nr_zones * sizeof(unsigned int)); 436 - break; 437 - default: 438 - break; 439 - } 440 - 441 - unlock_wp_offset: 442 - spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags); 443 - 444 - return good_bytes; 445 - } 446 - 447 507 /** 448 508 * sd_zbc_complete - ZBC command post processing. 449 509 * @cmd: Completed command ··· 379 619 * so be quiet about the error. 
380 620 */ 381 621 rq->rq_flags |= RQF_QUIET; 382 - } else if (sd_zbc_need_zone_wp_update(rq)) 383 - good_bytes = sd_zbc_zone_wp_update(cmd, good_bytes); 384 - 385 - if (req_op(rq) == REQ_OP_ZONE_APPEND) 386 - blk_req_zone_write_unlock(rq); 622 + } 387 623 388 624 return good_bytes; 389 625 } ··· 536 780 sdkp->zone_info.zone_blocks); 537 781 } 538 782 539 - static int sd_zbc_init_disk(struct scsi_disk *sdkp) 540 - { 541 - sdkp->zones_wp_offset = NULL; 542 - spin_lock_init(&sdkp->zones_wp_offset_lock); 543 - sdkp->rev_wp_offset = NULL; 544 - mutex_init(&sdkp->rev_mutex); 545 - INIT_WORK(&sdkp->zone_wp_offset_work, sd_zbc_update_wp_offset_workfn); 546 - sdkp->zone_wp_update_buf = kzalloc(SD_BUF_SIZE, GFP_KERNEL); 547 - if (!sdkp->zone_wp_update_buf) 548 - return -ENOMEM; 549 - 550 - return 0; 551 - } 552 - 553 - void sd_zbc_free_zone_info(struct scsi_disk *sdkp) 554 - { 555 - if (!sdkp->zone_wp_update_buf) 556 - return; 557 - 558 - /* Serialize against revalidate zones */ 559 - mutex_lock(&sdkp->rev_mutex); 560 - 561 - kvfree(sdkp->zones_wp_offset); 562 - sdkp->zones_wp_offset = NULL; 563 - kfree(sdkp->zone_wp_update_buf); 564 - sdkp->zone_wp_update_buf = NULL; 565 - 566 - sdkp->early_zone_info = (struct zoned_disk_info){ }; 567 - sdkp->zone_info = (struct zoned_disk_info){ }; 568 - 569 - mutex_unlock(&sdkp->rev_mutex); 570 - } 571 - 572 - static void sd_zbc_revalidate_zones_cb(struct gendisk *disk) 573 - { 574 - struct scsi_disk *sdkp = scsi_disk(disk); 575 - 576 - swap(sdkp->zones_wp_offset, sdkp->rev_wp_offset); 577 - } 578 - 579 783 /* 580 784 * Call blk_revalidate_disk_zones() if any of the zoned disk properties have 581 785 * changed that make it necessary to call that function. 
Called by ··· 547 831 struct request_queue *q = disk->queue; 548 832 u32 zone_blocks = sdkp->early_zone_info.zone_blocks; 549 833 unsigned int nr_zones = sdkp->early_zone_info.nr_zones; 550 - int ret = 0; 551 834 unsigned int flags; 552 - 553 - /* 554 - * For all zoned disks, initialize zone append emulation data if not 555 - * already done. 556 - */ 557 - if (sd_is_zoned(sdkp) && !sdkp->zone_wp_update_buf) { 558 - ret = sd_zbc_init_disk(sdkp); 559 - if (ret) 560 - return ret; 561 - } 835 + int ret; 562 836 563 837 /* 564 838 * There is nothing to do for regular disks, including host-aware disks ··· 557 851 if (!blk_queue_is_zoned(q)) 558 852 return 0; 559 853 560 - /* 561 - * Make sure revalidate zones are serialized to ensure exclusive 562 - * updates of the scsi disk data. 563 - */ 564 - mutex_lock(&sdkp->rev_mutex); 565 - 566 854 if (sdkp->zone_info.zone_blocks == zone_blocks && 567 855 sdkp->zone_info.nr_zones == nr_zones && 568 856 disk->nr_zones == nr_zones) 569 - goto unlock; 857 + return 0; 570 858 571 - flags = memalloc_noio_save(); 572 859 sdkp->zone_info.zone_blocks = zone_blocks; 573 860 sdkp->zone_info.nr_zones = nr_zones; 574 - sdkp->rev_wp_offset = kvcalloc(nr_zones, sizeof(u32), GFP_KERNEL); 575 - if (!sdkp->rev_wp_offset) { 576 - ret = -ENOMEM; 577 - memalloc_noio_restore(flags); 578 - goto unlock; 579 - } 580 861 581 862 blk_queue_chunk_sectors(q, 582 863 logical_to_sectors(sdkp->device, zone_blocks)); 583 - blk_queue_max_zone_append_sectors(q, 584 - q->limits.max_segments << PAGE_SECTORS_SHIFT); 585 864 586 - ret = blk_revalidate_disk_zones(disk, sd_zbc_revalidate_zones_cb); 865 + /* Enable block layer zone append emulation */ 866 + blk_queue_max_zone_append_sectors(q, 0); 587 867 868 + flags = memalloc_noio_save(); 869 + ret = blk_revalidate_disk_zones(disk); 588 870 memalloc_noio_restore(flags); 589 - kvfree(sdkp->rev_wp_offset); 590 - sdkp->rev_wp_offset = NULL; 591 - 592 871 if (ret) { 593 872 sdkp->zone_info = (struct zoned_disk_info){ }; 
594 873 sdkp->capacity = 0; 595 - goto unlock; 874 + return ret; 596 875 } 597 876 598 877 sd_zbc_print_zones(sdkp); 599 878 600 - unlock: 601 - mutex_unlock(&sdkp->rev_mutex); 602 - 603 - return ret; 879 + return 0; 604 880 } 605 881 606 882 /** ··· 605 917 if (!sd_is_zoned(sdkp)) { 606 918 /* 607 919 * Device managed or normal SCSI disk, no special handling 608 - * required. Nevertheless, free the disk zone information in 609 - * case the device type changed. 920 + * required. 610 921 */ 611 - sd_zbc_free_zone_info(sdkp); 612 922 return 0; 613 923 } 614 924 ··· 627 941 628 942 /* The drive satisfies the kernel restrictions: set it up */ 629 943 blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); 630 - blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); 631 944 if (sdkp->zones_max_open == U32_MAX) 632 945 disk_set_max_open_zones(disk, 0); 633 946 else

+1 -2

fs/btrfs/raid56.c

··· 331 331 static void merge_rbio(struct btrfs_raid_bio *dest, 332 332 struct btrfs_raid_bio *victim) 333 333 { 334 - bio_list_merge(&dest->bio_list, &victim->bio_list); 334 + bio_list_merge_init(&dest->bio_list, &victim->bio_list); 335 335 dest->bio_list_bytes += victim->bio_list_bytes; 336 336 /* Also inherit the bitmaps from @victim. */ 337 337 bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap, 338 338 dest->stripe_nsectors); 339 - bio_list_init(&victim->bio_list); 340 339 } 341 340 342 341 /*

+11

include/linux/bio.h

··· 615 615 bl->tail = bl2->tail; 616 616 } 617 617 618 + static inline void bio_list_merge_init(struct bio_list *bl, 619 + struct bio_list *bl2) 620 + { 621 + bio_list_merge(bl, bl2); 622 + bio_list_init(bl2); 623 + } 624 + 618 625 static inline void bio_list_merge_head(struct bio_list *bl, 619 626 struct bio_list *bl2) 620 627 { ··· 831 824 832 825 struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, 833 826 unsigned int nr_pages, blk_opf_t opf, gfp_t gfp); 827 + struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new); 828 + 829 + struct bio *blk_alloc_discard_bio(struct block_device *bdev, 830 + sector_t *sector, sector_t *nr_sects, gfp_t gfp_mask); 834 831 835 832 #endif /* __LINUX_BIO_H */

+2 -83

include/linux/blk-mq.h

··· 54 54 /* Look at ->special_vec for the actual data payload instead of the 55 55 bio chain. */ 56 56 #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) 57 - /* The per-zone write lock is held for this request */ 58 - #define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) 57 + /* The request completion needs to be signaled to zone write pluging. */ 58 + #define RQF_ZONE_WRITE_PLUGGING ((__force req_flags_t)(1 << 20)) 59 59 /* ->timeout has been called, don't expire again */ 60 60 #define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) 61 61 #define RQF_RESV ((__force req_flags_t)(1 << 23)) ··· 1149 1149 return __blk_rq_map_sg(q, rq, sglist, &last_sg); 1150 1150 } 1151 1151 void blk_dump_rq_flags(struct request *, char *); 1152 - 1153 - #ifdef CONFIG_BLK_DEV_ZONED 1154 - static inline unsigned int blk_rq_zone_no(struct request *rq) 1155 - { 1156 - return disk_zone_no(rq->q->disk, blk_rq_pos(rq)); 1157 - } 1158 - 1159 - static inline unsigned int blk_rq_zone_is_seq(struct request *rq) 1160 - { 1161 - return disk_zone_is_seq(rq->q->disk, blk_rq_pos(rq)); 1162 - } 1163 - 1164 - /** 1165 - * blk_rq_is_seq_zoned_write() - Check if @rq requires write serialization. 1166 - * @rq: Request to examine. 1167 - * 1168 - * Note: REQ_OP_ZONE_APPEND requests do not require serialization. 
1169 - */ 1170 - static inline bool blk_rq_is_seq_zoned_write(struct request *rq) 1171 - { 1172 - return op_needs_zoned_write_locking(req_op(rq)) && 1173 - blk_rq_zone_is_seq(rq); 1174 - } 1175 - 1176 - bool blk_req_needs_zone_write_lock(struct request *rq); 1177 - bool blk_req_zone_write_trylock(struct request *rq); 1178 - void __blk_req_zone_write_lock(struct request *rq); 1179 - void __blk_req_zone_write_unlock(struct request *rq); 1180 - 1181 - static inline void blk_req_zone_write_lock(struct request *rq) 1182 - { 1183 - if (blk_req_needs_zone_write_lock(rq)) 1184 - __blk_req_zone_write_lock(rq); 1185 - } 1186 - 1187 - static inline void blk_req_zone_write_unlock(struct request *rq) 1188 - { 1189 - if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED) 1190 - __blk_req_zone_write_unlock(rq); 1191 - } 1192 - 1193 - static inline bool blk_req_zone_is_write_locked(struct request *rq) 1194 - { 1195 - return rq->q->disk->seq_zones_wlock && 1196 - test_bit(blk_rq_zone_no(rq), rq->q->disk->seq_zones_wlock); 1197 - } 1198 - 1199 - static inline bool blk_req_can_dispatch_to_zone(struct request *rq) 1200 - { 1201 - if (!blk_req_needs_zone_write_lock(rq)) 1202 - return true; 1203 - return !blk_req_zone_is_write_locked(rq); 1204 - } 1205 - #else /* CONFIG_BLK_DEV_ZONED */ 1206 - static inline bool blk_rq_is_seq_zoned_write(struct request *rq) 1207 - { 1208 - return false; 1209 - } 1210 - 1211 - static inline bool blk_req_needs_zone_write_lock(struct request *rq) 1212 - { 1213 - return false; 1214 - } 1215 - 1216 - static inline void blk_req_zone_write_lock(struct request *rq) 1217 - { 1218 - } 1219 - 1220 - static inline void blk_req_zone_write_unlock(struct request *rq) 1221 - { 1222 - } 1223 - static inline bool blk_req_zone_is_write_locked(struct request *rq) 1224 - { 1225 - return false; 1226 - } 1227 - 1228 - static inline bool blk_req_can_dispatch_to_zone(struct request *rq) 1229 - { 1230 - return true; 1231 - } 1232 - #endif /* CONFIG_BLK_DEV_ZONED */ 1233 1152 1234 1153 #endif 
/* BLK_MQ_H */

+12 -18

include/linux/blk_types.h

··· 131 131 #define BLK_STS_DEV_RESOURCE ((__force blk_status_t)13) 132 132 133 133 /* 134 - * BLK_STS_ZONE_RESOURCE is returned from the driver to the block layer if zone 135 - * related resources are unavailable, but the driver can guarantee the queue 136 - * will be rerun in the future once the resources become available again. 137 - * 138 - * This is different from BLK_STS_DEV_RESOURCE in that it explicitly references 139 - * a zone specific resource and IO to a different zone on the same device could 140 - * still be served. Examples of that are zones that are write-locked, but a read 141 - * to the same zone could be served. 142 - */ 143 - #define BLK_STS_ZONE_RESOURCE ((__force blk_status_t)14) 144 - 145 - /* 146 134 * BLK_STS_ZONE_OPEN_RESOURCE is returned from the driver in the completion 147 135 * path if the device returns a status indicating that too many zone resources 148 136 * are currently open. The same command should be successful if resubmitted 149 137 * after the number of open zones decreases below the device's limits, which is 150 138 * reported in the request_queue's max_open_zones. 151 139 */ 152 - #define BLK_STS_ZONE_OPEN_RESOURCE ((__force blk_status_t)15) 140 + #define BLK_STS_ZONE_OPEN_RESOURCE ((__force blk_status_t)14) 153 141 154 142 /* 155 143 * BLK_STS_ZONE_ACTIVE_RESOURCE is returned from the driver in the completion ··· 146 158 * after the number of active zones decreases below the device's limits, which 147 159 * is reported in the request_queue's max_active_zones. 148 160 */ 149 - #define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)16) 161 + #define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)15) 150 162 151 163 /* 152 164 * BLK_STS_OFFLINE is returned from the driver when the target device is offline 153 165 * or is being taken offline. This could help differentiate the case where a 154 166 * device is intentionally being shut down from a real I/O error. 
155 167 */ 156 - #define BLK_STS_OFFLINE ((__force blk_status_t)17) 168 + #define BLK_STS_OFFLINE ((__force blk_status_t)16) 157 169 158 170 /* 159 171 * BLK_STS_DURATION_LIMIT is returned from the driver when the target device 160 172 * aborted the command because it exceeded one of its Command Duration Limits. 161 173 */ 162 - #define BLK_STS_DURATION_LIMIT ((__force blk_status_t)18) 174 + #define BLK_STS_DURATION_LIMIT ((__force blk_status_t)17) 163 175 164 176 /** 165 177 * blk_path_error - returns true if error may be path related ··· 216 228 217 229 struct bvec_iter bi_iter; 218 230 219 - blk_qc_t bi_cookie; 231 + union { 232 + /* for polled bios: */ 233 + blk_qc_t bi_cookie; 234 + /* for plugged zoned writes only: */ 235 + unsigned int __bi_nr_segments; 236 + }; 220 237 bio_end_io_t *bi_end_io; 221 238 void *bi_private; 222 239 #ifdef CONFIG_BLK_CGROUP ··· 291 298 BIO_QOS_THROTTLED, /* bio went through rq_qos throttle path */ 292 299 BIO_QOS_MERGED, /* but went through rq_qos merge path */ 293 300 BIO_REMAPPED, 294 - BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */ 301 + BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */ 302 + BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */ 295 303 BIO_FLAG_LAST 296 304 }; 297 305

+61 -55

include/linux/blkdev.h

··· 179 179 180 180 #ifdef CONFIG_BLK_DEV_ZONED 181 181 /* 182 - * Zoned block device information for request dispatch control. 183 - * nr_zones is the total number of zones of the device. This is always 184 - * 0 for regular block devices. conv_zones_bitmap is a bitmap of nr_zones 185 - * bits which indicates if a zone is conventional (bit set) or 186 - * sequential (bit clear). seq_zones_wlock is a bitmap of nr_zones 187 - * bits which indicates if a zone is write locked, that is, if a write 188 - * request targeting the zone was dispatched. 189 - * 190 - * Reads of this information must be protected with blk_queue_enter() / 191 - * blk_queue_exit(). Modifying this information is only allowed while 192 - * no requests are being processed. See also blk_mq_freeze_queue() and 193 - * blk_mq_unfreeze_queue(). 182 + * Zoned block device information. Reads of this information must be 183 + * protected with blk_queue_enter() / blk_queue_exit(). Modifying this 184 + * information is only allowed while no requests are being processed. 185 + * See also blk_mq_freeze_queue() and blk_mq_unfreeze_queue(). 
194 186 */ 195 187 unsigned int nr_zones; 188 + unsigned int zone_capacity; 196 189 unsigned long *conv_zones_bitmap; 197 - unsigned long *seq_zones_wlock; 190 + unsigned int zone_wplugs_hash_bits; 191 + spinlock_t zone_wplugs_lock; 192 + struct mempool_s *zone_wplugs_pool; 193 + struct hlist_head *zone_wplugs_hash; 194 + struct list_head zone_wplugs_err_list; 195 + struct work_struct zone_wplugs_work; 196 + struct workqueue_struct *zone_wplugs_wq; 198 197 #endif /* CONFIG_BLK_DEV_ZONED */ 199 198 200 199 #if IS_ENABLED(CONFIG_CDROM) ··· 230 231 static inline unsigned int disk_openers(struct gendisk *disk) 231 232 { 232 233 return atomic_read(&disk->part0->bd_openers); 234 + } 235 + 236 + /** 237 + * disk_has_partscan - return %true if partition scanning is enabled on a disk 238 + * @disk: disk to check 239 + * 240 + * Returns %true if partitions scanning is enabled for @disk, or %false if 241 + * partition scanning is disabled either permanently or temporarily. 242 + */ 243 + static inline bool disk_has_partscan(struct gendisk *disk) 244 + { 245 + return !(disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) && 246 + !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state); 233 247 } 234 248 235 249 /* ··· 343 331 unsigned int nr_zones, report_zones_cb cb, void *data); 344 332 int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, 345 333 sector_t sectors, sector_t nr_sectors); 346 - int blk_revalidate_disk_zones(struct gendisk *disk, 347 - void (*update_driver_data)(struct gendisk *disk)); 334 + int blk_revalidate_disk_zones(struct gendisk *disk); 348 335 349 336 /* 350 337 * Independent access ranges: struct blk_independent_access_range describes ··· 459 448 struct work_struct timeout_work; 460 449 461 450 atomic_t nr_active_requests_shared_tags; 462 - 463 - unsigned int required_elevator_features; 464 451 465 452 struct blk_mq_tags *sched_shared_tags; 466 453 ··· 642 633 return sector >> ilog2(disk->queue->limits.chunk_sectors); 643 634 } 644 635 645 - static 
inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector) 646 - { 647 - if (!blk_queue_is_zoned(disk->queue)) 648 - return false; 649 - if (!disk->conv_zones_bitmap) 650 - return true; 651 - return !test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap); 652 - } 653 - 654 636 static inline void disk_set_max_open_zones(struct gendisk *disk, 655 637 unsigned int max_open_zones) 656 638 { ··· 664 664 return bdev->bd_disk->queue->limits.max_active_zones; 665 665 } 666 666 667 + bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs); 667 668 #else /* CONFIG_BLK_DEV_ZONED */ 668 669 static inline unsigned int bdev_nr_zones(struct block_device *bdev) 669 670 { ··· 674 673 static inline unsigned int disk_nr_zones(struct gendisk *disk) 675 674 { 676 675 return 0; 677 - } 678 - static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector) 679 - { 680 - return false; 681 676 } 682 677 static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) 683 678 { ··· 687 690 static inline unsigned int bdev_max_active_zones(struct block_device *bdev) 688 691 { 689 692 return 0; 693 + } 694 + static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) 695 + { 696 + return false; 690 697 } 691 698 #endif /* CONFIG_BLK_DEV_ZONED */ 692 699 ··· 856 855 return disk_zone_no(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector); 857 856 } 858 857 859 - static inline unsigned int bio_zone_is_seq(struct bio *bio) 858 + static inline bool bio_straddles_zones(struct bio *bio) 860 859 { 861 - return disk_zone_is_seq(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector); 860 + return bio_sectors(bio) && 861 + bio_zone_no(bio) != 862 + disk_zone_no(bio->bi_bdev->bd_disk, bio_end_sector(bio) - 1); 862 863 } 863 864 864 865 /* ··· 945 942 void disk_set_independent_access_ranges(struct gendisk *disk, 946 943 struct blk_independent_access_ranges *iars); 947 944 948 - /* 949 - * Elevator features for blk_queue_required_elevator_features: 950 - */ 
951 - /* Supports zoned block devices sequential write constraint */ 952 - #define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0) 953 - 954 - extern void blk_queue_required_elevator_features(struct request_queue *q, 955 - unsigned int features); 956 945 extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q, 957 946 struct device *dev); 958 947 ··· 1151 1156 return q->limits.max_segment_size; 1152 1157 } 1153 1158 1154 - static inline unsigned int queue_max_zone_append_sectors(const struct request_queue *q) 1159 + static inline unsigned int queue_limits_max_zone_append_sectors(struct queue_limits *l) 1155 1160 { 1161 + unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors); 1156 1162 1157 - const struct queue_limits *l = &q->limits; 1163 + return min_not_zero(l->max_zone_append_sectors, max_sectors); 1164 + } 1158 1165 1159 - return min(l->max_zone_append_sectors, l->max_sectors); 1166 + static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q) 1167 + { 1168 + if (!blk_queue_is_zoned(q)) 1169 + return 0; 1170 + 1171 + return queue_limits_max_zone_append_sectors(&q->limits); 1172 + } 1173 + 1174 + static inline bool queue_emulates_zone_append(struct request_queue *q) 1175 + { 1176 + return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors; 1177 + } 1178 + 1179 + static inline bool bdev_emulates_zone_append(struct block_device *bdev) 1180 + { 1181 + return queue_emulates_zone_append(bdev_get_queue(bdev)); 1160 1182 } 1161 1183 1162 1184 static inline unsigned int ··· 1315 1303 return disk_zone_no(bdev->bd_disk, sec); 1316 1304 } 1317 1305 1318 - /* Whether write serialization is required for @op on zoned devices. 
*/ 1319 - static inline bool op_needs_zoned_write_locking(enum req_op op) 1320 - { 1321 - return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES; 1322 - } 1323 - 1324 - static inline bool bdev_op_is_zoned_write(struct block_device *bdev, 1325 - enum req_op op) 1326 - { 1327 - return bdev_is_zoned(bdev) && op_needs_zoned_write_locking(op); 1328 - } 1329 - 1330 1306 static inline sector_t bdev_zone_sectors(struct block_device *bdev) 1331 1307 { 1332 1308 struct request_queue *q = bdev_get_queue(bdev); ··· 1328 1328 sector_t sector) 1329 1329 { 1330 1330 return sector & (bdev_zone_sectors(bdev) - 1); 1331 + } 1332 + 1333 + static inline sector_t bio_offset_from_zone_start(struct bio *bio) 1334 + { 1335 + return bdev_offset_from_zone_start(bio->bi_bdev, 1336 + bio->bi_iter.bi_sector); 1331 1337 } 1332 1338 1333 1339 static inline bool bdev_is_zone_start(struct block_device *bdev,

+4 -4

lib/sbitmap.c

··· 494 494 struct sbitmap_word *map = &sb->map[index]; 495 495 unsigned long get_mask; 496 496 unsigned int map_depth = __map_depth(sb, index); 497 + unsigned long val; 497 498 498 499 sbitmap_deferred_clear(map); 499 - if (map->word == (1UL << (map_depth - 1)) - 1) 500 + val = READ_ONCE(map->word); 501 + if (val == (1UL << (map_depth - 1)) - 1) 500 502 goto next; 501 503 502 - nr = find_first_zero_bit(&map->word, map_depth); 504 + nr = find_first_zero_bit(&val, map_depth); 503 505 if (nr + nr_tags <= map_depth) { 504 506 atomic_long_t *ptr = (atomic_long_t *) &map->word; 505 - unsigned long val; 506 507 507 508 get_mask = ((1UL << nr_tags) - 1) << nr; 508 - val = READ_ONCE(map->word); 509 509 while (!atomic_long_try_cmpxchg(ptr, &val, 510 510 get_mask | val)) 511 511 ;

Configure Feed

Configure Feed